#!/usr/bin/env python3
"""K-means clustering scaffold for the Iris dataset.

The script loads a headerless CSV, splits it into a leading training part
and a trailing test part, and hands the feature columns to ``KVMeans``.
The training and prediction steps themselves are not implemented yet.
"""

from dataclasses import dataclass
import math
import random

import numpy
import pandas

# Column labels assigned to the CSV when it is loaded.
_FEATURE_COLUMNS = ["a", "b", "c", "d"]
_LABEL_COLUMN = "category"


def main() -> None:
    """Load the Iris data, split it 80/20 and run ``KVMeans.learn``."""
    dataset = load_dataset("bin/iris/iris.data")
    # The held-out part is not consumed yet because predict() is a stub.
    training, _held_out = split_dataset(dataset, 0.8)
    model = KVMeans(3, len(_FEATURE_COLUMNS))
    model.learn(training[_FEATURE_COLUMNS])


def load_dataset(path: str) -> pandas.DataFrame:
    """Read the CSV at *path* into a DataFrame.

    The columns are labelled ``a``, ``b``, ``c``, ``d`` and ``category``,
    in that order; because explicit names are supplied, the first line of
    the file is read as data rather than as a header.
    """
    return pandas.read_csv(path, names=[*_FEATURE_COLUMNS, _LABEL_COLUMN])


def split_dataset(
    dataset: pandas.DataFrame, ratio: float
) -> tuple[pandas.DataFrame, pandas.DataFrame]:
    """Split *dataset* by position into a leading and a trailing part.

    Args:
        dataset: Frame to split. Rows are not shuffled; their order and
            index labels are preserved in both parts.
        ratio: Fraction of the rows that goes into the first part. The
            row count is rounded down.

    Returns:
        ``(start, rest)`` where ``start`` holds the first
        ``floor(len(dataset) * ratio)`` rows and ``rest`` the remainder.
    """
    cut = math.floor(len(dataset) * ratio)
    # Positional slicing gives the same two frames as
    # numpy.array_split(dataset, [cut]) without going through the
    # deprecated DataFrame.swapaxes.
    return (dataset.iloc[:cut], dataset.iloc[cut:])


@dataclass
class Centroid:
    """A cluster centre.

    Attributes:
        dimension: Number of coordinates.
        position: Coordinates of the centre.
    """

    dimension: int
    position: list[float]

    @classmethod
    def new_random(cls, dimension: int, rng: random.Random) -> "Centroid":
        """Build a centroid with *dimension* coordinates drawn from *rng*.

        Each coordinate is one ``rng.random()`` draw, i.e. a float in
        ``[0.0, 1.0)``.
        """
        return cls(dimension, [rng.random() for _ in range(dimension)])


class KVMeans:
    """Container for ``mean_count`` centroids of a fixed dimension.

    ``learn`` and ``predict`` are placeholders; see their docstrings.
    """

    def __init__(self, mean_count: int, dimension: int):
        """Create *mean_count* random centroids with *dimension* coordinates.

        The generator is seeded with 0, so the initial centroids are the
        same on every run.
        """
        rng = random.Random(0)
        self.means = [
            Centroid.new_random(dimension, rng) for _ in range(mean_count)
        ]
        self.dimension = dimension
        self.mean_count = mean_count

    def learn(self, dataset: pandas.DataFrame) -> None:
        """Check that *dataset* has one column per centroid coordinate.

        Not implemented yet: no centroid is updated. Only the column
        count is verified.

        Raises:
            AssertionError: If the number of columns differs from
                ``self.dimension``.
        """
        assert dataset.columns.size == self.dimension, (
            f"expected {self.dimension} columns, got {dataset.columns.size}"
        )
        # TODO: assign each row to a centroid and move the centroids.

    def predict(self):
        """Placeholder; not implemented yet and always returns ``None``."""


def distance_dim(a: list[float], b: list[float]) -> float:
    """Return the Euclidean distance between points *a* and *b*.

    Coordinates are paired with ``zip``, so when the lengths differ the
    extra coordinates of the longer point are ignored.
    """
    return math.sqrt(sum((x - y) ** 2 for x, y in zip(a, b)))


if __name__ == "__main__":
    main()