import numpy as np import matplotlib.pyplot as plt # step 01 --------------------------------------- # args # - filename: string filename # return # - x: data matrix of size (n,p) # - y: array of integer labels of size n def load_data(filename): x = np.loadtxt(filename) y = x[:,4].astype(int) x = x[:,:-1] return x, y # step 02 --------------------------------------- # args # - x: data matrix of size (n,p) # return # - x: center-normalized input def standardize(x: np.ndarray): def normalize_col(column: np.ndarray): avg = column.mean() avg_dist_to_avg = column.std() return [ (elt - avg) / avg_dist_to_avg for elt in column ] x = np.array([ normalize_col(column) for column in x.T ]).T return x # step 03 --------------------------------------- # args # - x: standardized data matrix of size (n,p) # return # - C: covariance matrix of size (p,p) def compute_covariance_matrix(x: np.ndarray): # def cov_cols(col_a: np.ndarray, col_b: np.ndarray): # return sum(a * b for (a, b) in zip(col_a, col_b)) / len(col_a) # C = np.array([[ cov_cols(col_a, col_b) for col_b in x.T] for col_a in x.T]) # return C return np.cov(x.T, bias=True) # step 04 --------------------------------------- # args # - C: covariance matrix of size (p,p) # return # - eigenval: eigenvalues of size p # - eigenvec: eigenvectors of size (p,p) def compute_eigenelements(C: np.ndarray): return np.linalg.eig(C) # step 05 --------------------------------------- # args # - eigenval: eigenvalues of size p # - eigenvec: eigenvectors of size (p,p) # return # - eigenval: ordered eigenvalues of size p # - eigenvec: ordered eigenvectors of size (p,p) def sort_eigenelements(eigenval: np.ndarray, eigenvec: np.ndarray): keyed = list(zip(eigenval, eigenvec.T)) keyed.sort(key=lambda a: a[0]) eigenval = np.flip([val for (val, _) in keyed]) eigenvec = np.rot90(np.flip([vec for (_, vec) in keyed])) return (eigenval, eigenvec) # step 06 --------------------------------------- # args # - eigenval: eigenvalues of size p # returns # - var: variances ratio explained by each components (of size p) # - var_accu: cumulated variances ratio (of size p) def compute_var_ratio(eigenval: np.ndarray): total = np.sum(eigenval) var = np.divide(eigenval, total) var_accu = np.cumsum(var) return (var, var_accu) # step 08 --------------------------------------- # args # - x: standardized data matrix of size (n,p) # - eigenvec: sorted eigenvectors (of size (p,p)) # return # - x: projected data of size (n,2) def project_on_2pc(x: np.ndarray, eigenvec: np.ndarray): projected = x @ eigenvec _2d = projected[:,:2] return _2d # step 11 --------------------------------------- # args # - x: standardized data matrix of size (n,p) # return # - y_pred: labels of size (n) def classify(x: np.ndarray): def class_n(n: float): if (n < -1): return 0 if (n < 1): return 1 return 2 return np.array([ class_n(n) for n in x[:,0] ]) # step 12 --------------------------------------- # args # - y: ground-truth labels (of size (n)) # - y_pred: predicted labels (of size (n)) # return # - conf: confusion matrix of integer of size (p,p) def compute_confusion_matrix(y: np.ndarray, y_pred: np.ndarray): size = max(y) + 1 res = np.zeros((size, size)).astype(np.int32) for (y, y_pred) in zip(y, y_pred): res[y, y_pred] += 1 return res # step 13 --------------------------------------- # args # - conf: confusion matrix of integer of size (p,p) # return # - prec: average precision over all classes # - rec: average recall over all classes def compute_metric(conf: np.ndarray): def precision(kind: int, mat: np.ndarray): return mat[kind] / mat[kind,:].sum() prec = np.average([ precision(kind) for kind in range(0, 3) ]) def rec(kind: int, mat: np.ndarray): return mat[kind] / mat[kind,:].sum() if __name__ == '__main__': # main program ------------------------------ values, kind = load_data("./iris.csv") standardized = standardize(values) covariance = compute_covariance_matrix(standardized) eigenval, eigenvec = compute_eigenelements(covariance) eigenval_sorted, eigenvec_sorted = sort_eigenelements(eigenval, eigenvec) variance, variance_accumulated = compute_var_ratio(eigenval_sorted) projected = project_on_2pc(standardized, eigenvec_sorted) predictions = classify(projected) # draw plt.figure() plt.scatter(projected[:,0], projected[:,1], c=predictions) plt.show() confusion = compute_confusion_matrix(kind, predictions) # ----------------------------------- plt.show() # -----------------------------------