153 lines
4.7 KiB
Python
153 lines
4.7 KiB
Python
import numpy as np
|
|
import matplotlib.pyplot as plt
|
|
|
|
|
|
# step 01 ---------------------------------------
|
|
# args
|
|
# - filename: string filename
|
|
# return
|
|
# - x: data matrix of size (n,p)
|
|
# - y: array of integer labels of size n
|
|
def load_data(filename):
|
|
x = np.loadtxt(filename)
|
|
y = x[:,4].astype(int)
|
|
x = x[:,:-1]
|
|
return x, y
|
|
|
|
|
|
# step 02 ---------------------------------------
|
|
# args
|
|
# - x: data matrix of size (n,p)
|
|
# return
|
|
# - x: center-normalized input
|
|
def standardize(x: np.ndarray):
|
|
def normalize_col(column: np.ndarray):
|
|
avg = column.mean()
|
|
avg_dist_to_avg = column.std()
|
|
return [ (elt - avg) / avg_dist_to_avg for elt in column ]
|
|
x = np.array([ normalize_col(column) for column in x.T ]).T
|
|
return x
|
|
|
|
|
|
# step 03 ---------------------------------------
|
|
# args
|
|
# - x: standardized data matrix of size (n,p)
|
|
# return
|
|
# - C: covariance matrix of size (p,p)
|
|
def compute_covariance_matrix(x: np.ndarray):
|
|
# def cov_cols(col_a: np.ndarray, col_b: np.ndarray):
|
|
# return sum(a * b for (a, b) in zip(col_a, col_b)) / len(col_a)
|
|
# C = np.array([[ cov_cols(col_a, col_b) for col_b in x.T] for col_a in x.T])
|
|
# return C
|
|
return np.cov(x.T, bias=True)
|
|
|
|
|
|
# step 04 ---------------------------------------
|
|
# args
|
|
# - C: covariance matrix of size (p,p)
|
|
# return
|
|
# - eigenval: eigenvalues of size p
|
|
# - eigenvec: eigenvectors of size (p,p)
|
|
def compute_eigenelements(C: np.ndarray):
|
|
return np.linalg.eig(C)
|
|
|
|
|
|
# step 05 ---------------------------------------
|
|
# args
|
|
# - eigenval: eigenvalues of size p
|
|
# - eigenvec: eigenvectors of size (p,p)
|
|
# return
|
|
# - eigenval: ordered eigenvalues of size p
|
|
# - eigenvec: ordered eigenvectors of size (p,p)
|
|
def sort_eigenelements(eigenval: np.ndarray, eigenvec: np.ndarray):
|
|
keyed = list(zip(eigenval, eigenvec.T))
|
|
keyed.sort(key=lambda a: a[0])
|
|
eigenval = np.flip([val for (val, _) in keyed])
|
|
eigenvec = np.rot90(np.flip([vec for (_, vec) in keyed]))
|
|
return (eigenval, eigenvec)
|
|
|
|
|
|
# step 06 ---------------------------------------
|
|
# args
|
|
# - eigenval: eigenvalues of size p
|
|
# returns
|
|
# - var: variances ratio explained by each components (of size p)
|
|
# - var_accu: cumulated variances ratio (of size p)
|
|
def compute_var_ratio(eigenval: np.ndarray):
|
|
total = np.sum(eigenval)
|
|
var = np.divide(eigenval, total)
|
|
var_accu = np.cumsum(var)
|
|
return (var, var_accu)
|
|
|
|
# step 08 ---------------------------------------
|
|
# args
|
|
# - x: standardized data matrix of size (n,p)
|
|
# - eigenvec: sorted eigenvectors (of size (p,p))
|
|
# return
|
|
# - x: projected data of size (n,2)
|
|
def project_on_2pc(x: np.ndarray, eigenvec: np.ndarray):
|
|
projected = x @ eigenvec
|
|
_2d = projected[:,:2]
|
|
return _2d
|
|
|
|
# step 11 ---------------------------------------
|
|
# args
|
|
# - x: standardized data matrix of size (n,p)
|
|
# return
|
|
# - y_pred: labels of size (n)
|
|
def classify(x: np.ndarray):
|
|
def class_n(n: float):
|
|
if (n < -1): return 0
|
|
if (n < 1): return 1
|
|
return 2
|
|
return np.array([ class_n(n) for n in x[:,0] ])
|
|
|
|
# step 12 ---------------------------------------
|
|
# args
|
|
# - y: ground-truth labels (of size (n))
|
|
# - y_pred: predicted labels (of size (n))
|
|
# return
|
|
# - conf: confusion matrix of integer of size (p,p)
|
|
def compute_confusion_matrix(y: np.ndarray, y_pred: np.ndarray):
|
|
size = max(y) + 1
|
|
res = np.zeros((size, size)).astype(np.int32)
|
|
for (y, y_pred) in zip(y, y_pred): res[y, y_pred] += 1
|
|
return res
|
|
|
|
|
|
# step 13 ---------------------------------------
|
|
# args
|
|
# - conf: confusion matrix of integer of size (p,p)
|
|
# return
|
|
# - prec: average precision over all classes
|
|
# - rec: average recall over all classes
|
|
def compute_metric(conf: np.ndarray):
|
|
def precision(kind: int, mat: np.ndarray):
|
|
return mat[kind] / mat[kind,:].sum()
|
|
prec = np.average([ precision(kind) for kind in range(0, 3) ])
|
|
def rec(kind: int, mat: np.ndarray):
|
|
return mat[kind] / mat[kind,:].sum()
|
|
|
|
|
|
if __name__ == '__main__':
|
|
# main program ------------------------------
|
|
|
|
values, kind = load_data("./iris.csv")
|
|
standardized = standardize(values)
|
|
covariance = compute_covariance_matrix(standardized)
|
|
eigenval, eigenvec = compute_eigenelements(covariance)
|
|
eigenval_sorted, eigenvec_sorted = sort_eigenelements(eigenval, eigenvec)
|
|
variance, variance_accumulated = compute_var_ratio(eigenval_sorted)
|
|
projected = project_on_2pc(standardized, eigenvec_sorted)
|
|
predictions = classify(projected)
|
|
|
|
# draw
|
|
plt.figure()
|
|
plt.scatter(projected[:,0], projected[:,1], c=predictions)
|
|
plt.show()
|
|
|
|
confusion = compute_confusion_matrix(kind, predictions)
|
|
|
|
# -----------------------------------
|
|
plt.show()
|
|
# -----------------------------------
|