JOLIMAITRE Matthieu 2024-02-26 15:36:47 +01:00
commit 96ca9fecd3
6 changed files with 769 additions and 0 deletions

ia/tp1/.gitignore vendored Normal file (+4 lines)

@@ -0,0 +1,4 @@
/__pycache__
/.pytest_cache
/env
/*.zip

ia/tp1/iris.csv Normal file (+150 lines)

@@ -0,0 +1,150 @@
5.1 3.5 1.4 0.2 0
4.9 3.0 1.4 0.2 0
4.7 3.2 1.3 0.2 0
4.6 3.1 1.5 0.2 0
5.0 3.6 1.4 0.2 0
5.4 3.9 1.7 0.4 0
4.6 3.4 1.4 0.3 0
5.0 3.4 1.5 0.2 0
4.4 2.9 1.4 0.2 0
4.9 3.1 1.5 0.1 0
5.4 3.7 1.5 0.2 0
4.8 3.4 1.6 0.2 0
4.8 3.0 1.4 0.1 0
4.3 3.0 1.1 0.1 0
5.8 4.0 1.2 0.2 0
5.7 4.4 1.5 0.4 0
5.4 3.9 1.3 0.4 0
5.1 3.5 1.4 0.3 0
5.7 3.8 1.7 0.3 0
5.1 3.8 1.5 0.3 0
5.4 3.4 1.7 0.2 0
5.1 3.7 1.5 0.4 0
4.6 3.6 1.0 0.2 0
5.1 3.3 1.7 0.5 0
4.8 3.4 1.9 0.2 0
5.0 3.0 1.6 0.2 0
5.0 3.4 1.6 0.4 0
5.2 3.5 1.5 0.2 0
5.2 3.4 1.4 0.2 0
4.7 3.2 1.6 0.2 0
4.8 3.1 1.6 0.2 0
5.4 3.4 1.5 0.4 0
5.2 4.1 1.5 0.1 0
5.5 4.2 1.4 0.2 0
4.9 3.1 1.5 0.2 0
5.0 3.2 1.2 0.2 0
5.5 3.5 1.3 0.2 0
4.9 3.6 1.4 0.1 0
4.4 3.0 1.3 0.2 0
5.1 3.4 1.5 0.2 0
5.0 3.5 1.3 0.3 0
4.5 2.3 1.3 0.3 0
4.4 3.2 1.3 0.2 0
5.0 3.5 1.6 0.6 0
5.1 3.8 1.9 0.4 0
4.8 3.0 1.4 0.3 0
5.1 3.8 1.6 0.2 0
4.6 3.2 1.4 0.2 0
5.3 3.7 1.5 0.2 0
5.0 3.3 1.4 0.2 0
7.0 3.2 4.7 1.4 1
6.4 3.2 4.5 1.5 1
6.9 3.1 4.9 1.5 1
5.5 2.3 4.0 1.3 1
6.5 2.8 4.6 1.5 1
5.7 2.8 4.5 1.3 1
6.3 3.3 4.7 1.6 1
4.9 2.4 3.3 1.0 1
6.6 2.9 4.6 1.3 1
5.2 2.7 3.9 1.4 1
5.0 2.0 3.5 1.0 1
5.9 3.0 4.2 1.5 1
6.0 2.2 4.0 1.0 1
6.1 2.9 4.7 1.4 1
5.6 2.9 3.6 1.3 1
6.7 3.1 4.4 1.4 1
5.6 3.0 4.5 1.5 1
5.8 2.7 4.1 1.0 1
6.2 2.2 4.5 1.5 1
5.6 2.5 3.9 1.1 1
5.9 3.2 4.8 1.8 1
6.1 2.8 4.0 1.3 1
6.3 2.5 4.9 1.5 1
6.1 2.8 4.7 1.2 1
6.4 2.9 4.3 1.3 1
6.6 3.0 4.4 1.4 1
6.8 2.8 4.8 1.4 1
6.7 3.0 5.0 1.7 1
6.0 2.9 4.5 1.5 1
5.7 2.6 3.5 1.0 1
5.5 2.4 3.8 1.1 1
5.5 2.4 3.7 1.0 1
5.8 2.7 3.9 1.2 1
6.0 2.7 5.1 1.6 1
5.4 3.0 4.5 1.5 1
6.0 3.4 4.5 1.6 1
6.7 3.1 4.7 1.5 1
6.3 2.3 4.4 1.3 1
5.6 3.0 4.1 1.3 1
5.5 2.5 4.0 1.3 1
5.5 2.6 4.4 1.2 1
6.1 3.0 4.6 1.4 1
5.8 2.6 4.0 1.2 1
5.0 2.3 3.3 1.0 1
5.6 2.7 4.2 1.3 1
5.7 3.0 4.2 1.2 1
5.7 2.9 4.2 1.3 1
6.2 2.9 4.3 1.3 1
5.1 2.5 3.0 1.1 1
5.7 2.8 4.1 1.3 1
6.3 3.3 6.0 2.5 2
5.8 2.7 5.1 1.9 2
7.1 3.0 5.9 2.1 2
6.3 2.9 5.6 1.8 2
6.5 3.0 5.8 2.2 2
7.6 3.0 6.6 2.1 2
4.9 2.5 4.5 1.7 2
7.3 2.9 6.3 1.8 2
6.7 2.5 5.8 1.8 2
7.2 3.6 6.1 2.5 2
6.5 3.2 5.1 2.0 2
6.4 2.7 5.3 1.9 2
6.8 3.0 5.5 2.1 2
5.7 2.5 5.0 2.0 2
5.8 2.8 5.1 2.4 2
6.4 3.2 5.3 2.3 2
6.5 3.0 5.5 1.8 2
7.7 3.8 6.7 2.2 2
7.7 2.6 6.9 2.3 2
6.0 2.2 5.0 1.5 2
6.9 3.2 5.7 2.3 2
5.6 2.8 4.9 2.0 2
7.7 2.8 6.7 2.0 2
6.3 2.7 4.9 1.8 2
6.7 3.3 5.7 2.1 2
7.2 3.2 6.0 1.8 2
6.2 2.8 4.8 1.8 2
6.1 3.0 4.9 1.8 2
6.4 2.8 5.6 2.1 2
7.2 3.0 5.8 1.6 2
7.4 2.8 6.1 1.9 2
7.9 3.8 6.4 2.0 2
6.4 2.8 5.6 2.2 2
6.3 2.8 5.1 1.5 2
6.1 2.6 5.6 1.4 2
7.7 3.0 6.1 2.3 2
6.3 3.4 5.6 2.4 2
6.4 3.1 5.5 1.8 2
6.0 3.0 4.8 1.8 2
6.9 3.1 5.4 2.1 2
6.7 3.1 5.6 2.4 2
6.9 3.1 5.1 2.3 2
5.8 2.7 5.1 1.9 2
6.8 3.2 5.9 2.3 2
6.7 3.3 5.7 2.5 2
6.7 3.0 5.2 2.3 2
6.3 2.5 5.0 1.9 2
6.5 3.0 5.2 2.0 2
6.2 3.4 5.4 2.3 2
5.9 3.0 5.1 1.8 2

ia/tp1/main.py Normal file (+153 lines)

@@ -0,0 +1,153 @@
import numpy as np
import matplotlib.pyplot as plt
# step 01 ---------------------------------------
# args
# - filename: string filename
# return
# - x: data matrix of size (n,p)
# - y: array of integer labels of size n
def load_data(filename):
x = np.loadtxt(filename)
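    # each row of iris.csv: four measurement columns followed by an integer class label (0, 1 or 2)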
y = x[:,4].astype(int)
x = x[:,:-1]
return x, y
# step 02 ---------------------------------------
# args
# - x: data matrix of size (n,p)
# return
# - x: center-normalized input
def standardize(x: np.ndarray):
def normalize_col(column: np.ndarray):
avg = column.mean()
avg_dist_to_avg = column.std()
return [ (elt - avg) / avg_dist_to_avg for elt in column ]
x = np.array([ normalize_col(column) for column in x.T ]).T
return x
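# illustrative sanity check (not part of the pipeline): every standardized column
# should end up with mean ~0 and standard deviation ~1, e.g.
#   z = standardize(np.random.rand(10, 3))
#   assert np.allclose(z.mean(axis=0), 0) and np.allclose(z.std(axis=0), 1)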
# step 03 ---------------------------------------
# args
# - x: standardized data matrix of size (n,p)
# return
# - C: covariance matrix of size (p,p)
def compute_covariance_matrix(x: np.ndarray):
# def cov_cols(col_a: np.ndarray, col_b: np.ndarray):
# return sum(a * b for (a, b) in zip(col_a, col_b)) / len(col_a)
# C = np.array([[ cov_cols(col_a, col_b) for col_b in x.T] for col_a in x.T])
# return C
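    # for standardized input (unit variance per column), this biased covariance
    # matrix is also the correlation matrix, so its diagonal is all 1.0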
return np.cov(x.T, bias=True)
# step 04 ---------------------------------------
# args
# - C: covariance matrix of size (p,p)
# return
# - eigenval: eigenvalues of size p
# - eigenvec: eigenvectors of size (p,p)
def compute_eigenelements(C: np.ndarray):
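    # np.linalg.eig returns eigenvalues in no guaranteed order, with the eigenvectors
    # as columns; since C is symmetric, np.linalg.eigh would be a valid alternative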
return np.linalg.eig(C)
# step 05 ---------------------------------------
# args
# - eigenval: eigenvalues of size p
# - eigenvec: eigenvectors of size (p,p)
# return
# - eigenval: ordered eigenvalues of size p
# - eigenvec: ordered eigenvectors of size (p,p)
def sort_eigenelements(eigenval: np.ndarray, eigenvec: np.ndarray):
    # order by decreasing eigenvalue; eigenvectors are stored as columns of eigenvec
    order = np.argsort(eigenval)[::-1]
    return (eigenval[order], eigenvec[:, order])
# step 06 ---------------------------------------
# args
# - eigenval: eigenvalues of size p
# returns
# - var: variances ratio explained by each components (of size p)
# - var_accu: cumulated variances ratio (of size p)
def compute_var_ratio(eigenval: np.ndarray):
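    # e.g. eigenval = [3.0, 1.0] gives var = [0.75, 0.25] and var_accu = [0.75, 1.0]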
total = np.sum(eigenval)
var = np.divide(eigenval, total)
var_accu = np.cumsum(var)
return (var, var_accu)
# step 08 ---------------------------------------
# args
# - x: standardized data matrix of size (n,p)
# - eigenvec: sorted eigenvectors (of size (p,p))
# return
# - x: projected data of size (n,2)
def project_on_2pc(x: np.ndarray, eigenvec: np.ndarray):
projected = x @ eigenvec
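    # assumes eigenvec columns are already sorted by decreasing eigenvalue, so the
    # first two columns of the projection are the PC1 / PC2 coordinates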
_2d = projected[:,:2]
return _2d
# step 11 ---------------------------------------
# args
# - x: standardized data matrix of size (n,p)
# return
# - y_pred: labels of size (n)
def classify(x: np.ndarray):
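    # illustrative: PC1 values [-2.0, 0.0, 2.0] map to classes [0, 1, 2]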
def class_n(n: float):
if (n < -1): return 0
if (n < 1): return 1
return 2
return np.array([ class_n(n) for n in x[:,0] ])
# step 12 ---------------------------------------
# args
# - y: ground-truth labels (of size (n))
# - y_pred: predicted labels (of size (n))
# return
# - conf: confusion matrix of integer of size (p,p)
def compute_confusion_matrix(y: np.ndarray, y_pred: np.ndarray):
    size = max(y) + 1
    res = np.zeros((size, size)).astype(np.int32)
    # rows index the ground-truth class, columns the predicted class
    for (truth, pred) in zip(y, y_pred): res[truth, pred] += 1
    return res
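# e.g. y = [0, 0, 1, 2] and y_pred = [0, 1, 1, 2] give [[1, 1, 0], [0, 1, 0], [0, 0, 1]]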
# step 13 ---------------------------------------
# args
# - conf: confusion matrix of integer of size (p,p)
# return
# - prec: average precision over all classes
# - rec: average recall over all classes
def compute_metric(conf: np.ndarray):
    # precision of a class: correct predictions over everything predicted as that class (column)
    def precision(kind: int, mat: np.ndarray):
        return mat[kind, kind] / mat[:, kind].sum()
    # recall of a class: correct predictions over all ground-truth members of that class (row)
    def recall(kind: int, mat: np.ndarray):
        return mat[kind, kind] / mat[kind, :].sum()
    classes = range(len(conf))
    prec = np.average([ precision(kind, conf) for kind in classes ])
    rec = np.average([ recall(kind, conf) for kind in classes ])
    return (prec, rec)
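# e.g. compute_metric(np.array([[2, 1, 1], [0, 4, 0], [0, 1, 3]])) is roughly (0.806, 0.75)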
if __name__ == '__main__':
# main program ------------------------------
values, kind = load_data("./iris.csv")
standardized = standardize(values)
covariance = compute_covariance_matrix(standardized)
eigenval, eigenvec = compute_eigenelements(covariance)
eigenval_sorted, eigenvec_sorted = sort_eigenelements(eigenval, eigenvec)
variance, variance_accumulated = compute_var_ratio(eigenval_sorted)
projected = project_on_2pc(standardized, eigenvec_sorted)
predictions = classify(projected)
# draw
plt.figure()
plt.scatter(projected[:,0], projected[:,1], c=predictions)
plt.show()
confusion = compute_confusion_matrix(kind, predictions)
# -----------------------------------
plt.show()
# -----------------------------------

ia/tp1/test_.py Normal file (+225 lines)

@@ -0,0 +1,225 @@
import numpy as np
import main
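# these tests are collected by pytest, e.g. run `pytest test_.py` from ia/tp1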
def test_step_01():
test = main.load_data('iris.csv')
assert test is not None
x,y = test
assert type(x) == np.ndarray
assert type(y) == np.ndarray
assert x.shape == (150,4)
assert y.shape == (150,)
assert x.dtype == np.float32 or x.dtype == np.float64
assert y.dtype == np.int32 or y.dtype == np.int64
def test_step_02():
x = np.array([
[59.76270078546494,93.0378732744839,70.55267521432877,],
[58.976636599379376,34.73095986778094,79.17882261333122,],
[37.5174422525385,128.35460015641596,142.73255210020585,],
[26.68830376515554,108.34500761653291,55.778983950580894,],
[63.60891221878646,135.1193276585322,-35.79278836042261,],
[-32.57414005969186,-45.956320511934855,116.5239691095876,],
[105.6313501899701,124.00242964936382,145.7236684465528,],
[109.83171284334472,42.29587245058637,106.10583525729109,],
[-26.345114826213354,77.98420426550476,-21.32934251819072,],
[138.93378340991677,54.36966435001433,32.932387998104716,],
])
x_ref = np.array([
[0.10485104208319312,0.3362373314630696,0.021964359573702855,],
[0.09002614318807045,-0.7645767522567821,0.166375923375568,],
[-0.31468684615052994,1.0030047535583193,1.2303379722621464,],
[-0.518920655600867,0.625230661423531,-0.22536411278355217,],
[0.17738926051062492,1.1307204372014275,-1.7583802123078445,],
[-1.636589865740423,-2.2879243182562656,0.7915763467485558,],
[0.9699179181914451,0.9208373001811287,1.2804126775864766,],
[1.0491353062386208,-0.6217538548213571,0.6171648821726928,],
[-1.5191126025141926,0.05202933813039103,-1.5162456027970177,],
[1.5979902997940605,-0.39380489662345974,-0.607842233830729,],
])
x_test = main.standardize(x)
assert x_test is not None
error = np.max(np.abs(x_ref - x_test))
assert x_test.shape == x_ref.shape
assert error < 1e-6
def test_step_03():
x = np.array([
[-1.105742368566682,1.2447972510784915,-0.10292263179401698,],
[0.5207869535408698,-1.2813536677837922,0.5399586812056,],
[0.7544895471006708,0.7187982077571432,1.838230783931978,],
[1.1276950515239543,-0.14201644595920881,-0.1790339466010727,],
[1.2123239446742684,-1.1427957147947074,0.7355527607135005,],
[1.0678403424190925,-0.6406810309665311,-1.405619490464019,],
[-0.8334425979838738,-0.12796234478939209,0.3511025041846954,],
[-0.17414992612560526,1.960866758149327,-1.5126361666589871,],
[-1.4037672060254274,-0.80477781558321,0.6811779598355046,],
[-1.1660337405572676,0.2151248028918804,-0.9458104543531817,],
])
C_ref = np.array([
[0.9999999999999998,-0.30871041083062684,0.10882239075522164,],
[-0.30871041083062684,1.0,-0.3175955960839056,],
[0.10882239075522164,-0.3175955960839056,1.0,],
])
C_test = main.compute_covariance_matrix(x)
assert C_test is not None
error = np.max(np.abs(C_ref - C_test))
assert C_test.shape == (3,3)
assert error < 1e-6
def test_step_04():
C = np.array([
[0.9999999999999998,-0.30871041083062684,0.10882239075522164,],
[-0.30871041083062684,1.0,-0.3175955960839056,],
[0.10882239075522164,-0.3175955960839056,1.0,],
])
eigval_ref = np.array([1.5006348637289717,0.8912274127999972,0.6081377234710301,])
eigvec_ref = np.array([
[-0.524750256529313,0.7183989075784977,0.4566619952026235,],
[0.6626030910538614,0.007925719552699393,0.7489287861308581,],
[-0.5344102469061465,-0.6955862222324367,0.4801723601396883,],
])
test = main.compute_eigenelements(C)
assert test is not None
eigval_test, eigvec_test = test
assert eigval_test.shape == (3,)
assert eigvec_test.shape == (3,3)
error = np.max(np.abs(eigval_ref - eigval_test))
assert error < 1e-6
error = np.max(np.abs(eigvec_ref - eigvec_test))
assert error < 1e-6
def test_step_05():
eigval = np.array([1.0, 3.0, 2.0])
eigvec = np.array([
[1.0, 3.0, 2.0],
[1.1, 3.1, 2.1],
[1.2, 3.2, 2.2],
])
eigval_ref = np.array([3.0, 2.0, 1.0])
eigvec_ref = np.array([
[3.0, 2.0, 1.0],
[3.1, 2.1, 1.1],
[3.2, 2.2, 1.2],
])
test = main.sort_eigenelements(eigval, eigvec)
assert test is not None
eigval_test, eigvec_test = test
assert eigval_test.shape == (3,)
assert eigvec_test.shape == (3,3)
error = np.max(np.abs(eigval_ref - eigval_test))
assert error < 1e-6
error = np.max(np.abs(eigvec_ref - eigvec_test))
assert error < 1e-6
def test_step_06():
eigval = np.array([0.9767610881903371,0.9764594650133958,0.8379449074988039,0.8209932298479351,0.7392635793983017,0.6563295894652734,0.604845519745046,0.4686512016477016,0.3687251706609641,0.29614019752214493,0.2828069625764096,0.1965823616800535,0.15896958364551972,0.1381829513486138,0.1201965612131689,0.11872771895424405,0.11037514116430513,0.09710127579306127,0.09609840789396307,0.039187792254320675,])
var_ref = np.array([0.12052317179600819,0.12048595432040264,0.10339455498701101,0.1013028767020668,0.09121820316104898,0.08098492540533334,0.07463227330373827,0.05782716978748539,0.045497233280881885,0.03654092728836284,0.03489573094977952,0.024256422614793195,0.019615358015076186,0.017050482237702617,0.014831130121310949,0.014649888740942673,0.013619258856022274,0.011981388166988682,0.011857643659193383,0.004835406605851003,])
var_accu_ref = np.array([0.12052317179600819,0.24100912611641084,0.34440368110342184,0.4457065578054886,0.5369247609665376,0.6179096863718709,0.6925419596756092,0.7503691294630945,0.7958663627439764,0.8324072900323393,0.8673030209821188,0.891559443596912,0.9111748016119882,0.9282252838496908,0.9430564139710018,0.9577063027119445,0.9713255615679667,0.9833069497349554,0.9951645933941488,0.9999999999999998,])
test = main.compute_var_ratio(eigval)
assert test is not None
    var_test, var_accu_test = test
    assert var_test.shape == eigval.shape
    assert var_accu_test.shape == eigval.shape
    error = np.max(np.abs(var_ref - var_test))
assert error < 1e-6
error = np.max(np.abs(var_accu_ref - var_accu_test))
assert error < 1e-6
# def test_step_07():
# pass
def test_step_08():
x = np.array([
[-0.5692272135195057,-0.03821691441797431,-1.391429749664207,],
[0.8349483159428722,0.5897352401152248,-0.7587336993772015,],
[0.2004294637899314,-1.358613577394327,0.21764397273661457,],
[1.722938694045615,-0.43267593685524763,0.5052024345091827,],
[-1.2673436013591408,1.2069175815704556,-0.6832265505709595,],
[-1.0746396605693356,0.671812065430423,-1.529889431394962,],
[1.3466454025825358,-1.726489239590098,0.5379189375433167,],
[-0.7491139962207634,1.2846882183405577,1.4319721238227643,],
[-0.8288106401604025,0.6291253952950716,0.26824732034137383,],
[0.38417323546819404,-0.8262828324940856,1.4022946420540787,],
])
eigvec = np.array([
[-0.6432311709554334,0.725040765848229,],
[0.6234976274295615,0.6825615590553287,],
[-0.44441463668629316,-0.09179109955857746,],
])
proj_ref = np.array([
[0.9606882782960829,-0.3110774648390586,],
[0.16782610142261073,1.077547171895806,],
[-1.072738987857223,-0.8019956491849984,],
[-1.602539650115925,0.9074997412514691,],
[1.8713410365637415,-0.032366112869744654,],
[1.7900202119211768,-0.18017423862691137,],
[-2.1817252930269992,-0.2514385438029105,],
[0.6464641579366405,0.20229831190009098,],
[0.8061619944812769,-0.1961274071189318,],
[-1.385497849621383,-0.41416580860481034,],
])
proj_test = main.project_on_2pc(x, eigvec)
assert proj_test is not None
assert proj_test.shape == (len(x), 2)
error = np.max(np.abs(proj_ref - proj_test))
assert error < 1e-6
# def test_step_09():
# pass
# def test_step_10():
# pass
def test_step_11():
x = np.array([
[-2.0, -100.0],
[-2.0, +100.0],
[ 0.0, -100.0],
[ 0.0, +100.0],
[+2.0, -100.0],
[+2.0, +100.0],
])
y_ref = np.array([0,0, 1,1, 2,2], dtype=int)
y_test = main.classify(x)
assert y_test is not None
assert y_test.shape == (6,)
assert y_test.dtype == np.int32 or y_test.dtype == np.int64
# print(y_ref, y_test)
# assert all(y_ref[i] == y_test[i] for i in range(len(x)))
error = np.max(np.abs(y_ref - y_test))
assert error == 0
def test_step_12():
y = np.array([0,0,0,0, 1,1,1,1, 2,2,2,2])
y_pred = np.array([0,0,1,2, 1,1,1,1, 2,2,2,1])
conf_ref = np.array([
[2,1,1,],
[0,4,0,],
[0,1,3,],
], dtype=int)
conf_test = main.compute_confusion_matrix(y, y_pred)
assert conf_test is not None
assert conf_test.shape == (3,3)
assert conf_test.dtype == np.int32 or conf_test.dtype == np.int64
error = np.max(np.abs(conf_ref - conf_test))
assert error == 0
def test_step_13():
conf = np.array([
[2,1,1,],
[0,4,0,],
[0,1,3,],
], dtype=int)
prec_ref = 0.8055555555555555
rec_ref = 0.75
test = main.compute_metric(conf)
assert test is not None
prec_test, rec_test = test
error = np.max(np.abs(prec_ref - prec_test))
assert error < 1e-6
error = np.max(np.abs(rec_ref - rec_test))
assert error < 1e-6
# def test_step_14():
# pass