#!/usr/bin/env python3
import pandas
import numpy
import seaborn
import matplotlib.pyplot
import sklearn.model_selection
import sklearn.linear_model
import sklearn.ensemble
import sklearn.metrics

DATA_COLUMNS = [
    "fixed acidity", "volatile acidity", "citric acid", "residual sugar",
    "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density",
    "pH", "sulphates", "alcohol", "quality"
]

HAHA_FUNI_NUMBER = 42


def main():
    wine_quality = q_01_01_load_data("./winequality-red.csv")

    # q_01_02_print_raw_data(wine_quality)
    #
    # > __q_02_print_raw_data__
    # >    fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
    # > 0            7.4             0.700         0.00  ...       0.56      9.4        5
    # > 1            7.8             0.880         0.00  ...       0.68      9.8        5
    # > ...
    #
    # note:
    # - The "citric acid" (2) and "quality" (11) columns hold discrete values, i.e. category indices.
    # - The other columns are numeric.

    # q_01_03_plot_raw_data(wine_quality)
    #
    # note:
    # No correlation stands out.

    (quality_categories, parameters) = q_02_01_split_set(wine_quality)
    # print("quality_categories", quality_categories.shape, "parameters", parameters.shape)
    #
    # > quality_categories (1599,) parameters (1599, 11)

    (training_set, validation_set) = q_02_02_model_sets(parameters, quality_categories)
    # print("training_set", training_set[0].shape, "validation_set", validation_set[0].shape)
    #
    # > training_set (1199, 11) validation_set (400, 11)

    linear_model = q_02_03_train_linear_model(*training_set)

    linear_error = q_02_04_evaluate_error(linear_model, *validation_set)
    # print("linear_error", linear_error)
    #
    # > linear_error 0.38830173868689244

    # q_02_05_print_one_prediction(linear_model, training_set[0])
    #
    # > q_02_05_print_one_prediction 5.23381934012872
    #
    # note:
    # We want discrete values in order to categorize, so the nature of this
    # prediction (a continuous regression output) is wrong.

    forest_model = q_03_01_train_forest_model(*training_set)

    forest_training_error = q_03_02_evaluate_error(forest_model, *training_set)
    forest_validation_error = q_03_02_evaluate_error(forest_model, *validation_set)
    # print("forest_training_error", forest_training_error, "forest_validation_error", forest_validation_error)
    #
    # > forest_training_error 0.0 forest_validation_error 0.42
    #
    # note:
    # The model appears over-fitted to the training set.

    confusion = q_03_03_confusion_matrix(forest_model, *validation_set)
    # print("confusion", confusion)
    #
    # > confusion [[  0   0   1   0   0   0]
    # >            [  0   0   5   8   0   0]
    # >            [  0   0 123  39   2   0]
    # >            [  0   0  37 122  10   0]
    # >            [  0   0   0  24  23   1]
    # >            [  0   0   0   1   4   0]]
    #
    # note:
    # Occurrences of the statistically well-represented classes are mostly
    # classified correctly. However, the model tends to push under-represented
    # values into the strongly represented categories.
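    # Aside (not part of the original exercise): a minimal sketch of how the
    # imbalance noted above could be inspected and mitigated. value_counts()
    # and class_weight="balanced" are real pandas / scikit-learn APIs; that the
    # reweighting would actually help here is an assumption, not a tested result.
    #
    # print(quality_categories.value_counts().sort_index())
    # balanced_forest = sklearn.ensemble.RandomForestClassifier(
    #     class_weight="balanced", random_state=HAHA_FUNI_NUMBER
    # ).fit(*training_set)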
    training_alternate_expects_set = q_04_01_reclassify_expects(training_set[1])
    validation_alternate_expects_set = q_04_01_reclassify_expects(validation_set[1])

    alt_forest_model = q_04_02_train_forest_model(training_set[0], training_alternate_expects_set)

    alt_forest_training_error = q_04_03_evaluate_error(alt_forest_model, training_set[0], training_alternate_expects_set)
    alt_forest_validation_error = q_04_03_evaluate_error(alt_forest_model, validation_set[0], validation_alternate_expects_set)
    # print("alt_forest_training_error", alt_forest_training_error, "alt_forest_validation_error", alt_forest_validation_error)
    #
    # > alt_forest_training_error 0.0 alt_forest_validation_error 0.085
    #
    # note:
    # The model scores much better on a simpler task, with a set whose class
    # representation is more uniform.
    # The model does not suffer from over-fitting.

    alt_confusion = q_04_04_confusion_matrix(alt_forest_model, validation_set[0], validation_alternate_expects_set)
    # print("alt_confusion", alt_confusion)
    #
    # > alt_confusion [[338   9]
    # >                [ 25  28]]
    #
    # Tip top.

    q_05_01_read_doc()

    search_grid = q_05_02_make_search_grid(alt_forest_model)

    q_05_03_k()

    # best_estimator = q_05_04_get_best_params(search_grid, training_set[0], training_alternate_expects_set)
    # print("best_estimator", best_estimator)
    #
    # > best_estimator RandomForestClassifier(bootstrap=False, min_samples_leaf=2, n_estimators=200, random_state=42)

    # alternate definition for 'best_estimator'
    best_estimator = sklearn.ensemble.RandomForestClassifier(
        bootstrap=False, min_samples_leaf=2, n_estimators=200, random_state=42
    ).fit(training_set[0], training_alternate_expects_set)

    best_estimator_error = q_05_05_evaluate_error(best_estimator, validation_set[0], validation_alternate_expects_set)
    # print("best_estimator_error", best_estimator_error)
    #
    # > best_estimator_error 0.0975


def q_01_01_load_data(path: str):
    # The UCI red wine CSV is semicolon-separated.
    return pandas.read_csv(path, sep=';')


def q_01_02_print_raw_data(data: pandas.DataFrame):
    print("__q_02_print_raw_data__\n", data)


def q_01_03_plot_raw_data(data: pandas.DataFrame):
    seaborn.pairplot(data)
    matplotlib.pyplot.show()


def q_02_01_split_set(data: pandas.DataFrame):
    quality_categories = data["quality"]
    parameters_columns = [name for name in DATA_COLUMNS if name != "quality"]
    parameters = data[parameters_columns]
    return (quality_categories, parameters)


def q_02_02_model_sets(parameters: pandas.DataFrame, expects: pandas.Series):
    (
        training_parameters_set,
        validation_parameters_set,
        training_expects_set,
        validation_expects_set
    ) = sklearn.model_selection.train_test_split(parameters, expects, random_state=HAHA_FUNI_NUMBER)
    assert type(training_parameters_set) is pandas.DataFrame
    assert type(validation_parameters_set) is pandas.DataFrame
    assert type(training_expects_set) is pandas.Series
    assert type(validation_expects_set) is pandas.Series
    return ((training_parameters_set, training_expects_set), (validation_parameters_set, validation_expects_set))


def q_02_03_train_linear_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.linear_model.LinearRegression()
    model.fit(training_parameters_set, training_expects_set)
    return model


def q_02_04_evaluate_error(
    model: sklearn.linear_model.LinearRegression,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error
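# Aside (not part of the original exercise): the q_*_evaluate_error functions
# report mean squared error even for the classifiers. A minimal sketch of a
# complementary metric using the real sklearn.metrics.accuracy_score; the
# helper name is hypothetical and nothing in main() calls it:
def evaluate_accuracy(model, validation_parameters_set: pandas.DataFrame, validation_expects_set: pandas.Series):
    predictions = model.predict(validation_parameters_set)
    # fraction of exactly matching class labels, in [0, 1]
    return sklearn.metrics.accuracy_score(validation_expects_set, predictions)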
def q_02_05_print_one_prediction(model: sklearn.linear_model.LinearRegression, parameter_set: pandas.DataFrame):
    parameters = parameter_set.head(1)
    [prediction, *_] = model.predict(parameters)
    print("q_02_05_print_one_prediction", prediction)


def q_03_01_train_forest_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.ensemble.RandomForestClassifier(random_state=HAHA_FUNI_NUMBER)
    model.fit(training_parameters_set, training_expects_set)
    return model


def q_03_02_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error


def q_03_03_confusion_matrix(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    confusion = sklearn.metrics.confusion_matrix(validation_expects_set, predictions)
    return confusion


def q_04_01_reclassify_expects(quality_series: pandas.Series):
    # Binarize quality: 1 for "high quality" (>= 7), 0 otherwise.
    QUALITY_THRESHOLD = 7
    high_quality_series = quality_series.map(lambda quality: 1 if quality >= QUALITY_THRESHOLD else 0)
    return high_quality_series


def q_04_02_train_forest_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.ensemble.RandomForestClassifier(random_state=HAHA_FUNI_NUMBER)
    model.fit(training_parameters_set, training_expects_set)
    return model


def q_04_03_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error


def q_04_04_confusion_matrix(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    confusion = sklearn.metrics.confusion_matrix(validation_expects_set, predictions)
    return confusion


def q_05_01_read_doc():
    pass


def q_05_02_make_search_grid(base_model: sklearn.ensemble.RandomForestClassifier):
    param_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True, False]
    }
    grid_search = sklearn.model_selection.GridSearchCV(base_model, param_grid, n_jobs=-1)
    return grid_search


def q_05_03_k():
    pass


def q_05_04_get_best_params(
    grid_search: sklearn.model_selection.GridSearchCV,
    training_parameters_set: pandas.DataFrame,
    training_expects_set: pandas.Series
):
    grid_search.fit(training_parameters_set, training_expects_set)
    return grid_search.best_estimator_


def q_05_05_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error


if __name__ == "__main__":
    main()
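# Aside (not part of the original exercise): q_05_02_make_search_grid relies on
# GridSearchCV's defaults, i.e. 5-fold cross-validation (cv=None) scored with
# the classifier's accuracy. A minimal sketch of making the fold count explicit;
# cv=5 here is an assumption for illustration, not a value from the exercise:
#
# grid_search = sklearn.model_selection.GridSearchCV(base_model, param_grid, cv=5, n_jobs=-1)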