#!/usr/bin/env -S python
import pandas
import numpy
import seaborn
import matplotlib.pyplot
import sklearn.model_selection
import sklearn.linear_model
import sklearn.ensemble
import sklearn.metrics
DATA_COLUMNS = [ "fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol", "quality" ]
HAHA_FUNI_NUMBER = 42
def main():
    wine_quality = q_01_01_load_data("./winequality-red.csv")
    # q_01_02_print_raw_data(wine_quality)
    #
    # > __q_02_print_raw_data__
    # >    fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
    # > 0            7.4             0.700         0.00  ...       0.56      9.4        5
    # > 1            7.8             0.880         0.00  ...       0.68      9.8        5
    # > ...          ...
    #
    # note:
    # - The "citric acid" (2) and "quality" (11) columns hold discrete values, i.e. category indices.
    # - The other columns are numeric.
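    # Aside (my own hedged sketch, not part of the assignment): a quick way to
    # check which columns are discrete is to count the distinct values per column.
    # print(wine_quality.nunique())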
    # q_01_03_plot_raw_data(wine_quality)
    #
    # note:
    # No correlation stands out.
    (quality_categories, parameters) = q_02_01_split_set(wine_quality)
    # print("quality_categories", quality_categories.shape, "parameters", parameters.shape)
    #
    # > quality_categories (1599, 1) parameters (1599, 11)
    (training_set, validation_set) = q_02_02_model_sets(parameters, quality_categories)
    # print("training_set", training_set[0].shape, "validation_set", validation_set[0].shape)
    #
    # > training_set (1199, 11) validation_set (400, 11)
    linear_model = q_02_03_train_linear_model(*training_set)
    linear_accuracy = q_02_04_evaluate_error(linear_model, *validation_set)
    # print("linear_accuracy", linear_accuracy)
    #
    # > linear_accuracy 0.38830173868689244
    # q_02_05_print_one_prediction(linear_model, training_set[0])
    #
    # > q_02_05_print_one_prediction 5.23381934012872
    #
    # note:
    # We are after discrete values in order to categorise, so the nature of this
    # prediction (a continuous value) is wrong.
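    # Hedged sketch (my own aside): one way to coerce the regression into a
    # classification is to round its output to the nearest grade before scoring.
    # rounded = numpy.round(linear_model.predict(validation_set[0]))
    # print(sklearn.metrics.accuracy_score(validation_set[1], rounded))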
    forest_model = q_03_01_train_forest_model(*training_set)
    forest_training_accuracy = q_03_02_evaluate_error(forest_model, *training_set)
    forest_validation_accuracy = q_03_02_evaluate_error(forest_model, *validation_set)
    # print("forest_training_accuracy", forest_training_accuracy, "forest_validation_accuracy", forest_validation_accuracy)
    #
    # > forest_training_accuracy 0.0 forest_validation_accuracy 0.42
    #
    # note:
    # The model appears over-fitted to the training set.
    confusion = q_03_03_confusion_matrix(forest_model, *validation_set)
    # print("confusion", confusion)
    #
    # > confusion [[  0   0   1   0   0   0]
    # >            [  0   0   5   8   0   0]
    # >            [  0   0 123  39   2   0]
    # >            [  0   0  37 122  10   0]
    # >            [  0   0   0  24  23   1]
    # >            [  0   0   0   1   4   0]]
    #
    # note:
    # Instances of the statistically well-represented classes are mostly evaluated
    # correctly. However, the model tends to push the under-represented values
    # into the heavily represented categories.
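    # To see the imbalance this note refers to (my own hedged aside): count the
    # validation samples per quality grade.
    # print(wine_quality["quality"].value_counts().sort_index())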
    training_alternate_expects_set = q_04_01_reclassify_expects(training_set[1])
    validation_alternate_expects_set = q_04_01_reclassify_expects(validation_set[1])
    alt_forest_model = q_04_02_train_forest_model(training_set[0], training_alternate_expects_set)
    alt_forest_training_accuracy = q_04_03_evaluate_error(alt_forest_model, training_set[0], training_alternate_expects_set)
    alt_forest_validation_accuracy = q_04_03_evaluate_error(alt_forest_model, validation_set[0], validation_alternate_expects_set)
    # print("alt_forest_training_accuracy", alt_forest_training_accuracy, "alt_forest_validation_accuracy", alt_forest_validation_accuracy)
    #
    # > alt_forest_training_accuracy 0.0 alt_forest_validation_accuracy 0.085
    #
    # note:
    # The model scores much better on a simpler task and on a set with a more
    # uniform class representation. The model does not suffer from over-fitting.
    alt_confusion = q_04_04_confusion_matrix(alt_forest_model, validation_set[0], validation_alternate_expects_set)
    # print("alt_confusion", alt_confusion)
    #
    # > alt_confusion [[338   9]
    # >                [ 25  28]]
    #
    # Tip top.
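    # Reading the binary confusion matrix above (my own worked aside): the model
    # gets (338 + 28) / 400 = 0.915 of the validation samples right, which matches
    # the 0.085 mean squared error, since on 0/1 labels the MSE equals the error rate.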
    q_05_01_read_doc()
    search_grid = q_05_02_make_search_grid(alt_forest_model)
    q_05_03_k()
    # best_estimator = q_05_04_get_best_params(search_grid, training_set[0], training_alternate_expects_set)
    # print("best_estimator", best_estimator)
    #
    # > best_estimator RandomForestClassifier(bootstrap=False, min_samples_leaf=2, n_estimators=200, random_state=42)
    # alternative definition for 'best_estimator'
    best_estimator = sklearn.ensemble.RandomForestClassifier(bootstrap=False, min_samples_leaf=2, n_estimators=200, random_state=42).fit(training_set[0], training_alternate_expects_set)
    best_estimator_accuracy = q_05_05_evaluate_error(best_estimator, validation_set[0], validation_alternate_expects_set)
    # print("best_estimator_accuracy", best_estimator_accuracy)
    #
    # > best_estimator_accuracy 0.0975
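    # Note that "accuracy" throughout is really the mean squared error returned by
    # the q_*_evaluate_error helpers. A hedged sketch of a direct accuracy reading:
    # print(sklearn.metrics.accuracy_score(validation_alternate_expects_set, best_estimator.predict(validation_set[0])))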
def q_01_01_load_data(path: str):
    return pandas.read_csv(path, sep=';')

def q_01_02_print_raw_data(data: pandas.DataFrame):
    print("__q_02_print_raw_data__\n", data)

def q_01_03_plot_raw_data(data: pandas.DataFrame):
    seaborn.pairplot(data)
    matplotlib.pyplot.show()

def q_02_01_split_set(data: pandas.DataFrame):
    quality_categories = data["quality"]
    parameters_columns = [name for name in DATA_COLUMNS if name != "quality"]
    parameters = data[parameters_columns]
    return (quality_categories, parameters)
def q_02_02_model_sets(parameters: pandas.DataFrame, expects: pandas.Series):
    (
        training_parameters_set, validation_parameters_set,
        training_expects_set, validation_expects_set
    ) = sklearn.model_selection.train_test_split(parameters, expects, random_state=HAHA_FUNI_NUMBER)
    assert type(training_parameters_set) is pandas.DataFrame
    assert type(validation_parameters_set) is pandas.DataFrame
    assert type(training_expects_set) is pandas.Series
    assert type(validation_expects_set) is pandas.Series
    return ((training_parameters_set, training_expects_set), (validation_parameters_set, validation_expects_set))
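# train_test_split keeps its default 25% hold-out here, so the 1599 rows split
# into 1199 training and 400 validation samples, matching the shapes logged in
# main() above.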
def q_02_03_train_linear_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.linear_model.LinearRegression()
    model.fit(training_parameters_set, training_expects_set)
    return model

def q_02_04_evaluate_error(
    model: sklearn.linear_model.LinearRegression,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    # Despite the "accuracy" naming at the call sites, this returns the mean
    # squared error of the predictions.
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error

def q_02_05_print_one_prediction(model: sklearn.linear_model.LinearRegression, parameter_set: pandas.DataFrame):
    parameters = parameter_set.head(1)
    [prediction, *_] = model.predict(parameters)
    print("q_02_05_print_one_prediction", prediction)
def q_03_01_train_forest_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.ensemble.RandomForestClassifier(random_state=HAHA_FUNI_NUMBER)
    model.fit(training_parameters_set, training_expects_set)
    return model

def q_03_02_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error

def q_03_03_confusion_matrix(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    confusion = sklearn.metrics.confusion_matrix(validation_expects_set, predictions)
    return confusion
def q_04_01_reclassify_expects(quality_series: pandas.Series):
    # Binarise the quality grades: >= 7 becomes class 1 (high quality), else 0.
    QUALITY_THRESHOLD = 7
    high_quality_series = quality_series.map(lambda quality: 1 if quality >= QUALITY_THRESHOLD else 0)
    return high_quality_series

def q_04_02_train_forest_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.ensemble.RandomForestClassifier(random_state=HAHA_FUNI_NUMBER)
    model.fit(training_parameters_set, training_expects_set)
    return model

def q_04_03_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error

def q_04_04_confusion_matrix(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    confusion = sklearn.metrics.confusion_matrix(validation_expects_set, predictions)
    return confusion
def q_05_01_read_doc():
    pass

def q_05_02_make_search_grid(base_model: sklearn.ensemble.RandomForestClassifier):
    param_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True, False]
    }
    grid_search = sklearn.model_selection.GridSearchCV(base_model, param_grid, n_jobs=-1)
    return grid_search
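# Sizing aside (my own note): the grid spans 3 * 4 * 3 * 3 * 2 = 216 parameter
# combinations, each evaluated under GridSearchCV's default 5-fold
# cross-validation, i.e. 1080 fits, hence n_jobs=-1 to use every core.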
def q_05_03_k():
    pass

def q_05_04_get_best_params(
    grid_search: sklearn.model_selection.GridSearchCV,
    training_parameters_set: pandas.DataFrame,
    training_expects_set: pandas.Series
):
    grid_search.fit(training_parameters_set, training_expects_set)
    return grid_search.best_estimator_

def q_05_05_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error

if __name__ == "__main__":
    main()