ia tp2
This commit is contained in:
parent
4f2f24fe7e
commit
03e18ac7e4
4 changed files with 1872 additions and 0 deletions
4
ia/tp2/.gitignore
vendored
Normal file
@@ -0,0 +1,4 @@
/__pycache__
/.pytest_cache
/env
/*.zip
262
ia/tp2/main.py
Executable file
@@ -0,0 +1,262 @@
#!/usr/bin/env -S python

import pandas
import numpy
import seaborn
import matplotlib.pyplot
import sklearn.model_selection
import sklearn.linear_model
import sklearn.ensemble
import sklearn.metrics

DATA_COLUMNS = [ "fixed acidity", "volatile acidity", "citric acid", "residual sugar", "chlorides", "free sulfur dioxide", "total sulfur dioxide", "density", "pH", "sulphates", "alcohol", "quality" ]
HAHA_FUNI_NUMBER = 42

def main():
    wine_quality = q_01_01_load_data("./winequality-red.csv")

    # q_01_02_print_raw_data(wine_quality)
    #
    # > __q_01_02_print_raw_data__
    # >    fixed acidity  volatile acidity  citric acid  ...  sulphates  alcohol  quality
    # > 0            7.4             0.700         0.00  ...       0.56      9.4        5
    # > 1            7.8             0.880         0.00  ...       0.68      9.8        5
    # > ...          ...
    #
    # note:
    # - The "citric acid" (2) and "quality" (11) columns hold discrete values, i.e. category indices.
    # - The other columns are numeric.

    # q_01_03_plot_raw_data(wine_quality)
    #
    # note:
    # No correlation is obvious.
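    #
    # A quick numeric sketch of the same observation (illustration only, reusing the
    # pandas/seaborn/matplotlib imports above):
    #
    #   correlations = wine_quality.corr()
    #   seaborn.heatmap(correlations, annot=True)
    #   matplotlib.pyplot.show()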


    (quality_categories, parameters) = q_02_01_split_set(wine_quality)
    # print("quality_categories", quality_categories.shape, "parameters", parameters.shape)
    #
    # > quality_categories (1599, 1) parameters (1599, 11)

    (training_set, validation_set) = q_02_02_model_sets(parameters, quality_categories)
    # print("training_set", training_set.shape, "validation_set", validation_set.shape)
    #
    # > training_set (1199, 11) validation_set (400, 11)

    linear_model = q_02_03_train_linear_model(*training_set)
    linear_accuracy = q_02_04_evaluate_error(linear_model, *validation_set)
    # print("linear_accuracy", linear_accuracy)
    #
    # > linear_accuracy 0.38830173868689244
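    #
    # note: despite the variable name, this value is the mean squared error computed by
    # q_02_04_evaluate_error, not a classification accuracy.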

    # q_02_05_print_one_prediction(linear_model, training_set[0])
    #
    # > q_02_05_print_one_prediction 5.23381934012872
    #
    # note:
    # We are looking for discrete values in order to categorise, so the nature of this prediction is wrong.
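    #
    # A minimal sketch of how the continuous output could be mapped back onto the discrete
    # quality scale (illustration only; the classifier used from question 3 onwards remains
    # the better fit for this task):
    #
    #   rounded = numpy.rint(linear_model.predict(validation_set[0]))
    #   discrete = numpy.clip(rounded, training_set[1].min(), training_set[1].max()).astype(int)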


    forest_model = q_03_01_train_forest_model(*training_set)
    forest_training_accuracy = q_03_02_evaluate_error(forest_model, *training_set)
    forest_validation_accuracy = q_03_02_evaluate_error(forest_model, *validation_set)
    # print("forest_training_accuracy", forest_training_accuracy, "forest_validation_accuracy", forest_validation_accuracy)
    #
    # > forest_training_accuracy 0.0 forest_validation_accuracy 0.42
    #
    # note:
    # The model appears to be overfitted to the training set.
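    #
    # A sketch of one way to check this without touching the validation set, reusing the
    # sklearn.model_selection import above (5-fold cross-validation on the training data):
    #
    #   cv_scores = sklearn.model_selection.cross_val_score(forest_model, *training_set, cv=5)
    #   print("cv_scores", cv_scores)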

    confusion = q_03_03_confusion_matrix(forest_model, *validation_set)
    # print("confusion", confusion)
    #
    # > confusion [[  0   0   1   0   0   0]
    # >            [  0   0   5   8   0   0]
    # >            [  0   0 123  39   2   0]
    # >            [  0   0  37 122  10   0]
    # >            [  0   0   0  24  23   1]
    # >            [  0   0   0   1   4   0]]
    #
    # note:
    # Occurrences of the statistically well-represented classes are almost all classified correctly.
    # However, the model tends to put the under-represented values into the well-represented categories.
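    #
    # Per-class rates make the same point; a sketch, assuming a scikit-learn version that
    # supports the normalize argument (>= 0.22):
    #
    #   rates = sklearn.metrics.confusion_matrix(validation_set[1], forest_model.predict(validation_set[0]), normalize="true")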


    training_alternate_expects_set = q_04_01_reclassify_expects(training_set[1])
    validation_alternate_expects_set = q_04_01_reclassify_expects(validation_set[1])
    alt_forest_model = q_04_02_train_forest_model(training_set[0], training_alternate_expects_set)
    alt_forest_training_accuracy = q_04_03_evaluate_error(alt_forest_model, training_set[0], training_alternate_expects_set)
    alt_forest_validation_accuracy = q_04_03_evaluate_error(alt_forest_model, validation_set[0], validation_alternate_expects_set)
    # print("alt_forest_training_accuracy", alt_forest_training_accuracy, "alt_forest_validation_accuracy", alt_forest_validation_accuracy)
    #
    # > alt_forest_training_accuracy 0.0 alt_forest_validation_accuracy 0.085
    #
    # note:
    # The model gets a much better score on a simpler task and on a set with a more uniform representation.
    # The model does not suffer from overfitting.

    alt_confusion = q_04_04_confusion_matrix(alt_forest_model, validation_set[0], validation_alternate_expects_set)
    # print("alt_confusion", alt_confusion)
    #
    # > alt_confusion [[338   9]
    # >                [ 25  28]]
    #
    # Tip top.
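    #
    # With only 53 positive examples in this validation set, precision and recall are worth a
    # look alongside the raw error; a sketch using the sklearn.metrics import above:
    #
    #   report = sklearn.metrics.classification_report(validation_alternate_expects_set, alt_forest_model.predict(validation_set[0]))
    #   print(report)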


    q_05_01_read_doc()
    search_grid = q_05_02_make_search_grid(alt_forest_model)
    q_05_03_k()
    # best_estimator = q_05_04_get_best_params(search_grid, training_set[0], training_alternate_expects_set)
    # print("best_estimator", best_estimator)
    #
    # > best_estimator RandomForestClassifier(bootstrap=False, min_samples_leaf=2, n_estimators=200, random_state=42)

    # alternative definition for 'best_estimator'
    best_estimator = sklearn.ensemble.RandomForestClassifier(bootstrap=False, min_samples_leaf=2, n_estimators=200, random_state=42).fit(training_set[0], training_alternate_expects_set)
    best_estimator_accuracy = q_05_05_evaluate_error(best_estimator, validation_set[0], validation_alternate_expects_set)
    # print("best_estimator_accuracy", best_estimator_accuracy)
    #
    # > best_estimator_accuracy 0.0975



def q_01_01_load_data(path: str):
    return pandas.read_csv(path, sep=';')

def q_01_02_print_raw_data(data: pandas.DataFrame):
    print("__q_01_02_print_raw_data__\n", data)

def q_01_03_plot_raw_data(data: pandas.DataFrame):
    seaborn.pairplot(data)
    matplotlib.pyplot.show()


def q_02_01_split_set(data: pandas.DataFrame):
    quality_categories = data["quality"]
    parameters_columns = [ name for name in DATA_COLUMNS if name != "quality" ]
    parameters = data[parameters_columns]
    return (quality_categories, parameters)


def q_02_02_model_sets(parameters: pandas.DataFrame, expects: pandas.Series):
    (
        training_parameters_set, validation_parameters_set,
        training_expects_set, validation_expects_set
    ) = sklearn.model_selection.train_test_split(parameters, expects, random_state=HAHA_FUNI_NUMBER)

    assert type(training_parameters_set) is pandas.DataFrame
    assert type(validation_parameters_set) is pandas.DataFrame
    assert type(training_expects_set) is pandas.Series
    assert type(validation_expects_set) is pandas.Series
    return ((training_parameters_set, training_expects_set), (validation_parameters_set, validation_expects_set))
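
# A possible variant of q_02_02_model_sets (sketch only): given the class imbalance noted in
# question 3, the split could also be stratified on the labels, e.g.
#
#   sklearn.model_selection.train_test_split(parameters, expects, random_state=HAHA_FUNI_NUMBER, stratify=expects)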

def q_02_03_train_linear_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.linear_model.LinearRegression()
    model.fit(training_parameters_set, training_expects_set)
    return model

def q_02_04_evaluate_error(
    model: sklearn.linear_model.LinearRegression,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error

def q_02_05_print_one_prediction(model: sklearn.linear_model.LinearRegression, parameter_set: pandas.DataFrame):
    parameters = parameter_set.head(1)
    [prediction, *_] = model.predict(parameters)
    print("q_02_05_print_one_prediction", prediction)


def q_03_01_train_forest_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.ensemble.RandomForestClassifier(random_state=HAHA_FUNI_NUMBER)
    model.fit(training_parameters_set, training_expects_set)
    return model

def q_03_02_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error

def q_03_03_confusion_matrix(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    confusion = sklearn.metrics.confusion_matrix(validation_expects_set, predictions)
    return confusion


def q_04_01_reclassify_expects(quality_series: pandas.Series):
    QUALITY_THRESHOLD = 7
    high_quality_series = quality_series.map(lambda quality: 1 if quality >= QUALITY_THRESHOLD else 0)
    return high_quality_series

def q_04_02_train_forest_model(training_parameters_set: pandas.DataFrame, training_expects_set: pandas.Series):
    model = sklearn.ensemble.RandomForestClassifier(random_state=HAHA_FUNI_NUMBER)
    model.fit(training_parameters_set, training_expects_set)
    return model

def q_04_03_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error

def q_04_04_confusion_matrix(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    confusion = sklearn.metrics.confusion_matrix(validation_expects_set, predictions)
    return confusion


def q_05_01_read_doc():
    pass

def q_05_02_make_search_grid(base_model: sklearn.ensemble.RandomForestClassifier):
    param_grid = {
        "n_estimators": [100, 200, 300],
        "max_depth": [None, 10, 20, 30],
        "min_samples_split": [2, 5, 10],
        "min_samples_leaf": [1, 2, 4],
        "bootstrap": [True, False]
    }
    grid_search = sklearn.model_selection.GridSearchCV(base_model, param_grid, n_jobs=-1)
    return grid_search
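
# Usage note on q_05_02_make_search_grid (a sketch, assuming scikit-learn >= 0.22):
# GridSearchCV defaults to 5-fold cross-validation and the estimator's default scorer;
# both can be made explicit, e.g.
#
#   sklearn.model_selection.GridSearchCV(base_model, param_grid, n_jobs=-1, cv=5, scoring="accuracy")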

def q_05_03_k():
    pass

def q_05_04_get_best_params(
    grid_search: sklearn.model_selection.GridSearchCV,
    training_parameters_set: pandas.DataFrame,
    training_expects_set: pandas.Series
):
    grid_search.fit(training_parameters_set, training_expects_set)
    return grid_search.best_estimator_

def q_05_05_evaluate_error(
    model: sklearn.ensemble.RandomForestClassifier,
    validation_parameters_set: pandas.DataFrame,
    validation_expects_set: pandas.Series
):
    predictions = model.predict(validation_parameters_set)
    error = sklearn.metrics.mean_squared_error(validation_expects_set, predictions)
    return error


if __name__ == "__main__": main()
6
ia/tp2/requirements.txt
Normal file
@@ -0,0 +1,6 @@
pandas
numpy
seaborn
matplotlib
scikit-learn
PyQt5
1600
ia/tp2/winequality-red.csv
Normal file
File diff suppressed because it is too large