%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
data = pd.read_csv("su00ODC7qjw.tsv", sep="\t")
data.sample(10)
len(data)
# Drop rows with missing values, then one-hot encode the offer type (private / pro)
cleaned = data.dropna()
cleaned = cleaned.join(pd.get_dummies(cleaned.offres))
print(len(cleaned))
cleaned.sample(10)
cleaned.price.plot.density()
cleaned.km.plot.density()
cleaned.annee.plot.density()
cleaned.plot.scatter(x='km', y='price')
cleaned.plot.scatter(x='annee', y='price')
# Features: year, mileage, offer type dummies, postal code; target: price
X = cleaned[['annee', 'km', 'private', 'pro', 'cp']]
Y = cleaned['price']
# Based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html
import numpy as np
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
def plot_learning_curve(
        clf,
        title,
        X,
        y,
        train_sizes=np.linspace(.1, 1.0, 10),
        n_splits=20):
    plt.figure(figsize=(6, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.ylim(0, 1)
    # Random 80/20 train/test splits, repeated n_splits times
    cross_validation = ShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=0)
    train_sizes, train_scores, test_scores = learning_curve(
        clf, X, y, cv=cross_validation, n_jobs=-1, train_sizes=train_sizes, scoring="r2")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()
    # Shade one standard deviation around each mean curve
    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")
    plt.legend(loc="best")
    plt.show()
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
plot_learning_curve(clf, "Random Forest", X, Y)
from sklearn.ensemble import GradientBoostingRegressor
clf = GradientBoostingRegressor()
plot_learning_curve(clf, "Gradient Boosting", X, Y)
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
plot_learning_curve(clf, "Régression linéaire", X, Y)
Only the linear regression does not overfit: we can expect an R2 score between 0.25 and 0.60.
The other models tested overfit heavily: more examples would be needed to improve their score.
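As a quick check on the linear-regression estimate above, the cross-validated R2 can also be computed directly. This is a minimal sketch reusing the same X, Y and the same ShuffleSplit settings as the learning curves; the actual numbers depend on the dataset.
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import ShuffleSplit, cross_val_score

# Same validation scheme as plot_learning_curve: 20 random 80/20 splits
cv = ShuffleSplit(n_splits=20, test_size=0.2, random_state=0)
scores = cross_val_score(LinearRegression(), X, Y, cv=cv, scoring="r2", n_jobs=-1)
print("R2 mean=%.2f  min=%.2f  max=%.2f" % (scores.mean(), scores.min(), scores.max()))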