In [1]:
%matplotlib inline

import matplotlib.pyplot as plt
import pandas as pd

Reading and cleaning the data

In [2]:
data = pd.read_csv("su00ODC7qjw.tsv", sep="\t")
In [3]:
data.sample(10)
Out[3]:
id annee cc km offres price cp pubdate
67 1573628281 2011.0 800 17051.0 pro 4990 58640 2019-02-23 15:39:31
74 1573950796 2014.0 800 18000.0 private 6250 34000 2019-02-24 10:18:56
33 1570187977 2014.0 800 49600.0 private 3800 13012 2019-02-17 10:09:05
19 1557151437 2002.0 600 49672.0 private 2100 95460 2019-01-22 15:28:43
236 1579731238 2012.0 800 12200.0 private 4600 88200 2019-03-06 23:05:22
178 1577955315 2012.0 800 48665.0 private 4700 60350 2019-03-03 16:40:02
128 1576080560 2011.0 800 24800.0 private 4500 84380 2019-02-28 09:30:07
28 1568405546 2012.0 800 48400.0 pro 3890 68200 2019-02-13 14:36:49
25 1566137197 2012.0 800 21300.0 private 4000 81350 2019-02-09 12:29:28
62 1573413055 2011.0 800 17000.0 private 5300 38260 2019-02-23 10:10:38
In [4]:
len(data)
Out[4]:
252
In [5]:
cleaned = data.dropna()
# One-hot encode the seller type ("private" / "pro") into two indicator columns
cleaned = cleaned.join(pd.get_dummies(cleaned.offres))
print(len(cleaned))
cleaned.sample(10)
248
Out[5]:
id annee cc km offres price cp pubdate private pro
35 1570924757 2013.0 800 26850.0 private 5000 63820 2019-02-18 13:46:30 1 0
155 1577267113 2010.0 799 10200.0 private 4800 24120 2019-03-02 14:08:07 1 0
93 1574550520 2016.0 800 6000.0 private 6200 69007 2019-02-25 09:58:46 1 0
80 1574090805 2011.0 800 29000.0 private 5000 95810 2019-02-24 13:58:34 1 0
211 1578880101 2011.0 799 22200.0 private 4599 31000 2019-03-05 10:28:27 1 0
156 1577271409 2013.0 800 12300.0 private 4800 13720 2019-03-02 14:19:56 1 0
159 1577315036 2011.0 800 23200.0 private 4800 74350 2019-03-02 15:24:06 1 0
167 1577484996 2011.0 800 20000.0 private 5100 74100 2019-03-02 19:18:04 1 0
41 1572497421 2011.0 800 10800.0 private 4500 35770 2019-02-21 14:08:57 1 0
190 1578177079 2011.0 800 11500.0 private 4990 76890 2019-03-03 22:24:40 1 0
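
dropna() removed four of the 252 rows; to see which columns were responsible, a quick check (an addition, not part of the original run) is to count missing values per column:

# Number of missing values in each column of the raw data
data.isna().sum()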

Data inspection

In [6]:
cleaned.price.plot.density()
Out[6]:
<matplotlib.axes._subplots.AxesSubplot at 0x1193762b0>
In [7]:
cleaned.km.plot.density()
Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x1258d3d68>
In [8]:
cleaned.annee.plot.density()
Out[8]:
<matplotlib.axes._subplots.AxesSubplot at 0x12597c2e8>
In [9]:
cleaned.plot.scatter(x='km', y='price')
Out[9]:
<matplotlib.axes._subplots.AxesSubplot at 0x125a75908>
In [10]:
cleaned.plot.scatter(x='annee', y='price')
Out[10]:
<matplotlib.axes._subplots.AxesSubplot at 0x125ae4b38>
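
To put numbers on the relationships hinted at by the scatter plots, a minimal addition using pandas' built-in corr():

# Pairwise Pearson correlations between year, mileage and price
cleaned[['annee', 'km', 'price']].corr()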
In [11]:
X = cleaned[['annee','km','private','pro','cp']]
# Use a 1-D Series as the target to avoid scikit-learn shape warnings
Y = cleaned['price']
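
Before drawing full learning curves, a quick cross-validated baseline (an addition, assuming the X and Y defined just above) gives an order of magnitude for the attainable R2 score:

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated R2 for a plain linear regression
scores = cross_val_score(LinearRegression(), X, Y, cv=5, scoring="r2")
print(f"R2: {scores.mean():.2f} +/- {scores.std():.2f}")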

Learning curve

In [12]:
# Based on https://scikit-learn.org/stable/auto_examples/model_selection/plot_learning_curve.html

import numpy as np
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(
        clf,
        title,
        X,
        y, 
        train_sizes=np.linspace(.1, 1.0, 10),
        n_splits=20):
    plt.figure(figsize=(6, 6))
    plt.title(title)
    plt.xlabel("Training examples")
    plt.ylabel("Score")
    plt.ylim(0, 1)

    cross_validation = ShuffleSplit(n_splits=n_splits, test_size=0.2, random_state=0)
    
    train_sizes, train_scores, test_scores = learning_curve(
        clf, X, y, cv=cross_validation, n_jobs=-1, train_sizes=train_sizes, scoring="r2")
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    plt.grid()

    plt.fill_between(train_sizes, train_scores_mean - train_scores_std,
                     train_scores_mean + train_scores_std, alpha=0.1,
                     color="r")
    plt.fill_between(train_sizes, test_scores_mean - test_scores_std,
                     test_scores_mean + test_scores_std, alpha=0.1, color="g")
    plt.plot(train_sizes, train_scores_mean, 'o-', color="r",
             label="Training score")
    plt.plot(train_sizes, test_scores_mean, 'o-', color="g",
             label="Cross-validation score")

    plt.legend(loc="best")
    plt.show()
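
In the plots below, the red curve is the training score and the green curve the cross-validation score; the shaded bands span one standard deviation over the 20 ShuffleSplit repetitions. A persistent gap between the two curves is the signature of overfitting.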
In [13]:
from sklearn.ensemble import RandomForestRegressor
clf = RandomForestRegressor()
plot_learning_curve(clf, "Random Forest", X, Y)
In [14]:
from sklearn.ensemble import GradientBoostingRegressor
clf = GradientBoostingRegressor()
plot_learning_curve(clf, "Gradient Boosting", X, Y)
In [15]:
from sklearn.linear_model import LinearRegression
clf = LinearRegression()
plot_learning_curve(clf, "Linear regression", X, Y)

Conclusions

Only linear regression does not overfit: we can hope for an R2 score between 0.25 and 0.60.

The other models tested overfit heavily: more examples would be needed to improve their score.
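
A possible follow-up, sketched here but not run: constraining the trees (shallower depth, larger leaf size) might curb the random forest's overfitting even without more data. The hyperparameter values below are hypothetical starting points, not tuned results.

from sklearn.ensemble import RandomForestRegressor

# Hypothetical regularization settings; the exact values would need tuning
clf = RandomForestRegressor(max_depth=4, min_samples_leaf=5, random_state=0)
plot_learning_curve(clf, "Random Forest (regularized)", X, Y)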