import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge, ElasticNet, Lasso
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from patsy import dmatrix
9 Régularisation des moindres carrés : ridge, lasso elastic net
Introduction
Problème du centrage-réduction
Propriétés des estimateurs
Régularisation avec le package scikitlearn
= pd.read_csv("../donnees/ozone.txt", header = 0, sep = ";", index_col=0)
ozone = ozone.iloc[:,1:10].to_numpy()
X = ozone["O3"].to_numpy() y
= StandardScaler()
cr = KFold(n_splits=10, shuffle=True, random_state=0) kf
= LassoCV(cv=kf)
lassocv = ElasticNetCV(cv=kf)
enetcv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
pipe_lassocv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)]) pipe_enetcv
pipe_lassocv.fit(X, y) pipe_enetcv.fit(X, y)
Pipeline(steps=[('cr', StandardScaler()), ('enetcv', ElasticNetCV(cv=KFold(n_splits=10, random_state=0, shuffle=True)))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('cr', StandardScaler()), ('enetcv', ElasticNetCV(cv=KFold(n_splits=10, random_state=0, shuffle=True)))])
StandardScaler()
ElasticNetCV(cv=KFold(n_splits=10, random_state=0, shuffle=True))
= pipe_lassocv.named_steps["lassocv"]
etape_lassocv = pipe_enetcv.named_steps["enetcv"] etape_enetcv
= etape_lassocv.alphas_ * 100
alphasridge = RidgeCV(cv=kf, alphas=alphasridge) ridgecv
= Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
pipe_ridgecv pipe_ridgecv.fit(X, y)
Pipeline(steps=[('cr', StandardScaler()), ('ridgecv', RidgeCV(alphas=array([1.82214402e+03, 1.69933761e+03, 1.58480794e+03, 1.47799719e+03, 1.37838513e+03, 1.28548658e+03, 1.19884909e+03, 1.11805068e+03, 1.04269780e+03, 9.72423459e+02, 9.06885373e+02, 8.45764334e+02, 7.88762649e+02, 7.35602686e+02, 6.86025527e+02, 6.39789702e+02, 5.96670018e+02, 5.56456456e+02, 5.18953... 6.86025527e+00, 6.39789702e+00, 5.96670018e+00, 5.56456456e+00, 5.18953153e+00, 4.83977447e+00, 4.51358987e+00, 4.20938902e+00, 3.92569029e+00, 3.66111190e+00, 3.41436521e+00, 3.18424843e+00, 2.96964074e+00, 2.76949689e+00, 2.58284207e+00, 2.40876716e+00, 2.24642431e+00, 2.09502283e+00, 1.95382531e+00, 1.82214402e+00]), cv=KFold(n_splits=10, random_state=0, shuffle=True)))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('cr', StandardScaler()), ('ridgecv', RidgeCV(alphas=array([1.82214402e+03, 1.69933761e+03, 1.58480794e+03, 1.47799719e+03, 1.37838513e+03, 1.28548658e+03, 1.19884909e+03, 1.11805068e+03, 1.04269780e+03, 9.72423459e+02, 9.06885373e+02, 8.45764334e+02, 7.88762649e+02, 7.35602686e+02, 6.86025527e+02, 6.39789702e+02, 5.96670018e+02, 5.56456456e+02, 5.18953... 6.86025527e+00, 6.39789702e+00, 5.96670018e+00, 5.56456456e+00, 5.18953153e+00, 4.83977447e+00, 4.51358987e+00, 4.20938902e+00, 3.92569029e+00, 3.66111190e+00, 3.41436521e+00, 3.18424843e+00, 2.96964074e+00, 2.76949689e+00, 2.58284207e+00, 2.40876716e+00, 2.24642431e+00, 2.09502283e+00, 1.95382531e+00, 1.82214402e+00]), cv=KFold(n_splits=10, random_state=0, shuffle=True)))])
StandardScaler()
RidgeCV(alphas=array([1.82214402e+03, 1.69933761e+03, 1.58480794e+03, 1.47799719e+03, 1.37838513e+03, 1.28548658e+03, 1.19884909e+03, 1.11805068e+03, 1.04269780e+03, 9.72423459e+02, 9.06885373e+02, 8.45764334e+02, 7.88762649e+02, 7.35602686e+02, 6.86025527e+02, 6.39789702e+02, 5.96670018e+02, 5.56456456e+02, 5.18953153e+02, 4.83977447e+02, 4.51358987e+02, 4.20938902e+0... 6.86025527e+00, 6.39789702e+00, 5.96670018e+00, 5.56456456e+00, 5.18953153e+00, 4.83977447e+00, 4.51358987e+00, 4.20938902e+00, 3.92569029e+00, 3.66111190e+00, 3.41436521e+00, 3.18424843e+00, 2.96964074e+00, 2.76949689e+00, 2.58284207e+00, 2.40876716e+00, 2.24642431e+00, 2.09502283e+00, 1.95382531e+00, 1.82214402e+00]), cv=KFold(n_splits=10, random_state=0, shuffle=True))
= pipe_ridgecv.named_steps["ridgecv"] etape_ridgecv
= pipe_lassocv.named_steps["lassocv"].alphas_
alphas = []; coefs_enet = []; coefs_ridge = []
coefs_lasso = StandardScaler().fit(X).transform(X)
Xcr for a in alphas:
## lasso
= Lasso(alpha=a, warm_start=True).fit(Xcr, y)
lasso
coefs_lasso.append(lasso.coef_)## enet
= ElasticNet(alpha=a*2, warm_start=True).fit(Xcr, y)
enet
coefs_enet.append(enet.coef_)## ridge
= Ridge(alpha=a*100).fit(Xcr, y)
ridge
coefs_ridge.append(ridge.coef_)= plt.subplots(1, 3)
fig, (ax1, ax2, ax3)
ax1.plot(np.log(alphas), coefs_lasso)r'$\log(\alpha)$')
ax1.set_xlabel('Coefficients')
ax1.set_ylabel(2*alphas), coefs_enet)
ax2.plot(np.log(r'$\log(2\alpha)$')
ax2.set_xlabel('Coefficients')
ax2.set_ylabel(100*alphas), coefs_ridge)
ax3.plot(np.log(r'$\log(100\alpha)$')
ax3.set_xlabel('Coefficients') ax3.set_ylabel(
Text(0, 0.5, 'Coefficients')
print("Lasso: ", round(etape_lassocv.alpha_,4))
print("ElasticNet: ", round(etape_enetcv.alpha_,4))
print("Ridge: ", round(etape_ridgecv.alpha_,4))
Lasso: 0.7356
ElasticNet: 0.3399
Ridge: 9.0689
= pd.read_csv("../donnees/ozone.txt", header = 0, sep = ";", index_col=0)
ozone = ozone.iloc[0:45,1:10].to_numpy()
Xapp = ozone.iloc[45:50,1:10].to_numpy()
Xnew = np.ravel(ozone.iloc[0:45,:1])
yapp = np.ravel(ozone.iloc[45:50,:1]) ynew
= KFold(n_splits=10, shuffle=True, random_state=0)
kf = StandardScaler()
cr = LassoCV(cv=kf)
lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
pipe_lassocv pipe_lassocv.fit(Xapp,yapp)
Pipeline(steps=[('cr', StandardScaler()), ('lassocv', LassoCV(cv=KFold(n_splits=10, random_state=0, shuffle=True)))])In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
Pipeline(steps=[('cr', StandardScaler()), ('lassocv', LassoCV(cv=KFold(n_splits=10, random_state=0, shuffle=True)))])
StandardScaler()
LassoCV(cv=KFold(n_splits=10, random_state=0, shuffle=True))
pipe_lassocv.predict(Xnew)
array([95.81725767, 63.675633 , 67.49560903, 67.57922292, 89.43629205])
Intégration de variables qualitatives
= pd.read_csv("../donnees/ozone.txt", header=0, sep=";", index_col=0)
ozone "vent"]=ozone["vent"].astype("category")
ozone["nebu"]=ozone["nebu"].astype("category") ozone[
= KFold(n_splits=10, shuffle=True, random_state=0)
kf = StandardScaler()
cr = LassoCV(cv=kf)
lassocv = ElasticNetCV(cv=kf)
enetcv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)]) pipe_lassocv
= list(ozone.iloc[:, 1:].columns)
noms = "~" + "+".join(noms[0:])
formule = dmatrix(formule, ozone, return_type="dataframe")
Xq = Xq.iloc[0:45, 1:].to_numpy()
Xapp = Xq.iloc[45:50, 1:].to_numpy()
Xnew = np.ravel(ozone.iloc[0:45,:1])
yapp = np.ravel(ozone.iloc[45:50,:1]) ynew
pipe_lassocv.fit(Xapp, yapp)round(2) pipe_lassocv.predict(Xnew).
array([97.41, 63.72, 66.26, 66.76, 90.23])
= "~" + "+".join(noms[0:]) + "+C(vent,Treatment(1))"
formule = dmatrix(formule,ozone,return_type="dataframe")
Xq = Xq.iloc[0:45,1:].to_numpy()
Xapp = Xq.iloc[45:50,1:].to_numpy()
Xnew = np.ravel(ozone.iloc[0:45,:1])
yapp = np.ravel(ozone.iloc[45:50,:1])
ynew
pipe_lassocv.fit(Xapp,yapp)round(2) pipe_lassocv.predict(Xnew).
array([97.92, 63.63, 65.91, 66.42, 90.29])
= "~" + "+".join(noms[0:-2]) + \
formule "+ C(nebu, Sum) + C(vent, Sum)"
= dmatrix(formule,ozone,return_type="dataframe")
Xq = Xq.iloc[0:45,1:].to_numpy()
Xapp = Xq.iloc[45:50,1:].to_numpy()
Xnew = np.ravel(ozone.iloc[0:45,:1])
yapp = np.ravel(ozone.iloc[45:50,:1])
ynew
pipe_lassocv.fit(Xapp,yapp)round(2) pipe_lassocv.predict(Xnew).
array([99.49, 63.82, 65.65, 66.18, 90.31])