import sys

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from patsy import dmatrix
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, \
    ElasticNet, Lasso
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# The local helper module lives in ../modules, so that directory must be on
# the import path before it is imported.
sys.path.append('../modules')
import ols_step_sk
11 Comparaison des différentes méthodes, étude de cas réels
Erreur de prévision et validation croisée
Analyse de l’ozone
# Load the complete ozone data set (semicolon-separated, first column used
# as the row index) and display its dimensions.
ozone = pd.read_csv("../donnees/ozone_complet.txt", header = 0, sep = ";", index_col=0)
ozone.shape
(1464, 23)
# Drop every row that contains at least one missing value.
lesna = pd.isna(ozone)                # boolean mask of missing cells
indNA = lesna.apply(any, axis=1)      # True for rows with at least one NA
ozone2 = ozone[~indNA]
ozone2.shape
(1366, 23)
# Load the transformed ozone data used for all the models below and rename
# the response column "maxO3" to "Y".
don = pd.read_csv("../donnees/ozone_transf.txt", header = 0, sep = ";", index_col=0)
don.shape
don.rename(columns={"maxO3":"Y"},inplace=True)
Y = don["Y"].to_numpy()
# Outer cross-validation: assign every observation to one of `nb` blocks.
nb = 10
# Block labels 0..nb-1 repeated along the rows ...
tmp = np.arange(don.shape[0]) % nb
# ... then shuffled reproducibly (sampling all labels without replacement
# is a random permutation).
rng = np.random.default_rng(seed=1234)
bloc = rng.choice(tmp, size=don.shape[0], replace=False)

# One column of out-of-fold predictions per method, next to the block label
# and the observed response.
PREV = pd.DataFrame({"bloc":bloc,"Y":don["Y"],"MCO":0.0,"BIC":0.0,"AIC":0.0,
                     "ridge":0.0,"lasso":0.0,"elast":0.0,
                     "pls":0.0,"pcr":0.0})
# Names of the explanatory variables (every column except the response).
nomsvar = list(don.columns.difference(["Y"]))
# Design matrix built from a patsy formula with all explanatory variables.
formule = "~" + "+".join(nomsvar)
dsX = dmatrix(formule,don)
# Drop the first column (the intercept patsy adds by default); the
# estimators below fit their own intercept.
X = np.asarray(dsX)[:,1:]
Y = don["Y"].to_numpy()
# Inner cross-validation splitters used to tune hyper-parameters within each
# outer block: 10 folds for the penalised regressions, 4 folds for the
# number of components (PCR / PLS).
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
# Component counts 1 .. nbaxes-1 are tried for PCR and PLS.
nbaxes = 20

# Pipeline steps
cr = StandardScaler()
lassocv = LassoCV(cv=kfregul, n_jobs=3,max_iter=1000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3,max_iter=1000)

# Pipelines: standardise, then fit the penalised model
pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])

# Ridge: keep a handle on the lasso step so that its alpha path can be
# reused to build the ridge grid once the lasso has been fitted.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Remaining estimators and pipelines
ridge = Ridge()
pipe_ridge = Pipeline(steps=[("cr", cr), ("ridge", ridge)])
acp = PCA()
reg = LinearRegression()
pipe_pcr = Pipeline(steps=[("cr", cr), ("acp", acp), ("reg", reg)])
regpls = PLSRegression()

# Grids for the number of components
param_grid_pcr = { "acp__n_components" : list(range(1,nbaxes))}
param_grid_pls = { "n_components" : list(range(1,nbaxes))}
# Outer cross-validation on the raw variables: for each block, fit every
# method on the other blocks and store its predictions for the held-out
# block in PREV.
for i in np.arange(nb):
    print(i)
    Xapp = X[bloc!=i,:]
    Xtest = X[bloc==i,:]
    Yapp = don[bloc!=i]["Y"]
    Ytest = don[bloc==i]["Y"]
    # Ordinary least squares
    reg = LinearRegression()
    reg.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"MCO"] = reg.predict(Xtest)
    # Variable selection with BIC
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="bic")
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"BIC"] = reg_bic.predict(Xtest)
    # Variable selection with AIC
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="aic")
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"AIC"] = reg_aic.predict(Xtest)
    # Lasso
    pipe_lassocv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"lasso"] = pipe_lassocv.predict(Xtest)
    # Elastic net
    pipe_enetcv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"elast"] = pipe_enetcv.predict(Xtest)
    # Ridge grid: the alpha path of the lasso fitted just above, times 100
    path_ridge = etape_lasso.alphas_ * 100
    param_grid_ridge = {"ridge__alpha": path_ridge}
    # Ridge tuned by grid search
    cv_ridge = GridSearchCV(pipe_ridge, param_grid_ridge, cv=kfregul, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp, Yapp)
    PREV.loc[PREV.bloc==i,"ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components tuned by grid search
    cv_pcr = GridSearchCV(pipe_pcr, param_grid_pcr, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    cv_pls = GridSearchCV(regpls, param_grid_pls, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"pcr"] = cv_pcr.predict(Xtest)
    # Depending on the scikit-learn version, PLS predictions may come back
    # as a column vector; flatten them before storing.
    PREV.loc[PREV.bloc==i,"pls"] = np.ravel(cv_pls.predict(Xtest))
0
Final: [0, 4, 5, 8, 11, 20]
Final: [0, 4, 5, 6, 8, 11, 18, 19, 20]
1
Final: [0, 4, 5, 8, 10, 20]
Final: [0, 4, 5, 6, 8, 10, 11, 12, 19, 20]
2
Final: [0, 5, 8, 11, 20]
Final: [0, 4, 5, 6, 8, 11, 12, 13, 20]
3
Final: [0, 4, 5, 8, 11, 20]
Final: [0, 4, 5, 6, 8, 11, 12, 18, 19, 20]
4
Final: [0, 5, 8, 10, 20]
Final: [0, 4, 5, 6, 8, 10, 20]
5
Final: [0, 5, 8, 10, 20]
Final: [0, 4, 5, 6, 8, 10, 18, 19, 20]
6
Final: [0, 5, 8, 10, 20]
Final: [0, 1, 4, 5, 8, 10, 11, 12, 15, 19, 20]
7
Final: [0, 5, 8, 10, 20]
Final: [0, 1, 4, 5, 8, 10, 11, 12, 15, 19, 20]
8
Final: [0, 5, 8, 10, 20]
Final: [0, 1, 4, 5, 8, 10, 19, 20]
9
Final: [0, 5, 8, 11, 20]
Final: [0, 4, 5, 6, 8, 11, 12, 18, 19, 20]
# Mean squared prediction error of each method over all held-out blocks
# (the block label column is dropped; the "Y" column is compared with
# itself and therefore shows 0).
prev = PREV.iloc[:,1:]
np.round((prev.sub(PREV.Y, axis=0)**2).mean(),2)
Y 0.00
MCO 187.51
BIC 188.23
AIC 187.74
ridge 187.43
lasso 187.05
elast 187.20
pls 187.92
pcr 188.22
dtype: float64
Transformation des variables : feature engineering
Modèles de prévision avec interactions
# Design matrix with all main effects and all pairwise interactions; the
# first column (intercept) is dropped.
formuleI = "1 + (" + "+".join(nomsvar) + ")**2"
Xinter = dmatrix(formuleI, don, return_type="dataframe").\
    iloc[:,1:].to_numpy()

# Same construction going through a plain array instead of a DataFrame.
formuleI = "1 + (" + "+".join(nomsvar) + ")**2"
Xq = dmatrix(formuleI, don)
Xinter = np.asarray(Xq)[:,1:]
Xinter.shape
(1366, 231)
# Inner cross-validation splitters used to tune hyper-parameters within each
# outer block: 10 folds for the penalised regressions, 4 folds for the
# number of components (PCR / PLS).
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
# Component counts 1 .. nbaxes-1 are tried for PCR and PLS.
nbaxes = 40

# Pipeline steps
cr = StandardScaler()
lassocv = LassoCV(cv=kfregul, n_jobs=3,max_iter=5000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3,max_iter=5000)

# Pipelines: standardise, then fit the penalised model
pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])

# Ridge: keep a handle on the lasso step so that its alpha path can be
# reused to build the ridge grid once the lasso has been fitted.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Remaining estimators and pipelines
ridge = Ridge()
pipe_ridge = Pipeline(steps=[("cr", cr), ("ridge", ridge)])
acp = PCA()
reg = LinearRegression()
pipe_pcr = Pipeline(steps=[("cr", cr), ("acp", acp), ("reg", reg)])
regpls = PLSRegression()

# Grids for the number of components
param_grid_pcr = { "acp__n_components" : list(range(1,nbaxes))}
param_grid_pls = { "n_components" : list(range(1,nbaxes))}
# Outer cross-validation on the interaction design matrix: for each block,
# fit every method on the other blocks and store its predictions for the
# held-out block in PREV.
for i in np.arange(nb):
    print(i)
    Xapp = Xinter[bloc!=i,:]
    Xtest = Xinter[bloc==i,:]
    Yapp = don[bloc!=i]["Y"]
    Ytest = don[bloc==i]["Y"]
    # Ordinary least squares
    reg = LinearRegression()
    reg.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"MCO"] = reg.predict(Xtest)
    # Variable selection with BIC
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="bic")
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"BIC"] = reg_bic.predict(Xtest)
    # Variable selection with AIC
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="aic")
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"AIC"] = reg_aic.predict(Xtest)
    # Lasso
    pipe_lassocv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"lasso"] = pipe_lassocv.predict(Xtest)
    # Elastic net
    pipe_enetcv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"elast"] = pipe_enetcv.predict(Xtest)
    # Ridge grid: the alpha path of the lasso fitted just above, times 100
    path_ridge = etape_lasso.alphas_ * 100
    param_grid_ridge = {"ridge__alpha": path_ridge}
    # Ridge tuned by grid search
    cv_ridge = GridSearchCV(pipe_ridge, param_grid_ridge, cv=kfregul, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp, Yapp)
    PREV.loc[PREV.bloc==i,"ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components tuned by grid search
    cv_pcr = GridSearchCV(pipe_pcr, param_grid_pcr, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    cv_pls = GridSearchCV(regpls, param_grid_pls, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"pcr"] = cv_pcr.predict(Xtest)
    # Depending on the scikit-learn version, PLS predictions may come back
    # as a column vector; flatten them before storing.
    PREV.loc[PREV.bloc==i,"pls"] = np.ravel(cv_pls.predict(Xtest))
0
Final: [5, 20, 21, 47, 98, 111, 125, 139, 142, 164, 185, 228]
Final: [1, 4, 5, 8, 13, 20, 21, 38, 41, 49, 62, 64, 82, 90, 98, 108, 110, 111, 115, 125, 127, 137, 139, 142, 143, 150, 152, 164, 165, 168, 192, 206, 221, 229]
1
Final: [1, 5, 8, 20, 21, 98, 111, 125, 139, 142, 164, 185, 193]
Final: [1, 5, 7, 8, 9, 13, 20, 21, 22, 29, 38, 40, 41, 44, 46, 49, 73, 77, 98, 101, 108, 111, 114, 115, 125, 127, 137, 142, 143, 145, 150, 164, 165, 167, 183, 185, 193, 202, 206, 229]
2
Final: [6, 10, 21, 40, 59, 95, 111, 139, 142, 153]
Final: [2, 4, 6, 8, 10, 11, 13, 20, 21, 38, 41, 42, 59, 62, 64, 75, 85, 90, 91, 98, 100, 109, 110, 111, 116, 118, 123, 126, 128, 137, 139, 140, 144, 149, 150, 153, 164, 166, 172, 183, 185, 194, 202, 205, 209, 221, 229]
3
Final: [6, 9, 14, 20, 22, 53, 97, 111, 119, 164, 192]
Final: [6, 7, 8, 9, 13, 14, 20, 21, 41, 47, 53, 59, 61, 63, 64, 66, 85, 98, 101, 104, 111, 112, 119, 130, 137, 139, 143, 150, 158, 164, 168, 176, 192, 195, 221, 229, 230]
4
Final: [7, 10, 20, 27, 41, 59, 77, 115, 125, 126, 164]
Final: [2, 4, 7, 13, 20, 24, 37, 41, 42, 43, 49, 59, 85, 90, 97, 101, 110, 115, 125, 126, 132, 137, 139, 142, 150, 154, 164, 183, 185, 190, 191, 193, 206, 209, 221, 229]
5
Final: [6, 10, 20, 21, 59, 111, 113, 115]
Final: [4, 6, 8, 10, 11, 13, 20, 21, 22, 24, 41, 57, 59, 90, 97, 110, 111, 113, 115, 126, 128, 137, 142, 145, 150, 162, 164, 165, 183, 185, 194, 203, 209, 211, 221, 229]
6
Final: [4, 6, 20, 24, 28, 41, 77, 97, 115, 125, 126, 164, 185, 192, 205]
Final: [4, 6, 13, 20, 21, 24, 28, 41, 49, 59, 77, 90, 97, 110, 111, 115, 117, 118, 125, 126, 128, 137, 144, 150, 164, 167, 183, 185, 190, 191, 194, 206, 209, 221, 226]
7
Final: [5, 20, 21, 111, 125, 139, 142, 164, 185]
Final: [1, 4, 5, 8, 13, 20, 21, 41, 49, 77, 80, 90, 98, 101, 110, 111, 115, 116, 117, 125, 127, 130, 131, 137, 139, 149, 150, 154, 155, 156, 157, 161, 164, 167, 168, 185, 192, 201, 202, 221, 223, 227]
8
Final: [4, 7, 10, 20, 21, 59, 96, 115, 125, 126, 164]
Final: [4, 5, 11, 13, 20, 21, 24, 29, 38, 41, 51, 59, 90, 96, 97, 110, 111, 115, 117, 125, 128, 130, 137, 139, 142, 150, 152, 157, 162, 164, 167, 168, 221, 226, 229]
9
Final: [1, 5, 10, 20, 21, 111, 115, 125, 139, 164, 191, 206]
Final: [1, 2, 4, 5, 7, 13, 20, 21, 24, 37, 38, 41, 47, 62, 64, 73, 79, 81, 91, 95, 97, 110, 111, 115, 117, 122, 125, 127, 130, 131, 137, 139, 142, 145, 150, 166, 172, 192, 205, 229]
# Mean squared prediction error of each method over all held-out blocks
# (the block label column is dropped; the "Y" column is compared with
# itself and therefore shows 0).
prev = PREV.iloc[:,1:]
round((prev.sub(PREV.Y, axis=0)**2).mean(),2)
Y 0.00
MCO 187.73
BIC 168.28
AIC 168.38
ridge 165.13
lasso 161.60
elast 164.25
pls 168.91
pcr 173.75
dtype: float64
Modèles de prévision avec des polynômes
# Polynomial features: every variable together with its square and cube.
Xcar = X**2
Xcub = X**3
Xpol = np.concatenate((X, Xcar, Xcub), axis=1)
Xpol.shape
(1366, 63)
# Inner cross-validation splitters used to tune hyper-parameters within each
# outer block: 10 folds for the penalised regressions, 4 folds for the
# number of components (PCR / PLS).
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
# Component counts 1 .. nbaxes-1 are tried for PCR and PLS.
nbaxes = 40

# Pipeline steps
cr = StandardScaler()
lassocv = LassoCV(cv=kfregul, n_jobs=3,max_iter=3000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3,max_iter=3000)

# Pipelines: standardise, then fit the penalised model
pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])

# Ridge: keep a handle on the lasso step so that its alpha path can be
# reused to build the ridge grid once the lasso has been fitted.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Remaining estimators and pipelines
ridge = Ridge()
pipe_ridge = Pipeline(steps=[("cr", cr), ("ridge", ridge)])
acp = PCA()
reg = LinearRegression()
pipe_pcr = Pipeline(steps=[("cr", cr), ("acp", acp), ("reg", reg)])
regpls = PLSRegression()

# Grids for the number of components
param_grid_pcr = { "acp__n_components" : list(range(1,nbaxes))}
param_grid_pls = { "n_components" : list(range(1,nbaxes))}
# Outer cross-validation on the polynomial design matrix: for each block,
# fit every method on the other blocks and store its predictions for the
# held-out block in PREV.
for i in np.arange(nb):
    print(i)
    Xapp = Xpol[bloc!=i,:]
    Xtest = Xpol[bloc==i,:]
    Yapp = don[bloc!=i]["Y"]
    Ytest = don[bloc==i]["Y"]
    # Ordinary least squares
    reg = LinearRegression()
    reg.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"MCO"] = reg.predict(Xtest)
    # Variable selection with BIC
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="bic")
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"BIC"] = reg_bic.predict(Xtest)
    # Variable selection with AIC
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="aic")
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"AIC"] = reg_aic.predict(Xtest)
    # Lasso
    pipe_lassocv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"lasso"] = pipe_lassocv.predict(Xtest)
    # Elastic net
    pipe_enetcv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"elast"] = pipe_enetcv.predict(Xtest)
    # Ridge grid: the alpha path of the lasso fitted just above, times 100
    path_ridge = etape_lasso.alphas_ * 100
    param_grid_ridge = {"ridge__alpha": path_ridge}
    # Ridge tuned by grid search
    cv_ridge = GridSearchCV(pipe_ridge, param_grid_ridge, cv=kfregul, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp, Yapp)
    PREV.loc[PREV.bloc==i,"ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components tuned by grid search
    cv_pcr = GridSearchCV(pipe_pcr, param_grid_pcr, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    cv_pls = GridSearchCV(regpls, param_grid_pls, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"pcr"] = cv_pcr.predict(Xtest)
    # Depending on the scikit-learn version, PLS predictions may come back
    # as a column vector; flatten them before storing.
    PREV.loc[PREV.bloc==i,"pls"] = np.ravel(cv_pls.predict(Xtest))
0
Final: [11, 20, 21, 26, 29, 43, 47]
Final: [0, 1, 4, 11, 19, 20, 21, 26, 27, 29, 31, 34, 35, 38, 42, 43, 47, 52, 53, 56]
1
Final: [11, 20, 21, 26, 29, 43, 47, 53]
Final: [0, 1, 4, 8, 11, 12, 13, 19, 20, 21, 22, 26, 29, 31, 34, 35, 38, 42, 47, 48, 52, 53, 56]
2
Final: [5, 11, 20, 29, 38, 42, 47, 48]
Final: [0, 1, 4, 5, 8, 11, 13, 19, 20, 21, 22, 29, 31, 34, 35, 38, 42, 47, 48, 52, 53, 56]
3
Final: [5, 11, 20, 21, 29, 33, 38, 47, 48, 53]
Final: [1, 5, 10, 11, 12, 19, 20, 21, 29, 31, 33, 34, 35, 38, 43, 46, 47, 48, 53, 54, 56]
4
Final: [11, 20, 21, 26, 29, 43, 47]
Final: [0, 1, 4, 8, 10, 11, 18, 19, 20, 21, 26, 27, 29, 31, 34, 35, 38, 42, 43, 47, 56, 60]
5
Final: [5, 11, 20, 29, 38, 42, 47, 48, 53]
Final: [0, 1, 4, 5, 8, 10, 11, 19, 20, 21, 22, 29, 34, 35, 38, 41, 42, 47, 48, 53, 56, 62]
6
Final: [5, 11, 20, 21, 29, 43, 47]
Final: [0, 5, 8, 11, 12, 19, 20, 21, 27, 29, 31, 34, 35, 38, 42, 43, 46, 47, 52, 53, 56]
7
Final: [11, 20, 21, 26, 29, 43, 47]
Final: [1, 11, 12, 19, 20, 21, 26, 27, 29, 31, 34, 35, 38, 43, 46, 47, 52, 53, 56]
8
Final: [5, 11, 20, 21, 29, 43, 47]
Final: [0, 1, 5, 11, 19, 20, 21, 22, 29, 33, 34, 35, 38, 42, 46, 47, 48, 52, 53, 56]
9
Final: [0, 1, 5, 11, 16, 20, 29, 38, 43, 47, 53]
Final: [0, 1, 5, 8, 10, 11, 19, 20, 21, 22, 29, 31, 34, 35, 38, 42, 46, 47, 48, 53, 56]
# Mean squared prediction error of each method over all held-out blocks
# (the block label column is dropped; the "Y" column is compared with
# itself and therefore shows 0).
prev = PREV.iloc[:,1:]
round((prev.sub(PREV.Y, axis=0)**2).mean(),2)
Y 0.00
MCO 165.40
BIC 167.61
AIC 164.87
ridge 165.08
lasso 163.90
elast 164.76
pls 166.16
pcr 169.63
dtype: float64
Modèles de prévision des splines
# Spline design matrix: for every explanatory variable, a cubic B-spline
# basis with interior knots at the quartiles, stacked column-wise.
# Xp starts as a placeholder column of ones that is removed at the end.
Xp = np.ones((X.shape[0],1))
for i in nomsvar:
    # `xi` is referenced by name inside the patsy formula below, so this
    # variable must keep that name.
    xi = don.loc[:,i].quantile([0.25, 0.5, 0.75])
    formule = "-1 + bs(" + i + ",knots=xi, degree=3)"
    BX = dmatrix(formule, don)
    Xp = np.concatenate((Xp, BX), axis=1)

# Drop the placeholder column of ones.
Xspline = Xp[:,1:]
# Inner cross-validation splitters used to tune hyper-parameters within each
# outer block: 10 folds for the penalised regressions, 4 folds for the
# number of components (PCR / PLS).
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
# Component counts 1 .. nbaxes-1 are tried for PCR and PLS.
nbaxes = 40

# Pipeline steps
cr = StandardScaler()
lassocv = LassoCV(cv=kfregul, n_jobs=3,max_iter=3000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3,max_iter=3000)

# Pipelines: standardise, then fit the penalised model
pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])

# Ridge: keep a handle on the lasso step so that its alpha path can be
# reused to build the ridge grid once the lasso has been fitted.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Remaining estimators and pipelines
ridge = Ridge()
pipe_ridge = Pipeline(steps=[("cr", cr), ("ridge", ridge)])
acp = PCA()
reg = LinearRegression()
pipe_pcr = Pipeline(steps=[("cr", cr), ("acp", acp), ("reg", reg)])
regpls = PLSRegression()

# Grids for the number of components
param_grid_pcr = { "acp__n_components" : list(range(1,nbaxes))}
param_grid_pls = { "n_components" : list(range(1,nbaxes))}
# Outer cross-validation on the spline design matrix: for each block, fit
# every method on the other blocks and store its predictions for the
# held-out block in PREV.
for i in np.arange(nb):
    print(i)
    Xapp = Xspline[bloc!=i,:]
    Xtest = Xspline[bloc==i,:]
    Yapp = don[bloc!=i]["Y"]
    Ytest = don[bloc==i]["Y"]
    # Ordinary least squares
    reg = LinearRegression()
    reg.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"MCO"] = reg.predict(Xtest)
    # Variable selection with BIC
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="bic")
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"BIC"] = reg_bic.predict(Xtest)
    # Variable selection with AIC
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="aic")
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"AIC"] = reg_aic.predict(Xtest)
    # Lasso
    pipe_lassocv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"lasso"] = pipe_lassocv.predict(Xtest)
    # Elastic net
    pipe_enetcv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"elast"] = pipe_enetcv.predict(Xtest)
    # Ridge grid: the alpha path of the lasso fitted just above, times 100
    path_ridge = etape_lasso.alphas_ * 100
    param_grid_ridge = {"ridge__alpha": path_ridge}
    # Ridge tuned by grid search
    cv_ridge = GridSearchCV(pipe_ridge, param_grid_ridge, cv=kfregul, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp, Yapp)
    PREV.loc[PREV.bloc==i,"ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components tuned by grid search
    cv_pcr = GridSearchCV(pipe_pcr, param_grid_pcr, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    cv_pls = GridSearchCV(regpls, param_grid_pls, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"pcr"] = cv_pcr.predict(Xtest)
    # Depending on the scikit-learn version, PLS predictions may come back
    # as a column vector; flatten them before storing.
    PREV.loc[PREV.bloc==i,"pls"] = np.ravel(cv_pls.predict(Xtest))
0
Final: [3, 5, 29, 34, 35, 39, 51, 53, 67, 87, 120, 121, 124]
Final: [1, 3, 5, 7, 8, 23, 26, 28, 29, 34, 35, 39, 48, 51, 52, 53, 65, 67, 70, 73, 80, 87, 88, 92, 104, 107, 108, 120, 121, 123, 124, 125]
1
Final: [3, 5, 34, 39, 41, 48, 51, 52, 87, 120, 121, 124]
Final: [0, 1, 3, 5, 7, 8, 23, 26, 28, 33, 34, 39, 40, 41, 46, 48, 49, 51, 52, 54, 59, 62, 65, 67, 70, 80, 82, 87, 88, 99, 102, 106, 120, 121, 123, 124, 125]
2
Final: [1, 3, 5, 7, 34, 39, 41, 49, 52, 87, 120, 121, 124]
Final: [0, 1, 3, 5, 7, 12, 23, 26, 27, 28, 29, 33, 34, 39, 40, 41, 46, 49, 51, 52, 59, 65, 70, 82, 84, 87, 88, 92, 104, 107, 120, 121, 124]
3
Final: [3, 5, 7, 28, 34, 41, 51, 52, 70, 77, 87, 88, 120, 121, 124]
Final: [0, 3, 5, 7, 8, 23, 26, 28, 31, 34, 38, 39, 40, 41, 44, 48, 49, 51, 52, 53, 56, 62, 67, 70, 73, 77, 80, 87, 88, 99, 103, 104, 122, 123, 124, 125]
4
Final: [3, 5, 34, 39, 41, 51, 52, 87, 120, 121, 124]
Final: [0, 1, 3, 5, 7, 8, 23, 25, 26, 27, 28, 29, 33, 34, 35, 39, 40, 41, 45, 46, 49, 51, 52, 59, 65, 67, 70, 75, 80, 87, 88, 92, 102, 106, 120, 121, 123, 124]
5
Final: [3, 5, 34, 39, 41, 48, 51, 52, 87, 104, 120, 121, 123, 124, 125]
Final: [0, 2, 4, 5, 7, 8, 23, 26, 28, 29, 34, 35, 39, 40, 41, 46, 48, 49, 51, 52, 59, 65, 67, 70, 73, 80, 87, 88, 92, 104, 122, 123, 124, 125]
6
Final: [1, 3, 5, 7, 28, 34, 39, 40, 41, 49, 52, 87, 120, 121, 124]
Final: [0, 1, 3, 5, 7, 26, 28, 33, 34, 39, 40, 41, 46, 48, 49, 51, 52, 59, 65, 70, 80, 87, 88, 92, 104, 108, 109, 117, 122, 123, 124, 125]
7
Final: [2, 7, 26, 28, 34, 39, 41, 49, 52, 65, 70, 87, 88, 120, 121, 124]
Final: [2, 7, 17, 23, 26, 27, 28, 29, 33, 34, 38, 39, 40, 41, 43, 46, 48, 49, 51, 52, 53, 65, 67, 70, 73, 80, 85, 87, 88, 92, 104, 106, 108, 122, 123, 124, 125]
8
Final: [3, 5, 7, 34, 39, 40, 41, 51, 52, 63, 120, 121, 124]
Final: [0, 2, 3, 4, 5, 7, 8, 23, 26, 28, 33, 34, 39, 40, 41, 46, 48, 51, 52, 59, 63, 65, 70, 80, 87, 88, 89, 92, 104, 105, 122, 123, 124, 125]
9
Final: [1, 3, 5, 7, 28, 34, 39, 41, 51, 53, 87, 104, 120, 121, 124]
Final: [0, 1, 2, 3, 5, 7, 8, 26, 28, 34, 39, 40, 41, 46, 48, 51, 52, 53, 63, 65, 70, 80, 87, 88, 89, 92, 104, 108, 122, 123, 124, 125]
# Mean squared prediction error of each method over all held-out blocks
# (the block label column is dropped; the "Y" column is compared with
# itself and therefore shows 0).
prev = PREV.iloc[:,1:]
round((prev.sub(PREV.Y, axis=0)**2).mean(),2)
Y 0.00
MCO 162.74
BIC 163.77
AIC 158.73
ridge 157.86
lasso 155.43
elast 156.55
pls 160.24
pcr 167.39
dtype: float64
Modèles de prévision avec des splines et des interactions
# Combine the interaction columns with the spline basis.
# NOTE(review): Xinter already has its intercept removed, so with 21
# explanatory variables the interaction columns would start at index 21;
# slicing from 22 appears to drop the first interaction term as well.
# Left unchanged to stay consistent with the reported results — confirm.
Xsplineinter = np.concatenate((Xinter[:,22:],Xspline),axis=1)
# Inner cross-validation splitters used to tune hyper-parameters within each
# outer block: 10 folds for the penalised regressions, 4 folds for the
# number of components (PCR / PLS).
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
# Component counts 1 .. nbaxes-1 are tried for PCR and PLS.
nbaxes = 40

# Pipeline steps
cr = StandardScaler()
lassocv = LassoCV(cv=kfregul, n_jobs=3,max_iter=3000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3,max_iter=3000)

# Pipelines: standardise, then fit the penalised model
pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])

# Ridge: keep a handle on the lasso step so that its alpha path can be
# reused to build the ridge grid once the lasso has been fitted.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Remaining estimators and pipelines
ridge = Ridge()
pipe_ridge = Pipeline(steps=[("cr", cr), ("ridge", ridge)])
acp = PCA()
reg = LinearRegression()
pipe_pcr = Pipeline(steps=[("cr", cr), ("acp", acp), ("reg", reg)])
regpls = PLSRegression()

# Grids for the number of components
param_grid_pcr = { "acp__n_components" : list(range(1,nbaxes))}
param_grid_pls = { "n_components" : list(range(1,nbaxes))}
# Outer cross-validation on the spline + interaction design matrix: for each
# block, fit every method on the other blocks and store its predictions for
# the held-out block in PREV.
for i in np.arange(nb):
    print(i)
    Xapp = Xsplineinter[bloc!=i,:]
    Xtest = Xsplineinter[bloc==i,:]
    Yapp = don[bloc!=i]["Y"]
    Ytest = don[bloc==i]["Y"]
    # Ordinary least squares
    reg = LinearRegression()
    reg.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"MCO"] = reg.predict(Xtest)
    # Variable selection with BIC
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="bic")
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"BIC"] = reg_bic.predict(Xtest)
    # Variable selection with AIC
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(verbose=1,crit="aic")
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[PREV.bloc==i,"AIC"] = reg_aic.predict(Xtest)
    # Lasso
    pipe_lassocv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"lasso"] = pipe_lassocv.predict(Xtest)
    # Elastic net
    pipe_enetcv.fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"elast"] = pipe_enetcv.predict(Xtest)
    # Ridge grid: the alpha path of the lasso fitted just above, times 100
    path_ridge = etape_lasso.alphas_ * 100
    param_grid_ridge = {"ridge__alpha": path_ridge}
    # Ridge tuned by grid search
    cv_ridge = GridSearchCV(pipe_ridge, param_grid_ridge, cv=kfregul, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp, Yapp)
    PREV.loc[PREV.bloc==i,"ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components tuned by grid search
    cv_pcr = GridSearchCV(pipe_pcr, param_grid_pcr, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    cv_pls = GridSearchCV(regpls, param_grid_pls, cv=kfaxes, scoring = "neg_mean_squared_error", n_jobs=3).fit(Xapp,Yapp)
    PREV.loc[PREV.bloc==i,"pcr"] = cv_pcr.predict(Xtest)
    # Depending on the scikit-learn version, PLS predictions may come back
    # as a column vector; flatten them before storing.
    PREV.loc[PREV.bloc==i,"pls"] = np.ravel(cv_pls.predict(Xtest))
0
Final: [2, 79, 117, 121, 131, 211, 243, 244, 249, 313, 330]
Final: [2, 4, 16, 45, 76, 79, 117, 121, 128, 140, 142, 147, 158, 161, 171, 174, 181, 184, 207, 209, 216, 217, 232, 236, 243, 244, 249, 253, 257, 291, 294, 296, 297, 301, 308, 313, 329, 330, 332, 333, 334]
1
Final: [5, 7, 19, 55, 76, 130, 131, 211, 243, 244, 296, 330]
Final: [5, 7, 8, 11, 16, 19, 25, 29, 55, 56, 66, 69, 76, 80, 86, 93, 97, 117, 118, 131, 136, 142, 143, 147, 163, 171, 174, 184, 185, 187, 211, 238, 241, 243, 245, 249, 252, 274, 276, 282, 291, 295, 296, 297, 298, 301, 308, 315, 329, 330, 333]
2
Final: [6, 21, 79, 105, 117, 121, 211, 243, 250, 274, 291, 296, 297, 313, 330]
Final: [3, 11, 19, 37, 55, 56, 75, 95, 97, 106, 108, 109, 115, 117, 124, 128, 131, 133, 136, 142, 144, 145, 147, 171, 174, 184, 187, 204, 207, 209, 210, 211, 222, 232, 238, 241, 243, 248, 249, 250, 255, 258, 259, 266, 274, 281, 291, 293, 294, 296, 297, 298, 300, 308, 313, 330, 331, 333]
3
Final: [5, 19, 25, 55, 76, 79, 108, 117, 131, 211, 243, 244, 286, 289, 296, 297, 313, 330]
Final: [5, 6, 16, 19, 22, 24, 36, 51, 55, 57, 59, 69, 74, 75, 94, 95, 102, 108, 109, 115, 124, 128, 130, 134, 136, 142, 145, 147, 153, 166, 170, 181, 183, 207, 209, 213, 216, 221, 223, 228, 238, 243, 245, 249, 250, 256, 261, 277, 280, 286, 291, 292, 294, 297, 298, 305, 315, 329, 330, 333]
4
Final: [2, 19, 55, 79, 105, 117, 121, 211, 243, 244, 286, 289, 296, 297, 330]
Final: [2, 5, 15, 19, 20, 55, 63, 69, 79, 93, 115, 117, 120, 128, 136, 142, 147, 151, 165, 168, 169, 171, 184, 193, 209, 215, 216, 232, 238, 243, 244, 249, 255, 258, 259, 268, 276, 284, 286, 291, 293, 294, 296, 297, 301, 308, 311, 315, 329, 330, 333]
5
Final: [6, 75, 93, 97, 105, 117, 171, 211, 216, 243, 289, 295, 297, 330]
Final: [2, 16, 18, 19, 21, 22, 23, 55, 56, 68, 74, 75, 81, 91, 93, 105, 115, 123, 124, 127, 128, 130, 131, 135, 142, 143, 147, 150, 155, 161, 174, 189, 196, 200, 207, 209, 213, 216, 221, 228, 232, 236, 243, 244, 248, 249, 255, 262, 267, 268, 270, 276, 289, 295, 297, 298, 301, 307, 309, 312, 313, 325, 329, 330, 333]
6
Final: [19, 55, 75, 93, 97, 103, 142, 155, 211, 241, 242, 243, 289, 297, 330]
Final: [2, 3, 16, 19, 25, 29, 53, 55, 74, 75, 81, 86, 94, 97, 103, 130, 142, 164, 170, 184, 208, 209, 232, 238, 242, 243, 245, 248, 249, 255, 259, 274, 275, 280, 284, 291, 295, 297, 298, 301, 308, 313, 329, 330, 333]
7
Final: [76, 78, 117, 120, 131, 174, 211, 216, 243, 249, 291, 296, 297, 313, 330]
Final: [0, 5, 31, 36, 42, 51, 55, 61, 69, 75, 81, 93, 115, 117, 120, 123, 128, 140, 142, 163, 170, 172, 174, 184, 208, 209, 216, 221, 224, 232, 238, 243, 244, 245, 249, 255, 257, 274, 289, 290, 294, 296, 297, 315, 325, 329, 330, 333]
8
Final: [0, 55, 76, 93, 117, 131, 155, 211, 243, 244, 289, 296, 297, 313, 330]
Final: [0, 5, 6, 11, 28, 31, 34, 36, 55, 68, 76, 93, 117, 120, 122, 127, 128, 136, 142, 147, 150, 151, 164, 172, 174, 184, 187, 193, 199, 208, 209, 211, 216, 229, 232, 238, 243, 244, 245, 249, 255, 259, 274, 280, 281, 291, 295, 297, 298, 301, 309, 311, 313, 314, 329, 330, 333]
9
Final: [0, 55, 76, 94, 105, 117, 155, 211, 243, 244, 296, 297, 313, 330]
Final: [5, 11, 12, 16, 19, 22, 24, 25, 55, 57, 59, 68, 76, 78, 93, 100, 114, 115, 117, 120, 123, 127, 128, 142, 150, 154, 160, 168, 169, 170, 174, 176, 177, 184, 199, 207, 209, 211, 216, 231, 243, 245, 249, 255, 256, 259, 273, 275, 280, 281, 291, 294, 296, 297, 298, 301, 309, 313, 314, 329, 330, 333, 334]
# Mean squared prediction error of each method over all held-out blocks
# (the block label column is dropped; the "Y" column is compared with
# itself and therefore shows 0).
prev = PREV.iloc[:,1:]
round((prev.sub(PREV.Y, axis=0)**2).mean(),2)
Y 0.00
MCO 193.29
BIC 153.88
AIC 164.73
ridge 156.45
lasso 154.41
elast 154.17
pls 160.23
pcr 164.32
dtype: float64