11 Comparaison des différentes méthodes, étude de cas réels

import pandas as pd; import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, Ridge, \
ElasticNet, Lasso
from sklearn.linear_model import RidgeCV, ElasticNetCV, LassoCV
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from patsy import dmatrix

import sys
sys.path.append('../modules')
import ols_step_sk

Erreur de prévision et validation croisée

Analyse de l’ozone

# Full ozone data set: ';'-separated text file, header on the first line,
# first column used as the row index.
ozone = pd.read_csv(
    "../donnees/ozone_complet.txt",
    sep=";",
    header=0,
    index_col=0,
)
ozone.shape

(1464, 23)

# Keep only the complete rows of `ozone`.
lesna = ozone.isna()        # cell-wise missing-value indicator
# Vectorised row-wise "any" (replaces a per-row call to the Python builtin).
indNA = lesna.any(axis=1)   # True for rows holding at least one missing value
ozone2 = ozone[~indNA]
ozone2.shape

(1366, 23)

# Transformed ozone data used by every model comparison below.
don = pd.read_csv(
    "../donnees/ozone_transf.txt",
    sep=";",
    header=0,
    index_col=0,
)
don.shape
# The response column maxO3 is called Y from here on.
don.rename(columns={"maxO3": "Y"}, inplace=True)
Y = don["Y"].to_numpy()

# Outer cross-validation blocks: labels 0 .. nb-1 are repeated cyclically over
# the rows of `don`, then drawn without replacement with a seeded generator.
nb = 10
tmp = np.arange(len(don)) % nb

rng = np.random.default_rng(seed=1234)
bloc = rng.choice(tmp, size=tmp.size, replace=False)

# Prediction table: block label, observed response, then one column per
# method, initialised to 0.0 and filled in by the cross-validation loops.
methodes = ["MCO", "BIC", "AIC", "ridge", "lasso", "elast", "pls", "pcr"]
PREV = pd.DataFrame(
    {"bloc": bloc, "Y": don["Y"], **dict.fromkeys(methodes, 0.0)}
)

# Explanatory variables: every column of `don` except the response.
nomsvar = don.columns.difference(["Y"]).tolist()
# Additive design matrix built by patsy; its first column (patsy's implicit
# intercept) is removed when extracting X.
formule = "~" + "+".join(nomsvar)
dsX = dmatrix(formule, don)
X = np.asarray(dsX)[:, 1:]
Y = don["Y"].to_numpy()

# Inner cross-validation splitters (seeded): 10 folds for the penalised
# regressions, 4 folds for choosing the number of components.
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
nbaxes = 20  # the component grids below run from 1 to nbaxes - 1

# Elementary steps
cr = StandardScaler()  # one scaler instance, first step of all four pipelines
lassocv = LassoCV(cv=kfregul, n_jobs=3, max_iter=1000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3, max_iter=1000)
ridge = Ridge()
acp = PCA()
reg = LinearRegression()
regpls = PLSRegression()

# Pipelines: standardise, then estimate
pipe_lassocv = Pipeline([("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline([("cr", cr), ("enetcv", enetcv)])
pipe_ridge = Pipeline([("cr", cr), ("ridge", ridge)])
pipe_pcr = Pipeline([("cr", cr), ("acp", acp), ("reg", reg)])

# Handle on the lasso step: once fitted, its alphas_ path is reused to build
# the ridge penalty grid.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Grids for the number of components (PCR inside its pipeline, PLS directly)
param_grid_pcr = {"acp__n_components": list(range(1, nbaxes))}
param_grid_pls = {"n_components": list(range(1, nbaxes))}

# Outer cross-validation over the nb blocks: every method is fitted on the
# rows outside block i and its predictions for block i are stored in PREV.
for i in np.arange(nb):
    print(i)
    dans_test = bloc == i  # rows of the held-out block
    # learning / test split
    Xapp, Xtest = X[~dans_test, :], X[dans_test, :]
    Yapp, Ytest = don[~dans_test]["Y"], don[dans_test]["Y"]
    # ordinary least squares on every column
    reg = LinearRegression()
    reg.fit(Xapp, Yapp)
    PREV.loc[dans_test, "MCO"] = reg.predict(Xtest)
    # selection with the BIC criterion (project module ols_step_sk;
    # presumably the source of the printed "Final: [...]" lines)
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="bic"
    )
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "BIC"] = reg_bic.predict(Xtest)
    # same selection with the AIC criterion
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="aic"
    )
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "AIC"] = reg_aic.predict(Xtest)
    # lasso (penalty chosen by LassoCV inside the pipeline)
    pipe_lassocv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "lasso"] = pipe_lassocv.predict(Xtest)
    # elastic net
    pipe_enetcv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "elast"] = pipe_enetcv.predict(Xtest)
    # ridge: candidate penalties are 100 times the alphas_ of the lasso step
    # fitted just above, so this must stay after the lasso fit
    path_ridge = 100 * etape_lasso.alphas_
    param_grid_ridge = {"ridge__alpha": path_ridge}
    cv_ridge = GridSearchCV(
        pipe_ridge,
        param_grid_ridge,
        cv=kfregul,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_ridge.fit(Xapp, Yapp)
    PREV.loc[dans_test, "ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components picked by grid search
    cv_pcr = GridSearchCV(
        pipe_pcr,
        param_grid_pcr,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pcr.fit(Xapp, Yapp)
    cv_pls = GridSearchCV(
        regpls,
        param_grid_pls,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pls.fit(Xapp, Yapp)
    PREV.loc[dans_test, "pcr"] = cv_pcr.predict(Xtest)
    PREV.loc[dans_test, "pls"] = cv_pls.predict(Xtest)

0
Final: [0, 4, 5, 8, 11, 20]
Final: [0, 4, 5, 6, 8, 11, 18, 19, 20]
1
Final: [0, 4, 5, 8, 10, 20]
Final: [0, 4, 5, 6, 8, 10, 11, 12, 19, 20]
2
Final: [0, 5, 8, 11, 20]
Final: [0, 4, 5, 6, 8, 11, 12, 13, 20]
3
Final: [0, 4, 5, 8, 11, 20]
Final: [0, 4, 5, 6, 8, 11, 12, 18, 19, 20]
4
Final: [0, 5, 8, 10, 20]
Final: [0, 4, 5, 6, 8, 10, 20]
5
Final: [0, 5, 8, 10, 20]
Final: [0, 4, 5, 6, 8, 10, 18, 19, 20]
6
Final: [0, 5, 8, 10, 20]
Final: [0, 1, 4, 5, 8, 10, 11, 12, 15, 19, 20]
7
Final: [0, 5, 8, 10, 20]
Final: [0, 1, 4, 5, 8, 10, 11, 12, 15, 19, 20]
8
Final: [0, 5, 8, 10, 20]
Final: [0, 1, 4, 5, 8, 10, 19, 20]
9
Final: [0, 5, 8, 11, 20]
Final: [0, 4, 5, 6, 8, 11, 12, 18, 19, 20]

# Mean squared prediction error of every column of PREV except `bloc`
# (the Y column is compared with itself), rounded to two decimals.
prev = PREV.iloc[:, 1:]
ecarts = prev.sub(PREV.Y, axis=0)
np.round((ecarts**2).mean(), 2)

Y          0.00
MCO      187.51
BIC      188.23
AIC      187.74
ridge    187.43
lasso    187.05
elast    187.20
pls      187.92
pcr      188.22
dtype: float64

Transformation des variables : feature engineering

Modèles de prévision avec interactions

# Design matrix with the main effects and all pairwise interactions of the
# explanatory variables; the first column (patsy's intercept) is dropped.
# The matrix is built once: the former first build (via a DataFrame) was
# immediately overwritten by this equivalent one.
formuleI = "1 + (" + "+".join(nomsvar) + ")**2"
Xq = dmatrix(formuleI, don)
Xinter = np.asarray(Xq)[:, 1:]
Xinter.shape

(1366, 231)

# Inner cross-validation splitters (seeded): 10 folds for the penalised
# regressions, 4 folds for choosing the number of components.
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
nbaxes = 40  # the component grids below run from 1 to nbaxes - 1

# Elementary steps
cr = StandardScaler()  # one scaler instance, first step of all four pipelines
lassocv = LassoCV(cv=kfregul, n_jobs=3, max_iter=5000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3, max_iter=5000)
ridge = Ridge()
acp = PCA()
reg = LinearRegression()
regpls = PLSRegression()

# Pipelines: standardise, then estimate
pipe_lassocv = Pipeline([("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline([("cr", cr), ("enetcv", enetcv)])
pipe_ridge = Pipeline([("cr", cr), ("ridge", ridge)])
pipe_pcr = Pipeline([("cr", cr), ("acp", acp), ("reg", reg)])

# Handle on the lasso step: once fitted, its alphas_ path is reused to build
# the ridge penalty grid.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Grids for the number of components (PCR inside its pipeline, PLS directly)
param_grid_pcr = {"acp__n_components": list(range(1, nbaxes))}
param_grid_pls = {"n_components": list(range(1, nbaxes))}
 
# Outer cross-validation on the interaction design Xinter: every method is
# fitted on the rows outside block i and predicts block i into PREV.
for i in np.arange(nb):
    print(i)
    dans_test = bloc == i  # rows of the held-out block
    # learning / test split
    Xapp, Xtest = Xinter[~dans_test, :], Xinter[dans_test, :]
    Yapp, Ytest = don[~dans_test]["Y"], don[dans_test]["Y"]
    # ordinary least squares on every column
    reg = LinearRegression()
    reg.fit(Xapp, Yapp)
    PREV.loc[dans_test, "MCO"] = reg.predict(Xtest)
    # selection with the BIC criterion (project module ols_step_sk;
    # presumably the source of the printed "Final: [...]" lines)
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="bic"
    )
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "BIC"] = reg_bic.predict(Xtest)
    # same selection with the AIC criterion
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="aic"
    )
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "AIC"] = reg_aic.predict(Xtest)
    # lasso (penalty chosen by LassoCV inside the pipeline)
    pipe_lassocv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "lasso"] = pipe_lassocv.predict(Xtest)
    # elastic net
    pipe_enetcv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "elast"] = pipe_enetcv.predict(Xtest)
    # ridge: candidate penalties are 100 times the alphas_ of the lasso step
    # fitted just above, so this must stay after the lasso fit
    path_ridge = 100 * etape_lasso.alphas_
    param_grid_ridge = {"ridge__alpha": path_ridge}
    cv_ridge = GridSearchCV(
        pipe_ridge,
        param_grid_ridge,
        cv=kfregul,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_ridge.fit(Xapp, Yapp)
    PREV.loc[dans_test, "ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components picked by grid search
    cv_pcr = GridSearchCV(
        pipe_pcr,
        param_grid_pcr,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pcr.fit(Xapp, Yapp)
    cv_pls = GridSearchCV(
        regpls,
        param_grid_pls,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pls.fit(Xapp, Yapp)
    PREV.loc[dans_test, "pcr"] = cv_pcr.predict(Xtest)
    PREV.loc[dans_test, "pls"] = cv_pls.predict(Xtest)

0
Final: [5, 20, 21, 47, 98, 111, 125, 139, 142, 164, 185, 228]
Final: [1, 4, 5, 8, 13, 20, 21, 38, 41, 49, 62, 64, 82, 90, 98, 108, 110, 111, 115, 125, 127, 137, 139, 142, 143, 150, 152, 164, 165, 168, 192, 206, 221, 229]
1
Final: [1, 5, 8, 20, 21, 98, 111, 125, 139, 142, 164, 185, 193]
Final: [1, 5, 7, 8, 9, 13, 20, 21, 22, 29, 38, 40, 41, 44, 46, 49, 73, 77, 98, 101, 108, 111, 114, 115, 125, 127, 137, 142, 143, 145, 150, 164, 165, 167, 183, 185, 193, 202, 206, 229]
2
Final: [6, 10, 21, 40, 59, 95, 111, 139, 142, 153]
Final: [2, 4, 6, 8, 10, 11, 13, 20, 21, 38, 41, 42, 59, 62, 64, 75, 85, 90, 91, 98, 100, 109, 110, 111, 116, 118, 123, 126, 128, 137, 139, 140, 144, 149, 150, 153, 164, 166, 172, 183, 185, 194, 202, 205, 209, 221, 229]
3
Final: [6, 9, 14, 20, 22, 53, 97, 111, 119, 164, 192]
Final: [6, 7, 8, 9, 13, 14, 20, 21, 41, 47, 53, 59, 61, 63, 64, 66, 85, 98, 101, 104, 111, 112, 119, 130, 137, 139, 143, 150, 158, 164, 168, 176, 192, 195, 221, 229, 230]

4
Final: [7, 10, 20, 27, 41, 59, 77, 115, 125, 126, 164]
Final: [2, 4, 7, 13, 20, 24, 37, 41, 42, 43, 49, 59, 85, 90, 97, 101, 110, 115, 125, 126, 132, 137, 139, 142, 150, 154, 164, 183, 185, 190, 191, 193, 206, 209, 221, 229]
5
Final: [6, 10, 20, 21, 59, 111, 113, 115]
Final: [4, 6, 8, 10, 11, 13, 20, 21, 22, 24, 41, 57, 59, 90, 97, 110, 111, 113, 115, 126, 128, 137, 142, 145, 150, 162, 164, 165, 183, 185, 194, 203, 209, 211, 221, 229]
6
Final: [4, 6, 20, 24, 28, 41, 77, 97, 115, 125, 126, 164, 185, 192, 205]
Final: [4, 6, 13, 20, 21, 24, 28, 41, 49, 59, 77, 90, 97, 110, 111, 115, 117, 118, 125, 126, 128, 137, 144, 150, 164, 167, 183, 185, 190, 191, 194, 206, 209, 221, 226]
7
Final: [5, 20, 21, 111, 125, 139, 142, 164, 185]
Final: [1, 4, 5, 8, 13, 20, 21, 41, 49, 77, 80, 90, 98, 101, 110, 111, 115, 116, 117, 125, 127, 130, 131, 137, 139, 149, 150, 154, 155, 156, 157, 161, 164, 167, 168, 185, 192, 201, 202, 221, 223, 227]
8
Final: [4, 7, 10, 20, 21, 59, 96, 115, 125, 126, 164]
Final: [4, 5, 11, 13, 20, 21, 24, 29, 38, 41, 51, 59, 90, 96, 97, 110, 111, 115, 117, 125, 128, 130, 137, 139, 142, 150, 152, 157, 162, 164, 167, 168, 221, 226, 229]

9
Final: [1, 5, 10, 20, 21, 111, 115, 125, 139, 164, 191, 206]
Final: [1, 2, 4, 5, 7, 13, 20, 21, 24, 37, 38, 41, 47, 62, 64, 73, 79, 81, 91, 95, 97, 110, 111, 115, 117, 122, 125, 127, 130, 131, 137, 139, 142, 145, 150, 166, 172, 192, 205, 229]

# Mean squared prediction error of every column of PREV except `bloc`
# (the Y column is compared with itself), rounded to two decimals.
prev = PREV.iloc[:, 1:]
ecarts = prev.sub(PREV.Y, axis=0)
round((ecarts**2).mean(), 2)

Y          0.00
MCO      187.73
BIC      168.28
AIC      168.38
ridge    165.13
lasso    161.60
elast    164.25
pls      168.91
pcr      173.75
dtype: float64

Modèles de prévision avec des polynômes

# Polynomial design: the columns of X, followed by their squares and cubes.
Xcar, Xcub = X**2, X**3
puissances = [X, Xcar, Xcub]
Xpol = np.concatenate(puissances, axis=1)
Xpol.shape

(1366, 63)

# Inner cross-validation splitters (seeded): 10 folds for the penalised
# regressions, 4 folds for choosing the number of components.
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
nbaxes = 40  # the component grids below run from 1 to nbaxes - 1

# Elementary steps
cr = StandardScaler()  # one scaler instance, first step of all four pipelines
lassocv = LassoCV(cv=kfregul, n_jobs=3, max_iter=3000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3, max_iter=3000)
ridge = Ridge()
acp = PCA()
reg = LinearRegression()
regpls = PLSRegression()

# Pipelines: standardise, then estimate
pipe_lassocv = Pipeline([("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline([("cr", cr), ("enetcv", enetcv)])
pipe_ridge = Pipeline([("cr", cr), ("ridge", ridge)])
pipe_pcr = Pipeline([("cr", cr), ("acp", acp), ("reg", reg)])

# Handle on the lasso step: once fitted, its alphas_ path is reused to build
# the ridge penalty grid.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Grids for the number of components (PCR inside its pipeline, PLS directly)
param_grid_pcr = {"acp__n_components": list(range(1, nbaxes))}
param_grid_pls = {"n_components": list(range(1, nbaxes))}
 
# Outer cross-validation on the polynomial design Xpol: every method is
# fitted on the rows outside block i and predicts block i into PREV.
for i in np.arange(nb):
    print(i)
    dans_test = bloc == i  # rows of the held-out block
    # learning / test split
    Xapp, Xtest = Xpol[~dans_test, :], Xpol[dans_test, :]
    Yapp, Ytest = don[~dans_test]["Y"], don[dans_test]["Y"]
    # ordinary least squares on every column
    reg = LinearRegression()
    reg.fit(Xapp, Yapp)
    PREV.loc[dans_test, "MCO"] = reg.predict(Xtest)
    # selection with the BIC criterion (project module ols_step_sk;
    # presumably the source of the printed "Final: [...]" lines)
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="bic"
    )
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "BIC"] = reg_bic.predict(Xtest)
    # same selection with the AIC criterion
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="aic"
    )
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "AIC"] = reg_aic.predict(Xtest)
    # lasso (penalty chosen by LassoCV inside the pipeline)
    pipe_lassocv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "lasso"] = pipe_lassocv.predict(Xtest)
    # elastic net
    pipe_enetcv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "elast"] = pipe_enetcv.predict(Xtest)
    # ridge: candidate penalties are 100 times the alphas_ of the lasso step
    # fitted just above, so this must stay after the lasso fit
    path_ridge = 100 * etape_lasso.alphas_
    param_grid_ridge = {"ridge__alpha": path_ridge}
    cv_ridge = GridSearchCV(
        pipe_ridge,
        param_grid_ridge,
        cv=kfregul,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_ridge.fit(Xapp, Yapp)
    PREV.loc[dans_test, "ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components picked by grid search
    cv_pcr = GridSearchCV(
        pipe_pcr,
        param_grid_pcr,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pcr.fit(Xapp, Yapp)
    cv_pls = GridSearchCV(
        regpls,
        param_grid_pls,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pls.fit(Xapp, Yapp)
    PREV.loc[dans_test, "pcr"] = cv_pcr.predict(Xtest)
    PREV.loc[dans_test, "pls"] = cv_pls.predict(Xtest)

0
Final: [11, 20, 21, 26, 29, 43, 47]
Final: [0, 1, 4, 11, 19, 20, 21, 26, 27, 29, 31, 34, 35, 38, 42, 43, 47, 52, 53, 56]
1
Final: [11, 20, 21, 26, 29, 43, 47, 53]
Final: [0, 1, 4, 8, 11, 12, 13, 19, 20, 21, 22, 26, 29, 31, 34, 35, 38, 42, 47, 48, 52, 53, 56]

2
Final: [5, 11, 20, 29, 38, 42, 47, 48]
Final: [0, 1, 4, 5, 8, 11, 13, 19, 20, 21, 22, 29, 31, 34, 35, 38, 42, 47, 48, 52, 53, 56]
3
Final: [5, 11, 20, 21, 29, 33, 38, 47, 48, 53]
Final: [1, 5, 10, 11, 12, 19, 20, 21, 29, 31, 33, 34, 35, 38, 43, 46, 47, 48, 53, 54, 56]
4
Final: [11, 20, 21, 26, 29, 43, 47]
Final: [0, 1, 4, 8, 10, 11, 18, 19, 20, 21, 26, 27, 29, 31, 34, 35, 38, 42, 43, 47, 56, 60]

5
Final: [5, 11, 20, 29, 38, 42, 47, 48, 53]
Final: [0, 1, 4, 5, 8, 10, 11, 19, 20, 21, 22, 29, 34, 35, 38, 41, 42, 47, 48, 53, 56, 62]

6
Final: [5, 11, 20, 21, 29, 43, 47]
Final: [0, 5, 8, 11, 12, 19, 20, 21, 27, 29, 31, 34, 35, 38, 42, 43, 46, 47, 52, 53, 56]

7
Final: [11, 20, 21, 26, 29, 43, 47]
Final: [1, 11, 12, 19, 20, 21, 26, 27, 29, 31, 34, 35, 38, 43, 46, 47, 52, 53, 56]

8
Final: [5, 11, 20, 21, 29, 43, 47]
Final: [0, 1, 5, 11, 19, 20, 21, 22, 29, 33, 34, 35, 38, 42, 46, 47, 48, 52, 53, 56]
9
Final: [0, 1, 5, 11, 16, 20, 29, 38, 43, 47, 53]
Final: [0, 1, 5, 8, 10, 11, 19, 20, 21, 22, 29, 31, 34, 35, 38, 42, 46, 47, 48, 53, 56]

# Mean squared prediction error of every column of PREV except `bloc`
# (the Y column is compared with itself), rounded to two decimals.
prev = PREV.iloc[:, 1:]
ecarts = prev.sub(PREV.Y, axis=0)
round((ecarts**2).mean(), 2)

Y          0.00
MCO      165.40
BIC      167.61
AIC      164.87
ridge    165.08
lasso    163.90
elast    164.76
pls      166.16
pcr      169.63
dtype: float64

Modèles de prévision avec des splines

# Spline expansion of every explanatory variable: for each one, a degree-3
# `bs` basis with knots at its three quartiles, built by patsy without
# intercept. The per-variable bases are collected, then joined in one pass.
morceaux = [np.ones((X.shape[0], 1))]  # placeholder first column, removed below
for nom in nomsvar:
    # the name `xi` is referenced inside the formula string, keep it as is
    xi = don.loc[:, nom].quantile([0.25, 0.5, 0.75])
    formule = "-1 + bs(" + nom + ",knots=xi, degree=3)"
    BX = dmatrix(formule, don)
    morceaux.append(BX)
Xp = np.concatenate(morceaux, axis=1)

Xspline = Xp[:, 1:]

# Inner cross-validation splitters (seeded): 10 folds for the penalised
# regressions, 4 folds for choosing the number of components.
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
nbaxes = 40  # the component grids below run from 1 to nbaxes - 1

# Elementary steps
cr = StandardScaler()  # one scaler instance, first step of all four pipelines
lassocv = LassoCV(cv=kfregul, n_jobs=3, max_iter=3000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3, max_iter=3000)
ridge = Ridge()
acp = PCA()
reg = LinearRegression()
regpls = PLSRegression()

# Pipelines: standardise, then estimate
pipe_lassocv = Pipeline([("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline([("cr", cr), ("enetcv", enetcv)])
pipe_ridge = Pipeline([("cr", cr), ("ridge", ridge)])
pipe_pcr = Pipeline([("cr", cr), ("acp", acp), ("reg", reg)])

# Handle on the lasso step: once fitted, its alphas_ path is reused to build
# the ridge penalty grid.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Grids for the number of components (PCR inside its pipeline, PLS directly)
param_grid_pcr = {"acp__n_components": list(range(1, nbaxes))}
param_grid_pls = {"n_components": list(range(1, nbaxes))}
 
# Outer cross-validation on the spline design Xspline: every method is
# fitted on the rows outside block i and predicts block i into PREV.
for i in np.arange(nb):
    print(i)
    dans_test = bloc == i  # rows of the held-out block
    # learning / test split
    Xapp, Xtest = Xspline[~dans_test, :], Xspline[dans_test, :]
    Yapp, Ytest = don[~dans_test]["Y"], don[dans_test]["Y"]
    # ordinary least squares on every column
    reg = LinearRegression()
    reg.fit(Xapp, Yapp)
    PREV.loc[dans_test, "MCO"] = reg.predict(Xtest)
    # selection with the BIC criterion (project module ols_step_sk;
    # presumably the source of the printed "Final: [...]" lines)
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="bic"
    )
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "BIC"] = reg_bic.predict(Xtest)
    # same selection with the AIC criterion
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="aic"
    )
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "AIC"] = reg_aic.predict(Xtest)
    # lasso (penalty chosen by LassoCV inside the pipeline)
    pipe_lassocv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "lasso"] = pipe_lassocv.predict(Xtest)
    # elastic net
    pipe_enetcv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "elast"] = pipe_enetcv.predict(Xtest)
    # ridge: candidate penalties are 100 times the alphas_ of the lasso step
    # fitted just above, so this must stay after the lasso fit
    path_ridge = 100 * etape_lasso.alphas_
    param_grid_ridge = {"ridge__alpha": path_ridge}
    cv_ridge = GridSearchCV(
        pipe_ridge,
        param_grid_ridge,
        cv=kfregul,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_ridge.fit(Xapp, Yapp)
    PREV.loc[dans_test, "ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components picked by grid search
    cv_pcr = GridSearchCV(
        pipe_pcr,
        param_grid_pcr,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pcr.fit(Xapp, Yapp)
    cv_pls = GridSearchCV(
        regpls,
        param_grid_pls,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pls.fit(Xapp, Yapp)
    PREV.loc[dans_test, "pcr"] = cv_pcr.predict(Xtest)
    PREV.loc[dans_test, "pls"] = cv_pls.predict(Xtest)

0
Final: [3, 5, 29, 34, 35, 39, 51, 53, 67, 87, 120, 121, 124]
Final: [1, 3, 5, 7, 8, 23, 26, 28, 29, 34, 35, 39, 48, 51, 52, 53, 65, 67, 70, 73, 80, 87, 88, 92, 104, 107, 108, 120, 121, 123, 124, 125]
1
Final: [3, 5, 34, 39, 41, 48, 51, 52, 87, 120, 121, 124]
Final: [0, 1, 3, 5, 7, 8, 23, 26, 28, 33, 34, 39, 40, 41, 46, 48, 49, 51, 52, 54, 59, 62, 65, 67, 70, 80, 82, 87, 88, 99, 102, 106, 120, 121, 123, 124, 125]
2
Final: [1, 3, 5, 7, 34, 39, 41, 49, 52, 87, 120, 121, 124]
Final: [0, 1, 3, 5, 7, 12, 23, 26, 27, 28, 29, 33, 34, 39, 40, 41, 46, 49, 51, 52, 59, 65, 70, 82, 84, 87, 88, 92, 104, 107, 120, 121, 124]
3
Final: [3, 5, 7, 28, 34, 41, 51, 52, 70, 77, 87, 88, 120, 121, 124]
Final: [0, 3, 5, 7, 8, 23, 26, 28, 31, 34, 38, 39, 40, 41, 44, 48, 49, 51, 52, 53, 56, 62, 67, 70, 73, 77, 80, 87, 88, 99, 103, 104, 122, 123, 124, 125]
4
Final: [3, 5, 34, 39, 41, 51, 52, 87, 120, 121, 124]
Final: [0, 1, 3, 5, 7, 8, 23, 25, 26, 27, 28, 29, 33, 34, 35, 39, 40, 41, 45, 46, 49, 51, 52, 59, 65, 67, 70, 75, 80, 87, 88, 92, 102, 106, 120, 121, 123, 124]
5
Final: [3, 5, 34, 39, 41, 48, 51, 52, 87, 104, 120, 121, 123, 124, 125]
Final: [0, 2, 4, 5, 7, 8, 23, 26, 28, 29, 34, 35, 39, 40, 41, 46, 48, 49, 51, 52, 59, 65, 67, 70, 73, 80, 87, 88, 92, 104, 122, 123, 124, 125]
6
Final: [1, 3, 5, 7, 28, 34, 39, 40, 41, 49, 52, 87, 120, 121, 124]
Final: [0, 1, 3, 5, 7, 26, 28, 33, 34, 39, 40, 41, 46, 48, 49, 51, 52, 59, 65, 70, 80, 87, 88, 92, 104, 108, 109, 117, 122, 123, 124, 125]
7
Final: [2, 7, 26, 28, 34, 39, 41, 49, 52, 65, 70, 87, 88, 120, 121, 124]
Final: [2, 7, 17, 23, 26, 27, 28, 29, 33, 34, 38, 39, 40, 41, 43, 46, 48, 49, 51, 52, 53, 65, 67, 70, 73, 80, 85, 87, 88, 92, 104, 106, 108, 122, 123, 124, 125]
8
Final: [3, 5, 7, 34, 39, 40, 41, 51, 52, 63, 120, 121, 124]
Final: [0, 2, 3, 4, 5, 7, 8, 23, 26, 28, 33, 34, 39, 40, 41, 46, 48, 51, 52, 59, 63, 65, 70, 80, 87, 88, 89, 92, 104, 105, 122, 123, 124, 125]
9
Final: [1, 3, 5, 7, 28, 34, 39, 41, 51, 53, 87, 104, 120, 121, 124]
Final: [0, 1, 2, 3, 5, 7, 8, 26, 28, 34, 39, 40, 41, 46, 48, 51, 52, 53, 63, 65, 70, 80, 87, 88, 89, 92, 104, 108, 122, 123, 124, 125]

# Mean squared prediction error of every column of PREV except `bloc`
# (the Y column is compared with itself), rounded to two decimals.
prev = PREV.iloc[:, 1:]
ecarts = prev.sub(PREV.Y, axis=0)
round((ecarts**2).mean(), 2)

Y          0.00
MCO      162.74
BIC      163.77
AIC      158.73
ridge    157.86
lasso    155.43
elast    156.55
pls      160.24
pcr      167.39
dtype: float64

Modèles de prévision avec des splines et des interactions

# Interaction columns of Xinter followed by the spline bases.
# Xinter starts with one main-effect column per column of X; those are
# skipped because Xspline already holds a spline basis for each variable.
# The slice starts at X.shape[1] (the first interaction column): the former
# hard-coded start at 22 also dropped that first interaction column.
Xsplineinter = np.concatenate((Xinter[:, X.shape[1]:], Xspline), axis=1)

# Inner cross-validation splitters (seeded): 10 folds for the penalised
# regressions, 4 folds for choosing the number of components.
kfregul = KFold(n_splits=10, shuffle=True, random_state=0)
kfaxes = KFold(n_splits=4, shuffle=True, random_state=0)
nbaxes = 40  # the component grids below run from 1 to nbaxes - 1

# Elementary steps
cr = StandardScaler()  # one scaler instance, first step of all four pipelines
lassocv = LassoCV(cv=kfregul, n_jobs=3, max_iter=3000)
enetcv = ElasticNetCV(cv=kfregul, n_jobs=3, max_iter=3000)
ridge = Ridge()
acp = PCA()
reg = LinearRegression()
regpls = PLSRegression()

# Pipelines: standardise, then estimate
pipe_lassocv = Pipeline([("cr", cr), ("lassocv", lassocv)])
pipe_enetcv = Pipeline([("cr", cr), ("enetcv", enetcv)])
pipe_ridge = Pipeline([("cr", cr), ("ridge", ridge)])
pipe_pcr = Pipeline([("cr", cr), ("acp", acp), ("reg", reg)])

# Handle on the lasso step: once fitted, its alphas_ path is reused to build
# the ridge penalty grid.
etape_lasso = pipe_lassocv.named_steps["lassocv"]

# Grids for the number of components (PCR inside its pipeline, PLS directly)
param_grid_pcr = {"acp__n_components": list(range(1, nbaxes))}
param_grid_pls = {"n_components": list(range(1, nbaxes))}
 
# Outer cross-validation on the spline + interaction design Xsplineinter:
# every method is fitted on the rows outside block i and predicts block i
# into PREV.
for i in np.arange(nb):
    print(i)
    dans_test = bloc == i  # rows of the held-out block
    # learning / test split
    Xapp, Xtest = Xsplineinter[~dans_test, :], Xsplineinter[dans_test, :]
    Yapp, Ytest = don[~dans_test]["Y"], don[dans_test]["Y"]
    # ordinary least squares on every column
    reg = LinearRegression()
    reg.fit(Xapp, Yapp)
    PREV.loc[dans_test, "MCO"] = reg.predict(Xtest)
    # selection with the BIC criterion (project module ols_step_sk;
    # presumably the source of the printed "Final: [...]" lines)
    inst_reg_bic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="bic"
    )
    reg_bic = inst_reg_bic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "BIC"] = reg_bic.predict(Xtest)
    # same selection with the AIC criterion
    inst_reg_aic = ols_step_sk.LinearRegressionSelectionFeatureIC(
        verbose=1, crit="aic"
    )
    reg_aic = inst_reg_aic.fit(X=Xapp, y=Yapp)
    PREV.loc[dans_test, "AIC"] = reg_aic.predict(Xtest)
    # lasso (penalty chosen by LassoCV inside the pipeline)
    pipe_lassocv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "lasso"] = pipe_lassocv.predict(Xtest)
    # elastic net
    pipe_enetcv.fit(Xapp, Yapp)
    PREV.loc[dans_test, "elast"] = pipe_enetcv.predict(Xtest)
    # ridge: candidate penalties are 100 times the alphas_ of the lasso step
    # fitted just above, so this must stay after the lasso fit
    path_ridge = 100 * etape_lasso.alphas_
    param_grid_ridge = {"ridge__alpha": path_ridge}
    cv_ridge = GridSearchCV(
        pipe_ridge,
        param_grid_ridge,
        cv=kfregul,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_ridge.fit(Xapp, Yapp)
    PREV.loc[dans_test, "ridge"] = cv_ridge.predict(Xtest)
    # PCR and PLS: number of components picked by grid search
    cv_pcr = GridSearchCV(
        pipe_pcr,
        param_grid_pcr,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pcr.fit(Xapp, Yapp)
    cv_pls = GridSearchCV(
        regpls,
        param_grid_pls,
        cv=kfaxes,
        scoring="neg_mean_squared_error",
        n_jobs=3,
    )
    cv_pls.fit(Xapp, Yapp)
    PREV.loc[dans_test, "pcr"] = cv_pcr.predict(Xtest)
    PREV.loc[dans_test, "pls"] = cv_pls.predict(Xtest)

0
Final: [2, 79, 117, 121, 131, 211, 243, 244, 249, 313, 330]
Final: [2, 4, 16, 45, 76, 79, 117, 121, 128, 140, 142, 147, 158, 161, 171, 174, 181, 184, 207, 209, 216, 217, 232, 236, 243, 244, 249, 253, 257, 291, 294, 296, 297, 301, 308, 313, 329, 330, 332, 333, 334]
1
Final: [5, 7, 19, 55, 76, 130, 131, 211, 243, 244, 296, 330]
Final: [5, 7, 8, 11, 16, 19, 25, 29, 55, 56, 66, 69, 76, 80, 86, 93, 97, 117, 118, 131, 136, 142, 143, 147, 163, 171, 174, 184, 185, 187, 211, 238, 241, 243, 245, 249, 252, 274, 276, 282, 291, 295, 296, 297, 298, 301, 308, 315, 329, 330, 333]
2
Final: [6, 21, 79, 105, 117, 121, 211, 243, 250, 274, 291, 296, 297, 313, 330]
Final: [3, 11, 19, 37, 55, 56, 75, 95, 97, 106, 108, 109, 115, 117, 124, 128, 131, 133, 136, 142, 144, 145, 147, 171, 174, 184, 187, 204, 207, 209, 210, 211, 222, 232, 238, 241, 243, 248, 249, 250, 255, 258, 259, 266, 274, 281, 291, 293, 294, 296, 297, 298, 300, 308, 313, 330, 331, 333]
3
Final: [5, 19, 25, 55, 76, 79, 108, 117, 131, 211, 243, 244, 286, 289, 296, 297, 313, 330]
Final: [5, 6, 16, 19, 22, 24, 36, 51, 55, 57, 59, 69, 74, 75, 94, 95, 102, 108, 109, 115, 124, 128, 130, 134, 136, 142, 145, 147, 153, 166, 170, 181, 183, 207, 209, 213, 216, 221, 223, 228, 238, 243, 245, 249, 250, 256, 261, 277, 280, 286, 291, 292, 294, 297, 298, 305, 315, 329, 330, 333]
4
Final: [2, 19, 55, 79, 105, 117, 121, 211, 243, 244, 286, 289, 296, 297, 330]
Final: [2, 5, 15, 19, 20, 55, 63, 69, 79, 93, 115, 117, 120, 128, 136, 142, 147, 151, 165, 168, 169, 171, 184, 193, 209, 215, 216, 232, 238, 243, 244, 249, 255, 258, 259, 268, 276, 284, 286, 291, 293, 294, 296, 297, 301, 308, 311, 315, 329, 330, 333]
5
Final: [6, 75, 93, 97, 105, 117, 171, 211, 216, 243, 289, 295, 297, 330]
Final: [2, 16, 18, 19, 21, 22, 23, 55, 56, 68, 74, 75, 81, 91, 93, 105, 115, 123, 124, 127, 128, 130, 131, 135, 142, 143, 147, 150, 155, 161, 174, 189, 196, 200, 207, 209, 213, 216, 221, 228, 232, 236, 243, 244, 248, 249, 255, 262, 267, 268, 270, 276, 289, 295, 297, 298, 301, 307, 309, 312, 313, 325, 329, 330, 333]
6
Final: [19, 55, 75, 93, 97, 103, 142, 155, 211, 241, 242, 243, 289, 297, 330]
Final: [2, 3, 16, 19, 25, 29, 53, 55, 74, 75, 81, 86, 94, 97, 103, 130, 142, 164, 170, 184, 208, 209, 232, 238, 242, 243, 245, 248, 249, 255, 259, 274, 275, 280, 284, 291, 295, 297, 298, 301, 308, 313, 329, 330, 333]
7
Final: [76, 78, 117, 120, 131, 174, 211, 216, 243, 249, 291, 296, 297, 313, 330]
Final: [0, 5, 31, 36, 42, 51, 55, 61, 69, 75, 81, 93, 115, 117, 120, 123, 128, 140, 142, 163, 170, 172, 174, 184, 208, 209, 216, 221, 224, 232, 238, 243, 244, 245, 249, 255, 257, 274, 289, 290, 294, 296, 297, 315, 325, 329, 330, 333]
8
Final: [0, 55, 76, 93, 117, 131, 155, 211, 243, 244, 289, 296, 297, 313, 330]
Final: [0, 5, 6, 11, 28, 31, 34, 36, 55, 68, 76, 93, 117, 120, 122, 127, 128, 136, 142, 147, 150, 151, 164, 172, 174, 184, 187, 193, 199, 208, 209, 211, 216, 229, 232, 238, 243, 244, 245, 249, 255, 259, 274, 280, 281, 291, 295, 297, 298, 301, 309, 311, 313, 314, 329, 330, 333]

9
Final: [0, 55, 76, 94, 105, 117, 155, 211, 243, 244, 296, 297, 313, 330]
Final: [5, 11, 12, 16, 19, 22, 24, 25, 55, 57, 59, 68, 76, 78, 93, 100, 114, 115, 117, 120, 123, 127, 128, 142, 150, 154, 160, 168, 169, 170, 174, 176, 177, 184, 199, 207, 209, 211, 216, 231, 243, 245, 249, 255, 256, 259, 273, 275, 280, 281, 291, 294, 296, 297, 298, 301, 309, 313, 314, 329, 330, 333, 334]

# Mean squared prediction error of every column of PREV except `bloc`
# (the Y column is compared with itself), rounded to two decimals.
prev = PREV.iloc[:, 1:]
ecarts = prev.sub(PREV.Y, axis=0)
round((ecarts**2).mean(), 2)

Y          0.00
MCO      193.29
BIC      153.88
AIC      164.73
ridge    156.45
lasso    154.41
elast    154.17
pls      160.23
pcr      164.32
dtype: float64

Retour au sommet