15 Comparison of the different supervised classification methods: real case studies

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, \
    LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics as sklm
from patsy import dmatrix

import sys
sys.path.append('../modules')
import logistic_step_sk as lss  # local module: forward/backward selection by information criteria

Analysis of the chd data

# Read the SAh (South African heart) data and rename the binary response chd to Y
don = pd.read_csv("../donnees/SAh.csv", header=0, sep=",")
don.rename(columns={"chd": "Y"}, inplace=True)
don.Y.value_counts()
Y
0    302
1    160
Name: count, dtype: int64
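The classes are imbalanced: 302 controls against 160 cases. For later reference, the trivial classifier that always predicts the majority class would misclassify exactly the 160 cases:

# baseline error rate of always predicting Y = 0 (about 0.346)
160 / 462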
Y = don["Y"].to_numpy()
PROB = pd.DataFrame({"Y":Y,"log":0.0,"BIC":0.0,"AIC":0.0,
                    "ridge":0.0,"lasso":0.0,"elast":0.0})
nb=10
skf = StratifiedKFold(n_splits=nb, shuffle=True, random_state=123)
nomsvar = don.columns.difference(["Y"])
formule = "~" + "+".join(nomsvar)
X = dmatrix(formule, don, return_type="dataframe").\
                                            iloc[:,1:].to_numpy()
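StratifiedKFold builds folds that preserve the 302/160 class ratio. A quick, optional sanity check prints the class counts of each validation fold:

# each validation fold should contain about 30 zeros and 16 ones
for _, val_index in skf.split(X, Y):
    print(np.bincount(Y[val_index]))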
def grille(X, y, pen="lasso", ng=400):
    """Decreasing grid of penalties, returned as C values for scikit-learn."""
    scalerX = StandardScaler().fit(X)
    Xcr = scalerX.transform(X)
    # largest useful penalty: lambda_max = max |X'(y - ybar)| / n
    l0 = np.abs(Xcr.transpose().dot(y - y.mean())).max()/X.shape[0]
    # ng log-spaced values from lambda_max down to lambda_max * 1e-4
    llc = np.linspace(0, -4, ng)
    if pen == "lasso":
        Cs = 1/ 0.9/ X.shape[0] / (l0*10**llc)
    elif pen == "ridge":
        Cs = 1/ 0.9/ X.shape[0] / ((l0*10**llc) * 100)
    elif pen == "enet":
        Cs = 1/ 0.9/ X.shape[0] / ((l0*10**llc) * 2)
    else:
        raise ValueError("pen must be 'lasso', 'ridge' or 'enet'")
    return Cs
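grille works on the penalty scale: the largest useful penalty lambda_max = max|X'(y - ybar)|/n, computed on standardized columns, is decreased over four orders of magnitude, then converted to scikit-learn's C, which is the inverse of the penalty strength (roughly C = 1/(n*lambda), here with an extra 1/0.9 factor). A minimal sketch on simulated data, only to check that the smallest C (strongest penalty) drives the lasso coefficients to zero; Xsim and ysim are made up:

# sketch on simulated data (not part of the case study)
rng = np.random.default_rng(0)
Xsim = rng.normal(size=(200, 5))
ysim = rng.binomial(1, 0.4, size=200)
Cs = grille(Xsim, ysim, "lasso", ng=5)
m = LogisticRegression(penalty="l1", C=Cs.min(), solver="saga",
                       max_iter=2000).fit(StandardScaler().fit_transform(Xsim), ysim)
print(m.coef_)  # expected: all coefficients at (or very near) zero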
# 10-fold CV: fit each method on the training folds, store out-of-fold probabilities
for app_index, val_index in skf.split(X,Y):
    Xapp = X[app_index,:]
    Xtest = X[val_index,:]
    Yapp = Y[app_index]
    log = LogisticRegression(penalty=None,solver="newton-cholesky").fit(Xapp,Yapp)
    PROB.loc[val_index,"log"] = log.predict_proba(Xtest)[:,1]
    ### bic
    choixbic = lss.LogisticRegressionSelectionFeatureIC(start=[], \
        direction="forward",crit="bic").fit(Xapp,Yapp)
    PROB.loc[val_index, "BIC"] = choixbic.predict_proba(Xtest)[:,1]
    ### aic
    choixaic = lss.LogisticRegressionSelectionFeatureIC(start=[], \
        direction="forward",crit="aic").fit(Xapp,Yapp)
    PROB.loc[val_index, "AIC"] = choixaic.predict_proba(Xtest)[:,1]
    ### lasso
    cr = StandardScaler()
    Cs_lasso = grille(Xapp,Yapp, "lasso")
    lassocv =  LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10,\
                 Cs=Cs_lasso,  solver="saga", max_iter=2000)
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_lassocv.fit(Xapp,Yapp)
    PROB.loc[val_index,"lasso"] = pipe_lassocv.predict_proba(Xtest)[:,1]
    ### elastic net
    cr = StandardScaler()
    Cs_enet = grille(Xapp,Yapp,"enet")
    enetcv=LogisticRegressionCV(cv=10,penalty="elasticnet",n_jobs=10,\
          l1_ratios=[0.5],Cs=Cs_enet,solver="saga",max_iter=2000)
    pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])
    pipe_enetcv.fit(Xapp,Yapp)
    PROB.loc[val_index,"elast"] = pipe_enetcv.predict_proba(Xtest)[:,1] 
    ### ridge
    cr = StandardScaler()
    Cs_ridge = grille(Xapp,Yapp,"ridge")
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", \
            Cs=Cs_ridge,  max_iter=1000)
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    pipe_ridgecv.fit(Xapp,Yapp)
    PROB.loc[val_index,"ridge"] = pipe_ridgecv.predict_proba(Xtest)[:,1]
round(PROB.iloc[0:4,:],3)
Y log BIC AIC ridge lasso elast
0 1 0.742 0.689 0.689 0.675 0.587 0.598
1 1 0.292 0.362 0.333 0.321 0.373 0.358
2 0 0.251 0.313 0.230 0.286 0.316 0.297
3 1 0.719 0.720 0.681 0.676 0.694 0.682
# misclassification rate of each method at threshold s = 0.5
mc = pd.Series(0.0, index=PROB.columns[1:])
s = 0.5
for i in range(mc.shape[0]):
    mc.iloc[i] = sklm.zero_one_loss(PROB.Y, PROB.iloc[:,i+1]>s)
round(mc,3)
log      0.281
BIC      0.281
AIC      0.279
ridge    0.266
lasso    0.279
elast    0.271
dtype: float64
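All six methods sit well below the 0.346 majority-class baseline; ridge is slightly ahead at this 0.5 threshold, but the spread across methods is only about 1.5 points.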

We now add the regularized methods, with the penalty parameter selected by cross-validated negative log-likelihood:

PROB = PROB.assign(LassoL=0.0)
PROB = PROB.assign(RidgeL=0.0)
PROB = PROB.assign(EnetL=0.0)
for app_index, val_index in skf.split(X,Y):
    Xapp = X[app_index,:]
    Yapp = Y[app_index]
    Xval = X[val_index,:]
    # grids
    Cs_lasso = grille(Xapp, Yapp, "lasso")
    Cs_ridge = grille(Xapp, Yapp, "ridge")
    Cs_enet = grille(Xapp, Yapp, "enet")
    # estimators
    cr = StandardScaler()
    lassocv = LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10,\
                Cs=Cs_lasso, solver="saga", max_iter=2000,\
                scoring="neg_log_loss")
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", n_jobs=10,\
                Cs=Cs_ridge, max_iter=1000, scoring="neg_log_loss")
    enetcv = LogisticRegressionCV(cv=10, penalty="elasticnet",\
                l1_ratios=[0.5], n_jobs=10, Cs=Cs_enet,\
                solver="saga", max_iter=2000, scoring="neg_log_loss")
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    pipe_enetcv = Pipeline(steps=[("cr", cr), ("enet", enetcv)])
    # fit
    pipe_lassocv.fit(Xapp, Yapp)
    pipe_ridgecv.fit(Xapp, Yapp)
    pipe_enetcv.fit(Xapp, Yapp)
    # predictions on the validation fold
    PROB.loc[val_index,"LassoL"] = pipe_lassocv.predict_proba(Xval)[:,1]
    PROB.loc[val_index,"RidgeL"] = pipe_ridgecv.predict_proba(Xval)[:,1]
    PROB.loc[val_index,"EnetL"] = pipe_enetcv.predict_proba(Xval)[:,1]

and by AUC:

PROB = PROB.assign(LassoA=0.0)
PROB = PROB.assign(RidgeA=0.0)
PROB = PROB.assign(EnetA=0.0)
for app_index, val_index in skf.split(X,Y):
    Xapp = X[app_index,:]
    Yapp = Y[app_index]
    Xval = X[val_index,:]
    # grids
    Cs_lasso = grille(Xapp, Yapp, "lasso")
    Cs_ridge = grille(Xapp, Yapp, "ridge")
    Cs_enet = grille(Xapp, Yapp, "enet")
    # estimators
    cr = StandardScaler()
    lassocv = LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10,\
                Cs=Cs_lasso, solver="saga", max_iter=2000,\
                scoring="roc_auc")
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", n_jobs=10,\
                Cs=Cs_ridge, max_iter=1000, scoring="roc_auc")
    enetcv = LogisticRegressionCV(cv=10, penalty="elasticnet",\
                l1_ratios=[0.5], n_jobs=10, Cs=Cs_enet,\
                solver="saga", max_iter=2000, scoring="roc_auc")
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    pipe_enetcv = Pipeline(steps=[("cr", cr), ("enet", enetcv)])
    # fit
    pipe_lassocv.fit(Xapp, Yapp)
    pipe_ridgecv.fit(Xapp, Yapp)
    pipe_enetcv.fit(Xapp, Yapp)
    # predictions on the validation fold
    PROB.loc[val_index,"LassoA"] = pipe_lassocv.predict_proba(Xval)[:,1]
    PROB.loc[val_index,"RidgeA"] = pipe_ridgecv.predict_proba(Xval)[:,1]
    PROB.loc[val_index,"EnetA"] = pipe_enetcv.predict_proba(Xval)[:,1]
round(PROB.iloc[0:4,:],2)
Y log BIC AIC ridge lasso elast LassoL RidgeL EnetL LassoA RidgeA EnetA
0 1 0.74 0.69 0.69 0.68 0.59 0.60 0.67 0.72 0.66 0.67 0.58 0.57
1 1 0.29 0.36 0.33 0.32 0.37 0.36 0.33 0.28 0.31 0.28 0.27 0.28
2 0 0.25 0.31 0.23 0.29 0.32 0.30 0.29 0.29 0.29 0.32 0.33 0.32
3 1 0.72 0.72 0.68 0.68 0.69 0.68 0.70 0.70 0.70 0.60 0.64 0.63
# out-of-fold AUC of each method
auc = pd.Series(0.0, index=PROB.columns[1:])
for i in range(auc.shape[0]):
    auc.iloc[i] = sklm.roc_auc_score(PROB.Y, PROB.iloc[:,i+1])
round(auc.sort_values(ascending=False)[:6],3)
ridge    0.779
elast    0.776
log      0.773
AIC      0.772
BIC      0.772
EnetL    0.770
dtype: float64
# ROC curves of the four best methods by AUC
fig, ax = plt.subplots(1,1)
noms = auc.sort_values(ascending=False)
for i, nom in enumerate(noms.index):
    if i<4:
        roc = sklm.RocCurveDisplay.from_predictions(PROB.Y, \
            PROB.loc[:,nom], ax=ax, name=nom, plot_chance_level=(i==0))
fig.tight_layout()

# confusion-matrix summaries at the fixed threshold s = 0.5
noms = PROB.columns[1:]
matsB = pd.DataFrame({"seuil": pd.Series(0.0, index=noms)})
s = .5
for nom in noms:
    matsB.loc[nom,"seuil"] = s
    confmat = sklm.confusion_matrix(PROB.Y, PROB.loc[:,nom]>=s)
    matsB.loc[nom, "tn"] = confmat[0,0]
    matsB.loc[nom, "tp"] = confmat[1,1]
    matsB.loc[nom, "fn"] = confmat[1,0]
    matsB.loc[nom, "fp"] = confmat[0,1]
    matsB.loc[nom,"sensitivity"] = confmat[1,1]/(confmat[1,1]+confmat[1,0])
    matsB.loc[nom,"specificity"] = confmat[0,0]/(confmat[0,0]+confmat[0,1])
    matsB.loc[nom,"accuracy"] = sklm.accuracy_score(PROB.Y, PROB.loc[:,nom]>=s)
print(matsB.round(3))
        seuil     tn    tp     fn    fp  sensitivity  specificity  accuracy
log       0.5  250.0  82.0   78.0  52.0        0.512        0.828     0.719
BIC       0.5  250.0  82.0   78.0  52.0        0.512        0.828     0.719
AIC       0.5  250.0  83.0   77.0  52.0        0.519        0.828     0.721
ridge     0.5  262.0  77.0   83.0  40.0        0.481        0.868     0.734
lasso     0.5  269.0  64.0   96.0  33.0        0.400        0.891     0.721
elast     0.5  268.0  69.0   91.0  34.0        0.431        0.887     0.729
LassoL    0.5  256.0  77.0   83.0  46.0        0.481        0.848     0.721
RidgeL    0.5  257.0  77.0   83.0  45.0        0.481        0.851     0.723
EnetL     0.5  260.0  76.0   84.0  42.0        0.475        0.861     0.727
LassoA    0.5  265.0  73.0   87.0  37.0        0.456        0.877     0.732
RidgeA    0.5  271.0  60.0  100.0  31.0        0.375        0.897     0.716
EnetA     0.5  269.0  68.0   92.0  33.0        0.425        0.891     0.729
# threshold chosen so that exactly 302 observations (the number of
# observed 0s) are predicted 0
matsN = pd.DataFrame({"seuil": pd.Series(0.0, index=noms)})
nbr0 = don.Y.value_counts()[0]
for nom in noms:
    tmp = PROB.loc[:,nom].sort_values(ascending=True)
    s = (tmp.iloc[nbr0-1]+tmp.iloc[nbr0])/2
    confmat = sklm.confusion_matrix(PROB.Y, PROB.loc[:,nom]>=s)
    matsN.loc[nom,"seuil"] = s
    matsN.loc[nom, "tn"] = confmat[0,0]
    matsN.loc[nom, "tp"] = confmat[1,1]
    matsN.loc[nom, "fn"] = confmat[1,0]
    matsN.loc[nom, "fp"] = confmat[0,1]
    matsN.loc[nom,"sensitivity"] = confmat[1,1]/(confmat[1,1]+confmat[1,0])
    matsN.loc[nom,"specificity"] = confmat[0,0]/(confmat[0,0]+confmat[0,1])
    matsN.loc[nom,"accuracy"] = sklm.accuracy_score(PROB.Y, PROB.loc[:,nom]>=s)
print(matsN.round(3))
        seuil     tn    tp    fn    fp  sensitivity  specificity  accuracy
log     0.431  236.0  94.0  66.0  66.0        0.588        0.781     0.714
BIC     0.444  235.0  93.0  67.0  67.0        0.581        0.778     0.710
AIC     0.441  236.0  94.0  66.0  66.0        0.588        0.781     0.714
ridge   0.418  237.0  95.0  65.0  65.0        0.594        0.785     0.719
lasso   0.428  236.0  94.0  66.0  66.0        0.588        0.781     0.714
elast   0.425  238.0  96.0  64.0  64.0        0.600        0.788     0.723
LassoL  0.442  235.0  93.0  67.0  67.0        0.581        0.778     0.710
RidgeL  0.432  235.0  93.0  67.0  67.0        0.581        0.778     0.710
EnetL   0.436  235.0  93.0  67.0  67.0        0.581        0.778     0.710
LassoA  0.431  236.0  94.0  66.0  66.0        0.588        0.781     0.714
RidgeA  0.418  236.0  94.0  66.0  66.0        0.588        0.781     0.714
EnetA   0.425  233.0  91.0  69.0  69.0        0.569        0.772     0.701
# threshold maximizing the Youden index (tpr - fpr) along the ROC curve
matsY = pd.DataFrame({"seuil": pd.Series(0.0, index=noms)})
for nom in noms:
    fpr, tpr, thr  = sklm.roc_curve(PROB.Y, PROB.loc[:,nom])
    ii = (tpr-fpr).argmax()
    s = thr[ii]
    matsY.loc[nom,"seuil"] = s
    confmat = sklm.confusion_matrix(PROB.Y, PROB.loc[:,nom]>=s)
    matsY.loc[nom, "tn"] = confmat[0,0]
    matsY.loc[nom, "tp"] = confmat[1,1]
    matsY.loc[nom, "fn"] = confmat[1,0]
    matsY.loc[nom, "fp"] = confmat[0,1]
    matsY.loc[nom,"sensitivity"] = tpr[ii]
    matsY.loc[nom,"sensitivity"] = 1-fpr[ii]
    matsY.loc[nom,"accuracy"] = sklm.accuracy_score(PROB.Y, PROB.loc[:,nom]>=s)
print(matsY.round(3))
        seuil     tn     tp    fn     fp  sensitivity  specificity  accuracy
log     0.292  188.0  128.0  32.0  114.0        0.800        0.623     0.684
BIC     0.280  182.0  129.0  31.0  120.0        0.806        0.603     0.673
AIC     0.300  196.0  124.0  36.0  106.0        0.775        0.649     0.693
ridge   0.311  192.0  129.0  31.0  110.0        0.806        0.636     0.695
lasso   0.360  210.0  118.0  42.0   92.0        0.738        0.695     0.710
elast   0.362  212.0  116.0  44.0   90.0        0.725        0.702     0.710
LassoL  0.348  202.0  118.0  42.0  100.0        0.738        0.669     0.693
RidgeL  0.330  197.0  121.0  39.0  105.0        0.756        0.652     0.688
EnetL   0.278  173.0  133.0  27.0  129.0        0.831        0.573     0.662
LassoA  0.367  209.0  113.0  47.0   93.0        0.706        0.692     0.697
RidgeA  0.346  197.0  121.0  39.0  105.0        0.756        0.652     0.688
EnetA   0.364  205.0  115.0  45.0   97.0        0.719        0.679     0.693

Feature engineering

Interactions

formuleI = "1 + (" + "+".join(nomsvar) + ")**2"
PROB = pd.DataFrame({"Y":Y,"log":0.0,"BIC":0.0,"AIC":0.0,
                    "ridge":0.0,"lasso":0.0,"elast":0.0})


Xinter = dmatrix(formuleI, don, return_type="dataframe").iloc[:,1:].to_numpy()
Xinter.shape
(462, 45)
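In patsy notation, (x1+...+x9)**2 expands to the nine main effects plus the 36 pairwise interactions, hence the 45 columns. A toy illustration, with made-up names x1, x2, x3:

# toy example of the **2 expansion (illustrative only)
toy = pd.DataFrame({"x1": [1., 2.], "x2": [3., 4.], "x3": [5., 6.]})
dmatrix("(x1+x2+x3)**2", toy, return_type="dataframe").columns.tolist()
# expected: ['Intercept', 'x1', 'x2', 'x3', 'x1:x2', 'x1:x3', 'x2:x3']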

nb = 10
skf = StratifiedKFold(n_splits=nb, shuffle=True, random_state=1234)
for app_index, val_index in skf.split(Xinter, Y):
    Xapp = Xinter[app_index,:]
    Xtest = Xinter[val_index,:]
    Yapp = Y[app_index]
    ### log
    log = LogisticRegression(penalty=None,solver="newton-cholesky").fit(Xapp,Yapp)
    PROB.loc[val_index,"log"] = log.predict_proba(Xtest)[:,1]
    ### bic
    choixbic = lss.LogisticRegressionSelectionFeatureIC(start=[], \
        direction="forward",crit="bic").fit(Xapp,Yapp)
    PROB.loc[val_index, "BIC"] = choixbic.predict_proba(Xtest)[:,1]
    ### aic
    choixaic = lss.LogisticRegressionSelectionFeatureIC(start=[], \
        direction="forward",crit="aic").fit(Xapp,Yapp)
    PROB.loc[val_index, "AIC"] = choixaic.predict_proba(Xtest)[:,1]
    ### lasso
    cr = StandardScaler()
    Cs_lasso = grille(Xapp,Yapp, "lasso")
    lassocv =  LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10,\
                 Cs=Cs_lasso,  solver="saga", max_iter=2000)
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_lassocv.fit(Xapp,Yapp)
    PROB.loc[val_index,"lasso"] = pipe_lassocv.predict_proba(Xtest)[:,1]
    ### elastic net
    cr = StandardScaler()
    Cs_enet = grille(Xapp,Yapp,"enet")
    enetcv=LogisticRegressionCV(cv=10,penalty="elasticnet",n_jobs=10,\
          l1_ratios=[0.5],Cs=Cs_enet,solver="saga",max_iter=2000)
    pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])
    pipe_enetcv.fit(Xapp,Yapp)
    PROB.loc[val_index,"elast"] = pipe_enetcv.predict_proba(Xtest)[:,1] 
    ### ridge
    cr = StandardScaler()
    Cs_ridge = grille(Xapp,Yapp,"ridge")
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", \
            Cs=Cs_ridge,  max_iter=1000)
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    pipe_ridgecv.fit(Xapp,Yapp)
    PROB.loc[val_index,"ridge"] = pipe_ridgecv.predict_proba(Xtest)[:,1]
mc = pd.Series(0.0, index=PROB.columns[1:])
s = 0.5
for i in range(mc.shape[0]):
    mc.iloc[i] = sklm.zero_one_loss(PROB.Y, PROB.iloc[:,i+1]>s)

round(mc.sort_values(ascending=True),3)
AIC      0.245
BIC      0.251
ridge    0.260
elast    0.264
lasso    0.268
log      0.288
dtype: float64
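Adding the interactions pays off: forward selection by AIC reaches 0.245, the best error rate so far, while the unpenalized model (log) is now the worst, as expected with 45 correlated columns.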

Polynomials

# squares and cubes of the quantitative variables, appended to the base design
Xquanti = don.drop(columns="Y").\
                    select_dtypes(include=[np.number]).to_numpy()
Xcar = Xquanti**2
Xcub = Xquanti**3
formule = "~" + "+".join(nomsvar)
X = dmatrix(formule, don, return_type="dataframe").\
                                            iloc[:,1:].to_numpy()
Xpol = np.concatenate((X, Xcar, Xcub), axis=1)
Xpol.shape
(462, 25)
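The same pure-power columns could also be built inside a patsy formula with I() terms; a sketch for a single variable, assuming the quantitative column age:

# equivalent patsy construction for one variable (sketch)
dmatrix("age + I(age**2) + I(age**3) - 1", don, return_type="dataframe").head()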

nb = 10
skf = StratifiedKFold(n_splits=nb, shuffle=True, random_state=1234)
for app_index, val_index in skf.split(Xpol, Y):
    Xapp = Xpol[app_index,:]
    Xtest = Xpol[val_index,:]
    Yapp = Y[app_index]
    ### log
    log = LogisticRegression(penalty=None,solver="newton-cholesky").fit(Xapp,Yapp)
    PROB.loc[val_index,"log"] = log.predict_proba(Xtest)[:,1]
    ### bic
    choixbic = lss.LogisticRegressionSelectionFeatureIC(start=[], \
        direction="forward",crit="bic").fit(Xapp,Yapp)
    PROB.loc[val_index, "BIC"] = choixbic.predict_proba(Xtest)[:,1]
    ### aic
    choixaic = lss.LogisticRegressionSelectionFeatureIC(start=[], \
        direction="forward",crit="aic").fit(Xapp,Yapp)
    PROB.loc[val_index, "AIC"] = choixaic.predict_proba(Xtest)[:,1]
    ### lasso
    cr = StandardScaler()
    Cs_lasso = grille(Xapp,Yapp, "lasso")
    lassocv =  LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10,\
                 Cs=Cs_lasso,  solver="saga", max_iter=2000)
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_lassocv.fit(Xapp,Yapp)
    PROB.loc[val_index,"lasso"] = pipe_lassocv.predict_proba(Xtest)[:,1]
    ### elastic net
    cr = StandardScaler()
    Cs_enet = grille(Xapp,Yapp,"enet")
    enetcv=LogisticRegressionCV(cv=10,penalty="elasticnet",n_jobs=10,\
          l1_ratios=[0.5],Cs=Cs_enet,solver="saga",max_iter=2000)
    pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])
    pipe_enetcv.fit(Xapp,Yapp)
    PROB.loc[val_index,"elast"] = pipe_enetcv.predict_proba(Xtest)[:,1] 
    ### ridge
    cr = StandardScaler()
    Cs_ridge = grille(Xapp,Yapp,"ridge")
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", \
            Cs=Cs_ridge,  max_iter=1000)
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    pipe_ridgecv.fit(Xapp,Yapp)
    PROB.loc[val_index,"ridge"] = pipe_ridgecv.predict_proba(Xtest)[:,1]
mc = pd.Series(0.0, index=PROB.columns[1:])
s = 0.5
for i in range(mc.shape[0]):
    mc.iloc[i] = sklm.zero_one_loss(PROB.Y, PROB.iloc[:,i+1]>s)

round(mc.sort_values(ascending=True),3)
BIC      0.266
lasso    0.266
ridge    0.271
elast    0.277
AIC      0.279
log      0.286
dtype: float64

Splines

nomsquanti = don.columns[np.isin(don.dtypes, ["float64", \
                          "int64"])].difference(["Y"])
nomsquali = don.columns[np.isin(don.dtypes, ["object", \
                          "category"])].difference(["Y"])
# dummies for the qualitative variables
formule = "~" + "+".join(nomsquali)
Xspline = dmatrix(formule, don, return_type="dataframe").\
                                           iloc[:,1:].to_numpy()
# cubic B-spline basis for each quantitative variable, interior knots
# at the quartiles (patsy resolves the name xi in the local scope)
for i in nomsquanti:
    xi = don.loc[:,i].quantile([0.25, 0.5, 0.75])
    formule = "-1 + bs(" + i + ", knots=xi, degree=3)"
    BX = dmatrix(formule, don, return_type="dataframe").to_numpy()
    Xspline = np.concatenate((Xspline, BX), axis=1)

Xspline.shape
(462, 49)
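Each bs() term contributes degree + number of knots = 3 + 3 = 6 columns, hence 1 qualitative dummy + 8 x 6 = 49. A standalone check on a toy grid:

# B-spline basis dimension check (illustrative only)
toy = pd.DataFrame({"x": np.linspace(0., 1., 50)})
dmatrix("-1 + bs(x, knots=(0.25, 0.5, 0.75), degree=3)", toy).shape
# expected: (50, 6)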

nb = 10
skf = StratifiedKFold(n_splits=nb, shuffle=True, random_state=1234)
for app_index, val_index in skf.split(Xspline, Y):
    Xapp = Xspline[app_index,:]
    Xtest = Xspline[val_index,:]
    Yapp = Y[app_index]
    ### log
    log = LogisticRegression(penalty=None,solver="newton-cholesky").fit(Xapp,Yapp)
    PROB.loc[val_index,"log"] = log.predict_proba(Xtest)[:,1]
    ### bic
    choixbic = lss.LogisticRegressionSelectionFeatureIC(start=[], \
        direction="forward",crit="bic").fit(Xapp,Yapp)
    PROB.loc[val_index, "BIC"] = choixbic.predict_proba(Xtest)[:,1]
    ### aic
    choixaic = lss.LogisticRegressionSelectionFeatureIC(start=[], \
        direction="forward",crit="aic").fit(Xapp,Yapp)
    PROB.loc[val_index, "AIC"] = choixaic.predict_proba(Xtest)[:,1]
    ### lasso
    cr = StandardScaler()
    Cs_lasso = grille(Xapp,Yapp, "lasso")
    lassocv =  LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10,\
                 Cs=Cs_lasso,  solver="saga", max_iter=2000)
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_lassocv.fit(Xapp,Yapp)
    PROB.loc[val_index,"lasso"] = pipe_lassocv.predict_proba(Xtest)[:,1]
    ### elastic net
    cr = StandardScaler()
    Cs_enet = grille(Xapp,Yapp,"enet")
    enetcv=LogisticRegressionCV(cv=10,penalty="elasticnet",n_jobs=10,\
          l1_ratios=[0.5],Cs=Cs_enet,solver="saga",max_iter=2000)
    pipe_enetcv = Pipeline(steps=[("cr", cr), ("enetcv", enetcv)])
    pipe_enetcv.fit(Xapp,Yapp)
    PROB.loc[val_index,"elast"] = pipe_enetcv.predict_proba(Xtest)[:,1] 
    ### ridge
    cr = StandardScaler()
    Cs_ridge = grille(Xapp,Yapp,"ridge")
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", \
            Cs=Cs_ridge,  max_iter=1000)
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    pipe_ridgecv.fit(Xapp,Yapp)
    PROB.loc[val_index,"ridge"] = pipe_ridgecv.predict_proba(Xtest)[:,1]
mc = pd.Series(0.0, index=PROB.columns[1:])
s = 0.5
for i in range(mc.shape[0]):
    mc.iloc[i] = sklm.zero_one_loss(PROB.Y, PROB.iloc[:,i+1]>s)

round(mc.sort_values(ascending=True),3)
ridge    0.279
AIC      0.286
elast    0.290
log      0.292
lasso    0.297
BIC      0.305
dtype: float64
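The spline expansion, by contrast, helps no method here: the best error rate (ridge, 0.279) remains well above the 0.245 obtained with forward AIC selection on the interaction design.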