8 Choix de variables

import pandas as pd; import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm

import sys
sys.path.append('../modules')  # Ajoute le répertoire 'modules' au chemin de recherche des modules
import choixolsstats

Introduction

Notations

Choix incorrects des variables

Critères classiques de choix de modèles

Procédure de sélection

Exemple : la concentration en ozone

# Load the ozone data set: semicolon-separated, column names on the first row.
ozone = pd.read_csv("../donnees/ozone.txt", header=0, sep=";")
# Store the "vent" column as a pandas categorical.
ozone["vent"] = ozone["vent"].astype("category")

# `upper` is the largest candidate formula; `mustbe` is the term kept in every
# model (here "1", the intercept).
upper = "O3 ~ T12 + T15 + Ne12 + N12 + S12 + E12 + W12 + Vx + O3v"
mustbe = "1"

# Reference fit with all candidate regressors; its residual mean square is the
# variance estimate plugged into Mallows' Cp below.
modcomplet = smf.ols(upper + "+" + mustbe, data=ozone).fit()

# Table of candidate sub-models built by the project helper.
# NOTE(review): presumably one row per subset of the regressors in `upper`;
# confirm against choixolsstats.bestols.
modsel = choixolsstats.bestols(ozone, upper=upper, mustbe=mustbe)

# Mallows' Cp = SSR / sigma2_hat - n + 2 * p, with sigma2_hat taken from the
# complete model.
# NOTE(review): assumes `nb_var` is the parameter count p expected by the Cp
# formula (i.e. whether the intercept is counted) -- confirm in choixolsstats.
modsel['Cp'] = modsel['SSR']/modcomplet.mse_resid - modcomplet.nobs + 2 *  modsel['nb_var']

# Four best models by BIC (ties broken by the smaller number of variables).
# Columns are selected by label rather than by position so the display does
# not silently change if the helper reorders or adds columns.
modsel.sort_values(by=["BIC", "nb_var"]).head(4)[["var_added", "BIC"]]
var_added BIC
366 (T15, O3v, Ne12, Vx) 411.974567
381 (T12, O3v, Ne12, Vx) 412.085048
443 (T15, O3v, Ne12) 412.146450
499 (O3v, Ne12) 412.310100
# Four best models by AIC (ties broken by the smaller number of variables).
# Columns are selected by label instead of by position.
modsel.sort_values(by=["AIC", "nb_var"]).head(4)[["var_added", "AIC"]]
var_added AIC
366 (T15, O3v, Ne12, Vx) 402.414452
381 (T12, O3v, Ne12, Vx) 402.524933
219 (S12, T15, O3v, Ne12, Vx) 403.238253
234 (S12, T12, O3v, Ne12, Vx) 403.563498
# Five best models by Mallows' Cp (ties broken by the smaller number of
# variables). Selecting "Cp" by label avoids depending on it being the
# last column appended to the table.
modsel.sort_values(by=["Cp", "nb_var"]).head(5)[["var_added", "Cp"]]
var_added Cp
366 (T15, O3v, Ne12, Vx) 2.947034
381 (T12, O3v, Ne12, Vx) 3.042036
219 (S12, T15, O3v, Ne12, Vx) 3.948540
234 (S12, T12, O3v, Ne12, Vx) 4.222300
354 (T15, E12, O3v, Ne12) 4.264315
# Four best models by adjusted R2, largest first. `ascending=False` applies to
# both sort keys, so ties on R2adj list the model with more variables first.
(
    modsel.sort_values(by=["R2adj", "nb_var"], ascending=False)
    .head(4)[["var_added", "R2adj"]]
)
var_added R2adj
115 (S12, T15, W12, O3v, Ne12, Vx) 0.709059
219 (S12, T15, O3v, Ne12, Vx) 0.708613
366 (T15, O3v, Ne12, Vx) 0.708307
23 (N12, S12, T15, W12, O3v, Ne12, Vx) 0.707954
# Four best models by plain R2, largest first. `ascending=False` applies to
# both sort keys, so ties on R2 list the model with more variables first.
(
    modsel.sort_values(by=["R2", "nb_var"], ascending=False)
    .head(4)[["var_added", "R2"]]
)
var_added R2
0 (N12, S12, T15, E12, W12, T12, O3v, Ne12, Vx) 0.750501
4 (N12, S12, T15, E12, W12, O3v, Ne12, Vx) 0.750189
6 (N12, S12, T15, W12, T12, O3v, Ne12, Vx) 0.750040
23 (N12, S12, T15, W12, O3v, Ne12, Vx) 0.749675
# Keep the single best model according to BIC (ties broken by the smaller
# number of variables), as a one-row table with its variables and its BIC.
selec = modsel.sort_values(by=["BIC", "nb_var"]).head(1)[["var_added", "BIC"]]
# Build the regression formula from the selected variable names.
# NOTE(review): assumes each `var_added` entry is a sequence of column-name
# strings (the displayed output shows tuples) -- confirm in choixolsstats.
formule = "O3 ~ 1 +" + "+".join(selec["var_added"].iloc[0])
# NOTE(review): this rebinds `modsel` from the table of candidate models to
# the fitted final model, so the ranking statements above cannot be re-run
# after this line. The name is kept because later code may rely on it.
modsel = smf.ols(formule, data=ozone).fit()
Retour au sommet