import pandas as pd; import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
import sys
# Add the 'modules' directory to the module search path so that the
# project-local module imported below can be found.
sys.path.append('../modules')
import choixolsstats
8 Choix de variables
Introduction
Notations
Choix incorrects des variables
Critères classiques de choix de modèles
Procédure de sélection
Exemple : la concentration en ozone
# Load the ozone data set: semicolon-separated file whose first row holds
# the column names.
ozone = pd.read_csv("../donnees/ozone.txt", header=0, sep=";")
# Store the "vent" column as a pandas categorical.
ozone["vent"] = ozone["vent"].astype("category")
# Formula of the largest model considered: O3 explained by every candidate
# regressor.
upper = "O3 ~ T12 + T15 + Ne12 + N12 + S12 + E12 + W12 + Vx + O3v"
# Term passed as `mustbe` to the selection helper; "1" is the intercept in
# formula notation (presumably forced into every candidate model -- confirm
# against choixolsstats.bestols).
mustbe = "1"
# Fit the complete model; its residual mean square (mse_resid) and number of
# observations (nobs) are needed for the Cp computation below.
modcomplet = smf.ols(upper + "+" + mustbe, data=ozone).fit()
# Table of candidate models returned by the project helper. The code below
# relies on it exposing at least the columns 'SSR', 'nb_var', 'BIC', 'AIC',
# 'R2' and 'R2adj'.
modsel = choixolsstats.bestols(ozone, upper=upper, mustbe=mustbe)
# Mallows' Cp for each candidate: SSR / sigma2_hat - n + 2 * nb_var, where
# sigma2_hat is the residual mean square of the complete model.
# NOTE(review): whether 'nb_var' counts the intercept is not visible here.
modsel['Cp'] = modsel['SSR']/modcomplet.mse_resid - modcomplet.nobs + 2 * modsel['nb_var']
# Four best candidates by increasing BIC (ties broken by number of
# variables); the columns at positions 1 and 3 are var_added and BIC.
modsel.sort_values(by=["BIC","nb_var"]).iloc[:4,[1,3]]
| | var_added | BIC |
---|---|---|
366 | (T15, O3v, Ne12, Vx) | 411.974567 |
381 | (T12, O3v, Ne12, Vx) | 412.085048 |
443 | (T15, O3v, Ne12) | 412.146450 |
499 | (O3v, Ne12) | 412.310100 |
# Four best candidates by increasing AIC (ties broken by number of
# variables); the columns at positions 1 and 2 are var_added and AIC.
modsel.sort_values(by=["AIC","nb_var"]).iloc[:4,[1,2]]
| | var_added | AIC |
---|---|---|
366 | (T15, O3v, Ne12, Vx) | 402.414452 |
381 | (T12, O3v, Ne12, Vx) | 402.524933 |
219 | (S12, T15, O3v, Ne12, Vx) | 403.238253 |
234 | (S12, T12, O3v, Ne12, Vx) | 403.563498 |
# Five best candidates by increasing Mallows' Cp (ties broken by number of
# variables); the columns at positions 1 and 7 are var_added and Cp.
modsel.sort_values(by=["Cp","nb_var"]).iloc[:5,[1,7]]
| | var_added | Cp |
---|---|---|
366 | (T15, O3v, Ne12, Vx) | 2.947034 |
381 | (T12, O3v, Ne12, Vx) | 3.042036 |
219 | (S12, T15, O3v, Ne12, Vx) | 3.948540 |
234 | (S12, T12, O3v, Ne12, Vx) | 4.222300 |
354 | (T15, E12, O3v, Ne12) | 4.264315 |
# Four best candidates by decreasing adjusted R2 (larger is better, hence
# ascending=False, which applies to both sort keys); the columns at
# positions 1 and 6 are var_added and R2adj.
modsel.sort_values(by=["R2adj","nb_var"],ascending=False). \
    iloc[:4,[1,6]]
| | var_added | R2adj |
---|---|---|
115 | (S12, T15, W12, O3v, Ne12, Vx) | 0.709059 |
219 | (S12, T15, O3v, Ne12, Vx) | 0.708613 |
366 | (T15, O3v, Ne12, Vx) | 0.708307 |
23 | (N12, S12, T15, W12, O3v, Ne12, Vx) | 0.707954 |
# Four best candidates by decreasing R2 (ascending=False applies to both
# sort keys); the columns at positions 1 and 5 are var_added and R2.
modsel.sort_values(by=["R2","nb_var"], ascending=False). \
    iloc[:4,[1,5]]
| | var_added | R2 |
---|---|---|
0 | (N12, S12, T15, E12, W12, T12, O3v, Ne12, Vx) | 0.750501 |
4 | (N12, S12, T15, E12, W12, O3v, Ne12, Vx) | 0.750189 |
6 | (N12, S12, T15, W12, T12, O3v, Ne12, Vx) | 0.750040 |
23 | (N12, S12, T15, W12, O3v, Ne12, Vx) | 0.749675 |
# Keep only the best candidate according to BIC (one row; columns at
# positions 1 and 3 are var_added and BIC).
selec = modsel.sort_values(by=["BIC","nb_var"]).iloc[:1,[1,3]]
# Build the regression formula from the selected variable names, which are
# stored as a sequence of strings in the first cell of that row.
formule = "O3 ~ 1 +" + "+".join(selec.iloc[0,0])
# Fit the selected model.
# NOTE(review): this rebinds `modsel`, replacing the table of candidate
# models with the fitted result -- confirm this is intended.
modsel = smf.ols(formule, data=ozone).fit()