import pandas as pd; import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
import sys
sys.path.append('../modules') # Ajoute le répertoire 'modules' au chemin de recherche des modules
import choixolsstats

8 Choix de variables
Introduction
Notations
Choix incorrects des variables
Critères classiques de choix de modèles
Procédure de sélection
Exemple : la concentration en ozone
# Load the ozone data set: semicolon-separated, column names on the first row.
ozone = pd.read_csv("../donnees/ozone.txt", header=0, sep=";")
# Store the "vent" column as a pandas categorical.
ozone["vent"] = ozone["vent"].astype("category")

# Formula of the largest candidate model; handed to bestols as `upper`.
upper = "O3 ~ T12 + T15 + Ne12 + N12 + S12 + E12 + W12 + Vx + O3v"
# Handed to bestols as `mustbe`; presumably the terms kept in every candidate
# model (here only the intercept "1") -- TODO confirm against choixolsstats.
mustbe = "1"
# Reference fit using every regressor of `upper`; its residual mean square
# and number of observations are reused later to compute Mallows' Cp.
modcomplet = smf.ols(upper + "+" + mustbe, data=ozone).fit()
# NOTE(review): bestols comes from the project module choixolsstats. The code
# that follows treats its result as a DataFrame with (at least) the columns
# SSR, nb_var, AIC, BIC, R2 and R2adj, one row per candidate model -- confirm
# against the module.
modsel = choixolsstats.bestols(ozone, upper=upper, mustbe=mustbe)
modsel['Cp'] = modsel['SSR']/modcomplet.mse_resid - modcomplet.nobs + 2 * modsel['nb_var']

modsel.sort_values(by=["BIC","nb_var"]).iloc[:4,[1,3]]

| | var_added | BIC |
|---|---|---|
| 366 | (T15, O3v, Ne12, Vx) | 411.974567 |
| 381 | (T12, O3v, Ne12, Vx) | 412.085048 |
| 443 | (T15, O3v, Ne12) | 412.146450 |
| 499 | (O3v, Ne12) | 412.310100 |
modsel.sort_values(by=["AIC","nb_var"]).iloc[:4,[1,2]]

| | var_added | AIC |
|---|---|---|
| 366 | (T15, O3v, Ne12, Vx) | 402.414452 |
| 381 | (T12, O3v, Ne12, Vx) | 402.524933 |
| 219 | (S12, T15, O3v, Ne12, Vx) | 403.238253 |
| 234 | (S12, T12, O3v, Ne12, Vx) | 403.563498 |
modsel.sort_values(by=["Cp","nb_var"]).iloc[:5,[1,7]]

| | var_added | Cp |
|---|---|---|
| 366 | (T15, O3v, Ne12, Vx) | 2.947034 |
| 381 | (T12, O3v, Ne12, Vx) | 3.042036 |
| 219 | (S12, T15, O3v, Ne12, Vx) | 3.948540 |
| 234 | (S12, T12, O3v, Ne12, Vx) | 4.222300 |
| 354 | (T15, E12, O3v, Ne12) | 4.264315 |
modsel.sort_values(by=["R2adj","nb_var"],ascending=False). \
iloc[:4,[1,6]]

| | var_added | R2adj |
|---|---|---|
| 115 | (S12, T15, W12, O3v, Ne12, Vx) | 0.709059 |
| 219 | (S12, T15, O3v, Ne12, Vx) | 0.708613 |
| 366 | (T15, O3v, Ne12, Vx) | 0.708307 |
| 23 | (N12, S12, T15, W12, O3v, Ne12, Vx) | 0.707954 |
modsel.sort_values(by=["R2","nb_var"], ascending=False). \
iloc[:4,[1,5]]

| | var_added | R2 |
|---|---|---|
| 0 | (N12, S12, T15, E12, W12, T12, O3v, Ne12, Vx) | 0.750501 |
| 4 | (N12, S12, T15, E12, W12, O3v, Ne12, Vx) | 0.750189 |
| 6 | (N12, S12, T15, W12, T12, O3v, Ne12, Vx) | 0.750040 |
| 23 | (N12, S12, T15, W12, O3v, Ne12, Vx) | 0.749675 |
# Keep the top-ranked row of the selection table: ascending BIC, ties broken
# by the smaller nb_var. Columns 1 and 3 are the ones shown as var_added and
# BIC in the BIC ranking table.
selec = modsel.sort_values(by=["BIC", "nb_var"]).head(1).iloc[:, [1, 3]]
# var_added holds the names of the selected regressors; join them into the
# right-hand side of the formula, after the intercept.
formule = f"O3 ~ 1 +{'+'.join(selec.iloc[0, 0])}"
# From here on `modsel` names the refitted OLS model, no longer the
# selection table.
modsel = smf.ols(formula=formule, data=ozone).fit()