import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import statsmodels.regression.linear_model as smlm
from patsy import dmatrix
from scipy.stats import norm, chi2
import sys
'../modules')
sys.path.append(import choixglmstats
12 Régression logistique
Présentation du modèle
= pd.read_csv('../donnees/artere.txt', header=0, index_col=0, sep=' ') artere
= plt.figure()
fig 'o')
plt.plot(artere.age, artere.chd, 'chd')
plt.ylabel('age')
plt.xlabel( fig.tight_layout()
= pd.crosstab(artere.agrp, artere.chd)
artere_summary = ['chd0', 'chd1']
artere_summary.columns 'Effectifs'] = artere_summary.chd0 + artere_summary.chd1
artere_summary['Frequence'] = artere_summary.chd1 / artere_summary.Effectifs
artere_summary['age_min'] = [artere[artere.agrp == agrp].age.min() - 1 for agrp in artere_summary.index]
artere_summary['age_min'] = artere.groupby(["agrp"]).age.min() +1
artere_summary['age_max'] = [artere[artere.agrp == agrp].age.max() for agrp in artere_summary.index]
artere_summary['age_max'] = artere.groupby(["agrp"]).age.max()
artere_summary['age'] = [f']{artere_summary.age_min[i]};{artere_summary.age_max[i]}]' for i in artere_summary.index]
artere_summary['age'] = "]" + artere_summary.age_min.astype(str) + ";" + artere_summary.age_max.astype(str) + "]"
artere_summary['age', 'Effectifs', 'chd0', 'chd1', 'Frequence']].to_string(index=False)
artere_summary.reset_index()[[
= plt.figure()
fig 'o')
plt.plot(artere.age, artere.chd, 'chd')
plt.ylabel('age')
plt.xlabel('k')
plt.hlines(artere_summary.Frequence,artere_summary.age_min, artere_summary.age_max, fig.tight_layout()
= smf.glm('chd~age', data=artere, family=sm.families.Binomial()).fit()
modele print(modele.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: chd No. Observations: 100
Model: GLM Df Residuals: 98
Model Family: Binomial Df Model: 1
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -53.677
Date: Tue, 04 Feb 2025 Deviance: 107.35
Time: 18:32:49 Pearson chi2: 102.
No. Iterations: 4 Pseudo R-squ. (CS): 0.2541
Covariance Type: nonrobust
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept -5.3095 1.134 -4.683 0.000 -7.531 -3.088
age 0.1109 0.024 4.610 0.000 0.064 0.158
==============================================================================
'o')
plt.plot(artere.age, artere.chd, 'chd')
plt.ylabel('age')
plt.xlabel('k')
plt.hlines(artere_summary.Frequence, artere_summary.age_min, artere_summary.age_max, = np.arange(artere_summary.age_min.min(), artere_summary.age_max.max(), step=0.01)
x = np.exp(modele.params.Intercept + modele.params.age * x) / (1.0 + np.exp(modele.params.Intercept + modele.params.age * x))
y 'b--')
plt.plot(x, y, fig.tight_layout()
= np.random.choice(['A', 'B', 'C'], 100)
X = np.zeros(X.shape, dtype=int)
Y =='A'] = np.random.binomial(1, 0.9, (X=='A').sum())
Y[X=='B'] = np.random.binomial(1, 0.1, (X=='B').sum())
Y[X=='C'] = np.random.binomial(1, 0.9, (X=='C').sum())
Y[X= pd.DataFrame({'X': X, 'Y': Y}) don
= smf.glm("Y~X", data=don, family=sm.families.Binomial()).fit()
mod print(mod.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Y No. Observations: 100
Model: GLM Df Residuals: 97
Model Family: Binomial Df Model: 2
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -13.869
Date: Tue, 04 Feb 2025 Deviance: 27.738
Time: 18:32:49 Pearson chi2: 32.0
No. Iterations: 23 Pseudo R-squ. (CS): 0.6689
Covariance Type: nonrobust
==============================================================================
coef std err z P>|z| [0.025 0.975]
------------------------------------------------------------------------------
Intercept 24.5661 2.57e+04 0.001 0.999 -5.03e+04 5.04e+04
X[T.B] -49.1321 3.27e+04 -0.002 0.999 -6.41e+04 6.4e+04
X[T.C] -22.8797 2.57e+04 -0.001 0.999 -5.04e+04 5.03e+04
==============================================================================
= smf.glm("Y~C(X, Sum)", data=don, family=sm.families.Binomial()).fit()
mod1 print(mod1.summary())
Generalized Linear Model Regression Results
==============================================================================
Dep. Variable: Y No. Observations: 100
Model: GLM Df Residuals: 97
Model Family: Binomial Df Model: 2
Link Function: Logit Scale: 1.0000
Method: IRLS Log-Likelihood: -13.869
Date: Tue, 04 Feb 2025 Deviance: 27.738
Time: 18:32:49 Pearson chi2: 32.0
No. Iterations: 23 Pseudo R-squ. (CS): 0.6689
Covariance Type: nonrobust
==================================================================================
coef std err z P>|z| [0.025 0.975]
----------------------------------------------------------------------------------
Intercept 0.5621 1.09e+04 5.16e-05 1.000 -2.14e+04 2.14e+04
C(X, Sum)[S.A] 24.0039 1.84e+04 0.001 0.999 -3.61e+04 3.61e+04
C(X, Sum)[S.B] -25.1282 1.6e+04 -0.002 0.999 -3.13e+04 3.13e+04
==================================================================================
Estimation
= pd.read_csv("../donnees/SAh.csv", header=0, sep=",")
SAh = SAh.iloc[[1,407,34],]
newSAh = newSAh.reset_index().drop("index",axis=1)
newSAh = SAh.drop([1,407,34]).reset_index().drop("index",axis=1) SAh
="chd ~ " + "+".join(SAh.columns[ :-1 ])
form = smf.glm(form, data=SAh, family=sm.families.Binomial()).fit()
mod mod.summary()
Dep. Variable: | chd | No. Observations: | 459 |
Model: | GLM | Df Residuals: | 449 |
Model Family: | Binomial | Df Model: | 9 |
Link Function: | Logit | Scale: | 1.0000 |
Method: | IRLS | Log-Likelihood: | -234.36 |
Date: | Tue, 04 Feb 2025 | Deviance: | 468.72 |
Time: | 18:32:49 | Pearson chi2: | 449. |
No. Iterations: | 5 | Pseudo R-squ. (CS): | 0.2339 |
Covariance Type: | nonrobust |
coef | std err | z | P>|z| | [0.025 | 0.975] | |
Intercept | -6.0837 | 1.314 | -4.629 | 0.000 | -8.659 | -3.508 |
famhist[T.Present] | 0.9325 | 0.229 | 4.069 | 0.000 | 0.483 | 1.382 |
sbp | 0.0065 | 0.006 | 1.127 | 0.260 | -0.005 | 0.018 |
tobacco | 0.0814 | 0.027 | 3.023 | 0.003 | 0.029 | 0.134 |
ldl | 0.1794 | 0.060 | 2.989 | 0.003 | 0.062 | 0.297 |
adiposity | 0.0184 | 0.030 | 0.622 | 0.534 | -0.039 | 0.076 |
typea | 0.0392 | 0.012 | 3.184 | 0.001 | 0.015 | 0.063 |
obesity | -0.0637 | 0.045 | -1.430 | 0.153 | -0.151 | 0.024 |
alcohol | 0.0002 | 0.004 | 0.035 | 0.972 | -0.009 | 0.009 |
age | 0.0439 | 0.012 | 3.592 | 0.000 | 0.020 | 0.068 |
print(mod.conf_int(alpha=0.05))
0 1
Intercept -8.659355 -3.507984
famhist[T.Present] 0.483354 1.381573
sbp -0.004798 0.017773
tobacco 0.028628 0.134174
ldl 0.061771 0.297043
adiposity -0.039461 0.076187
typea 0.015090 0.063396
obesity -0.151055 0.023612
alcohol -0.008640 0.008950
age 0.019931 0.067793
= pd.read_csv("../donnees/logit_donnees.csv", sep=",", header=0)
don = smf.logit("Y ~ X1 + X2 + X3", data=don).fit() modsim
Optimization terminated successfully.
Current function value: 0.366463
Iterations 7
modsim.wald_test_terms()
<class 'statsmodels.stats.contrast.WaldTestResults'>
chi2 P>chi2 df constraint
Intercept [[28.46981615119592]] 9.517067345514757e-08 1
X1 [[212.50601519640435]] 7.159869628652175e-47 2
X2 [[210.39004424749865]] 1.129196632385863e-47 1
X3 [[0.3095790886927727]] 0.5779385882309931 1
= smf.logit("Y~X2+X3",data=don).fit()
modsim01 = smf.logit("Y~X1+X3",data=don).fit()
modsim02 = smf.logit("Y~X1+X2",data=don).fit() modsim03
Optimization terminated successfully.
Current function value: 0.554834
Iterations 6
Optimization terminated successfully.
Current function value: 0.575293
Iterations 5
Optimization terminated successfully.
Current function value: 0.366618
Iterations 7
import statsmodels.regression.linear_model as smlm
smlm.RegressionResults.compare_lr_test(modsim,modsim01)
(376.7417147005359, 1.554447652669738e-82, 2.0)
smlm.RegressionResults.compare_lr_test(modsim,modsim02)
(417.66072160440774, 7.881723945480014e-93, 1.0)
smlm.RegressionResults.compare_lr_test(modsim,modsim03)
(0.3097619772612461, 0.5778262823265808, 1.0)
print(newSAh)
sbp tobacco ldl adiposity famhist typea obesity alcohol age chd
0 144 0.01 4.41 28.61 Absent 55 28.87 2.06 63 1
1 200 19.20 4.43 40.60 Present 55 32.04 36.00 60 1
2 148 5.50 7.10 25.31 Absent 56 29.84 3.60 48 0
print(mod.predict(newSAh))
0 0.320889
1 0.881177
2 0.369329
dtype: float64
= mod.cov_params().values
varbetac = mod.params.values
betac = mod.model.formula.split("~")[1]
ff = dmatrix("~"+ff, data=newSAh, return_type="dataframe").to_numpy()
xetoile = np.dot(xetoile,betac)
prev_fit = np.diag(np.dot(np.dot(xetoile,varbetac), np.transpose(xetoile)))**0.5
prev_se = prev_fit-norm.ppf(0.975)*prev_se
cl_inf = prev_fit+norm.ppf(0.975)*prev_se
cl_sup = np.exp(cl_inf)/(1+np.exp(cl_inf))
binf = np.exp(cl_sup)/(1+np.exp(cl_sup))
bsup print(pd.DataFrame({"binf": binf, "bsup": bsup}))
binf bsup
0 0.199717 0.472200
1 0.713881 0.956601
2 0.246181 0.512220
= artere.groupby(["age"])
g = pd.concat([g["chd"].mean(), g["chd"].count()], axis=1)
dfsat = ["p", "n"]
dfsat.columns print(dfsat.iloc[0:5])
p n
age
20 0.0 1
23 0.0 1
24 0.0 1
25 0.5 2
26 0.0 2
'o', dfsat.index, dfsat.p, "-")
plt.plot(artere.age, artere.chd, 'chd')
plt.ylabel('age')
plt.xlabel( fig.tight_layout()
=10
K= pd.DataFrame({"ajust": mod.predict()}, index=SAh.index)
ajust "Y"] = SAh["chd"]
ajust['decile'] = pd.qcut(ajust["ajust"], K)
ajust[= ajust['Y'].groupby(ajust.decile).sum()
ok = ajust["ajust"].groupby(ajust.decile).mean()
muk = ajust['Y'].groupby(ajust.decile).count()
mk = ((ok - mk*muk)**2/(mk*muk*(1-muk))).sum() C2
print('chi-square: {:.3f}'.format(C2))
chi-square: 6.659
=1-chi2.cdf(C2, K-2)
pvalueprint('p-value: {:.3f}'.format(pvalue))
p-value: 0.574
="chd ~ " + "+".join(SAh.columns[ :-1 ])
form = smf.glm(form, data=SAh, family=sm.families.Binomial()).fit() mod
= mod.resid_deviance/np.sqrt(1-mod.get_hat_matrix_diag())
resdev = mod.resid_pearson/np.sqrt(1-mod.get_hat_matrix_diag()) respea
= plt.figure()
fig 'o')
plt.plot(resdev, 'residus deviance')
plt.ylabel( fig.tight_layout()
= plt.figure()
fig 'o')
plt.plot(respea, 'residus Pearson')
plt.ylabel( fig.tight_layout()
Choix de variables
= smf.glm("chd~sbp+ldl", data=SAh, family=sm.families.Binomial()).fit()
mod0 = smf.glm("chd~sbp+ldl+famhist+alcohol", data=SAh, family=sm.families.Binomial()).fit()
mod1 def lr_test(restr, full):
from scipy import stats
= (restr.df_resid - full.df_resid)
lr_df = -2*(restr.llf - full.llf)
lr_stat = stats.chi2.sf(lr_stat, df=lr_df)
lr_pvalue return {"lr": lr_stat, "pvalue": lr_pvalue, "df": lr_df}
lr_test(mod0, mod1)
{'lr': 25.5447172394405, 'pvalue': 2.838148600168801e-06, 'df': 2}
= choixglmstats.bestglm(SAh, upper=form) mod_sel
print(mod_sel.sort_values(by=["BIC","nb_var"]).iloc[:5,[1,3]])
var_added BIC
176 (tobacco, famhist, ldl, typea, age) 509.100392
287 (tobacco, famhist, typea, age) 512.495444
284 (tobacco, famhist, ldl, age) 512.537897
91 (tobacco, famhist, ldl, typea, obesity, age) 513.424654
358 (famhist, ldl, typea, age) 513.471151
print(mod_sel.sort_values(by=["AIC","nb_var"]).iloc[:5,[1,2]])
var_added AIC
176 (tobacco, famhist, ldl, typea, age) 484.326091
91 (tobacco, famhist, ldl, typea, obesity, age) 484.521302
36 (tobacco, famhist, ldl, typea, obesity, age, sbp) 485.117363
93 (tobacco, famhist, ldl, typea, age, sbp) 485.311962
31 (tobacco, famhist, adiposity, ldl, typea, obes... 486.031360