12 Régression logistique

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf
import statsmodels.api as sm
import statsmodels.regression.linear_model as smlm
from patsy import dmatrix
from scipy.stats import norm, chi2
import sys
sys.path.append('../modules')
import choixglmstats

Présentation du modèle

artere = pd.read_csv('../donnees/artere.txt', header=0, index_col=0, sep=' ')

fig = plt.figure()
plt.plot(artere.age, artere.chd, 'o')
plt.ylabel('chd')
plt.xlabel('age')
fig.tight_layout()

artere_summary = pd.crosstab(artere.agrp, artere.chd)
artere_summary.columns = ['chd0', 'chd1']
artere_summary['Effectifs'] = artere_summary.chd0 + artere_summary.chd1
artere_summary['Frequence'] = artere_summary.chd1 / artere_summary.Effectifs
artere_summary['age_min'] = [artere[artere.agrp == agrp].age.min() - 1 for agrp in artere_summary.index]
artere_summary['age_min'] = artere.groupby(["agrp"]).age.min() +1
artere_summary['age_max'] = [artere[artere.agrp == agrp].age.max() for agrp in artere_summary.index]
artere_summary['age_max'] = artere.groupby(["agrp"]).age.max()
artere_summary['age'] = [f']{artere_summary.age_min[i]};{artere_summary.age_max[i]}]' for i in artere_summary.index]
artere_summary['age'] = "]" + artere_summary.age_min.astype(str) + ";" + artere_summary.age_max.astype(str) + "]"
artere_summary.reset_index()[['age', 'Effectifs', 'chd0', 'chd1', 'Frequence']].to_string(index=False)


fig = plt.figure()
plt.plot(artere.age, artere.chd, 'o')
plt.ylabel('chd')
plt.xlabel('age')
plt.hlines(artere_summary.Frequence,artere_summary.age_min, artere_summary.age_max, 'k')
fig.tight_layout()

modele = smf.glm('chd~age', data=artere, family=sm.families.Binomial()).fit()
print(modele.summary())

                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                    chd   No. Observations:                  100
Model:                            GLM   Df Residuals:                       98
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -53.677
Date:                Tue, 04 Feb 2025   Deviance:                       107.35
Time:                        18:32:49   Pearson chi2:                     102.
No. Iterations:                     4   Pseudo R-squ. (CS):             0.2541
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -5.3095      1.134     -4.683      0.000      -7.531      -3.088
age            0.1109      0.024      4.610      0.000       0.064       0.158
==============================================================================

plt.plot(artere.age, artere.chd, 'o')
plt.ylabel('chd')
plt.xlabel('age')
plt.hlines(artere_summary.Frequence, artere_summary.age_min, artere_summary.age_max, 'k')
x = np.arange(artere_summary.age_min.min(), artere_summary.age_max.max(), step=0.01)
y = np.exp(modele.params.Intercept + modele.params.age * x) / (1.0 + np.exp(modele.params.Intercept + modele.params.age * x))
plt.plot(x, y, 'b--')
fig.tight_layout()

X = np.random.choice(['A', 'B', 'C'], 100)
Y = np.zeros(X.shape, dtype=int)
Y[X=='A'] = np.random.binomial(1, 0.9, (X=='A').sum())
Y[X=='B'] = np.random.binomial(1, 0.1, (X=='B').sum())
Y[X=='C'] = np.random.binomial(1, 0.9, (X=='C').sum())
don = pd.DataFrame({'X': X, 'Y': Y})

mod = smf.glm("Y~X", data=don, family=sm.families.Binomial()).fit()
print(mod.summary())

                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                      Y   No. Observations:                  100
Model:                            GLM   Df Residuals:                       97
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13.869
Date:                Tue, 04 Feb 2025   Deviance:                       27.738
Time:                        18:32:49   Pearson chi2:                     32.0
No. Iterations:                    23   Pseudo R-squ. (CS):             0.6689
Covariance Type:            nonrobust                                         
==============================================================================
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     24.5661   2.57e+04      0.001      0.999   -5.03e+04    5.04e+04
X[T.B]       -49.1321   3.27e+04     -0.002      0.999   -6.41e+04     6.4e+04
X[T.C]       -22.8797   2.57e+04     -0.001      0.999   -5.04e+04    5.03e+04
==============================================================================

mod1 = smf.glm("Y~C(X, Sum)", data=don, family=sm.families.Binomial()).fit()
print(mod1.summary())

                 Generalized Linear Model Regression Results                  
==============================================================================
Dep. Variable:                      Y   No. Observations:                  100
Model:                            GLM   Df Residuals:                       97
Model Family:                Binomial   Df Model:                            2
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -13.869
Date:                Tue, 04 Feb 2025   Deviance:                       27.738
Time:                        18:32:49   Pearson chi2:                     32.0
No. Iterations:                    23   Pseudo R-squ. (CS):             0.6689
Covariance Type:            nonrobust                                         
==================================================================================
                     coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------
Intercept          0.5621   1.09e+04   5.16e-05      1.000   -2.14e+04    2.14e+04
C(X, Sum)[S.A]    24.0039   1.84e+04      0.001      0.999   -3.61e+04    3.61e+04
C(X, Sum)[S.B]   -25.1282    1.6e+04     -0.002      0.999   -3.13e+04    3.13e+04
==================================================================================

Estimation

SAh = pd.read_csv("../donnees/SAh.csv", header=0, sep=",")
newSAh = SAh.iloc[[1,407,34],]
newSAh = newSAh.reset_index().drop("index",axis=1)
SAh = SAh.drop([1,407,34]).reset_index().drop("index",axis=1)

form ="chd ~ " + "+".join(SAh.columns[ :-1 ])
mod = smf.glm(form, data=SAh, family=sm.families.Binomial()).fit()
mod.summary()

Generalized Linear Model Regression Results
Dep. Variable:	chd	No. Observations:	459
Model:	GLM	Df Residuals:	449
Model Family:	Binomial	Df Model:	9
Link Function:	Logit	Scale:	1.0000
Method:	IRLS	Log-Likelihood:	-234.36
Date:	Tue, 04 Feb 2025	Deviance:	468.72
Time:	18:32:49	Pearson chi2:	449.
No. Iterations:	5	Pseudo R-squ. (CS):	0.2339
Covariance Type:	nonrobust

	coef	std err	z	P>\|z\|	[0.025	0.975]
Intercept	-6.0837	1.314	-4.629	0.000	-8.659	-3.508
famhist[T.Present]	0.9325	0.229	4.069	0.000	0.483	1.382
sbp	0.0065	0.006	1.127	0.260	-0.005	0.018
tobacco	0.0814	0.027	3.023	0.003	0.029	0.134
ldl	0.1794	0.060	2.989	0.003	0.062	0.297
adiposity	0.0184	0.030	0.622	0.534	-0.039	0.076
typea	0.0392	0.012	3.184	0.001	0.015	0.063
obesity	-0.0637	0.045	-1.430	0.153	-0.151	0.024
alcohol	0.0002	0.004	0.035	0.972	-0.009	0.009
age	0.0439	0.012	3.592	0.000	0.020	0.068

print(mod.conf_int(alpha=0.05))

                           0         1
Intercept          -8.659355 -3.507984
famhist[T.Present]  0.483354  1.381573
sbp                -0.004798  0.017773
tobacco             0.028628  0.134174
ldl                 0.061771  0.297043
adiposity          -0.039461  0.076187
typea               0.015090  0.063396
obesity            -0.151055  0.023612
alcohol            -0.008640  0.008950
age                 0.019931  0.067793

don = pd.read_csv("../donnees/logit_donnees.csv", sep=",", header=0)
modsim = smf.logit("Y ~ X1 + X2 + X3", data=don).fit()

Optimization terminated successfully.
         Current function value: 0.366463
         Iterations 7

modsim.wald_test_terms()

<class 'statsmodels.stats.contrast.WaldTestResults'>
                             chi2                 P>chi2  df constraint
Intercept   [[28.46981615119592]]  9.517067345514757e-08              1
X1         [[212.50601519640435]]  7.159869628652175e-47              2
X2         [[210.39004424749865]]  1.129196632385863e-47              1
X3         [[0.3095790886927727]]     0.5779385882309931              1

modsim01 = smf.logit("Y~X2+X3",data=don).fit()
modsim02 = smf.logit("Y~X1+X3",data=don).fit()
modsim03 = smf.logit("Y~X1+X2",data=don).fit()

Optimization terminated successfully.
         Current function value: 0.554834
         Iterations 6
Optimization terminated successfully.
         Current function value: 0.575293
         Iterations 5
Optimization terminated successfully.
         Current function value: 0.366618
         Iterations 7

import statsmodels.regression.linear_model as smlm
smlm.RegressionResults.compare_lr_test(modsim,modsim01)

(376.7417147005359, 1.554447652669738e-82, 2.0)

smlm.RegressionResults.compare_lr_test(modsim,modsim02)

(417.66072160440774, 7.881723945480014e-93, 1.0)

smlm.RegressionResults.compare_lr_test(modsim,modsim03)

(0.3097619772612461, 0.5778262823265808, 1.0)

print(newSAh)

   sbp  tobacco   ldl  adiposity  famhist  typea  obesity  alcohol  age  chd
0  144     0.01  4.41      28.61   Absent     55    28.87     2.06   63    1
1  200    19.20  4.43      40.60  Present     55    32.04    36.00   60    1
2  148     5.50  7.10      25.31   Absent     56    29.84     3.60   48    0

print(mod.predict(newSAh))

0    0.320889
1    0.881177
2    0.369329
dtype: float64

varbetac = mod.cov_params().values
betac = mod.params.values
ff = mod.model.formula.split("~")[1]
xetoile =  dmatrix("~"+ff, data=newSAh, return_type="dataframe").to_numpy()
prev_fit = np.dot(xetoile,betac)
prev_se = np.diag(np.dot(np.dot(xetoile,varbetac), np.transpose(xetoile)))**0.5
cl_inf = prev_fit-norm.ppf(0.975)*prev_se
cl_sup = prev_fit+norm.ppf(0.975)*prev_se
binf = np.exp(cl_inf)/(1+np.exp(cl_inf))
bsup = np.exp(cl_sup)/(1+np.exp(cl_sup))
print(pd.DataFrame({"binf": binf, "bsup": bsup}))

       binf      bsup
0  0.199717  0.472200
1  0.713881  0.956601
2  0.246181  0.512220

g = artere.groupby(["age"])
dfsat = pd.concat([g["chd"].mean(), g["chd"].count()], axis=1)
dfsat.columns = ["p", "n"]
print(dfsat.iloc[0:5])

       p  n
age        
20   0.0  1
23   0.0  1
24   0.0  1
25   0.5  2
26   0.0  2

plt.plot(artere.age, artere.chd, 'o', dfsat.index, dfsat.p, "-")
plt.ylabel('chd')
plt.xlabel('age')
fig.tight_layout()

K=10
ajust = pd.DataFrame({"ajust": mod.predict()}, index=SAh.index)
ajust["Y"] = SAh["chd"]
ajust['decile'] = pd.qcut(ajust["ajust"], K)
ok = ajust['Y'].groupby(ajust.decile).sum()
muk = ajust["ajust"].groupby(ajust.decile).mean()
mk = ajust['Y'].groupby(ajust.decile).count()
C2 = ((ok - mk*muk)**2/(mk*muk*(1-muk))).sum()

print('chi-square: {:.3f}'.format(C2))

chi-square: 6.659

pvalue=1-chi2.cdf(C2, K-2)
print('p-value: {:.3f}'.format(pvalue))

p-value: 0.574

form ="chd ~ " + "+".join(SAh.columns[ :-1 ])
mod = smf.glm(form, data=SAh, family=sm.families.Binomial()).fit()

resdev = mod.resid_deviance/np.sqrt(1-mod.get_hat_matrix_diag())
respea = mod.resid_pearson/np.sqrt(1-mod.get_hat_matrix_diag())

fig = plt.figure()
plt.plot(resdev, 'o')
plt.ylabel('residus deviance')
fig.tight_layout()

fig = plt.figure()
plt.plot(respea, 'o')
plt.ylabel('residus Pearson')
fig.tight_layout()

Choix de variables

mod0 = smf.glm("chd~sbp+ldl", data=SAh, family=sm.families.Binomial()).fit()
mod1 = smf.glm("chd~sbp+ldl+famhist+alcohol", data=SAh, family=sm.families.Binomial()).fit()
def lr_test(restr, full):
    from scipy import stats
    lr_df = (restr.df_resid - full.df_resid)
    lr_stat = -2*(restr.llf - full.llf)
    lr_pvalue = stats.chi2.sf(lr_stat, df=lr_df)
    return {"lr": lr_stat, "pvalue": lr_pvalue, "df": lr_df}

lr_test(mod0, mod1)

{'lr': 25.5447172394405, 'pvalue': 2.838148600168801e-06, 'df': 2}

mod_sel = choixglmstats.bestglm(SAh, upper=form)

print(mod_sel.sort_values(by=["BIC","nb_var"]).iloc[:5,[1,3]])

                                        var_added         BIC
176           (tobacco, famhist, ldl, typea, age)  509.100392
287                (tobacco, famhist, typea, age)  512.495444
284                  (tobacco, famhist, ldl, age)  512.537897
91   (tobacco, famhist, ldl, typea, obesity, age)  513.424654
358                    (famhist, ldl, typea, age)  513.471151

print(mod_sel.sort_values(by=["AIC","nb_var"]).iloc[:5,[1,2]])

                                             var_added         AIC
176                (tobacco, famhist, ldl, typea, age)  484.326091
91        (tobacco, famhist, ldl, typea, obesity, age)  484.521302
36   (tobacco, famhist, ldl, typea, obesity, age, sbp)  485.117363
93            (tobacco, famhist, ldl, typea, age, sbp)  485.311962
31   (tobacco, famhist, adiposity, ldl, typea, obes...  486.031360

Retour au sommet