import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
rng = np.random.default_rng(seed=1234)
16 Imbalanced data
Exercise 1 (Criteria on an example of imbalanced data)
n = 500                               # sample size
p = 0.05                              # positive-class probability
Y = rng.binomial(1, p=p, size=n)      # observed labels
rng = np.random.default_rng(seed=123)
P1 = rng.binomial(1, p=0.005, size=n)   # P1: almost always predicts 0
P2 = np.zeros_like(P1)                  # P2: informative predictions
for yy in range(n):
    if Y[yy] == 0:
        P2[yy] = rng.binomial(1, p=0.10, size=1)[0]
    else:
        P2[yy] = rng.binomial(1, p=0.85, size=1)[0]
from sklearn.metrics import confusion_matrix
print(confusion_matrix(Y, P1))
[[478   0]
 [ 22   0]]
print(confusion_matrix(Y, P2))
[[432  46]
 [  4  18]]
cm = confusion_matrix(Y, P2)
acc = cm.diagonal().sum()/cm.sum()      # accuracy
rec = cm[1,1]/cm[1,:].sum()             # recall
prec = cm[1,1]/cm[:,1].sum()            # precision
print(acc)
print(rec)
print(prec)
0.9
0.8181818181818182
0.28125
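For reference, the next block computes the F1 score and Cohen's kappa by hand; the variable rand below is the chance agreement \(p_e\) obtained from the margins of the confusion matrix:
\[
\kappa=\frac{\text{acc}-p_e}{1-p_e},\qquad p_e=\sum_{k\in\{0,1\}}\frac{n_{\cdot k}}{n}\,\frac{n_{k\cdot}}{n},
\]
where \(n_{k\cdot}\) and \(n_{\cdot k}\) denote the row and column totals of the confusion matrix.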
F1 = 2*(rec*prec)/(rec+prec)
print(F1)
rand = cm[:,0].sum()/n*cm[0,:].sum()/n + cm[:,1].sum()/n*cm[1,:].sum()/n  # chance agreement p_e
kappa = (acc-rand)/(1-rand)
print(kappa)
0.41860465116279066
0.37786183555644093
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.metrics import f1_score, cohen_kappa_score
print(accuracy_score(Y, P2), "**", accuracy_score(Y, P1))
print(recall_score(Y, P2), "**", recall_score(Y, P1))
print(precision_score(Y, P2), "**", precision_score(Y, P1))
print(f1_score(Y, P2), "**", f1_score(Y, P1))
print(cohen_kappa_score(Y, P2), "**", cohen_kappa_score(Y, P1))
0.9 ** 0.956
0.8181818181818182 ** 0.0
0.28125 ** 0.0
0.4186046511627907 ** 0.0
0.3778618355564404 ** 0.0
In this sample P1 never predicts the positive class: its accuracy looks excellent (0.956) but its recall, precision, F1 and kappa are all zero. P2 has a lower accuracy (0.9) yet clearly dominates on every criterion that accounts for the imbalance.
Exercise 2 (Retrospective sampling)
We first note that \(\mathbf P(\tilde y_i=1)=\mathbf P(y_i=1|s_i=1)\). Moreover
\[
\text{logit}\, p_\beta(x_i)=\log\frac{\mathbf P(y_i=1)}{\mathbf P(y_i=0)}\quad\text{and}\quad \text{logit}\, p_\gamma(x_i)=\log\frac{\mathbf P(y_i=1|s_i=1)}{\mathbf P(y_i=0|s_i=1)}.
\]
By Bayes' rule,
\[
\mathbf P(y_i=1|s_i=1)=\frac{\mathbf P(y_i=1,s_i=1)}{\mathbf P(s_i=1)}=\frac{\mathbf P(s_i=1|y_i=1)\mathbf P(y_i=1)}{\mathbf P(s_i=1)}
\]
and
\[
\mathbf P(y_i=0|s_i=1)=\frac{\mathbf P(y_i=0,s_i=1)}{\mathbf P(s_i=1)}=\frac{\mathbf P(s_i=1|y_i=0)\mathbf P(y_i=0)}{\mathbf P(s_i=1)}.
\]
Hence
\[
\text{logit}\, p_\gamma(x_i)=\log\frac{\mathbf P(y_i=1)}{\mathbf P(y_i=0)}+\log\frac{\mathbf P(s_i=1|y_i=1)}{\mathbf P(s_i=1|y_i=0)}=\text{logit}\,p_\beta(x_i)+\log\left(\frac{\tau_{1i}}{\tau_{0i}}\right).
\]
In other words, when the retention probabilities \(\tau_1\) and \(\tau_0\) do not depend on \(i\), fitting the logistic regression on the retrospective sample leaves the slope coefficients unchanged and only shifts the intercept by \(\log(\tau_1/\tau_0)\).
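As a quick numerical sanity check of this result (not part of the original solution; the simulation below, including the seed, the true coefficients and the retention rates tau0 and tau1, is purely illustrative), we can fit the same logistic regression on a full sample and on a retrospective subsample and compare the estimates:

import numpy as np
from sklearn.linear_model import LogisticRegression

rng_check = np.random.default_rng(42)       # illustrative seed
n_sim = 200_000
x = rng_check.normal(size=n_sim)
beta0, beta1 = -3.0, 1.0                    # true coefficients: positives are rare
y = rng_check.binomial(1, 1/(1 + np.exp(-(beta0 + beta1*x))))

tau1, tau0 = 1.0, 0.1                       # keep all positives, 10% of negatives
keep = (y == 1) | (rng_check.uniform(size=n_sim) < tau0)
full = LogisticRegression(penalty=None, solver="newton-cholesky").fit(x.reshape(-1, 1), y)
retro = LogisticRegression(penalty=None, solver="newton-cholesky").fit(
    x[keep].reshape(-1, 1), y[keep])
print(full.coef_[0][0], retro.coef_[0][0])  # slopes should roughly agree
print(retro.intercept_[0] - full.intercept_[0], np.log(tau1/tau0))  # offset ~ log(tau1/tau0)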
Exercise 3 (Rebalancing)
df1 = pd.read_csv("../donnees/dd_exo3_1.csv", header=0, sep=',')
df2 = pd.read_csv("../donnees/dd_exo3_2.csv", header=0, sep=',')
df3 = pd.read_csv("../donnees/dd_exo3_3.csv", header=0, sep=',')
print(df1.describe())
print(df2.describe())
print(df3.describe())
                X1           X2            Y
count  1000.000000  1000.000000  1000.000000
mean      0.514433     0.492924     0.441000
std       0.281509     0.291467     0.496755
min       0.000516     0.000613     0.000000
25%       0.284947     0.238695     0.000000
50%       0.518250     0.494121     0.000000
75%       0.753628     0.739679     1.000000
max       0.999567     0.999829     1.000000
                X1           X2            Y
count  1000.000000  1000.000000  1000.000000
mean      0.520809     0.472473     0.308000
std       0.280013     0.283496     0.461898
min       0.002732     0.000890     0.000000
25%       0.296167     0.225272     0.000000
50%       0.521226     0.468858     0.000000
75%       0.764060     0.693746     1.000000
max       0.996044     0.999183     1.000000
                X1           X2            Y
count  1000.000000  1000.000000  1000.000000
mean      0.538032     0.454919     0.158000
std       0.273863     0.271638     0.364924
min       0.004914     0.000613     0.000000
25%       0.322489     0.221447     0.000000
50%       0.545587     0.450438     0.000000
75%       0.781116     0.663637     0.000000
max       0.996044     0.999829     1.000000
The mean of Y gives the positive-class rate: about 44% in df1, 31% in df2 and only 16% in df3, so the imbalance grows from the first dataset to the third.
= ["C1", "C2"] colo = ["o", "d"] mark for yy in [0, 1]: ==yy, "X1"], df1.loc[df1.Y==yy, "X2"], color=colo[yy], marker=mark[yy]) plt.scatter(df1.loc[df1.Y
for yy in [0, 1]:
    plt.scatter(df2.loc[df2.Y==yy, "X1"], df2.loc[df2.Y==yy, "X2"],
                color=colo[yy], marker=mark[yy])
for yy in [0, 1]:
    plt.scatter(df3.loc[df3.Y==yy, "X1"], df3.loc[df3.Y==yy, "X2"],
                color=colo[yy], marker=mark[yy])
from sklearn.model_selection import train_test_split
## split into an X matrix and Y (and create the product = interaction term)
T1 = df1.drop(columns="Y")
X1 = T1.assign(inter=T1.X1 * T1.X2).to_numpy()
y1 = df1.Y.to_numpy()
T2 = df2.drop(columns="Y")
X2 = T2.assign(inter=T2.X1 * T2.X2).to_numpy()
y2 = df2.Y.to_numpy()
T3 = df3.drop(columns="Y")
X3 = T3.assign(inter=T3.X1 * T3.X2).to_numpy()
y3 = df3.Y.to_numpy()
## training/validation split
X1_app, X1_valid, y1_app, y1_valid = train_test_split(
    X1, y1, test_size=0.33, random_state=1234)
X2_app, X2_valid, y2_app, y2_valid = train_test_split(
    X2, y2, test_size=0.33, random_state=1234)
X3_app, X3_valid, y3_app, y3_valid = train_test_split(
    X3, y3, test_size=0.33, random_state=1234)
from sklearn.linear_model import LogisticRegression
mod1 = LogisticRegression(penalty=None, solver="newton-cholesky").fit(X1_app, y1_app)
mod2 = LogisticRegression(penalty=None, solver="newton-cholesky").fit(X2_app, y2_app)
mod3 = LogisticRegression(penalty=None, solver="newton-cholesky").fit(X3_app, y3_app)
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, cohen_kappa_score
# each model is evaluated on its own validation set
P1 = mod1.predict(X1_valid)
P2 = mod2.predict(X2_valid)
P3 = mod3.predict(X3_valid)
s1 = pd.DataFrame({"crit": ["acc", "bal_acc", "F1", "Kappa"]})
s2 = pd.DataFrame({"crit": ["acc", "bal_acc", "F1", "Kappa"]})
s3 = pd.DataFrame({"crit": ["acc", "bal_acc", "F1", "Kappa"]})
print("--- donnees 1 ---")
s1 = s1.assign(brut=0.0)
s1.iloc[0,1] = accuracy_score(y1_valid, P1)
s1.iloc[1,1] = balanced_accuracy_score(y1_valid, P1)
s1.iloc[2,1] = f1_score(y1_valid, P1)
s1.iloc[3,1] = cohen_kappa_score(y1_valid, P1)
print(s1)
print("--- donnees 2 ---")
s2 = s2.assign(brut=0.0)
s2.iloc[0,1] = accuracy_score(y2_valid, P2)
s2.iloc[1,1] = balanced_accuracy_score(y2_valid, P2)
s2.iloc[2,1] = f1_score(y2_valid, P2)
s2.iloc[3,1] = cohen_kappa_score(y2_valid, P2)
print(s2)
print("--- donnees 3 ---")
s3 = s3.assign(brut=0.0)
s3.iloc[0,1] = accuracy_score(y3_valid, P3)
s3.iloc[1,1] = balanced_accuracy_score(y3_valid, P3)
s3.iloc[2,1] = f1_score(y3_valid, P3)
s3.iloc[3,1] = cohen_kappa_score(y3_valid, P3)
print(s3)
--- donnees 1 ---
      crit      brut
0      acc  0.657576
1  bal_acc  0.655825
2       F1  0.622074
3    Kappa  0.309572
--- donnees 2 ---
      crit      brut
0      acc  0.724242
1  bal_acc  0.740841
2       F1  0.637450
3    Kappa  0.427280
--- donnees 3 ---
      crit      brut
0      acc  0.693939
1  bal_acc  0.729286
2       F1  0.435754
3    Kappa  0.278103
from imblearn.under_sampling import RandomUnderSampler
from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
## RandomOverSampler
ros3 = RandomOverSampler(random_state=123)
X3_app_reech, y3_app_reech = ros3.fit_resample(X3_app, y3_app)
mod3_ros = LogisticRegression(penalty=None, solver="newton-cholesky").fit(X3_app_reech, y3_app_reech)
## SMOTE
sm = SMOTE(random_state=123)
X3_app_reech, y3_app_reech = sm.fit_resample(X3_app, y3_app)
mod3_sm = LogisticRegression(penalty=None, solver="newton-cholesky").fit(X3_app_reech, y3_app_reech)
## RandomUnderSampler
rus3 = RandomUnderSampler(random_state=123)
X3_app_reech, y3_app_reech = rus3.fit_resample(X3_app, y3_app)
mod3_rus = LogisticRegression(penalty=None, solver="newton-cholesky").fit(X3_app_reech, y3_app_reech)
## Tomek links
tl = TomekLinks(sampling_strategy='all')
X3_app_reech, y3_app_reech = tl.fit_resample(X3_app, y3_app)
mod3_tl = LogisticRegression(penalty=None, solver="newton-cholesky").fit(X3_app_reech, y3_app_reech)
P3_ros = mod3_ros.predict(X3_valid)
P3_sm = mod3_sm.predict(X3_valid)
P3_rus = mod3_rus.predict(X3_valid)
P3_tl = mod3_tl.predict(X3_valid)
s3 = s3.assign(ros=[accuracy_score(y3_valid, P3_ros),
                    balanced_accuracy_score(y3_valid, P3_ros),
                    f1_score(y3_valid, P3_ros),
                    cohen_kappa_score(y3_valid, P3_ros)])
s3 = s3.assign(sm=[accuracy_score(y3_valid, P3_sm),
                   balanced_accuracy_score(y3_valid, P3_sm),
                   f1_score(y3_valid, P3_sm),
                   cohen_kappa_score(y3_valid, P3_sm)])
s3 = s3.assign(rus=[accuracy_score(y3_valid, P3_rus),
                    balanced_accuracy_score(y3_valid, P3_rus),
                    f1_score(y3_valid, P3_rus),
                    cohen_kappa_score(y3_valid, P3_rus)])
s3 = s3.assign(tl=[accuracy_score(y3_valid, P3_tl),
                   balanced_accuracy_score(y3_valid, P3_tl),
                   f1_score(y3_valid, P3_tl),
                   cohen_kappa_score(y3_valid, P3_tl)])
print(s3)
      crit      brut       ros        sm       rus        tl
0      acc  0.693939  0.603030  0.603030  0.612121  0.854545
1  bal_acc  0.729286  0.683929  0.683929  0.689286  0.520000
2       F1  0.435754  0.379147  0.379147  0.384615  0.076923
3    Kappa  0.278103  0.192415  0.192415  0.200606  0.066038
On this dataset none of the resampling schemes improves on the raw fit: random over- and under-sampling lower every criterion, and Tomek-link cleaning boosts accuracy only by favouring the majority class, as its collapsed balanced accuracy, F1 and kappa show. Rebalancing is not automatically beneficial and must be judged with imbalance-aware criteria.
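As a closing side note (this goes beyond the original exercise), when resampling is combined with cross-validation rather than a single train/validation split, the sampler should be refit inside each training fold so that duplicated or synthetic points never leak into the evaluation folds. A minimal sketch using imbalanced-learn's Pipeline on the X3, y3 arrays built above:

from imblearn.pipeline import Pipeline
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

pipe = Pipeline([
    ("resample", RandomOverSampler(random_state=123)),   # refit on each training fold only
    ("clf", LogisticRegression(penalty=None, solver="newton-cholesky")),
])
scores = cross_val_score(pipe, X3, y3, cv=5, scoring="balanced_accuracy")
print(scores.mean())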