import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold
from imblearn.under_sampling import RandomUnderSampler, TomekLinks
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import accuracy_score, f1_score, \
    balanced_accuracy_score, cohen_kappa_score, roc_auc_score
import sklearn.metrics as sklm
16 Imbalanced data
Imbalanced data and the logistic model
rng = np.random.default_rng(seed=1234)
n = 200
N = 1000
beta1_m1 = np.repeat(0.0, N)
beta2_m1 = np.repeat(0.0, N)
beta1_m2 = np.repeat(0.0, N)
beta2_m2 = np.repeat(0.0, N)
for k in range(0, N):
    X = rng.uniform(0, 1, n)
    Z1 = 3 - 6*X
    Z2 = -1 - 6*X
    p1 = np.exp(Z1)/(1+np.exp(Z1))
    p2 = np.exp(Z2)/(1+np.exp(Z2))
    Y1 = np.repeat(0, n)
    Y2 = np.repeat(0, n)
    for i in range(0, n):
        Y1[i] = rng.binomial(1, p1[i], size=1)[0]
        Y2[i] = rng.binomial(1, p2[i], size=1)[0]
    df = pd.DataFrame({"X": X, "Y1": Y1, "Y2": Y2})
    mod1 = smf.glm("Y1~1+X", data=df, family=sm.families.Binomial()).fit()
    mod2 = smf.glm("Y2~1+X", data=df, family=sm.families.Binomial()).fit()
    beta1_m1[k] = mod1.params.Intercept - 3
    beta2_m1[k] = mod1.params.X + 6
    beta1_m2[k] = mod2.params.Intercept + 1
    beta2_m2[k] = mod2.params.X + 6

labels = ["mod1", "mod2"]
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.boxplot([beta1_m1, beta1_m2], tick_labels=labels, widths=0.6)
ax1.set_ylim(-6.5, 6.5)
ax1.axhline(0, linestyle='dashed')
ax1.set_title('intercept', {'fontweight': "bold", 'fontsize': 7})
ax2.boxplot([beta2_m1, beta2_m2], tick_labels=labels, widths=0.6)
ax2.set_ylim(-6.5, 6.5)
ax2.axhline(0, linestyle='dashed')
ax2.set_title("slope", {'fontweight': "bold", 'fontsize': 7})
fig.tight_layout()
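This simulation contrasts a roughly balanced design (Y1, true coefficients (3, -6)) with a rare-event design (Y2, true coefficients (-1, -6)). The stored quantities are estimation errors (estimate minus true value), so a boxplot centred on the dashed zero line indicates essentially unbiased estimation, and its spread shows how much precision is lost when one class is rare.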
rng = np.random.default_rng(seed=1234)
n = 200
N = 1000
beta1_m2 = np.repeat(0.0, N)
beta2_m2 = np.repeat(0.0, N)
beta1_m3 = np.repeat(0.0, N)
beta2_m3 = np.repeat(0.0, N)
for k in range(0, N):
    X = rng.uniform(0, 1, n)
    Z2 = -1 - 6*X
    p2 = np.exp(Z2)/(1+np.exp(Z2))
    Y2 = np.repeat(0, n)
    Y3 = np.repeat(0, n)
    for i in range(0, n):
        s = 0
        Y2[i] = rng.binomial(1, p2[i], size=1)[0]
        while (s==0):
            Y3[i] = rng.binomial(1, p2[i], size=1)[0]
            tau = Y3[i]*0.95 + (1-Y3[i])*0.05
            s = rng.binomial(1, tau, size=1)[0]
    df = pd.DataFrame({"X": X, "Y2": Y2, "Y3": Y3})
    mod2 = smf.glm("Y2~1+X", data=df, family=sm.families.Binomial()).fit()
    mod3 = smf.glm("Y3~1+X", data=df, family=sm.families.Binomial()).fit()
    beta1_m2[k] = mod2.params.Intercept + 1
    beta2_m2[k] = mod2.params.X + 6
    beta1_m3[k] = mod3.params.Intercept + 1
    beta2_m3[k] = mod3.params.X + 6

plt.rcParams.update({"text.usetex": True})
labels = [r"$\hat\beta$", r"$\hat\gamma$"]
fig, (ax1, ax2) = plt.subplots(1, 2)
ax1.boxplot([beta1_m2, beta1_m3], tick_labels=labels, widths=0.6)
ax1.set_ylim(-6, 6)
ax1.axhline(0, linestyle='dashed')
ax1.set_title('intercept', {'fontweight': "bold", 'fontsize': 7})
ax2.boxplot([beta2_m2, beta2_m3], tick_labels=labels, widths=0.6)
ax2.set_ylim(-6.8, 6)
ax2.axhline(0, linestyle='dashed')
ax2.set_title("slope", {'fontweight': "bold", 'fontsize': 7})
fig.tight_layout()
"text.usetex": True})
plt.rcParams.update({= beta1_m3 - np.log(0.95/0.05)
beta1_m3cor = [r"$\hat\beta_1$", r"$\hat\gamma_1$", r"$\hat\gamma_{1,\mbox{cor}}$"]
labels = plt.subplots(1,1)
fig, ax1 =labels, widths=0.5)
ax1.boxplot([beta1_m2, beta1_m3, beta1_m3cor], tick_labels-6,6)
ax1.set_ylim(0, linestyle='dashed') ax1.axhline(
= pd.DataFrame({"MALADE": [208, 42], "NON_MALADE": [48, 202], "FUMEUR": ["OUI", "NON"]})
df = smf.glm("MALADE+NON_MALADE~FUMEUR", data=df, family=sm.families.Binomial()).fit()
model round(model.params,3)
Intercept -1.571
FUMEUR[T.OUI] 3.037
dtype: float64
=pd.DataFrame({"FUMEUR": ["OUI", "NON"]})
newX model.predict(newX)
0 0.812500
1 0.172131
dtype: float64
= model.params.iloc[0] - np.log(0.995/0.005)
beta1_cor = model.params.iloc[1]
beta2 round((np.exp(beta1_cor+beta2)/(1+np.exp(beta1_cor+beta2))),3)
0.021
round((np.exp(beta1_cor)/(1+np.exp(beta1_cor))),3)
0.001
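The offset np.log(0.995/0.005) is the classical case-control correction: the sample is balanced by construction (250 diseased, 250 healthy), while the disease is assumed to affect 0.5% of the population, so cases are over-represented relative to controls by a factor (0.5/0.005)/(0.5/0.995) = 0.995/0.005, and the raw intercept is inflated by the log of that ratio. Only the intercept is affected; the FUMEUR coefficient is an odds ratio and is unchanged, which is why beta2 is reused as is.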
r = 1
rus = RandomUnderSampler(sampling_strategy=r, random_state=0)
X_r, y_r = rus.fit_resample(X, y)
dfr = pd.DataFrame(X_r).assign(y_r = y_r)
mod = smf.glm("y_r~...", data=dfr, family=sm.families.Binomial()).fit()
n1 = df.Y.value_counts()[1]
gamma = mod.params
gamma.iloc[0] - np.log(n1*r)
Strategies for imbalanced data
Some rebalancing methods
df = pd.read_csv('../donnees/dd_ex_ech_des1.csv', header=0, sep=';')
df.Y.value_counts()
y = df.Y
X = df.loc[:, ["X1", "X2"]]
fig = plt.figure()
plt.plot(df.loc[df.Y==0, "X1"], df.loc[df.Y==0, "X2"], 'o',
         df.loc[df.Y==1, "X1"], df.loc[df.Y==1, "X2"], '^')
fig.tight_layout()
ros1 = RandomOverSampler(random_state=0)
Xreech1, yreech1 = ros1.fit_resample(X, y)
print(yreech1.value_counts())
Y
0    80
1    80
Name: count, dtype: int64
ros2 = RandomOverSampler(random_state=0, sampling_strategy={0: 80, 1: 40})
Xreech2, yreech2 = ros2.fit_resample(X, y)
print(yreech2.value_counts())
Y
0    80
1    40
Name: count, dtype: int64
over1 = pd.DataFrame(Xreech1)
over1["Y"] = yreech1
smote1 = SMOTE(random_state=42, k_neighbors=4)
Xreech1, yreech1 = smote1.fit_resample(X, y)
print(yreech1.value_counts())
Y
0    80
1    80
Name: count, dtype: int64
smote2 = SMOTE(random_state=423, k_neighbors=4, \
               sampling_strategy={0: 80, 1: 40})
Xreech2, yreech2 = smote2.fit_resample(X, y)
print(yreech2.value_counts())
Y
0    80
1    40
Name: count, dtype: int64
df1 = Xreech1.assign(Yreech = yreech1)
tmp = df1.merge(df, how="outer", on=['X1', 'X2'])
nouv1 = tmp.loc[tmp.Y.isna(), :]
nouv1.Yreech.value_counts()
df2 = Xreech2.assign(Yreech = yreech2)
tmp = df2.merge(df, how="outer", on=['X1', 'X2'])
nouv2 = tmp.loc[tmp.Y.isna(), :]
nouv2.Yreech.value_counts()
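Merging the resampled sample back onto the original data on (X1, X2) and keeping the rows where Y is missing isolates the points synthesised by SMOTE: only observations absent from the original data fail to match. These synthetic points are the ones circled in the figure produced below.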
"lines", markersize=2)
plt.rc(= ["C0", "C1"]
coul = ["^", "o"]
mark = plt.subplots(1,2)
fig, (ax1, ax2) for i in range(0,2):
==i, "X1"], df.loc[df.Y==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax1.plot(df.loc[df.Y==i, "X1"], nouv1.loc[nouv1.Yreech==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax1.plot(nouv1.loc[nouv1.Yreech==i, "X1"], nouv1.loc[nouv1.Yreech==i, "X2"],\
ax1.plot(nouv1.loc[nouv1.Yreech= mark[1], ms=8, mec=coul[i], mfc='#ffffff00', ls='')
marker ==i, "X1"], df.loc[df.Y==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax2.plot(df.loc[df.Y==i, "X1"], nouv2.loc[nouv2.Yreech==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax2.plot(nouv2.loc[nouv2.Yreech==i, "X1"], nouv2.loc[nouv2.Yreech==i, "X2"],\
ax2.plot(nouv2.loc[nouv2.Yreech= mark[1], ms=8, mec=coul[i], mfc="#ffffff00", ls='')
marker
fig.tight_layout()
y = df.Y
X = df.loc[:, ["X1", "X2"]]
rus1 = RandomUnderSampler(random_state=38)
Xreech1, yreech1 = rus1.fit_resample(X, y)
print(yreech1.value_counts())
Y
0    20
1    20
Name: count, dtype: int64
rus2 = RandomUnderSampler(random_state=38, sampling_strategy={0: 40, 1: 20})
Xreech2, yreech2 = rus2.fit_resample(X, y)
print(yreech2.value_counts())
Y
0    40
1    20
Name: count, dtype: int64
tl1 = TomekLinks(sampling_strategy='all')
Xreech1, yreech1 = tl1.fit_resample(X, y)
print(yreech1.value_counts())
Y
0    76
1    16
Name: count, dtype: int64
tl2 = TomekLinks(sampling_strategy='majority')
Xreech2, yreech2 = tl2.fit_resample(X, y)
print(yreech2.value_counts())
Y
0    76
1    20
Name: count, dtype: int64
df1 = Xreech1.assign(Yreech = yreech1)
tmp = df.merge(df1, how="outer", on=['X1', 'X2'])
nouv1 = tmp.loc[tmp.Yreech.isna(), :]
nouv1.Y.value_counts()
df2 = Xreech2.assign(Yreech = yreech2)
tmp = df.merge(df2, how="outer", on=['X1', 'X2'])
nouv2 = tmp.loc[tmp.Yreech.isna(), :]
nouv2.Y.value_counts()
"lines", markersize=3)
plt.rc(= ["C0", "C1"]
coul = ["^", "o"]
mark = plt.subplots(1,2)
fig, (ax1, ax2) for i in range(0,2):
==i, "X1"], df.loc[df.Y==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax1.plot(df.loc[df.Y==i, "X1"], nouv1.loc[nouv1.Y==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax1.plot(nouv1.loc[nouv1.Y==i, "X1"], nouv1.loc[nouv1.Y==i, "X2"],\
ax1.plot(nouv1.loc[nouv1.Y= mark[1], ms=8, mec=coul[i], mfc='#ffffff00', ls='')
marker ==i, "X1"], df.loc[df.Y==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax2.plot(df.loc[df.Y==i, "X1"], nouv2.loc[nouv2.Y==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax2.plot(nouv2.loc[nouv2.Y==i, "X1"], nouv2.loc[nouv2.Y==i, "X2"],\
ax2.plot(nouv2.loc[nouv2.Y= mark[1], ms=8, mec=coul[i], mfc="#ffffff00", ls='')
marker
fig.tight_layout()
= ["C0", "C1"]
coul = ["^", "o"]
mark = plt.subplots(1,2)
fig, (ax1, ax2) for i in range(0,2):
==i, "X1"], df1.loc[df1.Yreech==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax1.plot(df1.loc[df1.Yreech==i, "X1"], df2.loc[df2.Yreech==i, "X2"], marker=mark[i], c=coul[i], ls='')
ax2.plot(df2.loc[df2.Yreech
fig.tight_layout()
Criteria for imbalanced data
= pd.read_csv("../donnees/donnees_dondesequilib.csv", header=0, sep=';')
df print(pd.crosstab(index=df.Y, columns=df.P1))
P1 0 1
Y
0 468 0
1 31 1
print(pd.crosstab(index=df.Y, columns=df.P2))
P2 0 1
Y
0 407 61
1 4 28
from sklearn.metrics import accuracy_score, balanced_accuracy_score, f1_score, cohen_kappa_score
print(np.round(accuracy_score(df.Y, df.P2), 3))
print(np.round(balanced_accuracy_score(df.Y, df.P2), 3))
print(np.round(f1_score(df.Y, df.P2), 3))
print(np.round(cohen_kappa_score(df.Y, df.P2), 3))
0.87
0.872
0.463
0.407
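As a sanity check, the four criteria can be recomputed by hand from the P2 confusion table above (class 1 taken as positive; the cell names TN, FP, FN, TP are introduced here for the sketch):
TN, FP, FN, TP = 407, 61, 4, 28
n = TN + FP + FN + TP
acc = (TP + TN)/n                                      # accuracy: 0.87
bal = (TP/(TP + FN) + TN/(TN + FP))/2                  # balanced accuracy: 0.872
f1 = 2*TP/(2*TP + FP + FN)                             # F1 score: 0.463
pe = ((TP + FP)*(TP + FN) + (TN + FN)*(TN + FP))/n**2  # agreement expected by chance
kappa = (acc - pe)/(1 - pe)                            # Cohen's kappa: 0.407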
Choosing a rebalancing algorithm
= pd.read_csv("../donnees/ad_data.txt", header=None ,sep=",",\
ad = "?", skipinitialspace=True, keep_default_na=True)
na_values = ["X" + str(i+1) for i in range(ad.shape[1] - 1)]
noms "Y")
noms.append(= noms
ad.columns = ad.dropna(axis=1)
ad1 "Y"] = ad1["Y"].astype("category").cat.rename_categories({"nonad.": 0, "ad.": 1})
ad1.loc[:,= ad1.iloc[:,:-1]
X = pd.to_numeric(ad1.iloc[:,-1])
y ad1.Y.value_counts()
Y
0 2820
1 459
Name: count, dtype: int64
from sklearn.preprocessing import StandardScaler
def grille(X, y, type="lasso", ng=400):
    """
    X: explanatory variables, sklearn format
    y: response variable, sklearn format
    type: "lasso", "ridge" or "enet"
    ng: number of values along the path
    returns the grid of C values
    """
    scalerX = StandardScaler().fit(X)
    Xcr = scalerX.transform(X)
    # smallest lambda for which all lasso coefficients are zero
    l0 = np.abs(Xcr.transpose().dot((y - y.mean()))).max()/X.shape[0]
    llc = np.linspace(0, -4, ng)
    ll = l0*10**llc
    if type=="lasso":
        Cs = 1/0.9/X.shape[0]/ll
    elif type=="ridge":
        Cs = 1/0.9/X.shape[0]/(ll * 100)
    elif type=="enet":
        Cs = 1/0.9/X.shape[0]/(ll * 2)
    return Cs
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
RES = pd.DataFrame(pd.to_numeric(ad1.iloc[:, ad1.shape[1]-1]))
for i in ["Logistic", "Lasso", "Ridge"]:
    for j in ["brut", "ros", "smote", "rus", "tomek"]:
        RES = RES.assign(**{i + "_" + j: 0})
for app_index, val_index in skf.split(X, y):
    Xapp = X.iloc[app_index, :-1]
    yapp = y.iloc[app_index]
    Xval = X.iloc[val_index, :-1]
    # grid
    Cs_lasso = grille(Xapp, yapp, "lasso")
    Cs_ridge = grille(Xapp, yapp, "ridge")
    # instantiation
    cr = StandardScaler()
    logistic = LogisticRegression(penalty=None, solver="newton-cholesky")
    lassocv = LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10, Cs=Cs_lasso, solver="saga", max_iter=2000)
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", n_jobs=10, Cs=Cs_ridge, max_iter=1000)
    pipe_logistic = Pipeline(steps=[("cr", cr), ("logistic", logistic)])
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    # fit on the raw (imbalanced) sample
    pipe_logistic.fit(Xapp, yapp)
    pipe_lassocv.fit(Xapp, yapp)
    pipe_ridgecv.fit(Xapp, yapp)
    # prediction
    RES.loc[val_index, "Logistic_brut"] = pipe_logistic.predict(Xval).ravel()
    RES.loc[val_index, "Lasso_brut"] = pipe_lassocv.predict(Xval).ravel()
    RES.loc[val_index, "Ridge_brut"] = pipe_ridgecv.predict(Xval).ravel()
for app_index, val_index in skf.split(X, y):
    Xapp = X.iloc[app_index, :-1]
    yapp = y.iloc[app_index]
    Xval = X.iloc[val_index, :-1]
    ## RandomOverSampler
    ros1 = RandomOverSampler(random_state=123)
    Xreech, yreech = ros1.fit_resample(Xapp, yapp)
    # grid
    Cs_lasso = grille(Xreech, yreech, "lasso")
    Cs_ridge = grille(Xreech, yreech, "ridge")
    # instantiation
    cr = StandardScaler()
    logistic = LogisticRegression(penalty=None, solver="newton-cholesky")
    lassocv = LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10, Cs=Cs_lasso, solver="saga", max_iter=2000)
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", n_jobs=10, Cs=Cs_ridge, max_iter=1000)
    pipe_logistic = Pipeline(steps=[("cr", cr), ("logistic", logistic)])
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    # fit on the rebalanced sample
    pipe_logistic.fit(Xreech, yreech)
    pipe_lassocv.fit(Xreech, yreech)
    pipe_ridgecv.fit(Xreech, yreech)
    # prediction
    RES.loc[val_index, "Logistic_ros"] = pipe_logistic.predict(Xval).ravel()
    RES.loc[val_index, "Lasso_ros"] = pipe_lassocv.predict(Xval).ravel()
    RES.loc[val_index, "Ridge_ros"] = pipe_ridgecv.predict(Xval).ravel()
    ## SMOTE (named smo so as not to clobber the statsmodels alias sm)
    smo = SMOTE(random_state=123)
    Xreech, yreech = smo.fit_resample(Xapp, yapp)
    # grid
    Cs_lasso = grille(Xreech, yreech, "lasso")
    Cs_ridge = grille(Xreech, yreech, "ridge")
    # instantiation
    cr = StandardScaler()
    logistic = LogisticRegression(penalty=None, solver="newton-cholesky")
    lassocv = LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10, Cs=Cs_lasso, solver="saga", max_iter=2000)
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", n_jobs=10, Cs=Cs_ridge, max_iter=1000)
    pipe_logistic = Pipeline(steps=[("cr", cr), ("logistic", logistic)])
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    # fit on the rebalanced sample
    pipe_logistic.fit(Xreech, yreech)
    pipe_lassocv.fit(Xreech, yreech)
    pipe_ridgecv.fit(Xreech, yreech)
    # prediction
    RES.loc[val_index, "Logistic_smote"] = pipe_logistic.predict(Xval).ravel()
    RES.loc[val_index, "Lasso_smote"] = pipe_lassocv.predict(Xval).ravel()
    RES.loc[val_index, "Ridge_smote"] = pipe_ridgecv.predict(Xval).ravel()
for app_index, val_index in skf.split(X, y):
    Xapp = X.iloc[app_index, :-1]
    yapp = y.iloc[app_index]
    Xval = X.iloc[val_index, :-1]
    ## RandomUnderSampler
    rus1 = RandomUnderSampler(random_state=123)
    Xreech, yreech = rus1.fit_resample(Xapp, yapp)
    # grid
    Cs_lasso = grille(Xreech, yreech, "lasso")
    Cs_ridge = grille(Xreech, yreech, "ridge")
    # instantiation
    cr = StandardScaler()
    logistic = LogisticRegression(penalty=None, solver="newton-cholesky")
    lassocv = LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10, Cs=Cs_lasso, solver="saga", max_iter=2000)
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", n_jobs=10, Cs=Cs_ridge, max_iter=1000)
    pipe_logistic = Pipeline(steps=[("cr", cr), ("logistic", logistic)])
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    # fit on the rebalanced sample
    pipe_logistic.fit(Xreech, yreech)
    pipe_lassocv.fit(Xreech, yreech)
    pipe_ridgecv.fit(Xreech, yreech)
    # prediction
    RES.loc[val_index, "Logistic_rus"] = pipe_logistic.predict(Xval).ravel()
    RES.loc[val_index, "Lasso_rus"] = pipe_lassocv.predict(Xval).ravel()
    RES.loc[val_index, "Ridge_rus"] = pipe_ridgecv.predict(Xval).ravel()
    ## TomekLinks
    tl = TomekLinks(sampling_strategy='all')
    Xreech, yreech = tl.fit_resample(Xapp, yapp)
    # grid
    Cs_lasso = grille(Xreech, yreech, "lasso")
    Cs_ridge = grille(Xreech, yreech, "ridge")
    # instantiation
    cr = StandardScaler()
    logistic = LogisticRegression(penalty=None, solver="newton-cholesky")
    lassocv = LogisticRegressionCV(cv=10, penalty="l1", n_jobs=10, Cs=Cs_lasso, solver="saga", max_iter=2000)
    ridgecv = LogisticRegressionCV(cv=10, penalty="l2", n_jobs=10, Cs=Cs_ridge, max_iter=1000)
    pipe_logistic = Pipeline(steps=[("cr", cr), ("logistic", logistic)])
    pipe_lassocv = Pipeline(steps=[("cr", cr), ("lassocv", lassocv)])
    pipe_ridgecv = Pipeline(steps=[("cr", cr), ("ridgecv", ridgecv)])
    # fit on the rebalanced sample
    pipe_logistic.fit(Xreech, yreech)
    pipe_lassocv.fit(Xreech, yreech)
    pipe_ridgecv.fit(Xreech, yreech)
    # prediction
    RES.loc[val_index, "Logistic_tomek"] = pipe_logistic.predict(Xval).ravel()
    RES.loc[val_index, "Lasso_tomek"] = pipe_lassocv.predict(Xval).ravel()
    RES.loc[val_index, "Ridge_tomek"] = pipe_ridgecv.predict(Xval).ravel()
auc = pd.Series(0.0, index=RES.columns[1:])
for i in range(auc.shape[0]):
    auc.iloc[i] = sklm.roc_auc_score(RES.Y, RES.iloc[:,i+1])
round(auc,3)
Logistic_brut 0.914
Logistic_ros 0.905
Logistic_smote 0.905
Logistic_rus 0.890
Logistic_tomek 0.917
Lasso_brut 0.907
Lasso_ros 0.687
Lasso_smote 0.687
Lasso_rus 0.690
Lasso_tomek 0.908
Ridge_brut 0.912
Ridge_ros 0.924
Ridge_smote 0.924
Ridge_rus 0.919
Ridge_tomek 0.911
dtype: float64
acc = pd.Series(0.0, index=RES.columns[1:])
s = 0.5
for i in range(acc.shape[0]):
    acc.iloc[i] = 1-sklm.zero_one_loss(RES.Y, RES.iloc[:,i+1]>s)
round(acc,3)
Logistic_brut 0.956
Logistic_ros 0.949
Logistic_smote 0.949
Logistic_rus 0.892
Logistic_tomek 0.958
Lasso_brut 0.970
Lasso_ros 0.912
Lasso_smote 0.912
Lasso_rus 0.912
Lasso_tomek 0.970
Ridge_brut 0.971
Ridge_ros 0.958
Ridge_smote 0.958
Ridge_rus 0.947
Ridge_tomek 0.970
dtype: float64
bal = pd.Series(0.0, index=RES.columns[1:])
s = 0.5
for i in range(bal.shape[0]):
    bal.iloc[i] = sklm.balanced_accuracy_score(RES.Y, RES.iloc[:,i+1]>s)
round(bal,3)
Logistic_brut 0.914
Logistic_ros 0.905
Logistic_smote 0.905
Logistic_rus 0.890
Logistic_tomek 0.917
Lasso_brut 0.907
Lasso_ros 0.687
Lasso_smote 0.687
Lasso_rus 0.690
Lasso_tomek 0.908
Ridge_brut 0.912
Ridge_ros 0.924
Ridge_smote 0.924
Ridge_rus 0.919
Ridge_tomek 0.911
dtype: float64
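These balanced accuracies coincide exactly with the AUC table above, and necessarily so: roc_auc_score applied to hard 0/1 predictions rather than to scores reduces to (sensitivity + specificity)/2, i.e. to balanced accuracy. To measure a genuine AUC, one would store the probabilities returned by predict_proba in RES instead of the 0/1 predictions.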
f1s = pd.Series(0.0, index=RES.columns[1:])
s = 0.5
for i in range(f1s.shape[0]):
    f1s.iloc[i] = sklm.f1_score(RES.Y, RES.iloc[:,i+1]>s)
round(f1s,3)
Logistic_brut 0.845
Logistic_ros 0.822
Logistic_smote 0.822
Logistic_rus 0.697
Logistic_tomek 0.852
Lasso_brut 0.883
Lasso_ros 0.543
Lasso_smote 0.543
Lasso_rus 0.549
Lasso_tomek 0.885
Ridge_brut 0.889
Ridge_ros 0.854
Ridge_smote 0.854
Ridge_rus 0.822
Ridge_tomek 0.887
dtype: float64
kappa_scores = pd.Series(0.0, index=RES.columns[1:])
s = 0.5
for i in range(kappa_scores.shape[0]):
    kappa_scores.iloc[i] = sklm.cohen_kappa_score(RES.Y, RES.iloc[:,i+1] > s)
round(kappa_scores, 3)
Logistic_brut 0.820
Logistic_ros 0.792
Logistic_smote 0.792
Logistic_rus 0.635
Logistic_tomek 0.828
Lasso_brut 0.865
Lasso_ros 0.504
Lasso_smote 0.504
Lasso_rus 0.510
Lasso_tomek 0.868
Ridge_brut 0.873
Ridge_ros 0.829
Ridge_smote 0.829
Ridge_rus 0.791
Ridge_tomek 0.870
dtype: float64
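Overall, on this data set rebalancing is no silver bullet: Tomek links leaves each model essentially at its raw performance, ridge gains marginally in balanced accuracy from over-sampling (0.924 against 0.912), and random over- and under-sampling severely degrade the lasso; on accuracy, F1 and kappa the raw fits remain the best.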