import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.metrics import mean_squared_error
import sys
sys.path.append('../modules')
from ols_step_sk import LinearRegressionSelectionFeatureIC
10 Regression on Components: PCR and PLS
Principal Component Regression (PCR)
ozone = pd.read_csv("../donnees/ozone.txt", header=0, sep=";", index_col=0).iloc[:, :10]
X = ozone.iloc[:, 1:10]   # the nine explanatory variables
y = ozone.iloc[:, :1]     # the response O3
reg = smf.ols('O3 ~ T12 + T15 + Ne12 + N12 + S12 + E12 + W12 + Vx + O3v', data=ozone).fit()
reg.summary()
                            OLS Regression Results
==============================================================================
Dep. Variable:                     O3   R-squared:                       0.751
Model:                            OLS   Adj. R-squared:                  0.694
Method:                 Least Squares   F-statistic:                     13.37
Date:                Fri, 31 Jan 2025   Prob (F-statistic):           1.51e-09
Time:                        17:30:32   Log-Likelihood:                -194.43
No. Observations:                  50   AIC:                             408.9
Df Residuals:                      40   BIC:                             428.0
Df Model:                           9
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     54.7278     17.279      3.167      0.003      19.806      89.650
T12           -0.3518      1.573     -0.224      0.824      -3.531       2.827
T15            1.4972      1.538      0.974      0.336      -1.611       4.605
Ne12          -4.1922      1.064     -3.941      0.000      -6.342      -2.042
N12            1.2755      1.363      0.936      0.355      -1.480       4.031
S12            3.1711      1.911      1.660      0.105      -0.691       7.033
E12            0.5277      1.943      0.272      0.787      -3.399       4.454
W12            2.4749      2.072      1.194      0.239      -1.713       6.663
Vx             0.6077      0.486      1.251      0.218      -0.374       1.589
O3v            0.2454      0.096      2.543      0.015       0.050       0.440
==============================================================================
Omnibus:                        0.252   Durbin-Watson:                   1.887
Prob(Omnibus):                  0.881   Jarque-Bera (JB):                0.364
Skew:                           0.153   Prob(JB):                        0.833
Kurtosis:                       2.716   Cond. No.                         859.
==============================================================================

Notes:
[1] Standard Errors assume that the covariance matrix of the errors is correctly specified.
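The condition number (859.) already points to correlated predictors, which is precisely the situation component regression is designed for. A minimal collinearity check (a sketch, not in the original code):

corr = X.corr()                      # pairwise correlations of the nine predictors
eigvals = np.linalg.eigvalsh(corr)   # eigenvalues of the correlation matrix
print(np.round(eigvals, 2))
print("condition number of corr(X):", round(float(eigvals.max() / eigvals.min()), 1))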
mod_lin9v = LinearRegression()
mod_lin9v.fit(X, y)
np.round(mod_lin9v.coef_, 2)
array([[-0.35, 1.5 , -4.19, 1.28, 3.17, 0.53, 2.47, 0.61, 0.25]])
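As a sanity check (a sketch), the scikit-learn fit should reproduce the statsmodels estimates above, intercept included:

print(np.round(mod_lin9v.intercept_, 4))                            # ~54.7278, as in the summary
print(np.allclose(mod_lin9v.coef_.ravel(), reg.params.values[1:]))  # slopes match: True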
reg_bic = LinearRegressionSelectionFeatureIC(crit="bic")
reg_bic.fit(X, y)
X.columns[reg_bic.selected_features_]
Index(['T15', 'Ne12', 'Vx', 'O3v'], dtype='object')
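For comparison, refitting a plain OLS on the four variables kept by BIC gives the corresponding submodel (a sketch; reg_bic_ols is a hypothetical name, and the formula is read off the selection above):

reg_bic_ols = smf.ols('O3 ~ T15 + Ne12 + Vx + O3v', data=ozone).fit()
print(reg_bic_ols.params.round(3))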
cr = StandardScaler()
acp = PCA(n_components=9)
pipe_acp = Pipeline(steps=[("cr", cr), ("acp", acp)])
Xortho = pipe_acp.fit_transform(X)   # centred-scaled data projected on the 9 components
reg_bic.fit(Xortho, y)
varsel = reg_bic.selected_features_
varsel
[0, 4]
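Selection on the orthogonal scores keeps the components with indices 0 and 4, i.e. the first and the fifth principal components. A sketch of how much of the variance of X those two components carry:

evr = pipe_acp.named_steps["acp"].explained_variance_ratio_
print(np.round(evr, 3))
print("share of variance kept:", round(float(evr[varsel].sum()), 3))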
step_cr = pipe_acp.named_steps["cr"]
stdX = step_cr.scale_     # standard deviations used by the scaler
meanX = step_cr.mean_     # means used by the scaler
coef_pcr = reg_bic.coef_
step_acp = pipe_acp.named_steps["acp"]
P = step_acp.components_[reg_bic.selected_features_, ]   # loadings of the selected components
betafinpcr = P.T.dot(coef_pcr.T)[:, 0] / stdX            # coefficients in the original scale
intercept = y.mean(axis=0) - meanX.dot(betafinpcr.T)
np.round(np.append(intercept, betafinpcr), 2)
array([ 5.279e+01, 4.100e-01, 4.700e-01, -2.540e+00, -8.700e-01,
2.000e-02, 1.060e+00, -8.500e-01, 2.300e-01, 3.400e-01])
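A cross-check, assuming the custom LinearRegressionSelectionFeatureIC exposes a scikit-learn-style predict (an assumption about the module, not shown here): predictions from the component-space fit should coincide with X @ betafinpcr + intercept.

pred_scores = reg_bic.predict(Xortho)                   # assumed API of the custom class
pred_orig = X.values.dot(betafinpcr) + intercept.values
print(np.allclose(np.ravel(pred_scores), pred_orig))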
cr = StandardScaler()
acp = PCA()
reg = LinearRegression()
pipe_pcr = Pipeline(steps=[("cr", cr), ("acp", acp), ("reg", reg)])
param_grid_pcr = {"acp__n_components": list(range(1, 10))}
kf = KFold(n_splits=4, shuffle=True, random_state=0)
cv_pcr = GridSearchCV(pipe_pcr, param_grid_pcr, cv=kf,
                      scoring='neg_mean_squared_error', n_jobs=3).fit(X, y)
import matplotlib.cm as cm
colors = cm.Set1(range(1))
colors2 = colors.copy()   # copy, otherwise the alpha change below would also affect colors
colors2[:, 3] = colors2[:, 3] - 0.7
fig, ax1 = plt.subplots(1, 1)
ax1.errorbar(list(range(1, 10)), -cv_pcr.cv_results_["mean_test_score"],
             cv_pcr.cv_results_["std_test_score"] / kf.n_splits**0.5,
             capsize=10, c=colors[0], ecolor=colors2[0])
fig.tight_layout()
bestpcr = cv_pcr.best_estimator_
cv_pcr.best_params_
step_reg = bestpcr.named_steps["reg"]
coef_pcr = step_reg.coef_
step_cr = bestpcr.named_steps["cr"]
stdX = step_cr.scale_
meanX = step_cr.mean_
step_acp = bestpcr.named_steps["acp"]
P = step_acp.components_
betafinpcr = P.T.dot(coef_pcr.T)[:, 0] / stdX
intercept = y.mean(axis=0) - meanX.dot(betafinpcr.T)
np.round(np.append(intercept, betafinpcr), 2)
array([58.33, 0.68, 0.44, -4.45, 1.24, 2.11, 1.84, 0.6 , 0.15,
0.25])
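The grid-search winner is still a linear model in the original variables, so the back-transformed coefficients can be validated directly (a sketch):

print(np.allclose(bestpcr.predict(X).ravel(),
                  X.values.dot(betafinpcr) + intercept.values))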
Partial Least Squares Regression (PLS)
regpls = PLSRegression()
param_grid_pls = {"n_components": list(range(1, 10))}
kf = KFold(n_splits=4, shuffle=True, random_state=0)
cv_pls = GridSearchCV(regpls, param_grid_pls, cv=kf,
                      scoring='neg_mean_squared_error').fit(X, y)
colors = cm.Set1(range(1))
colors2 = colors.copy()   # copy again to keep colors intact
colors2[:, 3] = colors2[:, 3] - 0.6
fig, ax1 = plt.subplots(1, 1)
ax1.errorbar(list(range(1, 10)), -cv_pls.cv_results_["mean_test_score"],
             cv_pls.cv_results_["std_test_score"] / kf.n_splits**0.5,
             capsize=10, c=colors[0], ecolor=colors2[0])
fig.tight_layout()
bestpls = cv_pls.best_estimator_
cv_pls.best_params_
bestpls.fit(X, y)
bestpls.coef_
array([[ 0.41610174, 0.53217084, -4.4244714 , 0.37042358, 1.60857405,
0.98429296, 0.56053927, 0.24800203, 0.26473814]])
scalerX = StandardScaler().fit(X)
meanX = scalerX.mean_
# in recent scikit-learn (>= 1.1), PLSRegression.coef_ has shape
# (n_targets, n_features) and is already expressed in the original scale
# of X, so no division by the standard deviations is needed here
betafinpls = bestpls.coef_[0]
intercept = y.mean(axis=0) - meanX.dot(betafinpls.T)
np.append(intercept, betafinpls)
array([7.95723726e+01, 8.99163083e-02, 8.43936432e-02, 1.65784166e-01,
1.98000066e-01, 2.80306490e-01, 1.88545955e-01, 1.71475028e-01,
3.09237815e-02, 1.68441318e-02])
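The same validation for PLS (a sketch, still assuming scikit-learn >= 1.1): predictions rebuilt from betafinpls and intercept should match those of the fitted model.

print(np.allclose(bestpls.predict(X).ravel(),
                  X.values.dot(betafinpls) + intercept.values))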