import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston, load_iris
from sklearn.linear_model import Ridge,Lasso,ElasticNet,LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
import mglearn
import matplotlib.pyplot as plt
import matplotlib
matplotlib.rcParams['font.family']='Malgun Gothic'
matplotlib.rcParams['axes.unicode_minus'] = False
import warnings
warnings.simplefilter('ignore')
Ridge Regression (L2 regularization)
- Ridge regression is a linear model that improves on ordinary linear regression
- It is similar to linear regression, but differs in that it keeps the weight values as small as possible
- In other words, it applies regularization so that each feature's influence on the output is kept to a minimum
- Regularization guards against the multicollinearity problem, which in turn helps prevent the model from overfitting
- Multicollinearity occurs when two features are so strongly correlated that they are nearly identical
- Ridge regression finds the parameters w that minimize the squared-error loss plus an L2 penalty on the weights: MSE(w) + α·Σ wⱼ²
- If α = 0, ridge regression is identical to ordinary linear regression; as α grows, all weights shrink toward 0 and the fit eventually becomes a horizontal line through the mean of the data
- α: a hyperparameter chosen by the user
- A larger α means stronger regularization; a smaller α means weaker regularization
boston = load_boston()
boston_df = pd.DataFrame(boston['data'],columns=boston['feature_names'])
boston_df['MEDV'] = boston.target
boston_df
x_data = boston_df.iloc[:,:-1]
y_data = boston_df.iloc[:,-1]
x_train, x_test, y_train, y_test = train_test_split(x_data,y_data,
test_size = 0.2,random_state=1)
model_ridge = make_pipeline(StandardScaler(),Ridge())
model_ridge.fit(x_train,y_train)
print(model_ridge.score(x_train,y_train))
print(model_ridge.score(x_test,y_test))
[OUT]:
0.7293360329044442
0.7634038058032349
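As a side note (not in the original post), the fitted Ridge step can be pulled out of the pipeline to see the shrinkage directly: make_pipeline names each step after its lowercased class name, so the regressor is available under named_steps['ridge']. A minimal sketch, assuming the model_ridge pipeline fitted above:
ridge_step = model_ridge.named_steps['ridge']
coef_series = pd.Series(ridge_step.coef_, index=x_data.columns)
print(coef_series.sort_values())   # per-feature weights (learned on standardized inputs)
print(ridge_step.intercept_)       # intercept term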
model5_ridge = make_pipeline(StandardScaler(),Ridge(alpha=5))
model5_ridge.fit(x_train,y_train)
print(model5_ridge.score(x_train,y_train))
print(model5_ridge.score(x_test,y_test))
[OUT]:
0.7198579444418842
0.76137898445528
model10_ridge = make_pipeline(StandardScaler(),Ridge(alpha=10))
model10_ridge.fit(x_train,y_train)
print(model10_ridge.score(x_train,y_train))
print(model10_ridge.score(x_test,y_test))
[OUT]:
0.7173630227001735
0.7569931620201513
Conclusion: the scores barely change across these Ridge alpha values.
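To check this more systematically than by copy-pasting cells, here is a short sketch (not in the original notebook) that loops over a few alpha values and prints the train/test R² for each, reusing the x_train/x_test split above:
for a in [0.01, 0.1, 1, 5, 10, 100]:
    m = make_pipeline(StandardScaler(), Ridge(alpha=a))
    m.fit(x_train, y_train)
    # train / test R^2 for each regularization strength
    print(f'alpha={a}: train={m.score(x_train, y_train):.4f}, test={m.score(x_test, y_test):.4f}')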
Lasso Regression (L1 regularization)
- Lasso regression is another model that applies regularization to linear regression
- Like ridge regression it pushes the weights toward 0, but it does so in a slightly different way; as the sketch below shows, the L1 penalty can set some weights exactly to 0
- Lasso regression finds the parameters w that minimize the squared-error loss plus an L1 penalty on the weights: MSE(w) + α·Σ|wⱼ|
- As with ridge, the strength of the regularization is controlled through the hyperparameter α
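A quick illustrative sketch (not part of the original notebook) of that feature-selection effect, assuming x_train/y_train from the Boston split above: as alpha grows, more coefficients are driven exactly to zero.
for a in [0.01, 0.1, 0.5, 1, 2]:
    m = Lasso(alpha=a).fit(x_train, y_train)
    n_zero = np.sum(m.coef_ == 0)   # number of features dropped by the L1 penalty
    print(f'alpha={a}: zeroed {n_zero} of {len(m.coef_)} coefficients')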
model_lasso = Lasso(alpha=0.5)
model_lasso.fit(x_train,y_train)
print(model_lasso.score(x_train,y_train))
print(model_lasso.score(x_test,y_test))
[OUT]:
0.7003419060083982
0.7092912054989273
param_value = {'alpha':[0.0001,0.01,1,2,3,4]}
modelLasso = Lasso()
gridSearch = GridSearchCV(modelLasso,param_grid = param_value,cv=10) # cross-validation to search for the best parameter value
gridSearch.fit(x_train,y_train)
[OUT]:
GridSearchCV(cv=10, error_score=nan,
estimator=Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
max_iter=1000, normalize=False, positive=False,
precompute=False, random_state=None,
selection='cyclic', tol=0.0001, warm_start=False),
iid='deprecated', n_jobs=None,
param_grid={'alpha': [0.0001, 0.01, 1, 2, 3, 4]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
gridSearch.best_params_ # use this as a reference and try a wider range of alpha values
[OUT]:
{'alpha': 0.0001}
gridSearch.best_score_
[OUT]:
0.6774218498018814
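best_score_ is the mean cross-validated R² over the training folds; to evaluate the refit best estimator on the hold-out set (a small addition, not in the original post), GridSearchCV's score() delegates to best_estimator_:
print(gridSearch.score(x_test, y_test))                               # R^2 of the refit best estimator on the test set
print(r2_score(y_test, gridSearch.best_estimator_.predict(x_test)))   # same value, computed explicitly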
gridSearch.cv_results_
[OUT]:
{'mean_fit_time': array([0.00179813, 0.00139689, 0.00129669, 0.00119786, 0.00129778,
0.00109549]),
'std_fit_time': array([0.00071171, 0.00048804, 0.00045716, 0.00040151, 0.00045661,
0.00030043]),
'mean_score_time': array([0.00069833, 0.00059776, 0.00069804, 0.00070024, 0.000494 ,
0.00089939]),
'std_score_time': array([0.00045717, 0.00048808, 0.00045698, 0.00045846, 0.0004941 ,
0.00030033]),
'param_alpha': masked_array(data=[0.0001, 0.01, 1, 2, 3, 4],
mask=[False, False, False, False, False, False],
fill_value='?',
dtype=object),
'params': [{'alpha': 0.0001},
{'alpha': 0.01},
{'alpha': 1},
{'alpha': 2},
{'alpha': 3},
{'alpha': 4}],
'split0_test_score': array([0.86745132, 0.87159532, 0.77786625, 0.7049526 , 0.65308043,
0.61660916]),
'split1_test_score': array([0.538554 , 0.54065526, 0.59957882, 0.55813648, 0.52150994,
0.47828249]),
'split2_test_score': array([0.45805684, 0.44812811, 0.34775631, 0.2820583 , 0.24400292,
0.21386043]),
'split3_test_score': array([0.84033252, 0.83841814, 0.73468643, 0.70542295, 0.68816699,
0.66971321]),
'split4_test_score': array([0.69762499, 0.69553781, 0.60382977, 0.58891015, 0.58349326,
0.58780199]),
'split5_test_score': array([0.74195821, 0.74393307, 0.69987696, 0.67539683, 0.6700896 ,
0.67008788]),
'split6_test_score': array([0.49837397, 0.5083782 , 0.61719606, 0.58645479, 0.56054857,
0.5277361 ]),
'split7_test_score': array([0.7219826 , 0.72205428, 0.61751933, 0.58268779, 0.54074573,
0.50472987]),
'split8_test_score': array([0.65840082, 0.65557979, 0.62450096, 0.59653796, 0.56022088,
0.52822911]),
'split9_test_score': array([0.75148323, 0.74978923, 0.67064674, 0.63077101, 0.61736658,
0.60592246]),
'mean_test_score': array([0.67742185, 0.67740692, 0.62934576, 0.59113289, 0.56392249,
0.54029727]),
'std_test_score': array([0.13210608, 0.13264151, 0.10998914, 0.11464378, 0.11941783,
0.12568065]),
'rank_test_score': array([1, 2, 3, 4, 5, 6])}
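The raw cv_results_ dict is hard to read; a common convenience (added here, not in the original) is to wrap it in a DataFrame and keep only the columns of interest:
cv_df = pd.DataFrame(gridSearch.cv_results_)
print(cv_df[['param_alpha', 'mean_test_score', 'std_test_score', 'rank_test_score']])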
gridSearch.best_estimator_.predict(x_test)
[OUT]:
array([32.65451755, 28.09425116, 18.02726711, 21.47629568, 18.82908552,
19.88077272, 32.41998143, 18.06708964, 24.41908704, 27.00953035,
27.03787347, 28.75393746, 21.15618569, 26.85041791, 23.38805586,
20.66176498, 17.33031598, 38.24714047, 30.50412507, 8.74996034,
20.80262417, 16.26920784, 25.21841717, 24.85309794, 31.38345601,
10.71240508, 13.8046697 , 16.66025695, 36.52842352, 14.66802031,
21.12315375, 13.95484118, 43.160864 , 17.97790542, 21.80116716,
20.58470696, 17.60156619, 27.22266959, 9.46018155, 19.82795804,
24.30765777, 21.18903539, 29.57143119, 16.33972352, 19.31295781,
14.56458086, 39.20563841, 18.1096834 , 25.91031161, 20.32943949,
25.16213668, 24.42813032, 25.07251632, 26.65826038, 4.55804761,
24.08270516, 10.88692104, 26.89010932, 16.85748247, 35.88685368,
19.55883708, 27.51889298, 16.58465045, 18.7722969 , 11.13779245,
32.36427557, 36.7282874 , 21.96093821, 24.5767824 , 25.14700054,
23.43038287, 6.90751624, 16.56443681, 20.42095538, 20.80330135,
21.54021442, 33.85102664, 27.9452562 , 25.17979562, 34.65867138,
18.62514518, 23.97296001, 34.64008293, 13.34923602, 20.71340001,
30.07957403, 17.13541468, 24.30765555, 19.25709168, 16.98041183,
27.00792994, 41.85734251, 14.10871267, 23.25740407, 14.66374983,
21.86973593, 23.02740033, 29.08777596, 37.11849424, 20.53314057,
17.36584101, 17.71617663])
Multicollinearity and overfitting: visualizing how adding the L2 (Ridge) and L1 (Lasso) penalties to the MSE shifts the minimum of the total error toward w = 0
fig = plt.figure(figsize=[12,6])
rng = np.linspace(-10,10,100)
mse = ( 0.5*(rng-3) )**2 + 30
l2 = rng**2
l1 = 5*np.abs(rng)
ridge = mse + l2
lasso = mse + l1
plt.subplot(1,2,1)
plt.plot(rng,mse,label='MSE')
plt.plot(rng,l2,'--',label='L2')
plt.plot(rng,ridge, lw=2, label='Ridge')
plt.xlabel('w'); plt.ylabel('Error')
plt.legend()
plt.subplot(1,2,2)
plt.plot(rng,mse,label='MSE')
plt.plot(rng,l1,'--',label='L1')
plt.plot(rng,lasso, lw=2, label='Lasso')
plt.xlabel('w'); plt.ylabel('Error')
plt.legend()
plt.show()
Elastic-Net
- Elastic-Net is a linear model that uses both the Ridge (L2) and the Lasso (L1) penalties
- Because it combines the strengths of both models, it often performs well
- It tends to outperform the two models above when there are many features or when the features are strongly correlated with one another
- Elastic-Net finds the parameters w that minimize the squared-error loss plus a weighted mix of the two penalties: MSE(w) + α·ρ·Σ|wⱼ| + α·(1−ρ)·Σ wⱼ²
- α: controls the overall strength of the regularization
- ρ (l1_ratio in scikit-learn): controls the balance between the Lasso (L1) and Ridge (L2) penalties
model_elastic = ElasticNet(alpha=0.01, l1_ratio=0.5)
model_elastic.fit(x_train,y_train)
print(model_elastic.score(x_train,y_train))
print(model_elastic.score(x_test,y_test))
[OUT]:
0.7231027704401571
0.7639149100563347
modelElastic = make_pipeline(StandardScaler(),ElasticNet())
param_value = {'elasticnet__alpha':[0.0001,0.01,1,2,3,4]}
gridSearch = GridSearchCV(modelElastic,param_grid = param_value,cv=10) # cross-validation to search for the best parameter value
gridSearch.fit(x_train,y_train)
[OUT]:
GridSearchCV(cv=10, error_score=nan,
estimator=Pipeline(memory=None,
steps=[('standardscaler',
StandardScaler(copy=True,
with_mean=True,
with_std=True)),
('elasticnet',
ElasticNet(alpha=1.0, copy_X=True,
fit_intercept=True,
l1_ratio=0.5, max_iter=1000,
normalize=False,
positive=False,
precompute=False,
random_state=None,
selection='cyclic',
tol=0.0001,
warm_start=False))],
verbose=False),
iid='deprecated', n_jobs=None,
param_grid={'elasticnet__alpha': [0.0001, 0.01, 1, 2, 3, 4]},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring=None, verbose=0)
gridSearch.best_params_ # use this as a reference and try a wider range of alpha values
[OUT]:
{'elasticnet__alpha': 0.01}
gridSearch.best_score_
[OUT]:
0.6779152369708372
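The grid above only tunes alpha; ElasticNet also exposes l1_ratio (the ρ mixing weight). A sketch, not from the original post, extending the same pipeline grid to search both parameters at once:
param_value2 = {'elasticnet__alpha': [0.0001, 0.01, 0.1, 1],
                'elasticnet__l1_ratio': [0.2, 0.5, 0.8]}   # closer to 1 = more Lasso-like, closer to 0 = more Ridge-like
gridSearch2 = GridSearchCV(make_pipeline(StandardScaler(), ElasticNet()),
                           param_grid=param_value2, cv=10)
gridSearch2.fit(x_train, y_train)
print(gridSearch2.best_params_)
print(gridSearch2.best_score_)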
Polynomial Regression
- Transforms the input data with nonlinear (polynomial) features before fitting
- The model itself is still a linear model, only in the expanded feature space
- Higher degrees can fit more complex data (the sketch below shows how degree 2 expands the feature set)
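As a quick look at what the transform does (an illustrative sketch, not in the original), PolynomialFeatures(degree=2) expands the 13 Boston features into a bias term, the original features, their squares, and all pairwise products:
poly = PolynomialFeatures(degree=2)
x_poly = poly.fit_transform(x_train)
print(x_train.shape, '->', x_poly.shape)   # 13 original features expand to 105 polynomial features
# poly.get_feature_names(list(x_data.columns))   # term names (older scikit-learn API)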
model = make_pipeline(PolynomialFeatures(degree=2),StandardScaler(),
                      LinearRegression()) # fits a curved response to the data, but carries a risk of overfitting
model.fit(x_train,y_train)
[OUT]:
Pipeline(memory=None,
steps=[('polynomialfeatures',
PolynomialFeatures(degree=2, include_bias=True,
interaction_only=False, order='C')),
('standardscaler',
StandardScaler(copy=True, with_mean=True, with_std=True)),
('linearregression',
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
normalize=False))],
verbose=False)
print(model.score(x_train,y_train))
print(model.score(x_test,y_test))
[OUT]:
0.9264046957163292
0.9116244055634332
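Degree 2 clearly beats the plain linear fit here, but pushing the degree higher can overfit quickly; a short comparison sketch (not in the original post):
for d in [1, 2, 3]:
    m = make_pipeline(PolynomialFeatures(degree=d), StandardScaler(), LinearRegression())
    m.fit(x_train, y_train)
    print(f'degree={d}: train={m.score(x_train, y_train):.4f}, test={m.score(x_test, y_test):.4f}')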
NN (Neural Network)
sample_size = 500
x = np.random.uniform(-20, 20, sample_size).reshape(-1, 1) # uniform distribution, reshaped to an n x 1 matrix
y = x**4 + x**3 + x**2 + x + np.random.normal(0, 10, sample_size).reshape(-1, 1) # quartic signal plus Gaussian noise, n x 1 matrix
print(x.shape,y.shape)
[OUT]:
(500, 1) (500, 1)
plt.figure(figsize=(12, 4))
plt.scatter(x, y, marker='o', s=20, alpha=0.4, c='red')
plt.show()
x_train, x_test, y_train, y_test = train_test_split(x, y,
train_size =0.7, test_size=0.3)
model_lr = LinearRegression()
model_lr.fit(x_train,y_train)
model_lr.score(x_test,y_test)
[OUT]:
0.008758601067365368
modelML = MLPRegressor(hidden_layer_sizes=[500,4],max_iter=5000,alpha=0.005,verbose=1)
modelML.fit(x_train,y_train)
modelML.score(x_test,y_test)
[OUT]:
Iteration 1, loss = 1348086820.24380994
Iteration 2, loss = 1348078679.67286968
Iteration 3, loss = 1348074303.01434731
Iteration 4, loss = 1348071849.70030856
Iteration 5, loss = 1348068903.29246926
Iteration 6, loss = 1348065898.06415272
Iteration 7, loss = 1348062829.72495413
Iteration 8, loss = 1348059495.56497169
Iteration 9, loss = 1348055918.82756877
Iteration 10, loss = 1348052390.85303426
Iteration 11, loss = 1348048433.81728268
Iteration 12, loss = 1348044496.10167122
Iteration 13, loss = 1348040582.50184202
Iteration 14, loss = 1348036040.05279827
Iteration 15, loss = 1348031565.83421302
Iteration 16, loss = 1348026659.68228030
Iteration 17, loss = 1348021647.88695765
Iteration 18, loss = 1348016564.40440392
Iteration 19, loss = 1348010995.40316415
Iteration 20, loss = 1348004849.84743071
Iteration 21, loss = 1347999057.00061202
Iteration 22, loss = 1347992455.14815497
Iteration 23, loss = 1347985390.90506387
Iteration 24, loss = 1347977849.65841413
Iteration 25, loss = 1347970452.59378242
Iteration 26, loss = 1347963177.62631512
Iteration 27, loss = 1347954570.51775265
Iteration 28, loss = 1347945369.36891294
Iteration 29, loss = 1347935866.58309317
Iteration 30, loss = 1347925657.66056752
Iteration 31, loss = 1347915565.77228522
Iteration 32, loss = 1347904299.23952842
Iteration 33, loss = 1347892380.43749285
Iteration 34, loss = 1347880609.74821854
Iteration 35, loss = 1347867452.12774944
Iteration 36, loss = 1347853558.87485480
Iteration 37, loss = 1347840350.82384372
Iteration 38, loss = 1347825141.23070741
Iteration 39, loss = 1347808559.47639847
# ...
# 중략
# ...
Iteration 5000, loss = 586793279.16171575
0.3029099690074444
The score is far too low; the hyperparameters need tuning.
plt.figure(figsize=(12, 4))
plt.scatter(x, y, marker='o', s=20, alpha=0.4, c='red')
plt.scatter(x, model_lr.predict(x))
plt.scatter(x, modelML.predict(x),c='g')
plt.show()
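Before reaching for a larger network, one thing that often helps MLP training (an assumption worth trying, not something the original post did) is standardizing the inputs and passing a 1-D target, since MLPRegressor expects y as a flat array:
modelML_scaled = make_pipeline(StandardScaler(),
                               MLPRegressor(hidden_layer_sizes=[500, 4], max_iter=5000,
                                            alpha=0.005, random_state=1))
modelML_scaled.fit(x_train, y_train.ravel())   # ravel() avoids the column-vector warning
print(modelML_scaled.score(x_test, y_test))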
modelML = MLPRegressor(hidden_layer_sizes=[1000,4],max_iter=5000,alpha=0.005,
                       verbose=1,random_state=1) # a hidden layer of 1000 units is slow to train; in practice this is where a GPU comes in
modelML.fit(x_train,y_train)
modelML.score(x_test,y_test)
[OUT]:
Iteration 1, loss = 1348082167.11370158
Iteration 2, loss = 1348054279.17683601
Iteration 3, loss = 1348025379.20340586
Iteration 4, loss = 1347993817.01645923
Iteration 5, loss = 1347960917.97664762
Iteration 6, loss = 1347927300.33108878
Iteration 7, loss = 1347892046.10749412
Iteration 8, loss = 1347854434.15682220
Iteration 9, loss = 1347816626.75935817
Iteration 10, loss = 1347776381.52606511
Iteration 11, loss = 1347736780.02402973
Iteration 12, loss = 1347693869.82800388
Iteration 13, loss = 1347650455.63751054
Iteration 14, loss = 1347602740.23888922
Iteration 15, loss = 1347553971.01440787
Iteration 16, loss = 1347500823.88644743
Iteration 17, loss = 1347446682.39321089
Iteration 18, loss = 1347389207.54090166
Iteration 19, loss = 1347326391.13907576
Iteration 20, loss = 1347263404.99006796
Iteration 21, loss = 1347193990.77164769
Iteration 22, loss = 1347122124.68278670
Iteration 23, loss = 1347043504.53793883
Iteration 24, loss = 1346962323.89247322
Iteration 25, loss = 1346873350.76161456
Iteration 26, loss = 1346780786.94481969
Iteration 27, loss = 1346686211.12362766
Iteration 28, loss = 1346580265.49404049
Iteration 29, loss = 1346472776.83622432
Iteration 30, loss = 1346357176.87810993
Iteration 31, loss = 1346238958.68130088
Iteration 32, loss = 1346110250.23181081
Iteration 33, loss = 1345977837.19453406
Iteration 34, loss = 1345831128.12127399
Iteration 35, loss = 1345683809.39496994
Iteration 36, loss = 1345529645.87906837
Iteration 37, loss = 1345364790.25585914
Iteration 38, loss = 1345185370.55407333
Iteration 39, loss = 1345012395.20316696
# ...
# 중략
# ...
Iteration 4999, loss = 15779628.29590784
Iteration 5000, loss = 15758118.27470470
0.9828074205844423
plt.figure(figsize=(12, 4))
plt.scatter(x, y, marker='o', s=20, alpha=0.4, c='red')
plt.scatter(x, model_lr.predict(x))
plt.scatter(x, modelML.predict(x),c='g')
plt.show() # fits the data almost perfectly, but it is hard to explain exactly why it works so well = a limitation of neural nets & a risk of overfitting
models = [
LinearRegression(),
MLPRegressor(hidden_layer_sizes=[512, 4], alpha=0.005, random_state=42),
MLPRegressor(hidden_layer_sizes=[48, 4], max_iter=5000, alpha=0.005, random_state=42),
MLPRegressor(hidden_layer_sizes=[512, 4], max_iter=5000, alpha=0.005, random_state=42),
MLPRegressor(hidden_layer_sizes=[1024, 4], max_iter=5000, alpha=0.005, random_state=42),
MLPRegressor(hidden_layer_sizes=[1024, 512, 4], max_iter=5000, alpha=0.005, random_state=42),
]
for m in models:
    m.fit(x_train, y_train)
    print(m.__class__)
    print(r2_score(y_train, m.predict(x_train)))
    print(r2_score(y_test, m.predict(x_test)))
[OUT]:
<class 'sklearn.linear_model._base.LinearRegression'>
0.013123375713710916
0.008758601067365368
<class 'sklearn.neural_network._multilayer_perceptron.MLPRegressor'>
-0.3768815016497078
-0.37210306791446146
<class 'sklearn.neural_network._multilayer_perceptron.MLPRegressor'>
0.6586188841641721
0.6401151351442247
<class 'sklearn.neural_network._multilayer_perceptron.MLPRegressor'>
0.929262631598112
0.9301365276356841
<class 'sklearn.neural_network._multilayer_perceptron.MLPRegressor'>
0.9914546127677839
0.9920660983258889
<class 'sklearn.neural_network._multilayer_perceptron.MLPRegressor'>
0.9997121618110802
0.9998180180922505
plt.figure(figsize=(12, 6))
plt.scatter(x, y, marker='^', s=50, alpha=0.7, label='y_true')
for i, m in enumerate(models):
    plt.scatter(x, m.predict(x), marker='o', s=10, alpha=0.7, label='nn_{}'.format(i))
plt.legend()
plt.savefig('scatter.svg')
plt.show()
review
- Choosing the hidden layer size matters a lot
- A deep neural network gives the best scores here, but it comes with a risk of overfitting
- Make sure you understand how the other models work under the hood