728x90
반응형
SMALL
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston, load_iris
from sklearn.linear_model import LinearRegression,Ridge, SGDRegressor # SGDRegressor은 학습 나머지는 공식
from sklearn.neural_network import MLPRegressor # MLPRegressor은 딥러닝 학습
from sklearn.metrics import r2_score # 선형 모델(Linear Models)
from sklearn.model_selection import train_test_split
import mglearn # !pip install mglearn
import matplotlib.pyplot as plt
import matplotlib
# Use the 'Malgun Gothic' font for all figures; the plots further down use Korean
# axis labels (presumably this font is chosen so those labels render -- it must be
# installed on the machine running the script).
matplotlib.rcParams['font.family']='Malgun Gothic'
# Turn off the Unicode minus sign for tick labels (presumably because the font above
# does not provide that glyph, so negative ticks would otherwise show as boxes).
matplotlib.rcParams['axes.unicode_minus'] = False
# Toy data for the line y = x: the integers 1 through 10.
x_d = list(range(1, 11))
y_d = list(range(1, 11))

# The same ten values as 10x1 column vectors, built two equivalent ways.
x_data1 = np.array([[value] for value in x_d])  # explicit nested list
x_data = np.array(x_d).reshape(-1, 1)  # flat array reshaped to 10x1 (same as x_data1)
y_data = np.array(y_d).reshape(-1, 1)
shape은 (행, 열) = (데이터의 개수, 특성의 개수)를 의미한다
x_data1.shape
[OUT]:
(10, 1)
x_data.shape
[OUT]:
(10, 1)
# Linear regression solved by formula (least squares) rather than by iterative training.
model_lr = LinearRegression()
# model_lr.fit(x_d,y_d) would raise an error: the feature data must be 2-D
# (a matrix or a DataFrame), and x_d is a flat list.
model_lr.fit(x_data,y_data)
[OUT]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
model_lr.coef_ # w값 기울기
[OUT]:
array([[1.]])
model_lr.intercept_ # b값
[OUT]:
array([2.66453526e-15])
데이터 불러오기 (pd.read_csv)
python 파일 경로에 data4 폴더 만든 후 다음의 cars.csv파일 넣어놓기
carDF = pd.read_csv('data4/cars.csv')
carDF
carDF['speed'] # 자동차 속도라는 특성 1개 -> series니까 벡터
[OUT]:
0 4
1 4
2 7
3 7
4 8
5 9
6 10
7 10
8 10
9 11
10 11
11 12
12 12
13 12
14 12
15 13
16 13
17 13
18 13
19 14
20 14
21 14
22 14
23 15
24 15
25 15
26 16
27 16
28 17
29 17
30 17
31 18
32 18
33 18
34 18
35 19
36 19
37 19
38 20
39 20
40 20
41 20
42 20
43 22
44 23
45 24
46 24
47 24
48 24
49 25
Name: speed, dtype: int64
# Indexing with a list of column labels selects columns (not rows) and returns a
# 2-D DataFrame even for a single column, which is the form needed as feature data.
carDF[['speed']]
# Confirms the 2-D shape: (number of rows, 1 column).
carDF[['speed']].shape
[OUT]:
(50, 1)
carDF['dist'] # 얘는 행렬로 변환할 필요 없음
[OUT]:
0 2
1 10
2 4
3 22
4 16
5 10
6 18
7 26
8 34
9 17
10 28
11 14
12 20
13 24
14 28
15 26
16 34
17 34
18 46
19 26
20 36
21 60
22 80
23 20
24 26
25 54
26 32
27 40
28 32
29 40
30 50
31 42
32 56
33 76
34 84
35 36
36 46
37 68
38 32
39 48
40 52
41 56
42 64
43 66
44 54
45 70
46 92
47 93
48 120
49 85
Name: dist, dtype: int64
# Naming note: pick either camelCase or snake_case and use it consistently.
modelCar = LinearRegression()
# Features are passed as a 2-D DataFrame ([['speed']]); the target is the 1-D 'dist' Series.
modelCar.fit(carDF[['speed']],carDF['dist'])
[OUT]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
modelCar.coef_ # weight
[OUT]:
array([3.93240876])
modelCar.intercept_ # bias
[OUT]:
-17.57909489051095
# Observed (speed, dist) points with the fitted regression line drawn on top.
plt.scatter(carDF.speed, carDF.dist)  # observed data
plt.plot(carDF.speed, modelCar.predict(carDF[['speed']]), 'r--')  # fitted line
plt.xlabel('speed')
plt.ylabel('dist')
plt.show()
연습문제
1. 자동차 속도가 15일 때 제동거리는 얼마로 예측되는지 구하시오
2. 자동차 속도가 13, 15일 때 제동거리는 얼마로 예측되는지 구하시오
Solution
1. 자동차 속도가 15일 때 제동거리는 얼마로 예측되는지 구하시오
# 방법1
modelCar.coef_[0] * 15 + modelCar.intercept_
[OUT]:
41.40703649635036
# 방법2
modelCar.predict([[15]]) # 매트릭스(행렬) : 행렬곱을 수행
[OUT]:
array([41.4070365])
2. 자동차 속도가 13, 15일 때 제동거리는 얼마로 예측되는지 구하시오
modelCar.predict([[13],[15]])
[OUT]:
array([33.54221898, 41.4070365 ])
연습문제
다음의 전기 데이터를 불러와서 w, b를 구하고, 전기생산량이 3.2, 4.5인 경우의 전기사용량을 예측한 뒤 scatter(실제데이터)와 plot(예측데이터)을 그리시오
Solution
# First load: read the CSV as-is to inspect it.
elecDF = pd.read_csv('data4/electric.csv')
elecDF
# Second load: use the 'Unnamed: 0' column as the index instead of keeping it as a data column.
elecDF = pd.read_csv('data4/electric.csv', index_col='Unnamed: 0')
elecDF
# Fit usage ('전기사용량') as a linear function of production ('전기생산량').
modelElec = LinearRegression()
modelElec.fit(elecDF[['전기생산량']],elecDF['전기사용량'])
[OUT]:
LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)
modelElec.coef_ # w
[OUT]:
array([0.49560324])
modelElec.intercept_ # b
[OUT]:
0.919581428068942
modelElec.predict([[3.2],[4.5]]) # 3.2, 4.5 인경우 전기 사용량을 예측
[OUT]:
array([2.50551178, 3.14979599])
# Actual production/usage points with the model's predictions drawn as a dashed red line.
plt.scatter(elecDF['전기생산량'], elecDF['전기사용량'])  # actual data
plt.plot(elecDF['전기생산량'], modelElec.predict(elecDF[['전기생산량']]), 'r--')  # predicted data
plt.xlabel('전기생산량')
plt.ylabel('전기사용량')
plt.show()
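결정 계수 $R^2$
- 회귀모델의 검증을 위한 또 다른 측정 지표 중 하나로 결정 계수(coefficient of determination, $R^2$) 사용
- R 제곱 ($R^2$, R-squared). 절편이 있는 단순선형회귀에서는 피어슨 상관 계수의 제곱과 같음
- $R^2 = 1 - \dfrac{\sum_i (y_i - \hat{y}_i)^2}{\sum_i (y_i - \bar{y})^2}$, 즉 1 - (오차의 제곱합)/(편차의 제곱합)
- 오차 : 실제 값과 예측 값의 차이
- 편차 : 실제 값과 평균 값의 차이
- 학습 데이터에 대한 최소제곱 선형회귀에서는 0 <= 결정 계수 <= 1 (0이면 0점, 1이면 100점). 단, 평균값으로 예측하는 것보다 못한 모델에서는 음수가 될 수도 있음
- 높을수록 좋음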
- from sklearn.metrics import r2_score
- y_predict = model.predict( x_data )
- r2_score( y_data, y_predict )
# Fitted line versus the mean-of-dist baseline that the R^2 definition compares against.
predictions = modelCar.predict(carDF[['speed']])

plt.figure(figsize=(10, 6))
plt.scatter(carDF['speed'], carDF['dist'])  # actual values (carDF['speed'] == carDF.speed)
plt.plot(carDF['speed'], predictions, 'r--')  # predicted values
plt.axhline(y=carDF['dist'].mean(), color='blue')  # mean of dist
plt.xlabel('speed')
plt.ylabel('dist')
plt.show()
carDF.dist.mean()
[OUT]:
42.98
predictions
[OUT]:
array([-1.84945985, -1.84945985, 9.94776642, 9.94776642, 13.88017518,
17.81258394, 21.7449927 , 21.7449927 , 21.7449927 , 25.67740146,
25.67740146, 29.60981022, 29.60981022, 29.60981022, 29.60981022,
33.54221898, 33.54221898, 33.54221898, 33.54221898, 37.47462774,
37.47462774, 37.47462774, 37.47462774, 41.4070365 , 41.4070365 ,
41.4070365 , 45.33944526, 45.33944526, 49.27185401, 49.27185401,
49.27185401, 53.20426277, 53.20426277, 53.20426277, 53.20426277,
57.13667153, 57.13667153, 57.13667153, 61.06908029, 61.06908029,
61.06908029, 61.06908029, 61.06908029, 68.93389781, 72.86630657,
76.79871533, 76.79871533, 76.79871533, 76.79871533, 80.73112409])
score (결정계수 통한 설명력 확인)
# score(X, y) takes the feature data and the actual targets (not predictions) and
# returns R^2; the value matches r2_score(actual, predicted) computed separately.
# An R^2 of about 0.65 means the model explains roughly 65% of the variation in dist.
modelCar.score(carDF[['speed']],carDF.dist)
[OUT]:
0.6510793807582509
# Same R^2 computed explicitly from predictions.
y_predict = modelCar.predict(carDF[['speed']])
# Argument order is r2_score(actual values, predicted values); about 0.65 here.
r2_score(carDF.dist,y_predict)
[OUT]:
0.6510793807582509
학습을 통한 선형회귀
# SGDRegressor fits the linear model iteratively (stochastic gradient descent).
# alpha=0.0001        : regularization strength (multiplies the penalty term; penalty='l2'
#                       in the repr printed below). It is NOT the learning rate -- the
#                       initial learning rate is eta0 (0.01 in the repr below).
# max_iter=100        : upper bound on the number of training epochs (library default: 1000).
# early_stopping=True : hold out validation_fraction (0.1) of the data and stop once the
#                       validation score has not improved for n_iter_no_change epochs.
# verbose=1           : print progress for every epoch.
# NOTE(review): the run logged below stops after 6 epochs with coef ~2.89 and
# intercept ~-1.52, far from the least-squares fit (3.93, -17.58); results also vary
# between runs because random_state is not set.
modelSGD = SGDRegressor(max_iter=100, alpha=0.0001,
early_stopping=True, verbose=1)
modelSGD.fit(carDF[['speed']],carDF['dist'])
[OUT]:
-- Epoch 1
Norm: 1.47, NNZs: 1, Bias: -0.207088, T: 45, Avg. loss: 958.504253
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 2.36, NNZs: 1, Bias: -0.536527, T: 90, Avg. loss: 343.295596
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 2.56, NNZs: 1, Bias: -0.951716, T: 135, Avg. loss: 276.255796
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 3.52, NNZs: 1, Bias: -1.079889, T: 180, Avg. loss: 194.543746
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 2.79, NNZs: 1, Bias: -1.355384, T: 225, Avg. loss: 250.073654
Total training time: 0.00 seconds.
-- Epoch 6
Norm: 2.89, NNZs: 1, Bias: -1.517053, T: 270, Avg. loss: 173.248761
Total training time: 0.00 seconds.
Convergence after 6 epochs took 0.00 seconds
SGDRegressor(alpha=0.0001, average=False, early_stopping=True, epsilon=0.1,
eta0=0.01, fit_intercept=True, l1_ratio=0.15,
learning_rate='invscaling', loss='squared_loss', max_iter=100,
n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
shuffle=True, tol=0.001, validation_fraction=0.1, verbose=1,
warm_start=False)
modelSGD.coef_
[OUT]:
array([2.89058854])
modelSGD.intercept_
[OUT]:
array([-1.51705295])
딥러닝을 이용한 선형회귀
# hidden_layer_sizes: the i-th entry is the number of units in the i-th hidden layer,
# so (100,10) means two hidden layers with 100 and 10 units. This matches the shapes of
# coefs_ printed below: 1x100, 100x10, then 10x1 for the single output.
# alpha=0.1 is the regularization strength; max_iter=5000 caps the number of training
# iterations (the logged run stops on its own at iteration 2233).
modelNN = MLPRegressor(max_iter=5000, alpha=0.1,
verbose=1 ,hidden_layer_sizes=(100,10))
modelNN.fit(carDF[['speed']],carDF['dist'])
[OUT]:
Iteration 1, loss = 1280.40997069
Iteration 2, loss = 1265.45608004
Iteration 3, loss = 1250.62804876
Iteration 4, loss = 1235.93327095
Iteration 5, loss = 1221.41236757
Iteration 6, loss = 1210.87315560
Iteration 7, loss = 1201.53323199
Iteration 8, loss = 1191.96220774
Iteration 9, loss = 1182.25450188
Iteration 10, loss = 1172.45068552
Iteration 11, loss = 1162.59722917
Iteration 12, loss = 1152.80588181
Iteration 13, loss = 1142.96771008
Iteration 14, loss = 1133.96987421
Iteration 15, loss = 1126.56266741
Iteration 16, loss = 1119.11260521
Iteration 17, loss = 1111.61086719
# ...
# 중략
# ...
Iteration 2233, loss = 111.04735804
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
MLPRegressor(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(100, 10), learning_rate='constant',
learning_rate_init=0.001, max_fun=15000, max_iter=5000,
momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
power_t=0.5, random_state=None, shuffle=True, solver='adam',
tol=0.0001, validation_fraction=0.1, verbose=1, warm_start=False)
modelNN.coefs_ # 엄청많음
[OUT]:
[array([[ 1.79979938e-01, 1.44437584e-01, -5.47887850e-02,
2.76259624e-01, 2.86315635e-37, 3.44123034e-01,
3.12338345e-01, 2.31227763e-01, 1.59538711e-01,
3.64747781e-01, 2.38467676e-01, 1.73625753e-01,
1.89891742e-01, 1.56042067e-01, 2.18377601e-01,
1.30112960e-01, 2.20809222e-01, -4.63745784e-02,
1.94739106e-01, 3.77540490e-01, 3.48798433e-01,
-5.32543472e-02, -4.09314338e-52, 3.15894821e-01,
-6.98304202e-03, -3.52417673e-44, 3.80250301e-01,
-2.07596164e-01, 3.04884913e-01, 3.75936565e-02,
2.33880786e-01, -4.64267017e-02, 4.60074395e-41,
-1.99020201e-01, -4.86720205e-02, 1.66951584e-01,
-5.59361633e-03, 2.12410749e-01, -1.00317279e-53,
1.99618427e-01, 1.01983710e-43, 1.78528076e-01,
1.94124811e-01, -4.34162520e-52, 1.41773761e-01,
3.30975465e-01, -8.65415961e-41, 2.95249428e-01,
1.73024578e-01, 1.80943072e-01, -1.57766713e-02,
7.08186827e-48, -5.52869426e-04, 7.58792127e-03,
-1.23768242e-01, 2.23082163e-01, 1.59347875e-01,
1.97116847e-01, 2.36303842e-50, 1.77707768e-01,
-5.52483392e-02, 7.10413313e-01, 1.86845024e-01,
4.05925819e-52, 3.40381813e-02, 1.54663316e-01,
2.20125957e-34, 4.04754592e-01, -5.30159447e-02,
-6.27212346e-51, 1.87460166e-01, 2.73717672e-42,
7.06552132e-03, 3.74487905e-01, 2.32332397e-01,
2.36970788e-01, -1.11516321e-02, 2.02251097e-01,
-8.89500927e-03, -4.24244452e-02, 8.18033655e-54,
3.56312713e-01, -1.24902300e-03, 2.84295392e-01,
6.25702598e-02, 1.88389751e-51, 1.60223879e-01,
-5.41090727e-02, 1.84418754e-01, 4.72581397e-47,
1.14218298e-01, -6.24757097e-03, -4.59968247e-03,
-9.47990546e-48, -1.62660713e-53, -5.47502843e-53,
5.78005249e-02, -4.36209180e-49, -3.89486064e-02,
7.28823571e-54]]),
array([[ 6.12318732e-48, 1.67801158e-02, -5.12982889e-02,
-7.35832146e-03, -1.84695695e-39, -1.98487365e-01,
-1.67177666e-46, -1.23212234e-54, -2.11449832e-01,
9.42129575e-02],
[ 2.73558145e-33, 3.48880674e-48, 9.63481055e-02,
2.88790507e-02, -8.96648155e-49, -3.77248658e-03,
7.01886696e-51, 9.31256532e-47, -1.63379853e-33,
3.79838582e-01],
[ 3.44136845e-36, -1.91307151e-52, -1.26232023e+00,
1.62939794e-04, 7.97889727e-53, 1.31591654e-53,
-1.33041460e-52, 2.68656706e-53, 1.37499003e-45,
-1.30302264e+00],
[-9.44665703e-53, 1.30659312e-01, 6.04110255e-02,
8.08219134e-02, 8.16840933e-38, -1.73673565e-01,
2.31591993e-53, -2.46673695e-46, -2.61882909e-02,
9.79997920e-02],
[ 2.51824973e-33, 2.33434914e-52, 2.35445132e-52,
-7.19864289e-50, -4.06140972e-47, 3.28411501e-53,
7.01506440e-53, 4.27791094e-30, 2.70420792e-45,
2.88274554e-48],
[-2.41020111e-36, 2.03179733e-01, 1.23393567e-01,
-4.84517016e-03, -3.17172503e-46, -1.59599235e-01,
-2.57968298e-45, 4.46705632e-47, 1.50772335e-01,
2.63460825e-01],
# ...
# 중략
# ...
array([[ 2.82593630e-36],
[-3.01817508e-01],
[ 5.22758369e-01],
[-4.36995601e-02],
[ 4.59977782e-16],
[-4.10872076e-02],
[-2.58608304e-22],
[-5.90717792e-13],
[-7.17931801e-01],
[ 9.00693894e-01]])]
# Actual data with the neural network's predictions as a dashed red line.
plt.scatter(carDF['speed'],carDF['dist'])
plt.plot(carDF['speed'],modelNN.predict(carDF[['speed']]),'r--')
# Author's observation: the prediction line comes out curved rather than straight.
# NOTE(review): a curve that follows the training points more closely is not by itself
# evidence of a more accurate model -- compare scores on held-out data to confirm.
plt.show()
review
- 선형회귀모델의 측정 지표 중 하나인 결정계수(R2 score)
728x90
반응형
LIST
'코딩으로 익히는 Python > 모델링' 카테고리의 다른 글
[Python] 5. 문자열encoding : LabelEncoder, OneHotEncoder, get_dummies(), make_column_transformer 예제 (0) | 2021.01.20 |
---|---|
[Python] 4. 다중선형회귀 : 릿지L2규제, 라쏘L1규제, 엘라스틱넷 (0) | 2021.01.20 |
[Python] 3. 다중선형회귀 (0) | 2021.01.20 |
[Python] 2. 정규화 : 상관관계, 다중공선성 (0) | 2021.01.20 |
[Python] 0. sklearn 구조 (0) | 2021.01.20 |