[Python] 1. 선형회귀분석

728x90

SMALL

import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.datasets import load_boston, load_iris
from sklearn.linear_model import LinearRegression,Ridge, SGDRegressor # SGDRegressor은 학습 나머지는 공식
from sklearn.neural_network import MLPRegressor # MLPRegressor은 딥러닝 학습
from sklearn.metrics import r2_score # 선형 모델(Linear Models)

from sklearn.model_selection import train_test_split

import mglearn # !pip install mglearn
import matplotlib.pyplot as plt
import matplotlib
# Select a font for plot text; later cells use Hangul axis labels.
# NOTE(review): 'Malgun Gothic' is presumably only installed on Windows -- confirm on other platforms.
matplotlib.rcParams['font.family']='Malgun Gothic'
# NOTE(review): presumably set so negative tick labels render with the chosen font -- confirm.
matplotlib.rcParams['axes.unicode_minus'] = False

# Toy data for y = x as plain 1-D Python lists.
x_d = [1,2,3,4,5,6,7,8,9,10]
y_d = [1,2,3,4,5,6,7,8,9,10]
# The same ten values as a 10x1 column, first written out by hand ...
x_data1 = np.array([[1],[2],[3],[4],[5],[6],[7],[8],[9],[10]])
x_data =np.array([1,2,3,4,5,6,7,8,9,10]).reshape(-1,1) # same as x_data1 -> reshaped to 10x1
y_data = np.array([1,2,3,4,5,6,7,8,9,10]).reshape(-1,1)

행(데이터의 개수), 열(특성데이터의 개수)

x_data1.shape # (rows, columns) of the hand-written column array

[OUT]:

(10, 1)

x_data.shape # (rows, columns) of the reshaped array

[OUT]:

(10, 1)

model_lr = LinearRegression() # solved with a formula (no iterative training)
# model_lr.fit(x_d,y_d) # error -> the feature data must be 2-D (a matrix or a DataFrame)
model_lr.fit(x_data,y_data)

[OUT]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

model_lr.coef_ # w: the slope (weight)

[OUT]:

array([[1.]])

model_lr.intercept_ # b: the intercept (bias)

[OUT]:

array([2.66453526e-15])

데이터 불러오기 (pd.read_csv)

python 파일 경로에 data4 폴더 만든 후 다음의 cars.csv파일 넣어놓기

cars.csv

0.00MB

# Load the cars data from the data4 folder (path is relative to the working directory) and display it.
carDF = pd.read_csv('data4/cars.csv')
carDF

carDF['speed'] # a single feature (car speed) -> a Series, i.e. a 1-D vector

[OUT]:

0      4
1      4
2      7
3      7
4      8
5      9
6     10
7     10
8     10
9     11
10    11
11    12
12    12
13    12
14    12
15    13
16    13
17    13
18    13
19    14
20    14
21    14
22    14
23    15
24    15
25    15
26    16
27    16
28    17
29    17
30    17
31    18
32    18
33    18
34    18
35    19
36    19
37    19
38    20
39    20
40    20
41    20
42    20
43    22
44    23
45    24
46    24
47    24
48    24
49    25
Name: speed, dtype: int64

carDF[['speed']] # a list of column names selects columns and yields a 2-D DataFrame (matrix)

carDF[['speed']].shape

[OUT]:

(50, 1)

carDF['dist'] # the target does not need to be converted to 2-D

[OUT]:

0       2
1      10
2       4
3      22
4      16
5      10
6      18
7      26
8      34
9      17
10     28
11     14
12     20
13     24
14     28
15     26
16     34
17     34
18     46
19     26
20     36
21     60
22     80
23     20
24     26
25     54
26     32
27     40
28     32
29     40
30     50
31     42
32     56
33     76
34     84
35     36
36     46
37     68
38     32
39     48
40     52
41     56
42     64
43     66
44     54
45     70
46     92
47     93
48    120
49     85
Name: dist, dtype: int64

modelCar = LinearRegression() # naming: pick either camelCase or snake_case and use it consistently
# Features are passed as a 2-D DataFrame, the target as a 1-D Series.
modelCar.fit(carDF[['speed']],carDF['dist'])

[OUT]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

modelCar.coef_ # weight (slope)

[OUT]:

array([3.93240876])

modelCar.intercept_ # bias (intercept)

[OUT]:

-17.57909489051095

# Scatter of the observed data with the fitted regression line on top.
plt.xlabel('speed')
plt.ylabel('dist')
plt.scatter(carDF['speed'],carDF['dist'])
plt.plot(carDF['speed'],modelCar.predict(carDF[['speed']]),'r--') # prediction line (red, dashed)
plt.show()

연습문제

1. 자동차 속도가 15일 때 제동거리는 얼마로 예측되는지 구하시오

2. 자동차 속도가 13, 15일 때 제동거리는 얼마로 예측되는지 구하시오

Solution

1. 자동차 속도가 15일 때 제동거리는 얼마로 예측되는지 구하시오

# Method 1: evaluate w * x + b by hand
modelCar.coef_[0]  * 15 + modelCar.intercept_

[OUT]:

41.40703649635036

# Method 2: let the model predict
modelCar.predict([[15]]) # 2-D input (one row, one feature): evaluated as a matrix product

[OUT]:

array([41.4070365])

2. 자동차 속도가 13, 15일 때 제동거리는 얼마로 예측되는지 구하시오

# One row per query: speed 13 and speed 15 -> two predictions.
modelCar.predict([[13],[15]])

[OUT]:

array([33.54221898, 41.4070365 ])

연습문제

다음의 전기 데이터를 불러와서 w, b를 구하고, 전기생산량이 3.2, 4.5인 경우의 전기사용량을 예측한 뒤 scatter(실제데이터), plot(예측데이터)를 그리시오

electric.csv

0.00MB

Solution

# First look at the raw file, read without specifying an index column.
elecDF = pd.read_csv('data4/electric.csv')
elecDF

elecDF = pd.read_csv('data4/electric.csv', index_col='Unnamed: 0') # use the 'Unnamed: 0' column as the index
elecDF

# Fit electricity usage (target) on electricity production (single feature).
modelElec = LinearRegression() 
modelElec.fit(elecDF[['전기생산량']],elecDF['전기사용량'])

[OUT]:

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

modelElec.coef_ # w (slope)

[OUT]:

array([0.49560324])

modelElec.intercept_ # b (intercept)

[OUT]:

0.919581428068942

modelElec.predict([[3.2],[4.5]]) # predicted electricity usage for production values 3.2 and 4.5

[OUT]:

array([2.50551178, 3.14979599])

# Observed points plus the fitted line for the electricity data.
plt.xlabel('전기생산량')
plt.ylabel('전기사용량')
plt.scatter(elecDF['전기생산량'],elecDF['전기사용량']) # actual data
plt.plot(elecDF['전기생산량'],modelElec.predict(elecDF[['전기생산량']]),'r--') # predicted data
plt.show()

결정 계수 $R^{2}$

회귀모델의 검증을 위한 또 다른 측정 지표 중 하나로 결정 계수(coefficient of determination, $R^{2}$ ) 사용
R 제곱 (R^2, R-squared; 절편이 있는 단순 선형회귀에서는 피어슨 상관 계수의 제곱과 같음)
1 - (오차의 제곱합)/(편차의 제곱합)
오차 : 실제 값과 예측 값의 차이
편차 : 실제 값과 평균 값의 차이
절편을 포함한 최소제곱 회귀를 학습 데이터로 평가하면 0 <= 결정 계수 <= 1 (0이면 0점, 1이면 100점); 그 밖의 경우(예: 새로운 데이터로 평가)에는 평균으로 예측하는 것보다 나쁘면 음수가 될 수도 있음
높을 수록 좋음
# R^2 for the first example: compare the true targets with the predictions of
# the model fitted on x_data / y_data. The original cell called `model`, a
# name that is never defined in this file; `model_lr` is the model trained on
# exactly these arrays.
# Argument order is r2_score(true values, predicted values).
from sklearn.metrics import r2_score
y_predict = model_lr.predict( x_data )
r2_score( y_data, y_predict )

# Plot the observed data, the fitted line, and the mean of dist as a
# horizontal reference line.
plt.figure( figsize=(10,6))
plt.xlabel('speed')
plt.ylabel('dist')
predictions = modelCar.predict(  carDF[['speed']] )
plt.scatter(carDF.speed, carDF.dist) # actual values; carDF.speed == carDF['speed']
plt.plot( carDF.speed, predictions, 'r--') # predicted values
plt.axhline( carDF.dist.mean(),color='blue') # mean of dist
plt.show()

carDF.dist.mean() # the value drawn as the blue horizontal line in the plot

[OUT]:

42.98

predictions # fitted dist for every row of carDF, in row order

[OUT]:

array([-1.84945985, -1.84945985,  9.94776642,  9.94776642, 13.88017518,
       17.81258394, 21.7449927 , 21.7449927 , 21.7449927 , 25.67740146,
       25.67740146, 29.60981022, 29.60981022, 29.60981022, 29.60981022,
       33.54221898, 33.54221898, 33.54221898, 33.54221898, 37.47462774,
       37.47462774, 37.47462774, 37.47462774, 41.4070365 , 41.4070365 ,
       41.4070365 , 45.33944526, 45.33944526, 49.27185401, 49.27185401,
       49.27185401, 53.20426277, 53.20426277, 53.20426277, 53.20426277,
       57.13667153, 57.13667153, 57.13667153, 61.06908029, 61.06908029,
       61.06908029, 61.06908029, 61.06908029, 68.93389781, 72.86630657,
       76.79871533, 76.79871533, 76.79871533, 76.79871533, 80.73112409])

score (결정계수 통한 설명력 확인)

modelCar.score(carDF[['speed']],carDF.dist) # about 65% explanatory power; score(feature matrix X, true values y) -- the model predicts from X internally

[OUT]:

0.6510793807582509

y_predict = modelCar.predict(carDF[['speed']])
r2_score(carDF.dist,y_predict) # about 65% explanatory power; r2_score(true values, predicted values)

[OUT]:

0.6510793807582509

학습을 통한 선형회귀

# alpha=0.0001: strength of the penalty term (penalty='l2' in the printed
#   repr), not the learning rate -- the repr lists the step size separately
#   as eta0=0.01 with learning_rate='invscaling'.
# max_iter=100: upper bound on the number of training epochs.
# early_stopping=True: part of the data is held out for validation
#   (validation_fraction=0.1 in the repr; the log shows 45 of the 50 rows per
#   epoch) and training stops when the validation score stops improving.
# verbose=1: print a progress line for every epoch.
# NOTE(review): the logged run stops after 6 epochs with coef ~2.89 and
#   intercept ~-1.52, far from LinearRegression's 3.93 / -17.58 -- presumably
#   the unscaled feature and the tiny validation split end training early;
#   confirm before relying on this fit.
modelSGD = SGDRegressor(max_iter=100, alpha=0.0001,
                        early_stopping=True, verbose=1)
modelSGD.fit(carDF[['speed']],carDF['dist'])

[OUT]:

-- Epoch 1
Norm: 1.47, NNZs: 1, Bias: -0.207088, T: 45, Avg. loss: 958.504253
Total training time: 0.00 seconds.
-- Epoch 2
Norm: 2.36, NNZs: 1, Bias: -0.536527, T: 90, Avg. loss: 343.295596
Total training time: 0.00 seconds.
-- Epoch 3
Norm: 2.56, NNZs: 1, Bias: -0.951716, T: 135, Avg. loss: 276.255796
Total training time: 0.00 seconds.
-- Epoch 4
Norm: 3.52, NNZs: 1, Bias: -1.079889, T: 180, Avg. loss: 194.543746
Total training time: 0.00 seconds.
-- Epoch 5
Norm: 2.79, NNZs: 1, Bias: -1.355384, T: 225, Avg. loss: 250.073654
Total training time: 0.00 seconds.
-- Epoch 6
Norm: 2.89, NNZs: 1, Bias: -1.517053, T: 270, Avg. loss: 173.248761
Total training time: 0.00 seconds.
Convergence after 6 epochs took 0.00 seconds
SGDRegressor(alpha=0.0001, average=False, early_stopping=True, epsilon=0.1,
             eta0=0.01, fit_intercept=True, l1_ratio=0.15,
             learning_rate='invscaling', loss='squared_loss', max_iter=100,
             n_iter_no_change=5, penalty='l2', power_t=0.25, random_state=None,
             shuffle=True, tol=0.001, validation_fraction=0.1, verbose=1,
             warm_start=False)

modelSGD.coef_ # learned weight (slope) after SGD training

[OUT]:

array([2.89058854])

modelSGD.intercept_ # learned bias (intercept) after SGD training

[OUT]:

array([-1.51705295])

딥러닝을 이용한 선형회귀

# hidden_layer_sizes=(100, 10): one entry per hidden layer giving its number
# of units, i.e. two hidden layers with 100 and 10 units (the coefs_ printed
# below have shapes 1x100, 100x10 and 10x1).
# max_iter=5000 caps the number of training iterations; verbose=1 prints the
# loss at every iteration.
# NOTE(review): alpha=0.1 is presumably the L2 regularization strength -- confirm in the MLPRegressor docs.
modelNN = MLPRegressor(max_iter=5000, alpha=0.1,
                       verbose=1 ,hidden_layer_sizes=(100,10))
modelNN.fit(carDF[['speed']],carDF['dist'])

[OUT]:

Iteration 1, loss = 1280.40997069
Iteration 2, loss = 1265.45608004
Iteration 3, loss = 1250.62804876
Iteration 4, loss = 1235.93327095
Iteration 5, loss = 1221.41236757
Iteration 6, loss = 1210.87315560
Iteration 7, loss = 1201.53323199
Iteration 8, loss = 1191.96220774
Iteration 9, loss = 1182.25450188
Iteration 10, loss = 1172.45068552
Iteration 11, loss = 1162.59722917
Iteration 12, loss = 1152.80588181
Iteration 13, loss = 1142.96771008
Iteration 14, loss = 1133.96987421
Iteration 15, loss = 1126.56266741
Iteration 16, loss = 1119.11260521
Iteration 17, loss = 1111.61086719
# ...
# 중략
# ...
Iteration 2233, loss = 111.04735804
Training loss did not improve more than tol=0.000100 for 10 consecutive epochs. Stopping.
MLPRegressor(activation='relu', alpha=0.1, batch_size='auto', beta_1=0.9,
             beta_2=0.999, early_stopping=False, epsilon=1e-08,
             hidden_layer_sizes=(100, 10), learning_rate='constant',
             learning_rate_init=0.001, max_fun=15000, max_iter=5000,
             momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
             power_t=0.5, random_state=None, shuffle=True, solver='adam',
             tol=0.0001, validation_fraction=0.1, verbose=1, warm_start=False)

modelNN.coefs_ # a list with one weight matrix per layer -> a very large number of values

[OUT]:

[array([[ 1.79979938e-01,  1.44437584e-01, -5.47887850e-02,
          2.76259624e-01,  2.86315635e-37,  3.44123034e-01,
          3.12338345e-01,  2.31227763e-01,  1.59538711e-01,
          3.64747781e-01,  2.38467676e-01,  1.73625753e-01,
          1.89891742e-01,  1.56042067e-01,  2.18377601e-01,
          1.30112960e-01,  2.20809222e-01, -4.63745784e-02,
          1.94739106e-01,  3.77540490e-01,  3.48798433e-01,
         -5.32543472e-02, -4.09314338e-52,  3.15894821e-01,
         -6.98304202e-03, -3.52417673e-44,  3.80250301e-01,
         -2.07596164e-01,  3.04884913e-01,  3.75936565e-02,
          2.33880786e-01, -4.64267017e-02,  4.60074395e-41,
         -1.99020201e-01, -4.86720205e-02,  1.66951584e-01,
         -5.59361633e-03,  2.12410749e-01, -1.00317279e-53,
          1.99618427e-01,  1.01983710e-43,  1.78528076e-01,
          1.94124811e-01, -4.34162520e-52,  1.41773761e-01,
          3.30975465e-01, -8.65415961e-41,  2.95249428e-01,
          1.73024578e-01,  1.80943072e-01, -1.57766713e-02,
          7.08186827e-48, -5.52869426e-04,  7.58792127e-03,
         -1.23768242e-01,  2.23082163e-01,  1.59347875e-01,
          1.97116847e-01,  2.36303842e-50,  1.77707768e-01,
         -5.52483392e-02,  7.10413313e-01,  1.86845024e-01,
          4.05925819e-52,  3.40381813e-02,  1.54663316e-01,
          2.20125957e-34,  4.04754592e-01, -5.30159447e-02,
         -6.27212346e-51,  1.87460166e-01,  2.73717672e-42,
          7.06552132e-03,  3.74487905e-01,  2.32332397e-01,
          2.36970788e-01, -1.11516321e-02,  2.02251097e-01,
         -8.89500927e-03, -4.24244452e-02,  8.18033655e-54,
          3.56312713e-01, -1.24902300e-03,  2.84295392e-01,
          6.25702598e-02,  1.88389751e-51,  1.60223879e-01,
         -5.41090727e-02,  1.84418754e-01,  4.72581397e-47,
          1.14218298e-01, -6.24757097e-03, -4.59968247e-03,
         -9.47990546e-48, -1.62660713e-53, -5.47502843e-53,
          5.78005249e-02, -4.36209180e-49, -3.89486064e-02,
          7.28823571e-54]]),
 array([[ 6.12318732e-48,  1.67801158e-02, -5.12982889e-02,
         -7.35832146e-03, -1.84695695e-39, -1.98487365e-01,
         -1.67177666e-46, -1.23212234e-54, -2.11449832e-01,
          9.42129575e-02],
        [ 2.73558145e-33,  3.48880674e-48,  9.63481055e-02,
          2.88790507e-02, -8.96648155e-49, -3.77248658e-03,
          7.01886696e-51,  9.31256532e-47, -1.63379853e-33,
          3.79838582e-01],
        [ 3.44136845e-36, -1.91307151e-52, -1.26232023e+00,
          1.62939794e-04,  7.97889727e-53,  1.31591654e-53,
         -1.33041460e-52,  2.68656706e-53,  1.37499003e-45,
         -1.30302264e+00],
        [-9.44665703e-53,  1.30659312e-01,  6.04110255e-02,
          8.08219134e-02,  8.16840933e-38, -1.73673565e-01,
          2.31591993e-53, -2.46673695e-46, -2.61882909e-02,
          9.79997920e-02],
        [ 2.51824973e-33,  2.33434914e-52,  2.35445132e-52,
         -7.19864289e-50, -4.06140972e-47,  3.28411501e-53,
          7.01506440e-53,  4.27791094e-30,  2.70420792e-45,
          2.88274554e-48],
        [-2.41020111e-36,  2.03179733e-01,  1.23393567e-01,
         -4.84517016e-03, -3.17172503e-46, -1.59599235e-01,
         -2.57968298e-45,  4.46705632e-47,  1.50772335e-01,
          2.63460825e-01],
 # ...
 # 중략
 # ...
 array([[ 2.82593630e-36],
        [-3.01817508e-01],
        [ 5.22758369e-01],
        [-4.36995601e-02],
        [ 4.59977782e-16],
        [-4.10872076e-02],
        [-2.58608304e-22],
        [-5.90717792e-13],
        [-7.17931801e-01],
        [ 9.00693894e-01]])]

# Observed data with the MLP predictions drawn as a red dashed line.
plt.scatter(carDF['speed'],carDF['dist'])
plt.plot(carDF['speed'],modelNN.predict(carDF[['speed']]),'r--')
plt.show() # the prediction line comes out bent, i.e. it follows the training points more closely
# NOTE(review): a closer fit to the training points is not by itself evidence
# of better accuracy on new data; no held-out evaluation is done here.

review
- 선형회귀모델의 측정 지표 중 하나인 결정계수(R2 score)

728x90

LIST

'코딩으로 익히는 Python > 모델링' 카테고리의 다른 글

[Python] 5. 문자열encoding : LabelEncoder, OneHotEncoder, get_dummies(), make_column_transformer 예제 (0)	2021.01.20
[Python] 4. 다중선형회귀 : 릿지L2규제, 라쏘L1규제, 엘라스틱넷 (0)	2021.01.20
[Python] 3. 다중선형회귀 (0)	2021.01.20
[Python] 2. 정규화 : 상관관계, 다중공선성 (0)	2021.01.20
[Python] 0. sklearn 구조 (0)	2021.01.20

Seize the Data

[Python] 1. 선형회귀분석

'코딩으로 익히는 Python > 모델링' 카테고리의 다른 글

티스토리툴바

[Python] 1. 선형회귀분석

'코딩으로 익히는 Python > 모델링' 카테고리의 다른 글

'코딩으로 익히는 Python/모델링' Related Articles

티스토리툴바