
[Python] 4. Regression: cost(MSE) test, gradient, scipy.stats examples

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as st

cost(MSE) test

 

def cost(x, y, w):
    # Mean squared error (MSE) of the hypothesis hx = w*x over all samples
    c = 0
    for i in range(len(x)):
        hx = w * x[i]
        c = c + (hx - y[i]) ** 2  # accumulate squared error
    return c / len(x)
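
For reference, the same MSE can be computed without the explicit loop; a minimal vectorized sketch using NumPy (cost_vec is an illustrative name, assuming the same hypothesis hx = w*x):

def cost_vec(x, y, w):
    # Mean squared error of the hypothesis hx = w*x, computed on whole arrays
    x, y = np.asarray(x), np.asarray(y)
    return np.mean((w * x - y) ** 2)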

 

x_data = [1,2,3]
y_data = [1,2,3]
print(cost(x_data,y_data,-1))
print(cost(x_data,y_data,0))
print(cost(x_data,y_data,1))
print(cost(x_data,y_data,2))
print(cost(x_data,y_data,3))
[OUT] :

18.666666666666668
4.666666666666667
0.0
4.666666666666667
18.666666666666668

 

plt.scatter(x_data,y_data)
plt.show()

 

np.linspace(1,10,7) # generates evenly spaced values (here 7 points from 1 to 10)
[OUT] :

array([ 1. ,  2.5,  4. ,  5.5,  7. ,  8.5, 10. ])

 

plt.xlabel('w')
plt.ylabel('cost')
for w in np.linspace(-3,5,50):
    c = cost(x_data, y_data, w)
    print(w,c)
    plt.plot(w,c,'ro')
plt.show()
[OUT] :

-3.0 74.66666666666667
-2.836734693877551 68.69582118561709
-2.673469387755102 62.97376093294461
-2.510204081632653 57.500485908649175
-2.3469387755102042 52.275996112730816
-2.183673469387755 47.3002915451895
-2.020408163265306 42.57337220602526
-1.8571428571428572 38.095238095238095
-1.6938775510204083 33.86588921282799
-1.5306122448979593 29.88532555879495
-1.3673469387755104 26.15354713313897
-1.2040816326530615 22.670553935860067
-1.0408163265306123 19.436345966958214
-0.8775510204081636 16.450923226433435
-0.7142857142857144 13.714285714285717
-0.5510204081632657 11.226433430515067
-0.38775510204081653 8.98736637512148
-0.22448979591836737 6.997084548104957
-0.06122448979591866 5.255587949465504
0.1020408163265305 3.7628765792031107
0.2653061224489792 2.518950437317787
0.4285714285714284 1.5238095238095248
0.5918367346938771 0.7774538386783302
0.7551020408163263 0.27988338192419887
0.9183673469387754 0.031098153547133207
1.0816326530612246 0.031098153547133207
1.2448979591836729 0.2798833819241968
1.408163265306122 0.7774538386783268
1.5714285714285712 1.5238095238095226
1.7346938775510203 2.5189504373177836
1.8979591836734686 3.762876579203103
2.0612244897959178 5.255587949465494
2.224489795918367 6.997084548104951
2.387755102040816 8.987366375121475
2.5510204081632653 11.226433430515064
2.7142857142857135 13.714285714285703
2.8775510204081627 16.45092322643342
3.040816326530612 19.436345966958203
3.204081632653061 22.670553935860056
3.3673469387755093 26.153547133138954
3.5306122448979584 29.88532555879492
3.6938775510204076 33.86588921282798
3.8571428571428568 38.09523809523808
4.020408163265306 42.57337220602526
4.183673469387754 47.30029154518948
4.346938775510203 52.27599611273078
4.5102040816326525 57.50048590864915
4.673469387755102 62.97376093294459
4.836734693877551 68.69582118561709
5.0 74.66666666666667
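
The same sweep can also be drawn as a continuous curve rather than one point at a time; a small sketch reusing cost() from above:

ws = np.linspace(-3, 5, 50)
cs = [cost(x_data, y_data, w) for w in ws]  # cost at each candidate w
plt.xlabel('w')
plt.ylabel('cost')
plt.plot(ws, cs, 'r-')
plt.show()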


Differentiation (slope of the tangent line)

y = 2 → slope: 0

y = x → slope: 1

y = 2x → slope: 2

y = x^2 → slope: 2x
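
As a quick sanity check, a central-difference approximation (num_deriv and h are illustrative) reproduces the tangent slopes listed above:

def num_deriv(f, x, h=1e-6):
    # Central-difference approximation of f'(x)
    return (f(x + h) - f(x - h)) / (2 * h)

print(num_deriv(lambda x: 2, 3.0))     # y = 2   -> ~0
print(num_deriv(lambda x: x, 3.0))     # y = x   -> ~1
print(num_deriv(lambda x: 2*x, 3.0))   # y = 2x  -> ~2
print(num_deriv(lambda x: x**2, 3.0))  # y = x^2 -> ~2x = 6 at x = 3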

 

Two ways to find w and b:

  1. Use the closed-form formula (see the sketch below)
  2. Train the model:
     give w an initial value, e.g. w = 5,
     then repeatedly update w = w - α(learning rate) * derivative
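
Method 1 is not demonstrated in this post, but for the intercept-free model hx = w*x the least-squares w has the closed form w = Σxᵢyᵢ / Σxᵢ²; a minimal sketch (closed_form_w is an illustrative name):

def closed_form_w(x, y):
    # Analytic least-squares solution for the model hx = w*x
    x, y = np.asarray(x), np.asarray(y)
    return np.sum(x * y) / np.sum(x * x)

print(closed_form_w(x_data, y_data))  # 1.0, the value gradient descent converges to below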
def gradient_descent(x, y, w):
    # Returns the gradient of the MSE cost with respect to w.
    # d/dw of (1/n)*sum((w*x_i - y_i)^2) is (2/n)*sum((w*x_i - y_i)*x_i);
    # the constant 2 is dropped here, which only rescales the learning rate.
    c = 0
    for i in range(len(x)):
        hx = w * x[i]
        c = c + (hx - y[i]) * x[i]  # partial derivative term for sample i
    return c / len(x)

 

def show_gradient(x, y, w):
    for i in range(200):  # number of training iterations
        c = cost(x, y, w)
        # print(i, c, w)
        g = gradient_descent(x, y, w)
        print(i, w, c, g, 0.1 * g, w - 0.1 * g)
        w = w - 0.1 * g  # 0.1 is the learning rate
    print("w : ", w)

 

show_gradient(x_data,y_data,10)
[OUT] :

0 10 378.0 42.0 4.2 5.8
1 5.8 107.51999999999998 22.399999999999995 2.2399999999999998 3.56
2 3.56 30.583466666666666 11.946666666666667 1.1946666666666668 2.365333333333333
3 2.365333333333333 8.69929718518518 6.371555555555553 0.6371555555555554 1.7281777777777778
4 1.7281777777777778 2.47446675489712 3.3981629629629633 0.33981629629629634 1.3883614814814815
5 1.3883614814814815 0.7038483213929583 1.8123535802469135 0.18123535802469137 1.2071261234567903
6 1.2071261234567903 0.2002057447517751 0.9665885761316879 0.0966588576131688 1.1104672658436214
7 1.1104672658436214 0.05694741184050483 0.5155139072702332 0.05155139072702332 1.0589158751165981
8 1.0589158751165981 0.016198374923521403 0.2749407505441246 0.02749407505441246 1.0314218000621858
9 1.0314218000621858 0.004607537756023892 0.14663506695686687 0.014663506695686689 1.0167582933664991
10 1.0167582933664991 0.0013105885172690224 0.07820536904366245 0.007820536904366245 1.008937756462133
11 1.008937756462133 0.0003727896226898598 0.04170953015662023 0.004170953015662024 1.004766803446471
12 1.004766803446471 0.0001060379371206724 0.02224508275019758 0.0022245082750197583 1.002542295171451
13 1.002542295171451 3.0161902114324568e-05 0.011864044133438703 0.0011864044133438705 1.0013558907581073
14 1.0013558907581073 8.579385490296031e-06 0.0063274902045003705 0.0006327490204500371 1.0007231417376572
15 1.0007231417376572 2.4403585394623746e-06 0.0033746614424004693 0.00033746614424004696 1.000385675593417
...
# omitted
...
59 1.0000000000000007 1.791371638939381e-30 2.886579864025407e-15 2.886579864025407e-16 1.0000000000000004
60 1.0000000000000004 9.203377227578472e-31 2.0724163126336257e-15 2.0724163126336258e-16 1.0000000000000002
61 1.0000000000000002 3.4512664603419266e-31 1.2582527612418441e-15 1.2582527612418443e-16 1.0
62 1.0 0.0 0.0 0.0 1.0
63 1.0 0.0 0.0 0.0 1.0
64 1.0 0.0 0.0 0.0 1.0
...
# omitted
...
199 1.0 0.0 0.0 0.0 1.0
w :  1.0

Regression test

 

Loading the data (pd.read_csv)

Create a data3 folder in your Python working directory, then put the attached 'ch2_scores_em.csv' file into it.

df = pd.read_csv('data3/ch2_scores_em.csv',
                 index_col='student number')
df.head()

 

plt.xlabel('english')
plt.ylabel('mathematics')
plt.scatter(df['english'],df['mathematics'])
plt.show()

 

result = st.linregress(df['english'],df['mathematics'])
result
[OUT] :

LinregressResult(slope=0.6214230159505968, intercept=42.601324328804154, rvalue=0.7237414863069244, pvalue=2.8760870522821355e-09, stderr=0.08552186247949062)

 

print("기울기 :",result.slope) # w
print("절편 :",result.intercept)
print("상관계수 :",result.rvalue)
print("p-value :",result.pvalue) # p-value
print("표준편차 :", result.stderr)
[OUT] :

slope : 0.6214230159505968
intercept : 42.601324328804154
correlation coefficient : 0.7237414863069244
p-value : 2.8760870522821355e-09
standard error : 0.08552186247949062
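
These values can be cross-checked by hand: in simple linear regression, slope = cov(x, y) / var(x) and intercept = mean(y) - slope * mean(x). A quick sketch, assuming df is loaded as above:

x, y = df['english'], df['mathematics']
slope = np.cov(x, y, ddof=1)[0, 1] / np.var(x, ddof=1)  # sample covariance / sample variance
intercept = y.mean() - slope * x.mean()
print(slope, intercept)  # should match result.slope and result.intercept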

 

# Null hypothesis: there is no relationship between English and math scores (slope = 0)
# Alternative hypothesis: the slope is not 0 (English and math scores are related)
# i.e. are the regression coefficients (w, b) meaningful or not?
# If the p-value is less than 0.05, reject the null hypothesis
# If it is greater than 0.05, fail to reject the null hypothesis
# Here p ≈ 2.88e-09 < 0.05, so the null hypothesis is rejected
# Conclusion: the regression coefficients can be used for prediction
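
In code, the decision rule in the comments above is just a comparison against the significance level (conventionally 0.05):

alpha = 0.05  # significance level
if result.pvalue < alpha:
    print("reject the null hypothesis: the slope is significantly different from 0")
else:
    print("fail to reject the null hypothesis")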

 

Example

 

Predict the mathematics score when the English score is 70.


Solution

 

result.slope *70 + result.intercept
[OUT] :

86.10093544534593
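
For reuse, the prediction can be wrapped in a small helper (predict_math is an illustrative name):

def predict_math(english_score):
    # Predicted mathematics score from the fitted regression line
    return result.slope * english_score + result.intercept

print(predict_math(70))  # 86.10093544534593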

 

plt.xlabel('english')
plt.ylabel('mathematics')
plt.scatter(df['english'],df['mathematics'])

lm = result.slope * df['english'] + result.intercept
plt.plot(df['english'],lm,'r--')
plt.show()
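
Since this is simple linear regression, squaring rvalue gives the coefficient of determination R², i.e. the share of the variance in mathematics explained by english; a quick check:

print(result.rvalue ** 2)  # ~0.524: about 52% of the variance is explained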


Review
- How to find w in regression (gradient descent)