[Python] 2. 대표값, 산포도, 정규화, 도수

728x90

SMALL

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import mode

데이터 불러오기 (pd.read_csv)

python 파일 경로에 data3 폴더 만든 후 다음의 ch2_scores_em.csv파일 넣어놓기

ch2_scores_em.csv

0.00MB

대표값

평균값
중앙값
최빈값

# Load the score sheet, using the 'student number' column as the row index.
csv_path = 'data3/ch2_scores_em.csv'
df = pd.read_csv(csv_path, index_col='student number')
df.head()

scores = df['english'].values
scores

[OUT] :

array([42, 69, 56, 41, 57, 48, 65, 49, 65, 58, 70, 47, 51, 64, 62, 70, 71,
       68, 73, 37, 65, 65, 61, 52, 57, 57, 75, 61, 47, 54, 66, 54, 54, 42,
       37, 79, 56, 62, 62, 55, 63, 57, 57, 67, 55, 45, 66, 55, 64, 66],
      dtype=int64)

평균값

scores.sum()/len(scores)

[OUT] :

58.38

scores.mean()

[OUT] :

58.38

중앙값

np.median(scores)

[OUT] :

57.5

sorted_scores = np.sort(scores)
sorted_scores

[OUT] :

array([37, 37, 41, 42, 42, 45, 47, 47, 48, 49, 51, 52, 54, 54, 54, 55, 55,
       55, 56, 56, 57, 57, 57, 57, 57, 58, 61, 61, 62, 62, 62, 63, 64, 64,
       65, 65, 65, 65, 66, 66, 66, 67, 68, 69, 70, 70, 71, 73, 75, 79],
      dtype=int64)

n = len(sorted_scores)
n

[OUT] :

50

# Manual median of an already sorted array: pick the middle element(s).
n = len(sorted_scores)
half = n // 2
if n % 2:
    # Odd count: the single middle element is the median.
    median = sorted_scores[half]
else:
    # Even count: average the two elements on either side of the middle.
    lower, upper = sorted_scores[half - 1], sorted_scores[half]
    median = (lower + upper) / 2
median

[OUT] :

57.5

최빈값

# Small sample in which the value 1 appears most often (three times).
arr = [1, 1, 1, 2, 2, 3]
m = mode(arr)
print(m)
# The result unpacks into the mode and the number of times it occurs.
most_common, occurrences = m
print(most_common, occurrences)

[OUT] :

ModeResult(mode=array([1]), count=array([3]))
[1] [3]

np.unique(arr) # 중복된값 제거하고 들고옴

[OUT] :

array([1, 2, 3])

np.unique(arr,return_counts=True) # 개수도 함께 들고옴

[OUT] :

(array([1, 2, 3]), array([3, 2, 1], dtype=int64))

# Pair each distinct value with its number of occurrences and print them.
dt, c = np.unique(arr, return_counts=True)
for value, count in zip(dt, c):
    print(value, count)  # one "value count" pair per line

[OUT] :

1 3
2 2
3 1

산포도

분산
표준편차

dt = np.array([1,2,3,4,5])
dt

array([1, 2, 3, 4, 5])

dt.mean()

3.0

d = dt - dt.mean() # 편차 : 개별데이터 - 평균값
d

array([-2., -1.,  0.,  1.,  2.])

aa = (dt-dt.mean())**2 #편차^2
aa

array([4., 1., 0., 1., 4.])

# 편차^2의 평균
aa.mean() # aa.sum()/len(aa)

2.0

dt.var() # 분산 : 편차제곱의 합을 개수로 나눈 것

2.0

-> aa.mean() == dt.var()

-> 편차제곱의 평균 == 분산

np.sqrt(aa.mean())

1.4142135623730951

np.sqrt(dt.var())

1.4142135623730951

dt.std() # 표준편차(단위를 맞추기 위한 목적)

1.4142135623730951

deviation = scores - scores.mean() # 편차 array
deviation

array([-16.38,  10.62,  -2.38, -17.38,  -1.38, -10.38,   6.62,  -9.38,
         6.62,  -0.38,  11.62, -11.38,  -7.38,   5.62,   3.62,  11.62,
        12.62,   9.62,  14.62, -21.38,   6.62,   6.62,   2.62,  -6.38,
        -1.38,  -1.38,  16.62,   2.62, -11.38,  -4.38,   7.62,  -4.38,
        -4.38, -16.38, -21.38,  20.62,  -2.38,   3.62,   3.62,  -3.38,
         4.62,  -1.38,  -1.38,   8.62,  -3.38, -13.38,   7.62,  -3.38,
         5.62,   7.62])

# Per-student table: raw score, deviation from the mean, and squared deviation.
# The columns are named at construction so that summary_df itself carries the
# labels and each column keeps its own dtype.
summary_df = pd.DataFrame({
    'scores': scores,
    'deviation': deviation,
    'square of deviation': np.square(deviation),  # same as deviation**2
})
summary_df

summary_df['square of deviation'].sum()/len(summary_df['square of deviation'])

94.1156

scores.var()

94.1156

-> 편차제곱의 합 / 개수 == 분산

범위

scores.max() - scores.min()

# Interquartile range: distance between the 25th and 75th percentiles.
Q1, Q3 = np.percentile(scores, [25, 75])
Q3 - Q1

11.0

데이터의 정규화

필요 이유 : 평균이 30점인 시험에서의 60점과 평균이 90점인 시험에서의 60점은 평가가 달라야 하므로 표준화가 필요
방법 : 각 데이터에서 평균을 뺀 후 표준편차로 나눠줌
결과 : 평균 0, 표준편차 1

dt = np.array([1,2,3,4,5])
dt

array([1, 2, 3, 4, 5])

dt - dt.mean()

array([-2., -1.,  0.,  1.,  2.])

dt.std()

1.4142135623730951

result = (dt-dt.mean())/dt.std() # 정규화 시키기
result

array([-1.41421356, -0.70710678,  0.        ,  0.70710678,  1.41421356])

result.mean()

0.0

result.std()

0.9999999999999999

Example) 평균이 50, 표준편차가 10이 되도록 정규화하시오

z = 50 + 10 * (scores - scores.mean())/scores.std()
z

array([33.11569885, 60.94696448, 47.54672547, 32.08491123, 48.57751309,
       39.30042454, 56.82381402, 40.33121216, 56.82381402, 49.60830071,
       61.9777521 , 38.26963693, 42.39278739, 55.7930264 , 53.73145117,
       61.9777521 , 63.00853972, 59.91617687, 65.07011495, 27.96176076,
       56.82381402, 56.82381402, 52.70066355, 43.42357501, 48.57751309,
       48.57751309, 67.13169018, 52.70066355, 38.26963693, 45.48515024,
       57.85460164, 45.48515024, 45.48515024, 33.11569885, 27.96176076,
       71.25484065, 47.54672547, 53.73145117, 53.73145117, 46.51593786,
       54.76223879, 48.57751309, 48.57751309, 58.88538925, 46.51593786,
       36.20806169, 57.85460164, 46.51593786, 55.7930264 , 57.85460164])

z.mean()

50.0

평균 50을 기준으로 성적에 대해 한눈에 알 수 있음

# Side-by-side table of each raw score and its rescaled value z.
# The columns are named at construction so that score_df itself carries the
# labels and the raw scores keep their integer dtype.
score_df = pd.DataFrame({'scores': scores, 'z': z})
score_df

도수

np.histogram 의 범위

0<= r <10
10<= r <20
20<= r <30
...
90<= r<=100

s =[ 9, 10, 18,  20, 30, 45,47, 90, 100]
np.histogram(s,bins=10)

(array([3, 1, 1, 1, 1, 0, 0, 0, 1, 1], dtype=int64),
 array([  9. ,  18.1,  27.2,  36.3,  45.4,  54.5,  63.6,  72.7,  81.8,
         90.9, 100. ]))

s =[ 9, 10, 18,  20, 30, 45,47, 90, 100]
np.histogram(s,bins=10, range=(0,100))

(array([1, 2, 1, 1, 2, 0, 0, 0, 0, 2], dtype=int64),
 array([  0.,  10.,  20.,  30.,  40.,  50.,  60.,  70.,  80.,  90., 100.]))

pd.cut 의 범위

0< r <=10
10< r <=20
20< r <=30
...
90< r<=100

sr = pd.Series(s)
sr

0      9
1     10
2     18
3     20
4     30
5     45
6     47
7     90
8    100
dtype: int64

pd.cut(sr,bins=range(0,101,10))

0      (0, 10]
1      (0, 10]
2     (10, 20]
3     (10, 20]
4     (20, 30]
5     (40, 50]
6     (40, 50]
7     (80, 90]
8    (90, 100]
dtype: category
Categories (10, interval[int64]): [(0, 10] < (10, 20] < (20, 30] < (30, 40] ... (60, 70] < (70, 80] < (80, 90] < (90, 100]]

plt.hist(sr,bins=10,range=(0,100))

# Draw the histogram of sr with ten bins over [0, 100] and keep the first two
# values it returns; the third return value is discarded.
freq,r,_ = plt.hist(sr,bins=10,range=(0,100))
plt.show()
print(freq)  # per-bin counts returned by plt.hist
print(r)  # bin edges returned by plt.hist

scores

array([42, 69, 56, 41, 57, 48, 65, 49, 65, 58, 70, 47, 51, 64, 62, 70, 71,
       68, 73, 37, 65, 65, 61, 52, 57, 57, 75, 61, 47, 54, 66, 54, 54, 42,
       37, 79, 56, 62, 62, 55, 63, 57, 57, 67, 55, 45, 66, 55, 64, 66],
      dtype=int64)

freq,r = np.histogram(scores, bins=10, range=(0,100))
print(freq)
print(r)

[ 0  0  0  2  8 16 18  6  0  0]
[  0.  10.  20.  30.  40.  50.  60.  70.  80.  90. 100.]

# Frequency table indexed by class labels "0~10", "10~20", ..., "90~100".
freq_class = [f'{lower}~{lower + 10}' for lower in range(0, 100, 10)]
class_index = pd.Index(freq_class, name='class')
freq_dist_df = pd.DataFrame({'frequency': freq}, index=class_index)
freq_dist_df

# The class value is the midpoint of each class interval (5, 15, ..., 95).
class_value = list(range(5, 100, 10))
class_value

[5, 15, 25, 35, 45, 55, 65, 75, 85, 95]

# 상대도수
rel_freq = freq / freq.sum()
rel_freq

array([0.  , 0.  , 0.  , 0.04, 0.16, 0.32, 0.36, 0.12, 0.  , 0.  ])

# 누적 상대도수 함수 cumsum()
aa = np.array([1,2,3,4,5])
np.cumsum(aa)

array([ 1,  3,  6, 10, 15], dtype=int32)

# 누적 상대도수
cum_rel_freq = np.cumsum(rel_freq)
cum_rel_freq

array([0.  , 0.  , 0.  , 0.04, 0.2 , 0.52, 0.88, 1.  , 1.  , 1.  ])

도수분포표

# Attach the derived columns, then put all columns in presentation order.
column_order = ['class value', 'frequency',
                'relative frequency', 'cumulative relative frequency']
freq_dist_df = freq_dist_df.assign(**{
    'class value': class_value,
    'relative frequency': rel_freq,
    'cumulative relative frequency': cum_rel_freq,
})
freq_dist_df = freq_dist_df[column_order]

freq_dist_df

Example) 수학 점수의 도수 분포표를 그리시오

# Histogram of the mathematics scores over ten equal-width classes on [0, 100].
math = df['mathematics'].tolist()
freqM, rM = np.histogram(math, bins=10, range=(0, 100))
# Print the mathematics results (freqM / rM) computed above, not the
# freq / r variables that hold a different histogram.
print(freqM)
print(rM)

[ 0  0  0  0  0  1  6 16 23  4]
[  0.  10.  20.  30.  40.  50.  60.  70.  80.  90. 100.]

# The class value is the midpoint of each class interval (5, 15, ..., 95).
class_value = list(range(5, 100, 10))
class_value

[5, 15, 25, 35, 45, 55, 65, 75, 85, 95]

# 상대도수
rel_freqM = freqM / freqM.sum()
rel_freqM

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.02, 0.12, 0.32, 0.46, 0.08])

# 누적 상대도수
cum_rel_freqM = np.cumsum(rel_freqM)
cum_rel_freqM

array([0.  , 0.  , 0.  , 0.  , 0.  , 0.02, 0.14, 0.46, 0.92, 1.  ])

# Mathematics frequency table indexed by class labels "0~10", ..., "90~100".
freq_class = [f'{lower}~{lower + 10}' for lower in range(0, 100, 10)]
class_index = pd.Index(freq_class, name='class')
freq_dist_df = pd.DataFrame({'frequency': freqM}, index=class_index)
freq_dist_df

# Attach the derived mathematics columns, then order all columns for display.
column_order = ['class value', 'frequency',
                'relative frequency', 'cumulative relative frequency']
freq_dist_df = freq_dist_df.assign(**{
    'class value': class_value,
    'relative frequency': rel_freqM,
    'cumulative relative frequency': cum_rel_freqM,
})
freq_dist_df = freq_dist_df[column_order]

freq_dist_df

# Overlay the histogram of math (drawn on ax1) with the cumulative relative
# frequency curve from freq_dist_df (drawn on the twin axis ax2).
fig = plt.figure(figsize=(10,6))
ax1 = fig.add_subplot()
ax2 = ax1.twinx()  # second y-axis that shares ax1's x-axis
# NOTE: this assignment rebinds the names freq and r to the values returned
# by this hist call, replacing whatever they held before.
freq, r, _ = ax1.hist(math,bins=10,range=(0,100))
# 'ro--' draws red circle markers joined by a dashed line.
ax2.plot(class_value,freq_dist_df['cumulative relative frequency'],'ro--')
plt.show()

728x90

LIST

'코딩으로 익히는 Python > 통계' 카테고리의 다른 글

[Python] 5. 모집단과 표본 : 모평균 추정, 중심극한정리 (0)	2021.01.17
[Python] 4. 회귀 : cost(mse) test, gradient, scipy.stats 예제 (0)	2021.01.17
[Python] 3. 공분산, 상관계수 (0)	2021.01.17
[Python] 1. 데이터 종류 : 이산 변수, 연속 변수 (0)	2021.01.17
[Python] 0. SciPy Library(사이파이 라이브러리) 구조 : 확률분포 클래스, 모수 지정, 확률분포 매서드 (0)	2021.01.17

Seize the Data

[Python] 2. 대표값, 산포도, 정규화, 도수

'코딩으로 익히는 Python > 통계' 카테고리의 다른 글

티스토리툴바

[Python] 2. 대표값, 산포도, 정규화, 도수

'코딩으로 익히는 Python > 통계' 카테고리의 다른 글

'코딩으로 익히는 Python/통계' Related Articles

티스토리툴바