본문 바로가기

코딩으로 익히는 Python/모델링

[Python] 20. 나이브베이즈

728x90
반응형
SMALL
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
import multiprocessing

from sklearn.svm import SVC, SVR
from sklearn.manifold import TSNE
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.datasets import load_boston, load_breast_cancer,load_iris,load_wine
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

 

나이브 베이즈 분류기(Naive Bayes Classifier)

 

  • 베이즈 정리를 적용한 확률적 분류 알고리즘
  • 클래스가 주어졌을 때 모든 특성들이 서로 (조건부) 독립임을 가정 (naive 가정)
  • 입력 특성에 따라 3개의 분류기 존재
    - 가우시안 나이브 베이즈 분류기, 베르누이 나이브 베이즈 분류기, 다항 나이브 베이즈 분류기
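  • 연속형 특성(실수 또는 정수 값): GaussianNB — 예: (1, 3, 2), (0.1, 0.9, 1.5)
  • 횟수(count)를 나타내는 음이 아닌 정수형 특성: MultinomialNB — 예: (1, 3, 2)
  • 바이너리형(0/1) 특성: BernoulliNB — 예: (0, 1, 0)
  • alpha : smoothing 계수 — 학습 데이터에 없던 조합의 확률이 0이 되지 않도록 보정 (MultinomialNB, BernoulliNB)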

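나이브 베이즈 분류기의 확률 모델

특성 $x_1, \dots, x_n$ 이 클래스 $y$ 가 주어졌을 때 서로 독립이라고 가정하면, 베이즈 정리에 의해 $P(y \mid x_1, \dots, x_n) \propto P(y) \prod_{i=1}^{n} P(x_i \mid y)$ 이고, 예측 클래스는 $\hat{y} = \arg\max_{y} P(y) \prod_{i=1}^{n} P(x_i \mid y)$ 이다.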

 


# Toy "go outside?" dataset: 14 observations of weather and temperature plus the outcome.
weather = [
    'Sunny', 'Sunny', 'Overcast', 'Rainy', 'Rainy', 'Rainy', 'Overcast',
    'Sunny', 'Sunny', 'Rainy', 'Sunny', 'Overcast', 'Overcast', 'Rainy',
]
temp = [
    'Hot', 'Hot', 'Hot', 'Mild', 'Cool', 'Cool', 'Cool',
    'Mild', 'Cool', 'Mild', 'Mild', 'Mild', 'Hot', 'Mild',
]
play = [
    'No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
    'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No',
]

# One row per observation: the two categorical features first, the label column last.
df = pd.DataFrame({'날씨': weather, '온도': temp, '외출여부': play})
df

 

# Every column except the last is a feature; the last column is the label.
feature_columns = df.columns[:-1]
x_data = df[feature_columns]
y_data = df[df.columns[-1]]

 

# One-hot encode both categorical feature columns. With the default remainder='drop',
# any column that is not listed here is discarded from the transformed output.
ct = make_column_transformer((OneHotEncoder(), ['날씨', '온도']))

# Fit on the feature frame only, so the target column is never handed to the
# transformer; this matches how the pipeline is later fitted on x_data.
result = ct.fit_transform(x_data)
result
[OUT] :

array([[0., 0., 1., 0., 1., 0.],
       [0., 0., 1., 0., 1., 0.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 1., 0., 1., 0., 0.],
       [0., 1., 0., 1., 0., 0.],
       [1., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 1.],
       [0., 0., 1., 1., 0., 0.],
       [0., 1., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 1.],
       [1., 0., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 1.]])

 

# List the generated one-hot column names, in the same order as the columns of `result`.
# NOTE(review): get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# on current releases call ct.get_feature_names_out() instead (the names then embed the
# original column labels rather than x0/x1) - confirm against the installed version.
ct.get_feature_names()
[OUT] :

['onehotencoder__x0_Overcast',
 'onehotencoder__x0_Rainy',
 'onehotencoder__x0_Sunny',
 'onehotencoder__x1_Cool',
 'onehotencoder__x1_Hot',
 'onehotencoder__x1_Mild']

 

# Chain the one-hot encoder with a Bernoulli naive Bayes classifier.
# alpha=1.0 (the default) is the additive smoothing term: it keeps a feature value
# never seen with a class from contributing a zero factor to the probability product.
model_pipe = make_pipeline(ct, BernoulliNB(alpha=1.0))
model_pipe.fit(x_data, y_data)
[OUT] :

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('onehotencoder',
                                                  OneHotEncoder(categories='auto',
                                                                drop=None,
                                                                dtype=<class 'numpy.float64'>,
                                                                handle_unknown='error',
                                                                sparse=True),
                                                  ['날씨', '온도'])],
                                   verbose=False)),
                ('bernoullinb',
                 BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
                             fit_prior=True))],
         verbose=False)

 

# A single unseen observation, using the same feature column names as the training frame.
test_df = pd.DataFrame({'날씨': ['Sunny'], '온도': ['Hot']})
test_df

 

# Run the fitted pipeline (one-hot encoding, then BernoulliNB) on the one-row test frame.
model_pipe.predict(test_df)
[OUT] :

array(['No'], dtype='<U3')

연습문제

 

wine 데이터셋을 불러온 후 나이브 베이즈를 이용하여 x_test[0] 데이터에 대한 분류 결과를 출력하고 score를 구하시오.


Solution

 

# Load the wine dataset and take its feature matrix and label vector.
wine = load_wine()
x_data, y_data = wine.data, wine.target

# Hold out 20% of the samples for testing, stratified on the labels.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    stratify=y_data,
    random_state=42,
)

y_data  # labels take the values 0, 1 and 2: a multiclass problem
[OUT] :

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

 

# Gaussian naive Bayes: standardize the features, then tune var_smoothing by grid search.
model_pipe = make_pipeline(StandardScaler(), GaussianNB())

# make_pipeline names each step after its lowercased class name, hence the
# "gaussiannb__" prefix that routes the parameter to the classifier step.
param_grid = {"gaussiannb__var_smoothing": np.linspace(0.1, 1, 10)}

# Candidates are compared by macro-averaged F1 under the default cross-validation.
model_wine = GridSearchCV(
    estimator=model_pipe,
    param_grid=param_grid,
    scoring='f1_macro',
)
model_wine.fit(x_train, y_train)
[OUT] :

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('gaussiannb',
                                        GaussianNB(priors=None,
                                                   var_smoothing=1e-09))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'gaussiannb__var_smoothing': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=0)

 

# Prediction for the first test sample; slicing keeps the 2-D (1, n_features) shape.
model_wine.best_estimator_.predict(x_test[:1])
[OUT] :

array([0])

 

# Score on the held-out split. This calls the refit pipeline's own score()
# (mean accuracy for a classifier), not the f1_macro metric used during the search.
best_pipe = model_wine.best_estimator_
best_pipe.score(x_test, y_test)
[OUT] :

1.0

728x90
반응형
LIST