from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
import multiprocessing
from sklearn.svm import SVC, SVR
from sklearn.manifold import TSNE
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.datasets import load_boston, load_breast_cancer, load_iris, load_wine
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.naive_bayes import GaussianNB, BernoulliNB, MultinomialNB
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
Naive Bayes Classification
- A probabilistic classification algorithm based on Bayes' theorem
- Assumes all features are independent of one another (the "naive" assumption)
- Three classifiers are available depending on the type of the input features (a minimal sketch follows this list)
- Gaussian, Bernoulli, and Multinomial naive Bayes classifiers
- Integer or real-valued features: GaussianNB, e.g. (1, 3, 2), (0.1, 0.9, 1.5)
- Integer (count) features: MultinomialNB, e.g. (1, 3, 2)
- Binary features: BernoulliNB, e.g. (0, 1, 0)
- alpha : smoothing parameter; keeps any conditional probability in the product from becoming 0
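A minimal sketch of matching each variant to its feature type (the toy arrays below are made up for illustration and are not data from this post):
import numpy as np
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

y = np.array([0, 1, 0])
X_real  = np.array([[0.1, 0.9, 1.5], [1.2, 0.3, 0.7], [0.5, 0.5, 0.5]])  # real-valued -> GaussianNB
X_count = np.array([[1, 3, 2], [0, 2, 5], [4, 1, 0]])                    # counts -> MultinomialNB
X_bin   = np.array([[0, 1, 0], [1, 0, 1], [0, 0, 1]])                    # binary -> BernoulliNB

GaussianNB().fit(X_real, y).predict(X_real[:1])
MultinomialNB(alpha=1.0).fit(X_count, y).predict(X_count[:1])
BernoulliNB(alpha=1.0).fit(X_bin, y).predict(X_bin[:1])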
Probability model of the naive Bayes classifier
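With features x1, ..., xn and class y, Bayes' theorem combined with the independence assumption gives

P(y | x1, ..., xn) ∝ P(y) · P(x1 | y) · P(x2 | y) · ... · P(xn | y)

and the classifier predicts the class y that maximizes this product. The weather/temperature data below is used to build exactly this kind of model.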
weather=['Sunny','Sunny','Overcast','Rainy','Rainy','Rainy','Overcast','Sunny','Sunny',
'Rainy','Sunny','Overcast','Overcast','Rainy']
temp=['Hot','Hot','Hot','Mild','Cool','Cool','Cool','Mild','Cool','Mild','Mild','Mild','Hot','Mild']
play=['No','No','Yes','Yes','Yes','No','Yes','No','Yes','Yes','Yes','Yes','Yes','No']
df = pd.DataFrame([weather,temp])
df = df.T
df.columns = ['날씨','온도']
df['외출여부'] = play
df
x_data = df.iloc[:,:-1]   # features: 날씨, 온도
y_data = df.iloc[:,-1]    # target: 외출여부
ct = make_column_transformer((OneHotEncoder(),['날씨','온도']))   # one-hot encode both categorical columns; remaining columns are dropped
result = ct.fit_transform(df)
result
[OUT] :
array([[0., 0., 1., 0., 1., 0.],
[0., 0., 1., 0., 1., 0.],
[1., 0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0., 1.],
[0., 1., 0., 1., 0., 0.],
[0., 1., 0., 1., 0., 0.],
[1., 0., 0., 1., 0., 0.],
[0., 0., 1., 0., 0., 1.],
[0., 0., 1., 1., 0., 0.],
[0., 1., 0., 0., 0., 1.],
[0., 0., 1., 0., 0., 1.],
[1., 0., 0., 0., 0., 1.],
[1., 0., 0., 0., 1., 0.],
[0., 1., 0., 0., 0., 1.]])
ct.get_feature_names()
[OUT] :
['onehotencoder__x0_Overcast',
'onehotencoder__x0_Rainy',
'onehotencoder__x0_Sunny',
'onehotencoder__x1_Cool',
'onehotencoder__x1_Hot',
'onehotencoder__x1_Mild']
model_pipe = make_pipeline(ct,BernoulliNB()) # alpha=1 (default): smoothing, keeps any factor in the probability product from becoming 0
model_pipe.fit(x_data,y_data)
[OUT] :
Pipeline(memory=None,
steps=[('columntransformer',
ColumnTransformer(n_jobs=None, remainder='drop',
sparse_threshold=0.3,
transformer_weights=None,
transformers=[('onehotencoder',
OneHotEncoder(categories='auto',
drop=None,
dtype=<class 'numpy.float64'>,
handle_unknown='error',
sparse=True),
['날씨', '온도'])],
verbose=False)),
('bernoullinb',
BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None,
fit_prior=True))],
verbose=False)
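As a side check (not in the original post), the smoothed conditional probabilities the classifier learned can be pulled out of the fitted pipeline; a minimal sketch, assuming the model_pipe fitted above:
nb = model_pipe.named_steps['bernoullinb']   # the fitted BernoulliNB step
nb.classes_                                  # class order, alphabetical: ['No', 'Yes']
np.exp(nb.feature_log_prob_)                 # P(feature=1 | class); rows follow classes_, columns follow ct.get_feature_names()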
test_df = pd.DataFrame([['Sunny','Hot']],columns=['날씨','온도'])
test_df
model_pipe.predict(test_df)
[OUT] :
array(['No'], dtype='<U3')
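For reference, the pipeline also exposes predict_proba (available because BernoulliNB is the final step), which shows how confident the 'No' prediction is; column order follows model_pipe.classes_:
model_pipe.predict_proba(test_df)   # [[P(No), P(Yes)]] for the Sunny/Hot sample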
Exercise
Load the wine dataset, then use naive Bayes to print the predicted class for test[0] and compute the score.
Solution
wine = load_wine()
x_data = wine['data']
y_data = wine['target']
x_train, x_test, y_train, y_test = train_test_split( x_data, y_data, test_size=0.2,
random_state=42, stratify=y_data)
y_data   # multiclass target: three wine classes (0, 1, 2)
[OUT] :
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2])
# GaussianNB: the wine features are continuous, so the Gaussian variant fits here
model_pipe = make_pipeline( StandardScaler(), GaussianNB() )
# tune var_smoothing via cross-validation, scored with macro-averaged F1
param_grid={"gaussiannb__var_smoothing":np.linspace(0.1, 1, 10)}
model_wine = GridSearchCV(model_pipe, param_grid, scoring='f1_macro')
# model_pipe.fit(x_train, y_train)
model_wine.fit( x_train, y_train )
[OUT] :
GridSearchCV(cv=None, error_score=nan,
estimator=Pipeline(memory=None,
steps=[('standardscaler',
StandardScaler(copy=True,
with_mean=True,
with_std=True)),
('gaussiannb',
GaussianNB(priors=None,
var_smoothing=1e-09))],
verbose=False),
iid='deprecated', n_jobs=None,
param_grid={'gaussiannb__var_smoothing': array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1. ])},
pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
scoring='f1_macro', verbose=0)
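Before predicting, it can be worth checking which smoothing value the search settled on; a quick look (the values depend on the run, so none are quoted here):
model_wine.best_params_   # chosen 'gaussiannb__var_smoothing'
model_wine.best_score_    # mean cross-validated f1_macro for that setting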
# prediction for the first test sample
model_wine.best_estimator_.predict( [x_test[0]] )
[OUT] :
array([0])
# accuracy on the test set (score of a classifier pipeline is mean accuracy)
model_wine.best_estimator_.score( x_test, y_test )
[OUT] :
1.0