본문 바로가기

코딩으로 익히는 Python/모델링

[Python] 21. SVM(서포트벡터머신)

728x90
반응형
SMALL
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score, cross_validate
import multiprocessing

from sklearn.svm import SVC, SVR
from sklearn.manifold import TSNE
from sklearn.linear_model import SGDRegressor, SGDClassifier
from sklearn.datasets import load_boston, load_breast_cancer,load_iris,load_wine
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.svm import SVC # SVM
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
import sklearn.metrics as m
import seaborn as sns

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')

SVM(서포트벡터머신)

 

wine 데이터셋을 불러와서 실습해보기

 

# Load the wine dataset and pull out the feature matrix and class labels.
wine = load_wine()
x_data, y_data = wine['data'], wine['target']

# Hold out 20% of the samples for testing; stratifying on the labels keeps
# the class proportions the same in the train and test splits.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)

 

# Standardize the features, then fit an SVC with its default settings.
# The step names match the ones make_pipeline would generate.
model_wine = Pipeline(steps=[
    ('standardscaler', StandardScaler()),
    ('svc', SVC()),
])
model_wine.fit(x_train, y_train)
[OUT] :

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('svc',
                 SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None,
                     coef0=0.0, decision_function_shape='ovr', degree=3,
                     gamma='scale', kernel='rbf', max_iter=-1,
                     probability=False, random_state=None, shrinking=True,
                     tol=0.001, verbose=False))],
         verbose=False)

 

# Mean accuracy of the fitted pipeline on the held-out wine test split.
model_wine.score(X=x_test, y=y_test)
[OUT] :

0.9722222222222222

연습문제

 

데이터 불러오기 (pd.read_csv)

python 파일이 있는 경로에 data5 폴더를 만든 후, 그 안에 다음의 'pima-indians-diabetes.data.csv' 파일을 넣어놓기

pima-indians-diabetes.data.csv
0.02MB

 

pima-indians 데이터에 대해 GridSearchCV로 최적의 파라미터를 찾고, confusion matrix와 score를 확인하시오


Solution

 

# Read the Pima Indians diabetes CSV from the local data5 folder.
# NOTE(review): read_csv uses the first row as the header by default; if this
# file has no header row, the first sample is consumed as column names --
# confirm against the actual file.
df = pd.read_csv('data5/pima-indians-diabetes.data.csv')
df.head(3)

# The last column is the label; every column before it is a feature.
y_data = df.iloc[:, -1]
x_data = df.iloc[:, :-1]

# Stratified 80/20 split with a fixed seed so the result is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=42,
    stratify=y_data,
)


# SVC
# Scale the features, then tune an SVC with a cross-validated grid search
# that selects the candidate with the best macro-averaged F1.
model_pipe = make_pipeline(StandardScaler(), SVC())
param_grid = {
    # C must be strictly positive, so the grid starts at 1 instead of 0;
    # C=0 candidates cannot be fitted and would only be scored as NaN.
    "svc__C": range(1, 11),
    # 'precomputed' is left out: it expects a square kernel matrix as input
    # rather than raw feature rows, so those candidates cannot be fitted
    # here either.
    "svc__kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "svc__gamma": ['scale', 'auto'],
}
model_pima = GridSearchCV(model_pipe, param_grid, scoring='f1_macro')
model_pima.fit(x_train, y_train)
[OUT] :

GridSearchCV(cv=None, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('standardscaler',
                                        StandardScaler(copy=True,
                                                       with_mean=True,
                                                       with_std=True)),
                                       ('svc',
                                        SVC(C=1.0, break_ties=False,
                                            cache_size=200, class_weight=None,
                                            coef0=0.0,
                                            decision_function_shape='ovr',
                                            degree=3, gamma='scale',
                                            kernel='rbf', max_iter=-1,
                                            probability=False,
                                            random_state=None, shrinking=True,
                                            tol=0.001, verbose=False))],
                                verbose=False),
             iid='deprecated', n_jobs=None,
             param_grid={'svc__C': range(0, 11),
                         'svc__gamma': ['scale', 'auto'],
                         'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid',
                                         'precomputed']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_macro', verbose=0)

 

# Prediction
# The search was run with refit enabled, so calling predict on it delegates
# to the best pipeline retrained on the full training split.
y_pred = model_pima.predict(x_test)
y_pred
[OUT] :

array([1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0],
      dtype=int64)

 

# Best hyperparameter combination found by the grid search
# (keys carry the pipeline's "svc__" step prefix used in param_grid).
model_pima.best_params_
[OUT] :

{'svc__C': 1, 'svc__gamma': 'scale', 'svc__kernel': 'linear'}

 

# Score
# Calls score on the best pipeline directly, which reports mean accuracy on
# the test split rather than the f1_macro metric used to rank candidates.
model_pima.best_estimator_.score(X=x_test, y=y_test)
[OUT] :

0.7207792207792207

 

# confusion_matrix
# Rows are the true labels and columns are the predicted labels.
cm = m.confusion_matrix(y_true=y_test, y_pred=y_pred)

plt.figure(figsize=(15, 10))
# fmt='d' prints the cell counts as plain integers; the default '.2g' format
# would show counts of 100 or more in scientific notation (e.g. 1e+02).
sns.heatmap(cm, annot=True, fmt='d', cmap='Reds')
plt.show()


728x90
반응형
LIST