728x90
반응형
SMALL
from sklearn.datasets import make_classification, load_breast_cancer,load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import sklearn.metrics as metrics
import seaborn as sns
import matplotlib
# Plot text uses the Malgun Gothic font (presumably so Korean labels render — confirm the font is installed).
matplotlib.rcParams["font.family"]="Malgun Gothic"
# Turn off the Unicode minus glyph for negative numbers on the axes.
matplotlib.rcParams["axes.unicode_minus"]= False
import warnings
# Silence every warning from this point on.
warnings.simplefilter('ignore')
출처 : https://drive.google.com/uc?id=1M-rZSCsgOylvAq82HwPdmn2MikZeDQ9a
# Load the iris dataset and list the fields it exposes.
iris = load_iris() # the iris, national flower of France
iris.keys()
[OUT]:
dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])
iris['data'] # feature data
[OUT]:
array([[5.1, 3.5, 1.4, 0.2],
[4.9, 3. , 1.4, 0.2],
[4.7, 3.2, 1.3, 0.2],
[4.6, 3.1, 1.5, 0.2],
[5. , 3.6, 1.4, 0.2],
[5.4, 3.9, 1.7, 0.4],
[4.6, 3.4, 1.4, 0.3],
[5. , 3.4, 1.5, 0.2],
[4.4, 2.9, 1.4, 0.2],
[4.9, 3.1, 1.5, 0.1],
[5.4, 3.7, 1.5, 0.2],
[4.8, 3.4, 1.6, 0.2],
[4.8, 3. , 1.4, 0.1],
[4.3, 3. , 1.1, 0.1],
[5.8, 4. , 1.2, 0.2],
[5.7, 4.4, 1.5, 0.4],
[5.4, 3.9, 1.3, 0.4],
[5.1, 3.5, 1.4, 0.3],
[5.7, 3.8, 1.7, 0.3],
[5.1, 3.8, 1.5, 0.3],
# ...
# 중략
# ...
[5.9, 3. , 5.1, 1.8]])
iris['target'] # labels (species class)
[OUT]:
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])
iris['feature_names']
[OUT]:
['sepal length (cm)',
'sepal width (cm)',
'petal length (cm)',
'petal width (cm)']
iris['target_names']
[OUT]:
array(['setosa', 'versicolor', 'virginica'], dtype='<U10')
# Build a DataFrame with one named column per feature, then append the class label column.
iris_df = pd.DataFrame(iris.data, columns=iris['feature_names'])
iris_df['specis'] = iris.target
# Show the table, then the correlation matrix of its columns.
iris_df
iris_df.corr()
시각화
# Correlation heatmap with the value annotated in each cell.
# Correlations lie in [-1, 1], so the colour scale is anchored to that interval;
# vmin must be -1 (equal limits of 1 and 1 would collapse the scale).
sns.heatmap(iris_df.corr(), annot=True, cmap='Reds', vmin=-1, vmax=1)
plt.show()
# Pairwise plots of the columns, coloured by the class label column.
sns.pairplot(data=iris_df, hue='specis')
plt.show()
# Box plots of the data, drawn two ways.
# seaborn version: only the four feature columns; a 45-degree rotation is requested for the x tick labels.
plt.xticks(rotation=45)
sns.boxplot(data=iris_df[iris['feature_names']])
plt.show()
# pandas version: called on the whole frame, with the rotation passed as rot=45.
iris_df.boxplot(rot=45)
plt.show()
data set 설정
# Features are every column except the last; the last column is the label.
x_data, y_data = iris_df.iloc[:, :-1], iris_df.iloc[:, -1]
# Hold out 20% of the rows for testing, stratified on the label, with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    x_data,
    y_data,
    test_size=0.2,
    random_state=1,
    stratify=y_data,
)
# For multiclass classification, always pass multi_class='multinomial'
# (the default is 'auto', but set it explicitly in case that causes an error).
# NOTE(review): newer scikit-learn releases reportedly deprecate `multi_class` — confirm against the installed version.
model_logi = LogisticRegression(multi_class='multinomial')
model_logi.fit(x_train,y_train)
[OUT]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=100,
multi_class='multinomial', n_jobs=None, penalty='l2',
random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
warm_start=False)
model_logi.score(x_test,y_test) # accuracy, not the F1 score
[OUT]:
0.9666666666666667
model_logi.coef_ # 12 values — why?
[OUT]:
array([[-0.51418077, 0.77926692, -2.38300365, -0.96097857],
[ 0.30080885, -0.27663283, -0.15953256, -0.76810102],
[ 0.21337192, -0.50263409, 2.54253621, 1.72907959]])
model_logi.intercept_ # 3 values — why?
[OUT]:
array([ 10.43736141, 2.92425059, -13.361612 ])
다중분류
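- w의 개수 : 피쳐의 개수 * 분류의 개수 (4 * 3 = 12), b의 개수 : 분류의 개수 (3)
$z_{s} = w_{s,1}x_1 + w_{s,2}x_2 + w_{s,3}x_3 + w_{s,4}x_4 + b_{s}$
$z_{vs} = w_{vs,1}x_1 + w_{vs,2}x_2 + w_{vs,3}x_3 + w_{vs,4}x_4 + b_{vs}$
$z_{vc} = w_{vc,1}x_1 + w_{vc,2}x_2 + w_{vc,3}x_3 + w_{vc,4}x_4 + b_{vc}$
(s: setosa, vs: versicolor, vc: virginica)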
#1 Manual prediction: compute the linear score of every class (X @ W.T + b), then take the highest-scoring class per row.
np.argmax(x_test.values @ model_logi.coef_.T + model_logi.intercept_, axis=1)
[OUT]:
array([2, 0, 1, 0, 0, 0, 2, 2, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 1, 2, 1, 1,
0, 0, 2, 1, 0, 0, 1, 1], dtype=int64)
#2 Take the index of the highest predicted probability in each row.
model_logi.predict_proba(x_test).argmax(axis=1)
[OUT]:
array([2, 0, 1, 0, 0, 0, 2, 2, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 1, 2, 1, 1,
0, 0, 2, 1, 0, 0, 1, 1], dtype=int64)
#3 Let the model predict the labels directly; keep the result for the metrics computed later.
y_predict = model_logi.predict(x_test)
y_predict
[OUT]:
array([2, 0, 1, 0, 0, 0, 2, 2, 2, 1, 0, 1, 2, 1, 2, 0, 2, 1, 1, 2, 1, 1,
0, 0, 2, 1, 0, 0, 1, 1])
연습문제
[5.2,3.5,1.5,0.2]를 예측하시오
Solution
#1 Predict the single sample through the class probabilities (index of the largest one).
model_logi.predict_proba([[5.2,3.5,1.5,0.2]]).argmax(axis=1)
[OUT]:
array([0], dtype=int64)
#2 Predict the same single sample directly with predict().
model_logi.predict([[5.2,3.5,1.5,0.2]])
[OUT]:
array([0])
Confusion Matrix, f1 score
# Confusion matrix of the test-set predictions against the true labels.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=y_predict)
# Annotated heatmap: y tick labels are the class names, x tick labels the "predicted as ..." strings.
sns.heatmap(
    data=cm,
    annot=True,
    cmap='Reds',
    xticklabels=['setosa로 예측', 'versicolor로 예측', 'virginica로 예측'],
    yticklabels=iris['target_names'],
)
plt.show()
# Macro-averaged F1 score over the classes.
metrics.f1_score(y_true=y_test, y_pred=y_predict, average='macro')
[OUT]:
0.9665831244778613
review
- 다중분류 시 multi_class : 'multinomial' -> LogisticRegression(multi_class='multinomial')
728x90
반응형
LIST
'코딩으로 익히는 Python > 모델링' 카테고리의 다른 글
[Python] 14. NN : XOR문제, MLPClassifier (0) | 2021.01.22 |
---|---|
[Python] 13. KNN분류 (0) | 2021.01.21 |
[Python] 11. softmax (4) | 2021.01.21 |
[Python] 10. confusion matrix : precision,recall,f1,ROC (2) | 2021.01.21 |
[Python] 9. Logistic Regression (로지스틱 회귀) (0) | 2021.01.20 |