In [1]:
# Load the raw mushroom dataset; column 0 holds the class label
# ('p'/'e' in the preview below), columns 1-22 are categorical features.
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
# header=None: the CSV has no header row, so columns are numbered 0..22.
mush = pd.read_csv('../../data/mushroom.csv', header=None)
mush.head()
Out[1]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | p | x | s | n | t | p | f | c | n | k | ... | s | w | w | p | w | o | p | k | s | u |
1 | e | x | s | y | t | a | f | c | b | k | ... | s | w | w | p | w | o | p | n | n | g |
2 | e | b | s | w | t | l | f | c | b | n | ... | s | w | w | p | w | o | p | n | n | m |
3 | p | x | y | w | t | p | f | c | n | n | ... | s | w | w | p | w | o | p | k | s | u |
4 | e | x | s | g | f | n | f | w | b | k | ... | s | w | w | p | w | o | e | n | a | g |
5 rows × 23 columns
In [2]:
# Data transformation: encode every categorical column to integer codes.
from sklearn.preprocessing import LabelEncoder

# Use one LabelEncoder per column. The original reused a single shared
# instance, so after the loop its classes_/inverse_transform reflected only
# the *last* column — decoding any other column back would be wrong.
# The transformed values are identical (fit_transform refits each time).
encoders = {}
for col in mush.columns:
    encoders[col] = LabelEncoder()
    mush[col] = encoders[col].fit_transform(mush[col])

mush.head()
Out[2]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 5 | 2 | 4 | 1 | 6 | 1 | 0 | 1 | 4 | ... | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 2 | 3 | 5 |
1 | 0 | 5 | 2 | 9 | 1 | 0 | 1 | 0 | 0 | 4 | ... | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 3 | 2 | 1 |
2 | 0 | 0 | 2 | 8 | 1 | 3 | 1 | 0 | 0 | 5 | ... | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 3 | 2 | 3 |
3 | 1 | 5 | 3 | 8 | 1 | 6 | 1 | 0 | 1 | 5 | ... | 2 | 7 | 7 | 0 | 2 | 1 | 4 | 2 | 3 | 5 |
4 | 0 | 5 | 2 | 3 | 0 | 5 | 1 | 1 | 0 | 4 | ... | 2 | 7 | 7 | 0 | 2 | 1 | 0 | 3 | 0 | 1 |
5 rows × 23 columns
In [3]:
# Pairwise correlations of the encoded columns.
# NOTE(review): column 16 produces an all-NaN row/column (see output below) —
# presumably it is constant after encoding (std = 0), so correlation is
# undefined for it; confirm against the raw data.
mush.corr()
Out[3]:
0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | ... | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1.000000 | 0.052951 | 0.178446 | -0.031384 | -0.501530 | -0.093552 | 0.129200 | -0.348387 | 0.540024 | -0.530566 | ... | -0.298801 | -0.154003 | -0.146730 | NaN | 0.145142 | -0.214366 | -0.411771 | 0.171961 | 0.298686 | 0.217179 |
1 | 0.052951 | 1.000000 | -0.050454 | -0.048203 | -0.035374 | -0.021935 | 0.078865 | 0.013196 | 0.054050 | -0.006039 | ... | -0.032591 | -0.031659 | -0.030390 | NaN | 0.072560 | -0.106534 | -0.025457 | -0.073416 | 0.063413 | -0.042221 |
2 | 0.178446 | -0.050454 | 1.000000 | -0.019402 | 0.070228 | 0.045233 | -0.034180 | -0.282306 | 0.208100 | -0.161017 | ... | 0.107965 | 0.066050 | 0.068885 | NaN | -0.016603 | -0.026147 | -0.106407 | 0.230364 | 0.021555 | 0.163887 |
3 | -0.031384 | -0.048203 | -0.019402 | 1.000000 | -0.000764 | -0.387121 | 0.041436 | 0.144259 | -0.169464 | 0.084659 | ... | -0.047710 | 0.002364 | 0.008057 | NaN | 0.036130 | -0.005822 | 0.162513 | -0.293523 | -0.144770 | 0.033925 |
4 | -0.501530 | -0.035374 | 0.070228 | -0.000764 | 1.000000 | -0.061825 | 0.137359 | -0.299473 | -0.369596 | 0.527120 | ... | 0.458983 | 0.083538 | 0.092874 | NaN | 0.119770 | 0.056788 | 0.692973 | -0.285008 | 0.088137 | -0.075095 |
5 | -0.093552 | -0.021935 | 0.045233 | -0.387121 | -0.061825 | 1.000000 | -0.059590 | 0.063936 | 0.310495 | -0.129213 | ... | 0.061820 | 0.174532 | 0.169407 | NaN | -0.057747 | 0.111905 | -0.281387 | 0.469055 | -0.043623 | -0.026610 |
6 | 0.129200 | 0.078865 | -0.034180 | 0.041436 | 0.137359 | -0.059590 | 1.000000 | 0.071489 | 0.108984 | -0.128567 | ... | -0.116177 | 0.099299 | 0.097160 | NaN | 0.897518 | 0.093236 | -0.146689 | -0.029524 | 0.165575 | -0.030304 |
7 | -0.348387 | 0.013196 | -0.282306 | 0.144259 | -0.299473 | 0.063936 | 0.071489 | 1.000000 | -0.108333 | 0.100193 | ... | -0.213775 | 0.274574 | 0.253505 | NaN | 0.073363 | 0.243014 | -0.195897 | 0.047323 | -0.529253 | -0.154680 |
8 | 0.540024 | 0.054050 | 0.208100 | -0.169464 | -0.369596 | 0.310495 | 0.108984 | -0.108333 | 1.000000 | -0.516736 | ... | 0.010894 | 0.296548 | 0.278708 | NaN | 0.103809 | -0.171362 | -0.460872 | 0.622991 | 0.147682 | 0.161418 |
9 | -0.530566 | -0.006039 | -0.161017 | 0.084659 | 0.527120 | -0.129213 | -0.128567 | 0.100193 | -0.516736 | 1.000000 | ... | 0.257224 | -0.058299 | -0.074781 | NaN | -0.097583 | 0.096054 | 0.629398 | -0.416135 | -0.034090 | -0.202972 |
10 | -0.102019 | 0.063794 | -0.014123 | -0.456496 | 0.099364 | 0.459766 | 0.186485 | 0.080895 | 0.214576 | -0.175699 | ... | -0.034399 | 0.223439 | 0.235794 | NaN | 0.162604 | -0.293221 | -0.291444 | 0.258831 | 0.087383 | -0.269216 |
11 | -0.379361 | 0.030191 | -0.126245 | 0.321274 | 0.244188 | -0.205215 | 0.144063 | 0.350548 | -0.344345 | 0.315080 | ... | 0.087454 | 0.157140 | 0.159805 | NaN | 0.156213 | -0.247357 | 0.210155 | -0.536996 | -0.306747 | -0.007668 |
12 | -0.334593 | -0.030417 | 0.089090 | -0.060837 | 0.460824 | 0.118617 | -0.088916 | -0.212359 | 0.056310 | 0.224287 | ... | 0.437164 | 0.132708 | 0.142835 | NaN | -0.090591 | 0.107904 | 0.390091 | 0.100764 | 0.079604 | -0.058076 |
13 | -0.298801 | -0.032591 | 0.107965 | -0.047710 | 0.458983 | 0.061820 | -0.116177 | -0.213775 | 0.010894 | 0.257224 | ... | 1.000000 | 0.106933 | 0.110656 | NaN | -0.077284 | 0.040006 | 0.394644 | 0.130974 | 0.046797 | -0.039628 |
14 | -0.154003 | -0.031659 | 0.066050 | 0.002364 | 0.083538 | 0.174532 | 0.099299 | 0.274574 | 0.296548 | -0.058299 | ... | 0.106933 | 1.000000 | 0.491510 | NaN | 0.067377 | 0.084917 | -0.048878 | 0.271533 | -0.240261 | 0.042561 |
15 | -0.146730 | -0.030390 | 0.068885 | 0.008057 | 0.092874 | 0.169407 | 0.097160 | 0.253505 | 0.278708 | -0.074781 | ... | 0.110656 | 0.491510 | 1.000000 | NaN | 0.065567 | 0.087580 | -0.034284 | 0.254518 | -0.242792 | 0.041594 |
16 | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
17 | 0.145142 | 0.072560 | -0.016603 | 0.036130 | 0.119770 | -0.057747 | 0.897518 | 0.073363 | 0.103809 | -0.097583 | ... | -0.077284 | 0.067377 | 0.065567 | NaN | 1.000000 | 0.036380 | -0.143673 | -0.003600 | 0.124924 | -0.040581 |
18 | -0.214366 | -0.106534 | -0.026147 | -0.005822 | 0.056788 | 0.111905 | 0.093236 | 0.243014 | -0.171362 | 0.096054 | ... | 0.040006 | 0.084917 | 0.087580 | NaN | 0.036380 | 1.000000 | 0.058312 | 0.338417 | -0.242020 | 0.235835 |
19 | -0.411771 | -0.025457 | -0.106407 | 0.162513 | 0.692973 | -0.281387 | -0.146689 | -0.195897 | -0.460872 | 0.629398 | ... | 0.394644 | -0.048878 | -0.034284 | NaN | -0.143673 | 0.058312 | 1.000000 | -0.487048 | 0.211763 | -0.212080 |
20 | 0.171961 | -0.073416 | 0.230364 | -0.293523 | -0.285008 | 0.469055 | -0.029524 | 0.047323 | 0.622991 | -0.416135 | ... | 0.130974 | 0.271533 | 0.254518 | NaN | -0.003600 | 0.338417 | -0.487048 | 1.000000 | -0.126859 | 0.185954 |
21 | 0.298686 | 0.063413 | 0.021555 | -0.144770 | 0.088137 | -0.043623 | 0.165575 | -0.529253 | 0.147682 | -0.034090 | ... | 0.046797 | -0.240261 | -0.242792 | NaN | 0.124924 | -0.242020 | 0.211763 | -0.126859 | 1.000000 | -0.174529 |
22 | 0.217179 | -0.042221 | 0.163887 | 0.033925 | -0.075095 | -0.026610 | -0.030304 | -0.154680 | 0.161418 | -0.202972 | ... | -0.039628 | 0.042561 | 0.041594 | NaN | -0.040581 | 0.235835 | -0.212080 | 0.185954 | -0.174529 | 1.000000 |
23 rows × 23 columns
2, 4번 열만 사용했을 때¶
In [4]:
# Feature matrix restricted to columns 2 and 4; column 0 is the class label.
data = mush.loc[:, [2, 4]].to_numpy()
target = mush.loc[:, 0].to_numpy()

# Hold out 20% as a test set, with a fixed seed for reproducibility.
train_input, test_input, train_target, test_target = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)
In [5]:
# Random forest baseline on the two selected feature columns.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

# Cross-validated train/validation accuracy.
rf = RandomForestClassifier(n_jobs=-1, random_state=42)
scores = cross_validate(
    rf, train_input, train_target, return_train_score=True, n_jobs=-1
)
print(np.mean(scores['train_score']), np.mean(scores['test_score']))

# Out-of-bag accuracy estimate from a forest fit on the full training split.
rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)
rf.fit(train_input, train_target)
print("RF_acc: {:.2f}%".format(rf.oob_score_ * 100))
0.7487305546924704 0.7487302658850004
RF_acc: 74.87%
In [6]:
# SVM on the same two feature columns.
# NOTE(review): this cell uses a *different* split than the random-forest
# cell (stratified, random_state=2019 vs 42), so the two reported accuracies
# are not computed on the same test set.
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

# train_test_split is already imported in the first cell of the notebook;
# the redundant per-cell re-import was removed.
x_train, x_test, y_train, y_test = train_test_split(
    data, target, test_size=0.2, shuffle=True, stratify=target, random_state=2019
)

svc = SVC()
svc.fit(x_train, y_train)
y_pred = svc.predict(x_test)
print('SVM_acc : {:.2f}%'.format(accuracy_score(y_test, y_pred) * 100))
SVM_acc : 74.95%
모든 열 사용했을 때¶
In [7]:
# Use every feature column this time (drop only the label column 0).
data_all = mush.drop(columns=[0]).to_numpy()
target = mush.loc[:, 0].to_numpy()

# Same 80/20 split and seed as the two-feature experiment above.
train_input, test_input, train_target, test_target = train_test_split(
    data_all,
    target,
    test_size=0.2,
    random_state=42,
)
In [8]:
# Random forest, now on all 22 feature columns.
from sklearn.model_selection import cross_validate
from sklearn.ensemble import RandomForestClassifier

# Cross-validated accuracy on the training split.
forest = RandomForestClassifier(n_jobs=-1, random_state=42)
cv_results = cross_validate(
    forest,
    train_input,
    train_target,
    return_train_score=True,
    n_jobs=-1,
)
print(np.mean(cv_results['train_score']), np.mean(cv_results['test_score']))

# Out-of-bag accuracy estimate.
rf = RandomForestClassifier(oob_score=True, n_jobs=-1, random_state=42)
rf.fit(train_input, train_target)
print("RF_acc: {:.2f}%".format(rf.oob_score_ * 100))
1.0 1.0
RF_acc: 100.00%
In [9]:
# SVM on all feature columns (same stratified split settings as the
# two-feature SVM cell: 20% test, shuffle, seed 2019).
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC

x_train, x_test, y_train, y_test = train_test_split(
    data_all,
    target,
    test_size=0.2,
    shuffle=True,
    stratify=target,
    random_state=2019,
)

classifier = SVC()
classifier.fit(x_train, y_train)
y_pred = classifier.predict(x_test)
print('SVM_acc : {:.2f}%'.format(accuracy_score(y_test, y_pred) * 100))
SVM_acc : 99.08%
반응형
'데이터분석' 카테고리의 다른 글
[23.07.21] 머신러닝(Random Forest) - 36(1) (0) | 2023.07.21 |
---|---|
[23.07.18] 머신러닝(선형회귀) - 33(3) (0) | 2023.07.18 |
[23.07.18] 머신러닝(k-최근접 이웃 회귀) - 33(2) (0) | 2023.07.18 |
[23.07.18] 머신러닝(iris붓꽃데이터) - 33(1) (0) | 2023.07.18 |
[23.07.17] 머신러닝(Machine Learning) - 32(1) (0) | 2023.07.18 |