Data Mining Midterm (UTS)#

Data Classification on the Breast Cancer Coimbra Dataset#

Analyze the data at https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Coimbra using two classification methods:

  • The KNN method

  • The decision tree method

Data#

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# load the Breast Cancer Coimbra dataset (dataR2.csv)
data = pd.read_csv("https://raw.githubusercontent.com/LALA09-erha/Python-StrukturData/master/dataR2.csv")
# keep a second copy that will receive human-readable class labels
data_1 = data.copy()
data
Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1 Classification
0 48 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 417.114 1
1 83 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 468.786 1
2 82 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 554.697 1
3 68 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 928.220 1
4 86 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 773.920 1
... ... ... ... ... ... ... ... ... ... ...
111 45 26.850000 92 3.330 0.755688 54.6800 12.100000 10.96000 268.230 2
112 62 26.840000 100 4.530 1.117400 12.4500 21.420000 7.32000 330.160 2
113 65 32.050000 97 5.730 1.370998 61.4800 22.540000 10.33000 314.050 2
114 72 25.590000 82 2.820 0.570392 24.9600 33.750000 3.27000 392.460 2
115 86 27.180000 138 19.910 6.777364 90.2800 14.110000 4.35000 90.090 2

116 rows × 10 columns

Relabel 1=Healthy controls and 2=Patients#

# map the numeric labels to descriptive class names
data_1['Classification'] = data_1['Classification'].map({1: 'Healthy controls', 2: 'Patients'})
data_1
Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1 Classification
0 48 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 417.114 Healthy controls
1 83 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 468.786 Healthy controls
2 82 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 554.697 Healthy controls
3 68 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 928.220 Healthy controls
4 86 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 773.920 Healthy controls
... ... ... ... ... ... ... ... ... ... ...
111 45 26.850000 92 3.330 0.755688 54.6800 12.100000 10.96000 268.230 Patients
112 62 26.840000 100 4.530 1.117400 12.4500 21.420000 7.32000 330.160 Patients
113 65 32.050000 97 5.730 1.370998 61.4800 22.540000 10.33000 314.050 Patients
114 72 25.590000 82 2.820 0.570392 24.9600 33.750000 3.27000 392.460 Patients
115 86 27.180000 138 19.910 6.777364 90.2800 14.110000 4.35000 90.090 Patients

116 rows × 10 columns
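Before building classifiers, a quick look at how the two groups differ on the raw features can be useful. This is a minimal exploratory sketch (not one of the original assignment steps), using the relabelled data_1:

# compare the average feature values of healthy controls vs patients
data_1.groupby("Classification").mean()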

type(data)
pandas.core.frame.DataFrame

KNN Classification Method#

Split the data

data.shape
(116, 10)

Separate the features and the class label into two variables, X and y

X = data.drop(columns=["Classification"])
X.head()
Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1
0 48 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 417.114
1 83 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 468.786
2 82 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 554.697
3 68 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 928.220
4 86 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 773.920
#separate target values
y = data["Classification"].values
#view the target values (the slice end exceeds the 116 rows, so all labels are shown)
y[0:150]
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2])

Splitting the Data into Training and Test Sets#

from sklearn.model_selection import train_test_split
#split dataset into train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
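Because stratify=y is passed, both subsets should preserve the original 52/64 class balance. A small sanity-check sketch (the expected shapes follow from an 80/20 split of 116 rows):

# confirm split sizes and that stratification kept the class ratio
print(X_train.shape, X_test.shape)        # expected: (92, 9) (24, 9)
print(pd.Series(y_train).value_counts())  # class counts in the training set
print(pd.Series(y_test).value_counts())   # class counts in the test set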

Building the Model#

Setting n_neighbors = 3, i.e., classifying by the 3 nearest neighbors

from sklearn.neighbors import KNeighborsClassifier
# Create KNN classifier
knn = KNeighborsClassifier(n_neighbors = 3)
# Fit the classifier to the data
knn.fit(X_train,y_train)
KNeighborsClassifier(n_neighbors=3)

Testing the Model on the Test Data#

#show the model's predictions on the test data (the [1:25] slice skips the first of the 24 test samples)
knn.predict(X_test)[1:25]
array([2, 1, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 1, 1, 2, 2, 2, 1, 2, 2, 2, 2,
       2])

Computing the Accuracy#

#check accuracy of our model on the test data
knn.score(X_test, y_test)
0.3333333333333333
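An accuracy of 0.33 on 24 test samples says little about per-class behaviour. Since confusion_matrix and classification_report are already imported, a short sketch to see where the k=3 model goes wrong:

# inspect per-class performance on the test set
y_pred = knn.predict(X_test)
print(confusion_matrix(y_test, y_pred))       # rows: true class, columns: predicted class
print(classification_report(y_test, y_pred))  # precision, recall and F1 per class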

K-Fold Cross-Validation#

from sklearn.model_selection import cross_val_score
import numpy as np
#create a new KNN model
knn_cv = KNeighborsClassifier(n_neighbors=6)
#evaluate the model with 5-fold cross-validation
cv_scores = cross_val_score(knn_cv, X, y, cv=5)
#print each cv score (accuracy) and average them
print(cv_scores)
print("cv_scores mean:{}".format(np.mean(cv_scores)))
[0.45833333 0.60869565 0.47826087 0.43478261 0.43478261]
cv_scores mean:0.4829710144927536
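Rather than evaluating a single k, the cross-validated accuracy can be traced over a range of values; this sketch does by hand what GridSearchCV automates in the next step:

# mean 5-fold CV accuracy for k = 1..24 (the same grid searched below)
ks = np.arange(1, 25)
mean_scores = [cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5).mean()
               for k in ks]
plt.plot(ks, mean_scores, marker="o")
plt.xlabel("n_neighbors")
plt.ylabel("mean 5-fold CV accuracy")
plt.show()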

Hyperparameter Tuning with GridSearchCV#

from sklearn.model_selection import GridSearchCV
#create a new KNN model
knn2 = KNeighborsClassifier()
#create a dictionary of all values we want to test for n_neighbors
param_grid = {'n_neighbors': np.arange(1, 25)}
#use gridsearch to test all values for n_neighbors
knn_gscv = GridSearchCV(knn2, param_grid, cv=5)
#fit model to data
knn_gscv.fit(X, y)
GridSearchCV(cv=5, estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24])})

Checking the Best n_neighbors#

#check top performing n_neighbors value
knn_gscv.best_params_
{'n_neighbors': 23}

Checking the Score of the Best n_neighbors#

#check mean score for the top performing value of n_neighbors
knn_gscv.best_score_
0.5688405797101449

Conclusion#

From the results above, the best value is n_neighbors = 23, with a mean cross-validation accuracy of 0.5688405797101449.
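GridSearchCV keeps the refitted best model in best_estimator_. As a rough follow-up sketch, it can be scored on the earlier hold-out split; note the estimate is optimistic, because the grid search already saw the whole dataset, including X_test:

# score the tuned model (n_neighbors=23) on the earlier 80/20 split
best_knn = knn_gscv.best_estimator_
best_knn.fit(X_train, y_train)
print(best_knn.score(X_test, y_test))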

Decision Tree Method#

data
Age BMI Glucose Insulin HOMA Leptin Adiponectin Resistin MCP.1 Classification
0 48 23.500000 70 2.707 0.467409 8.8071 9.702400 7.99585 417.114 1
1 83 20.690495 92 3.115 0.706897 8.8438 5.429285 4.06405 468.786 1
2 82 23.124670 91 4.498 1.009651 17.9393 22.432040 9.27715 554.697 1
3 68 21.367521 77 3.226 0.612725 9.8827 7.169560 12.76600 928.220 1
4 86 21.111111 92 3.549 0.805386 6.6994 4.819240 10.57635 773.920 1
... ... ... ... ... ... ... ... ... ... ...
111 45 26.850000 92 3.330 0.755688 54.6800 12.100000 10.96000 268.230 2
112 62 26.840000 100 4.530 1.117400 12.4500 21.420000 7.32000 330.160 2
113 65 32.050000 97 5.730 1.370998 61.4800 22.540000 10.33000 314.050 2
114 72 25.590000 82 2.820 0.570392 24.9600 33.750000 3.27000 392.460 2
115 86 27.180000 138 19.910 6.777364 90.2800 14.110000 4.35000 90.090 2

116 rows × 10 columns

Building the Decision Tree Classifier#

y = data["Classification"]
X = data.drop(columns=["Classification"])
clf = tree.DecisionTreeClassifier(criterion="gini")
clf = clf.fit(X, y)
# count the samples in each class (value_counts must be called with parentheses;
# without them only the bound method object is displayed)
data["Classification"].value_counts()
2    64
1    52
Name: Classification, dtype: int64

Plotting the Decision Tree from the Classification Results#

fig = plt.figure(figsize=(15,10))
# the first 9 columns are the feature names; class_names must list the two class
# labels (the original passed a single column name here by mistake)
_ = tree.plot_tree(clf, feature_names=list(data.columns.values)[:9],
                   class_names=["Healthy controls", "Patients"], filled=True)
(Figure: plot of the fitted decision tree)
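Note that clf was fitted on all 116 rows, so scoring it on the same data would overstate its accuracy. A minimal sketch (variable names are illustrative) that evaluates a tree on the same 80/20 split used for KNN, with the accuracy_score imported earlier:

# evaluate a decision tree on held-out data, mirroring the KNN split
Xtr, Xte, ytr, yte = train_test_split(X, y, test_size=0.2, random_state=1, stratify=y)
clf_holdout = tree.DecisionTreeClassifier(criterion="gini", random_state=1)
clf_holdout.fit(Xtr, ytr)
print(accuracy_score(yte, clf_holdout.predict(Xte)))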