Studi Kasus Heart Disease 5 Fitur#

Implementasi dengan menggunakan Naive Bayes , K-NN , K-Means Clustering dan Decision Tree

Membaca Data#

import pandas as pd

data = pd.read_csv('https://raw.githubusercontent.com/soumya-mishra/Heart-Disease_DT/main/heart_v2.csv')
data
age sex BP cholestrol heart disease
0 70 1 130 322 1
1 67 0 115 564 0
2 57 1 124 261 1
3 64 1 128 263 0
4 74 0 120 269 0
... ... ... ... ... ...
265 52 1 172 199 0
266 44 1 120 263 0
267 56 0 140 294 0
268 57 1 140 192 0
269 67 1 160 286 1

270 rows × 5 columns

Class#

y_class = data['heart disease']
y = y_class.values.tolist()
print(y[:5])
[1, 0, 1, 0, 0]

Drop Target / Class#

X = data.drop(columns='heart disease')
X
age sex BP cholestrol
0 70 1 130 322
1 67 0 115 564
2 57 1 124 261
3 64 1 128 263
4 74 0 120 269
... ... ... ... ...
265 52 1 172 199
266 44 1 120 263
267 56 0 140 294
268 57 1 140 192
269 67 1 160 286

270 rows × 4 columns

Preprocessing Min-Max#

Normalisasi data menggunakan Min - Max

from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
scaled = scaler.fit_transform(X)
nama_fitur = X.columns.copy()
scaled_fitur = pd.DataFrame(scaled,columns=nama_fitur)
scaled_fitur
age sex BP cholestrol
0 0.854167 1.0 0.339623 0.447489
1 0.791667 0.0 0.198113 1.000000
2 0.583333 1.0 0.283019 0.308219
3 0.729167 1.0 0.320755 0.312785
4 0.937500 0.0 0.245283 0.326484
... ... ... ... ...
265 0.479167 1.0 0.735849 0.166667
266 0.312500 1.0 0.245283 0.312785
267 0.562500 0.0 0.433962 0.383562
268 0.583333 1.0 0.433962 0.150685
269 0.791667 1.0 0.622642 0.365297

270 rows × 4 columns

Save Normalisasi#

import joblib
filename = '/content/drive/MyDrive/datamining/tugas/model/norm.sav'
joblib.dump(scaler, filename) 
['/content/drive/MyDrive/datamining/tugas/model/norm.sav']

Split Data#

split data 20%

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test=train_test_split(scaled_fitur, y, test_size=0.2, random_state=1)
X_train.shape + X_test.shape
(216, 4, 54, 4)

Inisialisasi Model Naive Bayes (gaussian)#

Eksekusi pada Model#

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
probas = clf.predict_proba(X_test)[:,1]
y_pred
array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0])

Save Model Naive bayes#

filenameNB = '/content/drive/MyDrive/datamining/tugas/model/modelNB.pkl'
joblib.dump(clf,filenameNB)
['/content/drive/MyDrive/datamining/tugas/model/modelNB.pkl']

Menghitung Probas#

probas
array([0.45679326, 0.71532781, 0.51497631, 0.14216997, 0.6032597 ,
       0.56976274, 0.64399715, 0.55675502, 0.44854497, 0.6579511 ,
       0.55449199, 0.62978641, 0.53488247, 0.19771043, 0.60989258,
       0.84122473, 0.22204937, 0.63114903, 0.51273849, 0.07377463,
       0.61553336, 0.50078841, 0.46260317, 0.2475638 , 0.17060665,
       0.87967817, 0.71289895, 0.15236739, 0.18455015, 0.57164786,
       0.74028585, 0.46144298, 0.36198486, 0.58321812, 0.32051104,
       0.29148322, 0.14421113, 0.16577117, 0.16712698, 0.44960568,
       0.28132963, 0.73285876, 0.49066188, 0.28033567, 0.68523524,
       0.5878491 , 0.42924538, 0.38648143, 0.51074566, 0.64821159,
       0.35333848, 0.60848559, 0.0816834 , 0.21387506])

Menghitung Hasil Akhir#

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
cm = confusion_matrix(y_test,y_pred)
precision = round(precision_score(y_test,y_pred, average="macro")*100,2)
acc_nb = round(accuracy_score(y_test,y_pred)*100,2)
recall = round(recall_score(y_test,y_pred, average="macro")*100,2)
f1score = round(f1_score(y_test, y_pred, average="macro")*100,2)
print('Konfusi Matrix\n',cm)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(f1score))
print('accuracy: {}'.format(acc_nb))
Konfusi Matrix
 [[22  9]
 [ 5 18]]
precision: 74.07
recall: 74.61
fscore: 73.93
accuracy: 74.07

Predict Input To Naive Bayes Model#

list_input = []
list_input.append('60 1 500 322'.split())
# list_input.append('50 0 120 289'.split())
# list_input.append('70 1 130 322'.split())
# list_input.append('67 0 115 564'.split())
list_input
[['60', '1', '500', '322']]

input to Model Normalisasi

norm = joblib.load(filename)
pred_input = norm.fit_transform(list_input)
pred_input=pd.DataFrame(pred_input,columns=nama_fitur)
pred_input
age sex BP cholestrol
0 0.0 0.0 0.0 0.0

Input to Model Naive Bayes

nb = joblib.load(filenameNB)
input_pred = nb.predict(pred_input)
input_pred
array([0])

Inisialisasi Model KNN#

Eksekusi Pada model#

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
#Try running from k=1 through 30 and record testing accuracy
k_range = range(1,31)
scores = {}
scores_list = []
for k in k_range:
        # install model
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train,y_train)
        # save model
        filenameKNN = '/content/drive/MyDrive/datamining/tugas/model/modelKNN'+str(k)+'.pkl'
        joblib.dump(knn,filenameKNN)
        y_pred=knn.predict(X_test)
        scores[k] = accuracy_score(y_test,y_pred)
        scores_list.append(accuracy_score(y_test,y_pred))
scores
{1: 0.6296296296296297,
 2: 0.6111111111111112,
 3: 0.6851851851851852,
 4: 0.6296296296296297,
 5: 0.6851851851851852,
 6: 0.6481481481481481,
 7: 0.7407407407407407,
 8: 0.6666666666666666,
 9: 0.7407407407407407,
 10: 0.7222222222222222,
 11: 0.7592592592592593,
 12: 0.7037037037037037,
 13: 0.7407407407407407,
 14: 0.7592592592592593,
 15: 0.7407407407407407,
 16: 0.7592592592592593,
 17: 0.7592592592592593,
 18: 0.7592592592592593,
 19: 0.7407407407407407,
 20: 0.7407407407407407,
 21: 0.7407407407407407,
 22: 0.7222222222222222,
 23: 0.7222222222222222,
 24: 0.7407407407407407,
 25: 0.7222222222222222,
 26: 0.7407407407407407,
 27: 0.7222222222222222,
 28: 0.7407407407407407,
 29: 0.7037037037037037,
 30: 0.7222222222222222}

Visualisasi Score#

%matplotlib inline
import matplotlib.pyplot as plt

#plot the relationship between K and the testing accuracy
plt.plot(k_range,scores_list)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')
Text(0, 0.5, 'Testing Accuracy')
_images/notebooks8_35_1.png

nilai k dengan akurasi tertinggi

scores_list.index(max(scores_list))+1 , max(scores_list)
(11, 0.7592592592592593)
knn = KNeighborsClassifier(n_neighbors=scores_list.index(max(scores_list))+1)
knn.fit(X_train,y_train)
y_pred_knn =knn.predict(X_test)

cm = confusion_matrix(y_test,y_pred_knn)
precision = round(precision_score(y_test,y_pred_knn, average="macro")*100,2)
acc = round(accuracy_score(y_test,y_pred_knn)*100,2)
recall = round(recall_score(y_test,y_pred_knn, average="macro")*100,2)
f1score = round(f1_score(y_test, y_pred_knn, average="macro")*100,2)
print('Konfusi Matrix\n',cm)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(f1score))
print('accuracy: {}'.format(acc))
Konfusi Matrix
 [[25  6]
 [ 7 16]]
precision: 75.43
recall: 75.11
fscore: 75.24
accuracy: 75.93

Implementasi Pada data Input#

Menggunakan KNN dengan nilai K = 11

knn11 = joblib.load('/content/drive/MyDrive/datamining/tugas/model/modelKNN11.pkl')
knn_pred = knn11.predict(pred_input)
knn_pred
array([0])

Inisialisasi K-Means Clustering#

Eksekusi Pada Model#

from sklearn.cluster import KMeans

# #Try running from n=1 through 30 and record testing accuracy
n_range = range(1,31)
akurasi = {}
akurasi_score = []
for k in n_range:
        # install model
        kmeans = KMeans(n_clusters=k,random_state=0)
        kmeans.fit(X_train,y_train)
        # save model
        filenameKMeans = '/content/drive/MyDrive/datamining/tugas/model/modelKMeans'+str(k)+'.pkl'
        joblib.dump(kmeans,filenameKMeans)
        y_pred=kmeans.predict(X_test)
        akurasi[k] = accuracy_score(y_test,y_pred)
        akurasi_score.append(accuracy_score(y_test,y_pred))
akurasi_score
[0.5740740740740741,
 0.4074074074074074,
 0.3888888888888889,
 0.1111111111111111,
 0.14814814814814814,
 0.2222222222222222,
 0.14814814814814814,
 0.16666666666666666,
 0.1111111111111111,
 0.1111111111111111,
 0.09259259259259259,
 0.09259259259259259,
 0.1111111111111111,
 0.1111111111111111,
 0.07407407407407407,
 0.09259259259259259,
 0.0,
 0.037037037037037035,
 0.07407407407407407,
 0.0,
 0.0,
 0.0,
 0.0,
 0.0,
 0.018518518518518517,
 0.018518518518518517,
 0.037037037037037035,
 0.018518518518518517,
 0.018518518518518517,
 0.018518518518518517]

Visualisasi Hasil K-means#

%matplotlib inline
import matplotlib.pyplot as plt

#plot the relationship between K and the testing accuracy
plt.plot(n_range,akurasi_score)
plt.xlabel('Value of K for K-means')
plt.ylabel('Testing Accuracy')
Text(0, 0.5, 'Testing Accuracy')
_images/notebooks8_46_1.png

Nilai n dengan akurasi tertinggi

akurasi_score.index(max(akurasi_score)) , max(akurasi_score)
(0, 0.5740740740740741)

Inisialisasi Decision Tree#

Eksekusi Pada Model#

from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth =5, random_state = 42)
dtc.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=5, random_state=42)

Decision Tree Rules Text#

#import relevant functions
from sklearn.tree import export_text
#export the decision rules
tree_rules = export_text(dtc,
                        feature_names = list(nama_fitur))
#print the result
print(tree_rules)
|--- sex <= 0.50
|   |--- BP <= 0.33
|   |   |--- age <= 0.70
|   |   |   |--- class: 0
|   |   |--- age >  0.70
|   |   |   |--- age <= 0.75
|   |   |   |   |--- class: 1
|   |   |   |--- age >  0.75
|   |   |   |   |--- class: 0
|   |--- BP >  0.33
|   |   |--- age <= 0.53
|   |   |   |--- cholestrol <= 0.41
|   |   |   |   |--- class: 0
|   |   |   |--- cholestrol >  0.41
|   |   |   |   |--- class: 1
|   |   |--- age >  0.53
|   |   |   |--- age <= 0.70
|   |   |   |   |--- cholestrol <= 0.29
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- cholestrol >  0.29
|   |   |   |   |   |--- class: 1
|   |   |   |--- age >  0.70
|   |   |   |   |--- cholestrol <= 0.25
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- cholestrol >  0.25
|   |   |   |   |   |--- class: 0
|--- sex >  0.50
|   |--- age <= 0.51
|   |   |--- cholestrol <= 0.14
|   |   |   |--- class: 1
|   |   |--- cholestrol >  0.14
|   |   |   |--- BP <= 0.39
|   |   |   |   |--- BP <= 0.33
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- BP >  0.33
|   |   |   |   |   |--- class: 0
|   |   |   |--- BP >  0.39
|   |   |   |   |--- cholestrol <= 0.35
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- cholestrol >  0.35
|   |   |   |   |   |--- class: 1
|   |--- age >  0.51
|   |   |--- cholestrol <= 0.27
|   |   |   |--- BP <= 0.51
|   |   |   |   |--- age <= 0.76
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  0.76
|   |   |   |   |   |--- class: 1
|   |   |   |--- BP >  0.51
|   |   |   |   |--- class: 0
|   |   |--- cholestrol >  0.27
|   |   |   |--- BP <= 0.24
|   |   |   |   |--- class: 0
|   |   |   |--- BP >  0.24
|   |   |   |   |--- age <= 0.55
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  0.55
|   |   |   |   |   |--- class: 1

Rules Decision Tree Plot Diagram#

#import relevant packages
from sklearn import tree
import matplotlib.pyplot as plt

#plt the figure, setting a black background
plt.figure(figsize=(30,10), facecolor ='k')
#create the tree plot
a = tree.plot_tree(dtc,feature_names = nama_fitur,class_names = str(y),rounded = True,filled = True,fontsize=14)
#show the plot
plt.show()
_images/notebooks8_55_0.png

Hasil#

dtc_pred = dtc.predict(X_test)
dtc_pred
array([0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0])
cm_dtc = confusion_matrix(y_test,dtc_pred)
precision_dtc = round(precision_score(y_test,dtc_pred, average="macro")*100,2)
acc_dtc = round(accuracy_score(y_test,dtc_pred)*100,2)
recall_dtc = round(recall_score(y_test,dtc_pred, average="macro")*100,2)
f1score_dtc = round(f1_score(y_test, dtc_pred, average="macro")*100,2)
print('Konfusi Matrix\n',cm_dtc)
print('precision: {}'.format(precision_dtc))
print('recall: {}'.format(recall_dtc))
print('fscore: {}'.format(f1score_dtc))
print('accuracy: {}'.format(acc_dtc))
Konfusi Matrix
 [[21 10]
 [11 12]]
precision: 60.09
recall: 59.96
fscore: 60.0
accuracy: 61.11