Studi Kasus Heart Disease 5#

Implementasi dengan menggunakan Naive Bayes , K-NN , K-Means Clustering dan Decision Tree Dengan data tanpa Normalisasi.

Membaca Data#

import pandas as pd

data = pd.read_csv('https://raw.githubusercontent.com/soumya-mishra/Heart-Disease_DT/main/heart_v2.csv')
data
age sex BP cholestrol heart disease
0 70 1 130 322 1
1 67 0 115 564 0
2 57 1 124 261 1
3 64 1 128 263 0
4 74 0 120 269 0
... ... ... ... ... ...
265 52 1 172 199 0
266 44 1 120 263 0
267 56 0 140 294 0
268 57 1 140 192 0
269 67 1 160 286 1

270 rows × 5 columns

Class#

y_class = data['heart disease']
y = y_class.values.tolist()
print(y[:5])
[1, 0, 1, 0, 0]

Drop Target / Class#

X = data.drop(columns='heart disease')
X
age sex BP cholestrol
0 70 1 130 322
1 67 0 115 564
2 57 1 124 261
3 64 1 128 263
4 74 0 120 269
... ... ... ... ...
265 52 1 172 199
266 44 1 120 263
267 56 0 140 294
268 57 1 140 192
269 67 1 160 286

270 rows × 4 columns

Preprocessing Min-Max#

Normalisasi data menggunakan Min - Max

# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# scaled = scaler.fit_transform(X)
# nama_fitur = X.columns.copy()
# scaled_fitur = pd.DataFrame(scaled,columns=nama_fitur)
# scaled_fitur

Save Normalisasi#

import joblib
# filename = '/content/drive/MyDrive/datamining/tugas/cobamodel/norm.sav'
# joblib.dump(scaler, filename) 
from google.colab import drive
drive.mount('/content/drive')
KeyboardInterruptTraceback (most recent call last)
<ipython-input-6-d5df0069828e> in <module>
      1 from google.colab import drive
----> 2 drive.mount('/content/drive')

/usr/local/lib/python3.7/dist-packages/google/colab/drive.py in mount(mountpoint, force_remount, timeout_ms, readonly)
    104       timeout_ms=timeout_ms,
    105       ephemeral=True,
--> 106       readonly=readonly)
    107 
    108 

/usr/local/lib/python3.7/dist-packages/google/colab/drive.py in _mount(mountpoint, force_remount, timeout_ms, ephemeral, readonly)
    123   if ephemeral:
    124     _message.blocking_request(
--> 125         'request_auth', request={'authType': 'dfs_ephemeral'}, timeout_sec=None)
    126 
    127   mountpoint = _os.path.expanduser(mountpoint)

/usr/local/lib/python3.7/dist-packages/google/colab/_message.py in blocking_request(request_type, request, timeout_sec, parent)
    169   request_id = send_request(
    170       request_type, request, parent=parent, expect_reply=True)
--> 171   return read_reply_from_input(request_id, timeout_sec)

/usr/local/lib/python3.7/dist-packages/google/colab/_message.py in read_reply_from_input(message_id, timeout_sec)
     95     reply = _read_next_input_message()
     96     if reply == _NOT_READY or not isinstance(reply, dict):
---> 97       time.sleep(0.025)
     98       continue
     99     if (reply.get('type') == 'colab_reply' and

KeyboardInterrupt: 

Split Data#

split data 20%

from sklearn.model_selection import train_test_split
from pathlib import Path  
X_train, X_test, y_train, y_test=train_test_split(X, y, test_size=0.2, random_state=1)
# save data test

filepath = Path('/content/drive/MyDrive/datamining/tugas/cobamodel/data_test.csv')  
filepath.parent.mkdir(parents=True, exist_ok=True) 
X_test.to_csv(filepath)

Inisialisasi Model Naive Bayes (gaussian)#

Eksekusi pada Model#

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
probas = clf.predict_proba(X_test)[:,1]
y_pred
array([0, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1,
       0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 1, 0, 0])

Save Model Naive bayes#

filenameNB = '/content/drive/MyDrive/datamining/tugas/cobamodel/modelNB.pkl'
joblib.dump(clf,filenameNB)
['/content/drive/MyDrive/datamining/tugas/cobamodel/modelNB.pkl']

Menghitung Probas#

probas
array([0.45679169, 0.71532652, 0.51497473, 0.1421692 , 0.60325819,
       0.56976119, 0.6439957 , 0.55676177, 0.44854341, 0.65794967,
       0.55449043, 0.62978494, 0.5348809 , 0.19771477, 0.60989107,
       0.84122388, 0.22205409, 0.63114756, 0.51273691, 0.0737765 ,
       0.61553186, 0.50078683, 0.4626016 , 0.24756263, 0.17061052,
       0.8796775 , 0.71289765, 0.15237092, 0.18455427, 0.57164631,
       0.74028463, 0.46144141, 0.36198339, 0.58321659, 0.32050968,
       0.29148192, 0.14421451, 0.16577495, 0.16713078, 0.44960412,
       0.28133516, 0.73285752, 0.49066029, 0.28033441, 0.68523387,
       0.58784757, 0.42924383, 0.38647993, 0.51074407, 0.64821014,
       0.35333704, 0.60848408, 0.08168293, 0.213874  ])

Menghitung Hasil Akhir#

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
cm = confusion_matrix(y_test,y_pred)
precision = round(precision_score(y_test,y_pred, average="macro")*100,2)
acc_nb = round(accuracy_score(y_test,y_pred)*100,2)
recall = round(recall_score(y_test,y_pred, average="macro")*100,2)
f1score = round(f1_score(y_test, y_pred, average="macro")*100,2)
print('Konfusi Matrix\n',cm)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(f1score))
print('accuracy: {}'.format(acc_nb))
Konfusi Matrix
 [[22  9]
 [ 5 18]]
precision: 74.07
recall: 74.61
fscore: 73.93
accuracy: 74.07

Predict Input To Naive Bayes Model#

list_input = []
list_input.append('60 1 500 322'.split())
list_input.append('50 0 120 289'.split())
list_input.append('70 1 130 322'.split())
list_input.append('67 0 115 564'.split())
list_input
[['60', '1', '500', '322'],
 ['50', '0', '120', '289'],
 ['70', '1', '130', '322'],
 ['67', '0', '115', '564']]

input to Model Normalisasi

norm = joblib.load(filename)
pred_input = norm.fit_transform(list_input)
pred_input=pd.DataFrame(pred_input,columns=nama_fitur)
pred_input
age sex BP cholestrol
0 0.50 1.0 1.000000 0.12
1 0.00 0.0 0.012987 0.00
2 1.00 1.0 0.038961 0.12
3 0.85 0.0 0.000000 1.00

Input to Model Naive Bayes

nb = joblib.load(filenameNB)
input_pred = nb.predict(pred_input)
input_pred
array([0, 0, 0, 0])

Inisialisasi Model KNN#

Eksekusi Pada model#

from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics
#Try running from k=1 through 30 and record testing accuracy
k_range = range(1,31)
scores = {}
scores_list = []
for k in k_range:
        # install model
        knn = KNeighborsClassifier(n_neighbors=k)
        knn.fit(X_train,y_train)
        # save model
        filenameKNN = '/content/drive/MyDrive/datamining/tugas/cobamodel/modelKNN'+str(k)+'.pkl'
        joblib.dump(knn,filenameKNN)
        y_pred=knn.predict(X_test)
        scores[k] = accuracy_score(y_test,y_pred)
        scores_list.append(accuracy_score(y_test,y_pred))
scores
{1: 0.6481481481481481,
 2: 0.5740740740740741,
 3: 0.6111111111111112,
 4: 0.6111111111111112,
 5: 0.6481481481481481,
 6: 0.6111111111111112,
 7: 0.5925925925925926,
 8: 0.5925925925925926,
 9: 0.5925925925925926,
 10: 0.6296296296296297,
 11: 0.6111111111111112,
 12: 0.6111111111111112,
 13: 0.7037037037037037,
 14: 0.6666666666666666,
 15: 0.7037037037037037,
 16: 0.6481481481481481,
 17: 0.6481481481481481,
 18: 0.6296296296296297,
 19: 0.6111111111111112,
 20: 0.6296296296296297,
 21: 0.5925925925925926,
 22: 0.6111111111111112,
 23: 0.5925925925925926,
 24: 0.6111111111111112,
 25: 0.5925925925925926,
 26: 0.5740740740740741,
 27: 0.5740740740740741,
 28: 0.5740740740740741,
 29: 0.5555555555555556,
 30: 0.5740740740740741}

Visualisasi Score#

%matplotlib inline
import matplotlib.pyplot as plt

#plot the relationship between K and the testing accuracy
plt.plot(k_range,scores_list)
plt.xlabel('Value of K for KNN')
plt.ylabel('Testing Accuracy')
Text(0, 0.5, 'Testing Accuracy')
_images/Salinan8_36_1.png

nilai k dengan akurasi tertinggi

scores_list.index(max(scores_list))+1 , max(scores_list)
(13, 0.7037037037037037)
knn = KNeighborsClassifier(n_neighbors=scores_list.index(max(scores_list))+1)
knn.fit(X_train,y_train)
y_pred_knn =knn.predict(X_test)

cm = confusion_matrix(y_test,y_pred_knn)
precision = round(precision_score(y_test,y_pred_knn, average="macro")*100,2)
acc = round(accuracy_score(y_test,y_pred_knn)*100,2)
recall = round(recall_score(y_test,y_pred_knn, average="macro")*100,2)
f1score = round(f1_score(y_test, y_pred_knn, average="macro")*100,2)
print('Konfusi Matrix\n',cm)
print('precision: {}'.format(precision))
print('recall: {}'.format(recall))
print('fscore: {}'.format(f1score))
print('accuracy: {}'.format(acc))
Konfusi Matrix
 [[23  8]
 [ 8 15]]
precision: 69.71
recall: 69.71
fscore: 69.71
accuracy: 70.37

Implementasi Pada data Input#

Menggunakan KNN dengan nilai K = 11

knn11 = joblib.load('/content/drive/MyDrive/datamining/tugas/cobamodel/modelKNN11.pkl')
knn_pred = knn11.predict(pred_input)
knn_pred
array([0, 0, 0, 0])

Inisialisasi K-Means Clustering#

Eksekusi Pada Model#

from sklearn.cluster import KMeans

# #Try running from n=1 through 30 and record testing accuracy
n_range = range(1,31)
akurasi = {}
akurasi_score = []
for k in n_range:
        # install model
        kmeans = KMeans(n_clusters=k,random_state=0)
        kmeans.fit(X_train,y_train)
        # save model
        filenameKMeans = '/content/drive/MyDrive/datamining/tugas/cobamodel/modelKMeans'+str(k)+'.pkl'
        joblib.dump(kmeans,filenameKMeans)
        y_pred=kmeans.predict(X_test)
        akurasi[k] = accuracy_score(y_test,y_pred)
        akurasi_score.append(accuracy_score(y_test,y_pred))
akurasi_score
[0.5740740740740741,
 0.6111111111111112,
 0.2962962962962963,
 0.1111111111111111,
 0.2037037037037037,
 0.35185185185185186,
 0.24074074074074073,
 0.12962962962962962,
 0.25925925925925924,
 0.09259259259259259,
 0.24074074074074073,
 0.09259259259259259,
 0.07407407407407407,
 0.037037037037037035,
 0.14814814814814814,
 0.037037037037037035,
 0.07407407407407407,
 0.037037037037037035,
 0.037037037037037035,
 0.1111111111111111,
 0.07407407407407407,
 0.037037037037037035,
 0.018518518518518517,
 0.037037037037037035,
 0.018518518518518517,
 0.018518518518518517,
 0.018518518518518517,
 0.037037037037037035,
 0.0,
 0.018518518518518517]

Visualisasi Hasil K-means#

%matplotlib inline
import matplotlib.pyplot as plt

#plot the relationship between K and the testing accuracy
plt.plot(n_range,akurasi_score)
plt.xlabel('Value of K for K-means')
plt.ylabel('Testing Accuracy')
Text(0, 0.5, 'Testing Accuracy')
_images/Salinan8_47_1.png

Nilai n dengan akurasi tertinggi

akurasi_score.index(max(akurasi_score)) , max(akurasi_score)
(1, 0.6111111111111112)

Inisialisasi Decision Tree#

Eksekusi Pada Model#

from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier(max_depth =5, random_state = 42)
dtc.fit(X_train, y_train)
DecisionTreeClassifier(max_depth=5, random_state=42)

Decision Tree Rules Text#

#import relevant functions
from sklearn.tree import export_text
#export the decision rules
tree_rules = export_text(dtc,
                        feature_names = list(nama_fitur))
#print the result
print(tree_rules)
|--- sex <= 0.50
|   |--- BP <= 129.00
|   |   |--- age <= 62.50
|   |   |   |--- class: 0
|   |   |--- age >  62.50
|   |   |   |--- age <= 65.00
|   |   |   |   |--- class: 1
|   |   |   |--- age >  65.00
|   |   |   |   |--- class: 0
|   |--- BP >  129.00
|   |   |--- age <= 54.50
|   |   |   |--- cholestrol <= 304.50
|   |   |   |   |--- class: 0
|   |   |   |--- cholestrol >  304.50
|   |   |   |   |--- class: 1
|   |   |--- age >  54.50
|   |   |   |--- age <= 62.50
|   |   |   |   |--- cholestrol <= 254.00
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- cholestrol >  254.00
|   |   |   |   |   |--- class: 1
|   |   |   |--- age >  62.50
|   |   |   |   |--- cholestrol <= 233.50
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- cholestrol >  233.50
|   |   |   |   |   |--- class: 0
|--- sex >  0.50
|   |--- age <= 53.50
|   |   |--- cholestrol <= 187.00
|   |   |   |--- class: 1
|   |   |--- cholestrol >  187.00
|   |   |   |--- BP <= 135.00
|   |   |   |   |--- BP <= 129.00
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- BP >  129.00
|   |   |   |   |   |--- class: 0
|   |   |   |--- BP >  135.00
|   |   |   |   |--- cholestrol <= 280.00
|   |   |   |   |   |--- class: 0
|   |   |   |   |--- cholestrol >  280.00
|   |   |   |   |   |--- class: 1
|   |--- age >  53.50
|   |   |--- cholestrol <= 245.50
|   |   |   |--- BP <= 148.00
|   |   |   |   |--- age <= 65.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  65.50
|   |   |   |   |   |--- class: 1
|   |   |   |--- BP >  148.00
|   |   |   |   |--- class: 0
|   |   |--- cholestrol >  245.50
|   |   |   |--- BP <= 119.00
|   |   |   |   |--- class: 0
|   |   |   |--- BP >  119.00
|   |   |   |   |--- age <= 55.50
|   |   |   |   |   |--- class: 1
|   |   |   |   |--- age >  55.50
|   |   |   |   |   |--- class: 1

Rules Decision Tree Plot Diagram#

#import relevant packages
from sklearn import tree
import matplotlib.pyplot as plt

#plt the figure, setting a black background
plt.figure(figsize=(30,10), facecolor ='k')
#create the tree plot
a = tree.plot_tree(dtc,feature_names = nama_fitur,class_names = str(y),rounded = True,filled = True,fontsize=14)
#show the plot
plt.show()
_images/Salinan8_56_0.png

Hasil#

dtc_pred = dtc.predict(X_test)
dtc_pred
array([0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       1, 1, 1, 0, 0, 1, 0, 1, 1, 0])
cm_dtc = confusion_matrix(y_test,dtc_pred)
precision_dtc = round(precision_score(y_test,dtc_pred, average="macro")*100,2)
acc_dtc = round(accuracy_score(y_test,dtc_pred)*100,2)
recall_dtc = round(recall_score(y_test,dtc_pred, average="macro")*100,2)
f1score_dtc = round(f1_score(y_test, dtc_pred, average="macro")*100,2)
print('Konfusi Matrix\n',cm_dtc)
print('precision: {}'.format(precision_dtc))
print('recall: {}'.format(recall_dtc))
print('fscore: {}'.format(f1score_dtc))
print('accuracy: {}'.format(acc_dtc))
Konfusi Matrix
 [[21 10]
 [11 12]]
precision: 60.09
recall: 59.96
fscore: 60.0
accuracy: 61.11