stroke_machine_learning
In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from tensorflow.keras.utils import to_categorical
import numpy as np
In [2]:
stroke_df = pd.read_csv("healthcare-dataset-stroke-data.csv")
stroke_df.head()
Out[2]:
id gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 9046 Male 67.0 0 1 Yes Private Urban 228.69 36.6 formerly smoked 1
1 51676 Female 61.0 0 0 Yes Self-employed Rural 202.21 NaN never smoked 1
2 31112 Male 80.0 0 1 Yes Private Rural 105.92 32.5 never smoked 1
3 60182 Female 49.0 0 0 Yes Private Urban 171.23 34.4 smokes 1
4 1665 Female 79.0 1 0 Yes Self-employed Rural 174.12 24.0 never smoked 1
In [3]:
stroke_df.drop(columns=['id'], inplace = True)
stroke_df.dropna(inplace=True)
In [4]:
stroke_df.work_type.value_counts()
Out[4]:
Private          2811
Self-employed     775
children          671
Govt_job          630
Never_worked       22
Name: work_type, dtype: int64
In [5]:
stroke_df = stroke_df[stroke_df.gender != 'Other']
stroke_df.gender.value_counts()
Out[5]:
Female    2897
Male      2011
Name: gender, dtype: int64
In [6]:
#replace male with 0, female with 1
stroke_df['gender'].replace({'Male': 0, 'Female':1}, inplace=True)

#replace married with 1, not married with 0
stroke_df['ever_married'].replace({'Yes': 1, 'No':0}, inplace=True)

#Replace Urban with 0, Rural with 1
stroke_df['Residence_type'].replace({'Urban': 0, 'Rural':1}, inplace=True)

#Replace Smoker : 0, Former Smoker : 1, Never smoked : 2, Unknown : 3
stroke_df['smoking_status'].replace({'smokes': 0, 'formerly smoked':1, 
                                    'never smoked': 2, 'Unknown':3}, inplace=True)

#Replace Work type. Private :0, Self-employed : 1, child : 2, Govt-job :3, Never worked : 4
stroke_df['work_type'].replace({'Private': 0, 'Self-employed':1, 
                                    'children': 2, 'Govt_job':3, 'Never_worked' : 4}, inplace=True)


stroke_df.head(50)
Out[6]:
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 0 67.0 0 1 1 0 0 228.69 36.6 1 1
2 0 80.0 0 1 1 0 1 105.92 32.5 2 1
3 1 49.0 0 0 1 0 0 171.23 34.4 0 1
4 1 79.0 1 0 1 1 1 174.12 24.0 2 1
5 0 81.0 0 0 1 0 0 186.21 29.0 1 1
6 0 74.0 1 1 1 0 1 70.09 27.4 2 1
7 1 69.0 0 0 0 0 0 94.39 22.8 2 1
9 1 78.0 0 0 1 0 0 58.57 24.2 3 1
10 1 81.0 1 0 1 0 1 80.43 29.7 2 1
11 1 61.0 0 1 1 3 1 120.46 36.8 0 1
12 1 54.0 0 0 1 0 0 104.51 27.3 0 1
14 1 79.0 0 1 1 0 0 214.09 28.2 2 1
15 1 50.0 1 0 1 1 1 167.41 30.9 2 1
16 0 64.0 0 1 1 0 0 191.61 37.5 0 1
17 0 75.0 1 0 1 0 0 221.29 25.8 0 1
18 1 60.0 0 0 0 0 0 89.22 37.8 2 1
20 1 71.0 0 0 1 3 1 193.94 22.4 0 1
21 1 52.0 1 0 1 1 0 233.29 48.9 2 1
22 1 79.0 0 0 1 1 0 228.70 26.6 2 1
23 0 82.0 0 1 1 0 1 208.30 32.5 3 1
24 0 71.0 0 0 1 0 0 102.87 27.2 1 1
25 0 80.0 0 0 1 1 1 104.12 23.5 2 1
26 1 65.0 0 0 1 0 1 100.98 28.2 1 1
28 0 69.0 0 1 1 1 0 195.23 28.3 0 1
30 0 57.0 1 0 1 0 0 212.08 44.2 0 1
31 0 42.0 0 0 1 0 1 83.41 25.4 3 1
32 1 82.0 1 0 1 1 0 196.92 22.2 2 1
33 0 80.0 0 1 1 1 0 252.72 30.5 1 1
34 0 48.0 0 0 0 3 0 84.20 29.7 2 1
35 1 82.0 1 1 0 0 1 84.03 26.5 1 1
36 0 74.0 0 0 1 0 1 219.72 33.7 1 1
37 1 72.0 1 0 1 0 1 74.63 23.1 1 1
38 0 58.0 0 0 0 0 1 92.62 32.0 3 1
39 1 49.0 0 0 1 0 0 60.91 29.9 2 1
40 0 78.0 0 0 1 0 1 78.03 23.9 1 1
41 0 54.0 0 0 1 0 0 71.22 28.5 2 1
42 0 82.0 0 1 1 0 0 144.90 26.4 0 1
44 0 60.0 1 0 1 3 0 213.03 20.2 0 1
45 0 76.0 1 0 1 0 1 243.58 33.6 2 1
47 1 58.0 0 0 1 0 0 107.26 38.6 1 1
48 0 81.0 0 0 1 1 0 99.33 33.7 2 1
49 1 39.0 1 0 1 0 1 58.09 39.2 0 1
52 1 79.0 0 1 1 0 1 127.29 27.7 2 1
53 1 77.0 1 0 1 1 0 124.13 31.4 2 1
55 0 63.0 0 1 1 0 1 196.71 36.5 1 1
56 1 82.0 0 0 1 0 1 59.32 33.2 2 1
58 0 73.0 1 0 1 1 0 194.99 32.8 2 1
59 1 54.0 1 0 1 3 0 180.93 27.7 2 1
60 1 56.0 0 0 1 0 0 185.17 40.4 1 1
61 1 80.0 1 0 1 0 1 74.90 22.2 2 1
In [7]:
stroke_df.head()
Out[7]:
gender age hypertension heart_disease ever_married work_type Residence_type avg_glucose_level bmi smoking_status stroke
0 0 67.0 0 1 1 0 0 228.69 36.6 1 1
2 0 80.0 0 1 1 0 1 105.92 32.5 2 1
3 1 49.0 0 0 1 0 0 171.23 34.4 0 1
4 1 79.0 1 0 1 1 1 174.12 24.0 2 1
5 0 81.0 0 0 1 0 0 186.21 29.0 1 1
In [46]:
dummies_stroke_df = pd.get_dummies(stroke_df)
dummies_stroke_df.rename(columns={"gender_Female":"female","gender_Male":"male", 
                                  "ever_married_No":"unmarried","ever_married_Yes":"married",
                                 "Residence_type_Urban":"Urban", "Residence_type_Rural":"Rural",
                                 "smoking_status_smokes": "smokes", "smoking_status_never smoked": "never_smoked",
                                  "smoking_status_formerly smoked": "formerly smoked",
                                  "smoking_status_Unknown": "unknown_smoker",
                                  "work_type_Private" : "private_worker", "work_type_Self-employed" : "self_employed",
                                  "work_type_children" : "child_non_worker", "work_type_Govt_job" : "govt_job",
                                  "work_type_Never_worked" : "never_worked"
                                 }, inplace =True)
dummies_stroke_df.head(130)
Out[46]:
age hypertension heart_disease avg_glucose_level bmi stroke female male unmarried married ... never_worked private_worker self_employed child_non_worker Rural Urban unknown_smoker formerly smoked never_smoked smokes
0 67.0 0 1 228.69 36.6 1 0 1 0 1 ... 0 1 0 0 0 1 0 1 0 0
2 80.0 0 1 105.92 32.5 1 0 1 0 1 ... 0 1 0 0 1 0 0 0 1 0
3 49.0 0 0 171.23 34.4 1 1 0 0 1 ... 0 1 0 0 0 1 0 0 0 1
4 79.0 1 0 174.12 24.0 1 1 0 0 1 ... 0 0 1 0 1 0 0 0 1 0
5 81.0 0 0 186.21 29.0 1 0 1 0 1 ... 0 1 0 0 0 1 0 1 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
149 70.0 0 1 239.07 26.1 1 1 0 0 1 ... 0 1 0 0 1 0 0 0 1 0
151 68.0 0 1 223.83 31.9 1 0 1 0 1 ... 0 1 0 0 0 1 0 1 0 0
152 80.0 0 0 76.57 34.1 1 1 0 0 1 ... 0 0 1 0 0 1 0 0 1 0
153 68.0 0 0 77.82 27.5 1 0 1 0 1 ... 0 0 1 0 0 1 0 0 0 1
154 55.0 0 0 92.98 25.6 1 1 0 0 1 ... 0 0 1 0 1 0 0 0 1 0

130 rows × 21 columns

In [39]:
data = stroke_df.values

X = data[:, 0:-1]
y = data[:, -1]
In [64]:
X[0]
Out[64]:
array([  1.  ,  67.  ,   0.  ,   1.  ,   1.  ,   0.  ,   1.  , 228.69,
        36.6 ,   1.  ])
In [91]:
individual_test = np.array([[  0.  ,  60.  ,   0.  ,   0.  ,   1.  ,   1.  ,   0.  , 180.12,
        23.5 ,   1.  ]])
In [58]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
In [59]:
rf = RandomForestClassifier(n_estimators=200)
rf = rf.fit(X_train.data, y_train)
rf.score(X_test, y_test)
Out[59]:
0.9494702526487367
In [60]:
importances = rf.feature_importances_
importances
Out[60]:
array([0.03205654, 0.22055866, 0.02308919, 0.02170081, 0.01521427,
       0.04883788, 0.0316753 , 0.29692578, 0.24064122, 0.06930033])
In [61]:
column_names = list(stroke_df.columns.values)
In [62]:
sorted(zip(rf.feature_importances_, column_names[0:-1]), reverse=True)
Out[62]:
[(0.2969257802742991, 'avg_glucose_level'),
 (0.24064122113899675, 'bmi'),
 (0.22055866114766043, 'age'),
 (0.06930033413476237, 'smoking_status'),
 (0.04883788011262535, 'work_type'),
 (0.03205654357216044, 'gender'),
 (0.031675304837187376, 'Residence_type'),
 (0.023089194471989545, 'hypertension'),
 (0.02170081088602318, 'heart_disease'),
 (0.0152142694242955, 'ever_married')]
In [92]:
rf.predict_proba(individual_test)
Out[92]:
array([[0.87, 0.13]])
In [ ]: