In [21]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from tensorflow.keras.utils import to_categorical
import numpy as np
In [2]:
# Load the stroke dataset from a CSV file expected in the notebook's working directory.
stroke_df = pd.read_csv("healthcare-dataset-stroke-data.csv")
# Preview the first rows to sanity-check the load.
stroke_df.head()
Out[2]:
In [3]:
# Remove the 'id' column and every row that contains a missing value.
# The result is reassigned instead of mutating in place, and errors='ignore'
# makes the drop a no-op when 'id' is already gone, so re-running this cell
# no longer raises KeyError.
stroke_df = stroke_df.drop(columns=['id'], errors='ignore').dropna()
In [4]:
# Show how many rows fall into each work_type category.
stroke_df["work_type"].value_counts()
Out[4]:
In [5]:
# Keep only rows whose gender is not 'Other'.
# .copy() makes the filtered frame an independent object, so later edits to it
# are not treated as writes to a slice of the unfiltered frame
# (avoids SettingWithCopyWarning on older pandas versions).
stroke_df = stroke_df[stroke_df["gender"] != 'Other'].copy()
# Confirm which gender categories remain and how many rows each has.
stroke_df["gender"].value_counts()
Out[5]:
In [6]:
# Integer-encode the categorical columns.
# All mappings are applied in one DataFrame.replace call and the result is
# reassigned. The previous form, stroke_df[col].replace(..., inplace=True), is
# chained assignment: under pandas Copy-on-Write it modifies a temporary Series
# and leaves stroke_df unchanged. Values not listed in a mapping are left as-is.
category_codes = {
    # Male: 0, Female: 1
    'gender': {'Male': 0, 'Female': 1},
    # Married: 1, not married: 0
    'ever_married': {'Yes': 1, 'No': 0},
    # Urban: 0, Rural: 1
    'Residence_type': {'Urban': 0, 'Rural': 1},
    # Smokes: 0, formerly smoked: 1, never smoked: 2, Unknown: 3
    'smoking_status': {'smokes': 0, 'formerly smoked': 1,
                       'never smoked': 2, 'Unknown': 3},
    # Private: 0, Self-employed: 1, children: 2, Govt_job: 3, Never_worked: 4
    'work_type': {'Private': 0, 'Self-employed': 1,
                  'children': 2, 'Govt_job': 3, 'Never_worked': 4},
}
stroke_df = stroke_df.replace(category_codes)
# Display the first 50 rows to check the encoding visually.
stroke_df.head(50)
Out[6]:
In [7]:
# Display the first rows of stroke_df in its current state.
stroke_df.head()
Out[7]:
In [46]:
# One-hot encode stroke_df with pd.get_dummies, then shorten the names of the
# generated indicator columns. Keys that match no column are ignored by rename.
# NOTE(review): an earlier cell integer-encodes gender, ever_married,
# Residence_type, smoking_status and work_type in stroke_df. If no string
# columns remain by the time this cell runs, get_dummies adds no indicator
# columns and none of the names below match -- confirm which frame this cell
# was meant to operate on.
dummy_column_names = {
    "gender_Female": "female",
    "gender_Male": "male",
    "ever_married_No": "unmarried",
    "ever_married_Yes": "married",
    "Residence_type_Urban": "Urban",
    "Residence_type_Rural": "Rural",
    "smoking_status_smokes": "smokes",
    "smoking_status_never smoked": "never_smoked",
    "smoking_status_formerly smoked": "formerly smoked",
    "smoking_status_Unknown": "unknown_smoker",
    "work_type_Private": "private_worker",
    "work_type_Self-employed": "self_employed",
    "work_type_children": "child_non_worker",
    "work_type_Govt_job": "govt_job",
    "work_type_Never_worked": "never_worked",
}
dummies_stroke_df = pd.get_dummies(stroke_df).rename(columns=dummy_column_names)
dummies_stroke_df.head(130)
Out[46]:
In [39]:
# Convert the frame to a NumPy array and split it column-wise:
# every column except the last becomes the feature matrix X,
# and the last column becomes the target vector y.
data = stroke_df.values
X, y = data[:, :-1], data[:, -1]
In [64]:
# Show the first row of the feature matrix, i.e. the feature values in column order.
X[0]
Out[64]:
In [91]:
# A single hand-written sample (one row, ten values) to score with the model.
# NOTE(review): the values are presumably in the same column order as X --
# confirm against the feature columns of stroke_df.
individual_test = np.array(
    [[0.0, 60.0, 0.0, 0.0, 1.0, 1.0, 0.0, 180.12, 23.5, 1.0]]
)
In [58]:
# Split features and target into train and test subsets using train_test_split's
# default split proportions; random_state=42 makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
In [59]:
# Fit a 200-tree random forest on the training split.
# random_state is fixed so the score, feature importances and predicted
# probabilities are the same on every run of the notebook.
rf = RandomForestClassifier(n_estimators=200, random_state=42)
# Pass the array itself: X_train.data is the ndarray's raw memory buffer
# (a memoryview), not the feature matrix, and was only working by being
# converted back into an array inside fit().
rf.fit(X_train, y_train)
# Mean accuracy on the held-out test split.
rf.score(X_test, y_test)
Out[59]:
In [60]:
# Feature importances of the fitted forest, one value per column of X.
importances = rf.feature_importances_
importances
Out[60]:
In [61]:
# Column labels of stroke_df as a plain Python list, in frame order.
column_names = list(stroke_df.columns)
In [62]:
# Pair each importance with the column name at the same position, leaving out
# the last column name (that column was split off as the target y), and list
# the pairs from most to least important.
importance_name_pairs = zip(rf.feature_importances_, column_names[:-1])
sorted(importance_name_pairs, reverse=True)
Out[62]:
In [92]:
# Predicted class probabilities from the fitted forest for the hand-written sample.
rf.predict_proba(individual_test)
Out[92]:
In [ ]: