#Load packages
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#Load data
data = pd.read_csv('FraminghamClean.csv')
data.head()
Look for correlations
#-------------------------
#-------------------------
#Partition the data into 80% training and 20% testing (you can also do this using train_test_split from sklearn)
np.random.seed(1)
mask = np.random.rand(len(data)) < 0.8
dfTrain = data[mask]
dfTest = data[~mask]
#Train a logistic regression for explanation with all features
Y = dfTrain['TenYearCHD']
X = dfTrain.drop(['TenYearCHD'],axis=1)
model = sm.Logit(Y, sm.add_constant(X))
result = model.fit()
result.summary()
#Train an explanation model with the Framinham Risk Score Features
Y = dfTrain['TenYearCHD']
X = dfTrain[['age','male','diabetes','sysBP','totChol','currentSmoker']]
model = sm.Logit(Y, sm.add_constant(X))
result = model.fit()
result.summary()
#Train with Qmodel features
Y_Q = dfTrain['TenYearCHD']
X_Q = dfTrain[['age','male','currentSmoker','diabetes','prevalentHyp']]
Qmodel = sm.Logit(Y_Q, sm.add_constant(X_Q))
Qresult = Qmodel.fit()
Qresult.summary()
Train a model with the DModel features from class
#-------------------------
#-------------------------
#Get test sets for each of the Q and D models
Xtest_Q = dfTest[['age','male','currentSmoker','diabetes','prevalentHyp']]
Xtest_D = dfTest[['age','male','cigsPerDay','glucose','sysBP']]
#Predict test set probabilities
yhatQ = Qmodel.predict(Qresult.params,sm.add_constant(Xtest_Q))
yhatD = Dmodel.predict(Dresult.params,sm.add_constant(Xtest_D))
#Plot the ROC curve
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score
from matplotlib.pyplot import plot, show, savefig, xlim, figure, \
ylim, legend, boxplot, setp, axes
#Compute all points along the curve (by varying the threshold and determining the FPR and TPR)
fprQ, tprQ, threshQ = roc_curve(dfTest.TenYearCHD, yhatQ)
roc_auc_Q= roc_auc_score(dfTest.TenYearCHD, yhatQ)
fprD, tprD, threshD = roc_curve(dfTest.TenYearCHD, yhatD)
roc_auc_D= roc_auc_score(dfTest.TenYearCHD, yhatD)
#Plot the ROC curves!
fig = figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fprQ, tprQ, label='QModel (AUC = %0.2f)' % roc_auc_Q)
plt.plot(fprD, tprD, label='DModel (AUC = %0.2f)' % roc_auc_D)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right');
plt.show()
#fig.savefig('QvsD_ModelROC.pdf', bbox_inches='tight')
#Manually round the predictions according to the threshold with a TPR of 0.8
#Dmodel
yhatDround = []
for i in yhatD:
if i > threshD[next(x[0] for x in enumerate(tprD) if x[1] > 0.8)]:
yhatDround.append(1)
else:
yhatDround.append(0)
#Qmodel
yhatQround = []
for i in yhatQ:
if i > threshQ[next(x[0] for x in enumerate(tprQ) if x[1] > 0.8)]:
yhatQround.append(1)
else:
yhatQround.append(0)
#Print the thresholds corresponding to the above rounding scheme
print(threshQ[next(x[0] for x in enumerate(tprQ) if x[1] > 0.8)])
print(threshD[next(x[0] for x in enumerate(tprQ) if x[1] > 0.8)])
#Create a confusion matrix for D model
from sklearn.metrics import confusion_matrix
#Get the confusion matrix
cMatrix = confusion_matrix(y_true = dfTest.TenYearCHD, y_pred = yhatDround)
#Plot the confusion matrix
fig = figure(figsize=(10, 6))
plt.imshow(cMatrix, cmap=plt.cm.Blues)
plt.text(0, 0, '{}'.format(cMatrix[0, 0]), horizontalalignment='center',fontsize = 'xx-large')
plt.text(0, 1, '{}'.format(cMatrix[1, 0]), horizontalalignment='center',fontsize = 'xx-large')
plt.text(1, 0, '{}'.format(cMatrix[0, 1]), horizontalalignment='center',fontsize = 'xx-large')
plt.text(1, 1, '{}'.format(cMatrix[1, 1]), horizontalalignment='center',fontsize = 'xx-large')
tick_marks = [0,1]
labels = ['Healthy', 'Heart Disease']
plt.xticks(tick_marks, labels, rotation=90,fontsize = 'x-large')
plt.ylim([-0.5,1.5])
plt.yticks(tick_marks, labels,fontsize = 'x-large')
plt.ylabel('True label',fontsize = 'xx-large')
plt.xlabel('Predicted label',fontsize = 'xx-large')
plt.show()
#fig.savefig('Dmodel_CM.pdf', bbox_inches='tight')
Create the confusion matrix for the Q model
#Create the confusion matrix for the Q model
#Train a regularized logistic regression model using grid search
from sklearn.model_selection import GridSearchCV
#Pick a model
model = LogisticRegression(solver='liblinear')
#Choose some hyperparameters to search over
params = {'C':[0.0001,0.001,0.01,0.1,1,10,100,250,500,750,1000],'penalty':['l1','l2']}
#Run gridsearch (there is also a specific grid search function for logistic regression - LogisticRegressionCV)
clf = GridSearchCV(model,params, cv=10,scoring='roc_auc') #10 fold CV with AUC as the accuracy metric
clf.fit(dfTrain.drop(['TenYearCHD'],axis=1),dfTrain.TenYearCHD)
#Print out the results
clf.cv_results_
#Plot the grid search results
def plot_grid_search(cv_results, grid_param_1, grid_param_2, name_param_1, name_param_2):
# Get Test Scores Mean and std for each grid search
scores_mean = cv_results['mean_test_score']
scores_mean = np.array(scores_mean).reshape(len(grid_param_2),len(grid_param_1))
scores_sd = cv_results['std_test_score']
scores_sd = np.array(scores_sd).reshape(len(grid_param_2),len(grid_param_1))
# Plot Grid search scores
_, ax = plt.subplots(1,1)
# Param1 is the X-axis, Param 2 is represented as a different curve (color line)
for idx, val in enumerate(grid_param_2):
ax.plot(grid_param_1, scores_mean[idx,:], '-o', label= name_param_2 + ': ' + str(val))
ax.set_xlabel(name_param_1, fontsize=16)
ax.set_ylabel('Average AUC', fontsize=16)
ax.legend(loc="best", fontsize=15)
ax.grid('on')
plt.show()
# Calling Method
plot_grid_search(clf.cv_results_, params['C'], params['penalty'], 'Hyperparameter value (C)', 'Regularization type')
Pick the best model and initalize as 'model'. See here for documentation.
#Pick and define the best model
model = LogisticRegression(solver = 'liblinear',penalty='l2',C=10)
#Train the model
model.fit(dfTrain.drop(['TenYearCHD'],axis=1),dfTrain.TenYearCHD)
#Predict test set probabilities
preds = model.predict_proba(dfTest.drop(['TenYearCHD'],axis=1)).T[1]
#Create the ROC curve
fprLR, tprLR, threshLR = roc_curve(dfTest.TenYearCHD, preds)
roc_auc_LR= roc_auc_score(dfTest.TenYearCHD, preds)
#Plot and compare with the Q and D models
fig = figure(figsize=(10, 6))
plt.plot([0, 1], [0, 1], linestyle='--')
plt.plot(fprQ, tprQ, label='QModel (AUC = %0.2f)' % roc_auc_Q)
plt.plot(fprD, tprD, label='DModel (AUC = %0.2f)' % roc_auc_D)
plt.plot(fprLR, tprLR, label='Regularized (AUC = %0.2f)' % roc_auc_LR)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc='lower right');
plt.show()
fig.savefig('QvsDvsLR_ModelROC.pdf', bbox_inches='tight')
#Round the output using a threshold corresponding to a TPR of 0.8
yhatLRround = []
for i in preds:
if i > threshLR[next(x[0] for x in enumerate(tprLR) if x[1] > 0.8)]:
yhatLRround.append(1)
else:
yhatLRround.append(0)
#Create a confusion matrix
cMatrix = confusion_matrix(y_true = dfTest.TenYearCHD, y_pred = yhatLRround)
fig = figure(figsize=(10, 6))
plt.imshow(cMatrix, cmap=plt.cm.Blues)
plt.text(0, 0, '{}'.format(cMatrix[0, 0]), horizontalalignment='center',fontsize = 'xx-large')
plt.text(0, 1, '{}'.format(cMatrix[1, 0]), horizontalalignment='center',fontsize = 'xx-large')
plt.text(1, 0, '{}'.format(cMatrix[0, 1]), horizontalalignment='center',fontsize = 'xx-large')
plt.text(1, 1, '{}'.format(cMatrix[1, 1]), horizontalalignment='center',fontsize = 'xx-large')
tick_marks = [0,1]
plt.ylim([-0.5,1.5])
labels = ['Healthy', 'Heart Disease']
plt.xticks(tick_marks, labels, rotation=90,fontsize = 'x-large')
plt.yticks(tick_marks, labels,fontsize = 'x-large')
plt.ylabel('True label',fontsize = 'xx-large')
plt.xlabel('Predicted label',fontsize = 'xx-large')
plt.show()
fig.savefig('LR_CM.pdf', bbox_inches='tight')
Pick a different "best" model and repeat the above steps