Matthew Morgan Project Three Notebook

For Project three I used a data set located at that indicates many variable for people such as sex and race and whether they were re-admitted to prison within 3 years of having been released.

I took a sample of a little over 100 people from the data set and created a simplified data set. I coded their sex as either 1 (male), or 0 (female); their race as either 1 (white non-hispanic), 2 (white hispanic), 3 (black non-hispanic), 4 (black hispanic), or 5 (american indian non-hispanic); and there recidivism as either 1 (re-admittded) or 0 (not readmitted).

I then used this data set for the fllowing program to see if there is any relationship between sex and race on recidivism. The model didn't perform well. The accuracy in training was 0. There probably isn't enough data to properly form a good model.

In [304]:
import os
    inputFunc = raw_input
except NameError:
    inputFunc = input

import pandas as pd
from import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
import seaborn as sns
from statsmodels.formula.api import ols

from sklearn import linear_model
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from patsy import dmatrices

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

import random

def evaluate(pred, labels_test):
    acc = accuracy_score(pred, labels_test)
    print ("Accuracey: %s"%acc)
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()

    recall = tp / (tp + fp)
    percision = tp / (tp + fn)
    f1 = (2 / ((1/recall)+(1/percision)))

    print ("")
    print ("True Negatives: %s"%tn)
    print ("False Positives: %s"%fp)
    print ("False Negatives: %s"%fn)
    print ("True Positives: %s"%tp)
    print ("Recall: %s"%recall)
    print ("Precision: %s"%percision)
    print ("F1 Score: %s"%f1)

def plot_bound(Z_val,data,col1,col2,binary):

    x_min = float(data.iloc[:,[col1]].min())-float(data.iloc[:,[col1]].min())*0.10 
    x_max = float(data.iloc[:,[col1]].max()+float(data.iloc[:,[col1]].min())*0.10)
    y_min = 0.0; 
    y_max = float(training.iloc[:,[col2]].max())+float(training.iloc[:,[col2]].max())*0.10
    h_x = (x_max-x_min)/100  
    h_y = (y_max-y_min)/100  
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])   
        Z = np.where(Z=="Y",1,0)
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)


In [305]:
raw_data_df = pd.read_csv('Jail2.csv', parse_dates=[0]) 
Sex Race Recidivism
0 1 1 1
1 0 1 1
2 1 1 1
3 1 1 1
4 1 1 1
In [306]:
raw_data_df["recidivism_Y_N"] = "N"
Sex Race Recidivism recidivism_Y_N
0 1 1 1 N
1 0 1 1 N
2 1 1 1 N
3 1 1 1 N
4 1 1 1 N
In [307]:
raw_data_df.loc[raw_data_df['Recidivism'] == 1, 'recidivism_Y_N'] = "Y"

Sex Race Recidivism recidivism_Y_N
0 1 1 1 Y
1 0 1 1 Y
2 1 1 1 Y
3 1 1 1 Y
4 1 1 1 Y
In [308]:
Rec_lin_df = raw_data_df[[
Sex Race Recidivism
0 1 1 1
1 0 1 1
2 1 1 1
3 1 1 1
4 1 1 1
In [309]:
Rec_class_df = raw_data_df[[
recidivism_Y_N Sex Race
0 Y 1 1
1 Y 0 1
2 Y 1 1
3 Y 1 1
4 Y 1 1

Taining and Validation


In [310]:
data = Rec_lin_df

data = data[data["Recidivism"]<=1]

holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]
In [311]:
sns.lmplot(x="Sex", y="Recidivism", data=training, x_estimator=np.mean, order=1)
In [312]:
sns.lmplot(x="Race", y="Recidivism", data=training, x_estimator=np.mean, order=1)
<seaborn.axisgrid.FacetGrid at 0x121706ac8>
In [313]:
model = ols("Recidivism ~ Race + Sex", training).fit()
OLS Regression Results
Dep. Variable: Recidivism R-squared: 0.007
Model: OLS Adj. R-squared: -0.012
Method: Least Squares F-statistic: 0.3816
Date: Sun, 03 Dec 2017 Prob (F-statistic): 0.684
Time: 16:54:33 Log-Likelihood: -7.7700
No. Observations: 109 AIC: 21.54
Df Residuals: 106 BIC: 29.61
Df Model: 2
Covariance Type: nonrobust
coef std err t P>|t| [0.025 0.975]
Intercept 0.9822 0.068 14.345 0.000 0.846 1.118
Sex[T.1] -0.0458 0.062 -0.743 0.459 -0.168 0.076
Race -0.0136 0.024 -0.563 0.575 -0.062 0.034
Omnibus: 94.297 Durbin-Watson: 1.356
Prob(Omnibus): 0.000 Jarque-Bera (JB): 524.194
Skew: -3.236 Prob(JB): 1.49e-114
Kurtosis: 11.575 Cond. No. 7.30
In [314]:
# Rerun with SciKitLearn because it's easy to check accuracy
features_train = training.drop("Recidivism", axis=1).as_matrix(columns=None)
labels_train = training["Recidivism"].as_matrix(columns=None)

features_test = holdout.drop("Recidivism", axis=1).as_matrix(columns=None)
labels_test = holdout["Recidivism"].as_matrix(columns=None)

lm = linear_model.LinearRegression()
clf =, labels_train)
pred = clf.predict(features_test)
accuracy = metrics.r2_score(labels_test, pred)
print("R squared:",lm.score(features_train,labels_train))
R squared: 0.00714872102992
Accuracy: -0.0712769379579


In [315]:
data = Rec_class_df
holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]

features_train = training.drop("recidivism_Y_N", axis=1).as_matrix(columns=None)
labels_train = training["recidivism_Y_N"].as_matrix(columns=None)

features_test = holdout.drop("recidivism_Y_N", axis=1).as_matrix(columns=None)
labels_test = holdout["recidivism_Y_N"].as_matrix(columns=None)

print("Percentage of Ys: %s\n"%(len(data[data["recidivism_Y_N"]=="Y"])/len(data)))

feature_1_no = [features_test[ii][0] for ii in range(0, len(features_test)) if labels_test[ii]=="N"]
feature_2_no = [features_test[ii][1] for ii in range(0, len(features_test)) if labels_test[ii]=="N"]
feature_1_yes = [features_test[ii][0] for ii in range(0, len(features_test)) if labels_test[ii]=="Y"]
feature_2_yes = [features_test[ii][1] for ii in range(0, len(features_test)) if labels_test[ii]=="Y"]
plt.scatter(feature_1_yes, feature_2_yes, color = "g", label="Recidivism")
plt.scatter(feature_1_no, feature_2_no, color = "r", label="No Recidivism")
Percentage of Ys: 0.9217391304347826

In [316]:
# Logistic Regression
model = LogisticRegression(fit_intercept = False, C = 1e9)
clf =, labels_train)
pred = clf.predict(features_test)
print("Logistic Regression")
evaluate(pred, labels_test)  

# Test some spot
x_test = 70
y_test = 160000

from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf =, labels_train)
pred = clf.predict(features_test)
print("\nDecision Tree")
evaluate(pred, labels_test)

from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf =, labels_train)
pred = clf.predict(features_test)
print("Random Forest")
evaluate(pred, labels_test)  

from sklearn.svm import SVC
clf = SVC(kernel="rbf",probability=True)
clf =, labels_train)
pred = clf.predict(features_test)
evaluate(pred, labels_test)  
#plot_bound("Y",holdout,1,2,0) # plot doesn't work with SVM
Logistic Regression
Accuracey: 0.833333333333

True Negatives: 0
False Positives: 1
False Negatives: 0
True Positives: 5
Recall: 0.833333333333
Precision: 1.0
F1 Score: 0.909090909091

Decision Tree
Accuracey: 0.833333333333

True Negatives: 0
False Positives: 1
False Negatives: 0
True Positives: 5
Recall: 0.833333333333
Precision: 1.0
F1 Score: 0.909090909091
Random Forest
Accuracey: 0.833333333333

True Negatives: 0
False Positives: 1
False Negatives: 0
True Positives: 5
Recall: 0.833333333333
Precision: 1.0
F1 Score: 0.909090909091
Accuracey: 0.833333333333

True Negatives: 0
False Positives: 1
False Negatives: 0
True Positives: 5
Recall: 0.833333333333
Precision: 1.0
F1 Score: 0.909090909091
