Matthew Morgan Project Three Notebook

For Project Three I used a data set located at https://catalog.data.gov/dataset/3-year-recidivism-for-offenders-released-from-prison that records several variables for each person, such as sex and race, and whether they were re-admitted to prison within three years of release.

I took a sample of a little over 100 people from the data set and created a simplified data set. I coded sex as either 1 (male) or 0 (female); race as 1 (white non-Hispanic), 2 (white Hispanic), 3 (black non-Hispanic), 4 (black Hispanic), or 5 (American Indian non-Hispanic); and recidivism as either 1 (re-admitted) or 0 (not re-admitted).
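For illustration only, a coding like this can be done with pandas maps. The raw column names and category labels below are hypothetical stand-ins, not the data set's actual fields:

import pandas as pd

# Hypothetical raw values; the real CSV's column names and labels differ.
raw = pd.DataFrame({
    "Sex": ["Male", "Female", "Male"],
    "Race": ["White - Non-Hispanic", "Black - Non-Hispanic", "White - Hispanic"],
    "Return to Prison": ["Yes", "No", "Yes"],
})

sex_codes = {"Male": 1, "Female": 0}
race_codes = {
    "White - Non-Hispanic": 1,
    "White - Hispanic": 2,
    "Black - Non-Hispanic": 3,
    "Black - Hispanic": 4,
    "American Indian - Non-Hispanic": 5,
}

coded = pd.DataFrame({
    "Sex": raw["Sex"].map(sex_codes),
    "Race": raw["Race"].map(race_codes),
    "Recidivism": (raw["Return to Prison"] == "Yes").astype(int),
})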

I then used this data set in the following program to see whether sex and race have any relationship with recidivism. The models didn't perform well: the linear model's training R-squared was essentially 0 (0.007) and its holdout R-squared was negative, and the classifiers did no better than always predicting recidivism. There probably isn't enough data, and sex and race alone carry too little signal, to form a good model.

In [304]:
import os
try:
    inputFunc = raw_input
except NameError:
    inputFunc = input

import pandas as pd
import numpy as np

import seaborn as sns
from statsmodels.formula.api import ols

from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from patsy import dmatrices

import matplotlib.pyplot as plt
%matplotlib inline





def evaluate(pred, labels_test):
    # Note: accuracy_score expects (y_true, y_pred).
    acc = accuracy_score(labels_test, pred)
    print("Accuracy: %s" % acc)
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()

    # Recall = TP / (TP + FN); precision = TP / (TP + FP).
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1 = 2 / ((1 / recall) + (1 / precision))

    print("")
    print("True Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    print("Recall: %s" % recall)
    print("Precision: %s" % precision)
    print("F1 Score: %s" % f1)

def plot_bound(Z_val, data, col1, col2, binary):
    # Pad the plotting window 10% beyond the observed feature ranges.
    # Relies on the global classifier `clf` fit in the calling cell.
    pad_x = float(data.iloc[:, [col1]].max()) * 0.10
    x_min = float(data.iloc[:, [col1]].min()) - pad_x
    x_max = float(data.iloc[:, [col1]].max()) + pad_x
    y_min = 0.0
    y_max = float(data.iloc[:, [col2]].max()) + float(data.iloc[:, [col2]].max()) * 0.10
    h_x = (x_max - x_min) / 100
    h_y = (y_max - y_min) / 100
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        # Hard class regions, coded 1 where the prediction matches Z_val.
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = np.where(Z == Z_val, 1, 0)
    else:
        # Predicted probability of the positive class.
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]

    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()

Data

In [305]:
# Note: parse_dates=[0] tries to parse the first column (Sex) as dates; the
# parse fails and leaves Sex as strings, which causes the lmplot error in
# In [311] below.
raw_data_df = pd.read_csv('Jail2.csv', parse_dates=[0])
raw_data_df.head()
Out[305]:
   Sex  Race  Recidivism
0    1     1           1
1    0     1           1
2    1     1           1
3    1     1           1
4    1     1           1
In [306]:
raw_data_df["recidivism_Y_N"] = "N"
raw_data_df.head()
Out[306]:
   Sex  Race  Recidivism recidivism_Y_N
0    1     1           1              N
1    0     1           1              N
2    1     1           1              N
3    1     1           1              N
4    1     1           1              N
In [307]:
 
raw_data_df.loc[raw_data_df['Recidivism'] == 1, 'recidivism_Y_N'] = "Y"

raw_data_df.head()
Out[307]:
   Sex  Race  Recidivism recidivism_Y_N
0    1     1           1              Y
1    0     1           1              Y
2    1     1           1              Y
3    1     1           1              Y
4    1     1           1              Y
In [308]:
Rec_lin_df = raw_data_df[[
                               'Sex', 
                               'Race', 
                               'Recidivism'
                               ]].copy()
Rec_lin_df.head()
Out[308]:
   Sex  Race  Recidivism
0    1     1           1
1    0     1           1
2    1     1           1
3    1     1           1
4    1     1           1
In [309]:
Rec_class_df = raw_data_df[[
                               'recidivism_Y_N', 
                               'Sex', 
                               'Race'
                               ]].copy()
Rec_class_df.head()
Out[309]:
  recidivism_Y_N  Sex  Race
0              Y    1     1
1              Y    0     1
2              Y    1     1
3              Y    1     1
4              Y    1     1

Training and Validation

Rec_lin_df

In [310]:
data = Rec_lin_df

# Keep only valid outcome codes (0 or 1).
data = data[data["Recidivism"]<=1]

# Hold out a random 5% (about 6 rows) for validation; train on the rest.
holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]
In [311]:
sns.lmplot(x="Sex", y="Recidivism", data=training, x_estimator=np.mean, order=1)
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-311-2df74713507d> in <module>()
----> 1 sns.lmplot(x="Sex", y="Recidivism", data=training, x_estimator=np.mean, order=1)

[... seaborn and numpy internal frames elided ...]

AttributeError: 'str' object has no attribute 'conjugate'
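The failure above is the symptom of the parse_dates=[0] flag noted at the read_csv call: pandas could not parse the Sex values as dates and left that column as strings, so seaborn's least-squares fit received non-numeric data. (It also explains the Sex[T.1] term in the OLS summary below, where patsy treats the string-valued column as categorical.) A minimal sketch of a fix, assuming the column holds digit strings:

# Coerce Sex back to integers; errors="raise" surfaces any stray values.
training = training.copy()
training["Sex"] = pd.to_numeric(training["Sex"], errors="raise")
sns.lmplot(x="Sex", y="Recidivism", data=training, x_estimator=np.mean, order=1)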
In [312]:
sns.lmplot(x="Race", y="Recidivism", data=training, x_estimator=np.mean, order=1)
Out[312]:
<seaborn.axisgrid.FacetGrid at 0x121706ac8>
In [313]:
model = ols("Recidivism ~ Race + Sex", training).fit()
model.summary()
Out[313]:
                            OLS Regression Results
==============================================================================
Dep. Variable:             Recidivism   R-squared:                       0.007
Model:                            OLS   Adj. R-squared:                 -0.012
Method:                 Least Squares   F-statistic:                    0.3816
Date:                Sun, 03 Dec 2017   Prob (F-statistic):              0.684
Time:                        16:54:33   Log-Likelihood:                -7.7700
No. Observations:                 109   AIC:                             21.54
Df Residuals:                     106   BIC:                             29.61
Df Model:                           2
Covariance Type:            nonrobust
==============================================================================
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.9822      0.068     14.345      0.000       0.846       1.118
Sex[T.1]      -0.0458      0.062     -0.743      0.459      -0.168       0.076
Race          -0.0136      0.024     -0.563      0.575      -0.062       0.034
==============================================================================
Omnibus:                       94.297   Durbin-Watson:                   1.356
Prob(Omnibus):                  0.000   Jarque-Bera (JB):              524.194
Skew:                          -3.236   Prob(JB):                    1.49e-114
Kurtosis:                      11.575   Cond. No.                         7.30
==============================================================================
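Since Recidivism is binary, a logit model is arguably more natural than OLS. A minimal sketch using the same statsmodels formula interface (patsy will again treat the string-valued Sex column as categorical unless it is coerced to numeric first, as above):

from statsmodels.formula.api import logit

# Same formula as the OLS fit, but with a binomial link.
logit_model = logit("Recidivism ~ Race + Sex", training).fit()
print(logit_model.summary())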
In [314]:
# Rerun with scikit-learn because it's easy to score the fit on a holdout
features_train = training.drop("Recidivism", axis=1).values
labels_train = training["Recidivism"].values

features_test = holdout.drop("Recidivism", axis=1).values
labels_test = holdout["Recidivism"].values

lm = linear_model.LinearRegression()
clf = lm.fit(features_train, labels_train)
pred = clf.predict(features_test)
holdout_r2 = metrics.r2_score(labels_test, pred)
print("Training R squared:", lm.score(features_train, labels_train))
print("Holdout R squared:", holdout_r2)
Training R squared: 0.00714872102992
Holdout R squared: -0.0712769379579
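A 5% holdout of roughly 110 rows is only about six people, so this holdout score is very noisy. A minimal sketch of a larger, reproducible split (assuming Sex has been coerced to numeric as discussed above):

from sklearn.model_selection import train_test_split

# A 25% holdout leaves ~27 test rows instead of ~6; random_state pins the split.
X = Rec_lin_df[["Sex", "Race"]]
y = Rec_lin_df["Recidivism"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=0)

lm = linear_model.LinearRegression().fit(X_train, y_train)
print("Training R squared:", lm.score(X_train, y_train))
print("Holdout R squared:", lm.score(X_test, y_test))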

Rec_class_df

In [315]:
data = Rec_class_df
# Hold out a random 5% (about 6 rows) for validation; train on the rest.
holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]

features_train = training.drop("recidivism_Y_N", axis=1).values
labels_train = training["recidivism_Y_N"].values

features_test = holdout.drop("recidivism_Y_N", axis=1).values
labels_test = holdout["recidivism_Y_N"].values

print("Percentage of Ys: %s\n"%(len(data[data["recidivism_Y_N"]=="Y"])/len(data)))

# After dropping the label, feature 0 is Sex and feature 1 is Race.
feature_1_no = [features_test[ii][0] for ii in range(0, len(features_test)) if labels_test[ii]=="N"]
feature_2_no = [features_test[ii][1] for ii in range(0, len(features_test)) if labels_test[ii]=="N"]
feature_1_yes = [features_test[ii][0] for ii in range(0, len(features_test)) if labels_test[ii]=="Y"]
feature_2_yes = [features_test[ii][1] for ii in range(0, len(features_test)) if labels_test[ii]=="Y"]
plt.scatter(feature_1_yes, feature_2_yes, color = "g", label="Recidivism")
plt.scatter(feature_1_no, feature_2_no, color = "r", label="No Recidivism")
plt.legend()
plt.xlabel("Sex")
plt.ylabel("Race")
plt.show()
Percentage of Ys: 0.9217391304347826
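Since about 92% of the sample was re-admitted, a classifier that always predicts "Y" would already score roughly 0.92 accuracy, which is a useful floor for judging the models below. A minimal baseline check, assuming the test arrays defined above:

import numpy as np

# Majority-class baseline: predict "Y" for every holdout row.
baseline_pred = np.array(["Y"] * len(labels_test))
print("Always-Y baseline accuracy: %s" % np.mean(baseline_pred == labels_test))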

In [316]:
# Logistic Regression
model = LogisticRegression(fit_intercept = False, C = 1e9)
clf = model.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Logistic Regression")
evaluate(pred, labels_test)  
plot_bound("Y",holdout,1,2,0)


# Test a spot. These coordinates look left over from a different example;
# they are far outside the actual feature ranges (Sex is 0/1, Race is 1-5),
# so the predicted probability saturates at 1.0.
x_test = 70
y_test = 160000
print("")
print(clf.predict([[x_test,y_test]])[0])
print(clf.predict_proba([[x_test,y_test]])[0][1])
print("")
print("")

from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("\nDecision Tree")
evaluate(pred, labels_test)
plot_bound("Y",holdout,1,2,0)



from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Random Forest")
evaluate(pred, labels_test)  
plot_bound("Y",holdout,1,2,0)


from sklearn.svm import SVC
clf = SVC(kernel="rbf",probability=True)
clf = clf.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("SVM")
evaluate(pred, labels_test)  
#plot_bound("Y",holdout,1,2,0) # plot doesn't work with SVM
Logistic Regression
Accuracy: 0.833333333333

True Negatives: 0
False Positives: 1
False Negatives: 0
True Positives: 5
Recall: 1.0
Precision: 0.833333333333
F1 Score: 0.909090909091
Y
1.0


Decision Tree
Accuracy: 0.833333333333

True Negatives: 0
False Positives: 1
False Negatives: 0
True Positives: 5
Recall: 1.0
Precision: 0.833333333333
F1 Score: 0.909090909091
Random Forest
Accuracy: 0.833333333333

True Negatives: 0
False Positives: 1
False Negatives: 0
True Positives: 5
Recall: 1.0
Precision: 0.833333333333
F1 Score: 0.909090909091
SVM
Accuracy: 0.833333333333

True Negatives: 0
False Positives: 1
False Negatives: 0
True Positives: 5
Recall: 1.0
Precision: 0.833333333333
F1 Score: 0.909090909091
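Because the holdout here is only six rows, a single split says little; cross-validation over the full sample would give a steadier estimate. A minimal sketch, assuming Rec_class_df as built above (with Sex coerced to numeric):

from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Stratified 5-fold CV over the whole data set instead of one 5% holdout.
X = Rec_class_df[["Sex", "Race"]].values
y = Rec_class_df["recidivism_Y_N"].values
scores = cross_val_score(LogisticRegression(), X, y, cv=5, scoring="accuracy")
print("CV accuracy: %0.3f +/- %0.3f" % (scores.mean(), scores.std()))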