For Project three I used a data set located at https://catalog.data.gov/dataset/3-year-recidivism-for-offenders-released-from-prison that records many variables for each person, such as sex and race, and whether they were re-admitted to prison within 3 years of having been released.
I took a sample of a little over 100 people from the data set and created a simplified data set. I coded their sex as either 1 (male) or 0 (female); their race as either 1 (white non-hispanic), 2 (white hispanic), 3 (black non-hispanic), 4 (black hispanic), or 5 (american indian non-hispanic); and their recidivism as either 1 (re-admitted) or 0 (not re-admitted).
I then used this data set in the following program to see whether sex and race have any relationship to recidivism. The model didn't perform well: the accuracy in training was 0. There probably isn't enough data to form a good model.
import os
# Cross-version input helper: default to Python 3's input(), but use
# raw_input when it exists (Python 2).
inputFunc = input
try:
    inputFunc = raw_input  # NameError under Python 3
except NameError:
    pass
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from patsy import dmatrices
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
def evaluate(pred, labels_test):
    """Print accuracy, confusion-matrix counts, recall, precision, and F1.

    Parameters
    ----------
    pred : sequence of predicted class labels.
    labels_test : sequence of true class labels (same length as ``pred``).
    """
    # accuracy_score's conventional argument order is (y_true, y_pred);
    # the metric is symmetric, so the value is unchanged.
    acc = accuracy_score(labels_test, pred)
    print("Accuracy: %s" % acc)
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()
    # Recall = TP / (TP + FN); precision = TP / (TP + FP).
    # The original had the two formulas swapped (F1 was unaffected because
    # the harmonic mean is symmetric in its arguments).
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    f1 = 2 / ((1 / recall) + (1 / precision))
    print("")
    print("True Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    print("Recall: %s" % recall)
    print("Precision: %s" % precision)
    print("F1 Score: %s" % f1)
def plot_bound(Z_val, data, col1, col2, binary):
    """Plot the decision surface of the global classifier ``clf`` over two features.

    Parameters
    ----------
    Z_val : label treated as the positive class when ``binary`` is 1
        (the original ignored this parameter and hard-coded "Y").
    data : DataFrame whose columns at positions ``col1``/``col2`` set the plot range.
    col1, col2 : integer column positions for the x- and y-axis features.
    binary : 1 to plot hard 0/1 predictions, anything else for class-1 probability.
    """
    # Pad each axis range by 10%.  Fixes two bugs in the original: the upper
    # x bound was padded with the column *minimum*, and the y range read the
    # global ``training`` instead of the ``data`` argument.
    x_min = float(data.iloc[:, [col1]].min()) - float(data.iloc[:, [col1]].min()) * 0.10
    x_max = float(data.iloc[:, [col1]].max()) + float(data.iloc[:, [col1]].max()) * 0.10
    y_min = 0.0
    y_max = float(data.iloc[:, [col2]].max()) + float(data.iloc[:, [col2]].max()) * 0.10
    # 100 grid steps along each axis.
    h_x = (x_max - x_min) / 100
    h_y = (y_max - y_min) / 100
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        # Hard class predictions mapped to {0, 1}.
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = np.where(Z == Z_val, 1, 0)
    else:
        # Probability of the positive class.
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()
# Load the simplified recidivism sample; column 0 is parsed as dates.
raw_data_df = pd.read_csv('Jail2.csv', parse_dates=[0])
raw_data_df.head()

# Add a Y/N string version of the 0/1 Recidivism flag for the classifiers.
raw_data_df["recidivism_Y_N"] = np.where(raw_data_df['Recidivism'] == 1, "Y", "N")
raw_data_df.head()

# Numeric view for the regression models.
Rec_lin_df = raw_data_df.loc[:, ['Sex', 'Race', 'Recidivism']].copy()
Rec_lin_df.head()

# Categorical (Y/N) view for the classification models.
Rec_class_df = raw_data_df.loc[:, ['recidivism_Y_N', 'Sex', 'Race']].copy()
Rec_class_df.head()
# Regression view, restricted to valid 0/1 Recidivism values.
data = Rec_lin_df[Rec_lin_df["Recidivism"] <= 1]

# Hold out ~5% of the rows for testing; train on the remainder.
holdout = data.sample(frac=0.05)
training = data.drop(holdout.index)

# Linear trend of the mean recidivism rate against each predictor.
sns.lmplot(x="Sex", y="Recidivism", data=training, x_estimator=np.mean, order=1)
sns.lmplot(x="Race", y="Recidivism", data=training, x_estimator=np.mean, order=1)

# OLS fit of Recidivism on Race and Sex.
model = ols("Recidivism ~ Race + Sex", training).fit()
model.summary()
# Rerun with SciKitLearn because it's easy to check accuracy
# NOTE: DataFrame.as_matrix() was removed in pandas 1.0 — use .to_numpy().
features_train = training.drop("Recidivism", axis=1).to_numpy()
labels_train = training["Recidivism"].to_numpy()
features_test = holdout.drop("Recidivism", axis=1).to_numpy()
labels_test = holdout["Recidivism"].to_numpy()

lm = linear_model.LinearRegression()
clf = lm.fit(features_train, labels_train)
pred = clf.predict(features_test)

# r2_score is a regression goodness-of-fit measure, not classification
# accuracy — the "Accuracy" label below is really the holdout R squared.
accuracy = metrics.r2_score(labels_test, pred)
print("R squared:", lm.score(features_train, labels_train))
print("Accuracy:", accuracy)
# Classification version: split the Y/N data the same way.
data = Rec_class_df
holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]

# NOTE: DataFrame.as_matrix() was removed in pandas 1.0 — use .to_numpy().
features_train = training.drop("recidivism_Y_N", axis=1).to_numpy()
labels_train = training["recidivism_Y_N"].to_numpy()
features_test = holdout.drop("recidivism_Y_N", axis=1).to_numpy()
labels_test = holdout["recidivism_Y_N"].to_numpy()

print("Percentage of Ys: %s\n" % (len(data[data["recidivism_Y_N"] == "Y"]) / len(data)))

# Split the held-out points by outcome for plotting.  After dropping
# recidivism_Y_N, feature column 0 is Sex and column 1 is Race.
feature_1_no = [feats[0] for feats, lab in zip(features_test, labels_test) if lab == "N"]
feature_2_no = [feats[1] for feats, lab in zip(features_test, labels_test) if lab == "N"]
feature_1_yes = [feats[0] for feats, lab in zip(features_test, labels_test) if lab == "Y"]
feature_2_yes = [feats[1] for feats, lab in zip(features_test, labels_test) if lab == "Y"]

plt.scatter(feature_1_yes, feature_2_yes, color="g", label="Recidivism")
plt.scatter(feature_1_no, feature_2_no, color="r", label="No Recidivism")
plt.legend()
# The x-axis shows feature 0 (Sex) and the y-axis feature 1 (Race);
# the original axis labels had the two swapped.
plt.xlabel("Sex")
plt.ylabel("Race")
plt.show()
# Logistic Regression
# C=1e9 effectively disables regularization; no intercept is fit.
model = LogisticRegression(fit_intercept=False, C=1e9)
clf = model.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Logistic Regression")
evaluate(pred, labels_test)
plot_bound("Y", holdout, 1, 2, 0)

# Test some spot.  The features are (Sex, Race); the original used
# (70, 160000), values far outside either feature's coded range
# (Sex is 0/1, Race is 1-5), so the probe was meaningless.
x_test = 1  # Sex: male
y_test = 3  # Race: black non-hispanic
print("")
print(clf.predict([[x_test, y_test]])[0])
print(clf.predict_proba([[x_test, y_test]])[0][1])
print("")
# Decision tree; min_samples_split=40 limits overfitting on this small sample.
from sklearn import tree
clf = tree.DecisionTreeClassifier(min_samples_split=40).fit(features_train, labels_train)
pred = clf.predict(features_test)
print("\nDecision Tree")
evaluate(pred, labels_test)
plot_bound("Y", holdout, 1, 2, 0)
# Random forest with default hyper-parameters.
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier().fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Random Forest")
evaluate(pred, labels_test)
plot_bound("Y", holdout, 1, 2, 0)
# RBF-kernel SVM; probability=True enables predict_proba.
from sklearn.svm import SVC
clf = SVC(kernel="rbf", probability=True).fit(features_train, labels_train)
pred = clf.predict(features_test)
print("SVM")
evaluate(pred, labels_test)
#plot_bound("Y",holdout,1,2,0) # plot doesn't work with SVM