This project predicts the total number of 911 dispatches between 2011 and 2014 and compares each month's total to the same month of the previous year to see whether there is a correlation.
I cleaned this 911 dispatch data, which I downloaded from Analyze Boston, the City of Boston's data hub. The dataset includes the date, year, month, day of year, and the number of calls — in total and broken out by the Boston Police Department, Boston Fire Department, and Emergency Medical Services.
You can see the data-cleaning process, with short descriptions, below.
import os

# Python 2/3 compatibility shim: Python 2's raw_input was renamed to
# input in Python 3, so bind whichever exists to a common name.
try:
    inputFunc = raw_input
except NameError:
    inputFunc = input

import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from patsy import dmatrices
import matplotlib.pyplot as plt
# IPython/Jupyter magic: render matplotlib figures inline in the notebook.
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
# Custom functions
def evaluate(pred, labels_test):
    """Print accuracy, confusion-matrix counts, recall, precision, and F1.

    pred        -- predicted binary labels
    labels_test -- ground-truth binary labels
    """
    # Conventional scikit-learn argument order is (y_true, y_pred);
    # accuracy is symmetric, so the value is unchanged.
    acc = accuracy_score(labels_test, pred)
    print("Accuracy: %s" % acc)
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()
    # BUG FIX: the original swapped the two denominators.
    # recall    = TP / (TP + FN)  -- of all actual positives, how many found
    # precision = TP / (TP + FP)  -- of all predicted positives, how many right
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    # F1 is the harmonic mean of recall and precision.
    f1 = 2 / ((1 / recall) + (1 / precision))
    print("")
    print("True Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    print("Recall: %s" % recall)
    print("Precision: %s" % precision)
    print("F1 Score: %s" % f1)
def plot_bound(Z_val, data, col1, col2, binary):
    """Plot a classifier's decision boundary over a 2-D feature mesh.

    Z_val  -- the "yes" value of the target, e.g. "Y" or 1 (kept for
              interface compatibility; the hard-label branch below maps "Y")
    data   -- DataFrame holding the feature columns
    col1   -- integer position of the x-axis feature column in `data`
    col2   -- integer position of the y-axis feature column in `data`
    binary -- 1 to color by hard 0/1 predictions, anything else to color
              by the predicted probability of the positive class

    NOTE(review): relies on a fitted classifier `clf` existing at module
    level -- confirm it was trained on exactly these two features.
    """
    # Pad the plotting window ~10% beyond the observed feature range.
    x_min = float(data.iloc[:, [col1]].min()) - float(data.iloc[:, [col1]].min()) * 0.10
    # BUG FIX: the upper pad previously used the column *minimum*.
    x_max = float(data.iloc[:, [col1]].max()) + float(data.iloc[:, [col1]].max()) * 0.10
    y_min = 0.0
    # BUG FIX: previously read the global `training` instead of the
    # `data` argument, so the y-range ignored the DataFrame passed in.
    y_max = float(data.iloc[:, [col2]].max()) + float(data.iloc[:, [col2]].max()) * 0.10
    h_x = (x_max - x_min) / 100  # step size in the mesh (100 cells per axis)
    h_y = (y_max - y_min) / 100
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        # Hard class labels, mapped to 0/1 for the color mesh.
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = np.where(Z == "Y", 1, 0)
    else:
        # Probability of the positive class.
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot.
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()
# Load the raw 911 dispatch export downloaded from Analyze Boston.
raw_data_df = pd.read_csv('911.csv')
raw_data_df.head()
I ran the following code to see the total number of rows of data.
# Report how many rows were loaded from the CSV.
print("Size of entire table: %s "%len(raw_data_df))
# Make sure every column is stored as a number before aggregating.
raw_data_df = raw_data_df.apply(pd.to_numeric, errors='coerce')
# errors='coerce' sets anything that can't be converted to a number to NaN,
# so those rows must be dropped afterwards.
raw_data_df = raw_data_df.dropna()
raw_data_df.head()
I then wrote a loop to clean the data. The loop sums the total calls for each month instead of keeping one row per day of the year. Because some data are missing (2010 starts in November and 2014 ends in April), I set up the loop to start in November 2011 — the first month whose previous-year counterpart exists in the data — and end in April 2014.
# Aggregate daily rows into one row per month, pairing each month's total
# with the total of the same month one year earlier. Only Nov 2011 through
# Apr 2014 is kept, because outside that window the previous-year month
# would be missing from the data.
monthly_rows = []
for year in range(2010, 2015):
    for month in range(1, 13):
        in_window = (
            (year == 2011 and month >= 11)
            or (2011 < year < 2014)
            or (year == 2014 and month <= 4)
        )
        if not in_window:
            continue
        this_year = raw_data_df[
            (raw_data_df["Year"] == year) & (raw_data_df["Month"] == month)
        ]["Total"].sum()
        last_year = raw_data_df[
            (raw_data_df["Year"] == year - 1) & (raw_data_df["Month"] == month)
        ]["Total"].sum()
        monthly_rows.append([month, this_year, last_year])
# Build the frame in one shot: DataFrame.append was removed in pandas 2.0,
# and appending row-by-row is quadratic anyway. The default RangeIndex
# matches the original 0..n-1 integer index.
df = pd.DataFrame(monthly_rows, columns=['Month', 'Total', 'Total_last_year'])
df
I then used the following code to create a new table with just two variables:
The purpose of this is to be able to see the correlation between the current total of dispatch that we are looking at and the total of dispatch last year as you can see in the graph below.
# Keep only the two series whose relationship we want to model:
# this year's monthly total and the same month's total one year back.
dispatch_lin_df = df.loc[:, ['Total', 'Total_last_year']].copy()
dispatch_lin_df
As you can see in the result below, R-squared is around 29% and accuracy is around 62%.
This result shows that the model explains little of the variability of the response data around its mean, and therefore that it is not an accurate representation of the relationship between the two variables. However, a low R-squared does not necessarily indicate that the model is bad, because R-squared cannot determine whether the coefficient estimates and predictions are biased. The 911 dispatch data involve human behavior, which is simply hard to predict.
Consequently, although we can see some correlation from this dataset, I do have to agree with the result that it is not an accurate representation of the correlation between the two feature variables. A different set of data and a different data sample size will likely demonstrate a better correlation and prediction.
# Randomly hold out 30% of the months for testing; train on the rest.
data = dispatch_lin_df
holdout = data.sample(frac=0.3)
training = data.drop(holdout.index)
training
# Scatter plot of the two totals with a first-order (linear) fit line.
sns.lmplot(x="Total_last_year", y="Total", data=training, x_estimator=np.mean, order=1)
# OLS regression of the current monthly total on last year's total.
model = ols("Total ~ Total_last_year", training).fit()
model.summary()
# Rerun with scikit-learn because it's easy to check fit quality on the
# holdout set.
# FIX: DataFrame.as_matrix() was removed in pandas 1.0 -- use to_numpy().
features_train = training.drop("Total", axis=1).to_numpy()
labels_train = training["Total"].to_numpy()
features_test = holdout.drop("Total", axis=1).to_numpy()
labels_test = holdout["Total"].to_numpy()
lm = linear_model.LinearRegression()
clf = lm.fit(features_train, labels_train)
pred = clf.predict(features_test)
# NOTE(review): r2_score is a goodness-of-fit measure, not a classification
# accuracy; the "Accuracy" label is kept for output compatibility.
accuracy = metrics.r2_score(labels_test, pred)
print("R squared:", lm.score(features_train, labels_train))
print("Accuracy:", accuracy)