This project measures whether an Orange Line train is more likely to be delayed at one stop than at another. The stops used are Ruggles, Roxbury Crossing, and Jackson Square. As seen in the table, they fall under the category "line 1" (the `lines/1/...` columns), and each stop's delay is given in seconds.
This is where we load the libraries and other helpers that the rest of the notebook needs. If you get an error saying a module is not loaded, open a new terminal/command line and try running: pip install [module name].
import os
# Pick the interactive-input function for the running interpreter:
# Python 2 provides raw_input, Python 3 only provides input.
try:
    raw_input
except NameError:
    inputFunc = input
else:
    inputFunc = raw_input
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from patsy import dmatrices
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
# Custom functions
def evaluate(pred, labels_test):
    """Print accuracy, confusion-matrix counts, recall, precision and F1.

    Args:
        pred: Predicted labels, one per test sample.
        labels_test: True labels for the same samples, in the same order.

    Returns:
        None. All results are printed.

    Raises:
        ValueError: If the confusion matrix is not 2x2 (the four-way unpack
            below only works for a binary problem in which both classes
            occur in ``labels_test`` or ``pred``).
    """
    acc = accuracy_score(labels_test, pred)
    print("Accuracey: %s"%acc)

    # For a binary problem the flattened matrix is (tn, fp, fn, tp), with the
    # class that sorts last (e.g. "Y" after "N", 1 after 0) as the positive.
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()

    actual_positives = tp + fn       # everything that really is positive
    predicted_positives = tp + fp    # everything the model called positive

    # Recall = TP / (TP + FN); precision = TP / (TP + FP).
    # float() keeps true division on Python 2; an empty denominator yields 0.0
    # instead of a NaN plus a runtime warning.
    recall = float(tp) / actual_positives if actual_positives else 0.0
    precision = float(tp) / predicted_positives if predicted_positives else 0.0

    # Harmonic mean of precision and recall, written so that a zero precision
    # or recall cannot cause a division by zero.
    f1_denominator = 2 * tp + fp + fn
    f1 = 2.0 * tp / f1_denominator if f1_denominator else 0.0

    print("")
    print("True Negatives: %s"%tn)
    print("False Positives: %s"%fp)
    print("False Negatives: %s"%fn)
    print("True Positives: %s"%tp)
    print("Recall: %s"%recall)
    print("Precision: %s"%precision)
    print("F1 Score: %s"%f1)
def plot_bound(Z_val,data,col1,col2,binary):
    """Draw the decision surface of the global classifier ``clf`` on a 2-D mesh.

    The mesh spans the value range of two columns of ``data``; every mesh
    point is fed to ``clf`` and the result is shown with ``plt.pcolormesh``.

    Args:
        Z_val: The label that counts as "yes" (e.g. "Y" or 1). Used only when
            ``binary == 1``, to turn predicted labels into 1/0.
        data: DataFrame whose columns define the plotted range.
        col1: Integer position of the column used for the x axis.
        col2: Integer position of the column used for the y axis.
        binary: ``1`` plots hard predictions (``clf.predict``); any other
            value plots column 1 of ``clf.predict_proba``.

    Note:
        ``clf`` is read from module scope and must already be fitted on two
        features in (col1, col2) order.
    """
    # Select by position as a Series so min()/max() return plain scalars.
    x_values = data.iloc[:, col1]
    y_values = data.iloc[:, col2]

    x_low = float(x_values.min())
    x_high = float(x_values.max())
    y_high = float(y_values.max())

    # NOTE(review): both x edges are padded by 10% of the column *minimum*,
    # as in the original code; the y edge is padded by 10% of the maximum.
    # Confirm whether x_max was meant to use the maximum as well.
    x_margin = x_low * 0.10
    x_min = x_low - x_margin
    x_max = x_high + x_margin
    y_min = 0.0
    y_max = y_high + y_high * 0.10

    h_x = (x_max - x_min) / 100  # step size in the mesh
    h_y = (y_max - y_min) / 100  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))

    # One row per mesh point: (x, y).
    mesh_points = np.c_[xx.ravel(), yy.ravel()]
    if binary == 1:
        # Hard decision: 1 where the prediction equals the "yes" label.
        Z = np.where(clf.predict(mesh_points) == Z_val, 1, 0)
    else:
        # Soft decision: probability of the second class in clf's class order.
        Z = clf.predict_proba(mesh_points)[:, 1]

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()
Here we load the data we collected and get it all ready to feed to our statistical model(s). That is, we are trying to make a table with one target column and one or more features. Here I'm loading mbta.csv from: http://mbtaviz.github.io/ Note: you can find information on the data elements at this link.
# Read the CSV export (edit the file name if yours differs), parsing the
# first column as dates, then preview the first rows.
raw_data_df = pd.read_csv(
    filepath_or_buffer='mbta.csv',
    parse_dates=[0],
)
raw_data_df.head()
# Build the tables used to train the models.
# This first table is for linear regressions, where the target is numeric.
# NOTE(review): processed_data_df is not created anywhere in the visible part
# of this file -- confirm an earlier cell defines it.
# NOTE(review): the first key is spelled "delay_actual" while the other two
# are spelled "delay-actual" -- confirm against the CSV header.
# Delay
delay_columns = [
    'lines/1/delay_actual/place-masta|place-rugg',
    'lines/1/delay-actual/place-rugg|place-rcmnl',
    'lines/1/delay-actual/place-rcmnl|place-jaksn',
]
Delay_lin_df = processed_data_df.loc[:, delay_columns].copy()
Delay_lin_df.head()
# Split the classification table: a random 5% of the rows become the hold-out
# (test) set and every row whose index is not in the hold-out is training data.
# NOTE(review): Delay_class_df is not created anywhere in the visible part of
# this file -- confirm an earlier cell defines it.
data = Delay_class_df
holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]

# Define the target (y) and feature(s) (X)
target_col = "lines/1/delay_actual/place-masta|place-rugg"
# .values replaces DataFrame.as_matrix(), which newer pandas no longer has.
features_train = training.drop(target_col, axis=1).values
labels_train = training[target_col].values
features_test = holdout.drop(target_col, axis=1).values
labels_test = holdout[target_col].values

# What percentage of the time is target Y?
# The printed value is a fraction between 0 and 1; taking the mean of the
# boolean mask keeps it a float on Python 2 as well.
print("Percentage of Ys: %s\n"%(float((data[target_col]=="Y").mean())))
#### initial visualization
# Scatter the first two feature columns of the hold-out set, one colour per
# class of the target ("Y" = delay, "N" = no delay).
feature_1_no = [row[0] for row, label in zip(features_test, labels_test) if label == "N"]
feature_2_no = [row[1] for row, label in zip(features_test, labels_test) if label == "N"]
feature_1_yes = [row[0] for row, label in zip(features_test, labels_test) if label == "Y"]
feature_2_yes = [row[1] for row, label in zip(features_test, labels_test) if label == "Y"]

plt.scatter(feature_1_yes, feature_2_yes, color = "g", label="Delay")
# The "N" points need their own legend entry; both series used to say "Delay".
plt.scatter(feature_1_no, feature_2_no, color = "r", label="No delay")
plt.legend()

# The axes show feature columns 0 and 1, i.e. the hold-out columns that remain
# once the target is dropped, so label them with those names, not the target.
feature_names = holdout.drop("lines/1/delay_actual/place-masta|place-rugg", axis=1).columns
plt.xlabel(feature_names[0])
plt.ylabel(feature_names[1])
plt.show()