Project Three Notebook

This project centers on measuring whether an orange line train is likely to be delayed at one stop compared to another stop. The stops used are Ruggles, Roxbury Crossing, and Jackson station. As seen in the table, they are under the category of "line 1" and show each stop's measure of delay calculated in seconds.

Load Some Stuff

This is where we load libraires and the like so we can do what we need. If you get an error saying a module is not loaded, open a new terminal/cmd line and try running: pip install [module name].

In [19]:
import os
try:
    inputFunc = raw_input
except NameError:
    inputFunc = input

import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
 
import seaborn as sns
from statsmodels.formula.api import ols

from sklearn import linear_model
from sklearn import metrics

from sklearn.linear_model import LogisticRegression
from patsy import dmatrices

import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

import matplotlib.pyplot as plt

import random



# Custom functions

def evaluate(pred, labels_test):
    acc = accuracy_score(pred, labels_test)
    print ("Accuracey: %s"%acc)
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()

    recall = tp / (tp + fp)
    percision = tp / (tp + fn)
    f1 = (2 / ((1/recall)+(1/percision)))

    print ("")
    print ("True Negatives: %s"%tn)
    print ("False Positives: %s"%fp)
    print ("False Negatives: %s"%fn)
    print ("True Positives: %s"%tp)
    print ("Recall: %s"%recall)
    print ("Precision: %s"%percision)
    print ("F1 Score: %s"%f1)

def plot_bound(Z_val,data,col1,col2,binary):
    # Z-val equals "Yes" value. E.g., "Y" or "1". 
    # data equals df
    # col1 and col2 defines which colums to use from data
    # Plot binary decision boundary. 
    # For this, we will assign a color to each
    # point in the mesh [x_min, m_max]x[y_min, y_max].
    
    x_min = float(data.iloc[:,[col1]].min())-float(data.iloc[:,[col1]].min())*0.10 
    x_max = float(data.iloc[:,[col1]].max()+float(data.iloc[:,[col1]].min())*0.10)
    y_min = 0.0; 
    y_max = float(training.iloc[:,[col2]].max())+float(training.iloc[:,[col2]].max())*0.10
    h_x = (x_max-x_min)/100  # step size in the mesh
    h_y = (y_max-y_min)/100  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])   
        Z = np.where(Z=="Y",1,0)
    else:
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()

Data Cleaning

Here we load the data we collected and get it all ready to feed to our statistical model(s). That is, we are trying to make a table with one target column and one or more features. Here I'm loading mbta.csv from: http://mbtaviz.github.io/ Note: you can find information on the data elements at this link.

In [20]:
# Load and peek at your data. Change the file name as needed. 
raw_data_df = pd.read_csv('mbta.csv', parse_dates=[0]) 
raw_data_df.head()
Out[20]:
lines/1/delay_actual/place-masta|place-rugg lines/1/delay_actual/place-rugg|place-rcmnl lines/1/delay_actual/place-rcmnl|place-jaksn lines/1/delay_actual/place-jaksn|place-sbmnl lines/1/delay_actual/place-sbmnl|place-grnst lines/1/delay_actual/place-grnst|place-forhl lines/1/delay_actual/place-ogmnl|place-mlmnl lines/1/delay_actual/place-forhl|place-grnst lines/1/delay_actual/place-grnst|place-sbmnl lines/1/delay_actual/place-sbmnl|place-jaksn ... lines/1/delay_actual/place-haecl|place-state lines/1/delay_actual/place-state|place-dwnxg lines/1/delay_actual/place-dwnxg|place-chncl lines/1/delay_actual/place-chncl|place-tumnl lines/1/delay_actual/place-tumnl|place-bbsta lines/1/delay_actual/place-bbsta|place-masta lines/1/delay_actual/place-mlmnl|place-welln lines/1/delay_actual/place-welln|place-sull lines/1/delay_actual/place-sull|place-ccmnl lines/1/delay_actual/place-ccmnl|place-north
0 0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 ... 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
1 88 101.5 96.0 114.0 90.0 153.0 81.5 115.0 96.0 101.0 ... 83.0 69.0 78.0 69.0 113.0 140.0 209.0 204.0 170.0 137.0
2 79 98.5 104.0 116.0 107.0 163.0 29.0 115.0 94.0 103.0 ... 88.0 63.0 65.0 60.0 311.0 159.0 197.0 205.0 151.0 165.0
3 77 91.5 90.0 109.0 106.0 148.0 135.0 117.0 95.0 107.0 ... 95.0 78.0 223.0 63.0 104.0 126.0 212.0 209.0 120.0 135.0
4 0 0.0 90.0 100.0 85.0 139.0 0.0 0.0 0.0 0.0 ... 77.0 310.0 1009.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0

5 rows × 36 columns

In [24]:
# I'm now going to make a set of tables to be used in training some models
# The first set will be for linear regressions where the traget is numeric.
# Delay
Delay_lin_df = processed_data_df[[
                               'lines/1/delay_actual/place-masta|place-rugg', 
                               'lines/1/delay-actual/place-rugg|place-rcmnl', 
                               'lines/1/delay-actual/place-rcmnl|place-jaksn'
                               ]].copy()
Delay_lin_df.head()
---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-24-ea27a74e7277> in <module>()
      2 # The first set will be for linear regressions where the traget is numeric.
      3 # Delay
----> 4 Delay_lin_df = processed_data_df[[
      5                                'lines/1/delay_actual/place-masta|place-rugg',
      6                                'lines/1/delay-actual/place-rugg|place-rcmnl',

NameError: name 'processed_data_df' is not defined

Delay_class_df

In [ ]:
data = Delay_class_df
holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]

# Define the target (y) and feature(s) (X)
features_train = training.drop(lines/1/delay_actual/place-masta|place-rugg", axis=1).as_matrix(columns=None)
labels_train = training["lines/1/delay_actual/place-masta|place-rugg"].as_matrix(columns=None)

features_test = holdout.drop("lines/1/delay_actual/place-masta|place-rugg", axis=1).as_matrix(columns=None)
labels_test = holdout["lines/1/delay_actual/place-masta|place-rugg"].as_matrix(columns=None)

# What percentage of the time is target Y?
print("Percentage of Ys: %s\n"%(len(data[data["lines/1/delay_actual/place-masta|place-rugg"]=="Y"])/len(data)))

#### initial visualization
feature_1_no = [features_test[ii][0] for ii in range(0, len(features_test)) if labels_test[ii]=="N"]
feature_2_no = [features_test[ii][1] for ii in range(0, len(features_test)) if labels_test[ii]=="N"]
feature_1_yes = [features_test[ii][0] for ii in range(0, len(features_test)) if labels_test[ii]=="Y"]
feature_2_yes = [features_test[ii][1] for ii in range(0, len(features_test)) if labels_test[ii]=="Y"]
plt.scatter(feature_1_yes, feature_2_yes, color = "g", label="Delay")
plt.scatter(feature_1_no, feature_2_no, color = "r", label="Delay")
plt.legend()
plt.xlabel("lines/1/delay_actual/place-masta|place-rugg")
plt.ylabel("lines/1/delay_actual/place-masta|place-rugg")
plt.show()
In [ ]: