Welcome to my project 3! Given my usually overly complicated ideas for a project, I stuck with something "relatively" simple: determining tomorrow's temperature based on the weather from the past year. Using the average temperature as well as wind speed, humidity, dew point, uv index, visibility, atmospheric pressure, and precipitation, I hope to determine tomorrow's temperature.
Where I found my data:
I was able to find my data using the Dark Sky API thanks to Professor Colarusso's suggestion. I was provided a secret key with which to access the data. I can include the key with this notebook if that is required, desired, or recommended.
What my data says:
Throughout this code sample, I am attempting to use "Today's Temperature," "Yesterday's Temperature," "Today's Humidity," and "Today's Wind Speed" to determine Tomorrow's Temperature. So far, this data demonstrates a correlation between yesterday's temperature, today's temperature, today's wind speed, and today's humidity.
I want my model to predict tomorrow's temperature with only Today's information. To test whether the model worked, I took the data from "Today's" weather a.k.a. 11/26/17 and the data from one running of the OLS Regression Results (as instructed by Professor Colarusso) and did the math:
Intercept + (Today's Humidity × 0.0463) + (Today's Wind Speed × −0.3542) + (Yesterday's Temperature × 0.0140) + (Today's Temperature × 0.8407) = Tomorrow's Temperature
The math looked as follows:
9.2862 + (50 × 0.0463) + (6 × −0.3542) + (55 × 0.0140) + (46 × 0.8407) = 48.9182
11/27/17's actual temperature was 44 Degrees Fahrenheit, only 4 - 5 degrees off from what the model predicted.
import os
# Python 2/3 compatibility shim: expose one name for the line-reading builtin.
# Default to Python 3's input(); if Python 2's raw_input exists, prefer it.
inputFunc = input
try:
    inputFunc = raw_input
except NameError:
    pass
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from patsy import dmatrices
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
# Custom functions
def evaluate(pred, labels_test):
    """Print binary-classification metrics for predictions vs. true labels.

    pred        -- predicted labels
    labels_test -- ground-truth labels

    Prints accuracy, the confusion-matrix cells, recall, precision, and F1.
    Relies on accuracy_score / confusion_matrix imported at module level.
    """
    # accuracy_score is symmetric, but (y_true, y_pred) is the documented order.
    acc = accuracy_score(labels_test, pred)
    print("Accuracy: %s" % acc)
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()
    # BUG FIX: the original swapped these two formulas.
    # Recall    = TP / (TP + FN)  (fraction of actual positives found)
    # Precision = TP / (TP + FP)  (fraction of predicted positives correct)
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    # F1 is the harmonic mean of precision and recall.
    f1 = 2 / ((1 / recall) + (1 / precision))
    print("")
    print("True Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    print("Recall: %s" % recall)
    print("Precision: %s" % precision)
    print("F1 Score: %s" % f1)
def plot_bound(Z_val, data, col1, col2, binary):
    """Plot a fitted classifier's decision boundary over a 2-D feature mesh.

    Z_val  -- the "Yes" class value (e.g. "Y" or "1"); NOTE(review): currently
              unused — the comparison below hard-codes "Y". Confirm intent.
    data   -- DataFrame holding the feature columns
    col1   -- integer position of the x-axis column
    col2   -- integer position of the y-axis column
    binary -- 1 to plot hard class labels, anything else for P(class=1)

    Relies on a fitted classifier bound to the global name `clf`.
    """
    # Pad the x range by 10% of the column minimum on each side.
    # NOTE(review): x_max pads with the *minimum*, mirroring x_min — presumably
    # intentional, but max*0.10 would give symmetric padding; confirm.
    x_min = float(data.iloc[:, [col1]].min()) - float(data.iloc[:, [col1]].min()) * 0.10
    x_max = float(data.iloc[:, [col1]].max() + float(data.iloc[:, [col1]].min()) * 0.10)
    y_min = 0.0
    # BUG FIX: the original read the global `training` here instead of the
    # `data` argument, so the y range ignored whichever DataFrame was passed in.
    y_max = float(data.iloc[:, [col2]].max()) + float(data.iloc[:, [col2]].max()) * 0.10
    h_x = (x_max - x_min) / 100  # step size in the mesh, x direction
    h_y = (y_max - y_min) / 100  # step size in the mesh, y direction
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        # Hard labels: map the "Y" class to 1 and everything else to 0.
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = np.where(Z == "Y", 1, 0)
    else:
        # Soft labels: probability of the positive class.
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot.
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()
# Load and peek at your data. Change the file name as needed.
# parse_dates=[0] asks pandas to parse the first column as datetimes.
raw_data_df = pd.read_csv('Tomorrow.csv', parse_dates=[0])
raw_data_df.head()
# You can explore unique entries by stating the column and using .unique() like this:
print(raw_data_df["Temperature Today in Degrees Fahrenheit"].unique())
print(raw_data_df["Temperature Yesterday"].unique())
# You can explore rows with a specific value like so
# (boolean-mask indexing keeps only rows where the comparison is True).
raw_data_df[raw_data_df["Temperature Today in Degrees Fahrenheit"]==65].head() # remove .head() to see all entries
# You can count the number of rows like so
print("Size of entire table: %s "%len(raw_data_df))
print("Size of entries matching filter: %s "%len(raw_data_df[raw_data_df["Temperature Today in Degrees Fahrenheit"]==65]))
# You can invert a match by using the 'not equal' evaluation.
print("Size of entries matching filter: %s "%len(raw_data_df[raw_data_df["Temperature Today in Degrees Fahrenheit"]!=65]))
raw_data_df[raw_data_df["Temperature Today in Degrees Fahrenheit"]!=65].head()
# You can make a new table from your filtered rows like so
processed_data_df = raw_data_df[raw_data_df["Temperature Today in Degrees Fahrenheit"]!=65]
processed_data_df = processed_data_df[processed_data_df["Temperature Yesterday"]!=4]
# Note how I filtered first on raw_data_df and then on processed_data_df
# Now let's remove unnecessary data
# (rows with zero visibility are dropped from the working table).
processed_data_df = processed_data_df[processed_data_df["Visibility by Miles"]!=0]
# So how many entries are there?
print("Size of entire table: %s "%len(processed_data_df))
# Let's peek at the table.
processed_data_df.head()
# for the special case of when a value is NaN, you can filter based on the value not being null (i.e., empty)
processed_data_df = processed_data_df[pd.notnull(processed_data_df["Temperature Today in Degrees Fahrenheit"])]
processed_data_df = processed_data_df[pd.notnull(processed_data_df["Temperature Yesterday"])]
print("Size of entire table: %s "%len(processed_data_df)) # in the example data, this gets rid of a few rows
processed_data_df.head()
# You can remove unwanted columns like so
# for a single column.  Using the `columns=` keyword instead of the positional
# axis argument (`.drop('UV Index', 1)`): the positional form was deprecated
# and removed in pandas 2.0, while `columns=` works on all supported versions.
processed_data_df = processed_data_df.drop(columns='UV Index')
# for multiple columns
processed_data_df = processed_data_df.drop(columns=['Precipitation in Percentage',
                                                    'Visibility by Miles'])
processed_data_df.head()
# Alternatively, if you want to make a new table from a subset of columns, you can do so like this.
# .copy() gives an independent DataFrame so later edits don't warn about views.
processed_data_df = processed_data_df[[
'Temperature Today in Degrees Fahrenheit',
'Wind Speed in MPH',
'Humidity Percentage',
'Pressure in mb',
'Dew Pt in Degress Fahrenheit',
'Temperature Yesterday',
'Temperature Tomorrow'
]].copy()
processed_data_df.head()
# You can rename columns like so.
# Shorter names make the regression formula below easier to read.
processed_data_df = processed_data_df.rename(columns={
'Temperature Today in Degrees Fahrenheit': 'Today',
'Wind Speed in MPH': 'Wind',
'Humidity Percentage': 'Humidity',
'Pressure in mb': 'Pressure',
'Dew Pt in Degress Fahrenheit': 'Dew',
'Temperature Yesterday': 'Yesterday',
'Temperature Tomorrow': 'Tomorrow'
})
processed_data_df.head()
# I'm now going to make a set of tables to be used in training some models
# The first set will be for linear regressions where the target is numeric.
# Today — features are today's/yesterday's readings; 'Tomorrow' is the target.
Todays_lin_df = processed_data_df[[
'Today',
'Wind',
'Humidity',
'Yesterday',
'Tomorrow'
]].copy()
Todays_lin_df.head()
Above I created a dataset worth exploring:
Todays_lin_df
. The data needed to model temperature as a continuous variable. Let's explore it.
# Split off a 5% holdout set for testing and train on the remaining 95%.
# NOTE(review): sample() has no random_state, so the split (and every result
# below) differs from run to run — add a seed to make the notebook reproducible.
data = Todays_lin_df
holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]
# Eyeball each feature's linear relationship with tomorrow's temperature.
sns.lmplot(x="Today", y="Tomorrow", data=training, x_estimator=np.mean, order=1)
sns.lmplot(x="Humidity", y="Tomorrow", data=training, x_estimator=np.mean, order=1)
sns.lmplot(x="Wind", y="Tomorrow", data=training, x_estimator=np.mean, order=1)
# Fit an OLS model: Tomorrow ~ Humidity + Wind + Yesterday + Today.
model = ols("Tomorrow ~ Humidity + Wind + Yesterday + Today", training).fit()
#model = ols("happy ~ age + income + np.power(age, 2) + np.power(income, 2)", training).fit()
model.summary()
# Rerun with SciKitLearn because it's easy to check accuracy
# `.values` replaces DataFrame.as_matrix(columns=None): as_matrix was
# deprecated in pandas 0.23 and removed in 1.0, while `.values` returns the
# same NumPy array on every pandas version.
features_train = training.drop("Tomorrow", axis=1).values
labels_train = training["Tomorrow"].values
features_test = holdout.drop("Tomorrow", axis=1).values
labels_test = holdout["Tomorrow"].values
lm = linear_model.LinearRegression()
clf = lm.fit(features_train, labels_train)
pred = clf.predict(features_test)
# For a regression, "accuracy" here is really the R^2 score on the holdout set.
accuracy = metrics.r2_score(labels_test, pred)
print("R squared:",lm.score(features_train,labels_train))
print("Accuracy:",accuracy)