By: Charmaine Wood
I began this project not knowing what I wanted to focus on, but after a few days I decided I really wanted to work with information regarding the Bar Exam. I was really excited to begin the research and was actually pretty successful at obtaining information. I began working with the National Conference of Bar Examiners and was able to get data covering the last 27 years. I organized the information, but later realized, when I began implementing it into my notebook, that while the information is very interesting it would be very difficult to incorporate into this project. This is when I needed to figure out a different route. I began looking at the websites that were sent to the class and came across a spreadsheet with information regarding the property value of a variety of properties in Cambridge.
As I mentioned above, I had to start from square one after a lot of research and organizing of the data related to the Bar Exam. When I began working with the new set of data I knew that the outcome would be the price at which the property was sold, but I was not sure right away which variables I would focus on. As I was looking through the data I kept asking myself what would affect the cost of a property. There were actually a few components I thought might work, but I was most interested in the effect the assessment and building size would have on the price it was sold for. When I got to the chart section I realized that I had way too much data and had to trim it down; that is why I only worked with 3-family households.
I believe that this project does have real-world viability, given the R-squared value of 0.346219175385 and the accuracy value of 0.115427249333. There are definitely a few outliers, but most of my data clumped together, and I believe this would be the end result for any other property type entered into the code.
import os
# Python 2/3 compatibility shim: expose one name for reading user input.
# (The notebook export stripped the indentation here, which made the
# try/except syntactically invalid; the structure is restored.)
try:
    inputFunc = raw_input  # Python 2
except NameError:
    inputFunc = input  # Python 3
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from patsy import dmatrices
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
# Custom functions
def evaluate(pred, labels_test):
    """Print accuracy, confusion-matrix counts, recall, precision and
    the F1 score for a binary classification result.

    pred        -- predicted labels for the holdout set
    labels_test -- true labels for the holdout set
    """
    acc = accuracy_score(labels_test, pred)
    print("Accuracy: %s" % acc)
    # sklearn's binary confusion matrix ravels as (tn, fp, fn, tp).
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()
    # BUG FIX: the original swapped the two denominators —
    # recall is TP / (TP + FN) and precision is TP / (TP + FP).
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    # Harmonic mean of precision and recall.
    f1 = 2 / ((1 / recall) + (1 / precision))
    print("")
    print("True Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    print("Recall: %s" % recall)
    print("Precision: %s" % precision)
    print("F1 Score: %s" % f1)
def plot_bound(Z_val, data, col1, col2, binary):
    """Plot the decision boundary of the module-level classifier ``clf``
    over a mesh spanning two columns of ``data``.

    Z_val  -- the "Yes" class value, e.g. "Y" or "1"
    data   -- DataFrame whose columns define the plotting region
    col1   -- integer position of the x-axis column
    col2   -- integer position of the y-axis column
    binary -- 1 to plot hard yes/no predictions; anything else plots the
              predicted probability of the positive class
    """
    # Pad the x range by 10% of the column minimum on each side.
    # NOTE(review): the y axis below pads with 10% of the *maximum*;
    # confirm whether the minimum here is intentional.
    x_min = float(data.iloc[:, [col1]].min()) - float(data.iloc[:, [col1]].min()) * 0.10
    x_max = float(data.iloc[:, [col1]].max() + float(data.iloc[:, [col1]].min()) * 0.10)
    y_min = 0.0
    # BUG FIX: the original read the global ``training`` here instead of
    # the ``data`` argument, so the y range ignored the frame passed in.
    y_max = float(data.iloc[:, [col2]].max()) + float(data.iloc[:, [col2]].max()) * 0.10
    h_x = (x_max - x_min) / 100  # step size in the mesh, x direction
    h_y = (y_max - y_min) / 100  # step size in the mesh, y direction
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        # ``clf`` is a module-level global fitted elsewhere in the file.
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        # BUG FIX: the original hard-coded "Y" here, ignoring Z_val even
        # though its own comment documents Z_val as the "Yes" value.
        Z = np.where(Z == Z_val, 1, 0)
    else:
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    # Put the result into a color plot.
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()
# Load the Cambridge property-assessment CSV.  Every column is read as
# text (dtype='unicode'); numeric conversion happens later.
# NOTE(review): ``error_bad_lines`` is deprecated (removed in pandas 2.0,
# replaced by ``on_bad_lines``); left as-is for the older pandas this
# notebook targets — confirm the runtime version before changing.
raw_data_df = pd.read_csv('Cambridge2.csv', sep=',', error_bad_lines=False, index_col=False, dtype='unicode')
raw_data_df.head()

# Inspect the raw values of the three columns this project uses.
print(raw_data_df["BuildingValue"].unique())
print(raw_data_df["SalePrice"].unique())
print(raw_data_df["Interior_LivingArea"].unique())

# Preview rows with a living area of '0' versus the rest.
raw_data_df[raw_data_df["Interior_LivingArea"]=='0'].head()
print("Size of entire table: %s "%len(raw_data_df))
print("Size of entires matching filter: %s "%len(raw_data_df[raw_data_df["Interior_LivingArea"]=='0']))
print("Size of entires matching filter: %s "%len(raw_data_df[raw_data_df["Interior_LivingArea"]!='0']))
raw_data_df[raw_data_df["Interior_LivingArea"]!='0'].head()

# Keep only rows with a non-zero living area.  (The original applied this
# identical filter twice in a row; the duplicate no-op is removed.)
processed_data_df = raw_data_df[raw_data_df["Interior_LivingArea"]!='0']
print("Size of entire table: %s "%len(processed_data_df))
processed_data_df.head()

# Drop rows missing any of the three columns of interest.
processed_data_df = processed_data_df[pd.notnull(processed_data_df["BuildingValue"])]
processed_data_df = processed_data_df[pd.notnull(processed_data_df["SalePrice"])]
processed_data_df = processed_data_df[pd.notnull(processed_data_df["Interior_LivingArea"])]
print("Size of entire table: %s "%len(processed_data_df))
processed_data_df.head()
# Reduce the frame to the three columns the model uses, rename them to
# the labels used in the analysis, coerce everything to numbers, and
# drop any row that fails conversion — all as one pipeline.
column_names = {
    'BuildingValue': 'ValueOfBuilding',
    'SalePrice': 'SalePrice',
    'Interior_LivingArea': 'SizeOfInterior',
}
processed_data_df = (
    processed_data_df[list(column_names)]
    .copy()
    .rename(columns=column_names)
    .apply(pd.to_numeric, errors='coerce')
    .dropna()
)
processed_data_df.head()
# Assemble the regression frame: the outcome (SalePrice) plus the two
# predictors.
Property_lin_df = processed_data_df[[
    'SalePrice',
    'ValueOfBuilding',
    'SizeOfInterior'
]].copy()
Property_lin_df.head()

data = Property_lin_df
# Trim to the price/size/value ranges of interest with one combined
# boolean mask instead of three sequential filters.
keep = (
    (data["SalePrice"] > 100000)
    & (data["SizeOfInterior"] > 2000)
    & (data["ValueOfBuilding"] > 150000)
)
data = data[keep]
print(len(data))

# Hold out 5% of the rows for testing; train on the remainder.
holdout = data.sample(frac=0.05)
training = data.loc[~data.index.isin(holdout.index)]

# Visualize each predictor against the sale price.
sns.lmplot(x="ValueOfBuilding", y="SalePrice", data=training, x_estimator=np.mean, order=1)
sns.lmplot(x="SizeOfInterior", y="SalePrice", data=training, x_estimator=np.mean, order=1)

# OLS fit of the two-predictor model; summary() shows the coefficients.
model = ols("SalePrice ~ SizeOfInterior + ValueOfBuilding", training).fit()
model.summary()
# Convert the training/holdout frames to plain NumPy arrays.  ``.values``
# replaces the deprecated ``DataFrame.as_matrix`` (removed in pandas 1.0)
# and returns the same array.
features_train = training.drop("SalePrice", axis=1).values
labels_train = training["SalePrice"].values
features_test = holdout.drop("SalePrice", axis=1).values
labels_test = holdout["SalePrice"].values

# Fit a linear regression on the training split and predict the holdout.
lm = linear_model.LinearRegression()
clf = lm.fit(features_train, labels_train)
pred = clf.predict(features_test)

# NOTE(review): this "accuracy" is really the R^2 score on the holdout
# set, not a classification accuracy; the label is kept for consistency
# with the write-up above.
accuracy = metrics.r2_score(labels_test, pred)
print("R squared:",lm.score(features_train,labels_train))
print("Accuracy:",accuracy)