The project is to predict if there is a significant likelihood of pregnancy using the race and age of an unmarried woman as factors, based on US data from 2006-2015. The different races being measured are All Races, Asian/Pacific Islander, African American/Black, Hispanic, Non-Hispanic White, and All Whites. The age groups measured are 15-19, 20-24, 25-29, 30-34, 35-39, and 40-44.
import os
# Pick the interactive-input function for the running interpreter:
# Python 2 provides raw_input; on Python 3 that name raises NameError,
# so we fall back to the built-in input.
try:
    inputFunc = raw_input
except NameError:
    inputFunc = input
import pandas as pd
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay
import numpy as np
import seaborn as sns
from statsmodels.formula.api import ols
from sklearn import linear_model
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from patsy import dmatrices
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import random
def evaluate(pred, labels_test):
    """Print accuracy, confusion-matrix counts, recall, precision, and F1.

    pred        -- predicted class labels from a fitted classifier
    labels_test -- true labels for the same held-out rows
    """
    # sklearn's documented order is (y_true, y_pred); accuracy is
    # symmetric so the value is unchanged, but keep the canonical order.
    acc = accuracy_score(labels_test, pred)
    print("Accuracy: %s" % acc)
    tn, fp, fn, tp = confusion_matrix(labels_test, pred).ravel()
    # Bug fix: the original swapped the two formulas.
    #   recall    = TP / (TP + FN)  (share of actual positives found)
    #   precision = TP / (TP + FP)  (share of predicted positives correct)
    recall = tp / (tp + fn)
    percision = tp / (tp + fp)
    f1 = (2 / ((1 / recall) + (1 / percision)))
    print("")
    print("True Negatives: %s" % tn)
    print("False Positives: %s" % fp)
    print("False Negatives: %s" % fn)
    print("True Positives: %s" % tp)
    print("Recall: %s" % recall)
    print("Precision: %s" % percision)
    print("F1 Score: %s" % f1)
def plot_bound(Z_val, data, col1, col2, binary):
    """Plot the decision surface of the (global) fitted classifier ``clf``.

    Z_val  -- positive-class label; currently unused, kept so existing
              calls keep working
    data   -- DataFrame whose columns at positions col1/col2 give the axes
    col1   -- integer position of the x-axis feature column
    col2   -- integer position of the y-axis feature column
    binary -- 1 to plot hard class predictions, anything else to plot the
              predicted probability of the positive class
    """
    # Pad the x range by 10% of the column minimum on each side
    # (kept as in the original, even though the max is padded by the min).
    x_min = float(data.iloc[:, [col1]].min()) - float(data.iloc[:, [col1]].min()) * 0.10
    x_max = float(data.iloc[:, [col1]].max() + float(data.iloc[:, [col1]].min()) * 0.10)
    y_min = 0.0
    # Bug fix: the original read the global `training` frame here, so the
    # y range ignored whichever `data` frame was actually passed in.
    y_max = float(data.iloc[:, [col2]].max()) + float(data.iloc[:, [col2]].max()) * 0.10
    h_x = (x_max - x_min) / 100  # step size in the mesh
    h_y = (y_max - y_min) / 100  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h_x), np.arange(y_min, y_max, h_y))
    if binary == 1:
        # Hard predictions, recoded to 1/0 so pcolormesh can color them.
        Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
        Z = np.where(Z == "Y", 1, 0)
    else:
        # Probability of the positive (second) class at each mesh point.
        Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z)
    plt.show()
I retrieved this data from data.gov. Here I'm loading an edited version of unmarried_woman.csv from: https://catalog.data.gov/dataset/nonmarital-birth-rates-by-race-and-hispanic-origin-for-women-aged-15-44-united-states-1970. I used that data to create a table that relies on two variables to possibly affect the target variable.
# Load the edited data.gov CSV; parse_dates=[0] parses the first
# column as dates. head() previews the first rows in the notebook.
raw_data_df = pd.read_csv('unmarried_woman.csv', parse_dates=[0])
raw_data_df.head()
I started filtering through the data, making sense of the different variables and the number of entries. I edited the entries for the variables so that they would all appear as numerical values. I also created a new column, which I named Pregnant, which gauges whether there is a likelihood that the woman would become pregnant. This column uses the Birth_Rate column: if that number is 60 or greater, the entry is "Y" for pregnancy likelihood. Otherwise it is "N".
I made a table to train the data to use the variables age and race to gauge birth rate. This table falls under pregnant_class.df and will be used to analyze the data to make predictions.
# Inspect the raw categorical values before recoding them.
print(raw_data_df["Age"].unique())
print(raw_data_df["Race"].unique())
print(raw_data_df["Birth_Rate"].unique())
raw_data_df[raw_data_df["Age"] == '20-24 years'].head()
print("Size of entire table: %s "%len(raw_data_df))
print("Size of entires matching filter: %s "%len(raw_data_df[raw_data_df["Age"]=="20-24 years"]))
# Recode each age bracket to a single representative age.
age_codes = {
    '15-19 years': 18,
    '20-24 years': 21,
    '25-29 years': 26,
    '30-34 years': 30,
    '35-39 years': 35,
    '40-44 years': 42,
}
for bracket, code in age_codes.items():
    raw_data_df.loc[raw_data_df['Age'] == bracket, 'Age'] = code
# Recode each race/origin category to an integer code.
race_codes = {
    'All Races': 1,
    'Asian or Pacific Islander total': 2,
    'Black total': 3,
    'Hispanic': 4,
    'Non-Hispanic white': 5,
    'White total': 6,
}
for category, code in race_codes.items():
    raw_data_df.loc[raw_data_df['Race'] == category, 'Race'] = code
raw_data_df.head()
# Derive the target class: "Y" when the birth rate is 60 or higher.
raw_data_df["Pregnant"] = "N"
raw_data_df.head()
raw_data_df.loc[raw_data_df['Birth_Rate'] >= 60, 'Pregnant'] = "Y"
raw_data_df.head()
# Build the table used to train the classification models: the target
# class plus the two feature columns, copied so later edits cannot
# write back into raw_data_df.
pregnant_class_df = raw_data_df.loc[:, ['Pregnant', 'Age', 'Race']].copy()
pregnant_class_df.head()
Above I created the dataset worth exploring: pregnant_class_df. The data needed to assess pregnancy likelihood as a categorical variable.
data = pregnant_class_df
# Hold out a random 20% of rows for testing; train on the rest.
holdout = data.sample(frac=0.20)
training = data.loc[~data.index.isin(holdout.index)]
# Define the target (y) and feature(s) (X).
# Bug fix: DataFrame.as_matrix() was deprecated in pandas 0.23 and
# removed in 1.0; to_numpy() is the supported replacement and returns
# the same ndarray.
features_train = training.drop("Pregnant", axis=1).to_numpy()
labels_train = training["Pregnant"].to_numpy()
features_test = holdout.drop("Pregnant", axis=1).to_numpy()
labels_test = holdout["Pregnant"].to_numpy()
# What percentage of the time is target Y?
print("Percentage of Ys: %s\n"%(len(data[data["Pregnant"]=="Y"])/len(data)))
#### initial visualization
# Split the held-out points into "N" / "Y" groups so each class can be
# drawn in its own color.
feature_1_no, feature_2_no = [], []
feature_1_yes, feature_2_yes = [], []
for (f1, f2), label in zip(features_test, labels_test):
    if label == "N":
        feature_1_no.append(f1)
        feature_2_no.append(f2)
    elif label == "Y":
        feature_1_yes.append(f1)
        feature_2_yes.append(f2)
plt.scatter(feature_1_yes, feature_2_yes, color = "g", label="Likely Pregnant")
plt.scatter(feature_1_no, feature_2_no, color = "r", label="Unlikely Pregnant")
plt.legend()
plt.xlabel("Age")
plt.ylabel("Race")
plt.show()
# Logistic Regression
# fit_intercept=False and a very large C (effectively no regularization)
# give a plain logistic fit on the two raw features.
model = LogisticRegression(fit_intercept = False, C = 1e9)
clf = model.fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Logistic Regression")
evaluate(pred, labels_test)
plot_bound("Y",holdout,1,2,0)
# Probe the fitted model at a single hand-picked point.
x_test = 70
y_test = 16000
spot = [[x_test, y_test]]
print("")
print(clf.predict(spot)[0])
print(clf.predict_proba(spot)[0][1])
print("")
from sklearn import tree
# Decision tree; min_samples_split=40 stops nodes from splitting on
# very small subsets.
clf = tree.DecisionTreeClassifier(min_samples_split=40).fit(features_train, labels_train)
pred = clf.predict(features_test)
print("\nDecision Tree")
evaluate(pred, labels_test)
plot_bound("Y",holdout,1,2,0)
from sklearn.ensemble import RandomForestClassifier
# Random forest with default hyperparameters.
clf = RandomForestClassifier().fit(features_train, labels_train)
pred = clf.predict(features_test)
print("Random Forest")
evaluate(pred, labels_test)
plot_bound("Y",holdout,1,2,0)
from sklearn.svm import SVC
# RBF-kernel SVM; probability=True enables predict_proba support.
clf = SVC(kernel="rbf",probability=True).fit(features_train, labels_train)
pred = clf.predict(features_test)
print("SVM")
evaluate(pred, labels_test)
Based on the results, the analysis is that there is a possibility at most ages and races that pregnancy will occur. However, the most common age was the 25-29 range, and the most common races to get pregnant were Hispanics and African-Americans. The logistic regression was the least accurate, with many false negatives, meaning that it predicted the pregnancy probability to be lower than the data suggested. Several variables were being tested and producing multiple results for each data point, which may have contributed to the inconsistencies. There were also fewer false positives, suggesting that the predictions were skewed toward the "less likely" side. Overall, the conclusion that all the graphs and analysis pointed to was that unmarried Hispanic and African-American women were more likely to become pregnant.