Predict future cases
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pylab as plt
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline
# Fraction of the data nominally reserved for training
# (not referenced by the rest of the script as shown).
TRAIN_SIZE = 0.8
# Number of consecutive observations (days) fed to the model per prediction.
WINDOWSIZE = 6

# Read the raw case counts and show a quick preview of rows and column types.
df = pd.read_csv("cases-michigan.csv", sep=",")
print(df.head(n=5))
print(df.dtypes)

# Parse the date strings into datetimes and promote them to the index.
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
print(df.index)

# Case counts as a date-indexed series.
ts = df['cases']
ts.head(n=10)

# Eyeball the raw counts to make sure the data looks sensible.
plt.plot(ts)
# Build one chronologically ordered list of case counts per county.
df = df.sort_values('date')
countyRecords = {}
# Walk the two columns directly; iterrows() builds a full Series per row and
# is far slower for the same result.
for county, cases in zip(df['county'], df['cases']):
    countyRecords.setdefault(county, []).append(int(cases))

# Length of the longest county series; shorter ones are padded up to it.
lenMostData = max((len(series) for series in countyRecords.values()), default=0)
print(lenMostData)

# Left-pad shorter series with zeros so every county has the same length.
# Integer zeros keep each list homogeneous (np.zeros would insert floats
# into lists that otherwise hold ints).
for county, series in countyRecords.items():
    padLen = lenMostData - len(series)
    if padLen > 0:
        countyRecords[county] = [0] * padLen + series
print(countyRecords['Ingham'])
# Turn every county series into supervised samples: WINDOWSIZE consecutive
# values form the features and the value right after them is the target.
finalX = []
finalY = []
for series in countyRecords.values():
    # Restart at position 0 for every county. A single shared counter that is
    # never reset would leave all counties after the first without samples.
    # range() stops at the last start that still has a target after the window.
    for start in range(len(series) - WINDOWSIZE):
        finalX.append(list(series[start:start + WINDOWSIZE]))
        finalY.append(series[start + WINDOWSIZE])
print(finalX[:10])
print(finalY[:10])
#setup the model
def naivebayes(ts):
    """Fit a Gaussian naive Bayes model and score it on the held-out split.

    Args:
        ts: Fraction of the samples passed to ``train_test_split`` as
            ``test_size``.

    Returns:
        A two-element list ``[mean squared error, R2 score]`` computed on the
        test split.
    """
    split = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    train_features, test_features, train_targets, test_targets = split

    model = GaussianNB()
    model.fit(train_features, train_targets)
    predicted = model.predict(test_features)

    mse = mean_squared_error(test_targets, predicted)
    r2 = r2_score(test_targets, predicted)
    return [mse, r2]
#Logistic Regression
def logisticRegression(ts):
    """Fit a multinomial logistic regression and score it on the held-out split.

    Args:
        ts: Fraction of the samples passed to ``train_test_split`` as
            ``test_size``.

    Returns:
        A two-element list ``[mean squared error, R2 score]`` computed on the
        test split.
    """
    split = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    train_features, test_features, train_targets, test_targets = split

    model = LogisticRegression(
        random_state=0,
        solver="lbfgs",
        multi_class="multinomial",
    )
    model = model.fit(train_features, train_targets)
    predicted = model.predict(test_features)

    mse = mean_squared_error(test_targets, predicted)
    r2 = r2_score(test_targets, predicted)
    return [mse, r2]
def svmModel(ts):
    """Fit a support vector classifier and score it on the held-out split.

    Args:
        ts: Fraction of the samples passed to ``train_test_split`` as
            ``test_size``.

    Returns:
        A two-element list ``[mean squared error, R2 score]`` computed on the
        test split.
    """
    split = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    train_features, test_features, train_targets, test_targets = split

    classifier = svm.SVC(gamma=0.001, C=2.0)
    classifier = classifier.fit(train_features, train_targets)
    predicted = classifier.predict(test_features)

    mse = mean_squared_error(test_targets, predicted)
    r2 = r2_score(test_targets, predicted)
    return [mse, r2]
#BDT
def bdtModel(ts):
    """Fit a 1000-tree random forest regressor and score it on the held-out split.

    Args:
        ts: Fraction of the samples passed to ``train_test_split`` as
            ``test_size``.

    Returns:
        A two-element list ``[mean squared error, R2 score]`` computed on the
        test split.
    """
    split = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    train_features, test_features, train_targets, test_targets = split

    forest = RandomForestRegressor(n_estimators=1000)
    forest = forest.fit(train_features, train_targets)
    predicted = forest.predict(test_features)

    mse = mean_squared_error(test_targets, predicted)
    r2 = r2_score(test_targets, predicted)
    return [mse, r2]
# Score every model at several held-out fractions and tabulate the metrics.
modelRunners = [
    ('NB', naivebayes),
    ('LR', logisticRegression),
    ('SVM', svmModel),
    ('BDT', bdtModel),
]
results = []
for tsize in [.9, .8, .7, .6, .5, .4, .3, .2]:
    print(tsize)
    # tsize is the test fraction, so the training share is its complement.
    # Rounding removes binary floating-point noise (1 - .9 would otherwise
    # show up as 0.09999999999999998 in the table and on the plot axis).
    trainFraction = round(1 - tsize, 2)
    for label, runModel in modelRunners:
        results.append([label, trainFraction] + runModel(tsize))

# Name the columns up front so the numeric columns keep numeric dtypes
# instead of being forced to object by a header row stored in the data.
dfresults = pd.DataFrame(
    results,
    columns=['model', 'Training Size', 'Mean Square Err', "R2 Score"],
)
dfresults.sort_values(['model', 'Training Size'], ascending=[True, True], inplace=True)
print(dfresults)
def lineplot(df, metric_to_plot):
    """Plot one metric against training size, drawing one line per model.

    Args:
        df: DataFrame containing 'model' and 'Training Size' columns plus the
            column named by ``metric_to_plot``.
        metric_to_plot: Name of the column to draw on the y axis.
    """
    _, ax = plt.subplots()
    # Group by the bare column name rather than a one-element list: with a
    # list, pandas 2.x yields 1-tuples as group keys, which would end up as
    # the legend labels.
    for key, grp in df.groupby('model'):
        grp.plot(ax=ax, kind='line', x='Training Size', y=metric_to_plot, label=key)
    ax.legend(loc='best')
    ax.set_title(metric_to_plot + ' vs Training Size Percentage')
    plt.show()
# Draw one chart per evaluation metric.
for metricName in ('Mean Square Err', "R2 Score"):
    lineplot(dfresults, metricName)
#BDT
def bdtModel(ts):
    """Fit a 1000-tree random forest on the training split and return it.

    NOTE: this replaces the function of the same name defined earlier in the
    file, which returns evaluation metrics instead of the fitted model.

    Args:
        ts: Fraction of the samples passed to ``train_test_split`` as
            ``test_size``; only the remaining training part is used here.

    Returns:
        The fitted ``RandomForestRegressor``.
    """
    # The test half of the split is not needed: the caller only wants the
    # fitted model, so no predictions are computed on it.
    X_train, _, y_train, _ = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    return RandomForestRegressor(n_estimators=1000).fit(X_train, y_train)
# Number of future days to forecast for every county.
FORECAST_DAYS = 5

bdt = bdtModel(0.6)
nextDayPredictions = []
for k, v in countyRecords.items():
    # Start from the most recent WINDOWSIZE observations of this county.
    window = list(v[(len(v) - WINDOWSIZE):])
    forecast = []
    for _ in range(FORECAST_DAYS):
        pred = int(bdt.predict([window])[0])
        forecast.append(pred)
        # Slide the window: drop the oldest value and append the prediction,
        # so the next step is forecast from the latest data plus earlier
        # predictions. This keeps the feature vector at WINDOWSIZE entries
        # for any window size.
        window = window[1:] + [pred]
    nextDayPredictions.append([k] + forecast)
print(nextDayPredictions)

# One column per forecast day: Day1 .. Day<FORECAST_DAYS>.
dayColumns = ["Day" + str(day) for day in range(1, FORECAST_DAYS + 1)]
dfFinal = pd.DataFrame(nextDayPredictions, columns=["County"] + dayColumns)
dfFinal.to_csv('predicted-cases-michigan.csv')