Predict future cases
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pylab as plt
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline
# Presumably the intended training fraction.
# NOTE(review): nothing in the visible code reads this constant; the model
# functions receive their test-split fraction as an argument instead.
TRAIN_SIZE = 0.8
# Number of consecutive case counts used as the feature window for one sample;
# the value that immediately follows the window is the prediction target.
WINDOWSIZE = 6 #days to use to predict
# Read the raw case records (comma-separated) and show a quick preview.
df = pd.read_csv("cases-michigan.csv", sep=",")
print(df.head(5))
print(df.dtypes)
# Parse the date strings into datetimes and promote that column to the index.
df = df.assign(date=pd.to_datetime(df['date'])).set_index('date')
print(df.index)
# Pull the case counts out as a date-indexed series.
ts = df.loc[:, 'cases']
# Only displays something when run as the last expression of a notebook cell.
ts.head(10)
# Visual sanity check of the raw series.
plt.plot(ts)
# Regroup the rows into one chronological list of case counts per county.
df = df.sort_values('date')
countyRecords = {}
for county, cases in zip(df['county'], df['cases']):
    # setdefault creates the county's list the first time the county is seen,
    # then the count is appended to it in date order.
    countyRecords.setdefault(county, []).append(int(cases))
        
# Length of the longest county series; shorter series are left-padded to it.
lenMostData = max(len(series) for series in countyRecords.values())
print(lenMostData) #determine length of longest time series to padd
#make sure all lists are the same length
for county, series in countyRecords.items():
    padLen = lenMostData - len(series)
    if padLen > 0:
        # Pad at the front so the most recent observations stay at the end of
        # the list. Integer zeros keep each series homogeneous (the counts are
        # stored as ints); np.zeros would mix np.float64 values into the list.
        countyRecords[county] = [0] * padLen + series
print(countyRecords['Ingham'])
# Build the supervised-learning samples: each sample is WINDOWSIZE consecutive
# counts from one county (features) and the count that follows them (target).
finalX = []
finalY = []
for v in countyRecords.values():
    # The window start must begin at 0 for every county. A single counter
    # shared across counties would already be past the end of the (equal
    # length) series after the first county, so later counties would add
    # no samples at all.
    for start in range(len(v) - WINDOWSIZE):
        finalX.append(list(v[start:start + WINDOWSIZE]))
        finalY.append(v[start + WINDOWSIZE])

print(finalX[:10])
print(finalY[:10])
#setup the model
def naivebayes(ts):
    """Fit a Gaussian naive Bayes model on the windowed samples and score it.

    ts is forwarded to train_test_split as test_size. The samples come from
    the module-level finalX / finalY lists.

    Returns a two-item list: [mean squared error, R2 score] on the test split.
    """
    trainX, testX, trainY, testY = train_test_split(
        finalX, finalY, test_size=ts, random_state=0
    )
    model = GaussianNB().fit(trainX, trainY)
    predicted = model.predict(testX)
    mse = mean_squared_error(testY, predicted)
    r2 = r2_score(testY, predicted)
    return [mse, r2]
#Logistic Regression
def logisticRegression(ts):
    """Fit a multinomial logistic regression on the windowed samples and score it.

    ts is forwarded to train_test_split as test_size. The samples come from
    the module-level finalX / finalY lists.

    Returns a two-item list: [mean squared error, R2 score] on the test split.
    """
    trainX, testX, trainY, testY = train_test_split(
        finalX, finalY, test_size=ts, random_state=0
    )
    model = LogisticRegression(
        random_state=0, solver="lbfgs", multi_class="multinomial"
    )
    model.fit(trainX, trainY)
    predicted = model.predict(testX)
    mse = mean_squared_error(testY, predicted)
    r2 = r2_score(testY, predicted)
    return [mse, r2]
def svmModel(ts):
    """Fit a support vector classifier on the windowed samples and score it.

    ts is forwarded to train_test_split as test_size. The samples come from
    the module-level finalX / finalY lists.

    Returns a two-item list: [mean squared error, R2 score] on the test split.
    """
    trainX, testX, trainY, testY = train_test_split(
        finalX, finalY, test_size=ts, random_state=0
    )
    model = svm.SVC(gamma=0.001, C=2.0)
    model.fit(trainX, trainY)
    predicted = model.predict(testX)
    mse = mean_squared_error(testY, predicted)
    r2 = r2_score(testY, predicted)
    return [mse, r2]
#BDT
def bdtModel(ts):
    """Fit a 1000-tree random forest regressor on the windowed samples and score it.

    ts is forwarded to train_test_split as test_size. The samples come from
    the module-level finalX / finalY lists.

    Returns a two-item list: [mean squared error, R2 score] on the test split.
    """
    trainX, testX, trainY, testY = train_test_split(
        finalX, finalY, test_size=ts, random_state=0
    )
    forest = RandomForestRegressor(n_estimators=1000)
    forest.fit(trainX, trainY)
    predicted = forest.predict(testX)
    mse = mean_squared_error(testY, predicted)
    r2 = r2_score(testY, predicted)
    return [mse, r2]
#look at training
# Score every model at several test-split fractions. Each row of `results` is
# [model label, training fraction, mean squared error, R2 score].
results = []
for tsize in [.9, .8, .7, .6, .5, .4, .3, .2]:
    print(tsize)
    # tsize is the test fraction handed to the model functions, so the
    # training fraction reported in the table is its complement.
    trainFraction = 1 - tsize
    for label, scoreModel in (('NB', naivebayes),
                              ('LR', logisticRegression),
                              ('SVM', svmModel),
                              ('BDT', bdtModel)):
        results.append([label, trainFraction] + scoreModel(tsize))

# Build the frame directly with named columns so the numeric columns keep a
# numeric dtype; passing a header row through DataFrame.values would turn
# every column into object dtype.
dfresults = pd.DataFrame(
    results,
    columns=['model', 'Training Size', 'Mean Square Err', "R2 Score"],
)
dfresults.sort_values(['model', 'Training Size'], ascending=[True, True], inplace=True)
print(dfresults)
def lineplot(df, metric_to_plot):
    """Plot one metric against 'Training Size', with one line per model.

    df must have 'model' and 'Training Size' columns plus the column named by
    metric_to_plot. The figure is shown with plt.show(); nothing is returned.
    """
    fig, ax = plt.subplots()
    # Group by the bare column name, not a one-element list: with a list,
    # newer pandas yields 1-tuple keys, which would end up as the legend label.
    for key, grp in df.groupby('model'):
        ax = grp.plot(ax=ax, kind='line', x='Training Size', y=metric_to_plot, label=key)
    plt.legend(loc='best')
    plt.title(metric_to_plot + ' vs Training Size Percentage')
    plt.show()
    
# Draw one comparison chart per metric column of the results table.
for metricName in ('Mean Square Err', "R2 Score"):
    lineplot(dfresults, metricName)
#BDT 
def bdtModel(ts):
    """Fit a 1000-tree random forest regressor and return the fitted model.

    This definition replaces the earlier scoring function of the same name.
    ts is forwarded to train_test_split as test_size, so only the training
    part of the module-level finalX / finalY samples is used for fitting.
    """
    # Only the training half of the split is needed; the test half is not
    # scored here, so no prediction on it is computed.
    X_train, _, y_train, _ = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    return RandomForestRegressor(n_estimators=1000).fit(X_train, y_train)
# Fit the forest with a 0.6 test fraction, then forecast five steps ahead for
# every county by feeding each prediction back in as the newest observation.
bdt = bdtModel(0.6)
nextDayPredictions = []
for k, v in countyRecords.items():
    # Start from the last WINDOWSIZE observed counts of this county.
    window = list(v[-WINDOWSIZE:])
    forecast = []
    for _ in range(5):
        nextValue = int(bdt.predict([window])[0])
        forecast.append(nextValue)
        # Slide the window: drop the oldest value, append the new prediction.
        # This keeps the feature count at WINDOWSIZE for any window size.
        window = window[1:] + [nextValue]
    nextDayPredictions.append([k] + forecast)
print(nextDayPredictions)
dfFinal = pd.DataFrame(nextDayPredictions, columns =["County", "Day1", "Day2", "Day3","Day4","Day5"])
dfFinal.to_csv('predicted-cases-michigan.csv')