ML Model COVID Prediction¶

Predict future cases

import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pylab as plt
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline
TRAIN_SIZE = 0.8
WINDOWSIZE = 6 #days to use to predict

#open the file
df =pd.read_csv("cases-michigan.csv", sep=",")
print(df.head(5))
print(df.dtypes)

#convert date string to datetime obj
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
print(df.index)
#convert to time series:
ts = df['cases']
ts.head(10)

#make sure the data is correct and makes sense
plt.plot(ts)

   Unnamed: 0        date   county     state     fips  cases
0        1079  2020-03-10  Oakland  Michigan  26125.0      1
1        1080  2020-03-10    Wayne  Michigan  26163.0      1
2        1259  2020-03-11  Oakland  Michigan  26125.0      1
3        1260  2020-03-11    Wayne  Michigan  26163.0      1
4        1477  2020-03-12   Ingham  Michigan  26065.0      1
Unnamed: 0      int64
date           object
county         object
state          object
fips          float64
cases           int64
dtype: object
DatetimeIndex(['2020-03-10', '2020-03-10', '2020-03-11', '2020-03-11',
               '2020-03-12', '2020-03-12', '2020-03-12', '2020-03-12',
               '2020-03-12', '2020-03-12',
               ...
               '2020-09-17', '2020-09-17', '2020-09-17', '2020-09-17',
               '2020-09-17', '2020-09-17', '2020-09-17', '2020-09-17',
               '2020-09-17', '2020-09-17'],
              dtype='datetime64[ns]', name='date', length=14550, freq=None)

[<matplotlib.lines.Line2D at 0x23c196a8fc8>]

#loop and create timeseries format
df = df.sort_values('date')
countyRecords = {}

for index, row in df.iterrows():
    #print(row['county'], row['cases'], index)
    if row['county'] in countyRecords.keys():
        # add the cases # to the array
        countyRecords[row['county']].append(int(row['cases']))
    else:
        countyRecords[row['county']] = [int(row['cases'])]
        
lenMostData = len(max(countyRecords.values(), key=lambda coll: len(coll)))
print(lenMostData) #determine length of longest time series to padd

192

#make sure all lists are the same length
for i in countyRecords.keys():
    #check if less than the length of the longest array
    padLen = lenMostData - len(countyRecords[i])
    if padLen > 0:
        zeroPad = list(np.zeros(padLen))
        countyRecords[i] = zeroPad + countyRecords[i]

print(countyRecords['Ingham'])

[0.0, 0.0, 1, 1, 1, 1, 1, 2, 2, 7, 7, 11, 11, 12, 15, 18, 22, 26, 32, 43, 73, 91, 121, 128, 152, 168, 172, 178, 189, 205, 222, 239, 241, 250, 254, 263, 270, 278, 298, 304, 308, 314, 335, 350, 370, 391, 395, 406, 413, 426, 446, 470, 483, 497, 506, 506, 518, 539, 557, 563, 575, 579, 586, 594, 605, 619, 629, 643, 648, 649, 650, 667, 674, 682, 695, 696, 701, 705, 717, 729, 734, 736, 738, 740, 745, 746, 755, 793, 801, 806, 809, 808, 810, 832, 815, 824, 825, 825, 825, 825, 825, 831, 837, 845, 859, 867, 896, 916, 954, 980, 992, 1002, 1019, 1042, 1054, 1070, 1085, 1092, 1095, 1114, 1129, 1142, 1153, 1163, 1174, 1187, 1207, 1210, 1242, 1257, 1265, 1287, 1286, 1301, 1308, 1318, 1336, 1341, 1387, 1398, 1415, 1430, 1438, 1459, 1473, 1482, 1493, 1500, 1503, 1520, 1536, 1541, 1551, 1554, 1558, 1570, 1588, 1597, 1610, 1624, 1641, 1640, 1652, 1661, 1661, 1679, 1691, 1705, 1714, 1742, 1735, 1749, 1766, 1772, 1783, 1796, 1814, 1840, 1889, 1958, 1958, 2037, 2066, 2170, 2296, 2424, 2525, 2525, 2655, 2758, 2887, 3012]

#setup the dataframe for providing the model
finalX = []
finalY = []

idx = 0
for k, v in countyRecords.items():
    maxTimeFrameLength = len(v) - 1
    for r in v:
        if maxTimeFrameLength >= (idx + WINDOWSIZE):
            #can continue
            finArr = []
            for c in range(0,WINDOWSIZE):
                finArr.append(v[idx + c])
            finalY.append(v[idx + WINDOWSIZE])
            finalX.append(finArr)
        idx += 1
        
print(finalX[:10])
print(finalY[:10])

[[1, 1, 3, 6, 9, 14], [1, 3, 6, 9, 14, 14], [3, 6, 9, 14, 14, 16], [6, 9, 14, 14, 16, 23], [9, 14, 14, 16, 23, 105], [14, 14, 16, 23, 105, 184], [14, 16, 23, 105, 184, 229], [16, 23, 105, 184, 229, 277], [23, 105, 184, 229, 277, 329], [105, 184, 229, 277, 329, 428]]
[14, 16, 23, 105, 184, 229, 277, 329, 428, 543]

#setup the model

def naivebayes(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictions = gnb.predict(X_test)
    return [mean_squared_error(y_test, predictions), r2_score(y_test, predictions)]

#Logistic Regression
def logisticRegression(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    lgr = LogisticRegression(random_state=0, solver="lbfgs",
                             multi_class="multinomial").fit(X_train, y_train)
    predictions = lgr.predict(X_test)
    return [mean_squared_error(y_test, predictions), r2_score(y_test, predictions)]


def svmModel(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    smodel = svm.SVC(gamma=0.001, C=2.0).fit(X_train, y_train)
    predictions = smodel.predict(X_test)
    return [mean_squared_error(y_test, predictions), r2_score(y_test, predictions)]

#BDT
def bdtModel(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    bdt = RandomForestRegressor(n_estimators=1000).fit(X_train, y_train)
    predictions = bdt.predict(X_test)
    return [mean_squared_error(y_test, predictions), r2_score(y_test, predictions)]

#look at training
results = [['model', 'Training Size', 'Mean Square Err', "R2 Score"]]
for tsize in [.9, .8, .7, .6, .5, .4, .3, .2]:
    print(tsize)
    results.append((['NB', 1- tsize] + naivebayes(tsize)))
    results.append((['LR', 1- tsize] + logisticRegression(tsize)))
    results.append((['SVM', 1- tsize] + svmModel(tsize)))
    results.append((['BDT', 1- tsize] + bdtModel(tsize)))
    

dfresultsTemp = pd.DataFrame(results)
headers = dfresultsTemp.iloc[0]
dfresults  = pd.DataFrame(dfresultsTemp.values[1:], columns=headers)
dfresults.sort_values(['model', 'Training Size'], ascending=[True, True], inplace=True)
print(dfresults)

def lineplot(df, metric_to_plot):
    fig, ax = plt.subplots()
    for key, grp in df.groupby(['model']):
        ax = grp.plot(ax=ax, kind='line', x='Training Size', y=metric_to_plot, label=key)

    plt.legend(loc='best')
    plt.title(metric_to_plot + ' vs Training Size Percentage')
    plt.show()
    
lineplot(dfresults, 'Mean Square Err')
lineplot(dfresults, "R2 Score")

#BDT 
def bdtModel(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    bdt = RandomForestRegressor(n_estimators=1000).fit(X_train, y_train)
    predictions = bdt.predict(X_test)
    return bdt

bdt = bdtModel(0.6)

nextDayPredictions = []
for k, v in countyRecords.items():
    #for a county
    #i know this is inefficent but it works
    pred = int(bdt.predict([v[(len(v) - WINDOWSIZE):]])[0])
    pred2 = int(bdt.predict([v[(len(v) - (WINDOWSIZE - 1)):] + [pred] ])[0])
    pred3 = int(bdt.predict([v[(len(v) - (WINDOWSIZE - 2)):] + [pred,pred2] ])[0])
    pred4 = int(bdt.predict([v[(len(v) - (WINDOWSIZE - 3)):] + [pred,pred2,pred3] ])[0])
    pred5 = int(bdt.predict([v[(len(v) - (WINDOWSIZE - 4)):] + [pred,pred2,pred3,pred4] ])[0])

    nextDayPredictions.append([k,pred,pred2,pred3,pred4,pred5])

print(nextDayPredictions)

[['Oakland', 19450, 19450, 19450, 19450, 19450], ['Wayne', 19450, 19450, 19450, 19450, 19450], ['Ingham', 3696, 3973, 4222, 4541, 4743], ['Kent', 10655, 10881, 10963, 11082, 11153], ['Montcalm', 593, 676, 763, 891, 1003], ['St. Clair', 1591, 1744, 1854, 1978, 2122], ['Washtenaw', 4727, 4928, 5021, 5174, 5357], ['Macomb', 14357, 14501, 14604, 14697, 14802], ['Leelanau', 260, 295, 331, 407, 512], ['Charlevoix', 210, 233, 285, 334, 450], ['Bay', 1495, 1576, 1757, 1901, 2054], ['Monroe', 2075, 2178, 2296, 2459, 2735], ['Ottawa', 4025, 4251, 4485, 4659, 4946], ['Jackson', 3619, 3786, 4089, 4333, 4652], ['Otsego', 414, 463, 534, 666, 791], ['Livingston', 1998, 2173, 2291, 2455, 2715], ['Clinton', 1109, 1190, 1277, 1436, 1585], ['Eaton', 1109, 1190, 1273, 1436, 1585], ['Genesee', 5032, 5168, 5290, 5472, 5654], ['Midland', 1000, 1113, 1184, 1295, 1503], ['Berrien', 2598, 2705, 2932, 3161, 3461], ['Calhoun', 1798, 1924, 2021, 2158, 2368], ['Clare', 260, 295, 331, 407, 512], ['Allegan', 1289, 1351, 1451, 1654, 1833], ['Barry', 581, 663, 748, 840, 995], ['Saginaw', 3842, 4081, 4333, 4579, 4721], ['Tuscola', 953, 1034, 1130, 1219, 1408], ['Wexford', 284, 319, 365, 474, 586], ['Gladwin', 203, 222, 271, 316, 403], ['Grand Traverse', 893, 973, 1071, 1170, 1335], ['Emmet', 330, 382, 439, 550, 683], ['Roscommon', 203, 222, 271, 316, 403], ['Chippewa', 203, 222, 271, 316, 403], ['Kalamazoo', 3258, 3467, 3656, 4006, 4284], ['Muskegon', 3291, 3503, 3677, 4061, 4280], ['Newaygo', 660, 745, 821, 930, 1052], ['Kalkaska', 203, 222, 271, 316, 403], ['Hillsdale', 660, 745, 821, 930, 1052], ['Isabella', 1140, 1212, 1287, 1438, 1639], ['Manistee', 203, 222, 271, 316, 403], ['Lapeer', 1462, 1555, 1740, 1842, 2020], ['Unknown', 615, 663, 728, 787, 875], ['Iosco', 411, 468, 543, 680, 798], ['Van Buren', 1289, 1351, 1451, 1654, 1833], ['Lenawee', 2111, 2297, 2415, 2663, 2942], ['Marquette', 652, 736, 810, 917, 1043], ['Sanilac', 330, 366, 425, 523, 628], ['Ionia', 827, 927, 1006, 1127, 1300], ['Gogebic', 380, 426, 496, 584, 740], ['Ogemaw', 203, 222, 271, 316, 403], ['Mecosta', 382, 445, 500, 638, 781], ['Oceana', 963, 1045, 1139, 1219, 1408], ['Missaukee', 148, 172, 196, 237, 311], ['Shiawassee', 953, 1034, 1139, 1219, 1408], ['Crawford', 262, 302, 339, 422, 521], ['Cass', 953, 1045, 1139, 1219, 1408], ['Gratiot', 524, 567, 675, 796, 920], ['Huron', 479, 539, 618, 732, 834], ['Osceola', 203, 222, 271, 316, 403], ['Cheboygan', 203, 222, 271, 316, 403], ['Delta', 510, 561, 646, 796, 894], ['Antrim', 203, 222, 271, 316, 403], ['Houghton', 507, 553, 643, 796, 894], ['St. Joseph', 1289, 1351, 1451, 1654, 1833], ['Branch', 2071, 2173, 2291, 2455, 2731], ['Arenac', 203, 222, 271, 316, 403], ['Oscoda', 143, 163, 187, 211, 277], ['Mackinac', 143, 163, 187, 211, 277], ['Dickinson', 203, 222, 271, 316, 403], ['Luce', 497, 542, 629, 769, 885], ['Presque Isle', 143, 163, 187, 211, 277], ['Mason', 330, 366, 422, 519, 624], ['Schoolcraft', 133, 163, 187, 208, 277], ['Menominee', 598, 683, 763, 891, 1007], ['Alpena', 330, 366, 422, 519, 624], ['Lake', 143, 163, 187, 211, 277], ['Montmorency', 112, 119, 147, 189, 208], ['Alcona', 143, 163, 187, 211, 277], ['Baraga', 95, 119, 127, 192, 208], ['Benzie', 203, 222, 271, 316, 403], ['Iron', 143, 163, 197, 211, 277], ['Keweenaw', 52, 61, 78, 76, 133], ['Alger', 126, 143, 177, 198, 245], ['Ontonagon', 143, 163, 187, 211, 277]]

dfFinal = pd.DataFrame(nextDayPredictions, columns =["County", "Day1", "Day2", "Day3","Day4","Day5"])
dfFinal.to_csv('predicted-cases-michigan.csv')