ML Model COVID Prediction

Predict future cases

In [67]:
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pylab as plt
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import SGDRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
%matplotlib inline
TRAIN_SIZE = 0.8
WINDOWSIZE = 6 #days to use to predict
In [21]:
#open the file
df =pd.read_csv("cases-michigan.csv", sep=",")
print(df.head(5))
print(df.dtypes)

#convert date string to datetime obj
df['date'] = pd.to_datetime(df['date'])
df.set_index('date', inplace=True)
print(df.index)
#convert to time series:
ts = df['cases']
ts.head(10)

#make sure the data is correct and makes sense
plt.plot(ts)
   Unnamed: 0        date   county     state     fips  cases
0        1079  2020-03-10  Oakland  Michigan  26125.0      1
1        1080  2020-03-10    Wayne  Michigan  26163.0      1
2        1259  2020-03-11  Oakland  Michigan  26125.0      1
3        1260  2020-03-11    Wayne  Michigan  26163.0      1
4        1477  2020-03-12   Ingham  Michigan  26065.0      1
Unnamed: 0      int64
date           object
county         object
state          object
fips          float64
cases           int64
dtype: object
DatetimeIndex(['2020-03-10', '2020-03-10', '2020-03-11', '2020-03-11',
               '2020-03-12', '2020-03-12', '2020-03-12', '2020-03-12',
               '2020-03-12', '2020-03-12',
               ...
               '2020-09-17', '2020-09-17', '2020-09-17', '2020-09-17',
               '2020-09-17', '2020-09-17', '2020-09-17', '2020-09-17',
               '2020-09-17', '2020-09-17'],
              dtype='datetime64[ns]', name='date', length=14550, freq=None)
Out[21]:
[<matplotlib.lines.Line2D at 0x23c196a8fc8>]
In [34]:
#loop and create timeseries format
df = df.sort_values('date')
countyRecords = {}

for index, row in df.iterrows():
    #print(row['county'], row['cases'], index)
    if row['county'] in countyRecords.keys():
        # add the cases # to the array
        countyRecords[row['county']].append(int(row['cases']))
    else:
        countyRecords[row['county']] = [int(row['cases'])]
        
lenMostData = len(max(countyRecords.values(), key=lambda coll: len(coll)))
print(lenMostData) #determine length of longest time series to padd
192
In [39]:
#make sure all lists are the same length
for i in countyRecords.keys():
    #check if less than the length of the longest array
    padLen = lenMostData - len(countyRecords[i])
    if padLen > 0:
        zeroPad = list(np.zeros(padLen))
        countyRecords[i] = zeroPad + countyRecords[i]

print(countyRecords['Ingham'])
[0.0, 0.0, 1, 1, 1, 1, 1, 2, 2, 7, 7, 11, 11, 12, 15, 18, 22, 26, 32, 43, 73, 91, 121, 128, 152, 168, 172, 178, 189, 205, 222, 239, 241, 250, 254, 263, 270, 278, 298, 304, 308, 314, 335, 350, 370, 391, 395, 406, 413, 426, 446, 470, 483, 497, 506, 506, 518, 539, 557, 563, 575, 579, 586, 594, 605, 619, 629, 643, 648, 649, 650, 667, 674, 682, 695, 696, 701, 705, 717, 729, 734, 736, 738, 740, 745, 746, 755, 793, 801, 806, 809, 808, 810, 832, 815, 824, 825, 825, 825, 825, 825, 831, 837, 845, 859, 867, 896, 916, 954, 980, 992, 1002, 1019, 1042, 1054, 1070, 1085, 1092, 1095, 1114, 1129, 1142, 1153, 1163, 1174, 1187, 1207, 1210, 1242, 1257, 1265, 1287, 1286, 1301, 1308, 1318, 1336, 1341, 1387, 1398, 1415, 1430, 1438, 1459, 1473, 1482, 1493, 1500, 1503, 1520, 1536, 1541, 1551, 1554, 1558, 1570, 1588, 1597, 1610, 1624, 1641, 1640, 1652, 1661, 1661, 1679, 1691, 1705, 1714, 1742, 1735, 1749, 1766, 1772, 1783, 1796, 1814, 1840, 1889, 1958, 1958, 2037, 2066, 2170, 2296, 2424, 2525, 2525, 2655, 2758, 2887, 3012]
In [58]:
#setup the dataframe for providing the model
finalX = []
finalY = []

idx = 0
for k, v in countyRecords.items():
    maxTimeFrameLength = len(v) - 1
    for r in v:
        if maxTimeFrameLength >= (idx + WINDOWSIZE):
            #can continue
            finArr = []
            for c in range(0,WINDOWSIZE):
                finArr.append(v[idx + c])
            finalY.append(v[idx + WINDOWSIZE])
            finalX.append(finArr)
        idx += 1
        
print(finalX[:10])
print(finalY[:10])
[[1, 1, 3, 6, 9, 14], [1, 3, 6, 9, 14, 14], [3, 6, 9, 14, 14, 16], [6, 9, 14, 14, 16, 23], [9, 14, 14, 16, 23, 105], [14, 14, 16, 23, 105, 184], [14, 16, 23, 105, 184, 229], [16, 23, 105, 184, 229, 277], [23, 105, 184, 229, 277, 329], [105, 184, 229, 277, 329, 428]]
[14, 16, 23, 105, 184, 229, 277, 329, 428, 543]
In [68]:
#setup the model

def naivebayes(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    gnb = GaussianNB()
    gnb.fit(X_train, y_train)
    predictions = gnb.predict(X_test)
    return [mean_squared_error(y_test, predictions), r2_score(y_test, predictions)]

#Logistic Regression
def logisticRegression(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    lgr = LogisticRegression(random_state=0, solver="lbfgs",
                             multi_class="multinomial").fit(X_train, y_train)
    predictions = lgr.predict(X_test)
    return [mean_squared_error(y_test, predictions), r2_score(y_test, predictions)]


def svmModel(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    smodel = svm.SVC(gamma=0.001, C=2.0).fit(X_train, y_train)
    predictions = smodel.predict(X_test)
    return [mean_squared_error(y_test, predictions), r2_score(y_test, predictions)]

#BDT
def bdtModel(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    bdt = RandomForestRegressor(n_estimators=1000).fit(X_train, y_train)
    predictions = bdt.predict(X_test)
    return [mean_squared_error(y_test, predictions), r2_score(y_test, predictions)]
In [ ]:
#look at training
results = [['model', 'Training Size', 'Mean Square Err', "R2 Score"]]
for tsize in [.9, .8, .7, .6, .5, .4, .3, .2]:
    print(tsize)
    results.append((['NB', 1- tsize] + naivebayes(tsize)))
    results.append((['LR', 1- tsize] + logisticRegression(tsize)))
    results.append((['SVM', 1- tsize] + svmModel(tsize)))
    results.append((['BDT', 1- tsize] + bdtModel(tsize)))
    

dfresultsTemp = pd.DataFrame(results)
headers = dfresultsTemp.iloc[0]
dfresults  = pd.DataFrame(dfresultsTemp.values[1:], columns=headers)
dfresults.sort_values(['model', 'Training Size'], ascending=[True, True], inplace=True)
print(dfresults)
In [72]:
def lineplot(df, metric_to_plot):
    fig, ax = plt.subplots()
    for key, grp in df.groupby(['model']):
        ax = grp.plot(ax=ax, kind='line', x='Training Size', y=metric_to_plot, label=key)

    plt.legend(loc='best')
    plt.title(metric_to_plot + ' vs Training Size Percentage')
    plt.show()
    
lineplot(dfresults, 'Mean Square Err')
lineplot(dfresults, "R2 Score")
In [77]:
#BDT 
def bdtModel(ts):
    X_train, X_test, y_train, y_test = train_test_split(finalX, finalY, test_size=ts, random_state=0)
    bdt = RandomForestRegressor(n_estimators=1000).fit(X_train, y_train)
    predictions = bdt.predict(X_test)
    return bdt

bdt = bdtModel(0.6)

nextDayPredictions = []
for k, v in countyRecords.items():
    #for a county
    #i know this is inefficent but it works
    pred = int(bdt.predict([v[(len(v) - WINDOWSIZE):]])[0])
    pred2 = int(bdt.predict([v[(len(v) - (WINDOWSIZE - 1)):] + [pred] ])[0])
    pred3 = int(bdt.predict([v[(len(v) - (WINDOWSIZE - 2)):] + [pred,pred2] ])[0])
    pred4 = int(bdt.predict([v[(len(v) - (WINDOWSIZE - 3)):] + [pred,pred2,pred3] ])[0])
    pred5 = int(bdt.predict([v[(len(v) - (WINDOWSIZE - 4)):] + [pred,pred2,pred3,pred4] ])[0])

    nextDayPredictions.append([k,pred,pred2,pred3,pred4,pred5])

print(nextDayPredictions)
[['Oakland', 19450, 19450, 19450, 19450, 19450], ['Wayne', 19450, 19450, 19450, 19450, 19450], ['Ingham', 3696, 3973, 4222, 4541, 4743], ['Kent', 10655, 10881, 10963, 11082, 11153], ['Montcalm', 593, 676, 763, 891, 1003], ['St. Clair', 1591, 1744, 1854, 1978, 2122], ['Washtenaw', 4727, 4928, 5021, 5174, 5357], ['Macomb', 14357, 14501, 14604, 14697, 14802], ['Leelanau', 260, 295, 331, 407, 512], ['Charlevoix', 210, 233, 285, 334, 450], ['Bay', 1495, 1576, 1757, 1901, 2054], ['Monroe', 2075, 2178, 2296, 2459, 2735], ['Ottawa', 4025, 4251, 4485, 4659, 4946], ['Jackson', 3619, 3786, 4089, 4333, 4652], ['Otsego', 414, 463, 534, 666, 791], ['Livingston', 1998, 2173, 2291, 2455, 2715], ['Clinton', 1109, 1190, 1277, 1436, 1585], ['Eaton', 1109, 1190, 1273, 1436, 1585], ['Genesee', 5032, 5168, 5290, 5472, 5654], ['Midland', 1000, 1113, 1184, 1295, 1503], ['Berrien', 2598, 2705, 2932, 3161, 3461], ['Calhoun', 1798, 1924, 2021, 2158, 2368], ['Clare', 260, 295, 331, 407, 512], ['Allegan', 1289, 1351, 1451, 1654, 1833], ['Barry', 581, 663, 748, 840, 995], ['Saginaw', 3842, 4081, 4333, 4579, 4721], ['Tuscola', 953, 1034, 1130, 1219, 1408], ['Wexford', 284, 319, 365, 474, 586], ['Gladwin', 203, 222, 271, 316, 403], ['Grand Traverse', 893, 973, 1071, 1170, 1335], ['Emmet', 330, 382, 439, 550, 683], ['Roscommon', 203, 222, 271, 316, 403], ['Chippewa', 203, 222, 271, 316, 403], ['Kalamazoo', 3258, 3467, 3656, 4006, 4284], ['Muskegon', 3291, 3503, 3677, 4061, 4280], ['Newaygo', 660, 745, 821, 930, 1052], ['Kalkaska', 203, 222, 271, 316, 403], ['Hillsdale', 660, 745, 821, 930, 1052], ['Isabella', 1140, 1212, 1287, 1438, 1639], ['Manistee', 203, 222, 271, 316, 403], ['Lapeer', 1462, 1555, 1740, 1842, 2020], ['Unknown', 615, 663, 728, 787, 875], ['Iosco', 411, 468, 543, 680, 798], ['Van Buren', 1289, 1351, 1451, 1654, 1833], ['Lenawee', 2111, 2297, 2415, 2663, 2942], ['Marquette', 652, 736, 810, 917, 1043], ['Sanilac', 330, 366, 425, 523, 628], ['Ionia', 827, 927, 1006, 1127, 1300], ['Gogebic', 380, 426, 496, 584, 740], ['Ogemaw', 203, 222, 271, 316, 403], ['Mecosta', 382, 445, 500, 638, 781], ['Oceana', 963, 1045, 1139, 1219, 1408], ['Missaukee', 148, 172, 196, 237, 311], ['Shiawassee', 953, 1034, 1139, 1219, 1408], ['Crawford', 262, 302, 339, 422, 521], ['Cass', 953, 1045, 1139, 1219, 1408], ['Gratiot', 524, 567, 675, 796, 920], ['Huron', 479, 539, 618, 732, 834], ['Osceola', 203, 222, 271, 316, 403], ['Cheboygan', 203, 222, 271, 316, 403], ['Delta', 510, 561, 646, 796, 894], ['Antrim', 203, 222, 271, 316, 403], ['Houghton', 507, 553, 643, 796, 894], ['St. Joseph', 1289, 1351, 1451, 1654, 1833], ['Branch', 2071, 2173, 2291, 2455, 2731], ['Arenac', 203, 222, 271, 316, 403], ['Oscoda', 143, 163, 187, 211, 277], ['Mackinac', 143, 163, 187, 211, 277], ['Dickinson', 203, 222, 271, 316, 403], ['Luce', 497, 542, 629, 769, 885], ['Presque Isle', 143, 163, 187, 211, 277], ['Mason', 330, 366, 422, 519, 624], ['Schoolcraft', 133, 163, 187, 208, 277], ['Menominee', 598, 683, 763, 891, 1007], ['Alpena', 330, 366, 422, 519, 624], ['Lake', 143, 163, 187, 211, 277], ['Montmorency', 112, 119, 147, 189, 208], ['Alcona', 143, 163, 187, 211, 277], ['Baraga', 95, 119, 127, 192, 208], ['Benzie', 203, 222, 271, 316, 403], ['Iron', 143, 163, 197, 211, 277], ['Keweenaw', 52, 61, 78, 76, 133], ['Alger', 126, 143, 177, 198, 245], ['Ontonagon', 143, 163, 187, 211, 277]]
In [81]:
dfFinal = pd.DataFrame(nextDayPredictions, columns =["County", "Day1", "Day2", "Day3","Day4","Day5"])
dfFinal.to_csv('predicted-cases-michigan.csv')
In [ ]: