-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathpreprocessing_COVID.py
More file actions
102 lines (73 loc) · 2.42 KB
/
preprocessing_COVID.py
File metadata and controls
102 lines (73 loc) · 2.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
# -*- coding: utf-8 -*-
"""
Created on Fri Jun 26 18:41:53 2020
@author: Danish
"""
import pandas as pd
import os
from utilities import to_weeks, extract_sub_df
import numpy as np
import pickle
# Root folder holding the raw COVID CSVs, the zip/county table and the outputs.
path = r'C:\Users\danis\Documents\USFoods'

# List the raw COVID folder and discard its first entry, which is not a CSV.
# NOTE(review): this relies on the order in which os.listdir returns the
# entries -- confirm the non-CSV file is always listed first.
csv_files = os.listdir(path+'/COVID')
del csv_files[0]

# Load every remaining file into its own DataFrame.
covid_df = [pd.read_csv(path+'/COVID/'+name) for name in csv_files]

# The frame at position 18 is set aside; its dates are converted with a
# different format further down, so it is kept out of the bulk concat.
# NOTE(review): position 18 also depends on the listing order -- confirm.
covid_df1 = covid_df[18]
del covid_df[18]
covid_df = pd.concat(covid_df)
############################ Converting Dates to Weeks ############################
# Bulk frames: pass to_weeks a zero-based copy of the date column and store
# its result as the 'fisc_wk' column.
# NOTE(review): to_weeks is a project helper; the meaning of format_,
# splitter and year_index is assumed from the argument names -- confirm.
bulk_dates = covid_df.date.copy().reset_index(drop=True)
covid_df['fisc_wk'] = to_weeks(
    bulk_dates, format_='%Y%m%d', splitter='-', year_index=0)

# Set-aside frame: same conversion, but with '/'-separated dates and the
# year in the last position.
single_dates = covid_df1.date.copy().reset_index(drop=True)
covid_df1['fisc_wk'] = to_weeks(
    single_dates, format_='%d%m%Y', splitter='/', year_index=2)
############################ Adding zip codes ############################
# Merge the set-aside frame back in and renumber the rows from zero.
covid_df = pd.concat([covid_df, covid_df1]).reset_index(drop=True)

# Distinct values from the COVID data and from the zip/county table.
# county and county_name are not referenced again in this script.
county = covid_df.county.unique()
zip_df = pd.read_csv(path+'/zip_to_county.csv')
county_name = zip_df.countyname.unique()
stcountyfp = zip_df.stcountyfp.unique()
fips = covid_df.fips.unique()

# Keep only the FIPS codes that also occur in the zip/county table.
fips_lst = [code for code in fips if code in stcountyfp]

# Build {fips: array of distinct zip codes} from a copy of the table indexed
# and sorted by 'stcountyfp'.
# NOTE(review): extract_sub_df is a project helper; presumably it returns the
# rows of the indexed table that belong to the given code -- confirm.
zip_df_tmp = zip_df.copy().set_index('stcountyfp').sort_index()
fips_2_zips = {
    code: extract_sub_df(zip_df_tmp, code).zip.unique()
    for code in fips_lst
}

# Persist the mapping as a pickle next to the processed data.
with open(path+'/Data/fips2zips', 'wb') as mapping_file:
    pickle.dump(fips_2_zips, mapping_file)
covid_df = covid_df.reset_index(drop=True)
#adding zip codes
# fips_2_zips only contains the FIPS codes that were found in the zip/county
# table, so any other code (including NaN) has no entry. Fail up front with
# the complete list instead of a bare KeyError on the first offending row.
missing_fips = [
    code for code in covid_df['fips'].unique() if code not in fips_2_zips
]
if missing_fips:
    raise KeyError(
        'No zip codes available for fips: {}'.format(missing_fips))

# Give every row one zip code drawn uniformly at random from the zip codes
# mapped to that row's FIPS code.
zip_cd = []
for fip in covid_df['fips']:
    zip_codes = fips_2_zips[fip]
    # random position within this FIPS code's zip-code array
    idx = np.random.randint(0, len(zip_codes))
    zip_cd.append(zip_codes[idx])
covid_df['zip_cd'] = zip_cd
# Deleting unnecessary columns: remove an 'index' column when one is present.
try:
    covid_df = covid_df.drop(['index'], axis=1)
except KeyError:
    # DataFrame.drop raises KeyError when the label does not exist; only that
    # case is reported here, so unrelated failures are no longer hidden.
    print('Requested Columns not found in the df')

# Duplicate the zip-code and fiscal-week columns under a second name.
covid_df['zip_cd2'] = covid_df['zip_cd']
covid_df['fisc_wk2'] = covid_df['fisc_wk']

# Write the processed table (row index included) to the output folder.
covid_df.to_csv(path+'/Data/Processed_COVID.csv')