scrape.py
import os

import requests
import pandas as pd
from bs4 import BeautifulSoup


def get_data(number_of_days=None, NoneChar='-'):
    '''
    Scrapes the GitHub page and returns a list with "number_of_days" elements; each element is a
    pandas.DataFrame containing one day of regional data, organized as in
    https://github.com/pcm-dpc/COVID-19/tree/master/dati-regioni
    (see https://github.com/pcm-dpc/COVID-19/blob/master/dati-regioni/dpc-covid19-ita-regioni-20220418.csv for reference).
    INPUTS:
    - number_of_days: e.g. if "number_of_days = 4" the function returns the 4 most recent daily tables from the GitHub repository.
    - NoneChar: character used to fill empty or missing data.
    '''
print("=====================================================")
print("Starting Scraping")
dates = []
table_list = []
csv_names = []
url = 'https://github.com/pcm-dpc/COVID-19/tree/master/dati-regioni'
response = requests.get(url)
if response.status_code != 200:
print("Error: response_status != 200")
return
doc = BeautifulSoup(response.content, "html.parser")
tags = doc.find_all("a")
for tag in tags:
n = str(tag.string)
signature_string = "dpc-covid19-ita-regioni-2"
if n[:len(signature_string)] == signature_string:
dates.append(n)
# start loop over days
if number_of_days == None:
selected_dates = dates
else:
selected_dates = dates[-number_of_days:]
    for count, additional_string in enumerate(selected_dates):
        new_url = url + "/" + additional_string
        response = requests.get(new_url)
        doc = BeautifulSoup(response.content, "html.parser")
        tbody = doc.tbody
        # get the column names from the table header
        thead = doc.thead
        column_names = []
        for col_name in thead.find_all("th"):
            column_names.append(col_name.text)
        # get one row ("tr") per region, dropping stray newline nodes
        region_list = list(tbody.find_all("tr", class_="js-file-line"))
        region_list_clean = []
        for c in region_list:
            if c != '\n':
                region_list_clean.append(c)
        num_of_features = len(column_names)        # == 24
        num_of_regions = len(region_list_clean)    # == 21
        d = []
        numerical_features_index = [2, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21]
        for region_index in range(len(region_list_clean)):
            region_data = list(region_list_clean[region_index].find_all("td"))
            row = []
            for col in range(num_of_features):
                # the first "td" holds the displayed line number, so data cells start at index 1
                if region_data[col + 1].string is None:
                    row.append(NoneChar)
                else:
                    row.append(region_data[col + 1].string)
            d.append(row)
        # d is a list with 21 rows (regions), each holding 24 values (features);
        # post-process each value, converting strings to numbers where necessary
        for region_index, region in enumerate(d):
            for feature_index in range(len(region)):
                if feature_index in numerical_features_index and region[feature_index] != NoneChar:
                    d[region_index][feature_index] = float(d[region_index][feature_index])
        # construct the dictionary and the pandas.DataFrame (organized as in the GitHub repo)
        data_dictionary = {}
        for i, cname in enumerate(column_names):
            column_values = []
            for region in d:
                column_values.append(region[i])
            data_dictionary[cname] = column_values
        table_list.append(pd.DataFrame(data=data_dictionary))
        print(additional_string, "|", count + 1, "out of", len(selected_dates))
    print("=====================================================")
    print("Scraping Ended")
    return table_list
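
# Illustrative usage sketch (assumes the repository page layout parsed above is unchanged):
#   tables = get_data(number_of_days=2)        # two most recent days
#   latest = tables[-1]                        # pandas.DataFrame with one row per region
#   print(latest["denominazione_regione"])     # column name taken from the repository csv header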


def get_csv_names(number_of_days=None, NoneChar='-'):
    '''
    Returns the names of the csv files in the GitHub repository.
    Used to save the files into a dedicated folder.
    INPUTS:
    - number_of_days: e.g. if "number_of_days = 4" the function returns the names of the 4 most recent csv files in the GitHub repository.
    - NoneChar: character used to fill empty or missing data (unused here; kept for a uniform signature).
    '''
    dates = []
    csv_names = []
    url = 'https://github.com/pcm-dpc/COVID-19/tree/master/dati-regioni'
    response = requests.get(url)
    if response.status_code != 200:
        print("Error: response_status != 200")
        return
    doc = BeautifulSoup(response.content, "html.parser")
    tags = doc.find_all("a")
    # keep only the links whose text matches the daily csv naming scheme
    signature_string = "dpc-covid19-ita-regioni-2"
    for tag in tags:
        n = str(tag.string)
        if n[:len(signature_string)] == signature_string:
            dates.append(n)
    # select the requested number of most recent days
    if number_of_days is None:
        selected_dates = dates
    else:
        selected_dates = dates[-number_of_days:]
    for additional_string in selected_dates:
        csv_names.append(additional_string)
    return csv_names
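
# Illustrative usage sketch; the dates shown are placeholders following the
# repository's dpc-covid19-ita-regioni-YYYYMMDD.csv naming scheme:
#   names = get_csv_names(number_of_days=2)
#   # e.g. ['dpc-covid19-ita-regioni-20220417.csv', 'dpc-covid19-ita-regioni-20220418.csv']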


def save_data(table_list, directory_name, number_of_days=None, NoneChar='-'):
    '''
    Saves the tables in "table_list" as csv files into a directory named "directory_name".
    ADDITIONAL INPUTS:
    - number_of_days: forwarded to get_csv_names().
    - NoneChar: character used to fill empty or missing data.
    '''
    os.makedirs(directory_name, exist_ok=True)
    day_names = get_csv_names(number_of_days, NoneChar)
    for index, day in enumerate(table_list):
        filename = os.path.join(directory_name, day_names[index])
        day.to_csv(filename, index=False)
    return 0
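
# Illustrative usage sketch ("data" is an arbitrary folder name):
#   tables = get_data(number_of_days=2)
#   save_data(tables, "data", number_of_days=2)   # writes one csv per day into ./data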


def scrape(foldername, save_files=True, number_of_days=None, NoneChar='-'):
    '''
    Scrapes https://github.com/pcm-dpc/COVID-19/tree/master/dati-regioni and returns the list of daily tables,
    optionally saving them as csv files into "foldername".
    '''
    table_list = get_data(number_of_days, NoneChar)
    if save_files:
        save_data(table_list, foldername, number_of_days, NoneChar)
    return table_list
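

# Minimal usage sketch of the full pipeline; the folder name "scraped_data" and
# the two-day window are illustrative choices, not part of the original module.
if __name__ == "__main__":
    tables = scrape("scraped_data", save_files=True, number_of_days=2)
    if tables is not None:
        # each element is a pandas.DataFrame with one row per region for one day
        for table in tables:
            print(table.shape)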