-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_section_titles.py
52 lines (36 loc) · 1.54 KB
/
get_section_titles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
import wikipediaapi
from commons import all_countries
def get_relevant_section_titles_tokens():
    """Load the keywords used to recognise relevant section titles.

    Reads ``stemmed_keywords_for_relevant_section_titles.txt`` (resolved
    against the current working directory), expecting one keyword per line.

    Returns:
        list: The keywords in file order with surrounding whitespace
        removed. Blank or whitespace-only lines are skipped.

    Raises:
        OSError: If the keyword file cannot be opened.
    """
    with open(
        "stemmed_keywords_for_relevant_section_titles.txt", "r", encoding="utf-8"
    ) as input_file:
        stripped = (line.strip() for line in input_file)
        # An empty string is a substring of every string, so a stray blank
        # line in the file would turn into a token that matches every title
        # under substring matching. Drop such tokens here.
        return [token for token in stripped if token]
def get_all_section_titles():
    """Collect the section titles of every country's pandemic article.

    For each key of ``all_countries`` the page named
    "<year> coronavirus pandemic in <country>" is requested, where <year>
    is "2019-20" for "mainland China" and "2020" for every other country.

    Returns:
        dict: Maps each section title found in ``page.sections`` to the
        country whose page contained it. When several countries share a
        title, the country iterated last is the one kept.
    """
    client = wikipediaapi.Wikipedia()
    title_to_country = {}
    for country in all_countries.keys():
        if country == "mainland China":
            period = "2019-20"
        else:
            period = "2020"
        page = client.page(f'{period} coronavirus pandemic in {country}')
        for section in page.sections:
            title_to_country[section.title] = country
        # NOTE(review): the flattened source does not show the nesting level
        # of this print; it is placed once per country here - confirm.
        print('\n')
    return title_to_country
def check_for_new_section_titles(all_section_titles):
    """Find section titles that are not yet in the known collection.

    Requests the pandemic page of every key in ``all_countries`` and
    compares each title in ``page.sections`` against ``all_section_titles``.

    Args:
        all_section_titles: Container of already-known titles. Only
            membership tests (``in``) are performed on it.

    Returns:
        list: Unknown titles in the order they were encountered. A title is
        listed once for every country page that contains it.
    """
    new_section_titles = []
    wiki = wikipediaapi.Wikipedia()
    for c in all_countries.keys():
        # Use the same page-title rule as get_all_section_titles; with a
        # hard-coded "2020" the "mainland China" lookup targeted a different
        # page than the one the known titles were collected from.
        year = "2019-20" if c == "mainland China" else "2020"
        sections = wiki.page(f'{year} coronavirus pandemic in {c}').sections
        new_section_titles.extend(
            s.title for s in sections if s.title not in all_section_titles
        )
    return new_section_titles
def update_section_titles(all_section_titles, new_sections):
    """Announce newly found section titles and merge them into the known ones.

    Prints one line per new section, then adds the new titles to
    ``all_section_titles`` in place.

    Args:
        all_section_titles: The known titles, either a set of titles or a
            dict mapping title -> country.
        new_sections: Mapping of new section title -> country.

    Returns:
        The same ``all_section_titles`` object, after the merge.
    """
    for section, country in new_sections.items():
        print(f'There is new section title: "{section}" for 2020 coronavirus pandemic in {country}')
    if isinstance(all_section_titles, dict):
        # dict.update() expects key/value pairs; handing it a set of title
        # strings raises ValueError (or mis-splits two-character titles).
        # Merge the title -> country pairs instead.
        all_section_titles.update(new_sections)
    else:
        all_section_titles.update(set(new_sections.keys()))
    return all_section_titles
def get_relevant_section_titles(all_section_titles, relevant_tokens):
    """Select the section titles that contain at least one relevant token.

    Args:
        all_section_titles: Iterable of section titles (iterating a dict
            yields its keys).
        relevant_tokens: Iterable of tokens. Each token is looked for as a
            substring of the lower-cased title; the tokens themselves are
            used as given.

    Returns:
        set: The titles, in their original casing, that matched a token.
    """
    def _mentions_token(title):
        # Substring test against the lower-cased title, token by token.
        for token in relevant_tokens:
            if token in title.lower():
                return True
        return False

    return set(filter(_mentions_token, all_section_titles))