f_Web_scrap.py
import re

import requests
import pandas as pd
from bs4 import BeautifulSoup
"""
Retrieving research papers from Google Scholar based on the keyword list. Search across PubMed was performed by modifying the link structure.
"""
def scrape_research_papers(keywords):
    """Scrape paginated Google Scholar results and return them as a list of dicts.

    Note: the search query below is hard-coded into the URL, so the `keywords`
    argument is not yet used.
    """
    papers = []
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
    }
    african_countries = ['Algeria', 'Angola', 'Benin', 'Botswana', 'Burkina Faso', 'Burundi', 'Cabo Verde', 'Cameroon',
                         'Central African Republic', 'Chad', 'Comoros', 'Democratic Republic of the Congo', 'Republic of the Congo',
                         'Djibouti', 'Egypt', 'Equatorial Guinea', 'Eritrea', 'Eswatini', 'Ethiopia', 'Gabon', 'Gambia', 'Ghana',
                         'Guinea', 'Guinea-Bissau', 'Ivory Coast', 'Kenya', 'Lesotho', 'Liberia', 'Libya', 'Madagascar', 'Malawi',
                         'Mali', 'Mauritania', 'Mauritius', 'Morocco', 'Mozambique', 'Namibia', 'Niger', 'Nigeria', 'Rwanda',
                         'Sao Tome and Principe', 'Senegal', 'Seychelles', 'Sierra Leone', 'Somalia', 'South Africa', 'South Sudan',
                         'Sudan', 'Tanzania', 'Togo', 'Tunisia', 'Uganda', 'Zambia', 'Zimbabwe']
    page_number = 0
    while page_number < 1000:  # Google Scholar paginates in steps of 10 results
        url = f"https://scholar.google.com/scholar?start={page_number}&q=Zoo+OR+animal+AND+health+AND+practice+AND+tradition+AND+Morocco+OR+Algeria+OR+Tunisia+OR+Libya+OR+Egypt+OR+Erythrea+OR+Djibouti+OR+Sudan+OR+Mali+OR+Chad+OR+Niger+OR+Senegal+OR+Gambia+OR+Mauritania+OR+Benin+OR+Togo+OR+Burkina+Faso+OR+Nigeria+OR+Camero&hl=en&as_sdt=0,5"
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Each search hit is rendered inside a 'gs_r gs_or gs_scl' container.
        for result in soup.find_all('div', {'class': 'gs_r gs_or gs_scl'}):
            title = result.find('h3', {'class': 'gs_rt'}).text
            authors = result.find('div', {'class': 'gs_a'}).text
            link_element = result.find('h3', {'class': 'gs_rt'}).find('a')
            link = link_element['href'] if link_element is not None else ''
            description = result.find('div', {'class': 'gs_rs'}).text
            year = extract_year_from_result(result)
            if 'Brazil' not in title:
                region, country = extract_region_and_country_from_title(title, african_countries)
                papers.append({
                    'title': title,
                    'authors': authors,
                    'link': link,
                    'description': description,
                    'year': year,
                    'region': region,
                    'country': country
                })
        page_number += 10
    return papers


def extract_region_and_country_from_title(title, african_countries):
    """Return ('Africa', country) if a known African country appears in the title."""
    for country in african_countries:
        if country in title:
            return 'Africa', country
    if 'Africa' in title:
        return 'Africa', None
    return None, None


def extract_year_from_result(result):
    """Pull the first four-digit year out of the result's author/venue line."""
    year_element = result.find('div', {'class': 'gs_a'})
    if year_element:
        year_text = year_element.text
        year = re.search(r'\b\d{4}\b', year_text)
        if year:
            return year.group(0)
    return None
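

# Minimal usage sketch (an assumption, not shown in the snippet above): collect
# the papers, load them into a DataFrame via the pandas import at the top, and
# write them to CSV. The keyword list and output filename are placeholders.
if __name__ == "__main__":
    keyword_list = ["animal health", "traditional practice"]  # placeholder keywords
    scraped_papers = scrape_research_papers(keyword_list)
    papers_df = pd.DataFrame(scraped_papers)
    papers_df.to_csv("scholar_results.csv", index=False)
    print(f"Collected {len(papers_df)} papers")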