-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathscraper_manager.py
216 lines (184 loc) · 7.34 KB
/
scraper_manager.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
import re
import requests
import logging
import logging_config
from multiprocessing import Pool
from urllib.parse import urlparse
from bs4 import BeautifulSoup, ResultSet, Tag
from utils import get_header
class OlxScraper:
    """Class used to scrape data from OLX Romania.

    The scraper walks the paginated search results of an OLX listing page,
    collects the URLs of the ads it finds, and can extract the title, price
    and description from an individual ad page.
    """

    def __init__(self):
        # HTTP headers sent with every request (provided by utils.get_header).
        self.headers = get_header()
        # Only links belonging to this host are processed.
        self.netloc = "www.olx.ro"
        # Scheme used when turning scheme-less links into absolute URLs.
        self.schema = "https"
        # Pagination state of the most recent scrape_ads_urls() run.
        self.current_page = 1
        self.last_page = None

    def parse_content(self, target_url: str) -> "BeautifulSoup | None":
        """
        Download a page and parse it.

        Args:
            target_url (str): A string representing the URL to be processed.

        Returns:
            BeautifulSoup: An object representing the processed content,
            or None if the request failed (the error is logged).
        """
        try:
            response = requests.get(
                target_url, headers=self.headers, timeout=60)
            response.raise_for_status()
        except requests.exceptions.RequestException as error:
            logging.error("Connection error: %s", error)
            return None
        return BeautifulSoup(response.text, "html.parser")

    def get_ads(
        self, parsed_content: "BeautifulSoup | None"
    ) -> "ResultSet[Tag] | None":
        """
        Return all ads found on the parsed web page.

        Args:
            parsed_content (BeautifulSoup): a BeautifulSoup object created as
            a result of parsing the web page, or None.

        Returns:
            ResultSet[Tag]: A ResultSet containing all HTML tags that contain
            ads, or None when parsed_content is None.
        """
        if parsed_content is None:
            return None
        return parsed_content.select("div.css-1sw7q4x")

    def get_last_page(
        self, parsed_content: "BeautifulSoup | None"
    ) -> "int | None":
        """
        Return the number of the last page available for processing.

        Args:
            parsed_content (BeautifulSoup): a BeautifulSoup object created
            as a result of parsing the web page, or None.

        Returns:
            int: The number of the last page available for parsing. None is
            returned if the parsed object is None, if there is no paging, or
            if the last pagination item does not contain a number.
        """
        if parsed_content is None:
            return None
        pagination_ul = parsed_content.find("ul", class_="pagination-list")
        if pagination_ul is None:
            return None
        pages = pagination_ul.find_all("li", class_="pagination-item")
        if not pages:
            return None
        try:
            return int(pages[-1].text)
        except ValueError:
            # A non-numeric last item must not abort the whole scrape;
            # treat it the same as "no pagination available".
            logging.warning(
                "Unexpected pagination item: %r", pages[-1].text)
            return None

    @staticmethod
    def _build_page_url(target_url: str, page: int) -> str:
        """Return target_url with its "page" query parameter set to page.

        Any other query parameters are preserved, an existing "page"
        parameter is replaced, and the path always ends in a single "/".
        """
        parts = urlparse(target_url)
        query_items = [
            item for item in parts.query.split("&")
            if item and not item.startswith("page=")
        ]
        query_items.append(f"page={page}")
        path = parts.path.rstrip("/") + "/"
        return parts._replace(path=path, query="&".join(query_items)).geturl()

    def scrape_ads_urls(self, target_url: str) -> "set[str]":
        """
        Scrape the URLs of all valid ads present on an OLX page. Searches all
        relevant URLs of the ads and adds them to a set. Parses all pages,
        from first to last. self.current_page and self.last_page are updated
        while the pages are walked.

        Args:
            target_url (str): URL of the OLX page to start the search from.

        Returns:
            set: the relevant, absolute URLs of the ads found. If a page
            cannot be downloaded, the URLs collected so far are returned.

        Raises:
            ValueError: If the URL is invalid or does not belong to the
            specified domain.
        """
        if self.netloc != urlparse(target_url).netloc:
            raise ValueError(
                f"Bad URL! OLXRadar is configured to process {self.netloc} links only.")

        ads_links = set()
        # Always start from the first page, even when the instance is reused.
        self.current_page = 1
        self.last_page = None

        while True:
            page_url = self._build_page_url(target_url, self.current_page)
            parsed_content = self.parse_content(page_url)
            self.last_page = self.get_last_page(parsed_content)
            ads = self.get_ads(parsed_content)
            if ads is None:
                return ads_links

            for ad in ads:
                link = ad.find("a", class_="css-rc5s2u")
                if link is None or not link.has_attr("href"):
                    continue
                link_href = link["href"]
                if not self.is_internal_url(link_href, self.netloc):
                    continue
                if not self.is_relevant_url(link_href):
                    continue
                if self.is_relative_url(link_href):
                    link_href = f"{self.schema}://{self.netloc}{link_href}"
                elif link_href.startswith("//"):
                    # Protocol-relative link to our own host: add the scheme.
                    link_href = f"{self.schema}:{link_href}"
                ads_links.add(link_href)

            if self.last_page is None or self.current_page >= self.last_page:
                break
            self.current_page += 1

        return ads_links

    def is_relevant_url(self, url: str) -> bool:
        """
        Determine whether a particular URL is relevant by analyzing the query
        segment it contains.

        The query (or search) segments, such as "?reason=extended-region",
        show that the ad is added to the search results list by OLX when
        there are not enough ads available for the user's region. Therefore,
        such a URL is not useful (relevant) for monitoring.

        Args:
            url (str): A string representing the URL whose relevance is to
            be checked.

        Returns:
            bool: True if the URL is relevant (has no query segment), False
            if not.
        """
        return urlparse(url).query == ""

    def is_internal_url(self, url: str, domain: str) -> bool:
        """
        Check if the URL has the same domain as the page it was taken from.

        Args:
            url (str): the URL to check.
            domain (str): Domain of the current page.

        Returns:
            bool: True if the URL is an internal link, False otherwise.
        """
        # A relative URL always points to the current site.
        if self.is_relative_url(url):
            return True
        return urlparse(url).netloc == domain

    def is_relative_url(self, url: str) -> bool:
        """
        Check if the given url is relative or absolute.

        A URL is relative when it has neither a scheme nor a host.
        Protocol-relative URLs ("//host/path") and scheme-only URLs
        ("mailto:...", "javascript:...") are therefore not relative.

        Args:
            url (str): url to check.

        Returns:
            True if the url is relative, otherwise False.
        """
        parsed_url = urlparse(url)
        return not parsed_url.scheme and not parsed_url.netloc

    @staticmethod
    def _extract_text(content, tag_name, css_class, separator=""):
        """Return the stripped text of the first matching tag, or None."""
        element = content.find(tag_name, class_=css_class)
        if element is None:
            return None
        return element.get_text(separator=separator, strip=True)

    def get_ad_data(self, ad_url: str) -> "dict[str, str] | None":
        """
        Extract data from the HTML page of the ad.

        Args:
            ad_url (str): the URL of the ad.

        Returns:
            dict or None: A dictionary with the keys "title", "price", "url"
            and "description", or None if the page could not be downloaded
            or the title, price or description element is missing.
        """
        logging.info("Processing %s", ad_url)
        content = self.parse_content(ad_url)
        if content is None:
            return None

        title = self._extract_text(content, "h1", "css-1soizd2")
        price = self._extract_text(content, "h3", "css-ddweki")
        # Keep the line structure of the description.
        description = self._extract_text(
            content, "div", "css-bgzo2k", separator="\n")

        if title is None or price is None or description is None:
            return None

        return {
            "title": title,
            "price": price,
            "url": ad_url,
            "description": description,
        }