Skip to content

Commit

Permalink
Move ignore conditions to a new function
Browse files: browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 21, 2024
1 parent 44889fe commit fae7997
Showing 1 changed file with 15 additions and 9 deletions.
24 changes: 15 additions & 9 deletions scrapy_webarchive/downloadermiddlewares.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -17,14 +17,8 @@ class WaczMiddleware(BaseWaczMiddleware):
This helps to work with large archives, including remote ones.
"""

def process_request(self, request: Request, spider: Spider):
# Continue default crawl behaviour
if not self.crawl:
return None

# If the attribute has not been set, none of the WACZ could be opened.
if self.crawl and not hasattr(self, 'wacz'):
raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")
def _check_ignore_conditions(self, request: Request, spider: Spider) -> None:
"""Check conditions that would lead to ignoring the request and raise IgnoreRequest if necessary"""

# Ignore when crawling and flag indicates this request needs to be skipped during WACZ crawl
if "wacz_crawl_skip" in request.flags:
Expand All @@ -35,12 +29,24 @@ def process_request(self, request: Request, spider: Spider):
if self._is_off_site(request.url, spider):
self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
raise IgnoreRequest()

# Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
if self._is_disallowed_by_spider(request.url, spider):
self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
raise IgnoreRequest()

def process_request(self, request: Request, spider: Spider):
# Continue default crawl behaviour
if not self.crawl:
return None

# If the attribute has not been set, none of the WACZ could be opened
if self.crawl and not hasattr(self, 'wacz'):
raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")

# Check if the request should be ignored
self._check_ignore_conditions(request, spider)

# Get record from existing index entry, or else lookup by URL
if request.meta.get("cdxj_record"):
warc_record = self.wacz.get_warc_from_cdxj_record(cdxj_record=request.meta["cdxj_record"])
Expand Down

0 comments on commit fae7997

Please sign in to comment.