From fae79978ad9d9c8619967e230c353ed5f8da5de7 Mon Sep 17 00:00:00 2001 From: Wesley van Lee Date: Mon, 21 Oct 2024 09:37:41 +0200 Subject: [PATCH] Move ignore conditions to a new function --- scrapy_webarchive/downloadermiddlewares.py | 24 ++++++++++++++-------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py index 129c4b9..b29836f 100644 --- a/scrapy_webarchive/downloadermiddlewares.py +++ b/scrapy_webarchive/downloadermiddlewares.py @@ -17,14 +17,8 @@ class WaczMiddleware(BaseWaczMiddleware): This helps to work with large archives, including remote ones. """ - def process_request(self, request: Request, spider: Spider): - # Continue default crawl behaviour - if not self.crawl: - return None - - # If the attribute has not been set, none of the WACZ could be opened. - if self.crawl and not hasattr(self, 'wacz'): - raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.") + def _check_ignore_conditions(self, request: Request, spider: Spider) -> None: + """Check conditions that would lead to ignoring the request and raise IgnoreRequest if necessary""" # Ignore when crawling and flag indicates this request needs to be skipped during WACZ crawl if "wacz_crawl_skip" in request.flags: @@ -35,12 +29,24 @@ def process_request(self, request: Request, spider: Spider): if self._is_off_site(request.url, spider): self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider) raise IgnoreRequest() - + # Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones) if self._is_disallowed_by_spider(request.url, spider): self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider) raise IgnoreRequest() + def process_request(self, request: Request, spider: Spider): + # Continue default crawl behaviour + if not self.crawl: + return None + + # If the attribute has not been set, none of the WACZ could be opened + if self.crawl and not hasattr(self, 'wacz'): + raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.") + + # Check if the request should be ignored + self._check_ignore_conditions(request, spider) + # Get record from existing index entry, or else lookup by URL if request.meta.get("cdxj_record"): warc_record = self.wacz.get_warc_from_cdxj_record(cdxj_record=request.meta["cdxj_record"])