Move ignore conditions to a new function

q-m · Oct 21, 2024 · fae7997 · fae7997
1 parent 44889fe
commit fae7997
Showing 1 changed file with 15 additions and 9 deletions.
diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py
@@ -17,14 +17,8 @@ class WaczMiddleware(BaseWaczMiddleware):
     This helps to work with large archives, including remote ones.
     """
 
-    def process_request(self, request: Request, spider: Spider):
-        # Continue default crawl behaviour
-        if not self.crawl:
-            return None
-
-        # If the attribute has not been set, none of the WACZ could be opened.
-        if self.crawl and not hasattr(self, 'wacz'):
-            raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")
+    def _check_ignore_conditions(self, request: Request, spider: Spider) -> None:
+        """Check conditions that would lead to ignoring the request and raise IgnoreRequest if necessary"""
 
         # Ignore when crawling and flag indicates this request needs to be skipped during WACZ crawl
         if "wacz_crawl_skip" in request.flags:
@@ -35,12 +29,24 @@ def process_request(self, request: Request, spider: Spider):
         if self._is_off_site(request.url, spider):
             self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
             raise IgnoreRequest()
-    
+
         # Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
         if self._is_disallowed_by_spider(request.url, spider):
             self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
             raise IgnoreRequest()
 
+    def process_request(self, request: Request, spider: Spider):
+        # Continue default crawl behaviour
+        if not self.crawl:
+            return None
+
+        # If the attribute has not been set, none of the WACZ could be opened
+        if self.crawl and not hasattr(self, 'wacz'):
+            raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")
+
+        # Check if the request should be ignored
+        self._check_ignore_conditions(request, spider)
+
         # Get record from existing index entry, or else lookup by URL
         if request.meta.get("cdxj_record"):
             warc_record = self.wacz.get_warc_from_cdxj_record(cdxj_record=request.meta["cdxj_record"])