From fae79978ad9d9c8619967e230c353ed5f8da5de7 Mon Sep 17 00:00:00 2001
From: Wesley van Lee <wesley.vanlee@ordina.nl>
Date: Mon, 21 Oct 2024 09:37:41 +0200
Subject: [PATCH] Move ignore conditions to a new function

---
 scrapy_webarchive/downloadermiddlewares.py | 24 ++++++++++++++--------
 1 file changed, 15 insertions(+), 9 deletions(-)

diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py
index 129c4b9..b29836f 100644
--- a/scrapy_webarchive/downloadermiddlewares.py
+++ b/scrapy_webarchive/downloadermiddlewares.py
@@ -17,14 +17,8 @@ class WaczMiddleware(BaseWaczMiddleware):
     This helps to work with large archives, including remote ones.
     """
 
-    def process_request(self, request: Request, spider: Spider):
-        # Continue default crawl behaviour
-        if not self.crawl:
-            return None
-
-        # If the attribute has not been set, none of the WACZ could be opened.
-        if self.crawl and not hasattr(self, 'wacz'):
-            raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")
+    def _check_ignore_conditions(self, request: Request, spider: Spider) -> None:
+        """Check the conditions for ignoring a request and raise IgnoreRequest if one applies."""
 
         # Ignore when crawling and flag indicates this request needs to be skipped during WACZ crawl
         if "wacz_crawl_skip" in request.flags:
@@ -35,12 +29,24 @@ def process_request(self, request: Request, spider: Spider):
         if self._is_off_site(request.url, spider):
             self.stats.inc_value("webarchive/crawl_skip/off_site", spider=spider)
             raise IgnoreRequest()
-    
+
         # Ignore disallowed pages (to avoid crawling e.g. redirects from whitelisted pages to unwanted ones)
         if self._is_disallowed_by_spider(request.url, spider):
             self.stats.inc_value("webarchive/crawl_skip/disallowed", spider=spider)
             raise IgnoreRequest()
 
+    def process_request(self, request: Request, spider: Spider):
+        # Continue default crawl behaviour
+        if not self.crawl:
+            return None
+
+        # If the attribute has not been set, none of the WACZ files could be opened.
+        if self.crawl and not hasattr(self, 'wacz'):
+            raise WaczMiddlewareException("Could not open any WACZ files, check your WACZ URIs and authentication.")
+
+        # Check if the request should be ignored
+        self._check_ignore_conditions(request, spider)
+
         # Get record from existing index entry, or else lookup by URL
         if request.meta.get("cdxj_record"):
             warc_record = self.wacz.get_warc_from_cdxj_record(cdxj_record=request.meta["cdxj_record"])