WARC lookup for remote zipfile (WACZ) (#23)
* Add remote zipfile implementation with range requests

* Disable python3.7 workflow for now

* Update docs and drop support for Python 3.7

* Store WACZ entries with store mode in ZIP file

* Raise exception for unsupported scheme in URI

* Slight optimization by avoiding unnecessary list comprehension

---------

Co-authored-by: Wesley van Lee <[email protected]>
leewesleyv and Wesley van Lee authored Dec 31, 2024
1 parent 70e90ab commit 2db01ad
Showing 25 changed files with 794 additions and 392 deletions.
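The first bullet in the commit message above describes reading a remote WACZ (a ZIP archive) via HTTP range requests instead of downloading it in full; storing entries with ZIP's store (uncompressed) mode keeps each member's bytes directly addressable by offset. The sketch below is a hypothetical, minimal illustration of that technique, not the code added by this commit; the `requests` dependency and the example URL are assumptions.

```python
import io
import zipfile

import requests  # assumed helper dependency for this sketch


class HttpRangeFile(io.RawIOBase):
    """Seekable, read-only file object that fetches bytes via HTTP Range requests."""

    def __init__(self, url: str) -> None:
        self.url = url
        self.pos = 0
        # A HEAD request gives the total size, so zipfile can seek to the end
        # and locate the central directory without downloading the archive.
        self.size = int(requests.head(url, allow_redirects=True).headers["Content-Length"])

    def seekable(self) -> bool:
        return True

    def readable(self) -> bool:
        return True

    def tell(self) -> int:
        return self.pos

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        if whence == io.SEEK_SET:
            self.pos = offset
        elif whence == io.SEEK_CUR:
            self.pos += offset
        else:  # io.SEEK_END
            self.pos = self.size + offset
        return self.pos

    def read(self, n: int = -1) -> bytes:
        if n == -1 or self.pos + n > self.size:
            n = self.size - self.pos
        if n <= 0:
            return b""
        # Fetch only the requested byte window.
        headers = {"Range": f"bytes={self.pos}-{self.pos + n - 1}"}
        data = requests.get(self.url, headers=headers).content
        self.pos += len(data)
        return data


# zipfile only needs seek/tell/read, so it can list and open members remotely.
with zipfile.ZipFile(HttpRangeFile("https://example.com/archive.wacz")) as zf:  # hypothetical URL
    print(zf.namelist())
```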
3 changes: 0 additions & 3 deletions .github/workflows/test.yml
@@ -31,16 +31,13 @@ jobs:
    strategy:
      matrix:
        toxenv:
-         - py37-scrapy29
          - py312-scrapy29
          - py38-scrapy210
          - py312-scrapy210
          - py38-scrapy211
          - py312-scrapy211
          - py312-scrapymaster
      include:
-       - toxenv: py37-scrapy29
-         python-version: 3.7
        - toxenv: py312-scrapy29
          python-version: '3.12'

2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ Scrapy Webarchive is a plugin for Scrapy that allows users to capture and export

## Compatibility

-* Python 3.7, 3.8, 3.9, 3.10, 3.11 and 3.12
+* Python 3.8, 3.9, 3.10, 3.11 and 3.12

## Documentation

5 changes: 4 additions & 1 deletion docs/settings.md
@@ -39,11 +39,14 @@ This setting defines the description of the WACZ used in the `datapackage.json`,

### `SW_WACZ_SOURCE_URI`

+⚠️ Scraping against a remote source currently only supports AWS S3.

```python
SW_WACZ_SOURCE_URI = "file:///Users/username/Documents/archive.wacz"
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"

# Allows multiple sources, comma separated.
-SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz,file:///Users/username/Documents/archive.wacz"
```

This setting defines the location of the WACZ file that should be used as a source for the crawl job.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
    "wacz==0.5.0",
    "cdxj-indexer==1.4.5",
]
-requires-python = ">=3.7,<3.13"
+requires-python = ">=3.8,<3.13"
authors = []
maintainers = []
description = "A webarchive extension for Scrapy"
2 changes: 1 addition & 1 deletion scrapy_webarchive/cdxj.py
@@ -8,7 +8,7 @@
from typing_extensions import TYPE_CHECKING, List

if TYPE_CHECKING:
-    from scrapy_webarchive.wacz import WaczFile
+    from scrapy_webarchive.wacz.wacz_file import WaczFile

CDXREC = re.compile(
    r"^(?P<surt>(?P<host>[^\)\s]+)\)(?P<path>[^\?\s]+)?(\?(?P<query>\S+))?)"
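For readers unfamiliar with CDXJ, the snippet below applies just the visible prefix of `CDXREC` to a made-up record line; the real pattern continues past this hunk with further groups, and the sample URL and values are illustrative only.

```python
import re

# Only the prefix of CDXREC shown above; the full pattern has additional groups.
CDX_PREFIX = re.compile(
    r"^(?P<surt>(?P<host>[^\)\s]+)\)(?P<path>[^\?\s]+)?(\?(?P<query>\S+))?)"
)

# Hypothetical CDXJ record: SURT key, timestamp, then a JSON block.
line = 'com,example)/index.html?lang=en 20241231120000 {"url": "https://example.com/index.html?lang=en"}'

match = CDX_PREFIX.match(line)
if match:
    print(match.group("host"))   # com,example
    print(match.group("path"))   # /index.html
    print(match.group("query"))  # lang=en
```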
6 changes: 6 additions & 0 deletions scrapy_webarchive/exceptions.py
@@ -2,3 +2,9 @@ class WaczMiddlewareException(Exception):
    """Indicates a critical issue in the middleware."""

    pass
+
+
+class UnsupportedURIException(Exception):
+    """Raised when the given URI scheme is not supported by the factory."""
+
+    pass
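As a rough illustration of how a storage-handler factory might use this exception, the sketch below dispatches on the URI scheme. Apart from `UnsupportedURIException` itself, the function name and placeholder return values are hypothetical, not the commit's actual API.

```python
from urllib.parse import urlparse

from scrapy_webarchive.exceptions import UnsupportedURIException


def get_storage_handler(uri: str):
    """Hypothetical dispatch on the URI scheme; unknown schemes are rejected."""
    scheme = urlparse(uri).scheme
    if scheme == "file":
        return "local-zip-handler"  # placeholder for a local ZIP handler
    if scheme == "s3":
        return "s3-range-handler"   # placeholder for a range-request S3 handler
    raise UnsupportedURIException(f"Unsupported URI scheme {scheme!r} in {uri!r}")
```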
2 changes: 1 addition & 1 deletion scrapy_webarchive/extensions.py
@@ -17,7 +17,7 @@
from typing_extensions import Any, Dict, Protocol, Self, Type, Union, cast

from scrapy_webarchive.utils import WARC_DT_FORMAT, get_formatted_dt_string, get_scheme_from_uri
-from scrapy_webarchive.wacz import WaczFileCreator
+from scrapy_webarchive.wacz.creator import WaczFileCreator
from scrapy_webarchive.warc import WarcFileWriter

18 changes: 9 additions & 9 deletions scrapy_webarchive/spidermiddlewares.py
@@ -11,7 +11,8 @@
from typing_extensions import Iterable, Self, Union

from scrapy_webarchive.exceptions import WaczMiddlewareException
-from scrapy_webarchive.wacz import MultiWaczFile, WaczFile, open_wacz_file
+from scrapy_webarchive.wacz.storages import ZipStorageHandlerFactory
+from scrapy_webarchive.wacz.wacz_file import MultiWaczFile, WaczFile
from scrapy_webarchive.warc import record_transformer


@@ -55,20 +56,19 @@ def spider_opened(self, spider: Spider) -> None:

        for wacz_uri in self.wacz_uris:
            spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_uri}")
-            wacz_file = open_wacz_file(wacz_uri, self.timeout, spider.settings)
-            if wacz_file:
-                wacz_files.append(wacz_file)
-            else:
+            storage_handler = ZipStorageHandlerFactory.get_handler(wacz_uri, spider.settings)

+            if not storage_handler.zip_exists:
                spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}")
                continue

+            wacz_files.append(WaczFile(storage_handler=storage_handler))

        if wacz_files:
            spider.logger.info(
                f"[WACZDownloader] Continuing with {len(wacz_files)}/{len(self.wacz_uris)} valid WACZ files"
            )
-            if len(wacz_files) == 1:
-                self.wacz = WaczFile(wacz_files[0])
-            else:
-                self.wacz = MultiWaczFile(wacz_files)
+            self.wacz = wacz_files[0] if len(wacz_files) == 1 else MultiWaczFile(wacz_files)

        # If there are no wacz_files, we raise a `WaczMiddlewareException` in the downloader/spider middleware.
        # Raising an exception here does not stop the job from running. If there are no valid WACZ files configured
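The storages module itself is not part of this excerpt, so purely as a hedged sketch, the `zip_exists` check the middleware relies on above might look roughly like this for a local and an S3 handler; the class names and constructor arguments are assumptions.

```python
import os
from urllib.parse import urlparse

from botocore.exceptions import ClientError


class LocalZipStorageHandler:
    """Hypothetical local handler: the archive exists if the file is on disk."""

    def __init__(self, uri: str) -> None:
        self.path = urlparse(uri).path

    @property
    def zip_exists(self) -> bool:
        return os.path.isfile(self.path)


class S3ZipStorageHandler:
    """Hypothetical S3 handler: a HEAD request tells us whether the object exists."""

    def __init__(self, uri: str, client) -> None:
        parsed = urlparse(uri)
        self.bucket = parsed.netloc
        self.key = parsed.path.lstrip("/")
        self.client = client  # e.g. a botocore S3 client

    @property
    def zip_exists(self) -> bool:
        try:
            self.client.head_object(Bucket=self.bucket, Key=self.key)
        except ClientError:
            return False
        return True
```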
49 changes: 1 addition & 48 deletions scrapy_webarchive/utils.py
@@ -5,9 +5,7 @@
from datetime import datetime, timezone
from pathlib import Path
from typing import IO, Tuple
-from urllib.parse import urlparse, urlunparse
-
-from scrapy.settings import Settings
+from urllib.parse import urlparse

WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S"

@@ -36,51 +34,6 @@ def get_scheme_from_uri(uri: str) -> str:
    return urlparse(uri).scheme


-def get_s3_client(settings: Settings):
-    """Create an S3 client using the given settings."""
-
-    import botocore.session
-    session = botocore.session.get_session()
-    return session.create_client(
-        "s3",
-        aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
-        aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
-        aws_session_token=settings["AWS_SESSION_TOKEN"],
-        endpoint_url=settings["AWS_ENDPOINT_URL"],
-        region_name=settings["AWS_REGION_NAME"],
-        use_ssl=settings["AWS_USE_SSL"],
-        verify=settings["AWS_VERIFY"],
-    )
-
-
-def get_gcs_client(settings: Settings):
-    """Create a Google Cloud Storage client using the given settings."""
-
-    from google.cloud import storage
-    return storage.Client(project=settings["GCS_PROJECT_ID"])
-
-
-def add_ftp_credentials(wacz_uri: str, settings: Settings) -> str:
-    """Add FTP username and password to the URI if not present."""
-
-    parsed_uri = urlparse(wacz_uri)
-
-    if parsed_uri.username is None:
-        # Build netloc with credentials.
-        credentials = f'{settings["FTP_USER"]}:{settings["FTP_PASSWORD"]}'
-        netloc = f'{credentials}@{parsed_uri.hostname}'
-
-        # Add port if present.
-        if parsed_uri.port:
-            netloc += f":{parsed_uri.port}"
-
-        # Update and return the URI with credentials.
-        updated_uri = parsed_uri._replace(netloc=netloc)
-        return urlunparse(updated_uri)
-
-    return wacz_uri
-
-
def hash_stream(hash_type: str, stream: IO) -> Tuple[int, str]:
    """Hashes the stream with given hash_type hasher."""

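The body of `hash_stream` lies outside this hunk; based only on its signature and docstring, a plausible sketch is shown below. The chunk size and the `"<hash_type>:<hexdigest>"` return format are assumptions, not the actual implementation.

```python
import hashlib
from typing import IO, Tuple


def hash_stream(hash_type: str, stream: IO) -> Tuple[int, str]:
    """Hashes the stream with given hash_type hasher (sketch, not the actual body)."""
    hasher = hashlib.new(hash_type)
    size = 0
    # Read in chunks so large WACZ/WARC payloads are never fully in memory.
    for chunk in iter(lambda: stream.read(64 * 1024), b""):
        size += len(chunk)
        hasher.update(chunk)
    # Assumed return format: total bytes read and "<hash_type>:<hexdigest>".
    return size, f"{hash_type}:{hasher.hexdigest()}"
```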