diff --git a/docs/settings.md b/docs/settings.md
index e24b1e2..f238722 100644
--- a/docs/settings.md
+++ b/docs/settings.md
@@ -7,9 +7,15 @@
 ### `SW_EXPORT_URI`
 
 ```python
+# Either configure the directory where the output should be uploaded to
 SW_EXPORT_URI = "s3://scrapy-webarchive/"
 SW_EXPORT_URI = "s3://scrapy-webarchive/{spider}/"
 SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}/"
+
+# OR add the file name for full control of the output
+SW_EXPORT_URI = "s3://scrapy-webarchive/output.wacz"
+SW_EXPORT_URI = "s3://scrapy-webarchive/{spider}/output-{timestamp}.wacz"
+SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/{spider}-{timestamp}.wacz"
 ```
 
 This is the output path of the WACZ file. Multiple variables can be added that allow dynamic generation of the output path.
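The template above is resolved with plain `str.format` semantics: the extension fills `{spider}` from the running spider and the date/time fields via `get_archive_uri_template_dt_variables()`. A minimal sketch of that resolution; the exact `{timestamp}` format used here is an assumption, not something this diff defines:

```python
from datetime import datetime, timezone


def resolve_export_uri(template: str, spider_name: str) -> str:
    """Fill the SW_EXPORT_URI placeholders the way str.format would (illustrative helper)."""
    now = datetime.now(timezone.utc)
    return template.format(
        spider=spider_name,
        year=now.strftime("%Y"),
        month=now.strftime("%m"),
        day=now.strftime("%d"),
        timestamp=now.strftime("%Y%m%d%H%M%S"),  # assumed format, not taken from the source
    )


# Directory-style URI: the WACZ filename is generated later.
print(resolve_export_uri("s3://scrapy-webarchive/{year}/{month}/{day}/{spider}/", "quotes"))
# File-style URI: the filename is pinned by the template itself.
print(resolve_export_uri("s3://scrapy-webarchive/{spider}/output-{timestamp}.wacz", "quotes"))
```

A trailing slash keeps the resolved URI a directory, so the default WACZ name is used; ending the template in `.wacz` takes full control of the output name, which is the distinction the extension change below relies on.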
"SPIDER_MIDDLEWARES"), + ("scrapy_webarchive.downloadermiddlewares.WaczMiddleware", "DOWNLOADER_MIDDLEWARES"), + ] + if any(middleware in self.settings.getlist(key) for middleware, key in forbidden_middleware): + raise NotConfigured("Disable WACZ middlewares in SPIDER_MIDDLEWARES and DOWNLOADER_MIDDLEWARES.") + + def _retrieve_store_uri_and_wacz_fname(self) -> Tuple[str, Union[str, None]]: + """Sets up the export URI based on configuration and spider context.""" + + export_uri = self.settings["SW_EXPORT_URI"].format( + spider=self.crawler.spider.name, **get_archive_uri_template_dt_variables(), - }) - store_cls = self.STORE_SCHEMES[get_scheme_from_uri(uri)] - return store_cls(uri) + ) + + if os.path.isdir(export_uri): + return export_uri, None + else: + return os.path.split(export_uri) + + def _get_store(self, store_uri: str) -> FilesStoreProtocol: + store_cls = self.STORE_SCHEMES[get_scheme_from_uri(store_uri)] + return store_cls(store_uri) @classmethod def from_crawler(cls, crawler: Crawler) -> Self: @@ -134,8 +157,9 @@ def response_received(self, response: Response, request: Request, spider: Spider def spider_closed(self, spider: Spider) -> None: wacz_creator = WaczFileCreator( - store=self.store, - warc_fname=self.writer.warc_fname, + store=self.store, + warc_fname=self.writer.warc_fname, + wacz_fname=self.wacz_fname, collection_name=spider.name, title=self.settings["SW_WACZ_TITLE"], description=self.settings["SW_WACZ_DESCRIPTION"], diff --git a/scrapy_webarchive/spidermiddlewares.py b/scrapy_webarchive/spidermiddlewares.py index f4c4245..1d8b4e5 100644 --- a/scrapy_webarchive/spidermiddlewares.py +++ b/scrapy_webarchive/spidermiddlewares.py @@ -47,7 +47,7 @@ def spider_opened(self, spider: Spider) -> None: process, and collects valid WACZ files for further use. If only one WACZ URI is provided, it opens and assigns the file to `self.wacz` as a `WaczFile` instance. - If multiple URIs are provided, valid files are grouped and assigned to `self.wacz` as a `MultiWaczFile` instance. + If multiple URIs are provided, valid files are assigned to `self.wacz` as a `MultiWaczFile` instance. 
""" spider.logger.info(f"[WACZDownloader] Found {len(self.wacz_uris)} WACZ URI(s) to open") @@ -62,7 +62,9 @@ def spider_opened(self, spider: Spider) -> None: spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}") if wacz_files: - spider.logger.info(f"[WACZDownloader] Continuing with {len(wacz_files)}/{len(self.wacz_uris)} valid WACZ files") + spider.logger.info( + f"[WACZDownloader] Continuing with {len(wacz_files)}/{len(self.wacz_uris)} valid WACZ files" + ) if len(wacz_files) == 1: self.wacz = WaczFile(wacz_files[0]) else: diff --git a/scrapy_webarchive/wacz.py b/scrapy_webarchive/wacz.py index 055136a..6c5c04b 100644 --- a/scrapy_webarchive/wacz.py +++ b/scrapy_webarchive/wacz.py @@ -41,13 +41,23 @@ class WaczFileCreator: hash_type = "sha256" datapackage_fname = "datapackage.json" - def __init__(self, store: 'FilesStoreProtocol', warc_fname: str, collection_name: str, title: str, description: str, cdxj_fname: str = "index.cdxj") -> None: + def __init__( + self, + store: 'FilesStoreProtocol', + warc_fname: str, + collection_name: str, + title: str, + description: str, + wacz_fname: Union[str, None], + cdxj_fname: str = "index.cdxj", + ) -> None: self.store = store self.warc_fname = warc_fname self.cdxj_fname = cdxj_fname self.collection_name = collection_name self._title = title self._description = description + self.wacz_fname = wacz_fname or self.get_wacz_fname() def create(self) -> None: """Create the WACZ file from the WARC and CDXJ index and save it in the configured store.""" @@ -63,7 +73,7 @@ def create(self) -> None: # Save WACZ to the storage zip_buffer.seek(0) - self.store.persist_file(path=self.get_wacz_fname(), buf=zip_buffer, info=None) + self.store.persist_file(path=self.wacz_fname, buf=zip_buffer, info=None) def create_wacz_zip(self) -> io.BytesIO: """Create the WACZ zip file and return the in-memory buffer.""" @@ -160,12 +170,15 @@ def collect_resources(self, zip_file: zipfile.ZipFile) -> List[Dict[str, Any]]: @property def title(self): return self._title or self.collection_name - + @property def description(self): - return self._description or "This is the web archive generated by a scrapy-webarchive extension for the " \ - f"{self.collection_name} spider. It is mainly for scraping purposes as it does not contain " \ - "any js/css data. Though it can be replayed as bare HTML if the site does not depend on JavaScript." + return ( + self._description + or f"This is the web archive generated by a scrapy-webarchive extension for the {self.collection_name} " + "spider. It is mainly for scraping purposes as it does not contain any js/css data. Though it can be " + "replayed as bare HTML if the site does not depend on JavaScript." 
diff --git a/tests/test_wacz.py b/tests/test_wacz.py
index 8d6da25..871d14f 100644
--- a/tests/test_wacz.py
+++ b/tests/test_wacz.py
@@ -30,6 +30,7 @@ def wacz_file_creator(self):
             cdxj_fname=self.cdxj_fname,
             title="Testing",
             description="WACZ generated durning a unit-test",
+            wacz_fname=None,
         )
 
     @freeze_time("2024-10-04 08:27:11")
diff --git a/tests/test_warc.py b/tests/test_warc.py
index 535e3db..31313f8 100644
--- a/tests/test_warc.py
+++ b/tests/test_warc.py
@@ -24,7 +24,11 @@ def test_generate_warc_fname(monkeypatch):
 
 @pytest.fixture
 def warc_record_response():
-    payload = b"""HTTP/1.0 200\r\nContent-Length: 11064\r\nDate: Mon, 07 Oct 2024 09:58:44 GMT\r\nContent-Type: text/html; charset=utf-8\r\nStrict-Transport-Security: max-age=0; includeSubDomains; preload\r\n\r\n\nWelcome to scrapy-webarchive!"""
+    payload = (
+        b"HTTP/1.0 200\r\nContent-Length: 11064\r\nDate: Mon, 07 Oct 2024 09:58:44 GMT\r\nContent-Type: text/html; "
+        b"charset=utf-8\r\nStrict-Transport-Security: max-age=0; includeSubDomains; preload\r\n\r\n\n"
+        b"Welcome to scrapy-webarchive!"
+    )
     return WARCRecord(payload=payload, headers={"WARC-Target-URI": "http://example.com"})
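A possible companion test for the new parameter, not part of this changeset: it assumes, per the `__init__` shown earlier, that construction only assigns attributes and never touches the store (so a `None` placeholder is enough), and that an explicit name bypasses `get_wacz_fname()` entirely:

```python
from scrapy_webarchive.wacz import WaczFileCreator


def test_explicit_wacz_fname_is_used_as_is():
    """Hypothetical test: an explicit wacz_fname should override the generated default."""
    creator = WaczFileCreator(
        store=None,  # never used during construction in the __init__ shown above
        warc_fname="archive.warc.gz",
        collection_name="example",
        title="Testing",
        description="",
        wacz_fname="custom-name.wacz",
    )
    assert creator.wacz_fname == "custom-name.wacz"
```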