diff --git a/docs/index.md b/docs/index.md index b28ffeb..aef4de2 100644 --- a/docs/index.md +++ b/docs/index.md @@ -7,4 +7,7 @@ - Crawl against WACZ format archives. - Integrate seamlessly with Scrapy’s spider request and response cycle. +## Limitations +- WACZ supports saving images, but this module does not yet integrate with Scrapy's image/file pipeline for retrieving images/files from the WACZ. Support for this feature is planned. + **Source Code**: https://github.com/q-m/scrapy-webarchive diff --git a/docs/settings.md b/docs/settings.md index ccd7392..ad86796 100644 --- a/docs/settings.md +++ b/docs/settings.md @@ -1,37 +1,45 @@ # Settings -`scrapy-webarchive` makes use of the following settings, in addition to Scrapy's settings: +`scrapy-webarchive` makes use of the following settings, in addition to Scrapy's settings. Note that all settings are prefixed with `SW_`. ## Extensions -### `ARCHIVE_EXPORT_URI` +### `SW_EXPORT_URI` ```python -ARCHIVE_EXPORT_URI = "s3://scrapy-webarchive/" -ARCHIVE_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/" +SW_EXPORT_URI = "s3://scrapy-webarchive/" +SW_EXPORT_URI = "s3://scrapy-webarchive/{year}/{month}/{day}/" ``` This is the output path of the WACZ file. Variables can be included to generate the output path dynamically. Supported variables: `year`, `month`, `day` and `timestamp`. -## Downloader middleware +## Downloader middleware and spider middleware -### `WACZ_SOURCE_URL` +### `SW_WACZ_SOURCE_URL` ```python -WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz" +SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz" # Allows multiple sources, comma separated. -WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz" +SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz" ``` This setting defines the location of the WACZ file that should be used as a source for the crawl job. -### `WACZ_CRAWL` +### `SW_WACZ_CRAWL` ```python -WACZ_CRAWL = True +SW_WACZ_CRAWL = True ``` Setting to ignore the original `start_requests` and instead yield all responses found in the WACZ archive. + +### `SW_WACZ_TIMEOUT` + +```python +SW_WACZ_TIMEOUT = 60 +``` + +Transport parameter (timeout) used when retrieving the WACZ file from the location defined in `SW_WACZ_SOURCE_URL`. diff --git a/docs/usage.md b/docs/usage.md new file mode 100644 index 0000000..f2e1376 --- /dev/null +++ b/docs/usage.md @@ -0,0 +1,62 @@ +# Usage + +## Exporting + +### Exporting a WACZ archive + +To archive the requests/responses during a crawl job you need to enable the `WaczExporter` extension. + +```python +EXTENSIONS = { + "scrapy_webarchive.extensions.WaczExporter": 543, +} +``` + +This extension also requires you to set the export location using the `SW_EXPORT_URI` setting. + +```python +SW_EXPORT_URI = "s3://scrapy-webarchive/" +``` + +Running a crawl job using these settings will result in a newly created WACZ file. + +## Crawling + +There are two ways to crawl against a WACZ archive. Choose the strategy you want to use for your crawl job and follow the instructions below. Using both strategies at the same time is not allowed. + +## Lookup in a WACZ archive + +One of the ways to crawl against a WACZ archive is to use the `WaczMiddleware` downloader middleware. Instead of fetching the live resource, the middleware retrieves it from the archive and recreates a response using the archived data.
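Spider code itself does not need any archive-specific changes for this strategy. As a rough sketch (the spider name, start URL, and CSS selectors below are hypothetical and purely illustrative, not part of this package), an ordinary spider keeps yielding requests as usual; once the middleware is enabled as shown below, those requests are answered from the WACZ archive rather than the live site:

```python
import scrapy


class QuotesSpider(scrapy.Spider):
    # Hypothetical example spider; any existing spider can be used unchanged.
    name = "quotes"
    start_urls = ["https://quotes.toscrape.com"]

    def parse(self, response):
        # With WaczMiddleware enabled, this response is reconstructed from the
        # matching record in the WACZ archive instead of a live HTTP request.
        for quote in response.css("div.quote"):
            yield {"text": quote.css("span.text::text").get()}
```

The middleware only changes where the response data comes from; parsing and item extraction work exactly as they would against the live site.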
+ To use the downloader middleware, enable it in the settings like so: ```python DOWNLOADER_MIDDLEWARES = { "scrapy_webarchive.downloadermiddlewares.WaczMiddleware": 543, } ``` Then define the location of the WACZ archive with the `SW_WACZ_SOURCE_URL` setting: ```python SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz" ``` ## Iterating a WACZ archive When enabled, the `WaczCrawlMiddleware` spider middleware bypasses the spider's default behaviour and replaces the crawl with an iteration over all entries in the WACZ archive. To use the spider middleware, enable it in the settings like so: ```python SPIDER_MIDDLEWARES = { "scrapy_webarchive.middleware.WaczCrawlMiddleware": 532, } ``` Then define the location of the WACZ archive with the `SW_WACZ_SOURCE_URL` setting: ```python SW_WACZ_SOURCE_URL = "s3://scrapy-webarchive/archive.wacz" SW_WACZ_CRAWL = True ``` diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py index 32f134b..ff1fdb2 100644 --- a/scrapy_webarchive/downloadermiddlewares.py +++ b/scrapy_webarchive/downloadermiddlewares.py @@ -25,14 +25,14 @@ class WaczMiddleware: def __init__(self, settings: Settings, stats: StatsCollector) -> None: self.stats = stats - wacz_url = settings.get("WACZ_SOURCE_URL", None) + wacz_url = settings.get("SW_WACZ_SOURCE_URL", None) if not wacz_url: raise NotConfigured self.wacz_urls = re.split(r"\s*,\s*", wacz_url) - self.crawl = settings.get("WACZ_CRAWL", False) - self.timeout = settings.getfloat("WACZ_TIMEOUT", 60) + self.crawl = settings.get("SW_WACZ_CRAWL", False) + self.timeout = settings.getfloat("SW_WACZ_TIMEOUT", 60) @classmethod def from_crawler(cls, crawler: Crawler) -> Self: diff --git a/scrapy_webarchive/extensions.py b/scrapy_webarchive/extensions.py index 5723e1b..d909443 100644 --- a/scrapy_webarchive/extensions.py +++ b/scrapy_webarchive/extensions.py @@ -31,14 +31,14 @@ def __init__(self, settings: Settings, crawler: Crawler) -> None: self.settings = settings self.stats = crawler.stats - if not self.settings["ARCHIVE_EXPORT_URI"]: + if not self.settings["SW_EXPORT_URI"]: raise NotConfigured self.store = self._get_store() self.writer = WarcFileWriter(collection_name=crawler.spider.name) def _get_store(self): - archive_uri_template = self.settings["ARCHIVE_EXPORT_URI"] + archive_uri_template = self.settings["SW_EXPORT_URI"] uri = archive_uri_template.format(**get_archive_uri_template_variables()) if Path(uri).is_absolute(): # to support win32 paths like: C:\\some\dir diff --git a/scrapy_webarchive/middleware.py b/scrapy_webarchive/middleware.py index dfdf6f1..f14393d 100644 --- a/scrapy_webarchive/middleware.py +++ b/scrapy_webarchive/middleware.py @@ -16,14 +16,14 @@ class WaczCrawlMiddleware: def __init__(self, settings: Settings, stats: StatsCollector) -> None: self.stats = stats - wacz_url = settings.get("WACZ_SOURCE_URL", None) + wacz_url = settings.get("SW_WACZ_SOURCE_URL", None) if not wacz_url: raise NotConfigured self.wacz_urls = re.split(r"\s*,\s*", wacz_url) - self.crawl = settings.get("WACZ_CRAWL", False) - self.timeout = settings.getfloat("WACZ_TIMEOUT", 60) + self.crawl = settings.get("SW_WACZ_CRAWL", False) + self.timeout = settings.getfloat("SW_WACZ_TIMEOUT", 60) @classmethod def from_crawler(cls, crawler: Crawler) -> Self: diff --git a/tests/test_downloadermiddlewares.py b/tests/test_downloadermiddlewares.py index 424e59b..9bba363 100644 --- a/tests/test_downloadermiddlewares.py +++ 
b/tests/test_downloadermiddlewares.py @@ -17,9 +17,9 @@ def setup_method(self): def _get_settings(self, **new_settings): settings = { - "WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(), - "WACZ_CRAWL": False, - "WACZ_TIMEOUT": 60, + "SW_WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(), + "SW_WACZ_CRAWL": False, + "SW_WACZ_TIMEOUT": 60, } settings.update(new_settings) return Settings(settings) diff --git a/tests/test_extensions.py b/tests/test_extensions.py index 9af120d..0baa50d 100644 --- a/tests/test_extensions.py +++ b/tests/test_extensions.py @@ -19,22 +19,22 @@ def test_archive_export_uri_invalid_raises_not_configured(self): @mock.patch('scrapy_webarchive.extensions.FTPFilesStore.__init__', return_value=None) @mock.patch('scrapy_webarchive.extensions.FSFilesStore.__init__', return_value=None) def test_get_store(self, *args): - crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "/tmp/scrapy-webarchive/wacz/"}) + crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "/tmp/scrapy-webarchive/wacz/"}) crawler.spider = crawler._create_spider("quotes") extension = WaczExporter.from_crawler(crawler) assert isinstance(extension.store, FSFilesStore) - crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "s3://scrapy-webarchive/wacz/"}) + crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "s3://scrapy-webarchive/wacz/"}) crawler.spider = crawler._create_spider("quotes") extension = WaczExporter.from_crawler(crawler) assert isinstance(extension.store, S3FilesStore) - crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "gs://scrapy-webarchive/wacz/"}) + crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "gs://scrapy-webarchive/wacz/"}) crawler.spider = crawler._create_spider("quotes") extension = WaczExporter.from_crawler(crawler) assert isinstance(extension.store, GCSFilesStore) - crawler = get_crawler(settings_dict={"ARCHIVE_EXPORT_URI": "ftp://scrapy-webarchive/wacz/"}) + crawler = get_crawler(settings_dict={"SW_EXPORT_URI": "ftp://scrapy-webarchive/wacz/"}) crawler.spider = crawler._create_spider("quotes") extension = WaczExporter.from_crawler(crawler) assert isinstance(extension.store, FTPFilesStore) diff --git a/tests/test_middleware.py b/tests/test_middleware.py index 36edc32..76d7644 100644 --- a/tests/test_middleware.py +++ b/tests/test_middleware.py @@ -16,8 +16,8 @@ def setup_method(self): def _get_settings(self, **new_settings): settings = { - "WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(), - "WACZ_TIMEOUT": 60, + "SW_WACZ_SOURCE_URL": get_test_data_path("warc_1_1", "quotes.wacz.gz").as_uri(), + "SW_WACZ_TIMEOUT": 60, } settings.update(new_settings) return Settings(settings) @@ -32,25 +32,25 @@ def _middleware(self, **new_settings): def test_wacz_archive_is_ignored_follow_original_behaviour(self): request = Request("https://quotes.toscrape.com") - with self._middleware(WACZ_CRAWL=False) as mw: + with self._middleware(SW_WACZ_CRAWL=False) as mw: out = list(mw.process_start_requests([request], self.spider)) assert out == [request] def test_wacz_archive_iterates_all_records(self): - with self._middleware(WACZ_CRAWL=True) as mw: + with self._middleware(SW_WACZ_CRAWL=True) as mw: out = list(mw.process_start_requests([], self.spider)) assert len(out) == 101 def test_wacz_archive_filters_allowed_domains(self): setattr(self.spider, "allowed_domains", "quotes.toscrape.com") - with self._middleware(WACZ_CRAWL=True) as mw: + with self._middleware(SW_WACZ_CRAWL=True) as mw: out = 
list(mw.process_start_requests([], self.spider)) assert len(out) == 61 def test_wacz_archive_filters_archive_regex(self): setattr(self.spider, "archive_regex", r"https://quotes\.toscrape\.com/page/\d+/") - with self._middleware(WACZ_CRAWL=True) as mw: + with self._middleware(SW_WACZ_CRAWL=True) as mw: out = list(mw.process_start_requests([], self.spider)) assert len(out) == 9