From 11b5d85941926089758048a9aabe3c27fc5a3f76 Mon Sep 17 00:00:00 2001 From: Wesley van Lee Date: Wed, 9 Oct 2024 16:21:05 +0200 Subject: [PATCH] Update README.md --- README.md | 22 +++++++++------------- scrapy_webarchive/middleware.py | 2 +- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index ae94e75..e42806f 100644 --- a/README.md +++ b/README.md @@ -1,21 +1,17 @@ # Scrapy Webarchive -A Web Archive extension for Scrapy +Scrapy Webarchive is a plugin for Scrapy that allows users to capture and export web archives in the WARC and WACZ formats during crawling. +## Features -# Installation +* Save web crawls in WACZ format (multiple storages supported; local and cloud). +* Crawl against WACZ format archives. +* Integrate seamlessly with Scrapy’s spider request and response cycle. -Add to your `settings.py` or your spider configuration. +## Compatibility -```python -EXTENSIONS = { - 'scrapy_webarchive.extensions.WaczExporter': 543, -} +* Python 3.8+ -DOWNLOADER_MIDDLEWARES = { - 'scrapy_webarchive.downloadermiddlewares.WaczMiddleware': 543, -} +## Documentation -# year, month, day and timestamp are the supported template variables that you can use. -ARCHIVE_EXPORT_URI = 's3://scrapy-webarchive/{year}/{month}/{day}/' -``` +Documentation is available online at [developers.thequestionmark.org/scrapy-webarchive/](https://developers.thequestionmark.org/scrapy-webarchive/) diff --git a/scrapy_webarchive/middleware.py b/scrapy_webarchive/middleware.py index 2b53155..8d3c95d 100644 --- a/scrapy_webarchive/middleware.py +++ b/scrapy_webarchive/middleware.py @@ -58,7 +58,7 @@ def process_start_requests(self, start_requests: Iterable[Request], spider: Spid url = entry["url"] # filter out off-site responses - if hasattr(spider, 'allowed_domains') and urlparse(url).hostname not in spider.allowed_domains: + if hasattr(spider, "allowed_domains") and urlparse(url).hostname not in spider.allowed_domains: continue # only accept whitelisted responses if requested by spider