From c27ef4c8710f356d9445af4a09dfb13dc07e558f Mon Sep 17 00:00:00 2001 From: <> Date: Fri, 10 Jan 2025 09:21:25 +0000 Subject: [PATCH] Deployed 92146c3 with MkDocs version: 1.5.3 --- .nojekyll | 0 404.html | 392 + advanced_usage/index.html | 651 ++ assets/images/favicon.png | Bin 0 -> 1870 bytes assets/javascripts/bundle.3220b9d7.min.js | 29 + assets/javascripts/bundle.3220b9d7.min.js.map | 7 + assets/javascripts/lunr/min/lunr.ar.min.js | 1 + assets/javascripts/lunr/min/lunr.da.min.js | 18 + assets/javascripts/lunr/min/lunr.de.min.js | 18 + assets/javascripts/lunr/min/lunr.du.min.js | 18 + assets/javascripts/lunr/min/lunr.el.min.js | 1 + assets/javascripts/lunr/min/lunr.es.min.js | 18 + assets/javascripts/lunr/min/lunr.fi.min.js | 18 + assets/javascripts/lunr/min/lunr.fr.min.js | 18 + assets/javascripts/lunr/min/lunr.he.min.js | 1 + assets/javascripts/lunr/min/lunr.hi.min.js | 1 + assets/javascripts/lunr/min/lunr.hu.min.js | 18 + assets/javascripts/lunr/min/lunr.hy.min.js | 1 + assets/javascripts/lunr/min/lunr.it.min.js | 18 + assets/javascripts/lunr/min/lunr.ja.min.js | 1 + assets/javascripts/lunr/min/lunr.jp.min.js | 1 + assets/javascripts/lunr/min/lunr.kn.min.js | 1 + assets/javascripts/lunr/min/lunr.ko.min.js | 1 + assets/javascripts/lunr/min/lunr.multi.min.js | 1 + assets/javascripts/lunr/min/lunr.nl.min.js | 18 + assets/javascripts/lunr/min/lunr.no.min.js | 18 + assets/javascripts/lunr/min/lunr.pt.min.js | 18 + assets/javascripts/lunr/min/lunr.ro.min.js | 18 + assets/javascripts/lunr/min/lunr.ru.min.js | 18 + assets/javascripts/lunr/min/lunr.sa.min.js | 1 + .../lunr/min/lunr.stemmer.support.min.js | 1 + assets/javascripts/lunr/min/lunr.sv.min.js | 18 + assets/javascripts/lunr/min/lunr.ta.min.js | 1 + assets/javascripts/lunr/min/lunr.te.min.js | 1 + assets/javascripts/lunr/min/lunr.th.min.js | 1 + assets/javascripts/lunr/min/lunr.tr.min.js | 18 + assets/javascripts/lunr/min/lunr.vi.min.js | 1 + assets/javascripts/lunr/min/lunr.zh.min.js | 1 + 
assets/javascripts/lunr/tinyseg.js | 206 + assets/javascripts/lunr/wordcut.js | 6708 +++++++++++++++++ .../workers/search.b8dbb3d2.min.js | 42 + .../workers/search.b8dbb3d2.min.js.map | 7 + assets/stylesheets/main.66ac8b77.min.css | 1 + assets/stylesheets/main.66ac8b77.min.css.map | 1 + assets/stylesheets/palette.06af60db.min.css | 1 + .../stylesheets/palette.06af60db.min.css.map | 1 + index.html | 535 ++ installation/index.html | 438 ++ search/search_index.json | 1 + settings/index.html | 655 ++ sitemap.xml | 28 + sitemap.xml.gz | Bin 0 -> 257 bytes usage/index.html | 594 ++ 53 files changed, 10584 insertions(+) create mode 100644 .nojekyll create mode 100644 404.html create mode 100644 advanced_usage/index.html create mode 100644 assets/images/favicon.png create mode 100644 assets/javascripts/bundle.3220b9d7.min.js create mode 100644 assets/javascripts/bundle.3220b9d7.min.js.map create mode 100644 assets/javascripts/lunr/min/lunr.ar.min.js create mode 100644 assets/javascripts/lunr/min/lunr.da.min.js create mode 100644 assets/javascripts/lunr/min/lunr.de.min.js create mode 100644 assets/javascripts/lunr/min/lunr.du.min.js create mode 100644 assets/javascripts/lunr/min/lunr.el.min.js create mode 100644 assets/javascripts/lunr/min/lunr.es.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.fr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.he.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hu.min.js create mode 100644 assets/javascripts/lunr/min/lunr.hy.min.js create mode 100644 assets/javascripts/lunr/min/lunr.it.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ja.min.js create mode 100644 assets/javascripts/lunr/min/lunr.jp.min.js create mode 100644 assets/javascripts/lunr/min/lunr.kn.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ko.min.js create mode 100644 
assets/javascripts/lunr/min/lunr.multi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.nl.min.js create mode 100644 assets/javascripts/lunr/min/lunr.no.min.js create mode 100644 assets/javascripts/lunr/min/lunr.pt.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ro.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ru.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sa.min.js create mode 100644 assets/javascripts/lunr/min/lunr.stemmer.support.min.js create mode 100644 assets/javascripts/lunr/min/lunr.sv.min.js create mode 100644 assets/javascripts/lunr/min/lunr.ta.min.js create mode 100644 assets/javascripts/lunr/min/lunr.te.min.js create mode 100644 assets/javascripts/lunr/min/lunr.th.min.js create mode 100644 assets/javascripts/lunr/min/lunr.tr.min.js create mode 100644 assets/javascripts/lunr/min/lunr.vi.min.js create mode 100644 assets/javascripts/lunr/min/lunr.zh.min.js create mode 100644 assets/javascripts/lunr/tinyseg.js create mode 100644 assets/javascripts/lunr/wordcut.js create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js create mode 100644 assets/javascripts/workers/search.b8dbb3d2.min.js.map create mode 100644 assets/stylesheets/main.66ac8b77.min.css create mode 100644 assets/stylesheets/main.66ac8b77.min.css.map create mode 100644 assets/stylesheets/palette.06af60db.min.css create mode 100644 assets/stylesheets/palette.06af60db.min.css.map create mode 100644 index.html create mode 100644 installation/index.html create mode 100644 search/search_index.json create mode 100644 settings/index.html create mode 100644 sitemap.xml create mode 100644 sitemap.xml.gz create mode 100644 usage/index.html diff --git a/.nojekyll b/.nojekyll new file mode 100644 index 0000000..e69de29 diff --git a/404.html b/404.html new file mode 100644 index 0000000..e30f008 --- /dev/null +++ b/404.html @@ -0,0 +1,392 @@ + + + +
+ + + + + + + + + + + + + + +The wacz_crawl_skip
flag is applied to requests that should be ignored by the crawler. When this flag is present, the middleware intercepts the request and prevents it from being processed further, skipping both download and parsing. This is useful in scenarios where the request should not be collected during a scraping session. Usage:
yield Request(url, callback=cb_func, flags=["wacz_crawl_skip"])
+
+When this happens, the statistic webarchive/crawl_skip
is increased.
If the spider has the attribute archive_disallow_regexp
, all requests returned from the spider that match this regular expression are ignored. For example, when a product page was returned in start_requests
, but the product page disappeared and redirected to its category page, the category page can be disallowed, so as to avoid crawling the whole category, which would take much more time and could lead to unknown URLs (e.g. the spider's requested pagination size could be different from the website default).
When this happens, the statistic wacz/crawl_skip/disallowed
is increased.
When using a WACZ file that is not generated by your own spiders, it might be that the spider for crawling is not in place. In order to crawl this WACZ you need to tailor a spider to work with this specific WACZ file. This will require building the spider differently from how it would be built for a live resource.
+Going around the default behaviour of the spider, the WaczCrawlMiddleware
spider middleware will, when enabled, replace the crawl with an iteration through all the entries in the WACZ archive index.
To use this strategy, enable both the spider middleware and the downloader middleware in the spider settings like so:
+DOWNLOADER_MIDDLEWARES = {
+ "scrapy_webarchive.downloadermiddlewares.WaczMiddleware": 543,
+}
+
+SPIDER_MIDDLEWARES = {
+ "scrapy_webarchive.spidermiddlewares.WaczCrawlMiddleware": 543,
+}
+
+Then define the location of the WACZ archive with the SW_WACZ_SOURCE_URI
setting:
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"
+SW_WACZ_CRAWL = True
+
Not all URLs will be interesting for the crawl since your WACZ will most likely contain static files such as fonts, JavaScript (website and external), stylesheets, etc. In order to improve the performance of the spider by not reading all the irrelevant request/response entries, you can configure the following attribute in your spider, archive_regex
:
class MyWaczSpider(Spider):
+ name = "myspider"
+ archive_regex = r"^/tag/[\w-]+/$"
+
If the spider has an archive_regex
attribute, only response URLs matching this regexp are presented in start_requests
. To visualise that, the spider above will only crawl the CDXJ records marked with > below:
com,toscrape,quotes)/favicon.ico 20241007081411465 {...}
+com,gstatic,fonts)/s/raleway/v34/1ptug8zys_skggpnyc0it4ttdfa.woff2 {...}
+com,googleapis,fonts)/css?family=raleway%3A400%2C700 20241007081525229 {...}
+com,toscrape,quotes)/static/bootstrap.min.css 20241007081525202 {...}
+com,toscrape,quotes)/static/main.css 20241007081525074 {...}
+> com,toscrape,quotes)/tag/books/ 20241007081513898 {...}
+> com,toscrape,quotes)/tag/friends/ 20241007081520928 {...}
+> com,toscrape,quotes)/tag/friendship/ 20241007081519648 {...}
+> com,toscrape,quotes)/tag/humor/ 20241007081512594 {...}
+> com,toscrape,quotes)/tag/inspirational/ 20241007081506990 {...}
+> com,toscrape,quotes)/tag/life/ 20241007081510349 {...}
+> com,toscrape,quotes)/tag/love/ 20241007081503814 {...}
+> com,toscrape,quotes)/tag/reading/ 20241007081516781 {...}
+> com,toscrape,quotes)/tag/simile/ 20241007081524944 {...}
+> com,toscrape,quotes)/tag/truth/ 20241007081523804 {...}
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+