diff --git a/scrapy_webarchive/downloadermiddlewares.py b/scrapy_webarchive/downloadermiddlewares.py index 83cdc30..e26e06a 100644 --- a/scrapy_webarchive/downloadermiddlewares.py +++ b/scrapy_webarchive/downloadermiddlewares.py @@ -1,5 +1,5 @@ import re -from typing import IO, List +from typing import IO, List, Union from scrapy import signals from scrapy.crawler import Crawler @@ -24,7 +24,7 @@ class WaczMiddleware: This helps to work with large archives, including remote ones. """ - wacz: WaczFile | MultiWaczFile + wacz: Union[WaczFile, MultiWaczFile] def __init__(self, settings: Settings, stats: StatsCollector) -> None: self.stats = stats @@ -48,7 +48,7 @@ def spider_opened(self, spider: Spider) -> None: tp = {"timeout": self.timeout} multiple_entries = len(self.wacz_urls) != 1 - def open_wacz_file(wacz_url: str) -> IO[bytes] | None: + def open_wacz_file(wacz_url: str) -> Union[IO[bytes], None]: spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_url}") try: diff --git a/scrapy_webarchive/middleware.py b/scrapy_webarchive/middleware.py index 6891182..fb916be 100644 --- a/scrapy_webarchive/middleware.py +++ b/scrapy_webarchive/middleware.py @@ -1,5 +1,5 @@ import re -from typing import IO, List +from typing import IO, List, Union from urllib.parse import urlparse from scrapy import Request, Spider, signals @@ -15,7 +15,7 @@ class WaczCrawlMiddleware: - wacz: WaczFile | MultiWaczFile + wacz: Union[WaczFile, MultiWaczFile] def __init__(self, settings: Settings, stats: StatsCollector) -> None: self.stats = stats @@ -42,7 +42,7 @@ def spider_opened(self, spider: Spider) -> None: tp = {"timeout": self.timeout} multiple_entries = len(self.wacz_urls) != 1 - def open_wacz_file(wacz_url: str) -> IO[bytes] | None: + def open_wacz_file(wacz_url: str) -> Union[IO[bytes], None]: spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_url}") try: diff --git a/scrapy_webarchive/wacz.py b/scrapy_webarchive/wacz.py index d9357c6..effa0ba 100644 --- a/scrapy_webarchive/wacz.py +++ b/scrapy_webarchive/wacz.py @@ -3,7 +3,7 @@ import os import zipfile from collections import defaultdict -from typing import IO, Generator, List +from typing import IO, Generator, List, Union from warc import WARCReader as BaseWARCReader from warc.warc import WARCRecord @@ -81,14 +81,14 @@ def __init__(self, file: IO[bytes]): self.wacz_file = zipfile.ZipFile(file) self.index = self._parse_index(self._get_index(self.wacz_file)) - def _find_in_index(self, url: str) -> CdxjRecord | None: + def _find_in_index(self, url: str) -> Union[CdxjRecord, None]: records = self.index.get(url, []) # If multiple entries are present, the last one is most likely to be relevant return records[-1] if records else None - def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> WARCRecord | None: - warc_file: gzip.GzipFile | IO[bytes] + def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> Union[WARCRecord, None]: + warc_file: Union[gzip.GzipFile, IO[bytes]] try: warc_file = self.wacz_file.open("archive/" + cdxj_record.data["filename"]) @@ -101,7 +101,7 @@ def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> WARCRecord | Non return WARCReader(warc_file).read_record() - def get_warc_from_url(self, url: str) -> WARCRecord | None: + def get_warc_from_url(self, url: str) -> Union[WARCRecord, None]: cdxj_record = self._find_in_index(url) return self.get_warc_from_cdxj_record(cdxj_record) if cdxj_record else None @@ -111,7 +111,7 @@ def iter_index(self) -> Generator[CdxjRecord, None, None]: yield cdxj_record @staticmethod - def _get_index(wacz_file: zipfile.ZipFile) -> gzip.GzipFile | IO[bytes]: + def _get_index(wacz_file: zipfile.ZipFile) -> Union[gzip.GzipFile, IO[bytes]]: """Opens the index file from the WACZ archive, checking for .cdxj, .cdxj.gz, .cdx. and .cdx.gz""" index_paths = [ @@ -134,7 +134,7 @@ def _get_index(wacz_file: zipfile.ZipFile) -> gzip.GzipFile | IO[bytes]: raise FileNotFoundError("No valid index file found.") - def _parse_index(self, index_file: gzip.GzipFile | IO[bytes]) -> dict[str, List[CdxjRecord]]: + def _parse_index(self, index_file: Union[gzip.GzipFile, IO[bytes]]) -> dict[str, List[CdxjRecord]]: cdxj_records = defaultdict(list) for line in index_file: @@ -155,10 +155,10 @@ class MultiWaczFile: def __init__(self, wacz_files: List[IO[bytes]]) -> None: self.waczs = [WaczFile(wacz_file) for wacz_file in wacz_files] - def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> WARCRecord | None: + def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> Union[WARCRecord, None]: return cdxj_record.wacz_file.get_warc_from_cdxj_record(cdxj_record) if cdxj_record.wacz_file else None - def get_warc_from_url(self, url: str) -> WARCRecord | None: + def get_warc_from_url(self, url: str) -> Union[WARCRecord, None]: for wacz in self.waczs: warc_record = wacz.get_warc_from_url(url) if warc_record: