Skip to content

Commit

Permalink
Replace X | Y type annotation with Union for Python <3.10 compatibility
Browse files (browse the repository at this point in the history)
  • Loading branch information
Wesley van Lee committed Oct 15, 2024
1 parent e0dea76 commit 1dc564d
Show file tree
Hide file tree
Showing 3 changed files with 15 additions and 15 deletions.
6 changes: 3 additions & 3 deletions scrapy_webarchive/downloadermiddlewares.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from typing import IO, List
from typing import IO, List, Union

from scrapy import signals
from scrapy.crawler import Crawler
Expand All @@ -24,7 +24,7 @@ class WaczMiddleware:
This helps to work with large archives, including remote ones.
"""

wacz: WaczFile | MultiWaczFile
wacz: Union[WaczFile, MultiWaczFile]

def __init__(self, settings: Settings, stats: StatsCollector) -> None:
self.stats = stats
Expand All @@ -48,7 +48,7 @@ def spider_opened(self, spider: Spider) -> None:
tp = {"timeout": self.timeout}
multiple_entries = len(self.wacz_urls) != 1

def open_wacz_file(wacz_url: str) -> IO[bytes] | None:
def open_wacz_file(wacz_url: str) -> Union[IO[bytes], None]:
spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_url}")

try:
Expand Down
6 changes: 3 additions & 3 deletions scrapy_webarchive/middleware.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import re
from typing import IO, List
from typing import IO, List, Union
from urllib.parse import urlparse

from scrapy import Request, Spider, signals
Expand All @@ -15,7 +15,7 @@


class WaczCrawlMiddleware:
wacz: WaczFile | MultiWaczFile
wacz: Union[WaczFile, MultiWaczFile]

def __init__(self, settings: Settings, stats: StatsCollector) -> None:
self.stats = stats
Expand All @@ -42,7 +42,7 @@ def spider_opened(self, spider: Spider) -> None:
tp = {"timeout": self.timeout}
multiple_entries = len(self.wacz_urls) != 1

def open_wacz_file(wacz_url: str) -> IO[bytes] | None:
def open_wacz_file(wacz_url: str) -> Union[IO[bytes], None]:
spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_url}")

try:
Expand Down
18 changes: 9 additions & 9 deletions scrapy_webarchive/wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import os
import zipfile
from collections import defaultdict
from typing import IO, Generator, List
from typing import IO, Generator, List, Union

from warc import WARCReader as BaseWARCReader
from warc.warc import WARCRecord
Expand Down Expand Up @@ -81,14 +81,14 @@ def __init__(self, file: IO[bytes]):
self.wacz_file = zipfile.ZipFile(file)
self.index = self._parse_index(self._get_index(self.wacz_file))

def _find_in_index(self, url: str) -> CdxjRecord | None:
def _find_in_index(self, url: str) -> Union[CdxjRecord, None]:
records = self.index.get(url, [])

# If multiple entries are present, the last one is most likely to be relevant
return records[-1] if records else None

def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> WARCRecord | None:
warc_file: gzip.GzipFile | IO[bytes]
def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> Union[WARCRecord, None]:
warc_file: Union[gzip.GzipFile, IO[bytes]]

try:
warc_file = self.wacz_file.open("archive/" + cdxj_record.data["filename"])
Expand All @@ -101,7 +101,7 @@ def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> WARCRecord | Non

return WARCReader(warc_file).read_record()

def get_warc_from_url(self, url: str) -> WARCRecord | None:
def get_warc_from_url(self, url: str) -> Union[WARCRecord, None]:
cdxj_record = self._find_in_index(url)
return self.get_warc_from_cdxj_record(cdxj_record) if cdxj_record else None

Expand All @@ -111,7 +111,7 @@ def iter_index(self) -> Generator[CdxjRecord, None, None]:
yield cdxj_record

@staticmethod
def _get_index(wacz_file: zipfile.ZipFile) -> gzip.GzipFile | IO[bytes]:
def _get_index(wacz_file: zipfile.ZipFile) -> Union[gzip.GzipFile, IO[bytes]]:
"""Opens the index file from the WACZ archive, checking for .cdxj, .cdxj.gz, .cdx, and .cdx.gz"""

index_paths = [
Expand All @@ -134,7 +134,7 @@ def _get_index(wacz_file: zipfile.ZipFile) -> gzip.GzipFile | IO[bytes]:

raise FileNotFoundError("No valid index file found.")

def _parse_index(self, index_file: gzip.GzipFile | IO[bytes]) -> dict[str, List[CdxjRecord]]:
def _parse_index(self, index_file: Union[gzip.GzipFile, IO[bytes]]) -> dict[str, List[CdxjRecord]]:
cdxj_records = defaultdict(list)

for line in index_file:
Expand All @@ -155,10 +155,10 @@ class MultiWaczFile:
def __init__(self, wacz_files: List[IO[bytes]]) -> None:
self.waczs = [WaczFile(wacz_file) for wacz_file in wacz_files]

def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> WARCRecord | None:
def get_warc_from_cdxj_record(self, cdxj_record: CdxjRecord) -> Union[WARCRecord, None]:
return cdxj_record.wacz_file.get_warc_from_cdxj_record(cdxj_record) if cdxj_record.wacz_file else None

def get_warc_from_url(self, url: str) -> WARCRecord | None:
def get_warc_from_url(self, url: str) -> Union[WARCRecord, None]:
for wacz in self.waczs:
warc_record = wacz.get_warc_from_url(url)
if warc_record:
Expand Down

0 comments on commit 1dc564d

Please sign in to comment.