WARC lookup for remote zipfile (WACZ) (#23)
* Add remote zipfile implementation with range requests

* Disable python3.7 workflow for now

* Update docs and drop support for Python 3.7

* Store WACZ entries with store mode in ZIP file

* Raise exception for unsupported scheme in URI

* Slight optimization by avoiding unnecessary list comprehension

---------

Co-authored-by: Wesley van Lee <[email protected]>
leewesleyv and Wesley van Lee authored Dec 31, 2024
1 parent 70e90ab commit 2db01ad
Showing 25 changed files with 794 additions and 392 deletions.
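The first bullet in the commit message above describes reading a remote WACZ (a ZIP archive) via HTTP range requests instead of downloading it in full; storing entries with ZIP's store (uncompressed) mode keeps each member's bytes directly addressable by offset. The sketch below is a hypothetical, minimal illustration of that technique, not the code added by this commit; the `requests` dependency and the example URL are assumptions.

```python
import io
import zipfile

import requests  # assumed helper dependency for this sketch


class HttpRangeFile(io.RawIOBase):
    """Seekable, read-only file object that fetches bytes via HTTP Range requests."""

    def __init__(self, url: str) -> None:
        self.url = url
        self.pos = 0
        # A HEAD request gives the total size, so zipfile can seek to the end
        # and locate the central directory without downloading the archive.
        self.size = int(requests.head(url, allow_redirects=True).headers["Content-Length"])

    def seekable(self) -> bool:
        return True

    def readable(self) -> bool:
        return True

    def tell(self) -> int:
        return self.pos

    def seek(self, offset: int, whence: int = io.SEEK_SET) -> int:
        if whence == io.SEEK_SET:
            self.pos = offset
        elif whence == io.SEEK_CUR:
            self.pos += offset
        else:  # io.SEEK_END
            self.pos = self.size + offset
        return self.pos

    def read(self, n: int = -1) -> bytes:
        if n == -1 or self.pos + n > self.size:
            n = self.size - self.pos
        if n <= 0:
            return b""
        # Fetch only the requested byte window.
        headers = {"Range": f"bytes={self.pos}-{self.pos + n - 1}"}
        data = requests.get(self.url, headers=headers).content
        self.pos += len(data)
        return data


# zipfile only needs seek/tell/read, so it can list and open members remotely.
with zipfile.ZipFile(HttpRangeFile("https://example.com/archive.wacz")) as zf:  # hypothetical URL
    print(zf.namelist())
```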
3 changes: 0 additions & 3 deletions .github/workflows/test.yml
@@ -31,16 +31,13 @@ jobs:
    strategy:
      matrix:
        toxenv:
-         - py37-scrapy29
          - py312-scrapy29
          - py38-scrapy210
          - py312-scrapy210
          - py38-scrapy211
          - py312-scrapy211
          - py312-scrapymaster
      include:
-       - toxenv: py37-scrapy29
-         python-version: 3.7
        - toxenv: py312-scrapy29
          python-version: '3.12'

2 changes: 1 addition & 1 deletion README.md
@@ -12,7 +12,7 @@ Scrapy Webarchive is a plugin for Scrapy that allows users to capture and export

## Compatibility

-* Python 3.7, 3.8, 3.9, 3.10, 3.11 and 3.12
+* Python 3.8, 3.9, 3.10, 3.11 and 3.12

## Documentation

5 changes: 4 additions & 1 deletion docs/settings.md
@@ -39,11 +39,14 @@ This setting defines the description of the WACZ used in the `datapackage.json`,

### `SW_WACZ_SOURCE_URI`

+⚠️ Scraping against a remote source currently only supports AWS S3.

```python
SW_WACZ_SOURCE_URI = "file:///Users/username/Documents/archive.wacz"
SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz"

# Allows multiple sources, comma separated.
-SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz,/path/to/archive.wacz"
+SW_WACZ_SOURCE_URI = "s3://scrapy-webarchive/archive.wacz,file:///Users/username/Documents/archive.wacz"
```

This setting defines the location of the WACZ file that should be used as a source for the crawl job.
2 changes: 1 addition & 1 deletion pyproject.toml
@@ -13,7 +13,7 @@ dependencies = [
    "wacz==0.5.0",
    "cdxj-indexer==1.4.5",
]
-requires-python = ">=3.7,<3.13"
+requires-python = ">=3.8,<3.13"
authors = []
maintainers = []
description = "A webarchive extension for Scrapy"
2 changes: 1 addition & 1 deletion scrapy_webarchive/cdxj.py
@@ -8,7 +8,7 @@
from typing_extensions import TYPE_CHECKING, List

if TYPE_CHECKING:
-    from scrapy_webarchive.wacz import WaczFile
+    from scrapy_webarchive.wacz.wacz_file import WaczFile

CDXREC = re.compile(
    r"^(?P<surt>(?P<host>[^\)\s]+)\)(?P<path>[^\?\s]+)?(\?(?P<query>\S+))?)"
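For readers unfamiliar with CDXJ, the snippet below applies just the visible prefix of `CDXREC` to a made-up record line; the real pattern continues past this hunk with further groups, and the sample URL and values are illustrative only.

```python
import re

# Only the prefix of CDXREC shown above; the full pattern has additional groups.
CDX_PREFIX = re.compile(
    r"^(?P<surt>(?P<host>[^\)\s]+)\)(?P<path>[^\?\s]+)?(\?(?P<query>\S+))?)"
)

# Hypothetical CDXJ record: SURT key, timestamp, then a JSON block.
line = 'com,example)/index.html?lang=en 20241231120000 {"url": "https://example.com/index.html?lang=en"}'

match = CDX_PREFIX.match(line)
if match:
    print(match.group("host"))   # com,example
    print(match.group("path"))   # /index.html
    print(match.group("query"))  # lang=en
```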
6 changes: 6 additions & 0 deletions scrapy_webarchive/exceptions.py
@@ -2,3 +2,9 @@ class WaczMiddlewareException(Exception):
    """Indicates a critical issue in the middleware."""

    pass
+
+
+class UnsupportedURIException(Exception):
+    """Raised when the given URI scheme is not supported by the factory."""
+
+    pass
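As a rough illustration of how a storage-handler factory might use this exception, the sketch below dispatches on the URI scheme. Apart from `UnsupportedURIException` itself, the function name and placeholder return values are hypothetical, not the commit's actual API.

```python
from urllib.parse import urlparse

from scrapy_webarchive.exceptions import UnsupportedURIException


def get_storage_handler(uri: str):
    """Hypothetical dispatch on the URI scheme; unknown schemes are rejected."""
    scheme = urlparse(uri).scheme
    if scheme == "file":
        return "local-zip-handler"  # placeholder for a local ZIP handler
    if scheme == "s3":
        return "s3-range-handler"   # placeholder for a range-request S3 handler
    raise UnsupportedURIException(f"Unsupported URI scheme {scheme!r} in {uri!r}")
```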
2 changes: 1 addition & 1 deletion scrapy_webarchive/extensions.py
@@ -17,7 +17,7 @@
from typing_extensions import Any, Dict, Protocol, Self, Type, Union, cast

from scrapy_webarchive.utils import WARC_DT_FORMAT, get_formatted_dt_string, get_scheme_from_uri
-from scrapy_webarchive.wacz import WaczFileCreator
+from scrapy_webarchive.wacz.creator import WaczFileCreator
from scrapy_webarchive.warc import WarcFileWriter

18 changes: 9 additions & 9 deletions scrapy_webarchive/spidermiddlewares.py
@@ -11,7 +11,8 @@
from typing_extensions import Iterable, Self, Union

from scrapy_webarchive.exceptions import WaczMiddlewareException
-from scrapy_webarchive.wacz import MultiWaczFile, WaczFile, open_wacz_file
+from scrapy_webarchive.wacz.storages import ZipStorageHandlerFactory
+from scrapy_webarchive.wacz.wacz_file import MultiWaczFile, WaczFile
from scrapy_webarchive.warc import record_transformer


@@ -55,20 +56,19 @@ def spider_opened(self, spider: Spider) -> None:

        for wacz_uri in self.wacz_uris:
            spider.logger.info(f"[WACZDownloader] Opening WACZ {wacz_uri}")
-            wacz_file = open_wacz_file(wacz_uri, self.timeout, spider.settings)
-            if wacz_file:
-                wacz_files.append(wacz_file)
-            else:
+            storage_handler = ZipStorageHandlerFactory.get_handler(wacz_uri, spider.settings)

+            if not storage_handler.zip_exists:
                spider.logger.error(f"[WACZDownloader] Could not open WACZ {wacz_uri}")
                continue

+            wacz_files.append(WaczFile(storage_handler=storage_handler))

        if wacz_files:
            spider.logger.info(
                f"[WACZDownloader] Continuing with {len(wacz_files)}/{len(self.wacz_uris)} valid WACZ files"
            )
-            if len(wacz_files) == 1:
-                self.wacz = WaczFile(wacz_files[0])
-            else:
-                self.wacz = MultiWaczFile(wacz_files)
+            self.wacz = wacz_files[0] if len(wacz_files) == 1 else MultiWaczFile(wacz_files)

        # If there are no wacz_files, we raise a `WaczMiddlewareException` in the downloader/spider middleware.
        # Raising an exception here does not stop the job from running. If there are no valid WACZ files configured
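The storages module itself is not part of this excerpt, so purely as a hedged sketch, the `zip_exists` check the middleware relies on above might look roughly like this for a local and an S3 handler; the class names and constructor arguments are assumptions.

```python
import os
from urllib.parse import urlparse

from botocore.exceptions import ClientError


class LocalZipStorageHandler:
    """Hypothetical local handler: the archive exists if the file is on disk."""

    def __init__(self, uri: str) -> None:
        self.path = urlparse(uri).path

    @property
    def zip_exists(self) -> bool:
        return os.path.isfile(self.path)


class S3ZipStorageHandler:
    """Hypothetical S3 handler: a HEAD request tells us whether the object exists."""

    def __init__(self, uri: str, client) -> None:
        parsed = urlparse(uri)
        self.bucket = parsed.netloc
        self.key = parsed.path.lstrip("/")
        self.client = client  # e.g. a botocore S3 client

    @property
    def zip_exists(self) -> bool:
        try:
            self.client.head_object(Bucket=self.bucket, Key=self.key)
        except ClientError:
            return False
        return True
```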
49 changes: 1 addition & 48 deletions scrapy_webarchive/utils.py
@@ -5,9 +5,7 @@
from datetime import datetime, timezone
from pathlib import Path
from typing import IO, Tuple
-from urllib.parse import urlparse, urlunparse
-
-from scrapy.settings import Settings
+from urllib.parse import urlparse

WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S"

@@ -36,51 +34,6 @@ def get_scheme_from_uri(uri: str) -> str:
    return urlparse(uri).scheme


-def get_s3_client(settings: Settings):
-    """Create an S3 client using the given settings."""
-
-    import botocore.session
-    session = botocore.session.get_session()
-    return session.create_client(
-        "s3",
-        aws_access_key_id=settings["AWS_ACCESS_KEY_ID"],
-        aws_secret_access_key=settings["AWS_SECRET_ACCESS_KEY"],
-        aws_session_token=settings["AWS_SESSION_TOKEN"],
-        endpoint_url=settings["AWS_ENDPOINT_URL"],
-        region_name=settings["AWS_REGION_NAME"],
-        use_ssl=settings["AWS_USE_SSL"],
-        verify=settings["AWS_VERIFY"],
-    )
-
-
-def get_gcs_client(settings: Settings):
-    """Create a Google Cloud Storage client using the given settings."""
-
-    from google.cloud import storage
-    return storage.Client(project=settings["GCS_PROJECT_ID"])
-
-
-def add_ftp_credentials(wacz_uri: str, settings: Settings) -> str:
-    """Add FTP username and password to the URI if not present."""
-
-    parsed_uri = urlparse(wacz_uri)
-
-    if parsed_uri.username is None:
-        # Build netloc with credentials.
-        credentials = f'{settings["FTP_USER"]}:{settings["FTP_PASSWORD"]}'
-        netloc = f'{credentials}@{parsed_uri.hostname}'
-
-        # Add port if present.
-        if parsed_uri.port:
-            netloc += f":{parsed_uri.port}"
-
-        # Update and return the URI with credentials.
-        updated_uri = parsed_uri._replace(netloc=netloc)
-        return urlunparse(updated_uri)
-
-    return wacz_uri
-
-
def hash_stream(hash_type: str, stream: IO) -> Tuple[int, str]:
    """Hashes the stream with given hash_type hasher."""

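The body of `hash_stream` lies outside this hunk; based only on its signature and docstring, a plausible sketch is shown below. The chunk size and the `"<hash_type>:<hexdigest>"` return format are assumptions, not the actual implementation.

```python
import hashlib
from typing import IO, Tuple


def hash_stream(hash_type: str, stream: IO) -> Tuple[int, str]:
    """Hashes the stream with given hash_type hasher (sketch, not the actual body)."""
    hasher = hashlib.new(hash_type)
    size = 0
    # Read in chunks so large WACZ/WARC payloads are never fully in memory.
    for chunk in iter(lambda: stream.read(64 * 1024), b""):
        size += len(chunk)
        hasher.update(chunk)
    # Assumed return format: total bytes read and "<hash_type>:<hexdigest>".
    return size, f"{hash_type}:{hasher.hexdigest()}"
```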