Skip to content

Commit

Permalink
Refactor and test WACZ writer class
Browse files Browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 11, 2024
1 parent c20c5a7 commit 106ebb0
Show file tree
Hide file tree
Showing 4 changed files with 86 additions and 27 deletions.
7 changes: 7 additions & 0 deletions scrapy_webarchive/cdxj.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,19 @@
import json
import re

from cdxj_indexer.main import CDXJIndexer

CDXREC = re.compile(
r"^(?P<surt>(?P<host>[^\)\s]+)\)(?P<path>[^\?\s]+)?(\?(?P<query>\S+))?)"
r"\s(?P<datetime>(?P<year>\d{4})(?P<month>\d{2})(?P<day>\d{2})(?P<hour>\d{2})(?P<minute>\d{2})(?P<second>\d{2})(?:\d{3})?)"
r"\s(?P<data>{.*})"
)

def write_cdxj_index(output: str, inputs: list[str]) -> str:
    """Write a CDXJ index for the given input files.

    Args:
        output: Path handed to ``CDXJIndexer`` as its ``output``.
        inputs: Paths handed to ``CDXJIndexer`` as its ``inputs``.

    Returns:
        The ``output`` path, unchanged, so the call can be used inline.
    """
    CDXJIndexer(output=output, inputs=inputs).process_all()
    return output


class CdxjRecord:
def _parse(self, line):
Expand Down
3 changes: 1 addition & 2 deletions scrapy_webarchive/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,7 @@ def response_received(self, response: Response, request: Request, spider: Spider
self.stats.inc_value("wacz/exporter/request_written", spider=spider)

def spider_closed(self) -> None:
    """Package the WARC written by ``self.writer`` into a WACZ file.

    The archive is built once by ``WaczFileCreator.create`` from
    ``self.writer.warc_fname`` and handed to ``self.store``.
    """
    # Only the current entry point is called; invoking the pre-rename
    # ``create_wacz`` as well would build the archive a second time.
    WaczFileCreator(warc_fname=self.writer.warc_fname, store=self.store).create()


def get_archive_uri_template_variables() -> dict:
Expand Down
59 changes: 34 additions & 25 deletions scrapy_webarchive/wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
import zipfile
from collections import defaultdict

from cdxj_indexer.main import CDXJIndexer
from warc import WARCReader as BaseWARCReader

from scrapy_webarchive.cdxj import CdxjRecord
from scrapy_webarchive.cdxj import CdxjRecord, write_cdxj_index


class WARCReader(BaseWARCReader):
Expand All @@ -29,39 +28,49 @@ def __init__(
self.warc_fname = warc_fname
self.cdxj_fname = cdxj_fname

def create(self) -> None:
    """Create the WACZ file from the WARC and its CDXJ index.

    The CDXJ index for ``self.warc_fname`` is written to
    ``self.cdxj_fname``, both files are zipped in memory, the archive is
    passed to ``self.store.persist_file`` under the name returned by
    ``get_wacz_fname`` and, last of all, the two source files are removed.
    """
    # Write the CDXJ index to its (temporary) file.
    write_cdxj_index(output=self.cdxj_fname, inputs=[self.warc_fname])

    # Build the WACZ archive in memory.
    zip_buffer = self.create_wacz_zip()

    # Rewind so the store reads the archive from its first byte.
    zip_buffer.seek(0)
    self.store.persist_file(self.get_wacz_fname(), zip_buffer, info=None)

    # Remove the sources only after the persist call has returned: if it
    # raises, the WARC and its index are still on disk for a retry.
    self.cleanup_files(self.cdxj_fname, self.warc_fname)

def create_wacz(self) -> None:
    """Backward-compatible alias for :meth:`create` (the pre-rename name)."""
    self.create()

def create_wacz_zip(self) -> io.BytesIO:
    """Build the WACZ zip in memory and return its buffer.

    The CDXJ index (``self.cdxj_fname``) is stored under ``indexes/`` and
    the WARC (``self.warc_fname``) under ``archive/``. This method neither
    writes the index nor deletes any file.

    Returns:
        The ``io.BytesIO`` holding the finished archive. Its position is
        wherever ``zipfile`` left it, so callers should ``seek(0)`` before
        reading.
    """
    zip_buffer = io.BytesIO()

    # Leaving the ``with`` block closes the archive, which is what writes
    # the zip's central directory into the buffer.
    with zipfile.ZipFile(zip_buffer, "w", zipfile.ZIP_DEFLATED) as zip_file:
        self.write_to_zip(zip_file, self.cdxj_fname, "indexes/")
        self.write_to_zip(zip_file, self.warc_fname, "archive/")

    return zip_buffer
def write_to_zip(self, zip_file: zipfile.ZipFile, filename: str, destination: str) -> None:
    """Add one file from disk to an open zip archive.

    Args:
        zip_file: Archive opened for writing.
        filename: Path of the file to read.
        destination: Prefix inside the archive (e.g. ``"archive/"``); the
            entry name is this prefix followed by the basename of
            ``filename``.

    Only ``filename`` is written, and nothing is deleted from disk.
    """
    entry_name = destination + os.path.basename(filename)
    with open(filename, "rb") as source:
        zip_file.writestr(entry_name, source.read())
def cleanup_files(self, *files: str) -> None:
    """Remove the given files from the filesystem.

    Args:
        *files: Paths to delete, in order. ``os.remove`` raises (e.g.
            ``FileNotFoundError``) for a path that cannot be removed, and
            paths after it are then left untouched.
    """
    for path in files:
        os.remove(path)

def get_wacz_fname(self) -> str:
    """Generate the WACZ filename from the WARC filename.

    The first two ``-``-separated parts of the WARC *basename* are kept and
    ``.wacz`` is appended; the directory part of ``self.warc_fname`` is
    carried over untouched.

    Returns:
        The WACZ path, e.g. ``/dir/quotes-20241007000000.wacz`` for
        ``/dir/quotes-20241007000000-00000-test.warc``.
    """
    basename = os.path.basename(self.warc_fname)
    # Slice instead of re-joining so the original separators are kept, and
    # so that hyphens in directory names never take part in the split.
    directory = self.warc_fname[: len(self.warc_fname) - len(basename)]
    return directory + "-".join(basename.split("-")[:2]) + ".wacz"


class MultiWaczFile:
Expand Down
44 changes: 44 additions & 0 deletions tests/test_wacz.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import zipfile
from unittest.mock import Mock

import pytest

from scrapy_webarchive.wacz import WaczFileCreator


class TestWaczFileCreator:
    """Tests for ``WaczFileCreator`` using a mocked store and a fake filesystem."""

    WARC_FNAME = "/scrapy-webarchive/quotes-20241007000000-00000-test.warc"
    CDXJ_FNAME = "/scrapy-webarchive/index.cdxj"
    WACZ_FNAME = "/scrapy-webarchive/quotes-20241007000000.wacz"

    @pytest.fixture
    def wacz_file_creator(self):
        """Fixture to initialize the WaczFileCreator with a mocked store"""

        return WaczFileCreator(
            store=Mock(),
            warc_fname=self.WARC_FNAME,
            cdxj_fname=self.CDXJ_FNAME,
        )

    def test_create_wacz(self, fs, wacz_file_creator):
        """``create`` persists one WACZ holding both files and removes the sources."""

        # Setup the fake filesystem (``fs`` is presumably the pyfakefs
        # fixture -- confirm against the project's test dependencies).
        fs.create_file(self.CDXJ_FNAME, contents="")
        fs.create_file(self.WARC_FNAME, contents="")

        wacz_file_creator.create()

        # Ensure the files are removed after creation
        assert not fs.exists(self.CDXJ_FNAME)
        assert not fs.exists(self.WARC_FNAME)

        # Verify the WACZ file was persisted in the store exactly once
        persist_file = wacz_file_creator.store.persist_file
        persist_file.assert_called_once()

        # Check the filename both as generated and as actually handed to
        # the store, so a mismatch between the two cannot go unnoticed.
        args, _kwargs = persist_file.call_args
        persisted_fname, zip_buffer = args[0], args[1]
        assert wacz_file_creator.get_wacz_fname() == self.WACZ_FNAME
        assert persisted_fname == self.WACZ_FNAME

        # Verify that the WACZ zip content is correct
        with zipfile.ZipFile(zip_buffer) as zip_file:
            names = zip_file.namelist()
        assert "indexes/index.cdxj" in names
        assert "archive/quotes-20241007000000-00000-test.warc" in names

0 comments on commit 106ebb0

Please sign in to comment.