Skip to content

Commit

Permalink
Change WACZ outfile name generation and move datetime utils to utils …
Browse files Browse the repository at this point in the history
…file
  • Loading branch information
Wesley van Lee committed Oct 14, 2024
1 parent 9df5451 commit 8af1209
Show file tree
Hide file tree
Showing 5 changed files with 18 additions and 10 deletions.
4 changes: 2 additions & 2 deletions scrapy_webarchive/extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from scrapy.settings import Settings
from typing_extensions import Self

from scrapy_webarchive.utils import warc_date
from scrapy_webarchive.utils import get_warc_date
from scrapy_webarchive.wacz import WaczFileCreator
from scrapy_webarchive.warc import WarcFileWriter

Expand Down Expand Up @@ -90,7 +90,7 @@ def spider_opened(self) -> None:
self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])

def response_received(self, response: Response, request: Request, spider: Spider) -> None:
request.meta["WARC-Date"] = warc_date()
request.meta["WARC-Date"] = get_warc_date()

# Write response WARC record
record = self.writer.write_response(response, request)
Expand Down
11 changes: 9 additions & 2 deletions scrapy_webarchive/utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,15 @@
from datetime import datetime, timezone

WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S"

def warc_date() -> str:
    """Return the current UTC time as a ``YYYY-MM-DDTHH:MM:SSZ`` string."""
    # Take an explicitly timezone-aware "now" so the trailing "Z" is accurate.
    utc_now = datetime.now(timezone.utc)
    return utc_now.strftime("%Y-%m-%dT%H:%M:%SZ")

def get_current_timestamp() -> str:
    """Return the current UTC time rendered with ``TIMESTAMP_DT_FORMAT``."""
    # Timezone-aware "now" in UTC, formatted with the module-level pattern.
    utc_now = datetime.now(timezone.utc)
    return utc_now.strftime(TIMESTAMP_DT_FORMAT)


def get_warc_date() -> str:
    """Return the current UTC time rendered with ``WARC_DT_FORMAT``."""
    # Timezone-aware "now" in UTC, formatted with the module-level pattern.
    utc_now = datetime.now(timezone.utc)
    return utc_now.strftime(WARC_DT_FORMAT)


def header_lines_to_dict(lines):
Expand Down
3 changes: 2 additions & 1 deletion scrapy_webarchive/wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from warc import WARCReader as BaseWARCReader

from scrapy_webarchive.cdxj import CdxjRecord, write_cdxj_index
from scrapy_webarchive.utils import get_current_timestamp


class WARCReader(BaseWARCReader):
Expand Down Expand Up @@ -70,7 +71,7 @@ def cleanup_files(self, *files: str) -> None:
def get_wacz_fname(self) -> str:
"""Generate WACZ filename based on the WARC filename."""

# NOTE(review): two consecutive return statements are shown here, so as
# written only the first one can execute. This looks like a diff rendering
# (removed line followed by its replacement) — confirm which one is live.
# If it is the timestamp-based return, the docstring above no longer
# describes the behavior: the name would come from the current timestamp,
# not from the WARC filename.
return "-".join(self.warc_fname.split("-")[:2]) + ".wacz"
return f"archive-{get_current_timestamp()}.wacz"


class MultiWaczFile:
Expand Down
6 changes: 2 additions & 4 deletions scrapy_webarchive/warc.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import socket
import uuid
from datetime import datetime, timezone
from io import BytesIO
from urllib.parse import urlparse

Expand All @@ -15,7 +14,7 @@
from warcio.warcwriter import WARCWriter

from scrapy_webarchive.exceptions import WaczMiddlewareException
from scrapy_webarchive.utils import header_lines_to_dict
from scrapy_webarchive.utils import get_current_timestamp, header_lines_to_dict


def generate_warc_fname(prefix: str) -> str:
Expand All @@ -25,11 +24,10 @@ def generate_warc_fname(prefix: str) -> str:
{prefix}-{timestamp}-{serial}-{crawlhost}.warc.gz
"""

timestamp = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
crawlhost = socket.gethostname().split(".")[0]
# As of now we only generate one WARC file. Add serial in here to adhere to the warc specification.
serial = '00000'
return "-".join([prefix, timestamp, serial, crawlhost]) + ".warc.gz"
return "-".join([prefix, get_current_timestamp(), serial, crawlhost]) + ".warc.gz"


class WarcFileWriter:
Expand Down
4 changes: 3 additions & 1 deletion tests/test_wacz.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
from unittest.mock import Mock

import pytest
from freezegun import freeze_time

from scrapy_webarchive.wacz import WaczFileCreator

Expand All @@ -16,6 +17,7 @@ def wacz_file_creator(self):
cdxj_fname = "/scrapy-webarchive/index.cdxj"
return WaczFileCreator(store=store, warc_fname=warc_fname, cdxj_fname=cdxj_fname)

@freeze_time("2024-10-04 08:27:11")
def test_create_wacz(self, fs, wacz_file_creator):
# Setup the fake filesystem
fs.create_file("/scrapy-webarchive/index.cdxj", contents="")
Expand All @@ -32,7 +34,7 @@ def test_create_wacz(self, fs, wacz_file_creator):
wacz_file_creator.store.persist_file.assert_called_once()

# Assert that the correct WACZ filename was used
assert wacz_fname == "/scrapy-webarchive/quotes-20241007000000.wacz"
assert wacz_fname == "archive-20241004082711.wacz"

# Retrieve the zip buffer from the call args
call_args = wacz_file_creator.store.persist_file.call_args
Expand Down

0 comments on commit 8af1209

Please sign in to comment.