Change WACZ outfile name generation and move datetime utils to utils file
q-m · Oct 14, 2024 · 8af1209 · 8af1209
1 parent 9df5451
commit 8af1209
Show file tree

Hide file tree

Showing 5 changed files with 18 additions and 10 deletions.
diff --git a/scrapy_webarchive/extensions.py b/scrapy_webarchive/extensions.py
@@ -11,7 +11,7 @@
 from scrapy.settings import Settings
 from typing_extensions import Self
 
-from scrapy_webarchive.utils import warc_date
+from scrapy_webarchive.utils import get_warc_date
 from scrapy_webarchive.wacz import WaczFileCreator
 from scrapy_webarchive.warc import WarcFileWriter
 
@@ -90,7 +90,7 @@ def spider_opened(self) -> None:
         self.writer.write_warcinfo(robotstxt_obey=self.settings["ROBOTSTXT_OBEY"])
 
     def response_received(self, response: Response, request: Request, spider: Spider) -> None:
-        request.meta["WARC-Date"] = warc_date()
+        request.meta["WARC-Date"] = get_warc_date()
 
         # Write response WARC record
         record = self.writer.write_response(response, request)

diff --git a/scrapy_webarchive/utils.py b/scrapy_webarchive/utils.py
@@ -1,8 +1,15 @@
 from datetime import datetime, timezone
 
+WARC_DT_FORMAT = "%Y-%m-%dT%H:%M:%SZ"
+TIMESTAMP_DT_FORMAT = "%Y%m%d%H%M%S"
 
-def warc_date() -> str:
-    return datetime.now(timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
+
+def get_current_timestamp() -> str:
+    return datetime.now(timezone.utc).strftime(TIMESTAMP_DT_FORMAT)
+
+
+def get_warc_date() -> str:
+    return datetime.now(timezone.utc).strftime(WARC_DT_FORMAT)
 
 
 def header_lines_to_dict(lines):

diff --git a/scrapy_webarchive/wacz.py b/scrapy_webarchive/wacz.py
@@ -7,6 +7,7 @@
 from warc import WARCReader as BaseWARCReader
 
 from scrapy_webarchive.cdxj import CdxjRecord, write_cdxj_index
+from scrapy_webarchive.utils import get_current_timestamp
 
 
 class WARCReader(BaseWARCReader):
@@ -70,7 +71,7 @@ def cleanup_files(self, *files: str) -> None:
     def get_wacz_fname(self) -> str:
         """Generate WACZ filename based on the WARC filename."""
 
-        return "-".join(self.warc_fname.split("-")[:2]) + ".wacz"
+        return f"archive-{get_current_timestamp()}.wacz"
 
 
 class MultiWaczFile:

diff --git a/scrapy_webarchive/warc.py b/scrapy_webarchive/warc.py
@@ -1,6 +1,5 @@
 import socket
 import uuid
-from datetime import datetime, timezone
 from io import BytesIO
 from urllib.parse import urlparse
 
@@ -15,7 +14,7 @@
 from warcio.warcwriter import WARCWriter
 
 from scrapy_webarchive.exceptions import WaczMiddlewareException
-from scrapy_webarchive.utils import header_lines_to_dict
+from scrapy_webarchive.utils import get_current_timestamp, header_lines_to_dict
 
 
 def generate_warc_fname(prefix: str) -> str:
@@ -25,11 +24,10 @@ def generate_warc_fname(prefix: str) -> str:
     {prefix}-{timestamp}-{serial}-{crawlhost}.warc.gz
     """
 
-    timestamp = datetime.now(timezone.utc).strftime("%Y%m%d%H%M%S")
     crawlhost = socket.gethostname().split(".")[0]
     # As of now we only generate one WARC file. Add serial in here to adhere to the warc specification.
     serial = '00000'
-    return "-".join([prefix, timestamp, serial, crawlhost]) + ".warc.gz"
+    return "-".join([prefix, get_current_timestamp(), serial, crawlhost]) + ".warc.gz"
 
 
 class WarcFileWriter:

diff --git a/tests/test_wacz.py b/tests/test_wacz.py
@@ -2,6 +2,7 @@
 from unittest.mock import Mock
 
 import pytest
+from freezegun import freeze_time
 
 from scrapy_webarchive.wacz import WaczFileCreator
 
@@ -16,6 +17,7 @@ def wacz_file_creator(self):
         cdxj_fname = "/scrapy-webarchive/index.cdxj"
         return WaczFileCreator(store=store, warc_fname=warc_fname, cdxj_fname=cdxj_fname)
 
+    @freeze_time("2024-10-04 08:27:11")
     def test_create_wacz(self, fs, wacz_file_creator):
         # Setup the fake filesystem
         fs.create_file("/scrapy-webarchive/index.cdxj", contents="")
@@ -32,7 +34,7 @@ def test_create_wacz(self, fs, wacz_file_creator):
         wacz_file_creator.store.persist_file.assert_called_once()
 
         # Assert that the correct WACZ filename was used
-        assert wacz_fname == "/scrapy-webarchive/quotes-20241007000000.wacz"
+        assert wacz_fname == "archive-20241004082711.wacz"
 
         # Retrieve the zip buffer from the call args
         call_args = wacz_file_creator.store.persist_file.call_args