Skip to content

Commit

Permalink
Add unit-test for the WarcRecordTransformer
Browse files Browse the repository at this point in the history
  • Loading branch information
Wesley van Lee committed Oct 7, 2024
1 parent 3ecbcb9 commit ff06ca6
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 13 deletions.
10 changes: 4 additions & 6 deletions scrapy_webarchive/warc.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,16 +169,14 @@ def request_for_record(self, record: dict, **kwargs):
def response_for_record(self, record: WARCRecord, **kwargs):
# We expect a response.
# https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warc-type-mandatory
if record["WARC-Type"] != "response":
raise WaczMiddlewareException(f"Unexpected record type: {record['type']}")
if record.type != "response":
raise WaczMiddlewareException(f"Unexpected record type: {record.type}")

# We only know how to handle application/http.
# https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#content-type
record_content_type = (record["Content-Type"] or "").split(";", 1)[0]
if record_content_type != "application/http":
raise WaczMiddlewareException(
f"Unexpected record content-type: {record_content_type}"
)
raise WaczMiddlewareException(f"Unexpected record content-type: {record_content_type}")

# There is a date field in record['WARC-Date'], but don't have a use for it now.
# https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warc-date-mandatory
Expand All @@ -200,7 +198,7 @@ def response_for_record(self, record: WARCRecord, **kwargs):
response_cls = self.response_types.from_headers(headers)

return response_cls(
url=record["WARC-Target-URI"],
url=record.url,
status=int(status.decode()),
protocol=protocol.decode(),
headers=headers,
Expand Down
52 changes: 45 additions & 7 deletions tests/test_warc.py
Original file line number Diff line number Diff line change
@@ -1,19 +1,57 @@
import socket

import pytest
from freezegun import freeze_time
from scrapy import Request
from scrapy.http import HtmlResponse
from warc.warc import WARCRecord

from scrapy_webarchive.warc import generate_warc_fname
from scrapy_webarchive.exceptions import WaczMiddlewareException
from scrapy_webarchive.warc import generate_warc_fname, record_transformer


@freeze_time("2024-10-04 08:27:11")
def test_generate_warc_fname(monkeypatch):
    """Check the generated WARC filename with the clock and the hostname pinned."""
    # Patch socket.gethostname so the expected filename does not depend on
    # the machine the test runs on.
    monkeypatch.setattr(socket, "gethostname", lambda: "example.local")

    expected_fname = "rec-20241004082711-00000-example.warc.gz"
    actual_fname = generate_warc_fname("rec")
    assert actual_fname == expected_fname


@pytest.fixture
def warc_record_response():
    """Provide a WARCRecord whose payload is a raw HTTP/1.0 200 response with an HTML body."""
    # Status line and headers, each terminated by CRLF, followed by the blank
    # line that separates the header section from the body.
    # NOTE(review): the Content-Length value is larger than the literal body
    # below; kept as-is because it is part of the fixture data.
    http_head = (
        b"HTTP/1.0 200\r\n"
        b"Content-Length: 11064\r\n"
        b"Date: Mon, 07 Oct 2024 09:58:44 GMT\r\n"
        b"Content-Type: text/html; charset=utf-8\r\n"
        b"Strict-Transport-Security: max-age=0; includeSubDomains; preload\r\n"
        b"\r\n"
    )
    html_body = b'<!DOCTYPE html>\n<html lang="en">Welcome to scrapy-webarchive!</html>'
    return WARCRecord(
        payload=http_head + html_body,
        headers={"WARC-Target-URI": "https://quotes.toscrape.com/"},
    )


@pytest.fixture
def warc_record_request():
    """Provide a WARCRecord whose WARC-Type header is set to "request"."""
    request_headers = {"WARC-Type": "request"}
    request_payload = b"Welcome to scrapy-webarchive!"
    return WARCRecord(payload=request_payload, headers=request_headers)


class TestWarcRecordTransformer:
    """Tests for the request and response conversion done by ``record_transformer``."""

    def test_request_for_record(self):
        """A record dict with a ``url`` entry yields a GET ``Request`` for that URL."""
        record = {
            "url": "https://quotes.toscrape.com/",
            "mime": "text/html",
            "status": "200",
            "digest": "sha1:AA7J5JETQ4H7GG22MU2NCAUO6LM2EPEU",
            "length": "2302",
            "offset": "384",
            "filename": "quotes-20241007095844-00000-BA92-CKXFG4FF6H.warc.gz",
        }

        request = record_transformer.request_for_record(record)
        assert isinstance(request, Request)
        assert request.url == "https://quotes.toscrape.com/"
        assert request.method == "GET"

    def test_response_for_record_invalid_response_type(self, warc_record_request):
        """``response_for_record`` raises for the fixture whose WARC-Type is "request"."""
        with pytest.raises(WaczMiddlewareException):
            record_transformer.response_for_record(warc_record_request)

    def test_response_for_record(self, warc_record_response):
        """The HTTP payload of the response fixture becomes an ``HtmlResponse``."""
        response = record_transformer.response_for_record(warc_record_response)
        assert isinstance(response, HtmlResponse)
        assert response.url == 'https://quotes.toscrape.com/'
        assert response.status == 200
        # Only the HTML after the blank line of the HTTP payload is expected as the body.
        assert response.body == b'<!DOCTYPE html>\n<html lang="en">Welcome to scrapy-webarchive!</html>'

0 comments on commit ff06ca6

Please sign in to comment.