Merge pull request #4794 from freelawproject/scrapers_save_responses
feat(scrapers): save raw scraped responses on S3
mlissner authored Dec 11, 2024
2 parents 00bd775 + 21f75a7 commit 4d0e798
Showing 5 changed files with 64 additions and 2 deletions.
20 changes: 20 additions & 0 deletions cl/lib/storage.py
@@ -119,6 +119,26 @@ def get_available_name(
        return os.path.join(dir_name, uuid.uuid4().hex + file_ext)


class S3GlacierInstantRetrievalStorage(S3Storage):
    """Uses the S3 Glacier Instant Retrieval storage class with a private ACL."""

    default_acl = "private"
    bucket_name = settings.AWS_PRIVATE_STORAGE_BUCKET_NAME
    file_overwrite = True

    def get_object_parameters(self, name: str) -> Dict[str, str]:
        # Upload every object with the Glacier Instant Retrieval storage class
        params = self.object_parameters.copy()
        params["StorageClass"] = "GLACIER_IR"
        return params

    def get_available_name(
        self,
        name: str,
        max_length: Optional[int] = None,
    ) -> str:
        return get_name_by_incrementing(self, name, max_length)


class HarvardPDFStorage(S3Boto3Storage):
    """S3 file storage for Harvard PDFs."""

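What the new class does in practice, as a hypothetical usage sketch (not part of the diff; the key and content are made up, a configured AWS private bucket is assumed, and the collision behavior assumes CourtListener's existing get_name_by_incrementing helper):

# Hypothetical sketch: saving through the new storage class. Each save()
# issues an S3 PutObject with StorageClass=GLACIER_IR against the private
# bucket. Because get_available_name is overridden, name collisions are
# resolved by incrementing a suffix rather than randomizing the name.
from django.core.files.base import ContentFile

from cl.lib.storage import S3GlacierInstantRetrievalStorage

storage = S3GlacierInstantRetrievalStorage()
name = storage.save("responses/demo.json", ContentFile(b'{"ok": true}'))
# A second save with the same name would come back as something like
# "responses/demo_1.json", assuming get_name_by_incrementing's behavior.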
2 changes: 2 additions & 0 deletions cl/scrapers/management/commands/cl_back_scrape_opinions.py
@@ -5,6 +5,7 @@
from juriscraper.lib.importer import site_yielder

from cl.scrapers.management.commands import cl_scrape_opinions
from cl.scrapers.utils import save_response


def add_backscraper_arguments(parser) -> None:
@@ -74,6 +75,7 @@ def parse_and_scrape_site(
                days_interval=options.get("days_interval"),
            ).back_scrape_iterable,
            mod,
            save_response_fn=save_response,
        ):
            site.parse()
            self.scrape_court(site, full_crawl=True)
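The call site for the hook lives in juriscraper, not in this diff. A rough sketch of the implied contract, inferred from save_response's signature and the site.request["response"] access in utils.py below (the names here are illustrative, not juriscraper's actual internals):

# Illustrative sketch only: juriscraper's real call site is not shown in
# this diff. The contract implied by save_response(site) is that a Site
# keeps its latest response at site.request["response"] and invokes the
# callback with itself after each fetch.
import requests


class SiteSketch:
    def __init__(self, save_response_fn=None):
        self.save_response_fn = save_response_fn
        self.request = {"response": None}

    def _download(self, url: str) -> None:
        self.request["response"] = requests.get(url, timeout=30)
        if self.save_response_fn:
            # CourtListener's save_response receives the whole Site object
            self.save_response_fn(self)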
2 changes: 2 additions & 0 deletions cl/scrapers/management/commands/cl_back_scrape_oral_arguments.py
@@ -7,6 +7,7 @@
from cl.scrapers.management.commands.cl_back_scrape_opinions import (
    add_backscraper_arguments,
)
from cl.scrapers.utils import save_response


class Command(cl_scrape_oral_arguments.Command):
@@ -25,6 +26,7 @@ def parse_and_scrape_site(self, mod, options: dict):
                days_interval=options.get("days_interval"),
            ).back_scrape_iterable,
            mod,
            save_response_fn=save_response,
        ):
            site.parse()
            self.scrape_court(site, full_crawl=True, backscrape=True)
3 changes: 2 additions & 1 deletion cl/scrapers/management/commands/cl_scrape_opinions.py
@@ -33,6 +33,7 @@
    get_binary_content,
    get_child_court,
    get_extension,
    save_response,
    signal_handler,
    update_or_create_docket,
)
@@ -385,7 +386,7 @@ def ingest_a_case(
        )

    def parse_and_scrape_site(self, mod, options: dict):
        site = mod.Site().parse()
        site = mod.Site(save_response_fn=save_response).parse()
        self.scrape_court(site, options["full_crawl"])

    def handle(self, *args, **options):
39 changes: 38 additions & 1 deletion cl/scrapers/utils.py
@@ -1,5 +1,6 @@
import json
import os
from datetime import date
from datetime import date, datetime
from typing import Optional, Tuple
from urllib.parse import urljoin

@@ -8,6 +9,7 @@
from asgiref.sync import async_to_sync
from courts_db import find_court_by_id, find_court_ids_by_name
from django.conf import settings
from django.core.files.base import ContentFile
from django.db.models import Q
from juriscraper import AbstractSite
from juriscraper.AbstractSite import logger
@@ -20,6 +22,7 @@
from cl.corpus_importer.utils import winnow_case_name
from cl.lib.decorators import retry
from cl.lib.microservice_utils import microservice
from cl.lib.storage import S3GlacierInstantRetrievalStorage
from cl.recap.mergers import find_docket_object
from cl.scrapers.exceptions import (
    EmptyFileError,
@@ -446,3 +449,37 @@ def scraped_citation_object_is_valid(citation_object: dict) -> bool:
    logger.error("Parsed reporter '%s' does not exist", parsed_reporter)

    return False


def save_response(site: AbstractSite) -> None:
    """Store a scraper response's content and headers in an S3 bucket.

    This is passed to juriscraper's Site objects as the
    `save_response_fn` argument, which makes juriscraper
    save every response.

    :param site: the Site object, used to access the saved response
    :return: None
    """
    storage = S3GlacierInstantRetrievalStorage()
    response = site.request["response"]

    scraper_id = site.court_id.split(".")[-1]
    scrape_type = site.court_id.split(".")[1]  # opinions or oral args
    now_str = datetime.now().strftime("%Y/%m/%d/%H_%M_%S")
    base_name = f"responses/{scrape_type}/{scraper_id}/{now_str}"

    headers_json = json.dumps(dict(response.headers), indent=4)
    storage.save(f"{base_name}_headers.json", ContentFile(headers_json))

    try:
        # Tests whether the content is JSON; this only decides the file
        # extension, the raw bytes are saved either way
        json.loads(response.content)
        extension = "json"
    except (UnicodeDecodeError, json.decoder.JSONDecodeError):
        extension = "html"

    content_name = f"{base_name}.{extension}"
    storage.save(content_name, ContentFile(response.content))
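A worked example of the key layout this produces (the court_id is a representative juriscraper module path, not taken from the diff):

# Illustrative only: court_id values follow juriscraper's
# "juriscraper.<scrape_type>.<...>.<scraper_id>" module-path format.
court_id = "juriscraper.opinions.united_states.federal_appellate.ca1"
scraper_id = court_id.split(".")[-1]  # "ca1"
scrape_type = court_id.split(".")[1]  # "opinions"
# With datetime.now() at 2024-12-11 14:30:00, base_name becomes
# "responses/opinions/ca1/2024/12/11/14_30_00", so the two objects saved
# per scrape are:
#   responses/opinions/ca1/2024/12/11/14_30_00_headers.json
#   responses/opinions/ca1/2024/12/11/14_30_00.json  (or .html)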
