Merge pull request #4885 from ttys0dev/update-bad-pacer-doc-id
Fix finding document entries with a bad pacer_doc_id
mlissner authored Jan 7, 2025
2 parents b7dac26 + 3388e05 commit 738f6f9
Showing 3 changed files with 154 additions and 55 deletions.
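
In short: when the RECAPDocument lookup that includes the incoming pacer_doc_id misses, the merger now retries the lookup without the pacer_doc_id, so a stored document carrying a bad ID can still be matched on its docket entry and document type. A minimal sketch of that fallback, with simplified parameters and a hypothetical helper name (the real logic lives inline in add_docket_entries below):

from cl.search.models import RECAPDocument

async def find_rd_tolerating_bad_doc_id(get_params: dict, params: dict):
    # get_params includes the incoming pacer_doc_id; params does not.
    try:
        return await RECAPDocument.objects.aget(**get_params)
    except RECAPDocument.DoesNotExist:
        pass
    try:
        # Retry without pacer_doc_id: the stored row may carry a bad
        # one but still match on docket entry and document type.
        return await RECAPDocument.objects.aget(**params)
    except RECAPDocument.DoesNotExist:
        return None  # The caller falls back to creating the document.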
1 change: 1 addition & 0 deletions cl/recap/factories.py
@@ -93,6 +93,7 @@ class RECAPEmailDocketEntryDataFactory(DictFactory):
     pacer_doc_id = Faker("random_id_string")
     pacer_magic_num = Faker("random_id_string")
     pacer_seq_no = Faker("random_id_string")
+    short_description = Faker("text", max_nb_chars=15)
 
 
 class RECAPEmailDocketDataFactory(DictFactory):
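
The new short_description field means RECAPEmailDocketEntryDataFactory output now includes a Faker-generated description of at most 15 characters, which the merger below copies onto the matched document. A small usage sketch, assuming a test context where the factory is importable:

from cl.recap.factories import RECAPEmailDocketEntryDataFactory

# DictFactory subclasses build plain dicts rather than model instances,
# so this yields docket entry data with a random short_description.
entry_data = RECAPEmailDocketEntryDataFactory(pacer_doc_id="04505578690")
assert len(entry_data["short_description"]) <= 15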
112 changes: 57 additions & 55 deletions cl/recap/mergers.py
@@ -822,6 +822,35 @@ async def get_or_make_docket_entry(
     return de, de_created
 
 
+async def keep_latest_rd_document(queryset: QuerySet) -> RECAPDocument:
+    """Retain the most recent item with a PDF if available; otherwise,
+    retain the most recent item overall.
+
+    :param queryset: RECAPDocument QuerySet to clean duplicates from.
+    :return: The matched RECAPDocument after cleaning.
+    """
+    rd_with_pdf_queryset = queryset.filter(is_available=True).exclude(
+        filepath_local=""
+    )
+    if await rd_with_pdf_queryset.aexists():
+        rd = await rd_with_pdf_queryset.alatest("date_created")
+    else:
+        rd = await queryset.alatest("date_created")
+    await queryset.exclude(pk=rd.pk).adelete()
+    return rd
+
+
+async def clean_duplicate_documents(params: dict[str, Any]) -> RECAPDocument:
+    """Remove duplicate RECAPDocuments, keeping the most recent with a PDF
+    if available, or otherwise the most recent overall.
+
+    :param params: Query parameters to filter the RECAPDocuments.
+    :return: The matched RECAPDocument after cleaning.
+    """
+    duplicate_rd_queryset = RECAPDocument.objects.filter(**params)
+    return await keep_latest_rd_document(duplicate_rd_queryset)
+
+
 async def add_docket_entries(
     d: Docket,
     docket_entries: list[dict[str, Any]],
@@ -934,35 +963,39 @@ async def add_docket_entries(
             rd = await RECAPDocument.objects.aget(**get_params)
             rds_updated.append(rd)
         except RECAPDocument.DoesNotExist:
-            try:
-                params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
-                rd = await RECAPDocument.objects.acreate(
-                    document_number=docket_entry["document_number"] or "",
-                    is_available=False,
-                    **params,
-                )
-            except ValidationError:
-                # Happens from race conditions.
-                continue
-            rds_created.append(rd)
+            rd = None
+            if de_created is False and not appelate_court_id_exists:
+                try:
+                    # Check for documents with a bad pacer_doc_id
+                    rd = await RECAPDocument.objects.aget(**params)
+                except RECAPDocument.DoesNotExist:
+                    # Fallback to creating document
+                    pass
+                except RECAPDocument.MultipleObjectsReturned:
+                    rd = await clean_duplicate_documents(params)
+            if rd is None:
+                try:
+                    params["pacer_doc_id"] = docket_entry["pacer_doc_id"]
+                    rd = await RECAPDocument.objects.acreate(
+                        document_number=docket_entry["document_number"] or "",
+                        is_available=False,
+                        **params,
+                    )
+                    rds_created.append(rd)
+                except ValidationError:
+                    # Happens from race conditions.
+                    continue
         except RECAPDocument.MultipleObjectsReturned:
             logger.info(
                 "Multiple recap documents found for document entry number'%s' "
                 "while processing '%s'" % (docket_entry["document_number"], d)
             )
             if params["document_type"] == RECAPDocument.ATTACHMENT:
                 continue
-            duplicate_rd_queryset = RECAPDocument.objects.filter(**params)
-            rd_with_pdf_queryset = duplicate_rd_queryset.filter(
-                is_available=True
-            ).exclude(filepath_local="")
-            if await rd_with_pdf_queryset.aexists():
-                rd = await rd_with_pdf_queryset.alatest("date_created")
-            else:
-                rd = await duplicate_rd_queryset.alatest("date_created")
-            await duplicate_rd_queryset.exclude(pk=rd.pk).adelete()
+            rd = await clean_duplicate_documents(params)
 
-        rd.pacer_doc_id = rd.pacer_doc_id or docket_entry["pacer_doc_id"]
+        if docket_entry["pacer_doc_id"]:
+            rd.pacer_doc_id = docket_entry["pacer_doc_id"]
         description = docket_entry.get("short_description")
         if rd.document_type == RECAPDocument.PACER_DOCUMENT and description:
             rd.description = description
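
Note the behavioral change at the end of this hunk: the old line kept an existing pacer_doc_id and only filled it when blank, while the new lines let a non-empty incoming value overwrite a stored (possibly bad) one. Side by side:

# Old policy: an existing value wins; only fill it when blank.
rd.pacer_doc_id = rd.pacer_doc_id or docket_entry["pacer_doc_id"]

# New policy: a non-empty incoming value wins, so a stored bad
# pacer_doc_id is corrected; a blank incoming value changes nothing.
if docket_entry["pacer_doc_id"]:
    rd.pacer_doc_id = docket_entry["pacer_doc_id"]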
@@ -1604,14 +1637,7 @@ async def clean_duplicate_attachment_entries(
     )
     async for dupe in dupes.aiterator():
         duplicate_rd_queryset = rds.filter(pacer_doc_id=dupe.pacer_doc_id)
-        rd_with_pdf_queryset = duplicate_rd_queryset.filter(
-            is_available=True
-        ).exclude(filepath_local="")
-        if await rd_with_pdf_queryset.aexists():
-            keep_rd = await rd_with_pdf_queryset.alatest("date_created")
-        else:
-            keep_rd = await duplicate_rd_queryset.alatest("date_created")
-        await duplicate_rd_queryset.exclude(pk=keep_rd.pk).adelete()
+        await keep_latest_rd_document(duplicate_rd_queryset)
 
 
 async def merge_attachment_page_data(
@@ -1673,15 +1699,7 @@ async def merge_attachment_page_data(
 
     except RECAPDocument.MultipleObjectsReturned as exc:
         if pacer_case_id:
-            duplicate_rd_queryset = RECAPDocument.objects.filter(**params)
-            rd_with_pdf_queryset = duplicate_rd_queryset.filter(
-                is_available=True
-            ).exclude(filepath_local="")
-            if await rd_with_pdf_queryset.aexists():
-                keep_rd = await rd_with_pdf_queryset.alatest("date_created")
-            else:
-                keep_rd = await duplicate_rd_queryset.alatest("date_created")
-            await duplicate_rd_queryset.exclude(pk=keep_rd.pk).adelete()
+            await clean_duplicate_documents(params)
             main_rd = await RECAPDocument.objects.select_related(
                 "docket_entry", "docket_entry__docket"
             ).aget(**params)
@@ -1711,23 +1729,7 @@ async def merge_attachment_page_data(
                 break
             except RECAPDocument.MultipleObjectsReturned as exc:
                 if pacer_case_id:
-                    duplicate_rd_queryset = RECAPDocument.objects.filter(
-                        **params
-                    )
-                    rd_with_pdf_queryset = duplicate_rd_queryset.filter(
-                        is_available=True
-                    ).exclude(filepath_local="")
-                    if await rd_with_pdf_queryset.aexists():
-                        keep_rd = await rd_with_pdf_queryset.alatest(
-                            "date_created"
-                        )
-                    else:
-                        keep_rd = await duplicate_rd_queryset.alatest(
-                            "date_created"
-                        )
-                    await duplicate_rd_queryset.exclude(
-                        pk=keep_rd.pk
-                    ).adelete()
+                    await clean_duplicate_documents(params)
                     main_rd = await RECAPDocument.objects.select_related(
                         "docket_entry", "docket_entry__docket"
                     ).aget(**params)
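
Since the same dedupe pattern was repeated three times above, the preference order keep_latest_rd_document encodes is worth stating once: among duplicates, keep the newest row that is_available and has a non-empty filepath_local; failing that, keep the newest row overall; delete the rest. A pure-Python analogue of the selection, for illustration only (hypothetical Row type, not the ORM code):

from dataclasses import dataclass
from datetime import datetime

@dataclass
class Row:
    pk: int
    is_available: bool
    filepath_local: str
    date_created: datetime

def pick_survivor(rows: list[Row]) -> Row:
    # Prefer rows that actually have a PDF on disk...
    with_pdf = [r for r in rows if r.is_available and r.filepath_local]
    candidates = with_pdf or rows
    # ...and among those, keep the most recently created one.
    return max(candidates, key=lambda r: r.date_created)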
96 changes: 96 additions & 0 deletions cl/recap/tests.py
@@ -1072,6 +1072,102 @@ def test_process_attachments_for_subdocket_pq_with_missing_main_rd(
             msg="Didn't get the expected docket ID.",
         )
 
+    def test_match_recap_document_with_wrong_pacer_doc_id(self, mock_upload):
+        """Confirm that when an existing RECAPDocument has an invalid
+        pacer_doc_id, we can still match it after excluding the pacer_doc_id
+        from the lookup.
+        """
+
+        de_data = DocketEntriesDataFactory(
+            docket_entries=[
+                RECAPEmailDocketEntryDataFactory(
+                    pacer_doc_id="04505578690",
+                    document_number=5,
+                )
+            ],
+        )
+        de = DocketEntryWithParentsFactory(
+            docket__court=self.court, entry_number=5
+        )
+        rd = RECAPDocumentFactory(
+            docket_entry=de,
+            document_type=RECAPDocument.PACER_DOCUMENT,
+            pacer_doc_id="04505578691",
+            document_number="5",
+            description="",
+        )
+        # Add the docket entry with the updated pacer_doc_id
+        async_to_sync(add_docket_entries)(de.docket, de_data["docket_entries"])
+        recap_documents = RECAPDocument.objects.all()
+        self.assertEqual(
+            recap_documents.count(), 1, msg="Wrong number of RECAPDocuments"
+        )
+        rd.refresh_from_db()
+        self.assertEqual(
+            rd.description,
+            de_data["docket_entries"][0]["short_description"],
+            msg="The short description doesn't match.",
+        )
+        self.assertEqual(
+            rd.pacer_doc_id,
+            de_data["docket_entries"][0]["pacer_doc_id"],
+            msg="The pacer_doc_id doesn't match.",
+        )
+
+    def test_match_recap_document_with_wrong_pacer_doc_id_duplicated(
+        self, mock_upload
+    ):
+        """Confirm that when an existing RECAPDocument has an invalid
+        pacer_doc_id, we can still match it after excluding the pacer_doc_id
+        from the lookup, even if there is more than one PACER_DOCUMENT that
+        belongs to the docket entry.
+        """
+
+        de_data = DocketEntriesDataFactory(
+            docket_entries=[
+                RECAPEmailDocketEntryDataFactory(
+                    pacer_doc_id="04505578690",
+                    document_number=5,
+                )
+            ],
+        )
+        de = DocketEntryWithParentsFactory(
+            docket__court=self.court, entry_number=5
+        )
+        RECAPDocumentFactory(
+            document_type=RECAPDocument.PACER_DOCUMENT,
+            docket_entry=de,
+            pacer_doc_id="04505578691",
+            document_number="5",
+            description="",
+        )
+        rd_2 = RECAPDocumentFactory(
+            document_type=RECAPDocument.PACER_DOCUMENT,
+            docket_entry=de,
+            pacer_doc_id="04505578691",
+            document_number="6",
+            description="",
+            is_available=True,
+        )
+        # Add the docket entry with the updated pacer_doc_id, remove the
+        # duplicated RD, and keep the one that is available.
+        async_to_sync(add_docket_entries)(de.docket, de_data["docket_entries"])
+        recap_documents = RECAPDocument.objects.all()
+        self.assertEqual(
+            recap_documents.count(), 1, msg="Wrong number of RECAPDocuments"
+        )
+        rd_2.refresh_from_db()
+        self.assertEqual(
+            rd_2.description,
+            de_data["docket_entries"][0]["short_description"],
+            msg="The short description doesn't match.",
+        )
+        self.assertEqual(
+            rd_2.pacer_doc_id,
+            de_data["docket_entries"][0]["pacer_doc_id"],
+            msg="The pacer_doc_id doesn't match.",
+        )
 
 
 @mock.patch("cl.recap.tasks.DocketReport", new=fakes.FakeDocketReport)
 @mock.patch(
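
The tests drive the async merger from synchronous test methods through asgiref's async_to_sync, which wraps a coroutine function into a blocking callable. In isolation, with de and de_data standing in for the factory-built objects above:

from asgiref.sync import async_to_sync

from cl.recap.mergers import add_docket_entries

# Runs the coroutine to completion on its own event loop, so a plain
# synchronous TestCase method can exercise the async merge code.
async_to_sync(add_docket_entries)(de.docket, de_data["docket_entries"])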
