Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(citations): add UnmatchedCitation model and logic #4949

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cl/citations/management/commands/find_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from django.core.management import CommandError
from django.core.management.base import CommandParser

from cl.citations.models import UnmatchedCitation
from cl.citations.tasks import (
find_citations_and_parentheticals_for_opinion_by_pks,
)
Expand Down Expand Up @@ -112,6 +113,9 @@ def handle(self, *args: List[str], **options: OptionsType) -> None:
query = query.filter(date_modified__gte=options["modified_after"])
if options.get("all"):
query = Opinion.objects.all()
sys.stdout.write("Deleting all UnmatchedCitation rows")
UnmatchedCitation.objects.all().delete()

self.count = query.count()
self.average_per_s = 0.0
self.timings: List[float] = []
Expand Down
50 changes: 50 additions & 0 deletions cl/citations/management/commands/update_unmatched_citations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from cl.citations.management.commands import find_citations
from cl.citations.models import UnmatchedCitation
from cl.lib.command_utils import VerboseCommand


class Command(find_citations.Command):
    """Re-run find_citations_and_parentheticals_for_opinion_by_pks for
    opinions where unmatched citations have been found

    The set of opinions is taken from UnmatchedCitation rows, one entry per
    distinct citing opinion, and handed to the enqueuer inherited from
    find_citations.Command.
    """

    help = "Try to resolve unmatched citations"
    # Progress-tracking attributes expected by
    # find_citations.Command.update_documents. The class-level values are
    # only fallbacks: handle() rebinds them on the instance for every run so
    # the mutable `timings` list is never shared between Command instances.
    count = 0
    average_per_s = 0.0
    timings: list[float] = []

    def add_arguments(self, parser):
        """Register the command line options of this command.

        :param parser: the argument parser provided by Django
        """
        # Call VerboseCommand.add_arguments directly instead of super(): this
        # bypasses find_citations.Command.add_arguments, so only the options
        # declared below (plus VerboseCommand's) are registered.
        VerboseCommand.add_arguments(self, parser)
        parser.add_argument(
            "--resolve-failures",
            action="store_true",
            default=False,
            help="Include citations with FAILED and FAILED_AMBIGUOUS status",
        )
        parser.add_argument(
            "--queue",
            default="batch1",
            help="The celery queue where the tasks should be processed.",
        )

    def handle(self, *args, **options):
        """Re-uses find_citations.Command enqueuer and logging

        Selects the opinions that have unmatched citations in FOUND status
        (and, with --resolve-failures, also FAILED and FAILED_AMBIGUOUS) and
        passes their ids to `update_documents` on the requested queue.
        """
        VerboseCommand.handle(self, *args, **options)

        # Start every run from a clean per-instance state, mirroring what
        # find_citations.Command.handle does before enqueuing. Without this,
        # `timings` would be the list object stored on the class.
        self.average_per_s = 0.0
        self.timings = []

        statuses = [UnmatchedCitation.FOUND]
        if options["resolve_failures"]:
            statuses.extend(
                [UnmatchedCitation.FAILED, UnmatchedCitation.FAILED_AMBIGUOUS]
            )

        # distinct() on Django only works when the same field is on .order_by()
        opinion_ids = (
            UnmatchedCitation.objects.filter(status__in=statuses)
            .order_by("citing_opinion_id")
            .distinct("citing_opinion_id")
        )
        self.count = opinion_ids.count()
        opinion_pks = opinion_ids.values_list("citing_opinion_id", flat=True)
        # update_documents is inherited from find_citations.Command
        self.update_documents(opinion_pks, options["queue"])
154 changes: 154 additions & 0 deletions cl/citations/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,154 @@
# Generated by Django 5.1.4 on 2025-01-23 22:48

import django.db.models.deletion
from django.db import migrations, models


class Migration(migrations.Migration):
    """Initial migration of the citations app: creates the UnmatchedCitation
    model, its (volume, reporter, page) index and its uniqueness constraint.
    """

    initial = True

    # The "citing_opinion" foreign key below points to search.opinion, so the
    # search app has to be migrated up to this point first.
    dependencies = [
        ("search", "0037_alter_citation_type_noop"),
    ]

    operations = [
        migrations.CreateModel(
            name="UnmatchedCitation",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # Citation coordinates: volume / reporter / page / type
                (
                    "volume",
                    models.SmallIntegerField(
                        help_text="The volume of the reporter"
                    ),
                ),
                (
                    "reporter",
                    models.TextField(
                        db_index=True,
                        help_text="The abbreviation for the reporter",
                    ),
                ),
                (
                    "page",
                    models.TextField(
                        help_text="The 'page' of the citation in the reporter. Unfortunately, this is not an integer, but is a string-type because several jurisdictions do funny things with the so-called 'page'. For example, we have seen Roman numerals in Nebraska, 13301-M in Connecticut, and 144M in Montana."
                    ),
                ),
                (
                    "type",
                    models.SmallIntegerField(
                        choices=[
                            (1, "A federal reporter citation (e.g. 5 F. 55)"),
                            (
                                2,
                                "A citation in a state-based reporter (e.g. Alabama Reports)",
                            ),
                            (
                                3,
                                "A citation in a regional reporter (e.g. Atlantic Reporter)",
                            ),
                            (
                                4,
                                "A citation in a specialty reporter (e.g. Lawyers' Edition)",
                            ),
                            (
                                5,
                                "A citation in an early SCOTUS reporter (e.g. 5 Black. 55)",
                            ),
                            (
                                6,
                                "A citation in the Lexis system (e.g. 5 LEXIS 55)",
                            ),
                            (
                                7,
                                "A citation in the WestLaw system (e.g. 5 WL 55)",
                            ),
                            (8, "A vendor neutral citation (e.g. 2013 FL 1)"),
                            (
                                9,
                                "A law journal citation within a scholarly or professional legal periodical (e.g. 95 Yale L.J. 5; 72 Soc.Sec.Rep.Serv. 318)",
                            ),
                        ],
                        help_text="The type of citation that this is.",
                    ),
                ),
                # Resolution state of the unmatched citation; the integer
                # values and labels are a snapshot of the model's choices at
                # the time this migration was generated.
                (
                    "status",
                    models.SmallIntegerField(
                        choices=[
                            (
                                1,
                                "The citation may be unmatched if: 1. it does not exist in the search_citation table. 2. It exists on the search_citation table, but we couldn't match the citation to a cluster on the previous citation extractor run",
                            ),
                            (
                                2,
                                "The citation exists on the search_citation table. We haven't updated the citing Opinion.html_with_citations yet",
                            ),
                            (
                                3,
                                "The citing Opinion.html_with_citations was updated successfully",
                            ),
                            (
                                4,
                                "The citing Opinion.html_with_citations update failed because the citation is ambiguous",
                            ),
                            (
                                5,
                                "We couldn't resolve the citation, and the citing Opinion.html_with_citations update failed",
                            ),
                        ],
                        help_text="Status of resolution of the initially unmatched citation",
                    ),
                ),
                # Context captured alongside the citation
                (
                    "citation_string",
                    models.TextField(
                        help_text="The unparsed citation string in case it doesn't match the regular citation model in BaseCitation"
                    ),
                ),
                (
                    "court_id",
                    models.TextField(
                        help_text="A court_id as identified by eyecite from the opinion's context. May be useful to know where to find missing citations"
                    ),
                ),
                (
                    "year",
                    models.SmallIntegerField(
                        help_text="A year identified by eyecite from the opinion's context",
                        null=True,
                    ),
                ),
                # Rows are removed together with the citing opinion (CASCADE)
                (
                    "citing_opinion",
                    models.ForeignKey(
                        help_text="The opinion citing this citation",
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="unmatched_citations",
                        to="search.opinion",
                    ),
                ),
            ],
            options={
                # Composite index over the citation coordinates
                "indexes": [
                    models.Index(
                        fields=["volume", "reporter", "page"],
                        name="citations_u_volume_da4d25_idx",
                    )
                ],
                # At most one row per citation per citing opinion
                "unique_together": {
                    ("citing_opinion", "volume", "reporter", "page")
                },
            },
        ),
    ]
Empty file.
115 changes: 115 additions & 0 deletions cl/citations/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,115 @@
from django.db import models
from eyecite.models import FullCaseCitation

from cl.citations.utils import map_reporter_db_cite_type
from cl.search.models import BaseCitation, Citation, Opinion


class UnmatchedCitation(BaseCitation):
    """Keep track of citations that could not be resolved to a cluster on the
    batch citator run

    The citation coordinates (volume, reporter, page, type) are passed to the
    constructor alongside the fields declared here; see `create_from_eyecite`.
    """

    # Resolution states; the integers are what gets stored in `status`
    UNMATCHED = 1
    FOUND = 2
    RESOLVED = 3
    FAILED_AMBIGUOUS = 4
    FAILED = 5
    STATUS = (
        (
            UNMATCHED,
            "The citation may be unmatched if: 1. it does not exist in the "
            "search_citation table. 2. It exists on the search_citation table,"
            " but we couldn't match the citation to a cluster on the previous"
            " citation extractor run",
        ),
        (
            FOUND,
            # Single space between the two fragments, so the label is the
            # same text recorded in migration 0001_initial.
            "The citation exists on the search_citation table. We "
            "haven't updated the citing Opinion.html_with_citations yet",
        ),
        (
            RESOLVED,
            "The citing Opinion.html_with_citations was updated successfully",
        ),
        (
            FAILED_AMBIGUOUS,
            "The citing Opinion.html_with_citations update "
            "failed because the citation is ambiguous",
        ),
        (
            FAILED,
            "We couldn't resolve the citation, and the citing "
            "Opinion.html_with_citations update failed",
        ),
    )
    citing_opinion: models.ForeignKey = models.ForeignKey(
        Opinion,
        help_text="The opinion citing this citation",
        on_delete=models.CASCADE,
        related_name="unmatched_citations",
    )
    status: models.SmallIntegerField = models.SmallIntegerField(
        help_text="Status of resolution of the initially unmatched citation",
        choices=STATUS,
    )
    citation_string: models.TextField = models.TextField(
        help_text="The unparsed citation string in case it doesn't match the "
        "regular citation model in BaseCitation"
    )
    court_id: models.TextField = models.TextField(
        help_text="A court_id as identified by eyecite from the opinion's "
        "context. May be useful to know where to find missing citations"
    )
    year: models.SmallIntegerField = models.SmallIntegerField(
        help_text="A year identified by eyecite from the opinion's context",
        null=True,
    )

    class Meta:
        indexes = [
            models.Index(
                fields=["volume", "reporter", "page"],
            )
        ]
        # A citing opinion holds at most one row per citation
        unique_together = (("citing_opinion", "volume", "reporter", "page"),)

    @classmethod
    def create_from_eyecite(
        cls, eyecite_citation: FullCaseCitation, citing_opinion: Opinion
    ) -> "UnmatchedCitation":
        """
        Create an UnmatchedCitation instance using an eyecite FullCaseCitation

        Saving is left to the caller. Note that this method does query the
        database: it checks whether a matching row exists in the Citation
        table in order to choose the initial status.

        :param eyecite_citation: a FullCaseCitation as returned by
            eyecite.get_citations
        :param citing_opinion: the opinion which uses the citation
        :return: an unsaved UnmatchedCitation, with status FAILED when the
            citation already exists in the Citation table and UNMATCHED
            otherwise
        """
        # The citation type is derived from the first edition eyecite matched
        cite_type_str = eyecite_citation.all_editions[0].reporter.cite_type
        year = eyecite_citation.metadata.year
        unmatched_citation = cls(
            citing_opinion=citing_opinion,
            status=cls.UNMATCHED,
            citation_string=eyecite_citation.matched_text(),
            # court_id is not nullable; store an empty string when eyecite
            # found no court
            court_id=eyecite_citation.metadata.court or "",
            year=int(year) if year else None,
            volume=eyecite_citation.groups["volume"],
            reporter=eyecite_citation.corrected_reporter(),
            page=eyecite_citation.groups["page"],
            type=map_reporter_db_cite_type(cite_type_str),
        )

        # The citation exists in the search_citation table, but it couldn't
        # be resolved
        if Citation.objects.filter(
            volume=unmatched_citation.volume,
            reporter=unmatched_citation.reporter,
            page=unmatched_citation.page,
            type=unmatched_citation.type,
        ).exists():
            unmatched_citation.status = cls.FAILED

        return unmatched_citation
Loading
Loading