Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(citations): add UnmatchedCitation model and logic #4949

Open
wants to merge 6 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions cl/citations/management/commands/find_citations.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from django.core.management import CommandError
from django.core.management.base import CommandParser

from cl.citations.models import UnmatchedCitation
from cl.citations.tasks import (
find_citations_and_parentheticals_for_opinion_by_pks,
)
Expand Down Expand Up @@ -112,6 +113,9 @@ def handle(self, *args: List[str], **options: OptionsType) -> None:
query = query.filter(date_modified__gte=options["modified_after"])
if options.get("all"):
query = Opinion.objects.all()
sys.stdout.write("Deleting all UnmatchedCitation rows")
UnmatchedCitation.objects.all().delete()

self.count = query.count()
self.average_per_s = 0.0
self.timings: List[float] = []
Expand Down
50 changes: 50 additions & 0 deletions cl/citations/management/commands/update_unmatched_citations.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from cl.citations.management.commands import find_citations
from cl.citations.models import UnmatchedCitation
from cl.lib.command_utils import VerboseCommand


class Command(find_citations.Command):
    """Re-run find_citations_and_parentheticals_for_opinion_by_pks for
    opinions where unmatched citations have been found

    The class inherits from find_citations.Command only to reuse its
    `update_documents` enqueuer. Both `add_arguments` and `handle` call
    VerboseCommand directly, so the parent's own argument parsing and
    `handle` implementation are bypassed.
    """

    help = "Try to resolve unmatched citations"
    # variables to use find_citations.Command.update_documents
    # These are class-level defaults only; `handle` rebinds them on the
    # instance for every run so the mutable list is never shared.
    count = 0
    average_per_s = 0.0
    timings: list[float] = []

    def add_arguments(self, parser):
        """Register the CLI options of this command

        :param parser: the argument parser supplied by Django
        """
        # Deliberately skip find_citations.Command.add_arguments
        VerboseCommand.add_arguments(self, parser)
        parser.add_argument(
            "--resolve-failures",
            action="store_true",
            default=False,
            help="Include citations with FAILED and FAILED_AMBIGUOUS status",
        )
        parser.add_argument(
            "--queue",
            default="batch1",
            help="The celery queue where the tasks should be processed.",
        )

    def handle(self, *args, **options):
        """Re-uses find_citations.Command enqueuer and logging

        Collects the distinct citing opinions that have UnmatchedCitation
        rows in a retryable status and hands their ids to
        find_citations.Command.update_documents

        :param args: positional arguments, forwarded to VerboseCommand.handle
        :param options: parsed CLI options; `resolve_failures` and `queue`
            are read here
        """
        VerboseCommand.handle(self, *args, **options)

        # find_citations.Command.handle rebinds these attributes on every
        # run. Do the same here: otherwise `timings` would be the single
        # list object stored on the class, and whatever gets stored in it
        # would carry over to later runs in the same process.
        self.average_per_s = 0.0
        self.timings = []

        statuses = [UnmatchedCitation.FOUND]
        if options["resolve_failures"]:
            statuses.extend(
                [UnmatchedCitation.FAILED, UnmatchedCitation.FAILED_AMBIGUOUS]
            )

        # distinct() on Django only works when the same field is on .order_by()
        unmatched_rows = (
            UnmatchedCitation.objects.filter(status__in=statuses)
            .order_by("citing_opinion_id")
            .distinct("citing_opinion_id")
        )
        # One row per citing opinion, so this is the number of opinions to
        # be re-processed
        self.count = unmatched_rows.count()
        opinion_pks = unmatched_rows.values_list(
            "citing_opinion_id", flat=True
        )
        find_citations.Command.update_documents(
            self, opinion_pks, options["queue"]
        )
153 changes: 153 additions & 0 deletions cl/citations/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,153 @@
# Generated by Django 5.1.4 on 2025-01-21 03:45

import django.db.models.deletion
from django.db import migrations, models


# Initial migration of the `citations` app: creates the table backing the
# UnmatchedCitation model. Auto-generated by Django; the field definitions,
# choices and help texts must stay in sync with the model, so edit the model
# and regenerate instead of changing this file by hand.
class Migration(migrations.Migration):

    initial = True

    # The model has a foreign key to search.opinion, so the search app schema
    # must be in place first.
    dependencies = [
        ("search", "0037_alter_citation_type_noop"),
    ]

    operations = [
        migrations.CreateModel(
            name="UnmatchedCitation",
            fields=[
                (
                    "id",
                    models.AutoField(
                        auto_created=True,
                        primary_key=True,
                        serialize=False,
                        verbose_name="ID",
                    ),
                ),
                # volume / reporter / page / type are not declared on
                # UnmatchedCitation itself; presumably they come from its
                # BaseCitation parent in cl.search.models (verify there).
                (
                    "volume",
                    models.SmallIntegerField(
                        help_text="The volume of the reporter"
                    ),
                ),
                (
                    "reporter",
                    models.TextField(
                        db_index=True,
                        help_text="The abbreviation for the reporter",
                    ),
                ),
                (
                    "page",
                    models.TextField(
                        help_text="The 'page' of the citation in the reporter. Unfortunately, this is not an integer, but is a string-type because several jurisdictions do funny things with the so-called 'page'. For example, we have seen Roman numerals in Nebraska, 13301-M in Connecticut, and 144M in Montana."
                    ),
                ),
                (
                    "type",
                    models.SmallIntegerField(
                        choices=[
                            (1, "A federal reporter citation (e.g. 5 F. 55)"),
                            (
                                2,
                                "A citation in a state-based reporter (e.g. Alabama Reports)",
                            ),
                            (
                                3,
                                "A citation in a regional reporter (e.g. Atlantic Reporter)",
                            ),
                            (
                                4,
                                "A citation in a specialty reporter (e.g. Lawyers' Edition)",
                            ),
                            (
                                5,
                                "A citation in an early SCOTUS reporter (e.g. 5 Black. 55)",
                            ),
                            (
                                6,
                                "A citation in the Lexis system (e.g. 5 LEXIS 55)",
                            ),
                            (
                                7,
                                "A citation in the WestLaw system (e.g. 5 WL 55)",
                            ),
                            (8, "A vendor neutral citation (e.g. 2013 FL 1)"),
                            (
                                9,
                                "A law journal citation within a scholarly or professional legal periodical (e.g. 95 Yale L.J. 5; 72 Soc.Sec.Rep.Serv. 318)",
                            ),
                        ],
                        help_text="The type of citation that this is.",
                    ),
                ),
                # Values 1-5 correspond, in order, to UnmatchedCitation's
                # UNMATCHED, FOUND, RESOLVED, FAILED_AMBIGUOUS and FAILED
                # constants.
                (
                    "status",
                    models.SmallIntegerField(
                        choices=[
                            (
                                1,
                                "The citation does not exist in the search_citation table. We couldn't match the citation to a cluster on the previous citation extractor run",
                            ),
                            (
                                2,
                                "The citation exists on the search_citation table. We haven't updated the citing Opinion.html_with_citations yet",
                            ),
                            (
                                3,
                                "The citing Opinion.html_with_citations was updated successfully",
                            ),
                            (
                                4,
                                "The citing Opinion.html_with_citations update failed because the citation is ambiguous",
                            ),
                            (
                                5,
                                "The citing Opinion.html_with_citations update failed",
                            ),
                        ],
                        help_text="Status of resolution of the initially unmatched citation",
                    ),
                ),
                (
                    "citation_string",
                    models.TextField(
                        help_text="The unparsed citation string in case it doesn't match the regular citation model in BaseCitation"
                    ),
                ),
                (
                    "court_id",
                    models.TextField(
                        help_text="A court_id as identified by eyecite from the opinion's context. May be useful to know where to find missing citations"
                    ),
                ),
                (
                    "year",
                    models.TextField(
                        help_text="A year identified by eyecite from the opinion's context"
                    ),
                ),
                # Deleting an opinion cascades to the unmatched citations it
                # contains.
                (
                    "citing_opinion",
                    models.ForeignKey(
                        help_text="The opinion citing this citation",
                        on_delete=django.db.models.deletion.CASCADE,
                        related_name="eyecite_citations",
                        to="search.opinion",
                    ),
                ),
            ],
            options={
                # Composite index over the (volume, reporter, page) triple.
                "indexes": [
                    models.Index(
                        fields=["volume", "reporter", "page"],
                        name="citations_u_volume_da4d25_idx",
                    )
                ],
                # At most one row per citation per citing opinion.
                "unique_together": {
                    ("citing_opinion", "volume", "reporter", "page")
                },
            },
        ),
    ]
Empty file.
96 changes: 96 additions & 0 deletions cl/citations/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
from django.db import models
from eyecite.models import FullCaseCitation

from cl.citations.utils import map_reporter_db_cite_type
from cl.search.models import BaseCitation, Opinion


class UnmatchedCitation(BaseCitation):
    """A citation found in an opinion that the batch citator run could not
    resolve to a cluster

    Each row stores the citing opinion, the citation as extracted by eyecite
    and a `status` describing how far its resolution has progressed.
    """

    # Possible values of `status`
    UNMATCHED = 1
    FOUND = 2
    RESOLVED = 3
    FAILED_AMBIGUOUS = 4
    FAILED = 5
    # The labels are tracked by the migrations; changing their text requires
    # a new migration
    STATUS = (
        (
            UNMATCHED,
            "The citation does not exist in the search_citation table."
            " We couldn't match the citation to a cluster on the "
            " previous citation extractor run",
        ),
        (
            FOUND,
            "The citation exists on the search_citation table. We "
            " haven't updated the citing Opinion.html_with_citations yet",
        ),
        (
            RESOLVED,
            "The citing Opinion.html_with_citations was updated successfully",
        ),
        (
            FAILED_AMBIGUOUS,
            "The citing Opinion.html_with_citations update "
            "failed because the citation is ambiguous",
        ),
        (FAILED, "The citing Opinion.html_with_citations update failed"),
    )

    # Reverse accessor on Opinion is `eyecite_citations`; rows are deleted
    # together with the opinion they belong to
    citing_opinion: models.ForeignKey = models.ForeignKey(
        Opinion,
        help_text="The opinion citing this citation",
        on_delete=models.CASCADE,
        related_name="eyecite_citations",
    )
    status: models.SmallIntegerField = models.SmallIntegerField(
        help_text="Status of resolution of the initially unmatched citation",
        choices=STATUS,
    )
    citation_string: models.TextField = models.TextField(
        help_text="The unparsed citation string in case it doesn't match the "
        "regular citation model in BaseCitation"
    )
    court_id: models.TextField = models.TextField(
        help_text="A court_id as identified by eyecite from the opinion's "
        "context. May be useful to know where to find missing citations"
    )
    year: models.TextField = models.TextField(
        help_text="A year identified by eyecite from the opinion's context"
    )

    class Meta:
        # volume / reporter / page are not declared in this class;
        # presumably they are inherited from BaseCitation
        indexes = [
            models.Index(
                fields=["volume", "reporter", "page"],
            )
        ]
        # A given citation is stored at most once per citing opinion
        unique_together = (("citing_opinion", "volume", "reporter", "page"),)

    @classmethod
    def create_from_eyecite(
        cls, eyecite_citation: FullCaseCitation, citing_opinion: Opinion
    ):
        """
        Build an unsaved UnmatchedCitation out of an eyecite FullCaseCitation

        The new instance always starts in the UNMATCHED status; persisting
        it is the caller's responsibility

        :param eyecite_citation: a FullCaseCitation as returned by
        eyecite.get_citations
        :param citing_opinion: the opinion which uses the citation
        :return: the new, unsaved instance
        """
        # The citation type is taken from the reporter of the first edition
        reporter_cite_type = eyecite_citation.all_editions[
            0
        ].reporter.cite_type
        field_values = {
            "citing_opinion": citing_opinion,
            "status": cls.UNMATCHED,
            "citation_string": eyecite_citation.matched_text(),
            # A falsy court or year is stored as an empty string
            "court_id": eyecite_citation.metadata.court or "",
            "year": eyecite_citation.metadata.year or "",
            "volume": eyecite_citation.groups["volume"],
            "reporter": eyecite_citation.corrected_reporter(),
            "page": eyecite_citation.groups["page"],
            "type": map_reporter_db_cite_type(reporter_cite_type),
        }
        return cls(**field_values)
Loading
Loading