diff --git a/cl/alerts/tests/tests.py b/cl/alerts/tests/tests.py index 57817100a8..6376730f05 100644 --- a/cl/alerts/tests/tests.py +++ b/cl/alerts/tests/tests.py @@ -2392,7 +2392,7 @@ def test_es_alert_update_and_delete(self, mock_abort_audio): user=self.user_profile.user, rate=Alert.REAL_TIME, name="Test Alert OA", - query="type=oa&docket_number=19-1010", + query="type=oa&docket_number=19-1010&order_by=score desc", alert_type=SEARCH_TYPES.ORAL_ARGUMENT, ) @@ -2402,6 +2402,7 @@ def test_es_alert_update_and_delete(self, mock_abort_audio): response_str = str(doc.to_dict()) self.assertIn("'query': '19-1010'", response_str) self.assertIn("'rate': 'rt'", response_str) + self.assertNotIn("function_score", response_str) # Update Alert search_alert_1.query = "type=oa&docket_number=19-1020" diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 8b99cba47e..1076802912 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -2221,13 +2221,19 @@ def test_index_and_delete_recap_alerts_from_percolator( user=self.user_profile.user, rate=Alert.WEEKLY, name="Test Alert Docket Only", - query='q="401 Civil"&type=r', + query='q="401 Civil"&type=r&order_by=score desc', alert_type=SEARCH_TYPES.RECAP, ) self.assertTrue( RECAPPercolator.exists(id=docket_only_alert.pk), msg=f"Alert id: {docket_only_alert.pk} was not indexed.", ) + alert_doc = RECAPPercolator.get(id=docket_only_alert.pk) + response_str = str(alert_doc.to_dict()) + self.assertIn("401 Civil", response_str) + self.assertIn("'rate': 'wly'", response_str) + # function_score breaks percolator queries. Ensure it is never indexed. + self.assertNotIn("function_score", response_str) docket_only_alert_id = docket_only_alert.pk # Remove the alert. diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 5f2017015a..c39292e9aa 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -37,6 +37,7 @@ ApiPositionMapping, BasePositionMapping, CleanData, + EsJoinQueries, EsMainQueries, ESRangeQueryParams, ) @@ -71,6 +72,7 @@ SEARCH_RECAP_PARENT_QUERY_FIELDS, api_child_highlight_map, cardinality_query_unique_ids, + date_decay_relevance_types, recap_boosts_es, ) from cl.search.exception import ( @@ -938,6 +940,74 @@ def build_custom_function_score_for_date( return query +def build_decay_relevance_score( + query: QueryString | str, + date_field: str, + scale: int, + decay: float, + default_missing_date: str = "1600-01-01T00:00:00Z", + boost_mode: str = "multiply", + min_score: float = 0.0, +) -> QueryString: + """ + Build a decay relevance score query for Elasticsearch that adjusts the + relevance of documents based on a date field. + + :param query: The Elasticsearch query string or QueryString object. + :param date_field: The date field used to compute the relevance decay. + :param scale: The scale (in years) that determines the rate of decay. + :param decay: The decay factor. + :param default_missing_date: The default date to use when the date field + is null. + :param boost_mode: The mode to combine the decay score with the query's + original relevance score. + :param min_score: The minimum score where the decay function stabilizes. + :return: The modified QueryString object with applied function score. 
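+
+    The script below computes exp((ln(decay) / scale-in-milliseconds) * |doc_date - now|)
+    and rescales the result to min_score + (1 - min_score) * decay_score, so
+    scores flatten out at min_score rather than approaching zero for very old
+    documents.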
+ """ + + query = Q( + "function_score", + query=query, + script_score={ + "script": { + "source": f""" + def default_missing_date = Instant.parse(params.default_missing_date).toEpochMilli(); + def decay = (double)params.decay; + def now = new Date().getTime(); + def min_score = (double)params.min_score; + + // Convert scale parameter into milliseconds. + double years = (double)params.scale; + // Convert years to milliseconds 1 year = 365 days + long scaleMillis = (long)(years * 365 * 24 * 60 * 60 * 1000); + + // Retrieve the document date. If missing or null, use default_missing_date + def docDate = default_missing_date; + if (doc['{date_field}'].size() > 0) {{ + docDate = doc['{date_field}'].value.toInstant().toEpochMilli(); + }} + // λ = ln(decay)/scale + def lambda = Math.log(decay) / scaleMillis; + // Absolute distance from now + def diff = Math.abs(docDate - now); + // Score: exp( λ * max(0, |docDate - now|) ) + def decay_score = Math.exp(lambda * diff); + // Adjust the decay score to have a minimum value + return min_score + ((1 - min_score) * decay_score); + """, + "params": { + "default_missing_date": default_missing_date, + "scale": scale, # Years + "decay": decay, + "min_score": min_score, + }, + }, + }, + boost_mode=boost_mode, + ) + return query + + def build_has_child_query( query: QueryString | str, child_type: str, @@ -1021,30 +1091,21 @@ def combine_plain_filters_and_queries( final_query.filter = reduce(operator.iand, filters) if filters and string_query: final_query.minimum_should_match = 1 - - if cd["type"] == SEARCH_TYPES.ORAL_ARGUMENT: - # Apply custom score for dateArgued sorting in the V4 API. - final_query = apply_custom_score_to_main_query( - cd, final_query, api_version - ) return final_query def get_match_all_query( cd: CleanData, - search_query: Search, api_version: Literal["v3", "v4"] | None = None, child_highlighting: bool = True, -) -> Search: +) -> Query: """Build and return a match-all query for each type of document. :param cd: The query CleanedData - :param search_query: Elasticsearch DSL Search object :param api_version: Optional, the request API version. :param child_highlighting: Whether highlighting should be enabled in child docs. - :return: The modified Search object based on the given conditions. + :return: The Match All Query object. """ - _, query_hits_limit = get_child_top_hits_limit( cd, cd["type"], api_version=api_version ) @@ -1068,9 +1129,6 @@ def get_match_all_query( final_match_all_query = Q( "bool", should=q_should, minimum_should_match=1 ) - final_match_all_query = apply_custom_score_to_main_query( - cd, final_match_all_query, api_version - ) case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: # Match all query for RECAP and Dockets, it'll return dockets # with child documents and also empty dockets. @@ -1092,9 +1150,6 @@ def get_match_all_query( should=[match_all_child_query, match_all_parent_query], minimum_should_match=1, ) - final_match_all_query = apply_custom_score_to_main_query( - cd, final_match_all_query, api_version - ) case SEARCH_TYPES.OPINION: # Only return Opinion clusters. match_all_child_query = build_has_child_query( @@ -1115,12 +1170,9 @@ def get_match_all_query( case _: # No string_query or filters in plain search types like OA and # Parentheticals. Use a match_all query. 
- match_all_query = Q("match_all") - final_match_all_query = apply_custom_score_to_main_query( - cd, match_all_query, api_version - ) + final_match_all_query = Q("match_all") - return search_query.query(final_match_all_query) + return final_match_all_query def build_es_base_query( @@ -1147,10 +1199,13 @@ def build_es_base_query( main_query = None string_query = None - child_docs_query = None + child_query = None parent_query = None filters = [] plain_doc = False + join_queries = None + has_text_query = False + match_all_query = False match cd["type"]: case SEARCH_TYPES.PARENTHETICAL: filters = build_es_plain_filters(cd) @@ -1193,14 +1248,12 @@ def build_es_base_query( ], ) ) - main_query, child_docs_query, parent_query = ( - build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - child_highlighting=child_highlighting, - api_version=api_version, - ) + join_queries = build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + child_highlighting=child_highlighting, + api_version=api_version, ) case ( @@ -1226,15 +1279,13 @@ def build_es_base_query( ], ) ) - main_query, child_docs_query, parent_query = ( - build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - child_highlighting=child_highlighting, - api_version=api_version, - alerts=alerts, - ) + join_queries = build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + child_highlighting=child_highlighting, + api_version=api_version, + alerts=alerts, ) case SEARCH_TYPES.OPINION: @@ -1246,20 +1297,19 @@ def build_es_base_query( mlt_query = async_to_sync(build_more_like_this_query)( cluster_pks ) - main_query, child_docs_query, parent_query = ( - build_full_join_es_queries( - cd, - {"opinion": []}, - [], - mlt_query, - child_highlighting=True, - api_version=api_version, - ) + join_queries = build_full_join_es_queries( + cd, + {"opinion": []}, + [], + mlt_query, + child_highlighting=True, + api_version=api_version, ) return EsMainQueries( - search_query=search_query.query(main_query), - parent_query=parent_query, - child_query=child_docs_query, + search_query=search_query.query(join_queries.main_query), + boost_mode="multiply", + parent_query=join_queries.parent_query, + child_query=join_queries.child_query, ) opinion_search_fields = SEARCH_OPINION_QUERY_FIELDS @@ -1286,41 +1336,48 @@ def build_es_base_query( ], ) ) - main_query, child_docs_query, parent_query = ( - build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - mlt_query, - child_highlighting=child_highlighting, - api_version=api_version, - alerts=alerts, - ) + join_queries = build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + mlt_query, + child_highlighting=child_highlighting, + api_version=api_version, + alerts=alerts, ) + if join_queries is not None: + main_query = join_queries.main_query + parent_query = join_queries.parent_query + child_query = join_queries.child_query + has_text_query = join_queries.has_text_query + if not any([filters, string_query, main_query]): # No filters, string_query or main_query provided by the user, return a # match_all query - match_all_query = get_match_all_query( - cd, search_query, api_version, child_highlighting - ) - return EsMainQueries( - search_query=match_all_query, - parent_query=parent_query, - child_query=child_docs_query, - ) + main_query = get_match_all_query(cd, api_version, child_highlighting) + match_all_query = True - if plain_doc: + boost_mode = "multiply" if has_text_query else "replace" 
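+    # When there is no text query, the base query carries no meaningful
+    # relevance score, so "replace" lets the decay score stand on its own;
+    # with a text query, "multiply" combines it with the decay factor.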
+ if plain_doc and not match_all_query: # Combine the filters and string query for plain documents like Oral # arguments and parentheticals main_query = combine_plain_filters_and_queries( cd, filters, string_query, api_version ) + boost_mode = "multiply" if string_query else "replace" + + # Apply a custom function score to the main query, useful for cursor pagination + # in the V4 API and for date decay relevance. + main_query = apply_custom_score_to_main_query( + cd, main_query, api_version, boost_mode=boost_mode + ) return EsMainQueries( search_query=search_query.query(main_query), + boost_mode=boost_mode, parent_query=parent_query, - child_query=child_docs_query, + child_query=child_query, ) @@ -2076,15 +2133,27 @@ def merge_unavailable_fields_on_parent_document( def clean_count_query(search_query: Search) -> SearchDSL: """Cleans a given ES Search object for a count query. - Modifies the input Search object by removing 'inner_hits' from - any 'has_child' queries within the 'should' clause of the boolean query. + Modifies the input Search object by removing 'function_score' from the main + query if present and/or 'inner_hits' from any 'has_child' queries within + the 'should' clause of the boolean query. It then creates a new Search object with the modified query. :param search_query: The ES Search object. :return: A new ES Search object with the count query. """ - parent_total_query_dict = search_query.to_dict() + parent_total_query_dict = search_query.to_dict(count=True) + try: + # Clean function_score in queries that contain it + parent_total_query_dict = parent_total_query_dict["query"][ + "function_score" + ] + del parent_total_query_dict["boost_mode"] + del parent_total_query_dict["functions"] + except KeyError: + # Omit queries that don't contain it. + pass + try: # Clean the has_child query in queries that contain it. for query in parent_total_query_dict["query"]["bool"]["should"]: @@ -2489,13 +2558,17 @@ def nullify_query_score(query: Query) -> Query: def apply_custom_score_to_main_query( - cd: CleanData, query: Query, api_version: Literal["v3", "v4"] | None = None + cd: CleanData, + query: Query, + api_version: Literal["v3", "v4"] | None = None, + boost_mode: str = "multiply", ) -> Query: """Apply a custom function score to the main query. :param cd: The query CleanedData :param query: The ES Query object to be modified. :param api_version: Optional, the request API version. + :param boost_mode: Optional, the boost mode to apply for the decay relevancy score :return: The function_score query contains the base query, applied when child_order is used. """ @@ -2516,6 +2589,10 @@ def apply_custom_score_to_main_query( else False ) + valid_decay_relevance_types: dict[str, dict[str, str | int | float]] = ( + date_decay_relevance_types + ) + main_order_by = cd.get("order_by", "") if is_valid_custom_score_field and api_version == "v4": # Applies a custom function score to sort Documents based on # a date field. 
This serves as a workaround to enable the use of the @@ -2526,7 +2603,23 @@ def apply_custom_score_to_main_query( default_score=0, default_current_date=cd["request_date"], ) - + elif ( + main_order_by == "score desc" + and cd["type"] in valid_decay_relevance_types + ): + decay_settings = valid_decay_relevance_types[cd["type"]] + date_field = str(decay_settings["field"]) + scale = int(decay_settings["scale"]) + decay = float(decay_settings["decay"]) + min_score = float(decay_settings["min_score"]) + query = build_decay_relevance_score( + query, + date_field, + scale=scale, + decay=decay, + boost_mode=boost_mode, + min_score=min_score, + ) return query @@ -2538,7 +2631,7 @@ def build_full_join_es_queries( child_highlighting: bool = True, api_version: Literal["v3", "v4"] | None = None, alerts: bool = False, -) -> tuple[QueryString | list, QueryString | None, QueryString | None]: +) -> EsJoinQueries: """Build a complete Elasticsearch query with both parent and child document conditions. @@ -2554,6 +2647,7 @@ def build_full_join_es_queries( """ q_should = [] + has_text_query = False match cd["type"]: case ( SEARCH_TYPES.RECAP @@ -2683,6 +2777,7 @@ def build_full_join_es_queries( string_query = build_fulltext_query( parent_query_fields, cd.get("q", ""), only_queries=True ) + has_text_query = True if string_query else False # If child filters are set, add a has_child query as a filter to the # parent query to exclude results without matching children. @@ -2730,17 +2825,22 @@ def build_full_join_es_queries( q_should.append(parent_query) if not q_should: - return [], child_docs_query, parent_query + return EsJoinQueries( + main_query=[], + parent_query=parent_query, + child_query=child_docs_query, + has_text_query=has_text_query, + ) - main_join_query = apply_custom_score_to_main_query( - cd, - Q( + return EsJoinQueries( + main_query=Q( "bool", should=q_should, ), - api_version, + parent_query=parent_query, + child_query=child_docs_query, + has_text_query=has_text_query, ) - return (main_join_query, child_docs_query, parent_query) def limit_inner_hits( @@ -3000,11 +3100,14 @@ def do_es_api_query( # and sorting are set. # Note that in V3 Case Law Search, opinions are collapsed by cluster_id # meaning that only one result per cluster is shown. - s = build_child_docs_query( + child_docs_query = build_child_docs_query( child_docs_query, cd=cd, ) - main_query = search_query.query(s) + main_query = apply_custom_score_to_main_query( + cd, child_docs_query, api_version, boost_mode=es_queries.boost_mode + ) + main_query = search_query.query(main_query) highlight_options, fields_to_exclude = build_highlights_dict( highlighting_fields, hl_tag ) @@ -3047,7 +3150,10 @@ def do_es_api_query( # field exclusion are set. 
s = apply_custom_score_to_main_query( - cd, child_docs_query, api_version + cd, + child_docs_query, + api_version, + boost_mode=es_queries.boost_mode, ) main_query = search_query.query(s) highlight_options, fields_to_exclude = build_highlights_dict( @@ -3427,6 +3533,7 @@ def get_opinions_coverage_over_time( format="yyyy", ), ) + try: response = search_query.execute() except (TransportError, ConnectionError, RequestError): diff --git a/cl/lib/types.py b/cl/lib/types.py index ff257574e9..e4c29c31e6 100644 --- a/cl/lib/types.py +++ b/cl/lib/types.py @@ -195,10 +195,19 @@ def get_db_to_dataclass_map(self): @dataclass class EsMainQueries: search_query: Search + boost_mode: str parent_query: QueryString | None = None child_query: QueryString | None = None +@dataclass +class EsJoinQueries: + main_query: QueryString | list + parent_query: QueryString | None + child_query: QueryString | None + has_text_query: bool + + @dataclass class ApiPositionMapping(BasePositionMapping): position_type_dict: defaultdict[int, list[str]] = field( diff --git a/cl/recap/api_serializers.py b/cl/recap/api_serializers.py index e20c5be0a8..48fc52ef66 100644 --- a/cl/recap/api_serializers.py +++ b/cl/recap/api_serializers.py @@ -95,10 +95,10 @@ def validate(self, attrs): UPLOAD_TYPE.CASE_QUERY_RESULT_PAGE, ]: # These are district or bankruptcy court dockets. Is the court valid? - court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list( - "pk", flat=True + court_ids = ( + Court.federal_courts.district_or_bankruptcy_pacer_courts() ) - if attrs["court"].pk not in court_ids: + if not court_ids.filter(pk=attrs["court"].pk).exists(): raise ValidationError( "%s is not a district or bankruptcy court ID. Did you " "mean to use the upload_type for appellate dockets?" @@ -108,11 +108,9 @@ def validate(self, attrs): if attrs["upload_type"] == UPLOAD_TYPE.CLAIMS_REGISTER: # Only allowed on bankruptcy courts bankruptcy_court_ids = ( - Court.federal_courts.bankruptcy_pacer_courts().values_list( - "pk", flat=True - ) + Court.federal_courts.bankruptcy_pacer_courts() ) - if attrs["court"].pk not in bankruptcy_court_ids: + if not bankruptcy_court_ids.filter(pk=attrs["court"].pk).exists(): raise ValidationError( "%s is not a bankruptcy court ID. Only bankruptcy cases " "should have claims registry pages." % attrs["court"] @@ -127,12 +125,8 @@ def validate(self, attrs): UPLOAD_TYPE.APPELLATE_CASE_QUERY_RESULT_PAGE, ]: # Appellate court dockets. Is the court valid? - appellate_court_ids = ( - Court.federal_courts.appellate_pacer_courts().values_list( - "pk", flat=True - ) - ) - if attrs["court"].pk not in appellate_court_ids: + appellate_court_ids = Court.federal_courts.appellate_pacer_courts() + if not appellate_court_ids.filter(pk=attrs["court"].pk).exists(): raise ValidationError( "%s is not an appellate court ID. Did you mean to use the " "upload_type for district dockets?" % attrs["court"] @@ -203,11 +197,8 @@ def validate(self, attrs): mail = attrs["mail"] receipt = attrs["receipt"] - all_court_ids = Court.federal_courts.all_pacer_courts().values_list( - "pk", flat=True - ) - - if court_id not in all_court_ids: + all_court_ids = Court.federal_courts.all_pacer_courts() + if not all_court_ids.filter(pk=court_id).exists(): raise ValidationError( f"{attrs['court'].pk} is not a PACER court ID." ) @@ -274,10 +265,9 @@ class Meta: def validate(self, attrs): # Is it a good court value? 
- valid_court_ids = Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list( - "pk", flat=True + valid_court_ids = ( + Court.federal_courts.district_or_bankruptcy_pacer_courts() ) - if ( attrs.get("court") or attrs.get("docket") @@ -293,7 +283,7 @@ def validate(self, attrs): if attrs.get("court") else attrs["docket"].court_id ) - if court_id not in valid_court_ids: + if not valid_court_ids.filter(pk=court_id).exists(): raise ValidationError(f"Invalid court id: {court_id}") # Docket validations diff --git a/cl/recap/factories.py b/cl/recap/factories.py index 9b786ed4fd..64f3afb714 100644 --- a/cl/recap/factories.py +++ b/cl/recap/factories.py @@ -93,6 +93,7 @@ class RECAPEmailDocketEntryDataFactory(DictFactory): pacer_doc_id = Faker("random_id_string") pacer_magic_num = Faker("random_id_string") pacer_seq_no = Faker("random_id_string") + short_description = Faker("text", max_nb_chars=15) class RECAPEmailDocketDataFactory(DictFactory): diff --git a/cl/recap/mergers.py b/cl/recap/mergers.py index 0bbef5a5ec..95fd75cc98 100644 --- a/cl/recap/mergers.py +++ b/cl/recap/mergers.py @@ -822,6 +822,35 @@ async def get_or_make_docket_entry( return de, de_created +async def keep_latest_rd_document(queryset: QuerySet) -> RECAPDocument: + """Retains the most recent item with a PDF, if available otherwise, + retains the most recent item overall. + + :param queryset: RECAPDocument QuerySet to clean duplicates from. + :return: The matched RECAPDocument after cleaning. + """ + rd_with_pdf_queryset = queryset.filter(is_available=True).exclude( + filepath_local="" + ) + if await rd_with_pdf_queryset.aexists(): + rd = await rd_with_pdf_queryset.alatest("date_created") + else: + rd = await queryset.alatest("date_created") + await queryset.exclude(pk=rd.pk).adelete() + return rd + + +async def clean_duplicate_documents(params: dict[str, Any]) -> RECAPDocument: + """Removes duplicate RECAPDocuments, keeping the most recent with PDF if + available or otherwise the most recent overall. + + :param params: Query parameters to filter the RECAPDocuments. + :return: The matched RECAPDocument after cleaning. + """ + duplicate_rd_queryset = RECAPDocument.objects.filter(**params) + return await keep_latest_rd_document(duplicate_rd_queryset) + + async def add_docket_entries( d: Docket, docket_entries: list[dict[str, Any]], @@ -934,17 +963,28 @@ async def add_docket_entries( rd = await RECAPDocument.objects.aget(**get_params) rds_updated.append(rd) except RECAPDocument.DoesNotExist: - try: - params["pacer_doc_id"] = docket_entry["pacer_doc_id"] - rd = await RECAPDocument.objects.acreate( - document_number=docket_entry["document_number"] or "", - is_available=False, - **params, - ) - except ValidationError: - # Happens from race conditions. - continue - rds_created.append(rd) + rd = None + if de_created is False and not appelate_court_id_exists: + try: + # Check for documents with a bad pacer_doc_id + rd = await RECAPDocument.objects.aget(**params) + except RECAPDocument.DoesNotExist: + # Fallback to creating document + pass + except RECAPDocument.MultipleObjectsReturned: + rd = await clean_duplicate_documents(params) + if rd is None: + try: + params["pacer_doc_id"] = docket_entry["pacer_doc_id"] + rd = await RECAPDocument.objects.acreate( + document_number=docket_entry["document_number"] or "", + is_available=False, + **params, + ) + rds_created.append(rd) + except ValidationError: + # Happens from race conditions. 
+ continue except RECAPDocument.MultipleObjectsReturned: logger.info( "Multiple recap documents found for document entry number'%s' " @@ -952,17 +992,10 @@ async def add_docket_entries( ) if params["document_type"] == RECAPDocument.ATTACHMENT: continue - duplicate_rd_queryset = RECAPDocument.objects.filter(**params) - rd_with_pdf_queryset = duplicate_rd_queryset.filter( - is_available=True - ).exclude(filepath_local="") - if await rd_with_pdf_queryset.aexists(): - rd = await rd_with_pdf_queryset.alatest("date_created") - else: - rd = await duplicate_rd_queryset.alatest("date_created") - await duplicate_rd_queryset.exclude(pk=rd.pk).adelete() + rd = await clean_duplicate_documents(params) - rd.pacer_doc_id = rd.pacer_doc_id or docket_entry["pacer_doc_id"] + if docket_entry["pacer_doc_id"]: + rd.pacer_doc_id = docket_entry["pacer_doc_id"] description = docket_entry.get("short_description") if rd.document_type == RECAPDocument.PACER_DOCUMENT and description: rd.description = description @@ -1604,14 +1637,7 @@ async def clean_duplicate_attachment_entries( ) async for dupe in dupes.aiterator(): duplicate_rd_queryset = rds.filter(pacer_doc_id=dupe.pacer_doc_id) - rd_with_pdf_queryset = duplicate_rd_queryset.filter( - is_available=True - ).exclude(filepath_local="") - if await rd_with_pdf_queryset.aexists(): - keep_rd = await rd_with_pdf_queryset.alatest("date_created") - else: - keep_rd = await duplicate_rd_queryset.alatest("date_created") - await duplicate_rd_queryset.exclude(pk=keep_rd.pk).adelete() + await keep_latest_rd_document(duplicate_rd_queryset) async def merge_attachment_page_data( @@ -1673,15 +1699,7 @@ async def merge_attachment_page_data( except RECAPDocument.MultipleObjectsReturned as exc: if pacer_case_id: - duplicate_rd_queryset = RECAPDocument.objects.filter(**params) - rd_with_pdf_queryset = duplicate_rd_queryset.filter( - is_available=True - ).exclude(filepath_local="") - if await rd_with_pdf_queryset.aexists(): - keep_rd = await rd_with_pdf_queryset.alatest("date_created") - else: - keep_rd = await duplicate_rd_queryset.alatest("date_created") - await duplicate_rd_queryset.exclude(pk=keep_rd.pk).adelete() + await clean_duplicate_documents(params) main_rd = await RECAPDocument.objects.select_related( "docket_entry", "docket_entry__docket" ).aget(**params) @@ -1711,23 +1729,7 @@ async def merge_attachment_page_data( break except RECAPDocument.MultipleObjectsReturned as exc: if pacer_case_id: - duplicate_rd_queryset = RECAPDocument.objects.filter( - **params - ) - rd_with_pdf_queryset = duplicate_rd_queryset.filter( - is_available=True - ).exclude(filepath_local="") - if await rd_with_pdf_queryset.aexists(): - keep_rd = await rd_with_pdf_queryset.alatest( - "date_created" - ) - else: - keep_rd = await duplicate_rd_queryset.alatest( - "date_created" - ) - await duplicate_rd_queryset.exclude( - pk=keep_rd.pk - ).adelete() + await clean_duplicate_documents(params) main_rd = await RECAPDocument.objects.select_related( "docket_entry", "docket_entry__docket" ).aget(**params) diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 026b1ca2ef..ee674a9f25 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -20,6 +20,7 @@ from django.core.files.base import ContentFile, File from django.core.files.uploadedfile import SimpleUploadedFile from django.db import IntegrityError, transaction +from django.db.models import QuerySet from django.utils.timezone import now from juriscraper.lib.exceptions import PacerLoginException, ParsingException from juriscraper.lib.string_utils import 
CaseNameTweaker, harmonize @@ -114,7 +115,9 @@ async def process_recap_upload(pq: ProcessingQueue) -> None: for pq_pk in sub_docket_att_page_pks: await process_recap_attachment(pq_pk) elif pq.upload_type == UPLOAD_TYPE.PDF: - await process_recap_pdf(pq.pk) + sub_docket_pdf_pks = await find_subdocket_pdf_rds(pq.pk) + for pq_pk in sub_docket_pdf_pks: + await process_recap_pdf(pq_pk) elif pq.upload_type == UPLOAD_TYPE.DOCKET_HISTORY_REPORT: docket = await process_recap_docket_history_report(pq.pk) elif pq.upload_type == UPLOAD_TYPE.APPELLATE_DOCKET: @@ -676,6 +679,30 @@ async def get_att_data_from_pq( return pq, att_data, text +def get_main_rds(court_id: str, pacer_doc_id: str) -> QuerySet: + """ + Return the main RECAPDocument queryset for a given court and pacer_doc_id. + :param court_id: The court ID to query. + :param pacer_doc_id: The pacer document ID. + :return: The main RECAPDocument queryset. + """ + main_rds_qs = ( + RECAPDocument.objects.select_related("docket_entry__docket") + .filter( + pacer_doc_id=pacer_doc_id, + docket_entry__docket__court_id=court_id, + ) + .order_by("docket_entry__docket__pacer_case_id") + .distinct("docket_entry__docket__pacer_case_id") + .only( + "pacer_doc_id", + "docket_entry__docket__pacer_case_id", + "docket_entry__docket__court_id", + ) + ) + return main_rds_qs + + async def find_subdocket_att_page_rds( pk: int, ) -> list[int]: @@ -687,43 +714,100 @@ async def find_subdocket_att_page_rds( """ pq = await ProcessingQueue.objects.aget(pk=pk) - court = await Court.objects.aget(id=pq.court_id) pq, att_data, text = await get_att_data_from_pq(pq) pacer_doc_id = att_data["pacer_doc_id"] - main_rds = ( - RECAPDocument.objects.select_related("docket_entry__docket") - .filter( - pacer_doc_id=pacer_doc_id, - docket_entry__docket__court=court, - ) - .order_by("docket_entry__docket__pacer_case_id") - .distinct("docket_entry__docket__pacer_case_id") - .only( - "pacer_doc_id", - "docket_entry__docket__pacer_case_id", - "docket_entry__docket__court_id", - ) - .exclude(docket_entry__docket__pacer_case_id=pq.pacer_case_id) + main_rds = get_main_rds(pq.court_id, pacer_doc_id).exclude( + docket_entry__docket__pacer_case_id=pq.pacer_case_id ) pqs_to_process_pks = [ pq.pk ] # Add the original pq to the list of pqs to process original_file_content = text.encode("utf-8") original_file_name = pq.filepath_local.name - async for main_rd in main_rds: - main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id - # Create additional pqs for each subdocket case found. - pq_created = await ProcessingQueue.objects.acreate( - uploader_id=pq.uploader_id, - pacer_doc_id=pacer_doc_id, - pacer_case_id=main_pacer_case_id, - court_id=court.pk, - upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, - filepath_local=ContentFile( - original_file_content, name=original_file_name - ), + + @sync_to_async + def save_pq_instances(): + with transaction.atomic(): + for main_rd in main_rds: + main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id + # Create additional pqs for each subdocket case found. 
+ pq_created = ProcessingQueue.objects.create( + uploader_id=pq.uploader_id, + pacer_doc_id=pacer_doc_id, + pacer_case_id=main_pacer_case_id, + court_id=pq.court_id, + upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, + filepath_local=ContentFile( + original_file_content, name=original_file_name + ), + ) + pqs_to_process_pks.append(pq_created.pk) + + await save_pq_instances() + return pqs_to_process_pks + + +async def find_subdocket_pdf_rds( + pk: int, +) -> list[int]: + """Look for RECAP Documents that belong to subdockets, and create a PQ + object for each additional PDF upload that requires processing. + + :param pk: Primary key of the processing queue item. + :return: A list of ProcessingQueue pks to process. + """ + + pq = await ProcessingQueue.objects.aget(pk=pk) + main_rds = get_main_rds(pq.court_id, pq.pacer_doc_id) + pqs_to_process_pks = [ + pq.pk + ] # Add the original pq to the list of pqs to process + + appellate_court_ids = Court.federal_courts.appellate_pacer_courts() + if await appellate_court_ids.filter(pk=pq.court_id).aexists(): + # Abort the process for appellate documents. Subdockets cannot be found + # in appellate cases. + return pqs_to_process_pks + + if pq.pacer_case_id: + # If pq already has a pacer_case_id, exclude it from the queryset. + main_rds = main_rds.exclude( + docket_entry__docket__pacer_case_id=pq.pacer_case_id ) - pqs_to_process_pks.append(pq_created.pk) + + pdf_binary_content = pq.filepath_local.read() + + @sync_to_async + def save_pq_instances(): + with transaction.atomic(): + for i, main_rd in enumerate(main_rds): + if i == 0 and not pq.pacer_case_id: + # If the original PQ does not have a pacer_case_id, + # assign it a pacer_case_id from one of the matched RDs + # to ensure the RD lookup in process_recap_pdf succeeds. + pq.pacer_case_id = ( + main_rd.docket_entry.docket.pacer_case_id + ) + pq.save() + continue + + main_pacer_case_id = main_rd.docket_entry.docket.pacer_case_id + # Create additional pqs for each subdocket case found. 
+ pq_created = ProcessingQueue.objects.create( + uploader_id=pq.uploader_id, + pacer_doc_id=pq.pacer_doc_id, + pacer_case_id=main_pacer_case_id, + document_number=pq.document_number, + attachment_number=pq.attachment_number, + court_id=pq.court_id, + upload_type=UPLOAD_TYPE.PDF, + filepath_local=ContentFile( + pdf_binary_content, name=pq.filepath_local.name + ), + ) + pqs_to_process_pks.append(pq_created.pk) + + await save_pq_instances() return pqs_to_process_pks @@ -747,10 +831,6 @@ async def process_recap_attachment( await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS) logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}") - pq = await ProcessingQueue.objects.aget(pk=pk) - await mark_pq_status(pq, "", PROCESSING_STATUS.IN_PROGRESS) - logger.info(f"Processing RECAP item (debug is: {pq.debug}): {pq}") - pq, att_data, text = await get_att_data_from_pq(pq) if document_number is None: diff --git a/cl/recap/tests.py b/cl/recap/tests.py index bb249b6246..5013ff062c 100644 --- a/cl/recap/tests.py +++ b/cl/recap/tests.py @@ -17,6 +17,7 @@ from django.core.files.base import ContentFile from django.core.files.uploadedfile import SimpleUploadedFile from django.core.management import call_command +from django.db import transaction from django.test import RequestFactory, override_settings from django.urls import reverse from django.utils.timezone import now @@ -182,28 +183,6 @@ def setUpTestData(cls): ], ) - cls.att_data_2 = AppellateAttachmentPageFactory( - attachments=[ - AppellateAttachmentFactory( - pacer_doc_id="04505578698", attachment_number=1 - ), - AppellateAttachmentFactory( - pacer_doc_id="04505578699", attachment_number=2 - ), - ], - pacer_doc_id="04505578697", - pacer_case_id="104491", - document_number="1", - ) - cls.de_data_2 = DocketEntriesDataFactory( - docket_entries=[ - DocketEntryDataFactory( - pacer_doc_id="04505578697", - document_number=1, - ) - ], - ) - def setUp(self) -> None: self.async_client = AsyncAPIClient() self.user = User.objects.get(username="recap") @@ -793,39 +772,166 @@ def test_processing_an_acms_attachment_page(self, mock_upload): main_attachment[0].document_type, RECAPDocument.ATTACHMENT ) - def test_processing_subdocket_case_attachment_page(self, mock_upload): - """Can we replicate an attachment page upload from a subdocket case - to its corresponding RD across all related dockets? + def test_match_recap_document_with_wrong_pacer_doc_id(self, mock_upload): + """Confirm that when an existing RECAPDocument has an invalid + pacer_doc_id, we can still match it after excluding the pacer_doc_id + from the lookup. 
""" - d_1 = DocketFactory( + de_data = DocketEntriesDataFactory( + docket_entries=[ + RECAPEmailDocketEntryDataFactory( + pacer_doc_id="04505578690", + document_number=5, + ) + ], + ) + de = DocketEntryWithParentsFactory( + docket__court=self.court, entry_number=5 + ) + rd = RECAPDocumentFactory( + docket_entry=de, + document_type=RECAPDocument.PACER_DOCUMENT, + pacer_doc_id="04505578691", + document_number="5", + description="", + ) + # Add the docket entry with the updated pacer_doc_id + async_to_sync(add_docket_entries)(de.docket, de_data["docket_entries"]) + recap_documents = RECAPDocument.objects.all() + self.assertEqual( + recap_documents.count(), 1, msg="Wrong number of RECAPDocuments" + ) + rd.refresh_from_db() + self.assertEqual( + rd.description, + de_data["docket_entries"][0]["short_description"], + msg="The short description doesn't match.", + ) + self.assertEqual( + rd.pacer_doc_id, + de_data["docket_entries"][0]["pacer_doc_id"], + msg="The pacer_doc_id doesn't match.", + ) + + def test_match_recap_document_with_wrong_pacer_doc_id_duplicated( + self, mock_upload + ): + """Confirm that when an existing RECAPDocument has an invalid + pacer_doc_id, we can still match it after excluding the pacer_doc_id + from the lookup, even if there is more than one PACER_DOCUMENT that + belongs to the docket entry. + """ + + de_data = DocketEntriesDataFactory( + docket_entries=[ + RECAPEmailDocketEntryDataFactory( + pacer_doc_id="04505578690", + document_number=5, + ) + ], + ) + de = DocketEntryWithParentsFactory( + docket__court=self.court, entry_number=5 + ) + RECAPDocumentFactory( + document_type=RECAPDocument.PACER_DOCUMENT, + docket_entry=de, + pacer_doc_id="04505578691", + document_number="5", + description="", + ) + rd_2 = RECAPDocumentFactory( + document_type=RECAPDocument.PACER_DOCUMENT, + docket_entry=de, + pacer_doc_id="04505578691", + document_number="6", + description="", + is_available=True, + ) + # Add the docket entry with the updated pacer_doc_id, remove the + # duplicated RD, and keep the one that is available. 
+ async_to_sync(add_docket_entries)(de.docket, de_data["docket_entries"]) + recap_documents = RECAPDocument.objects.all() + self.assertEqual( + recap_documents.count(), 1, msg="Wrong number of RECAPDocuments" + ) + rd_2.refresh_from_db() + self.assertEqual( + rd_2.description, + de_data["docket_entries"][0]["short_description"], + msg="The short description doesn't match.", + ) + self.assertEqual( + rd_2.pacer_doc_id, + de_data["docket_entries"][0]["pacer_doc_id"], + msg="The pacer_doc_id doesn't match.", + ) + + +class ReplicateRecapUploadsTest(TestCase): + """Test RECAP uploads are properly replicated to subdockets.""" + + @classmethod + def setUpTestData(cls): + cls.user = User.objects.get(username="recap") + cls.f = SimpleUploadedFile("file.txt", b"file content more content") + cls.court = CourtFactory.create(jurisdiction="FD", in_use=True) + cls.att_data_2 = AppellateAttachmentPageFactory( + attachments=[ + AppellateAttachmentFactory( + pacer_doc_id="04505578698", attachment_number=1 + ), + AppellateAttachmentFactory( + pacer_doc_id="04505578699", attachment_number=2 + ), + ], + pacer_doc_id="04505578697", + pacer_case_id="104491", + document_number="1", + ) + cls.de_data_2 = DocketEntriesDataFactory( + docket_entries=[ + DocketEntryDataFactory( + pacer_doc_id="04505578697", + document_number=1, + ) + ], + ) + + cls.d_1 = DocketFactory( source=Docket.RECAP, docket_number="23-4567", - court=self.court, + court=cls.court, pacer_case_id="104490", ) - d_2 = DocketFactory( + cls.d_2 = DocketFactory( source=Docket.RECAP, docket_number="23-4567", - court=self.court, + court=cls.court, pacer_case_id="104491", ) - d_3 = DocketFactory( + cls.d_3 = DocketFactory( source=Docket.RECAP, docket_number="23-4567", - court=self.court, + court=cls.court, pacer_case_id="104492", ) + def test_processing_subdocket_case_attachment_page(self): + """Can we replicate an attachment page upload from a subdocket case + to its corresponding RD across all related dockets? + """ + # Add the docket entry to every case. async_to_sync(add_docket_entries)( - d_1, self.de_data_2["docket_entries"] + self.d_1, self.de_data_2["docket_entries"] ) async_to_sync(add_docket_entries)( - d_2, self.de_data_2["docket_entries"] + self.d_2, self.de_data_2["docket_entries"] ) async_to_sync(add_docket_entries)( - d_3, self.de_data_2["docket_entries"] + self.d_3, self.de_data_2["docket_entries"] ) # Create an initial PQ. 
@@ -837,18 +943,18 @@ def test_processing_subdocket_case_attachment_page(self, mock_upload): filepath_local=self.f, ) d_1_recap_document = RECAPDocument.objects.filter( - docket_entry__docket=d_1 + docket_entry__docket=self.d_1 ) d_2_recap_document = RECAPDocument.objects.filter( - docket_entry__docket=d_2 + docket_entry__docket=self.d_2 ) d_3_recap_document = RECAPDocument.objects.filter( - docket_entry__docket=d_3 + docket_entry__docket=self.d_3 ) main_d_1_rd = d_1_recap_document[0] main_d_2_rd = d_2_recap_document[0] - main_d_3_rd = d_2_recap_document[0] + main_d_3_rd = d_3_recap_document[0] # After adding 1 docket entry, it should only exist its main RD on # every docket @@ -877,22 +983,22 @@ def test_processing_subdocket_case_attachment_page(self, mock_upload): self.assertEqual( d_1_recap_document.count(), 3, - msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_2.pacer_case_id}.", + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_2.pacer_case_id}.", ) self.assertEqual( d_2_recap_document.count(), 3, - msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_1.pacer_case_id}.", + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_1.pacer_case_id}.", ) self.assertEqual( d_3_recap_document.count(), 3, - msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_3.pacer_case_id}.", + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_3.pacer_case_id}.", ) main_d_1_rd.refresh_from_db() main_d_2_rd.refresh_from_db() - main_d_2_rd.refresh_from_db() + main_d_3_rd.refresh_from_db() self.assertEqual( main_d_1_rd.pacer_doc_id, self.de_data_2["docket_entries"][0]["pacer_doc_id"], @@ -908,29 +1014,32 @@ def test_processing_subdocket_case_attachment_page(self, mock_upload): # Two of them should be attachments. 
d_1_attachments = RECAPDocument.objects.filter( - docket_entry__docket=d_1, document_type=RECAPDocument.ATTACHMENT + docket_entry__docket=self.d_1, + document_type=RECAPDocument.ATTACHMENT, ) d_2_attachments = RECAPDocument.objects.filter( - docket_entry__docket=d_2, document_type=RECAPDocument.ATTACHMENT + docket_entry__docket=self.d_2, + document_type=RECAPDocument.ATTACHMENT, ) d_3_attachments = RECAPDocument.objects.filter( - docket_entry__docket=d_3, document_type=RECAPDocument.ATTACHMENT + docket_entry__docket=self.d_3, + document_type=RECAPDocument.ATTACHMENT, ) self.assertEqual( d_1_attachments.count(), 2, - msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {d_1.pacer_case_id}.", + msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {self.d_1.pacer_case_id}.", ) self.assertEqual( d_2_attachments.count(), 2, - msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {d_2.pacer_case_id}.", + msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {self.d_2.pacer_case_id}.", ) self.assertEqual( d_3_attachments.count(), 2, - msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {d_3.pacer_case_id}.", + msg=f"Didn't get the expected number of RDs Attachments for the docket with PACER case ID {self.d_3.pacer_case_id}.", ) att_1_data = self.att_data_2["attachments"][0] @@ -969,7 +1078,9 @@ def test_processing_subdocket_case_attachment_page(self, mock_upload): self.assertEqual(pqs_status, {PROCESSING_STATUS.SUCCESSFUL}) pqs_related_dockets = {pq.docket_id for pq in pqs_created} - self.assertEqual(pqs_related_dockets, {d_1.pk, d_2.pk, d_3.pk}) + self.assertEqual( + pqs_related_dockets, {self.d_1.pk, self.d_2.pk, self.d_3.pk} + ) # 3 PacerHtmlFiles should have been created, one for each case. att_html_created = PacerHtmlFiles.objects.all() @@ -981,29 +1092,15 @@ def test_processing_subdocket_case_attachment_page(self, mock_upload): {de.pk for de in DocketEntry.objects.all()}, related_htmls_de ) - def test_process_attachments_for_subdocket_pq_with_missing_main_rd( - self, mock_upload - ): + def test_process_attachments_for_subdocket_pq_with_missing_main_rd(self): """Confirm that if the RD related to the initial PQ is missing, we can still process attachments for subdocket cases where the main RD matches. """ - d_1 = DocketFactory( - source=Docket.RECAP, - docket_number="23-4567", - court=self.court, - pacer_case_id="104490", - ) - d_2 = DocketFactory( - source=Docket.RECAP, - docket_number="23-4567", - court=self.court, - pacer_case_id="104491", - ) # Add the docket entry only to d_1. 
async_to_sync(add_docket_entries)( - d_1, self.de_data_2["docket_entries"] + self.d_1, self.de_data_2["docket_entries"] ) # Create an initial PQ related to d_1 @@ -1015,22 +1112,22 @@ def test_process_attachments_for_subdocket_pq_with_missing_main_rd( filepath_local=self.f, ) d_1_recap_document = RECAPDocument.objects.filter( - docket_entry__docket=d_1 + docket_entry__docket=self.d_1 ) d_2_recap_document = RECAPDocument.objects.filter( - docket_entry__docket=d_2 + docket_entry__docket=self.d_2 ) # After adding 1 docket entry d_1 self.assertEqual( d_1_recap_document.count(), 1, - msg=f"Didn't get the initial number of RDs for the docket with PACER case ID {d_1.pacer_case_id}", + msg=f"Didn't get the initial number of RDs for the docket with PACER case ID {self.d_1.pacer_case_id}", ) self.assertEqual( d_2_recap_document.count(), 0, - msg=f"Didn't get the initial number of RDs for the docket with PACER case ID {d_2.pacer_case_id}", + msg=f"Didn't get the initial number of RDs for the docket with PACER case ID {self.d_2.pacer_case_id}", ) with mock.patch( @@ -1044,12 +1141,12 @@ def test_process_attachments_for_subdocket_pq_with_missing_main_rd( self.assertEqual( d_1_recap_document.count(), 3, - msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_2.pacer_case_id}.", + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_2.pacer_case_id}.", ) self.assertEqual( d_2_recap_document.count(), 0, - msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {d_1.pacer_case_id}.", + msg=f"Didn't get the expected number of RDs for the docket with PACER case ID {self.d_1.pacer_case_id}.", ) pq.refresh_from_db() @@ -1068,10 +1165,241 @@ def test_process_attachments_for_subdocket_pq_with_missing_main_rd( self.assertEqual(successful_pq[0].status, PROCESSING_STATUS.SUCCESSFUL) self.assertEqual( successful_pq[0].docket_id, - d_1.pk, + self.d_1.pk, msg="Didn't get the expected docket ID.", ) + @mock.patch("cl.recap.tasks.extract_recap_pdf_base") + def test_processing_subdocket_case_pdf_upload(self, mock_extract): + """Can we duplicate a PDF document upload from a subdocket case to the + corresponding RD across all related dockets? + """ + + # Add the docket entry to every case. + async_to_sync(add_docket_entries)( + self.d_1, self.de_data_2["docket_entries"] + ) + async_to_sync(add_docket_entries)( + self.d_2, self.de_data_2["docket_entries"] + ) + async_to_sync(add_docket_entries)( + self.d_3, self.de_data_2["docket_entries"] + ) + + d_1_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_1 + ) + d_2_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_2 + ) + d_3_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_3 + ) + + main_d_1_rd = d_1_recap_document[0] + main_d_2_rd = d_2_recap_document[0] + main_d_3_rd = d_3_recap_document[0] + + self.assertFalse(main_d_1_rd.is_available) + self.assertFalse(main_d_2_rd.is_available) + self.assertFalse(main_d_3_rd.is_available) + + # Two test cases: pacer_case_id and blank pacer_case_id + pacer_case_ids = ["104491", ""] + for pacer_case_id in pacer_case_ids: + with ( + self.subTest(pacer_case_id=pacer_case_id), + transaction.atomic(), + ): + # Create an initial PQ. + pq = ProcessingQueue.objects.create( + court=self.court, + uploader=self.user, + pacer_case_id=pacer_case_id, + pacer_doc_id="04505578697", + document_number=1, + upload_type=UPLOAD_TYPE.PDF, + filepath_local=self.f, + ) + + # Process the PDF upload. 
+ async_to_sync(process_recap_upload)(pq) + + main_d_1_rd.refresh_from_db() + main_d_2_rd.refresh_from_db() + main_d_3_rd.refresh_from_db() + + self.assertTrue( + main_d_1_rd.is_available, + msg="is_available value doesn't match", + ) + self.assertTrue( + main_d_2_rd.is_available, + msg="is_available value doesn't match", + ) + self.assertTrue( + main_d_3_rd.is_available, + msg="is_available value doesn't match", + ) + + self.assertTrue(main_d_1_rd.filepath_local) + self.assertTrue(main_d_2_rd.filepath_local) + self.assertTrue(main_d_3_rd.filepath_local) + + # Assert the number of PQs created to process the additional subdocket RDs. + pqs_created = ProcessingQueue.objects.all() + self.assertEqual( + pqs_created.count(), + 3, + msg="The number of PQs doesn't match.", + ) + + pqs_status = {pq.status for pq in pqs_created} + self.assertEqual(pqs_status, {PROCESSING_STATUS.SUCCESSFUL}) + + pqs_related_dockets = {pq.docket_id for pq in pqs_created} + self.assertEqual( + pqs_related_dockets, + {self.d_1.pk, self.d_2.pk, self.d_3.pk}, + ) + pqs_related_docket_entries = { + pq.docket_entry_id for pq in pqs_created + } + self.assertEqual( + pqs_related_docket_entries, + { + main_d_1_rd.docket_entry.pk, + main_d_2_rd.docket_entry.pk, + main_d_3_rd.docket_entry.pk, + }, + ) + pqs_related_rds = {pq.recap_document_id for pq in pqs_created} + self.assertEqual( + pqs_related_rds, + {main_d_1_rd.pk, main_d_2_rd.pk, main_d_3_rd.pk}, + ) + + transaction.set_rollback(True) + + @mock.patch("cl.recap.tasks.extract_recap_pdf_base") + def test_processing_subdocket_case_pdf_attachment_upload( + self, mock_extract + ): + """Can we duplicate a PDF attachment document upload from a subdocket + case to the corresponding RD across all related dockets? + """ + + # Add the docket entry to every case. + async_to_sync(add_docket_entries)( + self.d_1, self.de_data_2["docket_entries"] + ) + async_to_sync(add_docket_entries)( + self.d_2, self.de_data_2["docket_entries"] + ) + + pq_att = ProcessingQueue.objects.create( + court=self.court, + uploader=self.user, + pacer_case_id="104491", + upload_type=UPLOAD_TYPE.ATTACHMENT_PAGE, + filepath_local=self.f, + ) + + with mock.patch( + "cl.recap.tasks.get_data_from_att_report", + side_effect=lambda x, y: self.att_data_2, + ): + # Process the attachment page containing 2 attachments. + async_to_sync(process_recap_upload)(pq_att) + + d_1_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_1 + ) + d_2_recap_document = RECAPDocument.objects.filter( + docket_entry__docket=self.d_2 + ) + self.assertEqual(d_1_recap_document.count(), 3) + self.assertEqual(d_2_recap_document.count(), 3) + + att_d_1_rd = d_1_recap_document.filter(attachment_number=2).first() + att_d_2_rd = d_2_recap_document.filter(attachment_number=2).first() + + self.assertFalse(att_d_1_rd.is_available) + self.assertFalse(att_d_2_rd.is_available) + + # Two test cases: pacer_case_id and blank pacer_case_id + pacer_case_ids = ["104491", ""] + for pacer_case_id in pacer_case_ids: + with ( + self.subTest(pacer_case_id=pacer_case_id), + transaction.atomic(), + ): + # Create an initial PQ. + pq = ProcessingQueue.objects.create( + court=self.court, + uploader=self.user, + pacer_case_id=pacer_case_id, + pacer_doc_id="04505578699", + document_number=1, + attachment_number=2, + upload_type=UPLOAD_TYPE.PDF, + filepath_local=self.f, + ) + + # Process the PDF upload. 
+ async_to_sync(process_recap_upload)(pq) + + att_d_1_rd.refresh_from_db() + att_d_2_rd.refresh_from_db() + + self.assertTrue( + att_d_1_rd.is_available, + msg="is_available value doesn't match", + ) + self.assertTrue( + att_d_2_rd.is_available, + msg="is_available value doesn't match", + ) + + self.assertTrue(att_d_1_rd.filepath_local) + self.assertTrue(att_d_2_rd.filepath_local) + + # Assert the number of PQs created to process the additional subdocket RDs. + pqs_created = ProcessingQueue.objects.filter( + upload_type=UPLOAD_TYPE.PDF + ) + self.assertEqual( + pqs_created.count(), + 2, + msg="The number of PQs doesn't match.", + ) + + pqs_status = {pq.status for pq in pqs_created} + self.assertEqual(pqs_status, {PROCESSING_STATUS.SUCCESSFUL}) + + pqs_related_dockets = {pq.docket_id for pq in pqs_created} + self.assertEqual( + pqs_related_dockets, + {self.d_1.pk, self.d_2.pk}, + ) + pqs_related_docket_entries = { + pq.docket_entry_id for pq in pqs_created + } + self.assertEqual( + pqs_related_docket_entries, + { + att_d_1_rd.docket_entry.pk, + att_d_2_rd.docket_entry.pk, + }, + ) + pqs_related_rds = {pq.recap_document_id for pq in pqs_created} + self.assertEqual( + pqs_related_rds, + {att_d_1_rd.pk, att_d_2_rd.pk}, + ) + + transaction.set_rollback(True) + @mock.patch("cl.recap.tasks.DocketReport", new=fakes.FakeDocketReport) @mock.patch( diff --git a/cl/search/api_utils.py b/cl/search/api_utils.py index fd545fc262..23ff86cdfa 100644 --- a/cl/search/api_utils.py +++ b/cl/search/api_utils.py @@ -12,6 +12,7 @@ build_cardinality_count, build_es_main_query, build_sort_results, + clean_count_query, do_collapse_count_query, do_count_query, do_es_api_query, @@ -21,7 +22,6 @@ set_results_highlights, ) from cl.lib.search_utils import store_search_api_query -from cl.lib.utils import map_to_docket_entry_sorting from cl.search.constants import SEARCH_HL_TAG, cardinality_query_unique_ids from cl.search.documents import ( AudioDocument, @@ -64,7 +64,7 @@ def get_object_list(request, cd, paginator): case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: search_query = ESRECAPDocument.search() case _: - search_query = None + raise ElasticBadRequestError("Unsupported search type.") if use_default_query: main_query, _, _ = build_es_main_query(search_query, cd) @@ -260,12 +260,8 @@ def get_paginated_results( self.main_query = self.main_query.sort(default_sorting, unique_sorting) # Cardinality query parameters - query = Q(self.main_query.to_dict(count=True)["query"]) + main_count_query = clean_count_query(self.main_query) unique_field = cardinality_query_unique_ids[self.clean_data["type"]] - search_document = self.cardinality_base_document[ - self.clean_data["type"] - ] - main_count_query = search_document.search().query(query) cardinality_query = build_cardinality_count( main_count_query, unique_field ) @@ -273,10 +269,16 @@ def get_paginated_results( # Build a cardinality query to count child documents. child_cardinality_query = None child_cardinality_count_response = None - if self.child_docs_query: + if ( + self.child_docs_query + and self.clean_data["type"] == SEARCH_TYPES.RECAP + ): child_unique_field = cardinality_query_unique_ids[ SEARCH_TYPES.RECAP_DOCUMENT ] + search_document = self.cardinality_base_document[ + self.clean_data["type"] + ] child_count_query = search_document.search().query( self.child_docs_query ) @@ -292,7 +294,10 @@ def get_paginated_results( ) # If a cardinality query is available for the search_type, add it # to the multi-search query. 
- if child_cardinality_query: + if ( + child_cardinality_query + and self.clean_data["type"] == SEARCH_TYPES.RECAP + ): multi_search = multi_search.add(child_cardinality_query) responses = multi_search.execute() diff --git a/cl/search/api_views.py b/cl/search/api_views.py index 4f33b1d4f6..1761ccdd64 100644 --- a/cl/search/api_views.py +++ b/cl/search/api_views.py @@ -1,6 +1,5 @@ from http import HTTPStatus -import waffle from django.db.models import Prefetch from rest_framework import pagination, permissions, response, viewsets from rest_framework.exceptions import NotFound diff --git a/cl/search/constants.py b/cl/search/constants.py index f9d8b610f3..adb2f697d2 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -275,3 +275,37 @@ SEARCH_TYPES.ORAL_ARGUMENT: "id", SEARCH_TYPES.PARENTHETICAL: "id", } + + +date_decay_relevance_types = { + SEARCH_TYPES.OPINION: { + "field": "dateFiled", + "scale": 50, + "decay": 0.2, + "min_score": 0.1, + }, + SEARCH_TYPES.RECAP: { + "field": "dateFiled", + "scale": 20, + "decay": 0.2, + "min_score": 0.1, + }, + SEARCH_TYPES.DOCKETS: { + "field": "dateFiled", + "scale": 20, + "decay": 0.2, + "min_score": 0.1, + }, + SEARCH_TYPES.RECAP_DOCUMENT: { + "field": "dateFiled", + "scale": 20, + "decay": 0.2, + "min_score": 0.1, + }, + SEARCH_TYPES.ORAL_ARGUMENT: { + "field": "dateArgued", + "scale": 50, + "decay": 0.2, + "min_score": 0.1, + }, +} diff --git a/cl/search/documents.py b/cl/search/documents.py index a83c29f127..fcae4baaf6 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -355,6 +355,11 @@ def prepare_timestamp(self, instance): def prepare_percolator_query(self, instance): qd = QueryDict(instance.query.encode(), mutable=True) + if "order_by" in qd: + # sorting key is not required in percolator queries. Adding it + # generates a custom function score for decay relevance, which breaks + # percolator queries. + del qd["order_by"] search_form = SearchForm(qd) if not search_form.is_valid(): logger.warning( @@ -1988,6 +1993,9 @@ def prepare_percolator_query(self, instance): from cl.alerts.utils import build_plain_percolator_query qd = QueryDict(instance.query.encode(), mutable=True) + # For RECAP percolator queries, we use build_plain_percolator_query to + # build the query. It does not add a custom function_score, so there is + # no need to remove the order_by sorting key as it is ignored. 
search_form = SearchForm(qd) if not search_form.is_valid(): logger.warning( diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index 6b62084f73..358f7c2725 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2269,6 +2269,271 @@ def test_uses_exact_version_for_case_name_field(self) -> None: cluster_2.delete() +class OpinionSearchDecayRelevancyTest( + ESIndexTestCase, V4SearchAPIAssertions, TestCase +): + """ + Opinion Search Decay Relevancy Tests + """ + + @classmethod + def setUpTestData(cls): + # Rebuild the Opinion index + cls.rebuild_index("search.OpinionCluster") + + # Same keywords but different dateFiled + cls.opinion_old = OpinionClusterFactory.create( + case_name="Keyword Match", + case_name_full="", + case_name_short="", + date_filed=datetime.date(1832, 2, 23), + procedural_history="", + source="C", + attorneys="", + slug="opinion-old", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1235", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_old = OpinionFactory.create( + cluster=cls.opinion_old, plain_text="", author_str="" + ) + + cls.opinion_recent = OpinionClusterFactory.create( + case_name="Keyword Match", + case_name_full="", + case_name_short="", + date_filed=datetime.date(2024, 2, 23), + procedural_history="", + source="C", + attorneys="", + slug="opinion-recent", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1236", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_recent = OpinionFactory.create( + cluster=cls.opinion_recent, plain_text="", author_str="" + ) + + # Different relevance with same dateFiled + cls.opinion_high_relevance = OpinionClusterFactory.create( + case_name="Highly Relevant Keywords", + case_name_full="", + case_name_short="", + date_filed=datetime.date(2022, 2, 23), + procedural_history="More Highly Relevant Keywords", + source="C", + attorneys="More Highly Relevant Keywords", + slug="opinion-high-rel", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1237", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_high_relevance = OpinionFactory.create( + cluster=cls.opinion_high_relevance, plain_text="", author_str="" + ) + + cls.opinion_low_relevance = OpinionClusterFactory.create( + case_name="Highly Relevant Keywords", + case_name_full="", + case_name_short="", + date_filed=datetime.date(2022, 2, 23), + procedural_history="", + source="C", + attorneys="", + slug="opinion-low-rel", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1238", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_low_relevance = OpinionFactory.create( + cluster=cls.opinion_low_relevance, plain_text="", author_str="" + ) + + # Different relevance with different dateFiled + cls.opinion_high_relevance_old_date = OpinionClusterFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + date_filed=datetime.date(1900, 2, 23), + procedural_history="More Ipsum Dolor Terms", + source="C", + attorneys="More Ipsum Dolor Terms", + slug="opinion-high-rel-old", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + 
docket_number="1:21-bk-1239", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_high_relevance_old_date = OpinionFactory.create( + cluster=cls.opinion_high_relevance_old_date, + plain_text="", + author_str="", + ) + + cls.opinion_low_relevance_new_date = OpinionClusterFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + date_filed=datetime.date(2024, 12, 23), + procedural_history="", + source="C", + attorneys="", + slug="opinion-low-rel-new", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1241", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_low_relevance_new_date = OpinionFactory.create( + cluster=cls.opinion_low_relevance_new_date, + plain_text="", + author_str="", + ) + + super().setUpTestData() + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.OPINION, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + cls.test_cases = [ + { + "name": "Same keywords, different dateFiled", + "search_params": { + "q": "Keyword Match", + "order_by": "score desc", + "type": SEARCH_TYPES.OPINION, + }, + "expected_order_frontend": [ + cls.opinion_recent.docket.docket_number, # Most recent dateFiled + cls.opinion_old.docket.docket_number, # Oldest dateFiled + ], + "expected_order": [ # API + cls.opinion_recent.pk, + cls.opinion_old.pk, + ], + }, + { + "name": "Different relevancy same dateFiled", + "search_params": { + "q": "Highly Relevant Keywords", + "order_by": "score desc", + "type": SEARCH_TYPES.OPINION, + }, + "expected_order_frontend": [ + cls.opinion_high_relevance.docket.docket_number, # Most relevant by keywords + cls.opinion_low_relevance.docket.docket_number, # Less relevant by keywords + ], + "expected_order": [ # API + cls.opinion_high_relevance.pk, # Most relevant by keywords + cls.opinion_low_relevance.pk, # Less relevant by keywords + ], + }, + { + "name": "Different relevancy and different dateFiled", + "search_params": { + "q": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.OPINION, + }, + "expected_order_frontend": [ + cls.opinion_low_relevance_new_date.docket.docket_number, # Combination of relevance and date rank it first. + cls.opinion_high_relevance_old_date.docket.docket_number, + ], + "expected_order": [ # API + cls.opinion_low_relevance_new_date.pk, + cls.opinion_high_relevance_old_date.pk, + ], + }, + { + "name": "Match all query decay relevancy.", + "search_params": { + "q": "", + "order_by": "score desc", + "type": SEARCH_TYPES.OPINION, + }, + # Order by recency and then by relevancy as per decay scoring logic + "expected_order_frontend": [ + cls.opinion_low_relevance_new_date.docket.docket_number, # 2024-12-23 1:21-bk-1241 + cls.opinion_recent.docket.docket_number, # 2024-02-23 1:21-bk-1236 + cls.opinion_high_relevance.docket.docket_number, # 2022-02-23 1:21-bk-1237 Indexed first, displayed first. + cls.opinion_low_relevance.docket.docket_number, # 2022-02-23 1:21-bk-1238 + cls.opinion_high_relevance_old_date.docket.docket_number, # 1800-02-23 1:21-bk-1239 + cls.opinion_old.docket.docket_number, # 1732-02-23 1:21-bk-1235 + ], + "expected_order": [ # V4 API + cls.opinion_low_relevance_new_date.pk, # 2024-12-23 + cls.opinion_recent.pk, # 2024-02-23 + cls.opinion_low_relevance.pk, # 2022-02-23 Higher PK in V4, API pk is a secondary sorting key. 
+                    cls.opinion_high_relevance.pk,  # 2022-02-23 Lower PK
+                    cls.opinion_high_relevance_old_date.pk,  # 1900-02-23
+                    cls.opinion_old.pk,  # 1832-02-23
+                ],
+                "expected_order_v3": [  # V3 API
+                    cls.opinion_low_relevance_new_date.pk,  # 2024-12-23
+                    cls.opinion_recent.pk,  # 2024-02-23
+                    cls.opinion_high_relevance.pk,  # 2022-02-23 Indexed first, displayed first.
+                    cls.opinion_low_relevance.pk,  # 2022-02-23
+                    cls.opinion_high_relevance_old_date.pk,  # 1900-02-23
+                    cls.opinion_old.pk,  # 1832-02-23
+                ],
+            },
+        ]
+
+    def test_relevancy_decay_scoring_frontend(self) -> None:
+        """Test relevancy decay scoring for Opinion search Frontend"""
+
+        for test in self.test_cases:
+            with self.subTest(test["name"]):
+                r = async_to_sync(self._test_article_count)(
+                    test["search_params"],
+                    len(test["expected_order_frontend"]),
+                    f"Failed count {test['name']}",
+                )
+                self._assert_order_in_html(
+                    r.content.decode(), test["expected_order_frontend"]
+                )
+
+    def test_relevancy_decay_scoring_v4_api(self) -> None:
+        """Test relevancy decay scoring for Opinion search V4 API"""
+
+        for test in self.test_cases:
+            self._test_results_ordering(test, "cluster_id")
+
+    def test_relevancy_decay_scoring_v3_api(self) -> None:
+        """Test relevancy decay scoring for Opinion search V3 API"""
+
+        for test in self.test_cases:
+            self._test_results_ordering(test, "cluster_id", version="v3")
+
+
 @override_flag("ui_flag_for_o", False)
 @override_settings(RELATED_MLT_MINTF=1)
 class RelatedSearchTest(
diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py
index 008f6dc218..1147fa92e7 100644
--- a/cl/search/tests/tests_es_oral_arguments.py
+++ b/cl/search/tests/tests_es_oral_arguments.py
@@ -113,7 +113,7 @@ async def test_oa_results_relevance_ordering(self) -> None:
         )
         self.assertTrue(
             r.content.decode().index("Jose")
-            > r.content.decode().index("Hong Liu"),
-            msg="'Jose' should come AFTER 'Hong Liu' when order_by relevance.",
+            < r.content.decode().index("Hong Liu"),
+            msg="'Jose' should come BEFORE 'Hong Liu' when order_by relevance.",
         )
@@ -273,9 +273,9 @@ async def test_oa_results_relevance_ordering_elastic(self) -> None:
         )
         self.assertTrue(
             r.content.decode().index("Jose")
-            > r.content.decode().index("Hong Liu Lorem")
+            < r.content.decode().index("Hong Liu Lorem")
             < r.content.decode().index("Hong Liu Yang"),
-            msg="'Jose' should come AFTER 'Hong Liu Lorem' and 'Hong Liu Yang' when order_by relevance.",
+            msg="'Jose' should come BEFORE 'Hong Liu Lorem' and 'Hong Liu Yang' when order_by relevance.",
         )
 
     @skip_if_common_tests_skipped
@@ -983,14 +983,14 @@ def confirm_query_matched(response, query_id) -> bool:
     @staticmethod
     def save_percolator_query(cd):
         search_query = AudioDocument.search()
+        # Sorting key is not required in percolator queries.
+        del cd["order_by"]
         es_queries = build_es_base_query(search_query, cd)
-        search_query = es_queries.search_query
-        query_dict = search_query.to_dict()["query"]
         percolator_query = AudioPercolator(
-            percolator_query=query_dict, rate=Alert.REAL_TIME
+            percolator_query=es_queries.search_query.to_dict()["query"],
+            rate=Alert.REAL_TIME,
         )
         percolator_query.save(refresh=True)
-
         return percolator_query.meta.id
 
     @staticmethod
@@ -1052,9 +1052,9 @@ def test_oa_results_relevance_ordering(self) -> None:
         expected = 3
         self.assertEqual(actual, expected)
         self.assertTrue(
-            r.content.decode().index("Jose")
-            > r.content.decode().index("Hong Liu"),
-            msg="'Jose' should come AFTER 'Hong Liu' when order_by relevance.",
+            r.content.decode().index("Jose")  # 2015, 8, 15
+            < r.content.decode().index("Hong Liu"),  # 2015, 8, 14
+            msg="'Jose' should come BEFORE 'Hong Liu' when order_by relevance.",
         )
 
     def test_oa_results_search_match_phrase(self) -> None:
@@ -1642,10 +1646,14 @@ def test_oa_results_relevance_ordering_elastic(self) -> None:
         expected = 3
         self.assertEqual(actual, expected)
         self.assertTrue(
-            r.content.decode().index("Hong Liu Lorem")
-            < r.content.decode().index("Hong Liu Yang")
-            < r.content.decode().index("Jose"),
-            msg="'Hong Liu Lorem' should come BEFORE 'Hong Liu Yang' and 'Jose' when order_by relevance.",
+            r.content.decode().index(
+                "Hong Liu Lorem"
+            )  # 2015, 8, 14 - 9.486339
+            < r.content.decode().index(
+                "Hong Liu Yang"
+            )  # 2015, 8, 14 - 9.034608
+            < r.content.decode().index("Jose"),  # 2015, 8, 15 - 4.7431693
+            msg="'Hong Liu Lorem' should come BEFORE 'Hong Liu Yang', and 'Jose' should come last when order_by relevance.",
         )
 
         # Relevance order, two words match, reverse order.
@@ -1663,10 +1667,10 @@ def test_oa_results_relevance_ordering_elastic(self) -> None:
         expected = 3
         self.assertEqual(actual, expected)
         self.assertTrue(
-            r.content.decode().index("Jose")
-            > r.content.decode().index("Hong Liu Lorem")
-            < r.content.decode().index("Hong Liu Yang"),
-            msg="'Jose' should come AFTER 'Hong Liu Lorem' and 'Hong Liu Yang' when order_by relevance.",
+            r.content.decode().index("Jose")  # 2015, 8, 15
+            < r.content.decode().index("Hong Liu Lorem")  # 2015, 8, 14
+            < r.content.decode().index("Hong Liu Yang"),  # 2015, 8, 14
+            msg="'Jose' should come BEFORE 'Hong Liu Lorem' and 'Hong Liu Yang' when order_by relevance.",
         )
 
         # Relevance order, hyphenated compound word.
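For illustration, a minimal sketch of the order_by stripping that both prepare_percolator_query in cl/search/documents.py and the save_percolator_query test helper above rely on. The raw_query value here is hypothetical; the point is only that removing the sorting key before the search form is built keeps the decay function_score wrapper out of the query that gets indexed into the percolator.

from django.http import QueryDict

# Hypothetical stored alert query, shaped like the ones used in these tests.
raw_query = "q=%22401+Civil%22&type=r&order_by=score+desc"

qd = QueryDict(raw_query, mutable=True)
if "order_by" in qd:
    # With no sorting key present, the query builder has no reason to wrap
    # the query in a decay function_score, so the percolator document stays
    # a plain query that Elasticsearch can index and match against.
    del qd["order_by"]

print(qd.urlencode())  # same query, minus order_by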
@@ -2488,6 +2492,286 @@ def test_uses_exact_version_for_case_name_field(self) -> None: self.assertIn("Howells", r.content.decode()) +class OralArgumentsSearchDecayRelevancyTest( + ESIndexTestCase, V4SearchAPIAssertions, TestCase +): + """Oral Arguments Search Decay Relevancy Tests""" + + @classmethod + def setUpTestData(cls): + + # Same keywords but different dateArgued + with cls.captureOnCommitCallbacks(execute=True): + cls.docket_old = DocketFactory.create( + docket_number="1:21-bk-1235", + date_argued=datetime.date(1832, 2, 23), + ) + cls.audio_old = AudioFactory.create( + case_name="Keyword Match", + case_name_full="", + docket_id=cls.docket_old.pk, + duration=420, + judges="Judge Old", + local_path_original_file="test/audio/audio_old.mp3", + local_path_mp3="test/audio/audio_old.mp3", + source="C", + blocked=False, + sha1="old_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="Transcript for old audio", + ) + + cls.docket_recent = DocketFactory.create( + docket_number="1:21-bk-1236", + date_argued=datetime.date(2024, 2, 23), + ) + cls.audio_recent = AudioFactory.create( + case_name="Keyword Match", + case_name_full="", + docket_id=cls.docket_recent.pk, + duration=420, + judges="Judge Recent", + local_path_original_file="test/audio/audio_recent.mp3", + local_path_mp3="test/audio/audio_recent.mp3", + source="C", + blocked=False, + sha1="recent_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="Transcript for recent audio", + ) + + # Different relevance with same dateArgued + cls.docket_low_relevance = DocketFactory.create( + case_name="Highly Relevant Keywords", + docket_number="1:21-bk-1238", + date_argued=datetime.date(2022, 2, 23), + ) + cls.audio_low_relevance = AudioFactory.create( + case_name="Highly Relevant Keywords", + case_name_full="", + docket_id=cls.docket_low_relevance.pk, + duration=420, + judges="Judge Low", + local_path_original_file="test/audio/audio_low_rel.mp3", + local_path_mp3="test/audio/audio_low_rel.mp3", + source="C", + blocked=False, + sha1="low_rel_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="", + ) + + cls.docket_high_relevance = DocketFactory.create( + case_name="Highly Relevant Keywords", + docket_number="1:21-bk-1237", + date_argued=datetime.date(2022, 2, 23), + ) + cls.audio_high_relevance = AudioFactory.create( + case_name="Highly Relevant Keywords", + case_name_full="", + docket_id=cls.docket_high_relevance.pk, + duration=420, + judges="Judge High", + local_path_original_file="test/audio/audio_high_rel.mp3", + local_path_mp3="test/audio/audio_high_rel.mp3", + source="C", + blocked=False, + sha1="high_rel_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="More Highly Relevant Keywords in the transcript", + ) + + # Different relevance with different dateArgued + cls.docket_high_relevance_old_date = DocketFactory.create( + case_name="Ipsum Dolor Terms", + docket_number="1:21-bk-1239", + date_argued=datetime.date(1900, 2, 23), + ) + cls.audio_high_relevance_old_date = AudioFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + docket_id=cls.docket_high_relevance_old_date.pk, + duration=420, + judges="Judge Old Relevant", + local_path_original_file="test/audio/audio_high_rel_old.mp3", + local_path_mp3="test/audio/audio_high_rel_old.mp3", + source="C", + blocked=False, + sha1="high_rel_old_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="More Ipsum Dolor Terms", + ) + + cls.docket_high_relevance_null_date = DocketFactory.create( + case_name="Ipsum Dolor Terms", + docket_number="1:21-bk-1240", + 
date_argued=None, + ) + cls.audio_high_relevance_null_date = AudioFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + docket_id=cls.docket_high_relevance_null_date.pk, + duration=420, + judges="Judge Null", + local_path_original_file="test/audio/audio_high_rel_null.mp3", + local_path_mp3="test/audio/audio_high_rel_null.mp3", + source="C", + blocked=False, + sha1="high_rel_null_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="More Ipsum Dolor Terms", + ) + + cls.docket_low_relevance_new_date = DocketFactory.create( + case_name="Ipsum Dolor Terms", + docket_number="1:21-bk-1241", + date_argued=datetime.date(2024, 12, 23), + ) + cls.audio_low_relevance_new_date = AudioFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + docket_id=cls.docket_low_relevance_new_date.pk, + duration=420, + judges="Judge New Low", + local_path_original_file="test/audio/audio_low_rel_new.mp3", + local_path_mp3="test/audio/audio_low_rel_new.mp3", + source="C", + blocked=False, + sha1="low_rel_new_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="", + ) + + cls.test_cases = [ + { + "name": "Same keywords different dateArgued", + "search_params": { + "q": "Keyword Match", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_recent.docket_number, # Most recent dateArgued + cls.docket_old.docket_number, # Oldest dateArgued + ], + "expected_order": [ # API + cls.audio_recent.pk, + cls.audio_old.pk, + ], + }, + { + "name": "Different relevancy same dateArgued", + "search_params": { + "q": "Highly Relevant Keywords", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_high_relevance.docket_number, # Most relevant by keywords + cls.docket_low_relevance.docket_number, # Less relevant by keywords + ], + "expected_order": [ + cls.audio_high_relevance.pk, + cls.audio_low_relevance.pk, + ], + }, + { + "name": "Different relevancy different dateArgued", + "search_params": { + "q": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # Combination of relevance and date rank it first. + cls.docket_high_relevance_old_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. + ], + "expected_order": [ # API + cls.audio_low_relevance_new_date.pk, + cls.audio_high_relevance_old_date.pk, + cls.audio_high_relevance_null_date.pk, + ], + }, + { + "name": "Fixed main score for all (0 or 1) (using filters) and different dateArgued", + "search_params": { + "case_name": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # Most recent dateFiled + cls.docket_high_relevance_old_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. 
+ ], + "expected_order": [ # API + cls.audio_low_relevance_new_date.pk, + cls.audio_high_relevance_old_date.pk, + cls.audio_high_relevance_null_date.pk, + ], + }, + { + "name": "Match all query decay relevancy.", + "search_params": { + "q": "", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # 2024-12-23 1:21-bk-1241 + cls.docket_recent.docket_number, # 2024-02-23 1:21-bk-1236 + cls.docket_low_relevance.docket_number, # 2022-02-23 1:21-bk-1238 Indexed first, displayed first. + cls.docket_high_relevance.docket_number, # 2022-02-23 1:21-bk-1237 + cls.docket_high_relevance_old_date.docket_number, # 1800-02-23 1:21-bk-1239 + cls.docket_old.docket_number, # 1732-02-23 1:21-bk-1235 + cls.docket_high_relevance_null_date.docket_number, # Null dateArgued 1:21-bk-1240 + ], + "expected_order": [ # V4 API + cls.audio_low_relevance_new_date.pk, # 2024-12-23 + cls.audio_recent.pk, # 2024-02-23 + cls.audio_high_relevance.pk, # 2022-02-23 Higher PK in V4 API, pk is a secondary sorting key. + cls.audio_low_relevance.pk, # 2022-02-23 + cls.audio_high_relevance_old_date.pk, # 1800-02-23 + cls.audio_old.pk, # 1732-02-23 + cls.audio_high_relevance_null_date.pk, # Null dateArgued + ], + "expected_order_v3": [ # V3 API + cls.audio_low_relevance_new_date.pk, # 2024-12-23 + cls.audio_recent.pk, # 2024-02-23 + cls.audio_low_relevance.pk, # 2022-02-23 Indexed first, displayed first. + cls.audio_high_relevance.pk, # 2022-02-23 + cls.audio_high_relevance_old_date.pk, # 1800-02-23 + cls.audio_old.pk, # 1732-02-23 + cls.audio_high_relevance_null_date.pk, # Null dateArgued + ], + }, + ] + + def test_relevancy_decay_scoring_frontend(self) -> None: + """Test relevancy decay scoring for Oral Arguments search Frontend""" + for test in self.test_cases: + with self.subTest(test["name"]): + r = async_to_sync(self._test_article_count)( + test["search_params"], + len(test["expected_order_frontend"]), + f"Failed count {test['name']}", + ) + self._assert_order_in_html( + r.content.decode(), test["expected_order_frontend"] + ) + + def test_relevancy_decay_scoring_v4_api(self) -> None: + """Test relevancy decay scoring for Oral Arguments search V4 API""" + for test in self.test_cases: + self._test_results_ordering(test, "id", version="v4") + + def test_relevancy_decay_scoring_v3_api(self) -> None: + """Test relevancy decay scoring for Oral Arguments search V3 API""" + for test in self.test_cases: + self._test_results_ordering(test, "id", version="v3") + + class OralArgumentIndexingTest( CountESTasksTestCase, ESIndexTestCase, TransactionTestCase ): diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index 8a6f785de0..91981feb77 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -2854,6 +2854,333 @@ def test_uses_exact_version_for_case_name_field(self) -> None: docket_2.delete() +class RECAPSearchDecayRelevancyTest( + ESIndexTestCase, V4SearchAPIAssertions, TestCase +): + """ + RECAP Search Decay Relevancy Tests + """ + + @classmethod + def setUpTestData(cls): + cls.rebuild_index("search.Docket") + + # Same keywords but different dateFiled + cls.docket_old = DocketFactory( + case_name="Keyword Match", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1235", + source=Docket.RECAP, + date_filed=datetime.date(1832, 2, 23), + ) + cls.rd_old = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_old, + 
entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="019036000435", + ) + + cls.docket_recent = DocketFactory( + case_name="Keyword Match", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1236", + source=Docket.RECAP, + date_filed=datetime.date(2024, 2, 23), + ) + cls.rd_recent = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_recent, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="019036000436", + ) + + # Different relevance with same dateFiled + cls.docket_low_relevance = DocketFactory( + case_name="Highly Relevant Keywords", + case_name_full="", + case_name_short="", + nature_of_suit="", + docket_number="1:21-bk-1238", + source=Docket.RECAP, + date_filed=datetime.date(2022, 2, 23), + ) + cls.rd_low_relevance = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_low_relevance, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="019036000437", + ) + + cls.docket_high_relevance = DocketFactory( + case_name="Highly Relevant Keywords", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1237", + source=Docket.RECAP, + nature_of_suit="More Highly Relevant Keywords", + cause="More Highly Relevant Keywords", + date_filed=datetime.date(2022, 2, 23), + ) + cls.rd_high_relevance = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_high_relevance, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="01903600048", + ) + + # Different relevance with different dateFiled + cls.docket_high_relevance_old_date = DocketFactory( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1239", + source=Docket.RECAP, + nature_of_suit="More Ipsum Dolor Terms", + cause="More Ipsum Dolor Terms", + date_filed=datetime.date(1900, 2, 23), + ) + cls.rd_high_relevance_old_date = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_high_relevance_old_date, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="01903600049", + ) + + cls.docket_high_relevance_null_date = DocketFactory( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1240", + source=Docket.RECAP, + nature_of_suit="More Ipsum Dolor Terms", + cause="More Ipsum Dolor Terms", + date_filed=None, + ) + cls.rd_high_relevance_null_date = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_high_relevance_null_date, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="01903600050", + ) + + cls.docket_low_relevance_new_date = DocketFactory( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + nature_of_suit="", + docket_number="1:21-bk-1241", + source=Docket.RECAP, + date_filed=datetime.date(2024, 12, 23), + ) + cls.rd_low_relevance_new_date = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_low_relevance_new_date, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="01903600051", + ) + + super().setUpTestData() + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + cls.test_cases = [ + { + 
"name": "Same keywords, different dateFiled", + "search_params": { + "q": "Keyword Match", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_recent.docket_number, # Most recent dateFiled + cls.docket_old.docket_number, # Oldest dateFiled + ], + "expected_order": [ # API + cls.docket_recent.pk, + cls.docket_old.pk, + ], + }, + { + "name": "Different relevancy same dateFiled", + "search_params": { + "q": "Highly Relevant Keywords", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_high_relevance.docket_number, + # Most relevant by keywords + cls.docket_low_relevance.docket_number, + # Less relevant by keywords + ], + "expected_order": [ # API + cls.docket_high_relevance.pk, + cls.docket_low_relevance.pk, + ], + }, + { + "name": "Different relevancy different dateFiled", + "search_params": { + "q": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # Combination of relevance and date rank it first. + cls.docket_high_relevance_old_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. + ], + "expected_order": [ # API + cls.docket_low_relevance_new_date.pk, + cls.docket_high_relevance_old_date.pk, + cls.docket_high_relevance_null_date.pk, + ], + }, + { + "name": "Fixed main score for all (0 or 1) (using filters) and different dateFiled", + "search_params": { + "case_name": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # Most recent dateFiled + cls.docket_high_relevance_old_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. + ], + "expected_order": [ # API + cls.docket_low_relevance_new_date.pk, + cls.docket_high_relevance_old_date.pk, + cls.docket_high_relevance_null_date.pk, + ], + }, + { + "name": "Match all query decay relevancy.", + "search_params": { + "q": "", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, + # 2024, 12, 23 1:21-bk-1241 + cls.docket_recent.docket_number, + # 2024, 2, 23 1:21-bk-1236 + cls.docket_low_relevance.docket_number, + # 2022, 2, 23 1:21-bk-1238 Indexed first, displayed first. + cls.docket_high_relevance.docket_number, + # 2022, 2, 23 1:21-bk-1237 + cls.docket_high_relevance_old_date.docket_number, + # 1800, 2, 23 1:21-bk-1239 + cls.docket_old.docket_number, # 1732, 2, 23 1:21-bk-1235 + cls.docket_high_relevance_null_date.docket_number, + # Null dateFiled 1:21-bk-1240 + ], + "expected_order": [ # V4 API + cls.docket_low_relevance_new_date.pk, + # 2024, 12, 23 1:21-bk-1241 + cls.docket_recent.pk, + # 2024, 2, 23 1:21-bk-1236 + cls.docket_high_relevance.pk, + # 2022, 2, 23 1:21-bk-1237 Higher PK in V4, API pk is a secondary sorting key. 
+                    cls.docket_low_relevance.pk,
+                    # 2022, 2, 23 1:21-bk-1238 Lower PK
+                    cls.docket_high_relevance_old_date.pk,
+                    # 1900, 2, 23 1:21-bk-1239
+                    cls.docket_old.pk,  # 1832, 2, 23 1:21-bk-1235
+                    cls.docket_high_relevance_null_date.pk,
+                    # Null 1:21-bk-1240
+                ],
+                "expected_order_v3": [  # V3 API
+                    cls.docket_low_relevance_new_date.pk,
+                    # 2024, 12, 23 1:21-bk-1241
+                    cls.docket_recent.pk,
+                    # 2024, 2, 23 1:21-bk-1236
+                    cls.docket_low_relevance.pk,
+                    # 2022, 2, 23 1:21-bk-1238 Indexed first, displayed first.
+                    cls.docket_high_relevance.pk,
+                    # 2022, 2, 23 1:21-bk-1237
+                    cls.docket_high_relevance_old_date.pk,
+                    # 1900, 2, 23 1:21-bk-1239
+                    cls.docket_old.pk,  # 1832, 2, 23 1:21-bk-1235
+                    cls.docket_high_relevance_null_date.pk,
+                    # Null 1:21-bk-1240
+                ],
+            },
+        ]
+
+    def test_relevancy_decay_scoring_frontend(self) -> None:
+        """Test relevancy decay scoring for RECAP search Frontend"""
+
+        for test in self.test_cases:
+            with self.subTest(test["name"]):
+                r = async_to_sync(self._test_article_count)(
+                    test["search_params"],
+                    len(test["expected_order_frontend"]),
+                    f"Failed count {test['name']}",
+                )
+                self._assert_order_in_html(
+                    r.content.decode(), test["expected_order_frontend"]
+                )
+
+    def test_relevancy_decay_scoring_v4_api(self) -> None:
+        """Test relevancy decay scoring for RECAP search V4 API"""
+
+        search_types = [
+            SEARCH_TYPES.RECAP,
+            SEARCH_TYPES.DOCKETS,
+            SEARCH_TYPES.RECAP_DOCUMENT,
+        ]
+        for search_type in search_types:
+            for test in self.test_cases:
+                test["search_params"]["type"] = search_type
+                self._test_results_ordering(test, "docket_id", version="v4")
+
+    def test_relevancy_decay_scoring_v3_api(self) -> None:
+        """Test relevancy decay scoring for RECAP search V3 API"""
+
+        search_types = [SEARCH_TYPES.RECAP, SEARCH_TYPES.DOCKETS]
+        for search_type in search_types:
+            for test in self.test_cases:
+                test["search_params"]["type"] = search_type
+                self._test_results_ordering(test, "docket_id", version="v3")
+
+
 class RECAPSearchAPICommonTests(RECAPSearchTestCase):
     version_api = "v3"
 
@@ -3389,28 +3716,6 @@ async def test_results_ordering(self) -> None:
         # API
         await self._test_api_results_count(params, 3, "order random")
 
-        # Order by score desc (relevance).
- params = { - "type": SEARCH_TYPES.RECAP, - "q": "SUBPOENAS SERVED", - "order_by": "score desc", - } - # API - r = await self._test_api_results_count(params, 3, "order score desc") - self.assertTrue( - r.content.decode().index("1:21-bk-1234") - < r.content.decode().index("12-1235"), - msg="'1:21-bk-1234' should come BEFORE '12-1235' when order_by score desc.", - ) - - params["type"] = SEARCH_TYPES.DOCKETS - r = await self._test_api_results_count(params, 2, "order") - self.assertTrue( - r.content.decode().index("1:21-bk-1234") - < r.content.decode().index("12-1235"), - msg="'1:21-bk-1234' should come BEFORE '12-1235' when order_by score desc.", - ) - # Order by entry_date_filed desc params = { "type": SEARCH_TYPES.RECAP, @@ -3910,7 +4215,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Query string, order by dateFiled desc", "search_params": search_params, - "expected_results": 5, "expected_order": [ docket_entry_recent.docket.pk, # 2024/02/23 self.de_1.docket.pk, # 2016/08/16 @@ -3922,7 +4226,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Query string, order by dateFiled asc", "search_params": params_date_filed_asc, - "expected_results": 5, "expected_order": [ docket_old.pk, # 1732/2/23 self.de.docket.pk, # 2015/8/16 @@ -3934,7 +4237,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Match all query, order by dateFiled desc", "search_params": params_match_all_date_filed_desc, - "expected_results": 8, "expected_order": [ docket_entry_recent.docket.pk, # 2024/2/23 self.de_1.docket.pk, # 2016/8/16 @@ -3949,7 +4251,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Match all query, order by dateFiled asc", "search_params": params_match_all_date_filed_asc, - "expected_results": 8, "expected_order": [ docket_old.pk, # 1732/2/23 self.de.docket.pk, # 2015/8/16 @@ -3964,7 +4265,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Query string, order by entry_date_filed asc", "search_params": params_entry_date_filed_asc, - "expected_results": 5, "expected_order": [ self.de_1.docket.pk, # 2014/7/19 self.de.docket.pk, # 2015/8/16 @@ -3976,7 +4276,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Match all query, order by entry_date_filed asc", "search_params": params_match_all_entry_date_filed_asc, - "expected_results": 8, "expected_order": [ self.de_1.docket.pk, # 2014/7/19 self.de.docket.pk, # 2015/8/16 diff --git a/cl/simple_pages/static/png/pray-button.png b/cl/simple_pages/static/png/pray-button.png index 76c1f6c7ed..8ef8b9caa2 100644 Binary files a/cl/simple_pages/static/png/pray-button.png and b/cl/simple_pages/static/png/pray-button.png differ diff --git a/cl/simple_pages/static/png/prayer-email.png b/cl/simple_pages/static/png/prayer-email.png index 012b988ca2..088f483836 100644 Binary files a/cl/simple_pages/static/png/prayer-email.png and b/cl/simple_pages/static/png/prayer-email.png differ diff --git a/cl/tests/cases.py b/cl/tests/cases.py index 8b23dea418..cdd93358db 100644 --- a/cl/tests/cases.py +++ b/cl/tests/cases.py @@ -346,24 +346,64 @@ async def _test_api_fields_content( f"Parent field '{field}' does not match.", ) - def _test_results_ordering(self, test, field): + def _test_results_ordering(self, test, field, version="v4"): """Ensure dockets appear in the response in a specific order.""" with self.subTest(test=test, msg=f'{test["name"]}'): r = self.client.get( - reverse("search-list", kwargs={"version": "v4"}), + reverse("search-list", 
kwargs={"version": version}), test["search_params"], ) - self.assertEqual(len(r.data["results"]), test["expected_results"]) + + expected_order_key = "expected_order" + if version == "v3": + expected_order_key = ( + "expected_order_v3" + if "expected_order_v3" in test + else "expected_order" + ) + + self.assertEqual( + len(r.data["results"]), len(test[expected_order_key]) + ) # Note that dockets where the date_field is null are sent to the bottom # of the results actual_order = [result[field] for result in r.data["results"]] self.assertEqual( actual_order, - test["expected_order"], - msg=f'Expected order {test["expected_order"]}, but got {actual_order}', + test[expected_order_key], + msg=f"Expected order {test[expected_order_key]}, but got {actual_order} for " + f"Search type: {test["search_params"]["type"]}", + ) + + def _assert_order_in_html( + self, decoded_content: str, expected_order: list + ) -> None: + """Assert that the expected order of documents appears correctly in the + HTML content.""" + + for i in range(len(expected_order) - 1): + self.assertTrue( + decoded_content.index(str(expected_order[i])) + < decoded_content.index(str(expected_order[i + 1])), + f"Expected {expected_order[i]} to appear before {expected_order[i + 1]} in the HTML content.", ) + async def _test_article_count(self, params, expected_count, field_name): + r = await self.async_client.get("/", params) + tree = html.fromstring(r.content.decode()) + got = len(tree.xpath("//article")) + self.assertEqual( + got, + expected_count, + msg="Did not get the right number of search results in Frontend with %s " + "filter applied.\n" + "Expected: %s\n" + " Got: %s\n\n" + "Params were: %s" % (field_name, expected_count, got, params), + ) + return r + def _test_page_variables( self, response, test_case, current_page, search_type ): diff --git a/poetry.lock b/poetry.lock index 3703770058..be4b8ae1a8 100644 --- a/poetry.lock +++ b/poetry.lock @@ -945,13 +945,13 @@ files = [ [[package]] name = "django" -version = "5.1.2" +version = "5.1.4" description = "A high-level Python web framework that encourages rapid development and clean, pragmatic design." optional = false python-versions = ">=3.10" files = [ - {file = "Django-5.1.2-py3-none-any.whl", hash = "sha256:f11aa87ad8d5617171e3f77e1d5d16f004b79a2cf5d2e1d2b97a6a1f8e9ba5ed"}, - {file = "Django-5.1.2.tar.gz", hash = "sha256:bd7376f90c99f96b643722eee676498706c9fd7dc759f55ebfaf2c08ebcdf4f0"}, + {file = "Django-5.1.4-py3-none-any.whl", hash = "sha256:236e023f021f5ce7dee5779de7b286565fdea5f4ab86bae5338e3f7b69896cf0"}, + {file = "Django-5.1.4.tar.gz", hash = "sha256:de450c09e91879fa5a307f696e57c851955c910a438a35e6b4c895e86bedc82a"}, ] [package.dependencies] @@ -2319,13 +2319,13 @@ setuptools = "*" [[package]] name = "juriscraper" -version = "2.6.48" +version = "2.6.49" description = "An API to scrape American court websites for metadata." 
optional = false python-versions = "*" files = [ - {file = "juriscraper-2.6.48-py27-none-any.whl", hash = "sha256:f2e198cb66a5d3f1423ec4928fc76e1f25c13d0caafc2a6262a7d158c39eab8e"}, - {file = "juriscraper-2.6.48.tar.gz", hash = "sha256:bc138e2c5776f55ef96c10f4a4185d0fec80d83e555e25d1f3fb4b384d399c53"}, + {file = "juriscraper-2.6.49-py27-none-any.whl", hash = "sha256:5954c15747ee5a922d8388db9bb1649100bf8376c39122dc2f4ede2b437b8d0b"}, + {file = "juriscraper-2.6.49.tar.gz", hash = "sha256:28254a22584cfd92a47bb91f9f3bc9139514ffdddfdad1085eacdc70f79fa264"}, ] [package.dependencies] @@ -5692,4 +5692,4 @@ testing = ["coverage[toml]", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = ">=3.13, <3.14" -content-hash = "8703160c5832be62299f5a926fa8670aed8715cb7c03dc7dd7be2d1a5c84fb2a" +content-hash = "49aab347be47355db92d0faabdfdb28120a588fb5694887e3e730cca312a0945" diff --git a/pyproject.toml b/pyproject.toml index ecff70afd5..c24ceec367 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -33,7 +33,7 @@ celery = "^5.4.0" certifi = "^2024.12.14" courts-db = "*" disposable-email-domains = "*" -Django = "^5.1.2" +Django = "^5.1.4" django-cache-memoize = "==0.*" django-cors-headers = "^4.6.0" django-csp = "^3.8"
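For illustration, a small self-contained sketch of the decay multiplier implied by the new date_decay_relevance_types entry for opinions (scale of 50 years, decay of 0.2, min_score of 0.1). It assumes a standard exponential decay rescaled onto [min_score, 1] and a far-past default for missing dates; the constants come from this change, while the helper itself is only an approximation used to sanity-check the expected orderings in the decay-relevancy tests, not the production scoring code.

import datetime
import math

# Parameters from date_decay_relevance_types[SEARCH_TYPES.OPINION].
SCALE_YEARS = 50  # distance (in years) at which the decay factor reaches DECAY
DECAY = 0.2
MIN_SCORE = 0.1  # floor so very old or undated documents keep some weight


def decay_multiplier(doc_date, now):
    """Approximate the multiplier applied to the keyword relevance score,
    assuming an exponential decay rescaled onto [MIN_SCORE, 1]."""
    if doc_date is None:
        doc_date = datetime.date(1600, 1, 1)  # assumed default for missing dates
    age_years = abs((now - doc_date).days) / 365.0
    lam = math.log(DECAY) / SCALE_YEARS
    return MIN_SCORE + (1 - MIN_SCORE) * math.exp(lam * age_years)


today = datetime.date(2025, 1, 1)
print(round(decay_multiplier(datetime.date(2024, 2, 23), today), 2))  # ~0.98
print(round(decay_multiplier(datetime.date(1832, 2, 23), today), 2))  # ~0.10
print(round(decay_multiplier(None, today), 2))                        # ~0.10

With equal keyword relevance, a 2024 filing keeps nearly its full score while an 1832 filing, or one with no date at all, is pushed down toward the min_score floor, which matches the orderings asserted by the decay-relevancy tests above.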