diff --git a/cl/alerts/tests/tests.py b/cl/alerts/tests/tests.py index 57817100a8..6376730f05 100644 --- a/cl/alerts/tests/tests.py +++ b/cl/alerts/tests/tests.py @@ -2392,7 +2392,7 @@ def test_es_alert_update_and_delete(self, mock_abort_audio): user=self.user_profile.user, rate=Alert.REAL_TIME, name="Test Alert OA", - query="type=oa&docket_number=19-1010", + query="type=oa&docket_number=19-1010&order_by=score desc", alert_type=SEARCH_TYPES.ORAL_ARGUMENT, ) @@ -2402,6 +2402,7 @@ def test_es_alert_update_and_delete(self, mock_abort_audio): response_str = str(doc.to_dict()) self.assertIn("'query': '19-1010'", response_str) self.assertIn("'rate': 'rt'", response_str) + self.assertNotIn("function_score", response_str) # Update Alert search_alert_1.query = "type=oa&docket_number=19-1020" diff --git a/cl/alerts/tests/tests_recap_alerts.py b/cl/alerts/tests/tests_recap_alerts.py index 8b99cba47e..1076802912 100644 --- a/cl/alerts/tests/tests_recap_alerts.py +++ b/cl/alerts/tests/tests_recap_alerts.py @@ -2221,13 +2221,19 @@ def test_index_and_delete_recap_alerts_from_percolator( user=self.user_profile.user, rate=Alert.WEEKLY, name="Test Alert Docket Only", - query='q="401 Civil"&type=r', + query='q="401 Civil"&type=r&order_by=score desc', alert_type=SEARCH_TYPES.RECAP, ) self.assertTrue( RECAPPercolator.exists(id=docket_only_alert.pk), msg=f"Alert id: {docket_only_alert.pk} was not indexed.", ) + alert_doc = RECAPPercolator.get(id=docket_only_alert.pk) + response_str = str(alert_doc.to_dict()) + self.assertIn("401 Civil", response_str) + self.assertIn("'rate': 'wly'", response_str) + # function_score breaks percolator queries. Ensure it is never indexed. + self.assertNotIn("function_score", response_str) docket_only_alert_id = docket_only_alert.pk # Remove the alert. 
diff --git a/cl/lib/elasticsearch_utils.py b/cl/lib/elasticsearch_utils.py index 5f2017015a..c39292e9aa 100644 --- a/cl/lib/elasticsearch_utils.py +++ b/cl/lib/elasticsearch_utils.py @@ -37,6 +37,7 @@ ApiPositionMapping, BasePositionMapping, CleanData, + EsJoinQueries, EsMainQueries, ESRangeQueryParams, ) @@ -71,6 +72,7 @@ SEARCH_RECAP_PARENT_QUERY_FIELDS, api_child_highlight_map, cardinality_query_unique_ids, + date_decay_relevance_types, recap_boosts_es, ) from cl.search.exception import ( @@ -938,6 +940,74 @@ def build_custom_function_score_for_date( return query +def build_decay_relevance_score( + query: QueryString | str, + date_field: str, + scale: int, + decay: float, + default_missing_date: str = "1600-01-01T00:00:00Z", + boost_mode: str = "multiply", + min_score: float = 0.0, +) -> QueryString: + """ + Build a decay relevance score query for Elasticsearch that adjusts the + relevance of documents based on a date field. + + :param query: The Elasticsearch query string or QueryString object. + :param date_field: The date field used to compute the relevance decay. + :param scale: The scale (in years) that determines the rate of decay. + :param decay: The decay factor. + :param default_missing_date: The default date to use when the date field + is null. + :param boost_mode: The mode to combine the decay score with the query's + original relevance score. + :param min_score: The minimum score where the decay function stabilizes. + :return: The modified QueryString object with applied function score. + """ + + query = Q( + "function_score", + query=query, + script_score={ + "script": { + "source": f""" + def default_missing_date = Instant.parse(params.default_missing_date).toEpochMilli(); + def decay = (double)params.decay; + def now = new Date().getTime(); + def min_score = (double)params.min_score; + + // Convert scale parameter into milliseconds. 
+ double years = (double)params.scale; + // Convert years to milliseconds 1 year = 365 days + long scaleMillis = (long)(years * 365 * 24 * 60 * 60 * 1000); + + // Retrieve the document date. If missing or null, use default_missing_date + def docDate = default_missing_date; + if (doc['{date_field}'].size() > 0) {{ + docDate = doc['{date_field}'].value.toInstant().toEpochMilli(); + }} + // λ = ln(decay)/scale + def lambda = Math.log(decay) / scaleMillis; + // Absolute distance from now + def diff = Math.abs(docDate - now); + // Score: exp( λ * max(0, |docDate - now|) ) + def decay_score = Math.exp(lambda * diff); + // Adjust the decay score to have a minimum value + return min_score + ((1 - min_score) * decay_score); + """, + "params": { + "default_missing_date": default_missing_date, + "scale": scale, # Years + "decay": decay, + "min_score": min_score, + }, + }, + }, + boost_mode=boost_mode, + ) + return query + + def build_has_child_query( query: QueryString | str, child_type: str, @@ -1021,30 +1091,21 @@ def combine_plain_filters_and_queries( final_query.filter = reduce(operator.iand, filters) if filters and string_query: final_query.minimum_should_match = 1 - - if cd["type"] == SEARCH_TYPES.ORAL_ARGUMENT: - # Apply custom score for dateArgued sorting in the V4 API. - final_query = apply_custom_score_to_main_query( - cd, final_query, api_version - ) return final_query def get_match_all_query( cd: CleanData, - search_query: Search, api_version: Literal["v3", "v4"] | None = None, child_highlighting: bool = True, -) -> Search: +) -> Query: """Build and return a match-all query for each type of document. :param cd: The query CleanedData - :param search_query: Elasticsearch DSL Search object :param api_version: Optional, the request API version. :param child_highlighting: Whether highlighting should be enabled in child docs. - :return: The modified Search object based on the given conditions. + :return: The Match All Query object. 
""" - _, query_hits_limit = get_child_top_hits_limit( cd, cd["type"], api_version=api_version ) @@ -1068,9 +1129,6 @@ def get_match_all_query( final_match_all_query = Q( "bool", should=q_should, minimum_should_match=1 ) - final_match_all_query = apply_custom_score_to_main_query( - cd, final_match_all_query, api_version - ) case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: # Match all query for RECAP and Dockets, it'll return dockets # with child documents and also empty dockets. @@ -1092,9 +1150,6 @@ def get_match_all_query( should=[match_all_child_query, match_all_parent_query], minimum_should_match=1, ) - final_match_all_query = apply_custom_score_to_main_query( - cd, final_match_all_query, api_version - ) case SEARCH_TYPES.OPINION: # Only return Opinion clusters. match_all_child_query = build_has_child_query( @@ -1115,12 +1170,9 @@ def get_match_all_query( case _: # No string_query or filters in plain search types like OA and # Parentheticals. Use a match_all query. - match_all_query = Q("match_all") - final_match_all_query = apply_custom_score_to_main_query( - cd, match_all_query, api_version - ) + final_match_all_query = Q("match_all") - return search_query.query(final_match_all_query) + return final_match_all_query def build_es_base_query( @@ -1147,10 +1199,13 @@ def build_es_base_query( main_query = None string_query = None - child_docs_query = None + child_query = None parent_query = None filters = [] plain_doc = False + join_queries = None + has_text_query = False + match_all_query = False match cd["type"]: case SEARCH_TYPES.PARENTHETICAL: filters = build_es_plain_filters(cd) @@ -1193,14 +1248,12 @@ def build_es_base_query( ], ) ) - main_query, child_docs_query, parent_query = ( - build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - child_highlighting=child_highlighting, - api_version=api_version, - ) + join_queries = build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + 
child_highlighting=child_highlighting, + api_version=api_version, ) case ( @@ -1226,15 +1279,13 @@ def build_es_base_query( ], ) ) - main_query, child_docs_query, parent_query = ( - build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - child_highlighting=child_highlighting, - api_version=api_version, - alerts=alerts, - ) + join_queries = build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + child_highlighting=child_highlighting, + api_version=api_version, + alerts=alerts, ) case SEARCH_TYPES.OPINION: @@ -1246,20 +1297,19 @@ def build_es_base_query( mlt_query = async_to_sync(build_more_like_this_query)( cluster_pks ) - main_query, child_docs_query, parent_query = ( - build_full_join_es_queries( - cd, - {"opinion": []}, - [], - mlt_query, - child_highlighting=True, - api_version=api_version, - ) + join_queries = build_full_join_es_queries( + cd, + {"opinion": []}, + [], + mlt_query, + child_highlighting=True, + api_version=api_version, ) return EsMainQueries( - search_query=search_query.query(main_query), - parent_query=parent_query, - child_query=child_docs_query, + search_query=search_query.query(join_queries.main_query), + boost_mode="multiply", + parent_query=join_queries.parent_query, + child_query=join_queries.child_query, ) opinion_search_fields = SEARCH_OPINION_QUERY_FIELDS @@ -1286,41 +1336,48 @@ def build_es_base_query( ], ) ) - main_query, child_docs_query, parent_query = ( - build_full_join_es_queries( - cd, - child_query_fields, - parent_query_fields, - mlt_query, - child_highlighting=child_highlighting, - api_version=api_version, - alerts=alerts, - ) + join_queries = build_full_join_es_queries( + cd, + child_query_fields, + parent_query_fields, + mlt_query, + child_highlighting=child_highlighting, + api_version=api_version, + alerts=alerts, ) + if join_queries is not None: + main_query = join_queries.main_query + parent_query = join_queries.parent_query + child_query = join_queries.child_query + 
has_text_query = join_queries.has_text_query + if not any([filters, string_query, main_query]): # No filters, string_query or main_query provided by the user, return a # match_all query - match_all_query = get_match_all_query( - cd, search_query, api_version, child_highlighting - ) - return EsMainQueries( - search_query=match_all_query, - parent_query=parent_query, - child_query=child_docs_query, - ) + main_query = get_match_all_query(cd, api_version, child_highlighting) + match_all_query = True - if plain_doc: + boost_mode = "multiply" if has_text_query else "replace" + if plain_doc and not match_all_query: # Combine the filters and string query for plain documents like Oral # arguments and parentheticals main_query = combine_plain_filters_and_queries( cd, filters, string_query, api_version ) + boost_mode = "multiply" if string_query else "replace" + + # Apply a custom function score to the main query, useful for cursor pagination + # in the V4 API and for date decay relevance. + main_query = apply_custom_score_to_main_query( + cd, main_query, api_version, boost_mode=boost_mode + ) return EsMainQueries( search_query=search_query.query(main_query), + boost_mode=boost_mode, parent_query=parent_query, - child_query=child_docs_query, + child_query=child_query, ) @@ -2076,15 +2133,27 @@ def merge_unavailable_fields_on_parent_document( def clean_count_query(search_query: Search) -> SearchDSL: """Cleans a given ES Search object for a count query. - Modifies the input Search object by removing 'inner_hits' from - any 'has_child' queries within the 'should' clause of the boolean query. + Modifies the input Search object by removing 'function_score' from the main + query if present and/or 'inner_hits' from any 'has_child' queries within + the 'should' clause of the boolean query. It then creates a new Search object with the modified query. :param search_query: The ES Search object. :return: A new ES Search object with the count query. 
""" - parent_total_query_dict = search_query.to_dict() + parent_total_query_dict = search_query.to_dict(count=True) + try: + # Clean function_score in queries that contain it + parent_total_query_dict = parent_total_query_dict["query"][ + "function_score" + ] + del parent_total_query_dict["boost_mode"] + del parent_total_query_dict["functions"] + except KeyError: + # Omit queries that don't contain it. + pass + try: # Clean the has_child query in queries that contain it. for query in parent_total_query_dict["query"]["bool"]["should"]: @@ -2489,13 +2558,17 @@ def nullify_query_score(query: Query) -> Query: def apply_custom_score_to_main_query( - cd: CleanData, query: Query, api_version: Literal["v3", "v4"] | None = None + cd: CleanData, + query: Query, + api_version: Literal["v3", "v4"] | None = None, + boost_mode: str = "multiply", ) -> Query: """Apply a custom function score to the main query. :param cd: The query CleanedData :param query: The ES Query object to be modified. :param api_version: Optional, the request API version. + :param boost_mode: Optional, the boost mode to apply for the decay relevancy score :return: The function_score query contains the base query, applied when child_order is used. """ @@ -2516,6 +2589,10 @@ def apply_custom_score_to_main_query( else False ) + valid_decay_relevance_types: dict[str, dict[str, str | int | float]] = ( + date_decay_relevance_types + ) + main_order_by = cd.get("order_by", "") if is_valid_custom_score_field and api_version == "v4": # Applies a custom function score to sort Documents based on # a date field. 
This serves as a workaround to enable the use of the @@ -2526,7 +2603,23 @@ def apply_custom_score_to_main_query( default_score=0, default_current_date=cd["request_date"], ) - + elif ( + main_order_by == "score desc" + and cd["type"] in valid_decay_relevance_types + ): + decay_settings = valid_decay_relevance_types[cd["type"]] + date_field = str(decay_settings["field"]) + scale = int(decay_settings["scale"]) + decay = float(decay_settings["decay"]) + min_score = float(decay_settings["min_score"]) + query = build_decay_relevance_score( + query, + date_field, + scale=scale, + decay=decay, + boost_mode=boost_mode, + min_score=min_score, + ) return query @@ -2538,7 +2631,7 @@ def build_full_join_es_queries( child_highlighting: bool = True, api_version: Literal["v3", "v4"] | None = None, alerts: bool = False, -) -> tuple[QueryString | list, QueryString | None, QueryString | None]: +) -> EsJoinQueries: """Build a complete Elasticsearch query with both parent and child document conditions. @@ -2554,6 +2647,7 @@ def build_full_join_es_queries( """ q_should = [] + has_text_query = False match cd["type"]: case ( SEARCH_TYPES.RECAP @@ -2683,6 +2777,7 @@ def build_full_join_es_queries( string_query = build_fulltext_query( parent_query_fields, cd.get("q", ""), only_queries=True ) + has_text_query = True if string_query else False # If child filters are set, add a has_child query as a filter to the # parent query to exclude results without matching children. 
@@ -2730,17 +2825,22 @@ def build_full_join_es_queries( q_should.append(parent_query) if not q_should: - return [], child_docs_query, parent_query + return EsJoinQueries( + main_query=[], + parent_query=parent_query, + child_query=child_docs_query, + has_text_query=has_text_query, + ) - main_join_query = apply_custom_score_to_main_query( - cd, - Q( + return EsJoinQueries( + main_query=Q( "bool", should=q_should, ), - api_version, + parent_query=parent_query, + child_query=child_docs_query, + has_text_query=has_text_query, ) - return (main_join_query, child_docs_query, parent_query) def limit_inner_hits( @@ -3000,11 +3100,14 @@ def do_es_api_query( # and sorting are set. # Note that in V3 Case Law Search, opinions are collapsed by cluster_id # meaning that only one result per cluster is shown. - s = build_child_docs_query( + child_docs_query = build_child_docs_query( child_docs_query, cd=cd, ) - main_query = search_query.query(s) + main_query = apply_custom_score_to_main_query( + cd, child_docs_query, api_version, boost_mode=es_queries.boost_mode + ) + main_query = search_query.query(main_query) highlight_options, fields_to_exclude = build_highlights_dict( highlighting_fields, hl_tag ) @@ -3047,7 +3150,10 @@ def do_es_api_query( # field exclusion are set. 
s = apply_custom_score_to_main_query( - cd, child_docs_query, api_version + cd, + child_docs_query, + api_version, + boost_mode=es_queries.boost_mode, ) main_query = search_query.query(s) highlight_options, fields_to_exclude = build_highlights_dict( @@ -3427,6 +3533,7 @@ def get_opinions_coverage_over_time( format="yyyy", ), ) + try: response = search_query.execute() except (TransportError, ConnectionError, RequestError): diff --git a/cl/lib/types.py b/cl/lib/types.py index ff257574e9..e4c29c31e6 100644 --- a/cl/lib/types.py +++ b/cl/lib/types.py @@ -195,10 +195,19 @@ def get_db_to_dataclass_map(self): @dataclass class EsMainQueries: search_query: Search + boost_mode: str parent_query: QueryString | None = None child_query: QueryString | None = None +@dataclass +class EsJoinQueries: + main_query: QueryString | list + parent_query: QueryString | None + child_query: QueryString | None + has_text_query: bool + + @dataclass class ApiPositionMapping(BasePositionMapping): position_type_dict: defaultdict[int, list[str]] = field( diff --git a/cl/search/api_utils.py b/cl/search/api_utils.py index fd545fc262..23ff86cdfa 100644 --- a/cl/search/api_utils.py +++ b/cl/search/api_utils.py @@ -12,6 +12,7 @@ build_cardinality_count, build_es_main_query, build_sort_results, + clean_count_query, do_collapse_count_query, do_count_query, do_es_api_query, @@ -21,7 +22,6 @@ set_results_highlights, ) from cl.lib.search_utils import store_search_api_query -from cl.lib.utils import map_to_docket_entry_sorting from cl.search.constants import SEARCH_HL_TAG, cardinality_query_unique_ids from cl.search.documents import ( AudioDocument, @@ -64,7 +64,7 @@ def get_object_list(request, cd, paginator): case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: search_query = ESRECAPDocument.search() case _: - search_query = None + raise ElasticBadRequestError("Unsupported search type.") if use_default_query: main_query, _, _ = build_es_main_query(search_query, cd) @@ -260,12 +260,8 @@ def 
get_paginated_results( self.main_query = self.main_query.sort(default_sorting, unique_sorting) # Cardinality query parameters - query = Q(self.main_query.to_dict(count=True)["query"]) + main_count_query = clean_count_query(self.main_query) unique_field = cardinality_query_unique_ids[self.clean_data["type"]] - search_document = self.cardinality_base_document[ - self.clean_data["type"] - ] - main_count_query = search_document.search().query(query) cardinality_query = build_cardinality_count( main_count_query, unique_field ) @@ -273,10 +269,16 @@ def get_paginated_results( # Build a cardinality query to count child documents. child_cardinality_query = None child_cardinality_count_response = None - if self.child_docs_query: + if ( + self.child_docs_query + and self.clean_data["type"] == SEARCH_TYPES.RECAP + ): child_unique_field = cardinality_query_unique_ids[ SEARCH_TYPES.RECAP_DOCUMENT ] + search_document = self.cardinality_base_document[ + self.clean_data["type"] + ] child_count_query = search_document.search().query( self.child_docs_query ) @@ -292,7 +294,10 @@ def get_paginated_results( ) # If a cardinality query is available for the search_type, add it # to the multi-search query. 
- if child_cardinality_query: + if ( + child_cardinality_query + and self.clean_data["type"] == SEARCH_TYPES.RECAP + ): multi_search = multi_search.add(child_cardinality_query) responses = multi_search.execute() diff --git a/cl/search/api_views.py b/cl/search/api_views.py index 4f33b1d4f6..1761ccdd64 100644 --- a/cl/search/api_views.py +++ b/cl/search/api_views.py @@ -1,6 +1,5 @@ from http import HTTPStatus -import waffle from django.db.models import Prefetch from rest_framework import pagination, permissions, response, viewsets from rest_framework.exceptions import NotFound diff --git a/cl/search/constants.py b/cl/search/constants.py index f9d8b610f3..adb2f697d2 100644 --- a/cl/search/constants.py +++ b/cl/search/constants.py @@ -275,3 +275,37 @@ SEARCH_TYPES.ORAL_ARGUMENT: "id", SEARCH_TYPES.PARENTHETICAL: "id", } + + +date_decay_relevance_types = { + SEARCH_TYPES.OPINION: { + "field": "dateFiled", + "scale": 50, + "decay": 0.2, + "min_score": 0.1, + }, + SEARCH_TYPES.RECAP: { + "field": "dateFiled", + "scale": 20, + "decay": 0.2, + "min_score": 0.1, + }, + SEARCH_TYPES.DOCKETS: { + "field": "dateFiled", + "scale": 20, + "decay": 0.2, + "min_score": 0.1, + }, + SEARCH_TYPES.RECAP_DOCUMENT: { + "field": "dateFiled", + "scale": 20, + "decay": 0.2, + "min_score": 0.1, + }, + SEARCH_TYPES.ORAL_ARGUMENT: { + "field": "dateArgued", + "scale": 50, + "decay": 0.2, + "min_score": 0.1, + }, +} diff --git a/cl/search/documents.py b/cl/search/documents.py index a83c29f127..fcae4baaf6 100644 --- a/cl/search/documents.py +++ b/cl/search/documents.py @@ -355,6 +355,11 @@ def prepare_timestamp(self, instance): def prepare_percolator_query(self, instance): qd = QueryDict(instance.query.encode(), mutable=True) + if "order_by" in qd: + # sorting key is not required in percolator queries. Adding it + # generates a custom function score for decay relevance, which breaks + # percolator queries. 
+ del qd["order_by"] search_form = SearchForm(qd) if not search_form.is_valid(): logger.warning( @@ -1988,6 +1993,9 @@ def prepare_percolator_query(self, instance): from cl.alerts.utils import build_plain_percolator_query qd = QueryDict(instance.query.encode(), mutable=True) + # For RECAP percolator queries, we use build_plain_percolator_query to + # build the query. It does not add a custom function_score, so there is + # no need to remove the order_by sorting key as it is ignored. search_form = SearchForm(qd) if not search_form.is_valid(): logger.warning( diff --git a/cl/search/tests/tests_es_opinion.py b/cl/search/tests/tests_es_opinion.py index 6b62084f73..358f7c2725 100644 --- a/cl/search/tests/tests_es_opinion.py +++ b/cl/search/tests/tests_es_opinion.py @@ -2269,6 +2269,271 @@ def test_uses_exact_version_for_case_name_field(self) -> None: cluster_2.delete() +class OpinionSearchDecayRelevancyTest( + ESIndexTestCase, V4SearchAPIAssertions, TestCase +): + """ + Opinion Search Decay Relevancy Tests + """ + + @classmethod + def setUpTestData(cls): + # Rebuild the Opinion index + cls.rebuild_index("search.OpinionCluster") + + # Same keywords but different dateFiled + cls.opinion_old = OpinionClusterFactory.create( + case_name="Keyword Match", + case_name_full="", + case_name_short="", + date_filed=datetime.date(1832, 2, 23), + procedural_history="", + source="C", + attorneys="", + slug="opinion-old", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1235", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_old = OpinionFactory.create( + cluster=cls.opinion_old, plain_text="", author_str="" + ) + + cls.opinion_recent = OpinionClusterFactory.create( + case_name="Keyword Match", + case_name_full="", + case_name_short="", + date_filed=datetime.date(2024, 2, 23), + procedural_history="", + source="C", + attorneys="", + slug="opinion-recent", + 
precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1236", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_recent = OpinionFactory.create( + cluster=cls.opinion_recent, plain_text="", author_str="" + ) + + # Different relevance with same dateFiled + cls.opinion_high_relevance = OpinionClusterFactory.create( + case_name="Highly Relevant Keywords", + case_name_full="", + case_name_short="", + date_filed=datetime.date(2022, 2, 23), + procedural_history="More Highly Relevant Keywords", + source="C", + attorneys="More Highly Relevant Keywords", + slug="opinion-high-rel", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1237", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_high_relevance = OpinionFactory.create( + cluster=cls.opinion_high_relevance, plain_text="", author_str="" + ) + + cls.opinion_low_relevance = OpinionClusterFactory.create( + case_name="Highly Relevant Keywords", + case_name_full="", + case_name_short="", + date_filed=datetime.date(2022, 2, 23), + procedural_history="", + source="C", + attorneys="", + slug="opinion-low-rel", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1238", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_low_relevance = OpinionFactory.create( + cluster=cls.opinion_low_relevance, plain_text="", author_str="" + ) + + # Different relevance with different dateFiled + cls.opinion_high_relevance_old_date = OpinionClusterFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + date_filed=datetime.date(1900, 2, 23), + procedural_history="More Ipsum Dolor Terms", + source="C", + attorneys="More Ipsum Dolor Terms", + slug="opinion-high-rel-old", + precedential_status="Published", + 
docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1239", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_high_relevance_old_date = OpinionFactory.create( + cluster=cls.opinion_high_relevance_old_date, + plain_text="", + author_str="", + ) + + cls.opinion_low_relevance_new_date = OpinionClusterFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + date_filed=datetime.date(2024, 12, 23), + procedural_history="", + source="C", + attorneys="", + slug="opinion-low-rel-new", + precedential_status="Published", + docket=DocketFactory( + case_name="Base Docket", + docket_number="1:21-bk-1241", + source=Docket.HARVARD, + date_filed=datetime.date(1900, 1, 1), + ), + ) + cls.child_opinion_low_relevance_new_date = OpinionFactory.create( + cluster=cls.opinion_low_relevance_new_date, + plain_text="", + author_str="", + ) + + super().setUpTestData() + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.OPINION, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + cls.test_cases = [ + { + "name": "Same keywords, different dateFiled", + "search_params": { + "q": "Keyword Match", + "order_by": "score desc", + "type": SEARCH_TYPES.OPINION, + }, + "expected_order_frontend": [ + cls.opinion_recent.docket.docket_number, # Most recent dateFiled + cls.opinion_old.docket.docket_number, # Oldest dateFiled + ], + "expected_order": [ # API + cls.opinion_recent.pk, + cls.opinion_old.pk, + ], + }, + { + "name": "Different relevancy same dateFiled", + "search_params": { + "q": "Highly Relevant Keywords", + "order_by": "score desc", + "type": SEARCH_TYPES.OPINION, + }, + "expected_order_frontend": [ + cls.opinion_high_relevance.docket.docket_number, # Most relevant by keywords + cls.opinion_low_relevance.docket.docket_number, # Less relevant by keywords + ], + "expected_order": [ # API + cls.opinion_high_relevance.pk, # Most relevant by keywords + 
cls.opinion_low_relevance.pk, # Less relevant by keywords + ], + }, + { + "name": "Different relevancy and different dateFiled", + "search_params": { + "q": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.OPINION, + }, + "expected_order_frontend": [ + cls.opinion_low_relevance_new_date.docket.docket_number, # Combination of relevance and date rank it first. + cls.opinion_high_relevance_old_date.docket.docket_number, + ], + "expected_order": [ # API + cls.opinion_low_relevance_new_date.pk, + cls.opinion_high_relevance_old_date.pk, + ], + }, + { + "name": "Match all query decay relevancy.", + "search_params": { + "q": "", + "order_by": "score desc", + "type": SEARCH_TYPES.OPINION, + }, + # Order by recency and then by relevancy as per decay scoring logic + "expected_order_frontend": [ + cls.opinion_low_relevance_new_date.docket.docket_number, # 2024-12-23 1:21-bk-1241 + cls.opinion_recent.docket.docket_number, # 2024-02-23 1:21-bk-1236 + cls.opinion_high_relevance.docket.docket_number, # 2022-02-23 1:21-bk-1237 Indexed first, displayed first. + cls.opinion_low_relevance.docket.docket_number, # 2022-02-23 1:21-bk-1238 + cls.opinion_high_relevance_old_date.docket.docket_number, # 1900-02-23 1:21-bk-1239 + cls.opinion_old.docket.docket_number, # 1832-02-23 1:21-bk-1235 + ], + "expected_order": [ # V4 API + cls.opinion_low_relevance_new_date.pk, # 2024-12-23 + cls.opinion_recent.pk, # 2024-02-23 + cls.opinion_low_relevance.pk, # 2022-02-23 Higher PK in V4, API pk is a secondary sorting key. + cls.opinion_high_relevance.pk, # 2022-02-23 Lower PK + cls.opinion_high_relevance_old_date.pk, # 1900-02-23 + cls.opinion_old.pk, # 1832-02-23 + ], + "expected_order_v3": [ # V3 API + cls.opinion_low_relevance_new_date.pk, # 2024-12-23 + cls.opinion_recent.pk, # 2024-02-23 + cls.opinion_high_relevance.pk, # 2022-02-23 Indexed first, displayed first.
+ cls.opinion_low_relevance.pk, # 2022-02-23 + cls.opinion_high_relevance_old_date.pk, # 1900-02-23 + cls.opinion_old.pk, # 1832-02-23 + ], + }, + ] + + def test_relevancy_decay_scoring_frontend(self) -> None: + """Test relevancy decay scoring for Opinion search Frontend""" + + for test in self.test_cases: + with self.subTest(test["name"]): + r = async_to_sync(self._test_article_count)( + test["search_params"], + len(test["expected_order_frontend"]), + f"Failed count {test['name']}", + ) + self._assert_order_in_html( + r.content.decode(), test["expected_order_frontend"] + ) + + def test_relevancy_decay_scoring_v4_api(self) -> None: + """Test relevancy decay scoring for Opinion search V4 API""" + + for test in self.test_cases: + self._test_results_ordering(test, "cluster_id") + + def test_relevancy_decay_scoring_v3_api(self) -> None: + """Test relevancy decay scoring for Opinion search V3 API""" + + for test in self.test_cases: + self._test_results_ordering(test, "cluster_id", version="v3") + + @override_flag("ui_flag_for_o", False) @override_settings(RELATED_MLT_MINTF=1) class RelatedSearchTest( diff --git a/cl/search/tests/tests_es_oral_arguments.py b/cl/search/tests/tests_es_oral_arguments.py index 008f6dc218..1147fa92e7 100644 --- a/cl/search/tests/tests_es_oral_arguments.py +++ b/cl/search/tests/tests_es_oral_arguments.py @@ -113,7 +113,7 @@ async def test_oa_results_relevance_ordering(self) -> None: ) self.assertTrue( r.content.decode().index("Jose") - > r.content.decode().index("Hong Liu"), + < r.content.decode().index("Hong Liu"), msg="'Jose' should come AFTER 'Hong Liu' when order_by relevance.", ) @@ -273,9 +273,9 @@ async def test_oa_results_relevance_ordering_elastic(self) -> None: ) self.assertTrue( r.content.decode().index("Jose") - > r.content.decode().index("Hong Liu Lorem") + < r.content.decode().index("Hong Liu Lorem") < r.content.decode().index("Hong Liu Yang"), - msg="'Jose' should come AFTER 'Hong Liu Lorem' and 'Hong Liu Yang' when order_by
relevance.", + msg="'Jose' should come Before 'Hong Liu Lorem' and 'Hong Liu Yang' when order_by relevance.", ) @skip_if_common_tests_skipped @@ -983,14 +983,14 @@ def confirm_query_matched(response, query_id) -> bool: @staticmethod def save_percolator_query(cd): search_query = AudioDocument.search() + # Sorting key is not required in percolator queries. + del cd["order_by"] es_queries = build_es_base_query(search_query, cd) - search_query = es_queries.search_query - query_dict = search_query.to_dict()["query"] percolator_query = AudioPercolator( - percolator_query=query_dict, rate=Alert.REAL_TIME + percolator_query=es_queries.search_query.to_dict()["query"], + rate=Alert.REAL_TIME, ) percolator_query.save(refresh=True) - return percolator_query.meta.id @staticmethod @@ -1052,9 +1052,9 @@ def test_oa_results_relevance_ordering(self) -> None: expected = 3 self.assertEqual(actual, expected) self.assertTrue( - r.content.decode().index("Jose") - > r.content.decode().index("Hong Liu"), - msg="'Jose' should come AFTER 'Hong Liu' when order_by relevance.", + r.content.decode().index("Jose") # 2015, 8, 15 + < r.content.decode().index("Hong Liu"), # 2015, 8, 14 + msg="'Jose' should come Before 'Hong Liu' when order_by relevance.", ) def test_oa_results_search_match_phrase(self) -> None: @@ -1642,10 +1642,14 @@ def test_oa_results_relevance_ordering_elastic(self) -> None: expected = 3 self.assertEqual(actual, expected) self.assertTrue( - r.content.decode().index("Hong Liu Lorem") - < r.content.decode().index("Hong Liu Yang") - < r.content.decode().index("Jose"), - msg="'Hong Liu Lorem' should come BEFORE 'Hong Liu Yang' and 'Jose' when order_by relevance.", + r.content.decode().index( + "Hong Liu Lorem" + ) # 2015, 8, 14 - 9.486339 + < r.content.decode().index( + "Hong Liu Yang" + ) # 2015, 8, 14 - 9.034608 + < r.content.decode().index("Jose"), # 2015, 8, 15 - 4.7431693 + msg="'Hong Liu Lorem' and 'Hong Liu Yang' should come BEFORE 'Jose' when order_by relevance.", ) # 
Relevance order, two words match, reverse order. @@ -1663,10 +1667,10 @@ def test_oa_results_relevance_ordering_elastic(self) -> None: expected = 3 self.assertEqual(actual, expected) self.assertTrue( - r.content.decode().index("Jose") - > r.content.decode().index("Hong Liu Lorem") - < r.content.decode().index("Hong Liu Yang"), - msg="'Jose' should come AFTER 'Hong Liu Lorem' and 'Hong Liu Yang' when order_by relevance.", + r.content.decode().index("Jose") # 2015, 8, 15 + < r.content.decode().index("Hong Liu Lorem") # 2015, 8, 14 + < r.content.decode().index("Hong Liu Yang"), # 2015, 8, 14 + msg="'Jose' should come Before 'Hong Liu Lorem' and 'Hong Liu Yang' when order_by relevance.", ) # Relevance order, hyphenated compound word. @@ -2488,6 +2492,286 @@ def test_uses_exact_version_for_case_name_field(self) -> None: self.assertIn("Howells", r.content.decode()) +class OralArgumentsSearchDecayRelevancyTest( + ESIndexTestCase, V4SearchAPIAssertions, TestCase +): + """Oral Arguments Search Decay Relevancy Tests""" + + @classmethod + def setUpTestData(cls): + + # Same keywords but different dateArgued + with cls.captureOnCommitCallbacks(execute=True): + cls.docket_old = DocketFactory.create( + docket_number="1:21-bk-1235", + date_argued=datetime.date(1832, 2, 23), + ) + cls.audio_old = AudioFactory.create( + case_name="Keyword Match", + case_name_full="", + docket_id=cls.docket_old.pk, + duration=420, + judges="Judge Old", + local_path_original_file="test/audio/audio_old.mp3", + local_path_mp3="test/audio/audio_old.mp3", + source="C", + blocked=False, + sha1="old_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="Transcript for old audio", + ) + + cls.docket_recent = DocketFactory.create( + docket_number="1:21-bk-1236", + date_argued=datetime.date(2024, 2, 23), + ) + cls.audio_recent = AudioFactory.create( + case_name="Keyword Match", + case_name_full="", + docket_id=cls.docket_recent.pk, + duration=420, + judges="Judge Recent", + 
local_path_original_file="test/audio/audio_recent.mp3", + local_path_mp3="test/audio/audio_recent.mp3", + source="C", + blocked=False, + sha1="recent_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="Transcript for recent audio", + ) + + # Different relevance with same dateArgued + cls.docket_low_relevance = DocketFactory.create( + case_name="Highly Relevant Keywords", + docket_number="1:21-bk-1238", + date_argued=datetime.date(2022, 2, 23), + ) + cls.audio_low_relevance = AudioFactory.create( + case_name="Highly Relevant Keywords", + case_name_full="", + docket_id=cls.docket_low_relevance.pk, + duration=420, + judges="Judge Low", + local_path_original_file="test/audio/audio_low_rel.mp3", + local_path_mp3="test/audio/audio_low_rel.mp3", + source="C", + blocked=False, + sha1="low_rel_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="", + ) + + cls.docket_high_relevance = DocketFactory.create( + case_name="Highly Relevant Keywords", + docket_number="1:21-bk-1237", + date_argued=datetime.date(2022, 2, 23), + ) + cls.audio_high_relevance = AudioFactory.create( + case_name="Highly Relevant Keywords", + case_name_full="", + docket_id=cls.docket_high_relevance.pk, + duration=420, + judges="Judge High", + local_path_original_file="test/audio/audio_high_rel.mp3", + local_path_mp3="test/audio/audio_high_rel.mp3", + source="C", + blocked=False, + sha1="high_rel_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="More Highly Relevant Keywords in the transcript", + ) + + # Different relevance with different dateArgued + cls.docket_high_relevance_old_date = DocketFactory.create( + case_name="Ipsum Dolor Terms", + docket_number="1:21-bk-1239", + date_argued=datetime.date(1900, 2, 23), + ) + cls.audio_high_relevance_old_date = AudioFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + docket_id=cls.docket_high_relevance_old_date.pk, + duration=420, + judges="Judge Old Relevant", + local_path_original_file="test/audio/audio_high_rel_old.mp3", 
+ local_path_mp3="test/audio/audio_high_rel_old.mp3", + source="C", + blocked=False, + sha1="high_rel_old_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="More Ipsum Dolor Terms", + ) + + cls.docket_high_relevance_null_date = DocketFactory.create( + case_name="Ipsum Dolor Terms", + docket_number="1:21-bk-1240", + date_argued=None, + ) + cls.audio_high_relevance_null_date = AudioFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + docket_id=cls.docket_high_relevance_null_date.pk, + duration=420, + judges="Judge Null", + local_path_original_file="test/audio/audio_high_rel_null.mp3", + local_path_mp3="test/audio/audio_high_rel_null.mp3", + source="C", + blocked=False, + sha1="high_rel_null_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="More Ipsum Dolor Terms", + ) + + cls.docket_low_relevance_new_date = DocketFactory.create( + case_name="Ipsum Dolor Terms", + docket_number="1:21-bk-1241", + date_argued=datetime.date(2024, 12, 23), + ) + cls.audio_low_relevance_new_date = AudioFactory.create( + case_name="Ipsum Dolor Terms", + case_name_full="", + docket_id=cls.docket_low_relevance_new_date.pk, + duration=420, + judges="Judge New Low", + local_path_original_file="test/audio/audio_low_rel_new.mp3", + local_path_mp3="test/audio/audio_low_rel_new.mp3", + source="C", + blocked=False, + sha1="low_rel_new_sha1", + stt_status=Audio.STT_COMPLETE, + stt_transcript="", + ) + + cls.test_cases = [ + { + "name": "Same keywords different dateArgued", + "search_params": { + "q": "Keyword Match", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_recent.docket_number, # Most recent dateArgued + cls.docket_old.docket_number, # Oldest dateArgued + ], + "expected_order": [ # API + cls.audio_recent.pk, + cls.audio_old.pk, + ], + }, + { + "name": "Different relevancy same dateArgued", + "search_params": { + "q": "Highly Relevant Keywords", + "order_by": "score desc", + "type": 
SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_high_relevance.docket_number, # Most relevant by keywords + cls.docket_low_relevance.docket_number, # Less relevant by keywords + ], + "expected_order": [ + cls.audio_high_relevance.pk, + cls.audio_low_relevance.pk, + ], + }, + { + "name": "Different relevancy different dateArgued", + "search_params": { + "q": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # Combination of relevance and date rank it first. + cls.docket_high_relevance_old_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. + ], + "expected_order": [ # API + cls.audio_low_relevance_new_date.pk, + cls.audio_high_relevance_old_date.pk, + cls.audio_high_relevance_null_date.pk, + ], + }, + { + "name": "Fixed main score for all (0 or 1) (using filters) and different dateArgued", + "search_params": { + "case_name": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # Most recent dateFiled + cls.docket_high_relevance_old_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. + ], + "expected_order": [ # API + cls.audio_low_relevance_new_date.pk, + cls.audio_high_relevance_old_date.pk, + cls.audio_high_relevance_null_date.pk, + ], + }, + { + "name": "Match all query decay relevancy.", + "search_params": { + "q": "", + "order_by": "score desc", + "type": SEARCH_TYPES.ORAL_ARGUMENT, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # 2024-12-23 1:21-bk-1241 + cls.docket_recent.docket_number, # 2024-02-23 1:21-bk-1236 + cls.docket_low_relevance.docket_number, # 2022-02-23 1:21-bk-1238 Indexed first, displayed first. 
+ cls.docket_high_relevance.docket_number, # 2022-02-23 1:21-bk-1237 + cls.docket_high_relevance_old_date.docket_number, # 1800-02-23 1:21-bk-1239 + cls.docket_old.docket_number, # 1732-02-23 1:21-bk-1235 + cls.docket_high_relevance_null_date.docket_number, # Null dateArgued 1:21-bk-1240 + ], + "expected_order": [ # V4 API + cls.audio_low_relevance_new_date.pk, # 2024-12-23 + cls.audio_recent.pk, # 2024-02-23 + cls.audio_high_relevance.pk, # 2022-02-23 Higher PK in V4 API, pk is a secondary sorting key. + cls.audio_low_relevance.pk, # 2022-02-23 + cls.audio_high_relevance_old_date.pk, # 1800-02-23 + cls.audio_old.pk, # 1732-02-23 + cls.audio_high_relevance_null_date.pk, # Null dateArgued + ], + "expected_order_v3": [ # V3 API + cls.audio_low_relevance_new_date.pk, # 2024-12-23 + cls.audio_recent.pk, # 2024-02-23 + cls.audio_low_relevance.pk, # 2022-02-23 Indexed first, displayed first. + cls.audio_high_relevance.pk, # 2022-02-23 + cls.audio_high_relevance_old_date.pk, # 1800-02-23 + cls.audio_old.pk, # 1732-02-23 + cls.audio_high_relevance_null_date.pk, # Null dateArgued + ], + }, + ] + + def test_relevancy_decay_scoring_frontend(self) -> None: + """Test relevancy decay scoring for Oral Arguments search Frontend""" + for test in self.test_cases: + with self.subTest(test["name"]): + r = async_to_sync(self._test_article_count)( + test["search_params"], + len(test["expected_order_frontend"]), + f"Failed count {test['name']}", + ) + self._assert_order_in_html( + r.content.decode(), test["expected_order_frontend"] + ) + + def test_relevancy_decay_scoring_v4_api(self) -> None: + """Test relevancy decay scoring for Oral Arguments search V4 API""" + for test in self.test_cases: + self._test_results_ordering(test, "id", version="v4") + + def test_relevancy_decay_scoring_v3_api(self) -> None: + """Test relevancy decay scoring for Oral Arguments search V3 API""" + for test in self.test_cases: + self._test_results_ordering(test, "id", version="v3") + + class 
OralArgumentIndexingTest( CountESTasksTestCase, ESIndexTestCase, TransactionTestCase ): diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index 8a6f785de0..91981feb77 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -2854,6 +2854,333 @@ def test_uses_exact_version_for_case_name_field(self) -> None: docket_2.delete() +class RECAPSearchDecayRelevancyTest( + ESIndexTestCase, V4SearchAPIAssertions, TestCase +): + """ + RECAP Search Decay Relevancy Tests + """ + + @classmethod + def setUpTestData(cls): + cls.rebuild_index("search.Docket") + + # Same keywords but different dateFiled + cls.docket_old = DocketFactory( + case_name="Keyword Match", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1235", + source=Docket.RECAP, + date_filed=datetime.date(1832, 2, 23), + ) + cls.rd_old = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_old, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="019036000435", + ) + + cls.docket_recent = DocketFactory( + case_name="Keyword Match", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1236", + source=Docket.RECAP, + date_filed=datetime.date(2024, 2, 23), + ) + cls.rd_recent = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_recent, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="019036000436", + ) + + # Different relevance with same dateFiled + cls.docket_low_relevance = DocketFactory( + case_name="Highly Relevant Keywords", + case_name_full="", + case_name_short="", + nature_of_suit="", + docket_number="1:21-bk-1238", + source=Docket.RECAP, + date_filed=datetime.date(2022, 2, 23), + ) + cls.rd_low_relevance = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_low_relevance, + entry_number=1, + description="", + ), + 
description="", + is_available=False, + pacer_doc_id="019036000437", + ) + + cls.docket_high_relevance = DocketFactory( + case_name="Highly Relevant Keywords", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1237", + source=Docket.RECAP, + nature_of_suit="More Highly Relevant Keywords", + cause="More Highly Relevant Keywords", + date_filed=datetime.date(2022, 2, 23), + ) + cls.rd_high_relevance = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_high_relevance, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="01903600048", + ) + + # Different relevance with different dateFiled + cls.docket_high_relevance_old_date = DocketFactory( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1239", + source=Docket.RECAP, + nature_of_suit="More Ipsum Dolor Terms", + cause="More Ipsum Dolor Terms", + date_filed=datetime.date(1900, 2, 23), + ) + cls.rd_high_relevance_old_date = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_high_relevance_old_date, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="01903600049", + ) + + cls.docket_high_relevance_null_date = DocketFactory( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + docket_number="1:21-bk-1240", + source=Docket.RECAP, + nature_of_suit="More Ipsum Dolor Terms", + cause="More Ipsum Dolor Terms", + date_filed=None, + ) + cls.rd_high_relevance_null_date = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_high_relevance_null_date, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="01903600050", + ) + + cls.docket_low_relevance_new_date = DocketFactory( + case_name="Ipsum Dolor Terms", + case_name_full="", + case_name_short="", + nature_of_suit="", + docket_number="1:21-bk-1241", + 
source=Docket.RECAP, + date_filed=datetime.date(2024, 12, 23), + ) + cls.rd_low_relevance_new_date = RECAPDocumentFactory( + docket_entry=DocketEntryWithParentsFactory( + docket=cls.docket_low_relevance_new_date, + entry_number=1, + description="", + ), + description="", + is_available=False, + pacer_doc_id="01903600051", + ) + + super().setUpTestData() + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + cls.test_cases = [ + { + "name": "Same keywords, different dateFiled", + "search_params": { + "q": "Keyword Match", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_recent.docket_number, # Most recent dateFiled + cls.docket_old.docket_number, # Oldest dateFiled + ], + "expected_order": [ # API + cls.docket_recent.pk, + cls.docket_old.pk, + ], + }, + { + "name": "Different relevancy same dateFiled", + "search_params": { + "q": "Highly Relevant Keywords", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_high_relevance.docket_number, + # Most relevant by keywords + cls.docket_low_relevance.docket_number, + # Less relevant by keywords + ], + "expected_order": [ # API + cls.docket_high_relevance.pk, + cls.docket_low_relevance.pk, + ], + }, + { + "name": "Different relevancy different dateFiled", + "search_params": { + "q": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # Combination of relevance and date rank it first. + cls.docket_high_relevance_old_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. 
+ ], + "expected_order": [ # API + cls.docket_low_relevance_new_date.pk, + cls.docket_high_relevance_old_date.pk, + cls.docket_high_relevance_null_date.pk, + ], + }, + { + "name": "Fixed main score for all (0 or 1) (using filters) and different dateFiled", + "search_params": { + "case_name": "Ipsum Dolor Terms", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, # Most recent dateFiled + cls.docket_high_relevance_old_date.docket_number, + cls.docket_high_relevance_null_date.docket_number, # docs with a null dateFiled are ranked lower. + ], + "expected_order": [ # API + cls.docket_low_relevance_new_date.pk, + cls.docket_high_relevance_old_date.pk, + cls.docket_high_relevance_null_date.pk, + ], + }, + { + "name": "Match all query decay relevancy.", + "search_params": { + "q": "", + "order_by": "score desc", + "type": SEARCH_TYPES.RECAP, + }, + "expected_order_frontend": [ + cls.docket_low_relevance_new_date.docket_number, + # 2024, 12, 23 1:21-bk-1241 + cls.docket_recent.docket_number, + # 2024, 2, 23 1:21-bk-1236 + cls.docket_low_relevance.docket_number, + # 2022, 2, 23 1:21-bk-1238 Indexed first, displayed first. + cls.docket_high_relevance.docket_number, + # 2022, 2, 23 1:21-bk-1237 + cls.docket_high_relevance_old_date.docket_number, + # 1800, 2, 23 1:21-bk-1239 + cls.docket_old.docket_number, # 1732, 2, 23 1:21-bk-1235 + cls.docket_high_relevance_null_date.docket_number, + # Null dateFiled 1:21-bk-1240 + ], + "expected_order": [ # V4 API + cls.docket_low_relevance_new_date.pk, + # 2024, 12, 23 1:21-bk-1241 + cls.docket_recent.pk, + # 2024, 2, 23 1:21-bk-1236 + cls.docket_high_relevance.pk, + # 2022, 2, 23 1:21-bk-1237 Higher PK in V4, API pk is a secondary sorting key. 
+ cls.docket_low_relevance.pk, + # 2022, 2, 23 1:21-bk-1238 Lower PK + cls.docket_high_relevance_old_date.pk, + # 1800, 2, 23 1:21-bk-1239 + cls.docket_old.pk, # 1732, 2, 23 1:21-bk-1235 + cls.docket_high_relevance_null_date.pk, + # Null 1:21-bk-1240 + ], + "expected_order_v3": [ # V3 API + cls.docket_low_relevance_new_date.pk, + # 2024, 12, 23 1:21-bk-1241 + cls.docket_recent.pk, + # 2024, 2, 23 1:21-bk-1236 + cls.docket_low_relevance.pk, + # 2022, 2, 23 1:21-bk-1238 Indexed first, displayed first. + cls.docket_high_relevance.pk, + # 2022, 2, 23 1:21-bk-1237 + cls.docket_high_relevance_old_date.pk, + # 1800, 2, 23 1:21-bk-1239 + cls.docket_old.pk, # 1732, 2, 23 1:21-bk-1235 + cls.docket_high_relevance_null_date.pk, + # Null 1:21-bk-1240 + ], + }, + ] + + def test_relevancy_decay_scoring_frontend(self) -> None: + """Test relevancy decay scoring for RECAP search Frontend""" + + for test in self.test_cases: + with self.subTest(test["name"]): + r = async_to_sync(self._test_article_count)( + test["search_params"], + len(test["expected_order_frontend"]), + f"Failed count {test["name"]}", + ) + self._assert_order_in_html( + r.content.decode(), test["expected_order_frontend"] + ) + + def test_relevancy_decay_scoring_v4_api(self) -> None: + """Test relevancy decay scoring for RECAP search V4 API""" + + search_types = [ + SEARCH_TYPES.RECAP, + SEARCH_TYPES.DOCKETS, + SEARCH_TYPES.RECAP_DOCUMENT, + ] + for search_type in search_types: + for test in self.test_cases: + test["search_params"]["type"] = search_type + self._test_results_ordering(test, "docket_id", version="v4") + + def test_relevancy_decay_scoring_v3_api(self) -> None: + """Test relevancy decay scoring for RECAP search V4 API""" + + search_types = [SEARCH_TYPES.RECAP, SEARCH_TYPES.DOCKETS] + for search_type in search_types: + for test in self.test_cases: + test["search_params"]["type"] = search_type + self._test_results_ordering(test, "docket_id", version="v3") + + class 
RECAPSearchAPICommonTests(RECAPSearchTestCase): version_api = "v3" @@ -3389,28 +3716,6 @@ async def test_results_ordering(self) -> None: # API await self._test_api_results_count(params, 3, "order random") - # Order by score desc (relevance). - params = { - "type": SEARCH_TYPES.RECAP, - "q": "SUBPOENAS SERVED", - "order_by": "score desc", - } - # API - r = await self._test_api_results_count(params, 3, "order score desc") - self.assertTrue( - r.content.decode().index("1:21-bk-1234") - < r.content.decode().index("12-1235"), - msg="'1:21-bk-1234' should come BEFORE '12-1235' when order_by score desc.", - ) - - params["type"] = SEARCH_TYPES.DOCKETS - r = await self._test_api_results_count(params, 2, "order") - self.assertTrue( - r.content.decode().index("1:21-bk-1234") - < r.content.decode().index("12-1235"), - msg="'1:21-bk-1234' should come BEFORE '12-1235' when order_by score desc.", - ) - # Order by entry_date_filed desc params = { "type": SEARCH_TYPES.RECAP, @@ -3910,7 +4215,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Query string, order by dateFiled desc", "search_params": search_params, - "expected_results": 5, "expected_order": [ docket_entry_recent.docket.pk, # 2024/02/23 self.de_1.docket.pk, # 2016/08/16 @@ -3922,7 +4226,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Query string, order by dateFiled asc", "search_params": params_date_filed_asc, - "expected_results": 5, "expected_order": [ docket_old.pk, # 1732/2/23 self.de.docket.pk, # 2015/8/16 @@ -3934,7 +4237,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Match all query, order by dateFiled desc", "search_params": params_match_all_date_filed_desc, - "expected_results": 8, "expected_order": [ docket_entry_recent.docket.pk, # 2024/2/23 self.de_1.docket.pk, # 2016/8/16 @@ -3949,7 +4251,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Match all query, order by dateFiled asc", "search_params": 
params_match_all_date_filed_asc, - "expected_results": 8, "expected_order": [ docket_old.pk, # 1732/2/23 self.de.docket.pk, # 2015/8/16 @@ -3964,7 +4265,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Query string, order by entry_date_filed asc", "search_params": params_entry_date_filed_asc, - "expected_results": 5, "expected_order": [ self.de_1.docket.pk, # 2014/7/19 self.de.docket.pk, # 2015/8/16 @@ -3976,7 +4276,6 @@ def test_date_filed_sorting_function_score(self) -> None: { "name": "Match all query, order by entry_date_filed asc", "search_params": params_match_all_entry_date_filed_asc, - "expected_results": 8, "expected_order": [ self.de_1.docket.pk, # 2014/7/19 self.de.docket.pk, # 2015/8/16 diff --git a/cl/tests/cases.py b/cl/tests/cases.py index 8b23dea418..cdd93358db 100644 --- a/cl/tests/cases.py +++ b/cl/tests/cases.py @@ -346,24 +346,64 @@ async def _test_api_fields_content( f"Parent field '{field}' does not match.", ) - def _test_results_ordering(self, test, field): + def _test_results_ordering(self, test, field, version="v4"): """Ensure dockets appear in the response in a specific order.""" with self.subTest(test=test, msg=f'{test["name"]}'): r = self.client.get( - reverse("search-list", kwargs={"version": "v4"}), + reverse("search-list", kwargs={"version": version}), test["search_params"], ) - self.assertEqual(len(r.data["results"]), test["expected_results"]) + + expected_order_key = "expected_order" + if version == "v3": + expected_order_key = ( + "expected_order_v3" + if "expected_order_v3" in test + else "expected_order" + ) + + self.assertEqual( + len(r.data["results"]), len(test[expected_order_key]) + ) # Note that dockets where the date_field is null are sent to the bottom # of the results actual_order = [result[field] for result in r.data["results"]] self.assertEqual( actual_order, - test["expected_order"], - msg=f'Expected order {test["expected_order"]}, but got {actual_order}', + test[expected_order_key], + 
msg=f"Expected order {test[expected_order_key]}, but got {actual_order} for " + f"Search type: {test["search_params"]["type"]}", + ) + + def _assert_order_in_html( + self, decoded_content: str, expected_order: list + ) -> None: + """Assert that the expected order of documents appears correctly in the + HTML content.""" + + for i in range(len(expected_order) - 1): + self.assertTrue( + decoded_content.index(str(expected_order[i])) + < decoded_content.index(str(expected_order[i + 1])), + f"Expected {expected_order[i]} to appear before {expected_order[i + 1]} in the HTML content.", ) + async def _test_article_count(self, params, expected_count, field_name): + r = await self.async_client.get("/", params) + tree = html.fromstring(r.content.decode()) + got = len(tree.xpath("//article")) + self.assertEqual( + got, + expected_count, + msg="Did not get the right number of search results in Frontend with %s " + "filter applied.\n" + "Expected: %s\n" + " Got: %s\n\n" + "Params were: %s" % (field_name, expected_count, got, params), + ) + return r + def _test_page_variables( self, response, test_case, current_page, search_type ):