diff --git a/cl/lib/ratelimiter.py b/cl/lib/ratelimiter.py index 6ac26b0a06..384a539327 100644 --- a/cl/lib/ratelimiter.py +++ b/cl/lib/ratelimiter.py @@ -66,6 +66,7 @@ def get_path_to_make_key(group: str, request: HttpRequest) -> str: if "test" in sys.argv: ratelimiter_all_2_per_m = lambda func: func ratelimiter_unsafe_3_per_m = lambda func: func + ratelimiter_unsafe_5_per_d = lambda func: func ratelimiter_unsafe_10_per_m = lambda func: func ratelimiter_all_10_per_h = lambda func: func ratelimiter_unsafe_2000_per_h = lambda func: func @@ -79,6 +80,11 @@ def get_path_to_make_key(group: str, request: HttpRequest) -> str: rate="3/m", method=UNSAFE, ) + ratelimiter_unsafe_5_per_d = ratelimit( + key=get_ip_for_ratelimiter, + rate="5/d", + method=UNSAFE, + ) ratelimiter_unsafe_10_per_m = ratelimit( key=get_ip_for_ratelimiter, rate="10/m", diff --git a/cl/lib/search_utils.py b/cl/lib/search_utils.py index 15bc810c9a..cb6295dc0a 100644 --- a/cl/lib/search_utils.py +++ b/cl/lib/search_utils.py @@ -1,15 +1,58 @@ +import logging +import pickle import re -from typing import Any, Dict, List, Optional, Tuple, cast +from typing import Any, Dict, List, Optional, Tuple, TypedDict from urllib.parse import parse_qs, urlencode -from asgiref.sync import sync_to_async -from django.core.paginator import Page +from asgiref.sync import async_to_sync, sync_to_async +from django.conf import settings +from django.core.cache import cache +from django.core.exceptions import PermissionDenied +from django.core.paginator import EmptyPage, Page, PageNotAnInteger from django.http import HttpRequest +from django.http.request import QueryDict +from django_elasticsearch_dsl.search import Search +from eyecite.models import FullCaseCitation from eyecite.tokenizers import HyperscanTokenizer +from cl.citations.match_citations_queries import es_get_query_citation from cl.citations.utils import get_citation_depth_between_clusters -from cl.lib.types import SearchParam -from cl.search.forms import SearchForm +from cl.lib.crypto import sha256 +from cl.lib.elasticsearch_utils import ( + build_es_main_query, + compute_lowest_possible_estimate, + convert_str_date_fields_to_date_objects, + fetch_es_results, + get_facet_dict_for_search_query, + limit_inner_hits, + merge_courts_from_db, + merge_unavailable_fields_on_parent_document, + set_results_highlights, + simplify_estimated_count, +) +from cl.lib.paginators import ESPaginator +from cl.lib.types import CleanData +from cl.lib.utils import ( + sanitize_unbalanced_parenthesis, + sanitize_unbalanced_quotes, +) +from cl.search.constants import RELATED_PATTERN +from cl.search.documents import ( + AudioDocument, + DocketDocument, + ESRECAPDocument, + OpinionClusterDocument, + OpinionDocument, + ParentheticalGroupDocument, + PersonDocument, +) +from cl.search.exception import ( + BadProximityQuery, + DisallowedWildcardPattern, + UnbalancedParenthesesQuery, + UnbalancedQuotesQuery, +) +from cl.search.forms import SearchForm, _clean_form from cl.search.models import ( SEARCH_TYPES, Court, @@ -20,6 +63,19 @@ HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") +logger = logging.getLogger(__name__) + + +def check_pagination_depth(page_number): + """Check if the pagination is too deep (indicating a crawler)""" + + if page_number > settings.MAX_SEARCH_PAGINATION_DEPTH: + logger.warning( + "Query depth of %s denied access (probably a crawler)", + page_number, + ) + raise PermissionDenied + def make_get_string( request: HttpRequest, @@ -261,3 +317,418 @@ def store_search_api_query( 
source=SearchQuery.API, engine=engine, ) + + +class CachedESSearchResults(TypedDict): + results: Page | list + main_total: int | None + child_total: int | None + + +def retrieve_cached_search_results( + get_params: QueryDict, +) -> tuple[CachedESSearchResults | None, str]: + """ + Retrieve cached search results based on the GET parameters. + + :param get_params: The GET parameters provided by the user. + :return: A two-tuple containing either the cached search results and the + cache key based on a prefix and the GET parameters, or None and the cache key + if no cached results were found. + """ + + params = get_params.copy() + # If no page is present in the parameters, set it to 1 to generate the same + # hash for page 1, regardless of whether the page parameter is included. + # Apply the same to the q parameter when it is not present in params. + params.setdefault("page", "1") + params.setdefault("q", "") + sorted_params = dict(sorted(params.items())) + key_prefix = "search_results_cache:" + params_hash = sha256(pickle.dumps(sorted_params)) + cache_key = f"{key_prefix}{params_hash}" + cached_results = cache.get(cache_key) + if cached_results: + return pickle.loads(cached_results), cache_key + return None, cache_key + + +def fetch_and_paginate_results( + get_params: QueryDict, + search_query: Search, + child_docs_count_query: Search | None, + rows_per_page: int = settings.SEARCH_PAGE_SIZE, + cache_key: str | None = None, +) -> tuple[Page | list, int, bool, int | None, int | None]: + """Fetch and paginate elasticsearch results. + + :param get_params: The user get params. + :param search_query: Elasticsearch DSL Search object + :param child_docs_count_query: The ES DSL Query to perform the count for + child documents if required, otherwise None. + :param rows_per_page: Number of records wanted per page + :param cache_key: The cache key to use. + :return: A five-tuple: the paginated results, the ES query time, whether + there was an error, the total number of hits for the main document, and + the total number of hits for the child document. + """ + + # Run the query and set up pagination + if cache_key is not None: + # Check cache for displaying insights on the Home Page. + results = cache.get(cache_key) + if results is not None: + return results, 0, False, None, None + + # Check micro-cache for all other search requests. + results_dict, micro_cache_key = retrieve_cached_search_results(get_params) + if results_dict: + # Return results and counts. Set query time to 1ms. + return ( + results_dict["results"], + 1, + False, + results_dict["main_total"], + results_dict["child_total"], + ) + + try: + page = int(get_params.get("page", 1)) + except ValueError: + page = 1 + + # Check pagination depth + check_pagination_depth(page) + + # Fetch results from ES + hits, query_time, error, main_total, child_total = fetch_es_results( + get_params, search_query, child_docs_count_query, page, rows_per_page + ) + + if error: + return [], query_time, error, main_total, child_total + paginator = ESPaginator(main_total, hits, rows_per_page) + try: + results = paginator.page(page) + except PageNotAnInteger: + results = paginator.page(1) + except EmptyPage: + results = paginator.page(paginator.num_pages) + + search_type = get_params.get("type", SEARCH_TYPES.OPINION) + # Set highlights in results.
+ convert_str_date_fields_to_date_objects(results, search_type) + merge_courts_from_db(results, search_type) + limit_inner_hits(get_params, results, search_type) + set_results_highlights(results, search_type) + merge_unavailable_fields_on_parent_document(results, search_type) + + if cache_key is not None: + # Cache only Page results for displaying insights on the Home Page. + cache.set(cache_key, results, settings.QUERY_RESULTS_CACHE) + elif settings.ELASTICSEARCH_MICRO_CACHE_ENABLED: + # Cache Page results and counts for all other search requests. + results_dict = { + "results": results, + "main_total": main_total, + "child_total": child_total, + } + serialized_data = pickle.dumps(results_dict) + cache.set( + micro_cache_key, + serialized_data, + settings.SEARCH_RESULTS_MICRO_CACHE, + ) + + return results, query_time, error, main_total, child_total + + +def remove_missing_citations( + missing_citations: list[FullCaseCitation], cd: CleanData +) -> tuple[list[str], str]: + """Removes missing citations from the query and returns the missing + citations as strings and the modified query. + + :param missing_citations: A list of FullCaseCitation objects representing + the citations that are missing from the query. + :param cd: A CleanData object containing the query string. + :return: A two-tuple containing a list of missing citation strings and the + suggested query string with missing citations removed. + """ + missing_citations_str = [ + citation.corrected_citation() for citation in missing_citations + ] + query_string = cd["q"] + for citation in missing_citations_str: + query_string = query_string.replace(citation, "") + suggested_query = ( + " ".join(query_string.split()) if missing_citations_str else "" + ) + return missing_citations_str, suggested_query + + +def do_es_search( + get_params: QueryDict, + rows: int = settings.SEARCH_PAGE_SIZE, + facet: bool = True, + cache_key: str | None = None, +): + """Run Elasticsearch searching and filtering and prepare data to display + + :param get_params: The request.GET params sent by user. + :param rows: The number of Elasticsearch results to request + :param facet: Whether to complete faceting in the query + :param cache_key: A cache key with which to save the results. Note that it + does not do anything clever with the actual query, so if you use this, your + cache key should *already* have factored in the query. If None, no caching + is set or used. Results are saved for six hours. + :return: A big dict of variables for use in the search results, homepage, or + other location. 
+ """ + paged_results = None + courts = Court.objects.filter(in_use=True) + query_time: int | None = 0 + total_query_results: int | None = 0 + top_hits_limit: int | None = 5 + document_type = None + error_message = "" + suggested_query = "" + total_child_results: int | None = 0 + related_cluster = None + cited_cluster = None + query_citation = None + facet_fields = [] + missing_citations_str: list[str] = [] + error = True + + search_form = SearchForm(get_params, courts=courts) + match get_params.get("type", SEARCH_TYPES.OPINION): + case SEARCH_TYPES.PARENTHETICAL: + document_type = ParentheticalGroupDocument + case SEARCH_TYPES.ORAL_ARGUMENT: + document_type = AudioDocument + case SEARCH_TYPES.PEOPLE: + document_type = PersonDocument + case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: + document_type = DocketDocument + # Set a different number of results per page for RECAP SEARCH + rows = settings.RECAP_SEARCH_PAGE_SIZE + case SEARCH_TYPES.OPINION: + document_type = OpinionClusterDocument + + if search_form.is_valid() and document_type: + # Copy cleaned_data to preserve the original data when displaying the form + cd = search_form.cleaned_data.copy() + try: + # Create necessary filters to execute ES query + search_query = document_type.search() + + if cd["type"] in [ + SEARCH_TYPES.OPINION, + SEARCH_TYPES.RECAP, + SEARCH_TYPES.DOCKETS, + ]: + query_citation, missing_citations = es_get_query_citation(cd) + if cd["type"] in [ + SEARCH_TYPES.OPINION, + ]: + missing_citations_str, suggested_query = ( + remove_missing_citations(missing_citations, cd) + ) + cd["q"] = suggested_query if suggested_query else cd["q"] + ( + s, + child_docs_count_query, + top_hits_limit, + ) = build_es_main_query(search_query, cd) + ( + paged_results, + query_time, + error, + total_query_results, + total_child_results, + ) = fetch_and_paginate_results( + get_params, + s, + child_docs_count_query, + rows_per_page=rows, + cache_key=cache_key, + ) + cited_cluster = async_to_sync(add_depth_counts)( + # Also returns cited cluster if found + search_data=cd, + search_results=paged_results, + ) + related_prefix = RELATED_PATTERN.search(cd["q"]) + if related_prefix: + related_pks = related_prefix.group("pks").split(",") + related_cluster = OpinionCluster.objects.filter( + sub_opinions__pk__in=related_pks + ).distinct("pk") + except UnbalancedParenthesesQuery as e: + error = True + error_message = "unbalanced_parentheses" + if e.error_type == UnbalancedParenthesesQuery.QUERY_STRING: + suggested_query = sanitize_unbalanced_parenthesis( + cd.get("q", "") + ) + except UnbalancedQuotesQuery as e: + error = True + error_message = "unbalanced_quotes" + if e.error_type == UnbalancedParenthesesQuery.QUERY_STRING: + suggested_query = sanitize_unbalanced_quotes(cd.get("q", "")) + except BadProximityQuery as e: + error = True + error_message = "bad_proximity_token" + suggested_query = "proximity_filter" + if e.error_type == UnbalancedParenthesesQuery.QUERY_STRING: + suggested_query = "proximity_query" + except DisallowedWildcardPattern: + error = True + error_message = "disallowed_wildcard_pattern" + finally: + # Make sure to always call the _clean_form method + search_form = _clean_form( + get_params, search_form.cleaned_data, courts + ) + if cd["type"] in [SEARCH_TYPES.OPINION] and facet: + # If the search query is valid, pass the cleaned data to filter and + # retrieve the correct number of opinions per status. 
Otherwise (if + # the query has errors), just provide a dictionary containing the + # search type to get the total number of opinions per status + facet_fields = get_facet_dict_for_search_query( + search_query, + cd if not error else {"type": cd["type"]}, + search_form, + ) + + courts, court_count_human, court_count = merge_form_with_courts( + courts, search_form + ) + search_summary_str = search_form.as_text(court_count_human) + search_summary_dict = search_form.as_display_dict(court_count_human) + results_details = [ + query_time, + total_query_results, + top_hits_limit, + total_child_results, + ] + + return { + "results": paged_results, + "results_details": results_details, + "search_form": search_form, + "search_summary_str": search_summary_str, + "search_summary_dict": search_summary_dict, + "error": error, + "courts": courts, + "court_count_human": court_count_human, + "court_count": court_count, + "query_citation": query_citation, + "cited_cluster": cited_cluster, + "related_cluster": related_cluster, + "facet_fields": facet_fields, + "error_message": error_message, + "suggested_query": suggested_query, + "estimated_count_threshold": simplify_estimated_count( + compute_lowest_possible_estimate( + settings.ELASTICSEARCH_CARDINALITY_PRECISION + ) + ), + "missing_citations": missing_citations_str, + } + + +def get_headers_for_search_export(type: str) -> list[str]: + """Creates a list of headers suitable for CSV export of search results. + + :param type: The type of Elasticsearch search to be performed. Valid values + are defined in the `SEARCH_TYPES` enum. + :return: A list of strings representing the CSV headers. + """ + match type: + case SEARCH_TYPES.PEOPLE: + keys = PersonDocument.__dict__["_fields"].keys() + case SEARCH_TYPES.ORAL_ARGUMENT: + keys = AudioDocument.__dict__["_fields"].keys() + case SEARCH_TYPES.PARENTHETICAL: + keys = ParentheticalGroupDocument.__dict__["_fields"].keys() + case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: + keys = set( + [ + *DocketDocument.__dict__["_fields"].keys(), + *ESRECAPDocument.__dict__["_fields"].keys(), + ] + ) + case SEARCH_TYPES.OPINION: + keys = set( + [ + *OpinionClusterDocument.__dict__["_fields"].keys(), + *OpinionDocument.__dict__["_fields"].keys(), + ] + ) + + return [ + key + for key in keys + if key not in ("person_child", "docket_child", "cluster_child") + ] + + +def fetch_es_results_for_csv( + queryset: QueryDict, search_type: str +) -> list[dict[str, Any]]: + """Retrieves matching results from Elasticsearch and returns them as a list + + This method will flatten nested results (like those returned by opinion and + recap searches) and limit the number of results in the list to + `settings.MAX_SEARCH_RESULTS_EXPORTED`. + + :param queryset: The query parameters sent by the user. + :param search_type: The type of Elasticsearch search to be performed. + :return: A list of dictionaries, where each dictionary represents a single + search result. 
+ """ + csv_rows: list[dict[str, Any]] = [] + while len(csv_rows) <= settings.MAX_SEARCH_RESULTS_EXPORTED: + search = do_es_search( + queryset, rows=settings.MAX_SEARCH_RESULTS_EXPORTED + ) + if search["error"]: + return csv_rows + + results = search["results"] + match search_type: + case ( + SEARCH_TYPES.OPINION + | SEARCH_TYPES.RECAP + | SEARCH_TYPES.DOCKETS + ): + flat_results = [] + for result in results.object_list: + parent_dict = result.to_dict() + child_docs = parent_dict.pop("child_docs") + if child_docs: + flat_results.extend( + [ + parent_dict | doc["_source"].to_dict() + for doc in child_docs + ] + ) + else: + flat_results.extend([parent_dict]) + case _: + flat_results = [ + result.to_dict() for result in results.object_list + ] + + csv_rows.extend(flat_results) + + if not results.has_next(): + if len(csv_rows) <= settings.MAX_SEARCH_RESULTS_EXPORTED: + return csv_rows + break + + queryset["page"] = results.next_page_number() + + return csv_rows[: settings.MAX_SEARCH_RESULTS_EXPORTED] diff --git a/cl/opinion_page/views.py b/cl/opinion_page/views.py index 32fee0691b..629f7686ed 100644 --- a/cl/opinion_page/views.py +++ b/cl/opinion_page/views.py @@ -57,7 +57,7 @@ from cl.lib.model_helpers import choices_to_csv from cl.lib.models import THUMBNAIL_STATUSES from cl.lib.ratelimiter import ratelimiter_all_10_per_h -from cl.lib.search_utils import make_get_string +from cl.lib.search_utils import do_es_search, make_get_string from cl.lib.string_utils import trunc from cl.lib.thumbnails import make_png_thumbnail_for_instance from cl.lib.url_utils import get_redirect_or_abort @@ -98,7 +98,6 @@ RECAPDocument, ) from cl.search.selectors import get_clusters_from_citation_str -from cl.search.views import do_es_search HYPERSCAN_TOKENIZER = HyperscanTokenizer(cache_dir=".hyperscan") diff --git a/cl/search/tasks.py b/cl/search/tasks.py index 602ac95700..5a9647e9fb 100644 --- a/cl/search/tasks.py +++ b/cl/search/tasks.py @@ -1,5 +1,7 @@ +import csv +import io import logging -from datetime import date +from datetime import date, datetime from importlib import import_module from random import randint from typing import Any, Generator @@ -8,8 +10,12 @@ from celery.canvas import chain from django.apps import apps from django.conf import settings +from django.contrib.auth.models import User from django.core.exceptions import ObjectDoesNotExist +from django.core.mail import EmailMessage from django.db.models import Prefetch, QuerySet +from django.http import QueryDict +from django.template import loader from elasticsearch.exceptions import ( ApiError, ConflictError, @@ -34,6 +40,10 @@ from cl.celery_init import app from cl.lib.elasticsearch_utils import build_daterange_query from cl.lib.search_index_utils import get_parties_from_case_name +from cl.lib.search_utils import ( + fetch_es_results_for_csv, + get_headers_for_search_export, +) from cl.people_db.models import Person, Position from cl.search.documents import ( ES_CHILD_ID, @@ -45,6 +55,7 @@ PersonDocument, PositionDocument, ) +from cl.search.forms import SearchForm from cl.search.models import ( SEARCH_TYPES, Docket, @@ -341,6 +352,76 @@ def document_fields_to_update( return fields_to_update +@app.task( + autoretry_for=(ConnectionError, ConflictError, ConnectionTimeout), + max_retries=3, + ignore_result=True, +) +def email_search_results(user_id: int, query: str): + """Sends an email to the user with their search results as a CSV attachment. + + :param user_id: The ID of the user to send the email to. 
+ :param query: The user's search query string. + """ + user = User.objects.get(pk=user_id) + # Parse the query string into a dictionary + qd = QueryDict(query.encode(), mutable=True) + + # Create a search form instance and validate the query data + search_form = SearchForm(qd) + if not search_form.is_valid(): + return + + # Get the cleaned data from the validated form + cd = search_form.cleaned_data + + # Fetch search results from Elasticsearch based on query and search type + search_results = fetch_es_results_for_csv( + queryset=qd, search_type=cd["type"] + ) + if not search_results: + return + + # Get the headers for the CSV file based on the search type + csv_headers = get_headers_for_search_export(cd["type"]) + + # Create the CSV content and store in a StringIO object + csv_content = None + with io.StringIO() as output: + csvwriter = csv.DictWriter( + output, + fieldnames=csv_headers, + extrasaction="ignore", + quotechar='"', + quoting=csv.QUOTE_ALL, + ) + csvwriter.writeheader() + for row in search_results: + csvwriter.writerow(row) + + csv_content: str = output.getvalue() + + # Prepare email content + txt_template = loader.get_template("search_results_email.txt") + email_context = {"username": user.username} + + # Create email object + message = EmailMessage( + subject="Your Search Results are Ready!", + body=txt_template.render(email_context), + from_email=settings.DEFAULT_FROM_EMAIL, + to=[user.email], + ) + + # Generate a filename for the CSV attachment with timestamp + now = datetime.now() + filename = f'search_results_{now.strftime("%Y%m%d_%H%M%S")}.csv' + + # Send email with attachments + message.attach(filename, csv_content, "text/csv") + message.send(fail_silently=False) + + @app.task( bind=True, autoretry_for=(ConnectionError, ConflictError, ConnectionTimeout), diff --git a/cl/search/templates/search_results_email.txt b/cl/search/templates/search_results_email.txt new file mode 100644 index 0000000000..92b9451f6c --- /dev/null +++ b/cl/search/templates/search_results_email.txt @@ -0,0 +1,15 @@ +Hi {{username}}, + +Your requested search results are attached as a CSV file. + +Please review the data carefully. + +If you have any questions or need further assistance, please don't hesitate to contact us. + +Sincerely, + +The Free Law Project Team + +------- +For questions or comments, please visit our contact page, https://www.courtlistener.com/contact/ +We're always happy to hear from you. 
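Note on the task above: the following standalone sketch mirrors the csv.DictWriter configuration used by email_search_results, to show why extrasaction="ignore" matters once parent and child documents are merged into flat rows. The headers and rows here are made-up stand-ins for the output of get_headers_for_search_export and fetch_es_results_for_csv, not real index fields.

import csv
import io

# Hypothetical headers/rows standing in for get_headers_for_search_export()
# and fetch_es_results_for_csv(); the field names are illustrative only.
csv_headers = ["caseName", "court", "dateFiled", "docketNumber"]
search_results = [
    # A parent docket merged with one of its child documents. Keys that are
    # not listed in csv_headers (e.g. "snippet") are silently dropped because
    # the writer is created with extrasaction="ignore".
    {
        "caseName": "Lorem v. Ipsum",
        "court": "ca1",
        "dateFiled": "2023-01-02",
        "docketNumber": "21-bk-1234",
        "snippet": "motion to dismiss ...",
    },
    {
        "caseName": "Ipsum v. Lorem",
        "court": "ca1",
        "dateFiled": "2023-02-03",
        "docketNumber": "12-1235",
    },
]

with io.StringIO() as output:
    csvwriter = csv.DictWriter(
        output,
        fieldnames=csv_headers,
        extrasaction="ignore",
        quotechar='"',
        quoting=csv.QUOTE_ALL,
    )
    csvwriter.writeheader()
    for row in search_results:
        csvwriter.writerow(row)
    csv_content = output.getvalue()

print(csv_content)

The resulting string is what the task attaches to the email as text/csv.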
diff --git a/cl/search/tests/tests_es_export.py b/cl/search/tests/tests_es_export.py new file mode 100644 index 0000000000..6acf9c3b12 --- /dev/null +++ b/cl/search/tests/tests_es_export.py @@ -0,0 +1,118 @@ +from django.core import mail +from django.core.management import call_command +from django.http import QueryDict +from django.urls import reverse + +from cl.lib.search_utils import fetch_es_results_for_csv +from cl.lib.test_helpers import RECAPSearchTestCase +from cl.search.models import SEARCH_TYPES +from cl.tests.cases import ESIndexTestCase, TestCase +from cl.users.factories import UserProfileWithParentsFactory + + +class ExportSearchTest(RECAPSearchTestCase, ESIndexTestCase, TestCase): + + errors = [ + ("Unbalanced Quotes", 'q="test&type=o'), + ("Unbalanced Parentheses", "q=Leave)&type=o"), + ("Bad syntax", "q=Leave /:&type=o"), + ] + + @classmethod + def setUpTestData(cls): + cls.user_profile = UserProfileWithParentsFactory() + cls.rebuild_index("search.Docket") + super().setUpTestData() + cls.rebuild_index("people_db.Person") + call_command( + "cl_index_parent_and_child_docs", + search_type=SEARCH_TYPES.RECAP, + queue="celery", + pk_offset=0, + testing_mode=True, + ) + + def test_returns_empty_list_when_query_with_error(self) -> None: + """Confirms the search helper returns an empty list when provided with + invalid query parameters.""" + for description, query in self.errors: + with self.subTest(description): + results = fetch_es_results_for_csv( + QueryDict(query.encode(), mutable=True), + SEARCH_TYPES.OPINION, + ) + self.assertEqual(len(results), 0) + + def test_limit_number_of_search_results(self) -> None: + """Checks that `fetch_es_results_for_csv` returns a list with a size + equal to MAX_SEARCH_RESULTS_EXPORTED or the actual number of search + results (if it's less than `MAX_SEARCH_RESULTS_EXPORTED`). + """ + # This query should match all 5 judges indexed for this test + query = "q=gender:Female&type=p" + for i in range(6): + with self.subTest( + f"try to fetch only {i+1} results" + ), self.settings(MAX_SEARCH_RESULTS_EXPORTED=i + 1): + results = fetch_es_results_for_csv( + QueryDict(query.encode(), mutable=True), + SEARCH_TYPES.PEOPLE, + ) + expected_result_count = min( + i + 1, 5 + ) # Cap at 5 (total matching results) + self.assertEqual(len(results), expected_result_count) + + def test_can_flatten_nested_results(self) -> None: + """Checks that `fetch_es_results_for_csv` correctly handles and flattens + nested results.""" + # This query should match both docket records indexed + query = "type=r&q=12-1235 OR Jackson" + results = fetch_es_results_for_csv( + QueryDict(query.encode(), mutable=True), SEARCH_TYPES.RECAP + ) + # We expect 3 results because: + # - Docket 21-bk-1234 has 2 associated documents. + # - Docket 12-1235 has 1 associated document. + # + # The `fetch_es_results_for_csv` helper function should: + # - Flatten the results. + # - Add a row for each child document.
+ self.assertEqual(len(results), 3) + + def test_avoids_sending_email_for_query_with_error(self) -> None: + "Confirms we don't send emails when provided with an invalid query" + self.client.login( + username=self.user_profile.user.username, password="password" + ) + for description, query in self.errors: + with self.subTest(description): + self.client.post( + reverse("export_search_results"), {"query": query} + ) + self.assertEqual(len(mail.outbox), 0) + + def test_do_not_send_empty_emails(self) -> None: + """Confirms that no emails are sent when the search query returns no + results""" + self.client.login( + username=self.user_profile.user.username, password="password" + ) + self.client.post( + reverse("export_search_results"), {"query": 'q="word"&type=r'} + ) + self.assertEqual(len(mail.outbox), 0) + + def test_sends_email_with_attachment(self) -> None: + "Confirms we send an email with the results attached for a valid query" + self.client.login( + username=self.user_profile.user.username, password="password" + ) + self.client.post( + reverse("export_search_results"), {"query": 'q="Jackson"&type=r'} + ) + self.assertEqual(len(mail.outbox), 1) + self.assertEqual( + mail.outbox[0].subject, "Your Search Results are Ready!" + ) + self.assertEqual(mail.outbox[0].to[0], self.user_profile.user.email) diff --git a/cl/search/tests/tests_es_recap.py b/cl/search/tests/tests_es_recap.py index 933913ea7e..3cead60fe3 100644 --- a/cl/search/tests/tests_es_recap.py +++ b/cl/search/tests/tests_es_recap.py @@ -2584,7 +2584,7 @@ def test_initial_document_button(self) -> None: for docket in dockets_to_remove: docket.delete() - @mock.patch("cl.search.views.fetch_es_results") + @mock.patch("cl.lib.search_utils.fetch_es_results") @override_settings( RECAP_SEARCH_PAGE_SIZE=2, ELASTICSEARCH_MICRO_CACHE_ENABLED=True ) @@ -7162,7 +7162,7 @@ def test_search_pagination_results_limit(self) -> None: # 100 results, 10 pages. total_results = 100 with mock.patch( - "cl.search.views.fetch_es_results", + "cl.lib.search_utils.fetch_es_results", side_effect=lambda *x: ( [], 1, @@ -7182,7 +7182,7 @@ def test_search_pagination_results_limit(self) -> None: # 101 results, 11 pages. total_results = 101 with mock.patch( - "cl.search.views.fetch_es_results", + "cl.lib.search_utils.fetch_es_results", side_effect=lambda *x: ( [], 1, @@ -7202,7 +7202,7 @@ def test_search_pagination_results_limit(self) -> None: # 20,000 results, 2,000 pages.
total_results = 20_000 with mock.patch( - "cl.search.views.fetch_es_results", + "cl.lib.search_utils.fetch_es_results", side_effect=lambda *x: ( [], 1, diff --git a/cl/search/urls.py b/cl/search/urls.py index c1e7b9033a..69c7ee1d00 100644 --- a/cl/search/urls.py +++ b/cl/search/urls.py @@ -6,11 +6,19 @@ SearchFeed, search_feed_error_handler, ) -from cl.search.views import advanced, es_search, show_results +from cl.search.views import ( + advanced, + es_search, + export_search_results, + show_results, +) urlpatterns = [ # Search pages path("", show_results, name="show_results"), + path( + "search/export/", export_search_results, name="export_search_results" + ), path("opinion/", advanced, name="advanced_o"), path("audio/", advanced, name="advanced_oa"), path("person/", advanced, name="advanced_p"), diff --git a/cl/search/views.py b/cl/search/views.py index 4c10e94659..a29ab55065 100644 --- a/cl/search/views.py +++ b/cl/search/views.py @@ -1,94 +1,44 @@ -import logging -import pickle from datetime import date, datetime, timedelta, timezone from urllib.parse import quote from asgiref.sync import async_to_sync from cache_memoize import cache_memoize -from django.conf import settings from django.contrib import messages +from django.contrib.auth.decorators import login_required from django.contrib.auth.models import User -from django.core.cache import cache -from django.core.exceptions import PermissionDenied -from django.core.paginator import EmptyPage, Page, PageNotAnInteger from django.db.models import Count, Sum from django.http import HttpRequest, HttpResponse -from django.http.request import QueryDict from django.shortcuts import HttpResponseRedirect, get_object_or_404, render from django.template.response import TemplateResponse from django.urls import reverse from django.utils.timezone import make_aware from django.views.decorators.cache import never_cache -from django_elasticsearch_dsl.search import Search -from eyecite.models import FullCaseCitation +from django.views.decorators.http import require_POST from waffle.decorators import waffle_flag from cl.alerts.forms import CreateAlertForm from cl.alerts.models import Alert from cl.audio.models import Audio -from cl.citations.match_citations_queries import es_get_query_citation from cl.custom_filters.templatetags.text_filters import naturalduration from cl.lib.bot_detector import is_bot -from cl.lib.crypto import sha256 -from cl.lib.elasticsearch_utils import ( - build_es_main_query, - compute_lowest_possible_estimate, - convert_str_date_fields_to_date_objects, - fetch_es_results, - get_facet_dict_for_search_query, - get_only_status_facets, - limit_inner_hits, - merge_courts_from_db, - merge_unavailable_fields_on_parent_document, - set_results_highlights, - simplify_estimated_count, -) -from cl.lib.paginators import ESPaginator +from cl.lib.elasticsearch_utils import get_only_status_facets +from cl.lib.ratelimiter import ratelimiter_unsafe_5_per_d from cl.lib.redis_utils import get_redis_interface from cl.lib.search_utils import ( - add_depth_counts, + do_es_search, make_get_string, merge_form_with_courts, store_search_query, ) -from cl.lib.types import CleanData -from cl.lib.utils import ( - sanitize_unbalanced_parenthesis, - sanitize_unbalanced_quotes, -) -from cl.search.constants import RELATED_PATTERN -from cl.search.documents import ( - AudioDocument, - DocketDocument, - OpinionClusterDocument, - ParentheticalGroupDocument, - PersonDocument, -) -from cl.search.exception import ( - BadProximityQuery, - DisallowedWildcardPattern, - 
UnbalancedParenthesesQuery, - UnbalancedQuotesQuery, -) +from cl.lib.types import AuthenticatedHttpRequest +from cl.search.documents import OpinionClusterDocument from cl.search.forms import SearchForm, _clean_form -from cl.search.models import SEARCH_TYPES, Court, Opinion, OpinionCluster +from cl.search.models import SEARCH_TYPES, Court, Opinion +from cl.search.tasks import email_search_results from cl.stats.models import Stat from cl.stats.utils import tally_stat from cl.visualizations.models import SCOTUSMap -logger = logging.getLogger(__name__) - - -def check_pagination_depth(page_number): - """Check if the pagination is too deep (indicating a crawler)""" - - if page_number > settings.MAX_SEARCH_PAGINATION_DEPTH: - logger.warning( - "Query depth of %s denied access (probably a crawler)", - page_number, - ) - raise PermissionDenied - @cache_memoize(5 * 60) def get_homepage_stats(): @@ -418,314 +368,11 @@ def es_search(request: HttpRequest) -> HttpResponse: return render(request, template, render_dict) -def remove_missing_citations( - missing_citations: list[FullCaseCitation], cd: CleanData -) -> tuple[list[str], str]: - """Removes missing citations from the query and returns the missing - citations as strings and the modified query. - - :param missing_citations: A list of FullCaseCitation objects representing - the citations that are missing from the query. - :param cd: A CleanData object containing the query string. - :return: A two-tuple containing a list of missing citation strings and the - suggested query string with missing citations removed. - """ - missing_citations_str = [ - citation.corrected_citation() for citation in missing_citations - ] - query_string = cd["q"] - for citation in missing_citations_str: - query_string = query_string.replace(citation, "") - suggested_query = ( - " ".join(query_string.split()) if missing_citations_str else "" - ) - return missing_citations_str, suggested_query - - -def do_es_search( - get_params: QueryDict, - rows: int = settings.SEARCH_PAGE_SIZE, - facet: bool = True, - cache_key: str = None, -): - """Run Elasticsearch searching and filtering and prepare data to display - - :param get_params: The request.GET params sent by user. - :param rows: The number of Elasticsearch results to request - :param facet: Whether to complete faceting in the query - :param cache_key: A cache key with which to save the results. Note that it - does not do anything clever with the actual query, so if you use this, your - cache key should *already* have factored in the query. If None, no caching - is set or used. Results are saved for six hours. - :return: A big dict of variables for use in the search results, homepage, or - other location. 
- """ - paged_results = None - courts = Court.objects.filter(in_use=True) - query_time = total_query_results = 0 - top_hits_limit = 5 - document_type = None - error_message = "" - suggested_query = "" - total_child_results = 0 - related_cluster = None - cited_cluster = None - query_citation = None - facet_fields = [] - missing_citations_str = [] - error = True - - search_form = SearchForm(get_params, courts=courts) - match get_params.get("type", SEARCH_TYPES.OPINION): - case SEARCH_TYPES.PARENTHETICAL: - document_type = ParentheticalGroupDocument - case SEARCH_TYPES.ORAL_ARGUMENT: - document_type = AudioDocument - case SEARCH_TYPES.PEOPLE: - document_type = PersonDocument - case SEARCH_TYPES.RECAP | SEARCH_TYPES.DOCKETS: - document_type = DocketDocument - # Set a different number of results per page for RECAP SEARCH - rows = settings.RECAP_SEARCH_PAGE_SIZE - case SEARCH_TYPES.OPINION: - document_type = OpinionClusterDocument - - if search_form.is_valid() and document_type: - # Copy cleaned_data to preserve the original data when displaying the form - cd = search_form.cleaned_data.copy() - try: - # Create necessary filters to execute ES query - search_query = document_type.search() - - if cd["type"] in [ - SEARCH_TYPES.OPINION, - SEARCH_TYPES.RECAP, - SEARCH_TYPES.DOCKETS, - ]: - query_citation, missing_citations = es_get_query_citation(cd) - if cd["type"] in [ - SEARCH_TYPES.OPINION, - ]: - missing_citations_str, suggested_query = ( - remove_missing_citations(missing_citations, cd) - ) - cd["q"] = suggested_query if suggested_query else cd["q"] - ( - s, - child_docs_count_query, - top_hits_limit, - ) = build_es_main_query(search_query, cd) - ( - paged_results, - query_time, - error, - total_query_results, - total_child_results, - ) = fetch_and_paginate_results( - get_params, - s, - child_docs_count_query, - rows_per_page=rows, - cache_key=cache_key, - ) - cited_cluster = async_to_sync(add_depth_counts)( - # Also returns cited cluster if found - search_data=cd, - search_results=paged_results, - ) - related_prefix = RELATED_PATTERN.search(cd["q"]) - if related_prefix: - related_pks = related_prefix.group("pks").split(",") - related_cluster = OpinionCluster.objects.filter( - sub_opinions__pk__in=related_pks - ).distinct("pk") - except UnbalancedParenthesesQuery as e: - error = True - error_message = "unbalanced_parentheses" - if e.error_type == UnbalancedParenthesesQuery.QUERY_STRING: - suggested_query = sanitize_unbalanced_parenthesis( - cd.get("q", "") - ) - except UnbalancedQuotesQuery as e: - error = True - error_message = "unbalanced_quotes" - if e.error_type == UnbalancedParenthesesQuery.QUERY_STRING: - suggested_query = sanitize_unbalanced_quotes(cd.get("q", "")) - except BadProximityQuery as e: - error = True - error_message = "bad_proximity_token" - suggested_query = "proximity_filter" - if e.error_type == UnbalancedParenthesesQuery.QUERY_STRING: - suggested_query = "proximity_query" - except DisallowedWildcardPattern: - error = True - error_message = "disallowed_wildcard_pattern" - finally: - # Make sure to always call the _clean_form method - search_form = _clean_form( - get_params, search_form.cleaned_data, courts - ) - if cd["type"] in [SEARCH_TYPES.OPINION] and facet: - # If the search query is valid, pass the cleaned data to filter and - # retrieve the correct number of opinions per status. 
Otherwise (if - # the query has errors), just provide a dictionary containing the - # search type to get the total number of opinions per status - facet_fields = get_facet_dict_for_search_query( - search_query, - cd if not error else {"type": cd["type"]}, - search_form, - ) - - courts, court_count_human, court_count = merge_form_with_courts( - courts, search_form - ) - search_summary_str = search_form.as_text(court_count_human) - search_summary_dict = search_form.as_display_dict(court_count_human) - results_details = [ - query_time, - total_query_results, - top_hits_limit, - total_child_results, - ] - - return { - "results": paged_results, - "results_details": results_details, - "search_form": search_form, - "search_summary_str": search_summary_str, - "search_summary_dict": search_summary_dict, - "error": error, - "courts": courts, - "court_count_human": court_count_human, - "court_count": court_count, - "query_citation": query_citation, - "cited_cluster": cited_cluster, - "related_cluster": related_cluster, - "facet_fields": facet_fields, - "error_message": error_message, - "suggested_query": suggested_query, - "estimated_count_threshold": simplify_estimated_count( - compute_lowest_possible_estimate( - settings.ELASTICSEARCH_CARDINALITY_PRECISION - ) - ), - "missing_citations": missing_citations_str, - } - - -def retrieve_cached_search_results( - get_params: QueryDict, -) -> tuple[dict[str, Page | int] | None, str]: - """ - Retrieve cached search results based on the GET parameters. - - :param get_params: The GET parameters provided by the user. - :return: A two-tuple containing either the cached search results and the - cache key based ona prefix and the get parameters, or None and the cache key - if no cached results were found. - """ - - params = get_params.copy() - # If no page is present in the parameters, set it to 1 to generate the same - # hash for page 1, regardless of whether the page parameter is included. - # Apply the same to the q parameter when it is not present in params. - params.setdefault("page", "1") - params.setdefault("q", "") - sorted_params = dict(sorted(params.items())) - key_prefix = "search_results_cache:" - params_hash = sha256(pickle.dumps(sorted_params)) - cache_key = f"{key_prefix}{params_hash}" - cached_results = cache.get(cache_key) - if cached_results: - return pickle.loads(cached_results), cache_key - return None, cache_key - - -def fetch_and_paginate_results( - get_params: QueryDict, - search_query: Search, - child_docs_count_query: Search | None, - rows_per_page: int = settings.SEARCH_PAGE_SIZE, - cache_key: str = None, -) -> tuple[Page | list, int, bool, int | None, int | None]: - """Fetch and paginate elasticsearch results. - - :param get_params: The user get params. - :param search_query: Elasticsearch DSL Search object - :param child_docs_count_query: The ES DSL Query to perform the count for - child documents if required, otherwise None. - :param rows_per_page: Number of records wanted per page - :param cache_key: The cache key to use. - :return: A five-tuple: the paginated results, the ES query time, whether - there was an error, the total number of hits for the main document, and - the total number of hits for the child document. - """ - - # Run the query and set up pagination - if cache_key is not None: - # Check cache for displaying insights on the Home Page. - results = cache.get(cache_key) - if results is not None: - return results, 0, False, None, None - - # Check micro-cache for all other search requests. 
- results_dict, micro_cache_key = retrieve_cached_search_results(get_params) - if results_dict: - # Return results and counts. Set query time to 1ms. - return ( - results_dict["results"], - 1, - False, - results_dict["main_total"], - results_dict["child_total"], - ) - - try: - page = int(get_params.get("page", 1)) - except ValueError: - page = 1 - - # Check pagination depth - check_pagination_depth(page) - - # Fetch results from ES - hits, query_time, error, main_total, child_total = fetch_es_results( - get_params, search_query, child_docs_count_query, page, rows_per_page - ) - - if error: - return [], query_time, error, main_total, child_total - paginator = ESPaginator(main_total, hits, rows_per_page) - try: - results = paginator.page(page) - except PageNotAnInteger: - results = paginator.page(1) - except EmptyPage: - results = paginator.page(paginator.num_pages) - - search_type = get_params.get("type", SEARCH_TYPES.OPINION) - # Set highlights in results. - convert_str_date_fields_to_date_objects(results, search_type) - merge_courts_from_db(results, search_type) - limit_inner_hits(get_params, results, search_type) - set_results_highlights(results, search_type) - merge_unavailable_fields_on_parent_document(results, search_type) - - if cache_key is not None: - # Cache only Page results for displaying insights on the Home Page. - cache.set(cache_key, results, settings.QUERY_RESULTS_CACHE) - elif settings.ELASTICSEARCH_MICRO_CACHE_ENABLED: - # Cache Page results and counts for all other search requests. - results_dict = { - "results": results, - "main_total": main_total, - "child_total": child_total, - } - serialized_data = pickle.dumps(results_dict) - cache.set( - micro_cache_key, - serialized_data, - settings.SEARCH_RESULTS_MICRO_CACHE, - ) - - return results, query_time, error, main_total, child_total +@login_required +@ratelimiter_unsafe_5_per_d +@require_POST +def export_search_results(request: AuthenticatedHttpRequest) -> HttpResponse: + email_search_results.delay(request.user.pk, request.POST.get("query", "")) + # TODO: Update the frontend using Htmx to show a message indicating the + # export of search results is in progress. + return HttpResponse("It worked.") diff --git a/cl/settings/project/search.py b/cl/settings/project/search.py index cd44ec9534..2bbbd687f7 100644 --- a/cl/settings/project/search.py +++ b/cl/settings/project/search.py @@ -4,6 +4,11 @@ env = environ.FileAwareEnv() +################### +# Export setting # +################### +MAX_SEARCH_RESULTS_EXPORTED = env("MAX_SEARCH_RESULTS_EXPORTED", default=250) + ################### # Related content # ###################
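For reference, a minimal sketch of the micro-cache key scheme that the relocated retrieve_cached_search_results helper relies on (see cl/lib/search_utils.py above). hashlib.sha256 and a plain dict are used here as stand-ins for cl.lib.crypto.sha256 and a QueryDict copy, so the exact digest may differ from production keys; the normalization of page and q is the part being illustrated.

import hashlib
import pickle
from urllib.parse import parse_qsl


def cache_key_for(query_string: str) -> str:
    # Parse the query string into a plain dict (stand-in for a QueryDict copy).
    params = dict(parse_qsl(query_string, keep_blank_values=True))
    # Normalize so that page 1 and an empty q hash the same whether or not
    # they appear in the request, mirroring the helper's setdefault calls.
    params.setdefault("page", "1")
    params.setdefault("q", "")
    sorted_params = dict(sorted(params.items()))
    params_hash = hashlib.sha256(pickle.dumps(sorted_params)).hexdigest()
    return f"search_results_cache:{params_hash}"


# The same query with and without an explicit page=1 maps to one cache entry.
assert cache_key_for("q=Jackson&type=r") == cache_key_for("q=Jackson&type=r&page=1")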