diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 01aaeb7868..5181fdc9f4 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -49,23 +49,6 @@ jobs:
       matrix:
         tag_flags: ["--exclude-tag selenium", "--tag selenium"]
     steps:
-      - name: Check out solr
-        uses: actions/checkout@v4
-        with:
-          repository: freelawproject/courtlistener-solr-server
-          ref: main
-          path: courtlistener-solr-server
-      - name: Set up solr permissions
-        run: |
-          cd courtlistener-solr-server
-          sudo chown -R :1024 data
-          sudo chown -R :1024 solr
-          sudo find data -type d -exec chmod g+s {} \;
-          sudo find solr -type d -exec chmod g+s {} \;
-          sudo find data -type d -exec chmod 775 {} \;
-          sudo find solr -type d -exec chmod 775 {} \;
-          sudo find data -type f -exec chmod 664 {} \;
-          sudo find solr -type f -exec chmod 664 {} \;
       - name: Check out CourtListener
         uses: actions/checkout@v4
         with:
diff --git a/cl/corpus_importer/factories.py b/cl/corpus_importer/factories.py
index c9b7bdc86d..3cbe3fbcc3 100644
--- a/cl/corpus_importer/factories.py
+++ b/cl/corpus_importer/factories.py
@@ -36,29 +36,6 @@ class CaseLawFactory(factory.DictFactory):
     docket_number = Faker("federal_district_docket_number")
 
 
-class RssDocketEntryDataFactory(factory.DictFactory):
-    date_filed = Faker("date_object")
-    description = ""
-    document_number = Faker("pyint", min_value=1, max_value=100)
-    pacer_doc_id = Faker("random_id_string")
-    pacer_seq_no = Faker("random_id_string")
-    short_description = Faker("text", max_nb_chars=40)
-
-
-class RssDocketDataFactory(factory.DictFactory):
-    court_id = FuzzyText(length=4, chars=string.ascii_lowercase, suffix="d")
-    case_name = Faker("case_name")
-    docket_entries = factory.List(
-        [factory.SubFactory(RssDocketEntryDataFactory)]
-    )
-    docket_number = Faker("federal_district_docket_number")
-    office = Faker("pyint", min_value=1, max_value=100)
-    chapter = Faker("pyint", min_value=1, max_value=100)
-    trustee_str = Faker("text", max_nb_chars=15)
-    type = Faker("text", max_nb_chars=8)
-    pacer_case_id = Faker("random_id_string")
-
-
 class FreeOpinionRowDataFactory(factory.DictFactory):
     case_name = Faker("case_name")
     cause = Faker("text", max_nb_chars=8)
diff --git a/cl/corpus_importer/management/commands/troller_bk.py b/cl/corpus_importer/management/commands/troller_bk.py
deleted file mode 100644
index 054d9e4682..0000000000
--- a/cl/corpus_importer/management/commands/troller_bk.py
+++ /dev/null
@@ -1,864 +0,0 @@
-# Import the troller BK RSS feeds
-import argparse
-import concurrent.futures
-import gc
-import linecache
-import re
-import sys
-import threading
-from collections import defaultdict
-from datetime import datetime, timezone
-from queue import Queue
-from typing import Any, DefaultDict, Mapping, TypedDict
-from urllib.parse import unquote
-
-from asgiref.sync import async_to_sync, sync_to_async
-from dateutil.parser import ParserError
-from django.db import DataError, IntegrityError, transaction
-from django.db.models import Q
-from django.utils.text import slugify
-from django.utils.timezone import make_aware
-from juriscraper.pacer import PacerRssFeed
-
-from cl.custom_filters.templatetags.text_filters import best_case_name
-from cl.lib.command_utils import VerboseCommand, logger
-from cl.lib.model_helpers import make_docket_number_core
-from cl.lib.pacer import map_pacer_to_cl_id
-from cl.lib.redis_utils import get_redis_interface
-from cl.lib.storage import S3PrivateUUIDStorage
-from cl.lib.string_utils import trunc
-from cl.lib.timezone_helpers import 
localize_date_and_time -from cl.recap.mergers import ( - add_bankruptcy_data_to_docket, - calculate_recap_sequence_numbers, - find_docket_object, - update_docket_metadata, -) -from cl.recap_rss.tasks import ( - cache_hash, - get_last_build_date, - hash_item, - is_cached, -) -from cl.search.models import Court, Docket, DocketEntry, RECAPDocument - -FILES_BUFFER_THRESHOLD = 3 - - -async def check_for_early_termination( - court_id: str, docket: dict[str, Any] -) -> str | None: - """Check for early termination, skip the rest of the file in case a cached - item is reached or skip a single item if it doesn't contain required data. - Cache the current item. - - :param court_id: The court the docket entries belong to. - :param docket: A dict containing the item data. - :return: A "break" string indicating if the rest of the file should be - omitted, "continue" if only the current item should be omitted or None. - """ - item_hash = hash_item(docket) - if await is_cached(item_hash): - logger.info( - f"Hit a cached item, finishing adding bulk entries for {court_id} feed. " - ) - return "break" - - await cache_hash(item_hash) - if ( - not docket["pacer_case_id"] - and not docket["docket_number"] - or not len(docket["docket_entries"]) - ): - return "continue" - return None - - -def add_new_docket_from_rss( - court_id: str, - d: Docket, - docket: dict[str, Any], - unique_dockets: dict[str, Any], - dockets_to_create: list[Docket], -) -> None: - """Set metadata and extra values to the Docket object and append it to - the list of dockets to be added in bulk. - - :param court_id: The court the docket entries belong to. - :param d: The Docket object to modify and add. - :param docket: The dict containing the item data. - :param unique_dockets: The dict to keep track of unique dockets to add. - :param dockets_to_create: The list of dockets to add in bulk. - :return: None - """ - - date_filed, time_filed = localize_date_and_time( - court_id, docket["docket_entries"][0]["date_filed"] - ) - async_to_sync(update_docket_metadata)(d, docket) - d.pacer_case_id = docket["pacer_case_id"] - d.slug = slugify(trunc(best_case_name(d), 75)) - d.date_last_filing = date_filed - if d.docket_number: - d.docket_number_core = make_docket_number_core(d.docket_number) - - docket_in_list = unique_dockets.get(docket["docket_number"], None) - if not docket_in_list: - unique_dockets[docket["docket_number"]] = docket - dockets_to_create.append(d) - - -def do_bulk_additions( - court_id: str, - unique_dockets: dict[str, Any], - dockets_to_create: list[Docket], - des_to_add_no_existing_docket: DefaultDict[str, list[dict[str, Any]]], - des_to_add_existing_docket: list[tuple[int, dict[str, Any]]], -) -> tuple[list[int], int]: - """Create dockets, docket entries and recap documents in bulk. - - :param court_id: The court the docket entries belong to. - :param unique_dockets: The dict to keep track of unique dockets to add. - :param dockets_to_create: The list of dockets to add in bulk. - :param des_to_add_no_existing_docket: A DefaultDict containing entries to - add which its parent docket didn't exist, docket_number: [entries] - :param des_to_add_existing_docket: A list of tuples containing entries to - add which its parent docket exists, (docket.pk, docket_entry) - :return: A tuple containing a list of created recap documents pks, the - number of dockets created. - """ - - with transaction.atomic(): - # Create dockets in bulk. - d_bulk_created = Docket.objects.bulk_create(dockets_to_create) - - # Add bankruptcy data to dockets. 
- for d in d_bulk_created: - docket_data = unique_dockets.get(d.docket_number) - if docket_data: - add_bankruptcy_data_to_docket(d, docket_data) - - # Find and assign the created docket pk to the list of docket entries - # to add. - for d_created in d_bulk_created: - docket_number = d_created.docket_number - des_to_create = des_to_add_no_existing_docket[docket_number] - for de_entry in des_to_create: - des_to_add_existing_docket.append((d_created.pk, de_entry)) - - # Create docket entries in bulk. - docket_entries_to_add_bulk = get_docket_entries_to_add( - court_id, des_to_add_existing_docket - ) - des_bulk_created = DocketEntry.objects.bulk_create( - docket_entries_to_add_bulk - ) - - # Create RECAP documents in bulk. - rds_to_create_bulk = get_rds_to_add( - des_bulk_created, des_to_add_existing_docket - ) - rd_bulk_created = RECAPDocument.objects.bulk_create(rds_to_create_bulk) - - return [rd.pk for rd in rd_bulk_created], len(d_bulk_created) - - -def get_docket_entries_to_add( - court_id: str, des_to_add_existing_docket: list[tuple[int, dict[str, Any]]] -) -> list[DocketEntry]: - """Make and return a list of the DocketEntry objects to save in bulk. - - :param court_id: The court the docket entries belong to. - :param des_to_add_existing_docket: A list of tuples containing the docket - pk the entry belongs to, the docket entry dict. - :return: A list of DocketEntry objects. - """ - - docket_entries_to_add_bulk = [] - for de_add in des_to_add_existing_docket: - d_pk = de_add[0] - docket_entry = de_add[1] - calculate_recap_sequence_numbers([docket_entry], court_id) - date_filed, time_filed = localize_date_and_time( - court_id, docket_entry["date_filed"] - ) - de_to_add = DocketEntry( - docket_id=d_pk, - entry_number=docket_entry["document_number"], - description=docket_entry["description"], - pacer_sequence_number=docket_entry["pacer_seq_no"], - recap_sequence_number=docket_entry["recap_sequence_number"], - time_filed=time_filed, - date_filed=date_filed, - ) - docket_entries_to_add_bulk.append(de_to_add) - return docket_entries_to_add_bulk - - -def get_rds_to_add( - des_bulk_created: list[DocketEntry], - des_to_add_existing_docket: list[tuple[int, dict[str, Any]]], -) -> list[RECAPDocument]: - """Make and return a list of the RECAPDocument objects to save in bulk. - - :param des_bulk_created: The list of DocketEntry objects saved in a - previous step. - :param des_to_add_existing_docket: A list of tuples containing the docket - pk the entry belongs to, the docket entry dict. - :return: A list of RECAPDocument objects. - """ - - rds_to_create_bulk = [] - for d_entry, bulk_created in zip( - des_to_add_existing_docket, des_bulk_created - ): - de_pk = bulk_created.pk - docket_entry = d_entry[1] - document_number = docket_entry["document_number"] or "" - rd = RECAPDocument( - docket_entry_id=de_pk, - document_number=document_number, - description=docket_entry["short_description"], - document_type=RECAPDocument.PACER_DOCUMENT, - pacer_doc_id=docket_entry["pacer_doc_id"], - is_available=False, - ) - rds_to_create_bulk.append(rd) - - return rds_to_create_bulk - - -async def merge_rss_data( - feed_data: list[dict[str, Any]], - court_id: str, - build_date: datetime | None, -) -> tuple[list[int], int]: - """Merge the RSS data into the database - - :param feed_data: Data from an RSS feed file - :param court_id: The PACER court ID for the item - :param build_date: The RSS date build. 
- :return: A list of RECAPDocument PKs that can be passed to Solr - """ - - court_id = map_pacer_to_cl_id(court_id) - court = await Court.objects.aget(pk=court_id) - dockets_created = 0 - all_rds_created: list[int] = [] - court_ids = ( - Court.federal_courts.district_or_bankruptcy_pacer_courts().values_list( - "pk", flat=True - ) - ) - courts_exceptions_no_rss = ["miwb", "nceb", "pamd", "cit"] - if ( - build_date - and build_date - > make_aware(datetime(year=2018, month=4, day=20), timezone.utc) - and await court_ids.filter(id=court_id).aexists() - and court_id not in courts_exceptions_no_rss - ): - # Avoid parsing/adding feeds after we start scraping RSS Feeds for - # district and bankruptcy courts. - return all_rds_created, dockets_created - - dockets_to_create: list[Docket] = [] - unique_dockets: dict[str, Any] = {} - des_to_add_existing_docket: list[tuple[int, dict[str, Any]]] = [] - des_to_add_no_existing_docket: DefaultDict[str, list[dict[str, Any]]] = ( - defaultdict(list) - ) - for docket in feed_data: - skip_or_break = await check_for_early_termination(court_id, docket) - if skip_or_break == "continue": - continue - elif skip_or_break == "break": - break - - d = await find_docket_object( - court_id, - docket["pacer_case_id"], - docket["docket_number"], - docket.get("federal_defendant_number"), - docket.get("federal_dn_judge_initials_assigned"), - docket.get("federal_dn_judge_initials_referred"), - ) - docket_entry = docket["docket_entries"][0] - document_number = docket["docket_entries"][0]["document_number"] - if ( - document_number - and d.pk - and await d.docket_entries.filter( - entry_number=document_number - ).aexists() - ): - # It's an existing docket entry; let's not add it. - continue - else: - # Try finding the docket entry by short_description. - short_description = docket_entry["short_description"] - query = Q() - if short_description: - query |= Q( - recap_documents__description=docket_entry[ - "short_description" - ] - ) - if ( - d.pk - and await d.docket_entries.filter( - query, - date_filed=docket_entry["date_filed"], - entry_number=docket_entry["document_number"], - ).aexists() - ): - # It's an existing docket entry; let's not add it. - continue - - d.add_recap_source() - if not d.pk: - # Set metadata for the new docket and append the docket and entry - # to the list to add in bulk. - if ( - not docket["pacer_case_id"] - and court.jurisdiction != Court.FEDERAL_APPELLATE - ): - # Avoid adding the docket if it belongs to a district/bankr - # court and doesn't have a pacer_case_id - continue - - await sync_to_async(add_new_docket_from_rss)( - court_id, - d, - docket, - unique_dockets, - dockets_to_create, - ) - # Append docket entries to add in bulk. - des_to_add_no_existing_docket[docket["docket_number"]].append( - docket_entry - ) - else: - # Existing docket, update source, add bankr data and append the - # docket entry to add in bulk. - des_to_add_existing_docket.append((d.pk, docket_entry)) - try: - await d.asave(update_fields=["source"]) - await sync_to_async(add_bankruptcy_data_to_docket)(d, docket) - except (DataError, IntegrityError): - # Trouble. Log and move on - logger.warn( - "Got DataError or IntegrityError while saving docket." - ) - - rds_created_pks, dockets_created = await sync_to_async(do_bulk_additions)( - court_id, - unique_dockets, - dockets_to_create, - des_to_add_no_existing_docket, - des_to_add_existing_docket, - ) - all_rds_created.extend(rds_created_pks) - logger.info( - f"Finished adding {court_id} feed. Added {len(all_rds_created)} RDs." 
- ) - return all_rds_created, dockets_created - - -def parse_file( - binary_content: bytes, - court_id: str, -) -> tuple[Any, datetime | None]: - """Parse a RSS file and return the data. - - :param binary_content: The binary content of the file to parse. - :param court_id: The PACER court ID for the item - :return The parsed data from the retrieved XML feed. - """ - - feed = PacerRssFeed(court_id) - content = binary_content.decode("utf-8") - feed._parse_text(content) - build_date = get_last_build_date(binary_content) - return feed.data, build_date - - -def get_court_from_line(line: str): - """Get the court_id from the line. - - This is a bit annoying. Each file name looks something like: - - sources/troller-files/o-894|1599853056 - sources/troller-files/w-w-894|1599853056 - sources/troller-files/o-DCCF0395-BDBA-C444-149D8D8EFA2EC03D|1576082101 - sources/troller-files/w-88AC552F-BDBA-C444-1BD52598BA252265|1435103773 - sources/troller-files/w-w-DCCF049E-BDBA-C444-107C577164350B1E|1638858935 - sources/troller-files/w-88AC552F-BDBA-C444-1BD52598BA252265-1399913581 - sources/troller-files/w-w-Mariana|1638779760 - - The court_id is based on the part between the "/o-" and the "|" or "-". - Match it, look it up in our table of court IDs, and return the correct PACER ID. - - :param line: A line to a file in S3 - :return: The PACER court ID for the feed - """ - - court = None - regex = re.compile( - r"([A-Z0-9]{8}-[A-Z0-9]{4}-[A-Z0-9]{4}-[A-Z0-9]{16})|-([0-9]{3})\||-([0-9]{3})-|(Mariana)" - ) - match = re.search(regex, line) - if match is None: - return None - if match.group(1): - court = match.group(1) - if match.group(2): - court = match.group(2) - if match.group(3): - court = match.group(3) - if match.group(4): - court = match.group(4) - - if not court: - return None - return troller_ids.get(court, None) - - -class OptionsType(TypedDict): - offset: int - limit: int - file: str - - -def log_added_items_to_redis( - dockets_created: int, rds_created: int, line: int -) -> Mapping[str | bytes, int | str]: - """Log the number of dockets and recap documents created to redis. - Get the previous stored values and add the new ones. - - :param dockets_created: The dockets created. - :param rds_created: The recap documents created. - :param line: The last line imported. - :return: The data logged to redis. - """ - - r = get_redis_interface("STATS") - pipe = r.pipeline() - log_key = "troller_bk:log" - pipe.hgetall(log_key) - stored_values = pipe.execute() - current_total_dockets = int(stored_values[0].get("total_dockets", 0)) - current_total_rds = int(stored_values[0].get("total_rds", 0)) - - total_dockets_created = dockets_created + current_total_dockets - total_rds_created = rds_created + current_total_rds - log_info: Mapping[str | bytes, int | str] = { - "total_dockets": total_dockets_created, - "total_rds": total_rds_created, - "last_line": line, - "date_time": datetime.now().isoformat(), - } - pipe.hset(log_key, mapping=log_info) - pipe.expire(log_key, 60 * 60 * 24 * 28) # 4 weeks - pipe.execute() - return log_info - - -def download_file(item_path: str, order: int) -> tuple[bytes, str, int]: - """Small wrapper to download and read a file from S3. - :param item_path: The file path to download. - :param order: The original order of the file to keep in the queue. - :return: A tuple of the binary content of the file, the file path and the - file order. 
- """ - bucket = S3PrivateUUIDStorage() - with bucket.open(item_path, mode="rb") as f: - binary_content = f.read() - return binary_content, item_path, order - - -def download_files_from_paths( - item_paths: list[str], - files_queue: Queue, - last_thread: threading.Thread | None, -) -> None: - """Download multiple files concurrently and store them to a Queue. - :param item_paths: The list of file paths to download. - :param files_queue: The Queue where store the downloaded files. - :param last_thread: The previous thread launched. - :return: None - """ - - order = 0 - with concurrent.futures.ThreadPoolExecutor() as executor: - concurrent_downloads = [] - for item_path in item_paths: - concurrent_downloads.append( - executor.submit(download_file, item_path, order) - ) - order += 1 - - # Wait for all the downloads to complete. - completed_downloads = list( - concurrent.futures.as_completed(concurrent_downloads) - ) - # Order the downloads to preserver their original chron order. - completed_downloads.sort(key=lambda a: a.result()[2]) - # Add files to the Queue - for download in completed_downloads: - if last_thread: - # # Wait until the last thread completes, so we don't mess up - # the chronological order. - last_thread.join() - files_queue.put(download.result()) - - -def download_files_concurrently( - files_queue: Queue, - file_path: str, - files_downloaded_offset: int, - threads: list[threading.Thread], -) -> int: - """Get the next files to download and start a thread to download them. - :param files_queue: The Queue where store the downloaded files. - :param file_path: The file containing the list of paths to download. - :param files_downloaded_offset: The files that have been already downloaded - :param threads: The list of threads. - :return: The files_downloaded_offset updated - """ - - files_to_download = [] - linecache.clearcache() - linecache.checkcache(file_path) - if files_queue.qsize() < FILES_BUFFER_THRESHOLD - 1: - for j in range(FILES_BUFFER_THRESHOLD): - # Get the next paths to download. - next_line = linecache.getline( - file_path, files_downloaded_offset + 1 - ) - if next_line: - files_to_download.append(unquote(next_line).replace("\n", "")) - files_downloaded_offset += 1 - - # Download the files concurrently. - if files_to_download: - last_thread = None - if threads: - last_thread = threads[-1] - download_thread = threading.Thread( - target=download_files_from_paths, - args=(files_to_download, files_queue, last_thread), - ) - download_thread.start() - threads.append(download_thread) - - return files_downloaded_offset - - -def iterate_and_import_files( - options: OptionsType, threads: list[threading.Thread] -) -> None: - """Iterate over the inventory file and import all new items. - - - Merge into the DB - - Add to solr - - Do not send alerts or webhooks - - Do not touch dockets with entries (troller data is old) - - Do not parse (add) district/bankruptcy courts feeds after 2018-4-20 - that is the RSS feeds started being scraped by RECAP. - - :param options: The command line options - :param threads: A list of Threads. - :return: None - """ - - # Enable automatic garbage collection. 
- gc.enable() - f = open(options["file"], "r", encoding="utf-8") - total_dockets_created = 0 - total_rds_created = 0 - - files_queue: Queue = Queue(maxsize=FILES_BUFFER_THRESHOLD) - files_downloaded_offset = options["offset"] - for i, line in enumerate(f): - if i < options["offset"]: - continue - if i >= options["limit"] > 0: - break - - # If the files_queue has less than FILES_BUFFER_THRESHOLD files, then - # download more files ahead and store them to the queue. - files_downloaded_offset = download_files_concurrently( - files_queue, f.name, files_downloaded_offset, threads - ) - - # Process a file from the queue. - binary, item_path, order = files_queue.get() - court_id = get_court_from_line(item_path) - logger.info(f"Attempting: {item_path=} with {court_id=} \n") - if not court_id: - # Probably a court we don't know - continue - try: - feed_data, build_date = parse_file(binary, court_id) - except ParserError: - logger.info( - f"Skipping: {item_path=} with {court_id=} due to incorrect date format. \n" - ) - continue - rds_for_solr, dockets_created = async_to_sync(merge_rss_data)( - feed_data, court_id, build_date - ) - - total_dockets_created += dockets_created - total_rds_created += len(rds_for_solr) - - # Mark the file as completed and remove it from the queue. - files_queue.task_done() - - # Remove completed download threads from the list of threads. - for thread in threads: - if not thread.is_alive(): - threads.remove(thread) - logger.info(f"Last line imported: {i} \n") - - if not i % 25: - # Log every 25 lines. - log_added_items_to_redis( - total_dockets_created, total_rds_created, i - ) - # Restart counters after logging into redis. - total_dockets_created = 0 - total_rds_created = 0 - - # Ensure garbage collector is called at the end of each iteration. - gc.collect() - f.close() - - -class Command(VerboseCommand): - help = "Import the troller BK RSS files from S3 to the DB" - - def add_arguments(self, parser): - parser.add_argument( - "--offset", - type=int, - default=0, - help="The number of items to skip before beginning. Default is to " - "skip none.", - ) - parser.add_argument( - "--limit", - type=int, - default=0, - help="After doing this number, stop. This number is not additive " - "with the offset parameter. Default is to do all of them.", - ) - parser.add_argument( - "--file", - type=str, - help="Where is the text file that has the list of paths from the " - "bucket? Create this from an S3 inventory file, by removing " - "all but the path column", - ) - - def handle(self, *args, **options): - super().handle(*args, **options) - if not options["file"]: - raise argparse.ArgumentError( - "The 'file' argument is required for that action." 
- ) - - threads = [] - try: - iterate_and_import_files(options, threads) - except KeyboardInterrupt: - logger.info("The importer has stopped, waiting threads to exit.") - for thread in threads: - thread.join() - sys.exit(1) - - -troller_ids = { - "88AC552F-BDBA-C444-1BD52598BA252265": "nmb", - "DCCF0395-BDBA-C444-149D8D8EFA2EC03D": "almb", - "DCCF03A4-BDBA-C444-13AFEC481CF81C91": "alnb", - "DCCF03B4-BDBA-C444-180877EB555CF90A": "alsb", - "DCCF03C3-BDBA-C444-10B70B118120A4F8": "akb", - "DCCF03D3-BDBA-C444-1EA2D2D99D26D437": "azb", - "DCCF03E3-BDBA-C444-11C3D8B9C688D49E": "areb", - "DCCF03F2-BDBA-C444-14974FDC2C6DD113": "arwb", - "DCCF0412-BDBA-C444-1C60416590832545": "cacb", - "DCCF0421-BDBA-C444-12F451A14D4239AC": "caeb", - "DCCF0431-BDBA-C444-1CE9AB1898357D63": "canb", - "DCCF0440-BDBA-C444-1C8FEECE5B5AD482": "casb", - "DCCF0460-BDBA-C444-1282B46DCB6DF058": "cob", - "DCCF046F-BDBA-C444-126D999DD997D9A5": "ctb", - "DCCF047F-BDBA-C444-16EA4D3A7417C840": "deb", - "DCCF048F-BDBA-C444-12505144CA111B75": "dcb", - "DCCF049E-BDBA-C444-107C577164350B1E": "flmb", - "DCCF04BD-BDBA-C444-17B566BCA4E30864": "flnb", - "DCCF04CD-BDBA-C444-13315D191ADF5852": "flsb", - "DCCF04DD-BDBA-C444-11B09E58A8308286": "gamb", - "DCCF04EC-BDBA-C444-113648D978F0FF3B": "ganb", - "DCCF04FC-BDBA-C444-167F8376D8DF181B": "gasb", - "DCCF050C-BDBA-C444-1191B98D5C279255": "gub", - "DCCF051B-BDBA-C444-10E608B4E279AE73": "hib", - "DCCF052B-BDBA-C444-1128ADF2BE776FF5": "idb", - "DCCF053A-BDBA-C444-1E17C5EDDAAA98B3": "ilcb", - "DCCF055A-BDBA-C444-1B33BEAA267C9EF3": "ilnb", - "DCCF0569-BDBA-C444-10AAC89D6254827B": "ilsb", - "DCCF0579-BDBA-C444-13FDD2CBFCA0428E": "innb", - "DCCF0589-BDBA-C444-1403298F660F3248": "insb", - "DCCF0598-BDBA-C444-1D4AA3760C808AC6": "ianb", - "DCCF05A8-BDBA-C444-147676B19FFD9A64": "iasb", - "DCCF05B7-BDBA-C444-1159BABEABFF7AD8": "ksb", - "DCCF05C7-BDBA-C444-181132DD188F5B98": "kyeb", - "DCCF05D7-BDBA-C444-173EA852DA3C02F3": "kywb", - "DCCF05E6-BDBA-C444-1BBCF61EC04D7339": "laeb", - "DCCF05F6-BDBA-C444-1CC8B0B3A0BA9BBE": "lamb", - "DCCF0606-BDBA-C444-156EC6BFC06D300C": "lawb", - "DCCF0615-BDBA-C444-12DA3916397575D1": "meb", - "DCCF0625-BDBA-C444-16B46E54DD6D2B3F": "mdb", - "DCCF0634-BDBA-C444-172D1B61491F44EB": "mab", - "DCCF0644-BDBA-C444-16D30512F57AD7E7": "mieb", - "DCCF0654-BDBA-C444-1B26AFB780F7E57D": "miwb", - "DCCF0663-BDBA-C444-1E2D50E14B7E69B6": "mnb", - "DCCF0673-BDBA-C444-162C60670DF8F3CC": "msnb", - "DCCF0683-BDBA-C444-16D08467B7FFD39C": "mssb", - "DCCF0692-BDBA-C444-105A607741D9B25E": "moeb", - "DCCF06B1-BDBA-C444-1D0081621397B587": "mowb", - "DCCF06C1-BDBA-C444-116BC0B37A3105FA": "mtb", - "DCCF06D1-BDBA-C444-16605BEF7E402AFF": "neb", - "DCCF06E0-BDBA-C444-142566FBDE706DF9": "nvb", - "DCCF06F0-BDBA-C444-15CEC5BC7E8811B0": "nhb", - "DCCF0700-BDBA-C444-1833C704F349B4C5": "njb", - "DCCF071F-BDBA-C444-12E80A7584DAB242": "nyeb", - "DCCF072E-BDBA-C444-161CCB961DC28EAA": "nynb", - "DCCF073E-BDBA-C444-195A319E0477A40F": "nysb", - "DCCF075D-BDBA-C444-1A4574BEA4332780": "nywb", - "DCCF076D-BDBA-C444-1D86BA6110EAC8EB": "nceb", - "DCCF077D-BDBA-C444-19E00357E47293C6": "ncmb", - "DCCF078C-BDBA-C444-13A763C27712238D": "ncwb", - "DCCF079C-BDBA-C444-152775C142804DBF": "ndb", - "DCCF07AB-BDBA-C444-1909DD6A1D03789A": "ohnb", - "DCCF07BB-BDBA-C444-15CC4C79DA8F0883": "ohsb", - "DCCF07CB-BDBA-C444-16A03EA3C59A0E65": "okeb", - "DCCF07DA-BDBA-C444-19C1613A6E47E8CC": "oknb", - "DCCF07EA-BDBA-C444-11A55B458254CDA2": "okwb", - "DCCF07FA-BDBA-C444-1931F6C553EEC927": "orb", - "DCCF0819-BDBA-C444-121A57E62D0F901B": "paeb", 
- "DCCF0838-BDBA-C444-11578199813DA094": "pamb", - "DCCF0848-BDBA-C444-1FDC44C3E5C7F028": "pawb", - "DCCF0857-BDBA-C444-1249D33530373C4A": "prb", - "DCCF0867-BDBA-C444-11F248F5A172BED7": "rib", - "DCCF0877-BDBA-C444-140D6F0E2517D28A": "scb", - "DCCF0886-BDBA-C444-1FA114144D695156": "sdb", - "DCCF0896-BDBA-C444-19AE23DDBC293010": "tneb", - "DCCF08A5-BDBA-C444-16F88B92DFEFF2D7": "tnmb", - "DCCF08B5-BDBA-C444-1015B0D4FD4EA2BB": "tnwb", - "DCCF08D4-BDBA-C444-17A1F7F9130C2B5A": "txeb", - "DCCF08E4-BDBA-C444-1FF320EDE23FE1C4": "txnb", - "DCCF08F4-BDBA-C444-137D9095312F2A26": "txsb", - "DCCF0903-BDBA-C444-1F1B7B299E8BEDEC": "txwb", - "DCCF0913-BDBA-C444-1426E01E34A098A8": "utb", - "DCCF0922-BDBA-C444-1E7C4839C9DDE0DD": "vtb", - "DCCF0932-BDBA-C444-1E3B6019198C4AF3": "vib", - "DCCF0942-BDBA-C444-15DE36A8BF619EE3": "vaeb", - "DCCF0951-BDBA-C444-156287CAA9B5EA92": "vawb", - "DCCF0961-BDBA-C444-113035CFC50A69B8": "waeb", - "DCCF0971-BDBA-C444-1AE1249D4E72B62E": "wawb", - "DCCF0980-BDBA-C444-12EE39B96F6E2CAD": "wvnb", - "DCCF0990-BDBA-C444-16831E0CC62633BB": "wvsb", - "DCCF099F-BDBA-C444-163A7EEE0EB991F6": "wieb", - "DCCF09BF-BDBA-C444-1D3842A8131499EF": "wiwb", - "DCCF09CE-BDBA-C444-1B4915E476D3A9D2": "wyb", - "Mariana": "nmib", - "640": "almd", - "645": "alsd", - "648": "akd", - "651": "azd", - "653": "ared", - "656": "arwd", - "659": "cacd", - "662": "caed", - "664": "cand", - "667": "casd", - "670": "cod", - "672": "ctd", - "675": "ded", - "678": "dcd", - "681": "flmd", - "686": "flsd", - "689": "gamd", - "696": "gud", - "699": "hid", - "701": "idd", - "704": "ilcd", - "707": "ilnd", - "712": "innd", - "715": "insd", - "717": "iand", - "720": "iasd", - "723": "ksd", - "728": "kywd", - "731": "laed", - "734": "lamd", - "737": "lawd", - "740": "med", - "744": "mad", - "747": "mied", - "750": "miwd", - "757": "mssd", - "759": "moed", - "762": "mowd", - "765": "mtd", - "768": "ned", - "771": "nvd", - "773": "nhd", - "776": "njd", - "779": "nmd", - "781": "nyed", - "784": "nynd", - "787": "nysd", - "792": "nced", - "795": "ncmd", - "798": "ncwd", - "803": "nmid", - "806": "ohnd", - "811": "ohsd", - "818": "okwd", - "821": "ord", - "823": "paed", - "826": "pamd", - "829": "pawd", - "832": "prd", - "835": "rid", - "840": "sdd", - "843": "tned", - "846": "tnmd", - "849": "tnwd", - "851": "txed", - "854": "txnd", - "856": "txsd", - "859": "txwd", - "862": "utd", - "865": "vtd", - "868": "vid", - "873": "vawd", - "876": "waed", - "879": "wawd", - "882": "wvnd", - "885": "wvsd", - "888": "wied", - "891": "wiwd", - "894": "wyd", - # Appellate - "609": "ca6", - "619": "ca10", - "625": "cadc", - "628": "cafc", - # I don't think we currently crawl these. Worth checking. - "633": "uscfc", - "636": "cit", -} diff --git a/cl/corpus_importer/tasks.py b/cl/corpus_importer/tasks.py index a00a5e4448..cf242656fa 100644 --- a/cl/corpus_importer/tasks.py +++ b/cl/corpus_importer/tasks.py @@ -1614,7 +1614,7 @@ def get_docket_by_pacer_case_id( :param tag_names: A list of tag names that should be stored with the item in the DB. :param kwargs: A variety of keyword args to pass to DocketReport.query(). - :return: A dict indicating if we need to update Solr. + :return: A dict indicating if we need to update the search engine. """ if data is None: logger.info("Empty data argument. 
Terminating chains and exiting.") diff --git a/cl/corpus_importer/tests.py b/cl/corpus_importer/tests.py index c26207e871..b631fc3829 100644 --- a/cl/corpus_importer/tests.py +++ b/cl/corpus_importer/tests.py @@ -27,8 +27,6 @@ CaseLawCourtFactory, CaseLawFactory, CitationFactory, - RssDocketDataFactory, - RssDocketEntryDataFactory, ) from cl.corpus_importer.import_columbia.columbia_utils import fix_xml_tags from cl.corpus_importer.import_columbia.parse_opinions import ( @@ -57,11 +55,6 @@ normalize_authors_in_opinions, normalize_panel_in_opinioncluster, ) -from cl.corpus_importer.management.commands.troller_bk import ( - download_files_concurrently, - log_added_items_to_redis, - merge_rss_data, -) from cl.corpus_importer.management.commands.update_casenames_wl_dataset import ( check_case_names_match, parse_citations, @@ -90,7 +83,6 @@ ) from cl.lib.pacer import process_docket_data from cl.lib.redis_utils import get_redis_interface -from cl.lib.timezone_helpers import localize_date_and_time from cl.people_db.factories import PersonWithChildrenFactory, PositionFactory from cl.people_db.lookup_utils import ( extract_judge_last_name, @@ -99,22 +91,18 @@ ) from cl.people_db.models import Attorney, AttorneyOrganization, Party from cl.recap.models import UPLOAD_TYPE, PacerHtmlFiles -from cl.recap_rss.models import RssItemCache from cl.scrapers.models import PACERFreeDocumentRow from cl.search.factories import ( CourtFactory, - DocketEntryWithParentsFactory, DocketFactory, OpinionClusterFactory, OpinionClusterFactoryMultipleOpinions, OpinionClusterFactoryWithChildrenAndParents, OpinionClusterWithParentsFactory, OpinionWithChildrenFactory, - RECAPDocumentFactory, ) from cl.search.models import ( SOURCES, - BankruptcyInformation, Citation, Court, Docket, @@ -1120,1281 +1108,6 @@ def test_normalize_panel_str(self): self.assertEqual(len(cluster.panel.all()), 2) -def mock_download_file(item_path, order): - time.sleep(randint(1, 10) / 100) - return b"", item_path, order - - -class TrollerBKTests(TestCase): - @classmethod - def setUpTestData(cls) -> None: - # District factories - cls.court = CourtFactory(id="canb", jurisdiction="FB") - cls.court_neb = CourtFactory(id="nebraskab", jurisdiction="FD") - cls.court_pamd = CourtFactory(id="pamd", jurisdiction="FD") - cls.docket_d_before_2018 = DocketFactory( - case_name="Young v. State", - docket_number="3:17-CV-01477", - court=cls.court, - source=Docket.HARVARD, - pacer_case_id="1234", - ) - - cls.docket_d_after_2018 = DocketFactory( - case_name="Dragon v. State", - docket_number="3:15-CV-01455", - court=cls.court, - source=Docket.HARVARD, - pacer_case_id="5431", - ) - - cls.de_d_before_2018 = DocketEntryWithParentsFactory( - docket__court=cls.court, - docket__case_name="Young Entry v. Dragon", - docket__docket_number="3:87-CV-01400", - docket__source=Docket.HARVARD, - docket__pacer_case_id="9038", - entry_number=1, - date_filed=make_aware( - datetime(year=2018, month=1, day=4), timezone.utc - ), - ) - - # Appellate factories - cls.court_appellate = CourtFactory(id="ca1", jurisdiction="F") - cls.docket_a_before_2018 = DocketFactory( - case_name="Young v. State", - docket_number="12-2532", - court=cls.court_appellate, - source=Docket.HARVARD, - pacer_case_id=None, - ) - cls.docket_a_after_2018 = DocketFactory( - case_name="Dragon v. 
State", - docket_number="15-1232", - court=cls.court_appellate, - source=Docket.HARVARD, - pacer_case_id=None, - ) - cls.de_a_before_2018 = DocketEntryWithParentsFactory( - docket__court=cls.court_appellate, - docket__case_name="Young Entry v. Dragon", - docket__docket_number="12-3242", - docket__source=Docket.HARVARD, - docket__pacer_case_id=None, - entry_number=1, - date_filed=make_aware( - datetime(year=2018, month=1, day=4), timezone.utc - ), - ) - cls.docket_a_2018_case_id = DocketFactory( - case_name="Young v. State", - docket_number="12-5674", - court=cls.court_appellate, - source=Docket.RECAP, - pacer_case_id="12524", - ) - - @classmethod - def restart_troller_log(cls): - r = get_redis_interface("STATS") - key = r.keys("troller_bk:log") - if key: - r.delete(*key) - - def setUp(self) -> None: - self.restart_troller_log() - - def test_merge_district_rss_before_2018(self): - """1 Test merge district RSS file before 2018-4-20 into an existing - docket - - Before 2018-4-20 - District - Docket exists - No docket entries - - Merge docket entries, avoid updating metadata. - """ - d_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court.pk, - case_name="Young v. Dragon", - docket_number="3:17-CV-01473", - pacer_case_id="1234", - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2017, month=1, day=4), timezone.utc - ) - ) - ], - ) - - build_date = d_rss_data_before_2018["docket_entries"][0]["date_filed"] - self.assertEqual( - len(self.docket_d_before_2018.docket_entries.all()), 0 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_before_2018], self.court.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 0) - self.docket_d_before_2018.refresh_from_db() - self.assertEqual(self.docket_d_before_2018.case_name, "Young v. State") - self.assertEqual( - self.docket_d_before_2018.docket_number, "3:17-CV-01477" - ) - self.assertEqual( - len(self.docket_d_before_2018.docket_entries.all()), 1 - ) - self.assertEqual( - self.docket_d_before_2018.source, Docket.HARVARD_AND_RECAP - ) - - def test_avoid_merging_district_rss_after_2018(self): - """2 Test avoid merging district RSS file after 2018-4-20 - - After 2018-4-20 - District - Docket exists - No docket entries - - Don't merge docket entries, avoid updating metadata. - """ - d_rss_data_after_2018 = RssDocketDataFactory( - court_id=self.court.pk, - case_name="Dragon 1 v. State", - docket_number="3:15-CV-01456", - pacer_case_id="5431", - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2018, month=4, day=21), timezone.utc - ) - ) - ], - ) - - build_date = d_rss_data_after_2018["docket_entries"][0]["date_filed"] - self.assertEqual(len(self.docket_d_after_2018.docket_entries.all()), 0) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_after_2018], self.court.pk, build_date - ) - self.assertEqual(len(rds_created), 0) - self.assertEqual(d_created, 0) - self.docket_d_after_2018.refresh_from_db() - self.assertEqual(self.docket_d_after_2018.case_name, "Dragon v. 
State") - self.assertEqual( - self.docket_d_after_2018.docket_number, "3:15-CV-01455" - ) - self.assertEqual(len(self.docket_d_after_2018.docket_entries.all()), 0) - self.assertEqual(self.docket_d_after_2018.source, Docket.HARVARD) - - def test_merge_district_courts_rss_exceptions_after_2018(self): - """Test merging district RSS exceptions after 2018-4-20 - - After 2018-4-20 - District ["miwb", "nceb", "pamd", "cit"] - Docket doesn't exists - No docket entries - - Create docket, merge docket entries. - """ - d_rss_data_after_2018 = RssDocketDataFactory( - court_id=self.court_pamd.pk, - case_name="Dragon 1 v. State", - docket_number="3:15-CV-01456", - pacer_case_id="54312", - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2018, month=4, day=21), timezone.utc - ) - ) - ], - ) - - build_date = d_rss_data_after_2018["docket_entries"][0]["date_filed"] - self.assertEqual(len(self.docket_d_after_2018.docket_entries.all()), 0) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_after_2018], self.court_pamd.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 1) - - docket = Docket.objects.get(pacer_case_id="54312") - self.assertEqual(docket.case_name, "Dragon 1 v. State") - self.assertEqual(docket.docket_number, "3:15-CV-01456") - - def test_merging_district_docket_with_entries_before_2018(self): - """3 Test merge district RSS file before 2018-4-20 into a - docket with entries. - - Before 2018-4-20 - District - Docket exists - Docket entries - - Only merge entry if it doesn't exist, avoid updating metadata. - """ - d_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court.pk, - case_name="Young v. Dragon", - docket_number="3:17-CV-01473", - pacer_case_id="9038", - docket_entries=[ - RssDocketEntryDataFactory( - document_number="2", - date_filed=make_aware( - datetime(year=2017, month=1, day=4), timezone.utc - ), - ) - ], - ) - - build_date = d_rss_data_before_2018["docket_entries"][0]["date_filed"] - self.assertEqual( - len(self.de_d_before_2018.docket.docket_entries.all()), 1 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_before_2018], self.court.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 0) - self.de_d_before_2018.refresh_from_db() - self.assertEqual( - self.de_d_before_2018.docket.case_name, "Young Entry v. Dragon" - ) - self.assertEqual( - self.de_d_before_2018.docket.docket_number, "3:87-CV-01400" - ) - self.assertEqual( - len(self.de_d_before_2018.docket.docket_entries.all()), 2 - ) - self.assertEqual( - self.de_d_before_2018.docket.source, Docket.HARVARD_AND_RECAP - ) - - def test_avoid_merging_updating_docket_item_without_docket_entries( - self, - ): - """Test avoid merging or updating the docket when the RSS item doesn't - contain entries. - - Docket exists - Docket entries - - Avoid updating metadata. - """ - d_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court.pk, - case_name="Young v. 
Dragon", - docket_number="3:17-CV-01473", - pacer_case_id="9038", - docket_entries=[], - ) - - build_date = make_aware( - datetime(year=2017, month=1, day=4), timezone.utc - ) - self.assertEqual( - len(self.de_d_before_2018.docket.docket_entries.all()), 1 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_before_2018], self.court.pk, build_date - ) - self.assertEqual(len(rds_created), 0) - self.assertEqual(d_created, 0) - self.assertEqual(self.de_d_before_2018.docket.source, Docket.HARVARD) - - def test_add_new_district_rss_before_2018(self): - """4 Test adds a district RSS file before 2018-4-20, new docket. - - Before: 2018-4-20 - District - Docket doesn't exist - No docket entries - - Create docket, merge docket entries. - """ - d_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court.pk, - case_name="Youngs v. Dragon", - docket_number="3:20-CV-01473", - pacer_case_id="43562", - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2017, month=1, day=4), timezone.utc - ) - ) - ], - ) - - build_date = d_rss_data_before_2018["docket_entries"][0]["date_filed"] - dockets = Docket.objects.filter(pacer_case_id="43562") - self.assertEqual(dockets.count(), 0) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_before_2018], self.court.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 1) - self.assertEqual(dockets[0].case_name, "Youngs v. Dragon") - self.assertEqual(dockets[0].docket_number, "3:20-CV-01473") - self.assertEqual(len(dockets[0].docket_entries.all()), 1) - self.assertEqual(dockets[0].source, Docket.RECAP) - - def test_avoid_merging_rss_docket_with_entries_district_after_2018(self): - """5 Test avoid merging district RSS file after 2018-4-20 into a - docket with entries. - - After 2018-4-20 - District - Docket exists - Docket entries - - Don't merge docket entries, avoid updating metadata. - """ - d_rss_data_after_2018 = RssDocketDataFactory( - court_id=self.court.pk, - case_name="Young v. Dragons 2", - docket_number="3:57-CV-01453", - pacer_case_id="9038", - docket_entries=[ - RssDocketEntryDataFactory( - document_number="2", - date_filed=make_aware( - datetime(year=2019, month=1, day=4), timezone.utc - ), - ) - ], - ) - - build_date = d_rss_data_after_2018["docket_entries"][0]["date_filed"] - self.assertEqual( - len(self.de_d_before_2018.docket.docket_entries.all()), 1 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_after_2018], self.court.pk, build_date - ) - self.assertEqual(len(rds_created), 0) - self.assertEqual(d_created, 0) - self.de_d_before_2018.refresh_from_db() - self.assertEqual( - self.de_d_before_2018.docket.case_name, "Young Entry v. Dragon" - ) - self.assertEqual( - self.de_d_before_2018.docket.docket_number, "3:87-CV-01400" - ) - self.assertEqual( - len(self.de_d_before_2018.docket.docket_entries.all()), 1 - ) - self.assertEqual(self.de_d_before_2018.docket.source, Docket.HARVARD) - - def test_avoid_adding_new_district_rss_after_2018(self): - """6 Test avoid adding district RSS file after 2018-4-20. - - After 2018-4-20 - District - Docket doesn't exist - No docket entries - - Do not create docket, do not merge docket entries. - """ - d_rss_data_after_2018 = RssDocketDataFactory( - court_id=self.court.pk, - case_name="Youngs v. 
Dragon", - docket_number="3:20-CV-01473", - pacer_case_id="53432", - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2019, month=1, day=4), timezone.utc - ) - ) - ], - ) - - build_date = d_rss_data_after_2018["docket_entries"][0]["date_filed"] - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_after_2018], self.court.pk, build_date - ) - self.assertEqual(len(rds_created), 0) - self.assertEqual(d_created, 0) - - # Appellate - def test_merge_appellate_rss_before_2018(self): - """7 Test merge an appellate RSS file before 2018-4-20 - - Before 2018-4-20 - Appellate - Docket exists - No docket entries - - Merge docket entries, avoid updating metadata. - """ - a_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Young v. Dragon", - docket_number="12-2532", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2017, month=1, day=4), timezone.utc - ) - ) - ], - ) - - build_date = a_rss_data_before_2018["docket_entries"][0]["date_filed"] - self.assertEqual( - len(self.docket_a_before_2018.docket_entries.all()), 0 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [a_rss_data_before_2018], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 0) - self.docket_a_before_2018.refresh_from_db() - self.assertEqual(self.docket_a_before_2018.case_name, "Young v. State") - self.assertEqual(self.docket_a_before_2018.docket_number, "12-2532") - self.assertEqual( - len(self.docket_a_before_2018.docket_entries.all()), 1 - ) - self.assertEqual( - self.docket_a_before_2018.source, Docket.HARVARD_AND_RECAP - ) - - def test_merging_appellate_rss_after_2018(self): - """8 Test appellate RSS file after 2018-4-20 - - After 2018-4-20 - Appellate - Docket exists - No docket entries - - Merge docket entries, avoid updating metadata. - """ - a_rss_data_after_2018 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Dragon 1 v. State", - docket_number="15-1232", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2018, month=4, day=21), timezone.utc - ) - ) - ], - ) - - build_date = a_rss_data_after_2018["docket_entries"][0]["date_filed"] - self.assertEqual(len(self.docket_a_after_2018.docket_entries.all()), 0) - rds_created, d_created = async_to_sync(merge_rss_data)( - [a_rss_data_after_2018], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 0) - self.docket_a_after_2018.refresh_from_db() - self.assertEqual(self.docket_a_after_2018.case_name, "Dragon v. State") - self.assertEqual(self.docket_a_after_2018.docket_number, "15-1232") - self.assertEqual(len(self.docket_a_after_2018.docket_entries.all()), 1) - self.assertEqual( - self.docket_a_after_2018.source, Docket.HARVARD_AND_RECAP - ) - - def test_avoid_merging_existing_appellate_entry_before_2018(self): - """9 Test avoid merging appellate RSS file before 2018-4-20, docket - with entries. - - Before 2018-4-20 - Appellate - Docket exists - Docket entries - - Don't merge docket entries, avoid updating metadata. - """ - a_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Young v. 
Dragon", - docket_number="12-3242", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number="2", - date_filed=make_aware( - datetime(year=2017, month=1, day=4), timezone.utc - ), - ) - ], - ) - - build_date = a_rss_data_before_2018["docket_entries"][0]["date_filed"] - self.assertEqual( - len(self.de_a_before_2018.docket.docket_entries.all()), 1 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [a_rss_data_before_2018], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 0) - self.de_a_before_2018.refresh_from_db() - self.assertEqual( - self.de_a_before_2018.docket.case_name, "Young Entry v. Dragon" - ) - self.assertEqual(self.de_a_before_2018.docket.docket_number, "12-3242") - self.assertEqual( - len(self.de_a_before_2018.docket.docket_entries.all()), 2 - ) - self.assertEqual( - self.de_a_before_2018.docket.source, Docket.HARVARD_AND_RECAP - ) - - def test_merge_new_appellate_rss_before_2018(self): - """10 Merge a new appellate RSS file before 2018-4-20 - - Before: 2018-4-20 - Appellate - Docket doesn't exist - No docket entries - - Create docket, merge docket entries. - """ - a_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Youngs v. Dragon", - docket_number="23-4233", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2017, month=1, day=4), timezone.utc - ) - ) - ], - ) - - build_date = a_rss_data_before_2018["docket_entries"][0]["date_filed"] - dockets = Docket.objects.filter(docket_number="23-4233") - self.assertEqual(dockets.count(), 0) - rds_created, d_created = async_to_sync(merge_rss_data)( - [a_rss_data_before_2018], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 1) - self.assertEqual(dockets[0].case_name, "Youngs v. Dragon") - self.assertEqual(dockets[0].docket_number, "23-4233") - self.assertEqual(len(dockets[0].docket_entries.all()), 1) - self.assertEqual(dockets[0].source, Docket.RECAP) - - def test_avoid_merging_existing_appellate_entry_after_2018(self): - """11 Test avoid merging appellate RSS file after 2018-4-20, docket with - entries. - - After: 2018-4-20 - Appellate - Docket exists - Docket entry exist - - Don't merge the existing entry, avoid updating metadata. - """ - a_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Young v. Dragon", - docket_number="12-3242", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number="1", - date_filed=make_aware( - datetime(year=2019, month=1, day=4), timezone.utc - ), - ) - ], - ) - - build_date = a_rss_data_before_2018["docket_entries"][0]["date_filed"] - self.assertEqual( - len(self.de_a_before_2018.docket.docket_entries.all()), 1 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [a_rss_data_before_2018], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 0) - self.assertEqual(d_created, 0) - - def test_merging_appellate_docket_with_entries_after_2018(self): - """Test merge appellate RSS file after 2018-4-20, docket with - entries. - - After: 2018-4-20 - Appellate - Docket exists - Docket entries - - Only merge entry if it doesn't exist, avoid updating metadata. - """ - a_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Young v. 
Dragon", - docket_number="12-3242", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number="2", - date_filed=make_aware( - datetime(year=2019, month=1, day=4), timezone.utc - ), - ) - ], - ) - - build_date = a_rss_data_before_2018["docket_entries"][0]["date_filed"] - self.assertEqual( - len(self.de_a_before_2018.docket.docket_entries.all()), 1 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [a_rss_data_before_2018], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 0) - self.de_a_before_2018.refresh_from_db() - self.assertEqual( - self.de_a_before_2018.docket.case_name, "Young Entry v. Dragon" - ) - self.assertEqual(self.de_a_before_2018.docket.docket_number, "12-3242") - self.assertEqual( - len(self.de_a_before_2018.docket.docket_entries.all()), 2 - ) - self.assertEqual( - self.de_a_before_2018.docket.source, Docket.HARVARD_AND_RECAP - ) - - def test_merge_new_appellate_rss_after_2018(self): - """12 Merge a new appellate RSS file after 2018-4-20 - - After: 2018-4-20 - Appellate - Docket doesn't exist - No docket entries - - Create docket, merge docket entries, . - """ - - d_rss_data_after_2018 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Youngs v. Dragon", - docket_number="45-3232", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2019, month=1, day=4), timezone.utc - ) - ) - ], - ) - - build_date = d_rss_data_after_2018["docket_entries"][0]["date_filed"] - dockets = Docket.objects.filter(docket_number="45-3232") - self.assertEqual(dockets.count(), 0) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_after_2018], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 1) - self.assertEqual(dockets.count(), 1) - self.assertEqual(dockets[0].case_name, "Youngs v. Dragon") - self.assertEqual(dockets[0].docket_number, "45-3232") - self.assertEqual(len(dockets[0].docket_entries.all()), 1) - self.assertEqual(dockets[0].source, Docket.RECAP) - - def test_merging_appellate_docket_with_entries_case_id(self): - """Test merge an appellate RSS file into a docket with pacer_case_id - Find docket by docket_number_core, avoid duplicating. - Merge docket entries, avoid updating metadata. - """ - a_rss_data_before_2018 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Young v. Dragon", - docket_number="12-5674", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number="2", - date_filed=make_aware( - datetime(year=2019, month=1, day=4), timezone.utc - ), - ) - ], - ) - - build_date = a_rss_data_before_2018["docket_entries"][0]["date_filed"] - self.assertEqual( - len(self.docket_a_2018_case_id.docket_entries.all()), 0 - ) - rds_created, d_created = async_to_sync(merge_rss_data)( - [a_rss_data_before_2018], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 0) - self.docket_a_2018_case_id.refresh_from_db() - self.assertEqual( - self.docket_a_2018_case_id.case_name, "Young v. 
State" - ) - self.assertEqual(self.docket_a_2018_case_id.docket_number, "12-5674") - self.assertEqual(self.docket_a_2018_case_id.pacer_case_id, "12524") - self.assertEqual( - len(self.docket_a_2018_case_id.docket_entries.all()), 1 - ) - self.assertEqual(self.docket_a_2018_case_id.source, Docket.RECAP) - - def test_log_added_items_to_redis(self): - """Can we log dockets and rds added to redis, adding the previous - value? - """ - last_values = log_added_items_to_redis(100, 100, 50) - self.assertEqual(last_values["total_dockets"], 100) - self.assertEqual(last_values["total_rds"], 100) - self.assertEqual(last_values["last_line"], 50) - - last_values = log_added_items_to_redis(50, 80, 100) - self.assertEqual(last_values["total_dockets"], 150) - self.assertEqual(last_values["total_rds"], 180) - self.assertEqual(last_values["last_line"], 100) - - self.restart_troller_log() - - def test_merge_mapped_court_rss_before_2018(self): - """Merge a court mapped RSS file before 2018-4-20 - - before: 2018-4-20 - District neb -> nebraskab - Docket doesn't exist - No docket entries - - Create docket, merge docket entries, verify is assigned to nebraskab. - """ - - d_rss_data_before_2018 = RssDocketDataFactory( - court_id="neb", - case_name="Youngs v. Dragon", - docket_number="3:20-CV-01473", - pacer_case_id="43565", - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2017, month=1, day=4), timezone.utc - ) - ) - ], - ) - - build_date = d_rss_data_before_2018["docket_entries"][0]["date_filed"] - dockets = Docket.objects.filter(docket_number="3:20-CV-01473") - self.assertEqual(dockets.count(), 0) - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_before_2018], "neb", build_date - ) - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 1) - self.assertEqual(dockets.count(), 1) - self.assertEqual(dockets[0].case_name, "Youngs v. Dragon") - self.assertEqual(dockets[0].docket_number, "3:20-CV-01473") - self.assertEqual(len(dockets[0].docket_entries.all()), 1) - self.assertEqual(dockets[0].source, Docket.RECAP) - self.assertEqual(dockets[0].court.pk, "nebraskab") - - def test_avoid_merging_district_mapped_court_rss_after_2018(self): - """Avoid merging a new district RSS file with mapped court - after 2018-4-20. - - After: 2018-4-20 - District neb -> nebraskab - Docket doesn't exist - No docket entries - - Don't merge. - """ - - d_rss_data_after_2018 = RssDocketDataFactory( - court_id="neb", - case_name="Youngs v. Dragon", - docket_number="3:20-CV-01473", - pacer_case_id="43565", - docket_entries=[ - RssDocketEntryDataFactory( - date_filed=make_aware( - datetime(year=2019, month=1, day=4), timezone.utc - ) - ) - ], - ) - build_date = d_rss_data_after_2018["docket_entries"][0]["date_filed"] - rds_created, d_created = async_to_sync(merge_rss_data)( - [d_rss_data_after_2018], "neb", build_date - ) - self.assertEqual(len(rds_created), 0) - self.assertEqual(d_created, 0) - - def test_avoid_updating_docket_entry_metadata(self): - """Test merge appellate RSS file after 2018-4-20, docket with - entries. - - After: 2018-4-20 - Appellate - Docket exists - Docket entries - - Only merge entry if it doesn't exist, avoid updating metadata. - """ - - de_a_unnumbered = DocketEntryWithParentsFactory( - docket__court=self.court_appellate, - docket__case_name="Young Entry v. 
Dragon", - docket__docket_number="12-3245", - docket__source=Docket.HARVARD, - docket__pacer_case_id=None, - entry_number=None, - description="Original docket entry description", - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ) - RECAPDocumentFactory( - docket_entry=de_a_unnumbered, description="Opinion Issued" - ) - - a_rss_data_unnumbered = RssDocketDataFactory( - court_id=self.court_appellate.pk, - case_name="Young v. Dragon", - docket_number="12-3245", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number=None, - description="New docket entry description", - short_description="Opinion Issued", - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ) - ], - ) - build_date = a_rss_data_unnumbered["docket_entries"][0]["date_filed"] - self.assertEqual(len(de_a_unnumbered.docket.docket_entries.all()), 1) - rds_created, d_created = async_to_sync(merge_rss_data)( - [a_rss_data_unnumbered], self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 0) - self.assertEqual(d_created, 0) - de_a_unnumbered.refresh_from_db() - self.assertEqual( - de_a_unnumbered.docket.case_name, "Young Entry v. Dragon" - ) - self.assertEqual(de_a_unnumbered.docket.docket_number, "12-3245") - self.assertEqual( - de_a_unnumbered.description, "Original docket entry description" - ) - self.assertEqual(len(de_a_unnumbered.docket.docket_entries.all()), 1) - self.assertEqual( - de_a_unnumbered.date_filed, - datetime(year=2018, month=1, day=4).date(), - ) - self.assertEqual(de_a_unnumbered.docket.source, Docket.HARVARD) - - @patch("cl.corpus_importer.management.commands.troller_bk.logger") - def test_avoid_cached_items(self, mock_logger): - """Can we skip a whole file when a cached item is hit?""" - - a_rss_data_0 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - docket_number="12-3247", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number=1, - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ), - ], - ) - - a_rss_data_1 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - docket_number="12-3245", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number=1, - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ) - ], - ) - a_rss_data_2 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - docket_number="12-3246", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number=1, - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ) - ], - ) - - list_rss_data_1 = [a_rss_data_1, a_rss_data_2] - list_rss_data_2 = [a_rss_data_0, a_rss_data_1] - - cached_items = RssItemCache.objects.all() - self.assertEqual(cached_items.count(), 0) - build_date = a_rss_data_0["docket_entries"][0]["date_filed"] - rds_created, d_created = async_to_sync(merge_rss_data)( - list_rss_data_1, self.court_appellate.pk, build_date - ) - self.assertEqual(len(rds_created), 2) - self.assertEqual(d_created, 2) - self.assertEqual(cached_items.count(), 2) - - # Remove recap_sequence_number from the dict to simulate the same item - del a_rss_data_1["docket_entries"][0]["recap_sequence_number"] - rds_created, d_created = async_to_sync(merge_rss_data)( - list_rss_data_2, self.court_appellate.pk, build_date - ) - - # The file is aborted when a cached item is hit - self.assertEqual(len(rds_created), 1) - self.assertEqual(d_created, 1) - 
self.assertEqual(cached_items.count(), 3) - mock_logger.info.assert_called_with( - f"Finished adding {self.court_appellate.pk} feed. Added {len(rds_created)} RDs." - ) - - @patch( - "cl.corpus_importer.management.commands.troller_bk.download_file", - side_effect=mock_download_file, - ) - def test_download_files_concurrently(self, mock_download): - """Test the download_files_concurrently method to verify proper - fetching of the next paths to download from a file. Concurrently - download these paths and add them to a queue in the original chronological order. - """ - test_dir = ( - Path(settings.INSTALL_ROOT) - / "cl" - / "corpus_importer" - / "test_assets" - ) - import_filename = "import.csv" - import_path = os.path.join(test_dir, import_filename) - - files_queue = Queue() - threads = [] - files_downloaded_offset = 0 - - with open(import_path, "rb") as f: - files_downloaded_offset = download_files_concurrently( - files_queue, f.name, files_downloaded_offset, threads - ) - self.assertEqual(len(threads), 1) - self.assertEqual(files_downloaded_offset, 3) - files_downloaded_offset = download_files_concurrently( - files_queue, f.name, files_downloaded_offset, threads - ) - - for thread in threads: - thread.join() - - self.assertEqual(len(threads), 2) - self.assertEqual(files_downloaded_offset, 6) - self.assertEqual(files_queue.qsize(), 6) - - # Verifies original chronological order. - binary, item_path, order = files_queue.get() - self.assertEqual(order, 0) - self.assertEqual(item_path.split("|")[1], "1575330086") - files_queue.task_done() - - binary, item_path, order = files_queue.get() - self.assertEqual(order, 1) - self.assertEqual(item_path.split("|")[1], "1575333374") - files_queue.task_done() - - binary, item_path, order = files_queue.get() - self.assertEqual(order, 2) - self.assertEqual(item_path.split("|")[1], "1575336978") - files_queue.task_done() - - binary, item_path, order = files_queue.get() - self.assertEqual(order, 0) - self.assertEqual(item_path.split("|")[1], "1575340576") - files_queue.task_done() - - binary, item_path, order = files_queue.get() - self.assertEqual(order, 1) - self.assertEqual(item_path.split("|")[1], "1575344176") - files_queue.task_done() - - binary, item_path, order = files_queue.get() - self.assertEqual(order, 2) - self.assertEqual(item_path.split("|")[1], "1575380176") - files_queue.task_done() - - self.assertEqual(files_queue.qsize(), 0) - - def test_add_objects_in_bulk(self): - """Can we properly add related RSS feed objects in bulk?""" - - a_rss_data_0 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - docket_number="15-3247", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number=1, - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ), - ], - ) - - a_rss_data_1 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - docket_number="15-3245", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number=1, - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ) - ], - ) - a_rss_data_2 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - docket_number="15-3247", - pacer_case_id=None, - docket_entries=[ - RssDocketEntryDataFactory( - document_number=2, - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ) - ], - ) - - a_rss_data_3 = RssDocketDataFactory( - court_id=self.court_appellate.pk, - docket_number="12-2532", - pacer_case_id=None, - docket_entries=[ - 
RssDocketEntryDataFactory( - document_number=5, - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ) - ], - ) - - list_rss_data = [ - a_rss_data_0, - a_rss_data_1, - a_rss_data_2, - a_rss_data_3, - ] - cached_items = RssItemCache.objects.all() - self.assertEqual(cached_items.count(), 0) - - build_date = a_rss_data_0["docket_entries"][0]["date_filed"] - rds_created, d_created = async_to_sync(merge_rss_data)( - list_rss_data, self.court_appellate.pk, build_date - ) - - date_filed, time_filed = localize_date_and_time( - self.court_appellate.pk, build_date - ) - - # Only two dockets created: 15-3247 and 15-3245, 12-2532 already exists - self.assertEqual(d_created, 2) - self.assertEqual(len(rds_created), 4) - - # Compare docket entries and rds created for each docket. - des_to_compare = [("15-3245", 1), ("15-3247", 2), ("12-2532", 1)] - for d_number, de_count in des_to_compare: - docket = Docket.objects.get(docket_number=d_number) - self.assertEqual(len(docket.docket_entries.all()), de_count) - - # For every docket entry there is one recap document created. - docket_entries = docket.docket_entries.all() - for de in docket_entries: - self.assertEqual(len(de.recap_documents.all()), 1) - self.assertEqual(de.time_filed, time_filed) - self.assertEqual(de.date_filed, date_filed) - self.assertNotEqual(de.recap_sequence_number, "") - - # docket_number_core generated for every docket - self.assertNotEqual(docket.docket_number_core, "") - # Slug is generated for every docket - self.assertNotEqual(docket.slug, "") - - # Verify RECAP source is added to existing and new dockets. - if d_number == "12-2532": - self.assertEqual(docket.source, Docket.HARVARD_AND_RECAP) - else: - self.assertEqual(docket.source, Docket.RECAP) - # Confirm date_last_filing is added to each new docket. - self.assertEqual(docket.date_last_filing, date_filed) - - # BankruptcyInformation is added only on new dockets. - bankr_objs_created = BankruptcyInformation.objects.all() - self.assertEqual(len(bankr_objs_created), 3) - - # Compare bankruptcy data is linked correctly to the parent docket. 
- bankr_d_1 = BankruptcyInformation.objects.get(
- docket__docket_number=a_rss_data_0["docket_number"]
- )
- self.assertEqual(bankr_d_1.chapter, str(a_rss_data_0["chapter"]))
- self.assertEqual(
- bankr_d_1.trustee_str, str(a_rss_data_0["trustee_str"])
- )
-
- bankr_d_2 = BankruptcyInformation.objects.get(
- docket__docket_number=a_rss_data_1["docket_number"]
- )
- self.assertEqual(bankr_d_2.chapter, str(a_rss_data_1["chapter"]))
- self.assertEqual(
- bankr_d_2.trustee_str, str(a_rss_data_1["trustee_str"])
- )
-
- bankr_d_3 = BankruptcyInformation.objects.get(
- docket__docket_number=a_rss_data_3["docket_number"]
- )
- self.assertEqual(bankr_d_3.chapter, str(a_rss_data_3["chapter"]))
- self.assertEqual(
- bankr_d_3.trustee_str, str(a_rss_data_3["trustee_str"])
- )
-
- def test_avoid_adding_district_dockets_no_pacer_case_id_in_bulk(self):
- """Can we avoid adding district/bankr dockets that don't have a
- pacer_case_id?"""
-
- a_rss_data_0 = RssDocketDataFactory(
- court_id=self.court_neb.pk,
- docket_number="15-3247",
- pacer_case_id=None,
- docket_entries=[
- RssDocketEntryDataFactory(
- document_number=1,
- date_filed=make_aware(
- datetime(year=2018, month=1, day=5), timezone.utc
- ),
- ),
- ],
- )
-
- a_rss_data_1 = RssDocketDataFactory(
- court_id=self.court_neb.pk,
- docket_number="15-3245",
- pacer_case_id="12345",
- docket_entries=[
- RssDocketEntryDataFactory(
- document_number=1,
- date_filed=make_aware(
- datetime(year=2018, month=1, day=5), timezone.utc
- ),
- )
- ],
- )
-
- list_rss_data = [
- a_rss_data_0,
- a_rss_data_1,
- ]
-
- build_date = a_rss_data_0["docket_entries"][0]["date_filed"]
- rds_created, d_created = async_to_sync(merge_rss_data)(
- list_rss_data, self.court_neb.pk, build_date
- )
-
- # Only one docket created: 15-3245, since 15-3247 doesn't have a pacer_case_id
- self.assertEqual(d_created, 1)
- self.assertEqual(len(rds_created), 1)
-
- # Compare docket entries and rds created for each docket.
- des_to_compare = [("15-3245", 1)]
- for d_number, de_count in des_to_compare:
- docket = Docket.objects.get(docket_number=d_number)
- self.assertEqual(len(docket.docket_entries.all()), de_count)
- # For every docket entry there is one recap document created.
- docket_entries = docket.docket_entries.all()
- for de in docket_entries:
- self.assertEqual(len(de.recap_documents.all()), 1)
- self.assertNotEqual(de.recap_sequence_number, "")
-
- # docket_number_core generated for every docket
- self.assertNotEqual(docket.docket_number_core, "")
- # Slug is generated for every docket
- self.assertNotEqual(docket.slug, "")
- self.assertEqual(docket.source, Docket.RECAP)
-
- # BankruptcyInformation is added only on new dockets.
- bankr_objs_created = BankruptcyInformation.objects.all()
- self.assertEqual(len(bankr_objs_created), 1)
-
- def test_avoid_adding_existing_entries_by_description(self):
- """Can we avoid adding docket entries that already exist, matching
- them by their short description?"""
-
- de = DocketEntryWithParentsFactory(
- docket__court=self.court,
- docket__case_name="Young Entry v. 
Dragon", - docket__docket_number="3:87-CV-01409", - docket__source=Docket.HARVARD, - docket__pacer_case_id="90385", - entry_number=None, - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ) - RECAPDocumentFactory(docket_entry=de, description="Opinion Issued") - a_rss_data_0 = RssDocketDataFactory( - court_id=self.court, - docket_number="3:87-CV-01409", - pacer_case_id="90385", - docket_entries=[ - RssDocketEntryDataFactory( - document_number=None, - short_description="Opinion Issued", - date_filed=make_aware( - datetime(year=2018, month=1, day=5), timezone.utc - ), - ), - ], - ) - list_rss_data = [ - a_rss_data_0, - ] - build_date = a_rss_data_0["docket_entries"][0]["date_filed"] - rds_created, d_created = async_to_sync(merge_rss_data)( - list_rss_data, self.court.pk, build_date - ) - - # No docket entry should be created - self.assertEqual(d_created, 0) - self.assertEqual(len(rds_created), 0) - - @patch( "cl.corpus_importer.management.commands.clean_up_mis_matched_dockets.download_file", side_effect=lambda a: { diff --git a/cl/lib/context_processors.py b/cl/lib/context_processors.py index d5b3957f4a..b92c5070ea 100644 --- a/cl/lib/context_processors.py +++ b/cl/lib/context_processors.py @@ -80,7 +80,7 @@ def inject_settings(request): 'CourtListener has every free opinion and order available in PACER and gets the latest ones every night.', 'Want to learn more about PACER? We have an extensive fact sheet.', 'You can use the link to any RECAP PDF to pull up the docket.', - "We have more than 80 million pages of PACER documents searchable in the RECAP Archive.", + "We have more than 200 million pages of PACER documents searchable in the RECAP Archive.", 'You can create an alert for any docket in the RECAP Archive. Just press the "Get Alerts" button.' % reverse("alert_help"), ) diff --git a/cl/recap/management/commands/reprocess_recap_dockets.py b/cl/recap/management/commands/reprocess_recap_dockets.py index 87d12ab97f..a54ca0cbfa 100644 --- a/cl/recap/management/commands/reprocess_recap_dockets.py +++ b/cl/recap/management/commands/reprocess_recap_dockets.py @@ -11,9 +11,9 @@ from cl.search.models import Docket, RECAPDocument -def extract_unextracted_rds_and_add_to_solr(queue: str) -> None: +def extract_unextracted_rds(queue: str) -> None: """Performs content extraction for all recap documents that need to be - extracted and then add to solr. + extracted. :param queue: The celery queue to use :return: None @@ -74,22 +74,20 @@ def add_arguments(self, parser): ) parser.add_argument( - "--extract-and-add-solr-unextracted-rds", + "--extract-unextracted-rds", action="store_true", default=False, - help="Extract all recap documents that need to be extracted and " - "then add to solr.", + help="Extract all recap documents that need to be extracted.", ) def handle(self, *args, **options): super().handle(*args, **options) - if options["extract_and_add_solr_unextracted_rds"]: + if options["extract_unextracted_rds"]: queue = options["queue"] sys.stdout.write( - "Extracting all recap documents that need extraction and then " - "add to solr. \n" + "Extracting all recap documents that need extraction. 
\n" ) - extract_unextracted_rds_and_add_to_solr(queue) + extract_unextracted_rds(queue) return ds = ( diff --git a/cl/recap/tasks.py b/cl/recap/tasks.py index 364fe5e49c..026b1ca2ef 100644 --- a/cl/recap/tasks.py +++ b/cl/recap/tasks.py @@ -2814,7 +2814,7 @@ def process_recap_email( all_updated_rds += docket_updated.rds_updated if not is_potentially_sealed_entry: - rds_to_extract_add_to_solr = all_attachment_rds + all_created_rds + rds_to_extract = all_attachment_rds + all_created_rds rds_updated_or_created = ( all_attachment_rds + all_created_rds + all_updated_rds ) @@ -2827,13 +2827,13 @@ def process_recap_email( msg = "Successful upload! Nice work." status = PROCESSING_STATUS.SUCCESSFUL else: - rds_to_extract_add_to_solr = [] + rds_to_extract = [] self.request.chain = None msg = "Could not retrieve Docket Entry" status = PROCESSING_STATUS.FAILED async_to_sync(mark_pq_status)(epq, msg, status, "status_message") - return [rd.pk for rd in rds_to_extract_add_to_solr] + return [rd.pk for rd in rds_to_extract] def do_recap_document_fetch(epq: EmailProcessingQueue, user: User) -> None: diff --git a/cl/recap/tests.py b/cl/recap/tests.py index bce48790c8..bb249b6246 100644 --- a/cl/recap/tests.py +++ b/cl/recap/tests.py @@ -79,7 +79,7 @@ clean_up_duplicate_appellate_entries, ) from cl.recap.management.commands.reprocess_recap_dockets import ( - extract_unextracted_rds_and_add_to_solr, + extract_unextracted_rds, ) from cl.recap.mergers import ( add_attorney, @@ -6226,7 +6226,7 @@ def test_extract_missed_recap_documents(self): ] self.assertEqual(len(rd_needs_extraction), 2) - extract_unextracted_rds_and_add_to_solr("celery") + extract_unextracted_rds("celery") rd_needs_extraction_after = [ x.pk @@ -6265,7 +6265,7 @@ def test_clean_up_recap_document_file(self, mock_open): self.assertEqual(rd[0].sha1, "asdfasdfasdfasdfasdfasddf") self.assertEqual(rd[0].date_upload, date_upload) - extract_unextracted_rds_and_add_to_solr("celery") + extract_unextracted_rds("celery") # File related fields should be cleaned up after the failed extraction. self.assertEqual(rd[0].is_available, False) self.assertEqual(rd[0].file_size, None) diff --git a/cl/search/api_utils.py b/cl/search/api_utils.py index 0fa844e770..fd545fc262 100644 --- a/cl/search/api_utils.py +++ b/cl/search/api_utils.py @@ -39,7 +39,7 @@ def get_object_list(request, cd, paginator): - """Perform the Solr work""" + """Perform the search engine work""" # Set the offset value try: page_number = int(request.GET.get(paginator.page_query_param, 1)) diff --git a/cl/settings/third_party/rest_framework.py b/cl/settings/third_party/rest_framework.py index 2c0f04163e..762a412b21 100644 --- a/cl/settings/third_party/rest_framework.py +++ b/cl/settings/third_party/rest_framework.py @@ -28,7 +28,15 @@ }, "OVERRIDE_THROTTLE_RATES": { # Throttling down. + # Multiple accounts + "JamesBond": "1/hour", + "JackSparrow": "1/hour", + "PeterPan": "1/hour", + "HomerSimpson": "1/hour", + "BruceWayne": "1/hour", # Unresponsive + "court_test_account": "1/hour", + "jmmckinnie": "1/hour", "projecttesting": "1/hour", "SAGW": "1/hour", # Bounced diff --git a/cl/users/templates/emails/welcome_email.txt b/cl/users/templates/emails/welcome_email.txt index bf86d70402..174f4157a3 100644 --- a/cl/users/templates/emails/welcome_email.txt +++ b/cl/users/templates/emails/welcome_email.txt @@ -1,32 +1,31 @@ Hello{% if name %} {{ name }}{% endif %}, I'm Mike, one of the co-founders of Free Law Project, the non-profit that -sponsors the development of CourtListener and RECAP. 
This is an automated +builds and maintains CourtListener, RECAP, and Bots.law. This is an automated message, but I like to send it anyway just to say hello and to welcome you to the site. Welcome! -Whether you'd just like to set up some daily alerts for topics that interest -you, download our data, or even contribute somehow to our efforts, I'm glad -you've signed up and would be happy to try to help you with whatever you're -trying to do. +Whether you're here to monitor cases, do research, download our data, or even +contribute somehow to our efforts, I'm glad you've signed up and would be happy +to help you with whatever you're trying to do. Some of the highlights of CourtListener and RECAP are: - - A searchable case law archive with high quality metadata and citations: + - A searchable case law archive with high-quality metadata and citations: https://www.courtlistener.com{% url "advanced_o" %} - - The RECAP Archive, one of the largest open collections of federal court - data: + - The RECAP Archive, which has over 200M pages of PACER content and over 60M + federal cases: https://www.courtlistener.com{% url "advanced_r" %} - - The RECAP Extension for Firefox and Chrome, which helps make PACER better - and cheaper, while contributing to the RECAP Archive: + - The RECAP Extension for Firefox, Chrome, Edge, and Safari, which helps make + PACER better and cheaper, while contributing to the RECAP Archive: https://free.law/recap/ - - Our oral argument audio archive, containing over a million minutes of + - Our oral argument audio archive, containing over three million minutes of recordings: https://www.courtlistener.com{% url "advanced_oa" %} diff --git a/docker/courtlistener/docker-compose.yml b/docker/courtlistener/docker-compose.yml index 05bf7d8403..b34f9f5c20 100644 --- a/docker/courtlistener/docker-compose.yml +++ b/docker/courtlistener/docker-compose.yml @@ -40,19 +40,6 @@ services: networks: - cl_net_overlay - # Search engine - cl-solr: - container_name: cl-solr - image: freelawproject/solr:latest - ports: - - "8983:8983" - # needs chmod 777 - volumes: - - ${CL_SOLR_CODE_DIR:-../../../courtlistener-solr-server}/data:/var/opt/solr/indices - - ${CL_SOLR_CODE_DIR:-../../../courtlistener-solr-server}/solr/cores/:/etc/opt/solr:ro - networks: - - cl_net_overlay - cl-webpack: container_name: cl-webpack image: node:16 @@ -101,7 +88,6 @@ services: - cl-postgresql - cl-redis - cl-celery - - cl-solr - cl-selenium - cl-doctor - cl-disclosures diff --git a/docker/django/docker-entrypoint.sh b/docker/django/docker-entrypoint.sh index 414bc8d1a9..ff8f3daec8 100644 --- a/docker/django/docker-entrypoint.sh +++ b/docker/django/docker-entrypoint.sh @@ -21,7 +21,7 @@ case "$1" in 'web-prod') # Tips: # 1. Set high number of --workers. Docs recommend 2-4× core count - # 2. Set --limit-request-line to high value to allow long Solr queries + # 2. Set --limit-request-line to high value to allow long search queries # 3. Set --max-requests to reset each worker once in a while exec gunicorn cl.asgi:application \ --chdir /opt/courtlistener/ \