Add black & isort pre-commit hooks (#41)
jordan-gillard authored Sep 18, 2024
1 parent d3b836a commit b993491
Showing 12 changed files with 309 additions and 126 deletions.
2 changes: 1 addition & 1 deletion .devcontainer/devcontainer.json
@@ -17,5 +17,5 @@
       ]
     }
   },
-  "postCreateCommand": "poetry install"
+  "postCreateCommand": "poetry install && poetry run pre-commit install"
 }
14 changes: 14 additions & 0 deletions .pre-commit-config.yaml
@@ -0,0 +1,14 @@
+repos:
+  - repo: local
+    hooks:
+      - id: black
+        name: black
+        entry: poetry run black
+        language: system
+        types: [python]
+
+      - id: isort
+        name: isort
+        entry: poetry run isort
+        language: system
+        types: [python]
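
Because both hooks are declared with `language: system`, pre-commit does not build isolated environments for them; it shells out to `poetry run black` and `poetry run isort`, so the hooks always use the exact formatter versions pinned by Poetry. Once installed via `poetry run pre-commit install`, they run automatically on `git commit`, and they can be exercised against the whole repository at any time with pre-commit's standard CLI: `poetry run pre-commit run --all-files`.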
5 changes: 4 additions & 1 deletion README.md
@@ -203,13 +203,16 @@ pip install poetry
 
 # Install dependencies
 poetry install
+
+# Set up pre-commit hooks to keep your code formatted
+poetry run pre-commit install
 ```
 
 Check out [Important commands](#important-commands) below for next steps.
 
 ### Developing using a GitHub Codespace
 
-This project uses a custom Development Container supported by GitHub Codespaces. Creating a new Codespace automatically takes care of installing all supported Python interpreters, the Poetry package manager, and Python dependencies for you.
+This project uses a custom Development Container supported by GitHub Codespaces. Creating a new Codespace automatically takes care of installing all supported Python interpreters, the Poetry package manager, Python dependencies, and pre-commit hooks for you.
 
 To create a new Codespace:
 1. Click on the `<> Code` dropdown on the GitHub UI.
24 changes: 14 additions & 10 deletions edgar_tool/cli.py
@@ -1,17 +1,18 @@
 import sys
 import time
-from datetime import date, timedelta, datetime
+from datetime import date, datetime, timedelta
 from typing import List, Optional
 from warnings import warn
 
 from edgar_tool.constants import (
     SUPPORTED_OUTPUT_EXTENSIONS,
     TEXT_SEARCH_CATEGORY_FORM_GROUPINGS,
     TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING,
 )
+from edgar_tool.page_fetcher import NoResultsFoundError
 from edgar_tool.rss import fetch_rss_feed
 from edgar_tool.text_search import EdgarTextSearcher
 from edgar_tool.utils import parse_location_input
-from edgar_tool.page_fetcher import NoResultsFoundError
 
+
 def _validate_text_search_args(
@@ -57,15 +58,16 @@ def _validate_text_search_args(
     ):
         raise ValueError(
             f"Filing form group must be one of: {'; '.join(TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.keys())}"
-            )
+        )
     if single_forms:
-        single_list = [item for sublist in TEXT_SEARCH_CATEGORY_FORM_GROUPINGS.values() for item in
-                       sublist]
+        single_list = [
+            item
+            for sublist in TEXT_SEARCH_CATEGORY_FORM_GROUPINGS.values()
+            for item in sublist
+        ]
         invalid_forms = [form for form in single_forms if form not in single_list]
         if invalid_forms:
-            raise ValueError(
-                f"Single forms must be one or more of: {single_list}"
-            )
+            raise ValueError(f"Single forms must be one or more of: {single_list}")
 
 
 class SecEdgarScraperCli:
@@ -135,7 +137,9 @@ def text_search(
         scraper.text_search(
             keywords=keywords,
             entity_id=entity_id,
-            filing_form=TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.get(filing_form),
+            filing_form=TEXT_SEARCH_FILING_VS_MAPPING_CATEGORIES_MAPPING.get(
+                filing_form
+            ),
             single_forms=single_forms,
             start_date=start_date,
             end_date=end_date,
@@ -144,7 +148,7 @@
             retries=retries,
             destination=output,
             peo_in=peo_in,
-            inc_in=inc_in
+            inc_in=inc_in,
         )
 
     @staticmethod
6 changes: 2 additions & 4 deletions edgar_tool/io.py
@@ -1,12 +1,10 @@
 import csv
 import json
-from typing import List, Dict, Any, Iterator
+from typing import Any, Dict, Iterator, List
 
 import jsonlines
 
-from edgar_tool.constants import (
-    SUPPORTED_OUTPUT_EXTENSIONS,
-)
+from edgar_tool.constants import SUPPORTED_OUTPUT_EXTENSIONS
 
 
 def write_results_to_file(
3 changes: 2 additions & 1 deletion edgar_tool/main.py
@@ -1,6 +1,7 @@
-from edgar_tool.cli import SecEdgarScraperCli
 import fire
 
+from edgar_tool.cli import SecEdgarScraperCli
+
 
 def main_entrypoint():
     fire.Fire(SecEdgarScraperCli)
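
For readers new to the pattern in `main.py`: `fire.Fire(SomeClass)` turns a class into a command-line interface, exposing its methods as subcommands and their parameters as flags. A minimal self-contained sketch of the same idea (the `Greeter` class is illustrative, not part of this repo):

```
import fire


class Greeter:
    @staticmethod
    def greet(name: str, shout: bool = False):
        """Exposed as the `greet` subcommand; `name` and `--shout` become CLI args."""
        message = f"Hello, {name}!"
        print(message.upper() if shout else message)


if __name__ == "__main__":
    fire.Fire(Greeter)  # e.g. `python greeter.py greet World --shout`
```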
8 changes: 5 additions & 3 deletions edgar_tool/page_fetcher.py
@@ -1,10 +1,10 @@
 import time
 import uuid
 from random import uniform
-from typing import Callable, Any, Optional
+from typing import Any, Callable, Optional
 
-from tenacity import retry, wait_fixed, stop_after_attempt
 import requests
+from tenacity import retry, stop_after_attempt, wait_fixed
 
 
 def fetch_page(
@@ -23,6 +23,7 @@ def fetch_page(
     :param stop_after_n: how many times to retry the request before failing
     :return: wrapper function that takes a check method and retries the request if the page load fails
     """
+
     @retry(
         wait=wait_fixed(uniform(min_wait_seconds, max_wait_seconds)),
         stop=stop_after_attempt(stop_after_n),
@@ -57,5 +58,6 @@ class ResultsTableNotFoundError(Exception):
 class PageCheckFailedError(Exception):
     pass
 
+
 class NoResultsFoundError(Exception):
-    pass
+    pass
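
The `@retry` decorator above is what gives `fetch_page` its resilience: tenacity re-invokes the wrapped function until the stop condition is met, waiting a fixed interval between attempts. (Note that `wait_fixed(uniform(...))` draws its random value once, when the decorator is applied inside each `fetch_page` call, not anew for every retry.) A minimal self-contained sketch of the same tenacity pattern, with an illustrative HTTP check rather than this repo's page-check logic:

```
import requests
from tenacity import retry, stop_after_attempt, wait_fixed


@retry(wait=wait_fixed(2), stop=stop_after_attempt(3))
def fetch(url: str) -> str:
    """Fetch a page, waiting 2 seconds between attempts, giving up after 3."""
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # any HTTP error status triggers a retry
    return response.text
```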
2 changes: 1 addition & 1 deletion edgar_tool/rss.py
@@ -1,7 +1,7 @@
 import json
 import uuid
 from pathlib import Path
-from typing import List, Any, Dict, Iterator, Tuple
+from typing import Any, Dict, Iterator, List, Tuple
 
 import requests
 import xmltodict
45 changes: 28 additions & 17 deletions edgar_tool/text_search.py
@@ -4,24 +4,23 @@
 import urllib.parse
 from datetime import date, timedelta
 from math import ceil
-from typing import List, Optional, Dict, Any, Iterator
+from typing import Any, Dict, Iterator, List, Optional
 
-
-from edgar_tool.page_fetcher import (
-    fetch_page,
-    PageCheckFailedError,
-    ResultsTableNotFoundError,
-    NoResultsFoundError
-)
 from edgar_tool.constants import (
     TEXT_SEARCH_BASE_URL,
     TEXT_SEARCH_CATEGORY_FORM_GROUPINGS,
-    TEXT_SEARCH_SPLIT_BATCHES_NUMBER,
     TEXT_SEARCH_CSV_FIELDS_NAMES,
     TEXT_SEARCH_FORM_MAPPING,
     TEXT_SEARCH_LOCATIONS_MAPPING,
+    TEXT_SEARCH_SPLIT_BATCHES_NUMBER,
 )
 from edgar_tool.io import write_results_to_file
+from edgar_tool.page_fetcher import (
+    NoResultsFoundError,
+    PageCheckFailedError,
+    ResultsTableNotFoundError,
+    fetch_page,
+)
 from edgar_tool.utils import split_date_range_in_n, unpack_singleton_list
 
 
@@ -130,7 +129,11 @@ def _parse_row(row: Dict[str, Any]) -> Dict[str, Any]:
 
         places_of_business = _source.get("biz_locations")
         places_of_business = [
-            f"{split[0]}, {TEXT_SEARCH_LOCATIONS_MAPPING.get(split[1])}" if len(split) == 2 else f"{split[0]}"
+            (
+                f"{split[0]}, {TEXT_SEARCH_LOCATIONS_MAPPING.get(split[1])}"
+                if len(split) == 2
+                else f"{split[0]}"
+            )
             for place in places_of_business
             if (split := place.rsplit(", ", maxsplit=1))
         ]
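
The reformatted comprehension above does two things at once: the walrus expression `split := place.rsplit(", ", maxsplit=1)` splits each location string once from the right, and the conditional maps the trailing state/country code through `TEXT_SEARCH_LOCATIONS_MAPPING` whenever the split produced two parts. A standalone sketch with a made-up two-entry mapping:

```
LOCATIONS_MAPPING = {"MA": "Massachusetts", "CA": "California"}  # illustrative subset

places = ["Boston, MA", "San Jose, CA", "Worldwide"]
pretty = [
    (
        f"{split[0]}, {LOCATIONS_MAPPING.get(split[1])}"
        if len(split) == 2
        else f"{split[0]}"
    )
    for place in places
    if (split := place.rsplit(", ", maxsplit=1))
]
print(pretty)  # ['Boston, Massachusetts', 'San Jose, California', 'Worldwide']
```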
@@ -226,25 +229,31 @@ def _generate_request_args(
 
         # Add optional parameters
         if peo_in and inc_in:
-            raise ValueError("use only one of peo_in or inc_in, not both") ## because SEC API doesn't support
+            raise ValueError(
+                "use only one of peo_in or inc_in, not both"
+            )  ## because SEC API doesn't support
         else:
             if peo_in:
                 request_args["locationCodes"] = peo_in
             if inc_in:
                 request_args["locationCodes"] = inc_in
                 request_args["locationType"] = "incorporated"
 
         if entity_id:
             request_args["entityName"] = entity_id
         # Handle forms and single forms
-        part_filing_form = [] if filing_form is None else TEXT_SEARCH_CATEGORY_FORM_GROUPINGS[filing_form]
+        part_filing_form = (
+            []
+            if filing_form is None
+            else TEXT_SEARCH_CATEGORY_FORM_GROUPINGS[filing_form]
+        )
         part_single_forms = [] if single_forms is None else single_forms
 
         # Join the filing_forms and single forms and remove duplicates
        forms = ",".join(list(set(part_filing_form + part_single_forms)))
         if forms != "":
             request_args["forms"] = forms
 
         # URL-encode the request arguments
         request_args = urllib.parse.urlencode(request_args)
 
@@ -373,7 +382,9 @@ def _generate_search_requests(
         # If we have 10000 results, split date range in two separate requests and fetch first page again, do so until
         # we have a set of date ranges for which none of the requests have 10000 results
         if num_results == 0:
-            print(f"No results found for query in date range {start_date} -> {end_date}.")
+            print(
+                f"No results found for query in date range {start_date} -> {end_date}."
+            )
         elif num_results < 10000:
             print(
                 f"Less than 10000 ({num_results}) results found for range {start_date} -> {end_date}, "
@@ -475,7 +486,7 @@ def text_search(
                 print(
                     f"Skipping search request due to an unexpected {e.__class__.__name__} for request parameters '{r}': {e}"
                 )
-        if(search_requests_results == []):
+        if search_requests_results == []:
             raise NoResultsFoundError(f"No results found for the search query")
         write_results_to_file(
             itertools.chain(*search_requests_results),
@@ -518,4 +529,4 @@ def _fetch_first_page_results_number(
             raise NoResultsFoundError(
                 f"\nExecution aborting due to a {e.__class__.__name__} error raised "
                 f"while parsing number of results for first page at URL {url}: {e}"
-            ) from e
+            ) from e