From 3e4ca09724f5023695337b536a2cd40e4f6ab42c Mon Sep 17 00:00:00 2001 From: Galen Reich <54807169+GalenReich@users.noreply.github.com> Date: Tue, 18 Jun 2024 16:20:21 +0100 Subject: [PATCH] Fix result duplication (#30) * Don't encode page=1 in url for reuse * Add 'from' field to paginated url * Bump version --- edgar_tool/text_search.py | 6 +----- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/edgar_tool/text_search.py b/edgar_tool/text_search.py index f3fd7da..656bd7a 100644 --- a/edgar_tool/text_search.py +++ b/edgar_tool/text_search.py @@ -191,7 +191,6 @@ def _generate_request_args( single_forms: Optional[List[str]], start_date: date, end_date: date, - page_number: int, peo_in: Optional[str], inc_in: Optional[str], ) -> str: @@ -204,7 +203,6 @@ def _generate_request_args( :param single_forms: List of single forms to search for (e.g. ['10-K', '10-Q']), defaults to None :param start_date: Start date for the custom date range, defaults to 5 years ago to replicate the default behavior of the SEC website :param end_date: End date for the custom date range, defaults to current date in order to replicate the default behavior of the SEC website - :param page_number: Page number to request, defaults to 1 :param peo_in: Search principal executive offices in a location (e.g. "NY,OH") :param inc_in: Search incorporated in a location (e.g. "NY,OH") @@ -224,7 +222,6 @@ def _generate_request_args( "dateRange": "custom", "startdt": start_date.strftime("%Y-%m-%d"), "enddt": end_date.strftime("%Y-%m-%d"), - "page": page_number, } # Add optional parameters @@ -286,7 +283,7 @@ def _fetch_search_request_results( num_pages = self._compute_number_of_pages() for i in range(1, num_pages + 1): - paginated_url = f"{TEXT_SEARCH_BASE_URL}{search_request_url_args}&page={i}" + paginated_url = f"{TEXT_SEARCH_BASE_URL}{search_request_url_args}&page={i}&from={100*(i-1)}" try: self.json_response = fetch_page( paginated_url, @@ -354,7 +351,6 @@ def _generate_search_requests( single_forms=single_forms, start_date=start_date, end_date=end_date, - page_number=1, peo_in=peo_in, inc_in=inc_in, ) diff --git a/pyproject.toml b/pyproject.toml index 2e6b50e..e17b80f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "edgar-tool" -version = "1.3.0" +version = "1.3.1" description = "Search and retrieve corporate and financial data from the United States Securities and Exchange Commission (SEC)." authors = ["Bellingcat"] license = "GNU General Public License v3 (GPLv3)"