From d3729ef7c729b0c51a2b0fbaaf5c61c1afd3e3eb Mon Sep 17 00:00:00 2001
From: MQ
Date: Mon, 13 Jan 2025 20:51:03 +0100
Subject: [PATCH] fix input schema field type, fix logging

---
 .actor/input_schema.json |  2 +-
 src/helpers.py           |  5 +++--
 src/main.py              | 13 +++++++------
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/.actor/input_schema.json b/.actor/input_schema.json
index 9023c0a..d064471 100644
--- a/.actor/input_schema.json
+++ b/.actor/input_schema.json
@@ -18,5 +18,5 @@
             "default": 1
         }
     },
-    "required": ["url"]
+    "required": ["startUrl"]
 }
diff --git a/src/helpers.py b/src/helpers.py
index ba7d141..52ba963 100644
--- a/src/helpers.py
+++ b/src/helpers.py
@@ -13,6 +13,7 @@
 if TYPE_CHECKING:
     from apify_client.clients import KeyValueStoreClientAsync
 
+logger = logging.getLogger('apify')
 
 def get_hostname_path_string_from_url(url: str) -> str:
     """Extracts the hostname and path from the URL."""
@@ -38,10 +39,10 @@ async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_u
     """Extracts the description from the HTML content stored in the KV store."""
     store_id = html_url.split('records/')[-1]
     if not (record := await kvstore.get_record(store_id)):
-        logging.warning(f'Failed to get record with id "{store_id}"!')
+        logger.warning(f'Failed to get record with id "{store_id}"!')
         return None
     if not (html := record.get('value')) or not isinstance(html, str):
-        logging.warning(f'Invalid HTML content for record with id "{store_id}"!')
+        logger.warning(f'Invalid HTML content for record with id "{store_id}"!')
         return None
 
     return get_description_from_html(html)
diff --git a/src/main.py b/src/main.py
index 8f74df3..082d649 100644
--- a/src/main.py
+++ b/src/main.py
@@ -17,6 +17,7 @@
 if TYPE_CHECKING:
     from src.types import SectionDict
 
+logger = logging.getLogger('apify')
 
 async def main() -> None:
     """Main entry point for the Apify Actor.
@@ -34,7 +35,7 @@ async def main() -> None:
     max_crawl_depth = int(actor_input.get('maxCrawlDepth', 1))
 
     # call apify/website-content-crawler actor to get the html content
-    logging.info(f'Starting the "apify/website-content-crawler" actor for URL: {url}')
+    logger.info(f'Starting the "apify/website-content-crawler" actor for URL: {url}')
     actor_run_details = await Actor.call(
         'apify/website-content-crawler',
         get_crawler_actor_config(url, max_crawl_depth=max_crawl_depth),
@@ -57,13 +58,13 @@
     async for item in run_dataset.iterate_items():
         item_url = item.get('url')
-        logging.info(f'Processing page: {item_url}')
+        logger.info(f'Processing page: {item_url}')
         if item_url is None:
-            logging.warning('Missing "url" attribute in dataset item!')
+            logger.warning('Missing "url" attribute in dataset item!')
             continue
 
         html_url = item.get('htmlUrl')
         if html_url is None:
-            logging.warning('Missing "htmlUrl" attribute in dataset item!')
+            logger.warning('Missing "htmlUrl" attribute in dataset item!')
             continue
 
         is_root = item_url == url
@@ -93,7 +94,7 @@
     # save into kv-store as a file to be able to download it
     store = await Actor.open_key_value_store()
    await store.set_value('llms.txt', output)
-    logging.info('Saved the "llms.txt" file into the key-value store!')
+    logger.info('Saved the "llms.txt" file into the key-value store!')
 
     await Actor.push_data({'llms.txt': output})
-    logging.info('Pushed the "llms.txt" file to the dataset!')
+    logger.info('Pushed the "llms.txt" file to the dataset!')
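
Note on the logging change: below is a minimal, self-contained sketch of why the patch swaps module-level logging.* calls for a named logger, assuming (as the patch implies) that the Apify SDK attaches its log handler to the 'apify' logger rather than to the root logger. The setup_demo_handler helper is a hypothetical stand-in for the SDK's own setup, not an Apify API.

    import logging

    # Assumption (implied by the patch): the Apify SDK attaches its handler and
    # formatter to the 'apify' logger. Module-level calls such as
    # logging.warning(...) dispatch to the root logger instead, bypassing that
    # handler, so those records miss the SDK's formatting and log collection.
    logger = logging.getLogger('apify')


    def setup_demo_handler() -> None:
        """Hypothetical stand-in for the handler the Apify SDK installs."""
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter('[%(name)s] %(levelname)s %(message)s'))
        logger.addHandler(handler)
        logger.setLevel(logging.INFO)
        # Demo-only: stop records from also propagating to the root logger,
        # which would print them a second time once root gains a handler.
        logger.propagate = False


    if __name__ == '__main__':
        setup_demo_handler()
        logging.warning('root logger: bypasses the SDK handler')        # pre-patch pattern
        logger.warning('apify logger: routed through the SDK handler')  # post-patch pattern

Run directly, the first message prints in the stdlib's default 'WARNING:root:...' form while the second comes out as '[apify] WARNING ...': the named logger is the only one the configured handler sees, which is the behavioral difference this patch relies on.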