From 3c972814bd7af71518060c11dcf7873a71afa4fe Mon Sep 17 00:00:00 2001 From: MQ Date: Mon, 13 Jan 2025 20:19:47 +0100 Subject: [PATCH 1/9] renamed input from url to startUrl, removed default input fail instead, mention memory limit in readme, readme mention website content crawler, crawler config only non defaults, added logging and minor code improvements --- .actor/input_schema.json | 6 +++--- README.md | 40 +++++++++++++++++++++++----------------- src/crawler_config.py | 19 +------------------ src/helpers.py | 6 ++---- src/main.py | 11 +++++++---- src/renderer.py | 29 ++++++++++++++++++++++++++++- 6 files changed, 64 insertions(+), 47 deletions(-) diff --git a/.actor/input_schema.json b/.actor/input_schema.json index 24938a7..9023c0a 100644 --- a/.actor/input_schema.json +++ b/.actor/input_schema.json @@ -3,10 +3,10 @@ "type": "object", "schemaVersion": 1, "properties": { - "url": { - "title": "URL of the site", + "startUrl": { + "title": "Start URL", "type": "string", - "description": "The URL of website you want to get the llm.txt generated for.", + "description": "The URL from which the crawler will start to generate the llms.txt file.", "editor": "textfield", "prefill": "https://docs.apify.com/" }, diff --git a/README.md b/README.md index f5e049a..341d535 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# llms.txt Generator Actor πŸš€πŸ“„ +# llms.txt generator actor πŸš€πŸ“„ -The **llms.txt Generator Actor** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. 
+The **llms.txt generator actor** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This tool leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**. ## 🌟 What is llms.txt? @@ -8,14 +8,14 @@ The **llms.txt** format is a markdown-based standard for providing AI-friendly c - **Brief background information** and guidance. - **Links to additional resources** in markdown format. -- A simple, AI-focused structure to help coders, researchers, and AI models easily access and use website content. +- **AI-focused** structure to help coders, researchers, and AI models easily access and use website content. -Here’s a mock example: +Proposed example structure: ``` # Title -> Optional description goes here +> Optional description Optional details go here @@ -32,20 +32,21 @@ By adding an **llms.txt** file to your website, you make it easy for AI systems --- -## 🎯 Features of llms.txt Generator +## 🎯 Features of llms.txt generator Our actor is designed to simplify and automate the creation of **llms.txt** files. Here are its key features: -- **Deep Website Crawling**: Extracts content from multi-level websites using the powerful [Crawlee](https://crawlee.dev) library. +- **Deep Website Crawling**: Extracts content from multi-level websites using the powerful [Crawlee](https://crawlee.dev) library and the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor. 
- **Content Extraction**: Retrieves key metadata such as titles, descriptions, and URLs for seamless integration. - **File Generation**: Saves the output in the standardized **llms.txt** format. - **Downloadable Output**: The **llms.txt** file can be downloaded from the **Key-Value Store** in the Storage section of the actor run details. +- **Resource Management**: Limits the crawler actor to 4 GB of memory to ensure compatibility with the free tier, which has an 8 GB limit. Note that this may slow down the crawling process. --- ## πŸš€ How It Works -1. **Input**: Provide the URL of the website to crawl. +1. **Input**: Provide the Start URL of the website to crawl. 2. **Configuration**: Set the maximum crawl depth and other options (optional). 3. **Output**: The actor generates a structured **llms.txt** file with extracted content, ready for AI applications. @@ -53,28 +54,33 @@ Our actor is designed to simplify and automate the creation of **llms.txt** file ```json { - "url": "https://example.com", - "maxCrawlDepth": 2 + "startUrl": "https://docs.apify.com", + "maxCrawlDepth": 1 } ``` ### Output Example (llms.txt) ``` -# Example Website - -> A brief description of the website goes here. +# docs.apify.com ## Index -- [Home](https://example.com): Welcome to our website! -- [Docs](https://example.com/docs): Comprehensive documentation. -- [Blog](https://example.com/blog): Latest updates and articles. +- [Home | Platform | Apify Documentation](https://docs.apify.com/platform): Apify is your one-stop shop for web scraping, data extraction, and RPA. Automate anything you can do manually in a browser. +- [Web Scraping Academy | Academy | Apify Documentation](https://docs.apify.com/academy): Learn everything about web scraping and automation with our free courses that will turn you into an expert scraper developer. 
+- [Apify Documentation](https://docs.apify.com/api) +- [API scraping | Academy | Apify Documentation](https://docs.apify.com/academy/api-scraping): Learn all about how the professionals scrape various types of APIs with various configurations, parameters, and requirements. +- [API client for JavaScript | Apify Documentation](https://docs.apify.com/api/client/js/) +- [Apify API | Apify Documentation](https://docs.apify.com/api/v2) +- [API client for Python | Apify Documentation](https://docs.apify.com/api/client/python/) +... + ``` + --- -## ✨ Why Use llms.txt Generator? +## ✨ Why Use llms.txt generator? - **Save Time**: Automates the tedious process of extracting, formatting, and organizing web content. - **Boost AI Performance**: Provides clean, structured data for LLMs and AI-powered tools. diff --git a/src/crawler_config.py b/src/crawler_config.py index ce52895..e9b63d2 100644 --- a/src/crawler_config.py +++ b/src/crawler_config.py @@ -1,26 +1,9 @@ CRAWLER_CONFIG = { - 'aggressivePrune': False, - 'clickElementsCssSelector': '[aria-expanded="False"]', - 'clientSideMinChangePercentage': 15, - 'crawlerType': 'playwright:adaptive', - 'debugLog': False, - 'debugMode': False, - 'expandIframes': True, 'htmlTransformer': 'none', - 'ignoreCanonicalUrl': False, 'keepElementsCssSelector': 'meta[name="description"],meta[name="Description"]\ntitle', - 'keepUrlFragments': False, - # changed by get_crawler_actor_config with defailt value 1 + # changed by get_crawler_actor_config with default value 1 'maxCrawlDepth': 0, # 0 by default for root page only just in case - 'proxyConfiguration': {'useApifyProxy': True}, - 'readableTextCharThreshold': 100, - 'removeCookieWarnings': True, - 'renderingTypeDetectionPercentage': 10, - 'saveFiles': False, - 'saveHtml': False, 'saveHtmlAsFile': True, - 'saveMarkdown': False, - 'saveScreenshots': False, 'startUrls': [ # is populated by get_crawler_actor_config ], diff --git a/src/helpers.py b/src/helpers.py index 903047f..ba7d141 100644 --- 
a/src/helpers.py +++ b/src/helpers.py @@ -37,12 +37,10 @@ def is_description_suitable(description: str | None) -> bool: async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_url: str) -> str | None: """Extracts the description from the HTML content stored in the KV store.""" store_id = html_url.split('records/')[-1] - record = await kvstore.get_record(store_id) - if record is None: + if not (record := await kvstore.get_record(store_id)): logging.warning(f'Failed to get record with id "{store_id}"!') return None - html = record.get('value') - if html is None or not isinstance(html, str): + if not (html := record.get('value')) or not isinstance(html, str): logging.warning(f'Invalid HTML content for record with id "{store_id}"!') return None diff --git a/src/main.py b/src/main.py index c64e57c..8f74df3 100644 --- a/src/main.py +++ b/src/main.py @@ -26,14 +26,15 @@ async def main() -> None: the field of web scraping significantly. """ async with Actor: - actor_input = await Actor.get_input() or {'url': 'https://docs.apify.com/'} - url = actor_input.get('url') + actor_input = await Actor.get_input() + url = actor_input.get('startUrl') if url is None: - raise ValueError('Missing "url" attribute in input!') + raise ValueError('Missing "startUrl" attribute in input!') max_crawl_depth = int(actor_input.get('maxCrawlDepth', 1)) # call apify/website-content-crawler actor to get the html content + logging.info(f'Starting the "apify/website-content-crawler" actor for URL: {url}') actor_run_details = await Actor.call( 'apify/website-content-crawler', get_crawler_actor_config(url, max_crawl_depth=max_crawl_depth), @@ -52,11 +53,11 @@ async def main() -> None: data = {'title': root_title, 'description': None, 'sections': []} # add all pages to index section for now - # TODO: use path or LLM suggestions to group pages into sections # noqa: TD003 section: SectionDict = {'title': 'Index', 'links': []} async for item in run_dataset.iterate_items(): item_url = 
item.get('url') + logging.info(f'Processing page: {item_url}') if item_url is None: logging.warning('Missing "url" attribute in dataset item!') continue @@ -92,5 +93,7 @@ async def main() -> None: # save into kv-store as a file to be able to download it store = await Actor.open_key_value_store() await store.set_value('llms.txt', output) + logging.info('Saved the "llms.txt" file into the key-value store!') await Actor.push_data({'llms.txt': output}) + logging.info('Pushed the "llms.txt" file to the dataset!') diff --git a/src/renderer.py b/src/renderer.py index d7967ab..ec202d4 100644 --- a/src/renderer.py +++ b/src/renderer.py @@ -1,5 +1,32 @@ def render(data: dict) -> str: - """Generates llms.txt file from the provided data.""" + """Generates llms.txt file from the provided data. + + Example data: + { + 'title': 'Example', + 'description': 'Example description', + 'details': 'Example details', + 'sections': [ + { + 'title': 'Section 1', + 'links': [ + {'url': 'https://example.com', 'title': 'Example', 'description': 'Example description'}, + ], + }, + ], + } + Example output: + # Example + + > Example description + + Example details + + ## Section 1 + + - [Example](https://example.com): Example description + + """ result = f"# {data['title']}\n\n" if data.get('description'): From 13b5f3680399737d69c5a1fc9cadd30bc54faa23 Mon Sep 17 00:00:00 2001 From: MQ Date: Mon, 13 Jan 2025 20:24:07 +0100 Subject: [PATCH 2/9] remove "actor" from the name --- .actor/actor.json | 2 +- README.md | 4 ++-- pyproject.toml | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.actor/actor.json b/.actor/actor.json index 929a52b..5bfd484 100644 --- a/.actor/actor.json +++ b/.actor/actor.json @@ -1,6 +1,6 @@ { "actorSpecification": 1, - "name": "llmstxt-generator-actor", + "name": "llmstxt-generator", "title": "Generate llms.txt for the given site", "description": "Generates llms.txt for the given site", "version": "0.0", diff --git a/README.md b/README.md index 
341d535..bf0ae3c 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ -# llms.txt generator actor πŸš€πŸ“„ +# llms.txt generator πŸš€πŸ“„ -The **llms.txt generator actor** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This tool leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**. +The **llms.txt generator** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This tool leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**. ## 🌟 What is llms.txt? 
diff --git a/pyproject.toml b/pyproject.toml index 0a375fb..64c99b7 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,7 +1,7 @@ [tool.poetry] name = "apify-llmstxt-generator" version = "0.1.0" -description = "llms.txt generator actor" +description = "llms.txt generator" authors = ["Jakub Kopecky "] readme = "README.md" package-mode = false From 0b00e652635da212bdfd1430576088b0bc77c9d3 Mon Sep 17 00:00:00 2001 From: MQ Date: Mon, 13 Jan 2025 20:50:37 +0100 Subject: [PATCH 3/9] fix readme naming title case (all caps) --- README.md | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/README.md b/README.md index bf0ae3c..ca0704f 100644 --- a/README.md +++ b/README.md @@ -36,21 +36,21 @@ By adding an **llms.txt** file to your website, you make it easy for AI systems Our actor is designed to simplify and automate the creation of **llms.txt** files. Here are its key features: -- **Deep Website Crawling**: Extracts content from multi-level websites using the powerful [Crawlee](https://crawlee.dev) library and the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor. -- **Content Extraction**: Retrieves key metadata such as titles, descriptions, and URLs for seamless integration. -- **File Generation**: Saves the output in the standardized **llms.txt** format. -- **Downloadable Output**: The **llms.txt** file can be downloaded from the **Key-Value Store** in the Storage section of the actor run details. -- **Resource Management**: Limits the crawler actor to 4 GB of memory to ensure compatibility with the free tier, which has an 8 GB limit. Note that this may slow down the crawling process. +- **Deep website crawling**: Extracts content from multi-level websites using the powerful [Crawlee](https://crawlee.dev) library and the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor. 
+- **Content extraction**: Retrieves key metadata such as titles, descriptions, and URLs for seamless integration. +- **File generation**: Saves the output in the standardized **llms.txt** format. +- **Downloadable output**: The **llms.txt** file can be downloaded from the **key-value store** in the storage section of the actor run details. +- **Resource management**: Limits the crawler actor to 4 GB of memory to ensure compatibility with the free tier, which has an 8 GB limit. Note that this may slow down the crawling process. --- -## πŸš€ How It Works +## πŸš€ How it works -1. **Input**: Provide the Start URL of the website to crawl. +1. **Input**: Provide the start URL of the website to crawl. 2. **Configuration**: Set the maximum crawl depth and other options (optional). 3. **Output**: The actor generates a structured **llms.txt** file with extracted content, ready for AI applications. -### Input Example +### Input example ```json { @@ -59,7 +59,7 @@ Our actor is designed to simplify and automate the creation of **llms.txt** file } ``` -### Output Example (llms.txt) +### Output example (llms.txt) ``` # docs.apify.com @@ -80,16 +80,16 @@ Our actor is designed to simplify and automate the creation of **llms.txt** file --- -## ✨ Why Use llms.txt generator? +## ✨ Why use llms.txt generator? -- **Save Time**: Automates the tedious process of extracting, formatting, and organizing web content. -- **Boost AI Performance**: Provides clean, structured data for LLMs and AI-powered tools. -- **Future-Proof**: Follows a standardized format that’s gaining adoption in the AI community. -- **User-Friendly**: Easy integration into customer-facing products, allowing users to generate **llms.txt** files effortlessly. +- **Save time**: Automates the tedious process of extracting, formatting, and organizing web content. +- **Boost AI performance**: Provides clean, structured data for LLMs and AI-powered tools. 
+- **Future-proof**: Follows a standardized format that’s gaining adoption in the AI community. +- **User-friendly**: Easy integration into customer-facing products, allowing users to generate **llms.txt** files effortlessly. --- -## πŸ”§ Technical Highlights +## πŸ”§ Technical highlights - Built on the [Apify SDK](https://docs.apify.com/sdk/python), leveraging state-of-the-art web scraping tools. - Designed to handle JavaScript-heavy websites using headless browsers. @@ -98,12 +98,12 @@ Our actor is designed to simplify and automate the creation of **llms.txt** file --- -## πŸ“– Learn More +## πŸ“– Learn more -- [Apify Platform](https://apify.com) -- [Apify SDK Documentation](https://docs.apify.com/sdk/python) -- [Crawlee Library](https://crawlee.dev) -- [llms.txt Proposal](https://example.com/llms-txt-proposal) +- [Apify platform](https://apify.com) +- [Apify SDK documentation](https://docs.apify.com/sdk/python) +- [Crawlee library](https://crawlee.dev) +- [llms.txt proposal](https://example.com/llms-txt-proposal) --- From d3729ef7c729b0c51a2b0fbaaf5c61c1afd3e3eb Mon Sep 17 00:00:00 2001 From: MQ Date: Mon, 13 Jan 2025 20:51:03 +0100 Subject: [PATCH 4/9] fix input schema field type, fix logging --- .actor/input_schema.json | 2 +- src/helpers.py | 5 +++-- src/main.py | 13 +++++++------ 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/.actor/input_schema.json b/.actor/input_schema.json index 9023c0a..d064471 100644 --- a/.actor/input_schema.json +++ b/.actor/input_schema.json @@ -18,5 +18,5 @@ "default": 1 } }, - "required": ["url"] + "required": ["startUrl"] } diff --git a/src/helpers.py b/src/helpers.py index ba7d141..52ba963 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -13,6 +13,7 @@ if TYPE_CHECKING: from apify_client.clients import KeyValueStoreClientAsync +logger = logging.getLogger('apify') def get_hostname_path_string_from_url(url: str) -> str: """Extracts the hostname and path from the URL.""" @@ -38,10 +39,10 @@ async def 
get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_u """Extracts the description from the HTML content stored in the KV store.""" store_id = html_url.split('records/')[-1] if not (record := await kvstore.get_record(store_id)): - logging.warning(f'Failed to get record with id "{store_id}"!') + logger.warning(f'Failed to get record with id "{store_id}"!') return None if not (html := record.get('value')) or not isinstance(html, str): - logging.warning(f'Invalid HTML content for record with id "{store_id}"!') + logger.warning(f'Invalid HTML content for record with id "{store_id}"!') return None return get_description_from_html(html) diff --git a/src/main.py b/src/main.py index 8f74df3..082d649 100644 --- a/src/main.py +++ b/src/main.py @@ -17,6 +17,7 @@ if TYPE_CHECKING: from src.types import SectionDict +logger = logging.getLogger('apify') async def main() -> None: """Main entry point for the Apify Actor. @@ -34,7 +35,7 @@ async def main() -> None: max_crawl_depth = int(actor_input.get('maxCrawlDepth', 1)) # call apify/website-content-crawler actor to get the html content - logging.info(f'Starting the "apify/website-content-crawler" actor for URL: {url}') + logger.info(f'Starting the "apify/website-content-crawler" actor for URL: {url}') actor_run_details = await Actor.call( 'apify/website-content-crawler', get_crawler_actor_config(url, max_crawl_depth=max_crawl_depth), @@ -57,13 +58,13 @@ async def main() -> None: async for item in run_dataset.iterate_items(): item_url = item.get('url') - logging.info(f'Processing page: {item_url}') + logger.info(f'Processing page: {item_url}') if item_url is None: - logging.warning('Missing "url" attribute in dataset item!') + logger.warning('Missing "url" attribute in dataset item!') continue html_url = item.get('htmlUrl') if html_url is None: - logging.warning('Missing "htmlUrl" attribute in dataset item!') + logger.warning('Missing "htmlUrl" attribute in dataset item!') continue is_root = item_url == url @@ 
-93,7 +94,7 @@ async def main() -> None: # save into kv-store as a file to be able to download it store = await Actor.open_key_value_store() await store.set_value('llms.txt', output) - logging.info('Saved the "llms.txt" file into the key-value store!') + logger.info('Saved the "llms.txt" file into the key-value store!') await Actor.push_data({'llms.txt': output}) - logging.info('Pushed the "llms.txt" file to the dataset!') + logger.info('Pushed the "llms.txt" file to the dataset!') From fdacc57dde91cf84070cbd13d32cf959c40a38d6 Mon Sep 17 00:00:00 2001 From: MQ Date: Mon, 13 Jan 2025 20:55:09 +0100 Subject: [PATCH 5/9] fix typo in readme --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ca0704f..7cbaf6d 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ The **llms.txt** format is a markdown-based standard for providing AI-friendly c - **Links to additional resources** in markdown format. - **AI-focused** structure to help coders, researchers, and AI models easily access and use website content. -Proposed example structure: +Proposed structure: ``` # Title From 82ac951d5226efbf41ff48e87889f5fc5abdf1cb Mon Sep 17 00:00:00 2001 From: MQ Date: Mon, 13 Jan 2025 21:09:57 +0100 Subject: [PATCH 6/9] format code --- src/helpers.py | 1 + src/main.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/helpers.py b/src/helpers.py index 52ba963..b0d415c 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -15,6 +15,7 @@ logger = logging.getLogger('apify') + def get_hostname_path_string_from_url(url: str) -> str: """Extracts the hostname and path from the URL.""" parsed_url = urlparse(url) diff --git a/src/main.py b/src/main.py index 082d649..5385c46 100644 --- a/src/main.py +++ b/src/main.py @@ -19,6 +19,7 @@ logger = logging.getLogger('apify') + async def main() -> None: """Main entry point for the Apify Actor. 
From 182e3881efac0c6cf79f2d0b9878c8ccb3b86cd6 Mon Sep 17 00:00:00 2001 From: MQ Date: Wed, 15 Jan 2025 14:21:10 +0100 Subject: [PATCH 7/9] refactor comments, raise on empty dataset, removed render from helpers, render using join, switch to pytest, actor.log in main --- Makefile | 2 +- poetry.lock | 91 +++++++++++++++++++++- pyproject.toml | 4 + src/__main__.py | 2 +- src/helpers.py | 8 +- src/main.py | 37 ++++----- src/{types.py => mytypes.py} | 9 +++ src/renderer.py | 21 +++--- tests/test_html.py | 23 +++--- tests/test_renderer.py | 141 +++++++++++++++++------------------ tests/tests.py | 7 -- 11 files changed, 218 insertions(+), 127 deletions(-) rename src/{types.py => mytypes.py} (64%) delete mode 100644 tests/tests.py diff --git a/Makefile b/Makefile index 4ff7827..4fb9170 100644 --- a/Makefile +++ b/Makefile @@ -20,4 +20,4 @@ format: poetry run ruff format $(DIRS_WITH_CODE) unit-test: - poetry run python -m unittest tests.tests + poetry run -C tests/ pytest diff --git a/poetry.lock b/poetry.lock index dfdfb81..7c627d3 100644 --- a/poetry.lock +++ b/poetry.lock @@ -841,6 +841,17 @@ files = [ [package.extras] all = ["flake8 (>=7.1.1)", "mypy (>=1.11.2)", "pytest (>=8.3.2)", "ruff (>=0.6.2)"] +[[package]] +name = "iniconfig" +version = "2.0.0" +description = "brain-dead simple config-ini parsing" +optional = false +python-versions = ">=3.7" +files = [ + {file = "iniconfig-2.0.0-py3-none-any.whl", hash = "sha256:b6a85871a79d2e3b22d2d1b94ac2824226a63c6b741c88f7ae975f18b6778374"}, + {file = "iniconfig-2.0.0.tar.gz", hash = "sha256:2d91e135bf72d31a410b17c16da610a82cb55f6b0477d1a902134b24a455b8b3"}, +] + [[package]] name = "inquirer" version = "3.4.0" @@ -1220,6 +1231,32 @@ files = [ {file = "mypy_extensions-1.0.0.tar.gz", hash = "sha256:75dbf8955dc00442a438fc4d0666508a9a97b6bd41aa2f0ffe9d2f2725af0782"}, ] +[[package]] +name = "packaging" +version = "24.2" +description = "Core utilities for Python packages" +optional = false +python-versions = ">=3.8" +files = [ + 
{file = "packaging-24.2-py3-none-any.whl", hash = "sha256:09abb1bccd265c01f4a3aa3f7a7db064b36514d2cba19a2f694fe6150451a759"}, + {file = "packaging-24.2.tar.gz", hash = "sha256:c228a6dc5e932d346bc5739379109d49e8853dd8223571c7c5b55260edc0b97f"}, +] + +[[package]] +name = "pluggy" +version = "1.5.0" +description = "plugin and hook calling mechanisms for python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pluggy-1.5.0-py3-none-any.whl", hash = "sha256:44e1ad92c8ca002de6377e165f3e0f1be63266ab4d554740532335b9d75ea669"}, + {file = "pluggy-1.5.0.tar.gz", hash = "sha256:2cffa88e94fdc978c4c574f15f9e59b7f4201d439195c3715ca9e2486f1d0cf1"}, +] + +[package.extras] +dev = ["pre-commit", "tox"] +testing = ["pytest", "pytest-benchmark"] + [[package]] name = "propcache" version = "0.2.1" @@ -1535,6 +1572,58 @@ files = [ [package.extras] windows-terminal = ["colorama (>=0.4.6)"] +[[package]] +name = "pytest" +version = "8.3.4" +description = "pytest: simple powerful testing with Python" +optional = false +python-versions = ">=3.8" +files = [ + {file = "pytest-8.3.4-py3-none-any.whl", hash = "sha256:50e16d954148559c9a74109af1eaf0c945ba2d8f30f0a3d3335edde19788b6f6"}, + {file = "pytest-8.3.4.tar.gz", hash = "sha256:965370d062bce11e73868e0335abac31b4d3de0e82f4007408d242b4f8610761"}, +] + +[package.dependencies] +colorama = {version = "*", markers = "sys_platform == \"win32\""} +iniconfig = "*" +packaging = "*" +pluggy = ">=1.5,<2" + +[package.extras] +dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "pygments (>=2.7.2)", "requests", "setuptools", "xmlschema"] + +[[package]] +name = "pytest-asyncio" +version = "0.25.2" +description = "Pytest support for asyncio" +optional = false +python-versions = ">=3.9" +files = [ + {file = "pytest_asyncio-0.25.2-py3-none-any.whl", hash = "sha256:0d0bb693f7b99da304a0634afc0a4b19e49d5e0de2d670f38dc4bfa5727c5075"}, + {file = "pytest_asyncio-0.25.2.tar.gz", hash = 
"sha256:3f8ef9a98f45948ea91a0ed3dc4268b5326c0e7bce73892acc654df4262ad45f"}, +] + +[package.dependencies] +pytest = ">=8.2,<9" + +[package.extras] +docs = ["sphinx (>=5.3)", "sphinx-rtd-theme (>=1)"] +testing = ["coverage (>=6.2)", "hypothesis (>=5.7.1)"] + +[[package]] +name = "pytest-timeout" +version = "2.3.1" +description = "pytest plugin to abort hanging tests" +optional = false +python-versions = ">=3.7" +files = [ + {file = "pytest-timeout-2.3.1.tar.gz", hash = "sha256:12397729125c6ecbdaca01035b9e5239d4db97352320af155b3f5de1ba5165d9"}, + {file = "pytest_timeout-2.3.1-py3-none-any.whl", hash = "sha256:68188cb703edfc6a18fad98dc25a3c61e9f24d644b0b70f33af545219fc7813e"}, +] + +[package.dependencies] +pytest = ">=7.0.0" + [[package]] name = "python-dateutil" version = "2.9.0.post0" @@ -2145,4 +2234,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.12" -content-hash = "b27c293b4d560c660121cd850272c396be898e7efe45059487f89d3556876194" +content-hash = "e741490d4b632d25e3c8932ed559ce3421aa2a284eb6df9de0f2b70a884241ab" diff --git a/pyproject.toml b/pyproject.toml index 64c99b7..a5c0612 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,9 @@ beautifulsoup4 = "^4.12.3" ruff = "^0.8.6" mypy = "^1.14.1" types-beautifulsoup4 = "^4.12.0.20241020" +pytest = "^8.3.4" +pytest-asyncio = "^0.25.2" +pytest-timeout = "^2.3.1" [build-system] requires = ["poetry-core"] @@ -104,6 +107,7 @@ max-branches = 18 [tool.pytest.ini_options] addopts = "-ra" asyncio_mode = "auto" +asyncio_default_fixture_loop_scope = "function" timeout = 1200 [tool.mypy] diff --git a/src/__main__.py b/src/__main__.py index 8a11883..65563a2 100644 --- a/src/__main__.py +++ b/src/__main__.py @@ -2,5 +2,5 @@ from .main import main -# Execute the Actor entry point. +# Execute the llms.txt generator actor entry point. 
asyncio.run(main()) diff --git a/src/helpers.py b/src/helpers.py index b0d415c..41a38af 100644 --- a/src/helpers.py +++ b/src/helpers.py @@ -8,11 +8,12 @@ from bs4.element import NavigableString from src.crawler_config import CRAWLER_CONFIG -from src.renderer import render if TYPE_CHECKING: from apify_client.clients import KeyValueStoreClientAsync +# not using Actor.log because pytest then throws a warning +# about non existent event loop logger = logging.getLogger('apify') @@ -49,11 +50,6 @@ async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_u return get_description_from_html(html) -def render_llms_txt(data: dict) -> str: - """Renders the `llms.txt` file using the provided data.""" - return render(data) - - def get_crawler_actor_config(url: str, max_crawl_depth: int = 1) -> dict: """Creates actor input configuration for the `apify/website-content-crawler` actor.""" config = CRAWLER_CONFIG diff --git a/src/main.py b/src/main.py index 5385c46..22e4be8 100644 --- a/src/main.py +++ b/src/main.py @@ -1,37 +1,28 @@ -"""This module defines the main entry point for the Apify Actor. +"""This module defines the main entry point for the llsm.txt generator actor.""" -To build Apify Actors, utilize the Apify SDK toolkit, read more at the official documentation: -https://docs.apify.com/sdk/python -""" - -# Apify SDK - A toolkit for building Apify Actors. 
Read more at: -# https://docs.apify.com/sdk/python import logging from typing import TYPE_CHECKING from urllib.parse import urlparse from apify import Actor -from .helpers import get_crawler_actor_config, get_description_from_kvstore, is_description_suitable, render_llms_txt +from .helpers import get_crawler_actor_config, get_description_from_kvstore, is_description_suitable +from .renderer import render_llms_txt if TYPE_CHECKING: - from src.types import SectionDict + from src.mytypes import LLMSData, SectionDict logger = logging.getLogger('apify') async def main() -> None: - """Main entry point for the Apify Actor. - - This coroutine is executed using `asyncio.run()`, so it must remain an asynchronous function for proper execution. - Asynchronous execution is required for communication with Apify platform, and it also enhances performance in - the field of web scraping significantly. - """ + """Main entry point for the llms.txt generator actor.""" async with Actor: actor_input = await Actor.get_input() url = actor_input.get('startUrl') if url is None: - raise ValueError('Missing "startUrl" attribute in input!') + msg = 'Missing "startUrl" attribute in input!' + raise ValueError(msg) max_crawl_depth = int(actor_input.get('maxCrawlDepth', 1)) @@ -44,7 +35,8 @@ async def main() -> None: memory_mbytes=4096, ) if actor_run_details is None: - raise RuntimeError('Failed to start the "apify/website-content-crawler" actor!') + msg = 'Failed to start the "apify/website-content-crawler" actor!' 
+ raise RuntimeError(msg) run_client = Actor.apify_client.run(actor_run_details.id) run_store = run_client.key_value_store() @@ -53,11 +45,13 @@ async def main() -> None: hostname = urlparse(url).hostname root_title = hostname - data = {'title': root_title, 'description': None, 'sections': []} + data: LLMSData = {'title': root_title, 'description': None, 'details': None, 'sections': []} # add all pages to index section for now section: SectionDict = {'title': 'Index', 'links': []} + is_dataset_empty = True async for item in run_dataset.iterate_items(): + is_dataset_empty = False item_url = item.get('url') logger.info(f'Processing page: {item_url}') if item_url is None: @@ -87,6 +81,13 @@ async def main() -> None: section['links'].append({'url': item_url, 'title': title, 'description': description}) + if is_dataset_empty: + msg = ( + 'No pages were crawled successfully!' + 'Please check the "apify/website-content-crawler" actor run for more details.' + ) + raise RuntimeError(msg) + if section['links']: data['sections'].append(section) diff --git a/src/types.py b/src/mytypes.py similarity index 64% rename from src/types.py rename to src/mytypes.py index 9dfbfc5..733405a 100644 --- a/src/types.py +++ b/src/mytypes.py @@ -16,3 +16,12 @@ class SectionDict(TypedDict): title: str links: list[LinkDict] + + +class LLMSData(TypedDict): + """Dictionary representing the data structure of the `llms.txt` file.""" + + title: str + description: str | None + details: str | None + sections: list[SectionDict] diff --git a/src/renderer.py b/src/renderer.py index ec202d4..08198e4 100644 --- a/src/renderer.py +++ b/src/renderer.py @@ -1,4 +1,7 @@ -def render(data: dict) -> str: +from src.mytypes import LLMSData + + +def render_llms_txt(data: LLMSData) -> str: """Generates llms.txt file from the provided data. 
Example data: @@ -27,20 +30,20 @@ def render(data: dict) -> str: - [Example](https://example.com): Example description """ - result = f"# {data['title']}\n\n" + result = [f"# {data['title']}\n\n"] if data.get('description'): - result += f"> {data['description']}\n\n" + result.append(f"> {data['description']}\n\n") if data.get('details'): - result += f"{data['details']}\n\n" + result.append(f"{data['details']}\n\n") for section in data.get('sections', []): - result += f"## {section['title']}\n\n" + result.append(f"## {section['title']}\n\n") for link in section.get('links', []): - result += f"- [{link['title']}]({link['url']})" + link_str = f"- [{link['title']}]({link['url']})" if link.get('description'): - result += f": {link['description']}" - result += '\n' + link_str += f": {link['description']}" + result.append(f'{link_str}\n') - return result + return ''.join(result) diff --git a/tests/test_html.py b/tests/test_html.py index cfc1b23..76daf03 100644 --- a/tests/test_html.py +++ b/tests/test_html.py @@ -1,17 +1,16 @@ -import unittest - from src.helpers import get_description_from_html -class HtmlUnitTests(unittest.TestCase): - def test_description_meta_tag(self) -> None: - html = '' - assert get_description_from_html(html) == 'testdesc' +def test_description_meta_tag() -> None: + html = '' + assert get_description_from_html(html) == 'testdesc' + + +def test_description_meta_tag_with_capital_d() -> None: + html = '' + assert get_description_from_html(html) == 'testdec' - def test_description_meta_tag_with_capital_d(self) -> None: - html = '' - assert get_description_from_html(html) == 'testdec' - def test_no_description_meta_tag(self) -> None: - html = '' - assert get_description_from_html(html) is None +def test_no_description_meta_tag() -> None: + html = '' + assert get_description_from_html(html) is None diff --git a/tests/test_renderer.py b/tests/test_renderer.py index dbfc751..82d79c7 100644 --- a/tests/test_renderer.py +++ b/tests/test_renderer.py @@ -1,52 
+1,49 @@ -import unittest - -from src.helpers import render_llms_txt - - -class RenderUnitTests(unittest.TestCase): - def test_render_llms_txt(self) -> None: - data = { - 'title': 'docs.apify.com', - 'sections': [ - { - 'title': 'Index', - 'links': [ - { - 'url': 'https://docs.apify.com/academy', - 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' - } - ] - } - ] - } - - expected_output = """# docs.apify.com +from src.renderer import render_llms_txt + + +def test_render_llms_txt() -> None: + data = { + 'title': 'docs.apify.com', + 'sections': [ + { + 'title': 'Index', + 'links': [ + { + 'url': 'https://docs.apify.com/academy', + 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' + } + ] + } + ] + } + + expected_output = """# docs.apify.com ## Index - [Web Scraping Academy](https://docs.apify.com/academy): Learn everything about web scraping. """ - assert render_llms_txt(data) == expected_output - - def test_render_llms_txt_with_description(self) -> None: - data = { - 'title': 'docs.apify.com', - 'description': 'Apify documentation', - 'sections': [ - { - 'title': 'Index', - 'links': [ - { - 'url': 'https://docs.apify.com/academy', - 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' - } - ] - } - ] - } - - expected_output = """# docs.apify.com + assert render_llms_txt(data) == expected_output + +def test_render_llms_txt_with_description() -> None: + data = { + 'title': 'docs.apify.com', + 'description': 'Apify documentation', + 'sections': [ + { + 'title': 'Index', + 'links': [ + { + 'url': 'https://docs.apify.com/academy', + 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' 
+ } + ] + } + ] + } + + expected_output = """# docs.apify.com > Apify documentation @@ -55,27 +52,27 @@ def test_render_llms_txt_with_description(self) -> None: - [Web Scraping Academy](https://docs.apify.com/academy): Learn everything about web scraping. """ - assert render_llms_txt(data) == expected_output - - def test_render_llms_txt_with_description_and_details(self) -> None: - data = { - 'title': 'docs.apify.com', - 'description': 'Apify documentation', - 'details': 'This is the documentation for Apify', - 'sections': [ - { - 'title': 'Index', - 'links': [ - { - 'url': 'https://docs.apify.com/academy', - 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' - } - ] - } - ] - } - - expected_output = """# docs.apify.com + assert render_llms_txt(data) == expected_output + +def test_render_llms_txt_with_description_and_details() -> None: + data = { + 'title': 'docs.apify.com', + 'description': 'Apify documentation', + 'details': 'This is the documentation for Apify', + 'sections': [ + { + 'title': 'Index', + 'links': [ + { + 'url': 'https://docs.apify.com/academy', + 'title': 'Web Scraping Academy', 'description': 'Learn everything about web scraping.' + } + ] + } + ] + } + + expected_output = """# docs.apify.com > Apify documentation @@ -86,18 +83,18 @@ def test_render_llms_txt_with_description_and_details(self) -> None: - [Web Scraping Academy](https://docs.apify.com/academy): Learn everything about web scraping. 
""" - assert render_llms_txt(data) == expected_output + assert render_llms_txt(data) == expected_output - def test_render_llms_txt_with_no_sections(self) -> None: - data = { - 'title': 'docs.apify.com', - 'description': 'Apify documentation', - } +def test_render_llms_txt_with_no_sections() -> None: + data = { + 'title': 'docs.apify.com', + 'description': 'Apify documentation', + } - expected_output = """# docs.apify.com + expected_output = """# docs.apify.com > Apify documentation """ - assert render_llms_txt(data) == expected_output + assert render_llms_txt(data) == expected_output diff --git a/tests/tests.py b/tests/tests.py deleted file mode 100644 index 95ae296..0000000 --- a/tests/tests.py +++ /dev/null @@ -1,7 +0,0 @@ -import unittest - -from .test_html import HtmlUnitTests # noqa: F401 -from .test_renderer import RenderUnitTests # noqa: F401 - -if __name__ == '__main__': - unittest.main() From 27bb2818a470ca368caeaec44f141ac65866c837 Mon Sep 17 00:00:00 2001 From: MQ Date: Thu, 16 Jan 2025 11:17:45 +0100 Subject: [PATCH 8/9] fix readme, tool -> Actor, capitalize Actor --- README.md | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/README.md b/README.md index 7cbaf6d..e7d3c21 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # llms.txt generator πŸš€πŸ“„ -The **llms.txt generator** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This tool leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**. 
+The **llms.txt generator** is an Apify Actor that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This Actor leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**. ## 🌟 What is llms.txt? @@ -34,13 +34,13 @@ By adding an **llms.txt** file to your website, you make it easy for AI systems ## 🎯 Features of llms.txt generator -Our actor is designed to simplify and automate the creation of **llms.txt** files. Here are its key features: +Our Actor is designed to simplify and automate the creation of **llms.txt** files. Here are its key features: -- **Deep website crawling**: Extracts content from multi-level websites using the powerful [Crawlee](https://crawlee.dev) library and the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor. +- **Deep website crawling**: Extracts content from multi-level websites using the powerful [Crawlee](https://crawlee.dev) library and the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor. - **Content extraction**: Retrieves key metadata such as titles, descriptions, and URLs for seamless integration. - **File generation**: Saves the output in the standardized **llms.txt** format. -- **Downloadable output**: The **llms.txt** file can be downloaded from the **key-value store** in the storage section of the actor run details. -- **Resource management**: Limits the crawler actor to 4 GB of memory to ensure compatibility with the free tier, which has an 8 GB limit. 
Note that this may slow down the crawling process. +- **Downloadable output**: The **llms.txt** file can be downloaded from the **key-value store** in the storage section of the Actor run details. +- **Resource management**: Limits the crawler Actor to 4 GB of memory to ensure compatibility with the free tier, which has an 8 GB limit. Note that this may slow down the crawling process. --- @@ -48,7 +48,7 @@ Our actor is designed to simplify and automate the creation of **llms.txt** file 1. **Input**: Provide the start URL of the website to crawl. 2. **Configuration**: Set the maximum crawl depth and other options (optional). -3. **Output**: The actor generates a structured **llms.txt** file with extracted content, ready for AI applications. +3. **Output**: The Actor generates a structured **llms.txt** file with extracted content, ready for AI applications. ### Input example @@ -103,7 +103,7 @@ Our actor is designed to simplify and automate the creation of **llms.txt** file - [Apify platform](https://apify.com) - [Apify SDK documentation](https://docs.apify.com/sdk/python) - [Crawlee library](https://crawlee.dev) -- [llms.txt proposal](https://example.com/llms-txt-proposal) +- [llms.txt proposal](https://llmstxt.org) --- From 3b5f9b12100e061bf818ae62408760660b00fa92 Mon Sep 17 00:00:00 2001 From: MQ Date: Thu, 16 Jan 2025 11:21:07 +0100 Subject: [PATCH 9/9] add top description llms.txt org link --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e7d3c21..bf64815 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # llms.txt generator πŸš€πŸ“„ -The **llms.txt generator** is an Apify Actor that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. 
This Actor leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**.
+The **llms.txt generator** is an Apify Actor that helps you extract essential website content and generate an [llms.txt](https://llmstxt.org/) file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This Actor leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**.
 
 ## 🌟 What is llms.txt?
 
@@ -103,7 +103,7 @@ Our Actor is designed to simplify and automate the creation of **llms.txt** file
 - [Apify platform](https://apify.com)
 - [Apify SDK documentation](https://docs.apify.com/sdk/python)
 - [Crawlee library](https://crawlee.dev)
-- [llms.txt proposal](https://llmstxt.org)
+- [llms.txt proposal](https://llmstxt.org/)
 
 ---