MQ37 · MQ37 · Jan 16, 2025 · Jan 13, 2025 · Jan 13, 2025 · Jan 13, 2025
diff --git a/.actor/actor.json b/.actor/actor.json
@@ -1,6 +1,6 @@
 {
   "actorSpecification": 1,
-  "name": "llmstxt-generator-actor",
+  "name": "llmstxt-generator",
   "title": "Generate llms.txt for the given site",
   "description": "Generates llms.txt for the given site",
   "version": "0.0",

diff --git a/.actor/input_schema.json b/.actor/input_schema.json
@@ -3,10 +3,10 @@
   "type": "object",
   "schemaVersion": 1,
   "properties": {
-    "url": {
-      "title": "URL of the site",
+    "startUrl": {
+      "title": "Start URL",
       "type": "string",
-      "description": "The URL of website you want to get the llm.txt generated for.",
+      "description": "The URL from which the crawler will start to generate the llms.txt file.",
       "editor": "textfield",
       "prefill": "https://docs.apify.com/"
     },
@@ -18,5 +18,5 @@
       "default": 1
     }
   },
-  "required": ["url"]
+  "required": ["startUrl"]
 }
diff --git a/Makefile b/Makefile
@@ -20,4 +20,4 @@ format:
 	poetry run ruff format $(DIRS_WITH_CODE)
 
 unit-test:
-	poetry run python -m unittest tests.tests
+	poetry run -C tests/ pytest
diff --git a/README.md b/README.md
@@ -1,21 +1,21 @@
-# llms.txt Generator Actor 🚀📄
+# llms.txt generator 🚀📄
 
-The **llms.txt Generator Actor** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA.
+The **llms.txt generator** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This tool leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**.
 
-The **llms.txt generator** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This tool leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**.
+The **llms.txt generator** is an Apify Actor that helps you extract essential website content and generate an [llms.txt](https://llmstxt.org/) file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This Actor leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt** format.
-The **llms.txt generator** is an Apify tool that helps you extract essential website content and generate an **llms.txt** file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This tool leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt**.
+The **llms.txt generator** is an Apify Actor that helps you extract essential website content and generate an [llms.txt](https://llmstxt.org/) file, making your content ready for AI-powered applications such as fine-tuning, indexing, and integrating large language models (LLMs) like GPT-4, ChatGPT, or LLaMA. This Actor leverages the [Website Content Crawler](https://apify.com/apify/website-content-crawler) Actor to perform deep crawls and extract text content from web pages, ensuring comprehensive data collection. The Website Content Crawler is particularly useful because it supports output in multiple formats, including markdown, which is used by the **llms.txt** format.
 ## 🌟 What is llms.txt?
 
 The **llms.txt** format is a markdown-based standard for providing AI-friendly content. It contains:
 
 - **Brief background information** and guidance.
 - **Links to additional resources** in markdown format.
-- A simple, AI-focused structure to help coders, researchers, and AI models easily access and use website content.
+- **AI-focused** structure to help coders, researchers, and AI models easily access and use website content.
 
-Here’s a mock example:
+Proposed structure:
 
 ```
 # Title
 
-> Optional description goes here
+> Optional description
 
 Optional details go here
 
@@ -32,58 +32,64 @@ By adding an **llms.txt** file to your website, you make it easy for AI systems
 
 ---
 
-## 🎯 Features of llms.txt Generator
+## 🎯 Features of llms.txt generator
 
 Our actor is designed to simplify and automate the creation of **llms.txt** files. Here are its key features:
 
-- **Deep Website Crawling**: Extracts content from multi-level websites using the powerful [Crawlee](https://crawlee.dev) library.
-- **Content Extraction**: Retrieves key metadata such as titles, descriptions, and URLs for seamless integration.
-- **File Generation**: Saves the output in the standardized **llms.txt** format.
-- **Downloadable Output**: The **llms.txt** file can be downloaded from the **Key-Value Store** in the Storage section of the actor run details.
+- **Deep website crawling**: Extracts content from multi-level websites using the powerful [Crawlee](https://crawlee.dev) library and the [Website Content Crawler](https://apify.com/apify/website-content-crawler) actor.
+- **Content extraction**: Retrieves key metadata such as titles, descriptions, and URLs for seamless integration.
+- **File generation**: Saves the output in the standardized **llms.txt** format.
+- **Downloadable output**: The **llms.txt** file can be downloaded from the **key-value store** in the storage section of the actor run details.
+- **Resource management**: Limits the crawler actor to 4 GB of memory to ensure compatibility with the free tier, which has an 8 GB limit. Note that this may slow down the crawling process.
 
 ---
 
-## 🚀 How It Works
+## 🚀 How it works
 
-1. **Input**: Provide the URL of the website to crawl.
+1. **Input**: Provide the start URL of the website to crawl.
 2. **Configuration**: Set the maximum crawl depth and other options (optional).
 3. **Output**: The actor generates a structured **llms.txt** file with extracted content, ready for AI applications.
 
-### Input Example
+### Input example
 
 ```json
 {
-  "url": "https://example.com",
-  "maxCrawlDepth": 2
+  "startUrl": "https://docs.apify.com",
+  "maxCrawlDepth": 1
 }
 ```
 
-### Output Example (llms.txt)
+### Output example (llms.txt)
 
 ```
-# Example Website
-
-> A brief description of the website goes here.
+# docs.apify.com
 
 ## Index
 
-- [Home](https://example.com): Welcome to our website!
-- [Docs](https://example.com/docs): Comprehensive documentation.
-- [Blog](https://example.com/blog): Latest updates and articles.
+- [Home | Platform | Apify Documentation](https://docs.apify.com/platform): Apify is your one-stop shop for web scraping, data extraction, and RPA. Automate anything you can do manually in a browser.
+- [Web Scraping Academy | Academy | Apify Documentation](https://docs.apify.com/academy): Learn everything about web scraping and automation with our free courses that will turn you into an expert scraper developer.
+- [Apify Documentation](https://docs.apify.com/api)
+- [API scraping | Academy | Apify Documentation](https://docs.apify.com/academy/api-scraping): Learn all about how the professionals scrape various types of APIs with various configurations, parameters, and requirements.
+- [API client for JavaScript | Apify Documentation](https://docs.apify.com/api/client/js/)
+- [Apify API | Apify Documentation](https://docs.apify.com/api/v2)
+- [API client for Python | Apify Documentation](https://docs.apify.com/api/client/python/)
+...
+
 ```
 
+
 ---
 
-## ✨ Why Use llms.txt Generator?
+## ✨ Why use llms.txt generator?
 
-- **Save Time**: Automates the tedious process of extracting, formatting, and organizing web content.
-- **Boost AI Performance**: Provides clean, structured data for LLMs and AI-powered tools.
-- **Future-Proof**: Follows a standardized format that’s gaining adoption in the AI community.
-- **User-Friendly**: Easy integration into customer-facing products, allowing users to generate **llms.txt** files effortlessly.
+- **Save time**: Automates the tedious process of extracting, formatting, and organizing web content.
+- **Boost AI performance**: Provides clean, structured data for LLMs and AI-powered tools.
+- **Future-proof**: Follows a standardized format that’s gaining adoption in the AI community.
+- **User-friendly**: Easy integration into customer-facing products, allowing users to generate **llms.txt** files effortlessly.
 
 ---
 
-## 🔧 Technical Highlights
+## 🔧 Technical highlights
 
 - Built on the [Apify SDK](https://docs.apify.com/sdk/python), leveraging state-of-the-art web scraping tools.
 - Designed to handle JavaScript-heavy websites using headless browsers.
@@ -92,12 +98,12 @@ Our actor is designed to simplify and automate the creation of **llms.txt** file
 
 ---
 
-## 📖 Learn More
+## 📖 Learn more
 
-- [Apify Platform](https://apify.com)
-- [Apify SDK Documentation](https://docs.apify.com/sdk/python)
-- [Crawlee Library](https://crawlee.dev)
-- [llms.txt Proposal](https://example.com/llms-txt-proposal)
+- [Apify platform](https://apify.com)
+- [Apify SDK documentation](https://docs.apify.com/sdk/python)
+- [Crawlee library](https://crawlee.dev)
+- [llms.txt proposal](https://llmstxt.org/)
 
 ---
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,7 +1,7 @@
 [tool.poetry]
 name = "apify-llmstxt-generator"
 version = "0.1.0"
-description = "llms.txt generator actor"
+description = "llms.txt generator"
 authors = ["Jakub Kopecky <[email protected]>"]
 readme = "README.md"
 package-mode = false
@@ -15,6 +15,9 @@ beautifulsoup4 = "^4.12.3"
 ruff = "^0.8.6"
 mypy = "^1.14.1"
 types-beautifulsoup4 = "^4.12.0.20241020"
+pytest = "^8.3.4"
+pytest-asyncio = "^0.25.2"
+pytest-timeout = "^2.3.1"
 
 [build-system]
 requires = ["poetry-core"]
@@ -104,6 +107,7 @@ max-branches = 18
 [tool.pytest.ini_options]
 addopts = "-ra"
 asyncio_mode = "auto"
+asyncio_default_fixture_loop_scope = "function"
 timeout = 1200
 
 [tool.mypy]

diff --git a/src/__main__.py b/src/__main__.py
@@ -2,5 +2,5 @@
 
 from .main import main
 
-# Execute the Actor entry point.
+# Execute the llms.txt generator actor entry point.
 asyncio.run(main())
diff --git a/src/crawler_config.py b/src/crawler_config.py
@@ -1,26 +1,9 @@
 CRAWLER_CONFIG = {
-    'aggressivePrune': False,
-    'clickElementsCssSelector': '[aria-expanded="False"]',
-    'clientSideMinChangePercentage': 15,
-    'crawlerType': 'playwright:adaptive',
-    'debugLog': False,
-    'debugMode': False,
-    'expandIframes': True,
     'htmlTransformer': 'none',
-    'ignoreCanonicalUrl': False,
     'keepElementsCssSelector': 'meta[name="description"],meta[name="Description"]\ntitle',
-    'keepUrlFragments': False,
-    # changed by get_crawler_actor_config with defailt value 1
+    # changed by get_crawler_actor_config with default value 1
     'maxCrawlDepth': 0,  # 0 by default for root page only just in case
-    'proxyConfiguration': {'useApifyProxy': True},
-    'readableTextCharThreshold': 100,
-    'removeCookieWarnings': True,
-    'renderingTypeDetectionPercentage': 10,
-    'saveFiles': False,
-    'saveHtml': False,
     'saveHtmlAsFile': True,
-    'saveMarkdown': False,
-    'saveScreenshots': False,
     'startUrls': [
         # is populated by get_crawler_actor_config
     ],

diff --git a/src/helpers.py b/src/helpers.py
@@ -8,11 +8,14 @@
 from bs4.element import NavigableString
 
 from src.crawler_config import CRAWLER_CONFIG
-from src.renderer import render
 
 if TYPE_CHECKING:
     from apify_client.clients import KeyValueStoreClientAsync
 
+# Not using Actor.log because pytest then emits a warning
+# about a non-existent event loop.
+logger = logging.getLogger('apify')
+
 
 def get_hostname_path_string_from_url(url: str) -> str:
     """Extracts the hostname and path from the URL."""
@@ -37,23 +40,16 @@ def is_description_suitable(description: str | None) -> bool:
 async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_url: str) -> str | None:
     """Extracts the description from the HTML content stored in the KV store."""
     store_id = html_url.split('records/')[-1]
-    record = await kvstore.get_record(store_id)
-    if record is None:
-        logging.warning(f'Failed to get record with id "{store_id}"!')
+    if not (record := await kvstore.get_record(store_id)):
+        logger.warning(f'Failed to get record with id "{store_id}"!')
         return None
-    html = record.get('value')
-    if html is None or not isinstance(html, str):
-        logging.warning(f'Invalid HTML content for record with id "{store_id}"!')
+    if not (html := record.get('value')) or not isinstance(html, str):
+        logger.warning(f'Invalid HTML content for record with id "{store_id}"!')
         return None
 
     return get_description_from_html(html)
 
 
-def render_llms_txt(data: dict) -> str:
-    """Renders the `llms.txt` file using the provided data."""
-    return render(data)
-
-
 def get_crawler_actor_config(url: str, max_crawl_depth: int = 1) -> dict:
     """Creates actor input configuration for the `apify/website-content-crawler` actor."""
     config = CRAWLER_CONFIG