-
Notifications
You must be signed in to change notification settings - Fork 1
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
fix/review-issue-2 #4
Changes from all commits
3c97281
13b5f36
0b00e65
d3729ef
fdacc57
82ac951
182e388
27bb281
3b5f9b1
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
[tool.poetry] | ||
name = "apify-llmstxt-generator" | ||
version = "0.1.0" | ||
description = "llms.txt generator actor" | ||
description = "llms.txt generator" | ||
authors = ["Jakub Kopecky <[email protected]>"] | ||
readme = "README.md" | ||
package-mode = false | ||
|
@@ -15,6 +15,9 @@ beautifulsoup4 = "^4.12.3" | |
ruff = "^0.8.6" | ||
mypy = "^1.14.1" | ||
types-beautifulsoup4 = "^4.12.0.20241020" | ||
pytest = "^8.3.4" | ||
pytest-asyncio = "^0.25.2" | ||
pytest-timeout = "^2.3.1" | ||
|
||
[build-system] | ||
requires = ["poetry-core"] | ||
|
@@ -104,6 +107,7 @@ max-branches = 18 | |
[tool.pytest.ini_options] | ||
addopts = "-ra" | ||
asyncio_mode = "auto" | ||
asyncio_default_fixture_loop_scope = "function" | ||
timeout = 1200 | ||
|
||
[tool.mypy] | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,11 +8,14 @@ | |
from bs4.element import NavigableString | ||
|
||
from src.crawler_config import CRAWLER_CONFIG | ||
from src.renderer import render | ||
|
||
if TYPE_CHECKING: | ||
from apify_client.clients import KeyValueStoreClientAsync | ||
|
||
# not using Actor.log because pytest then throws a warning | ||
# about non existent event loop | ||
logger = logging.getLogger('apify') | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can just use There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Using |
||
|
||
|
||
def get_hostname_path_string_from_url(url: str) -> str: | ||
"""Extracts the hostname and path from the URL.""" | ||
|
@@ -37,23 +40,16 @@ def is_description_suitable(description: str | None) -> bool: | |
async def get_description_from_kvstore(kvstore: KeyValueStoreClientAsync, html_url: str) -> str | None: | ||
"""Extracts the description from the HTML content stored in the KV store.""" | ||
store_id = html_url.split('records/')[-1] | ||
record = await kvstore.get_record(store_id) | ||
if record is None: | ||
logging.warning(f'Failed to get record with id "{store_id}"!') | ||
if not (record := await kvstore.get_record(store_id)): | ||
logger.warning(f'Failed to get record with id "{store_id}"!') | ||
return None | ||
html = record.get('value') | ||
if html is None or not isinstance(html, str): | ||
logging.warning(f'Invalid HTML content for record with id "{store_id}"!') | ||
if not (html := record.get('value')) or not isinstance(html, str): | ||
logger.warning(f'Invalid HTML content for record with id "{store_id}"!') | ||
return None | ||
|
||
return get_description_from_html(html) | ||
|
||
|
||
def render_llms_txt(data: dict) -> str: | ||
"""Renders the `llms.txt` file using the provided data.""" | ||
return render(data) | ||
|
||
|
||
def get_crawler_actor_config(url: str, max_crawl_depth: int = 1) -> dict: | ||
"""Creates actor input configuration for the `apify/website-content-crawler` actor.""" | ||
config = CRAWLER_CONFIG | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's better to use the `requestListSources` editor in this case - see https://docs.apify.com/platform/actors/development/actor-definition/input-schema/specification/v1#array. You may use `apify.RequestList` to process it afterwards. Unless you really want to support just one URL every time, of course.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Because of how the format of `llms.txt` is specified, I think the single-URL input is more suitable - we treat it like an index for the whole site (or sub-site), so the URL should act as a root (or sub-root).