Skip to content

Commit

Permalink
when checking if url is input root url normalize the url without the …
Browse files Browse the repository at this point in the history
…trailing /
  • Loading branch information
MQ37 committed Jan 17, 2025
1 parent c274710 commit 743b061
Show file tree
Hide file tree
Showing 4 changed files with 39 additions and 7 deletions.
7 changes: 7 additions & 0 deletions src/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,13 @@
logger = logging.getLogger('apify')


def normalize_url(url: str) -> str:
    """Normalize a URL so that equivalent spellings of it compare equal.

    Trailing slashes are removed from the path and the host is lower-cased,
    because host names are case-insensitive (RFC 3986, section 6.2.2.1).
    Userinfo, the rest of the path, query and fragment are left untouched.

    Args:
        url: The URL to normalize.

    Returns:
        The normalized URL string.
    """
    parsed_url = urlparse(url)
    # Only the host[:port] part is case-insensitive; anything before the last
    # '@' is userinfo (credentials) and must keep its original case.
    userinfo, separator, host_port = parsed_url.netloc.rpartition('@')
    normalized = parsed_url._replace(
        netloc=f'{userinfo}{separator}{host_port.lower()}',
        path=parsed_url.path.rstrip('/'),
    )
    return normalized.geturl()


def get_hostname_path_string_from_url(url: str) -> str:
"""Extracts the hostname and path from the URL."""
parsed_url = urlparse(url)
Expand Down
7 changes: 4 additions & 3 deletions src/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@

from apify import Actor

from .helpers import get_crawler_actor_config, get_description_from_kvstore, is_description_suitable
from .helpers import get_crawler_actor_config, get_description_from_kvstore, is_description_suitable, normalize_url
from .renderer import render_llms_txt

if TYPE_CHECKING:
Expand All @@ -23,6 +23,7 @@ async def main() -> None:
if url is None:
msg = 'Missing "startUrl" attribute in input!'
raise ValueError(msg)
url_normalized = normalize_url(url)

max_crawl_depth = int(actor_input.get('maxCrawlDepth', 1))

Expand Down Expand Up @@ -62,7 +63,7 @@ async def main() -> None:
logger.warning('Missing "htmlUrl" attribute in dataset item!')
continue

is_root = item_url == url
is_root = normalize_url(item_url) == url_normalized
if is_root:
description = await get_description_from_kvstore(run_store, html_url)
data['description'] = description if is_description_suitable(description) else None
Expand All @@ -84,7 +85,7 @@ async def main() -> None:
if is_dataset_empty:
msg = (
'No pages were crawled successfully!'
'Please check the "apify/website-content-crawler" actor run for more details.'
' Please check the "apify/website-content-crawler" actor run for more details.'
)
raise RuntimeError(msg)

Expand Down
14 changes: 14 additions & 0 deletions tests/test_helpers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
from src.helpers import get_hostname_path_string_from_url, normalize_url


def test_normalize_url() -> None:
    """normalize_url drops the trailing slash of a bare root URL."""
    assert normalize_url('https://example.com/') == 'https://example.com'

def test_get_hostname_path_string_from_url() -> None:
    """The hostname/path string keeps the path as given, trailing slash included."""
    without_slash = 'https://example.com/path'
    with_slash = 'https://example.com/path/'

    assert get_hostname_path_string_from_url(without_slash) == 'example.com/path'
    assert get_hostname_path_string_from_url(with_slash) == 'example.com/path/'
18 changes: 14 additions & 4 deletions tests/test_renderer.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,16 @@
from typing import TYPE_CHECKING

from src.renderer import render_llms_txt

if TYPE_CHECKING:
from src.mytypes import LLMSData


def test_render_llms_txt() -> None:
data = {
data: LLMSData = {
'title': 'docs.apify.com',
'details': None,
'description': None,
'sections': [
{
'title': 'Index',
Expand All @@ -27,9 +34,10 @@ def test_render_llms_txt() -> None:
assert render_llms_txt(data) == expected_output

def test_render_llms_txt_with_description() -> None:
data = {
data: LLMSData = {
'title': 'docs.apify.com',
'description': 'Apify documentation',
'details': None,
'sections': [
{
'title': 'Index',
Expand All @@ -55,7 +63,7 @@ def test_render_llms_txt_with_description() -> None:
assert render_llms_txt(data) == expected_output

def test_render_llms_txt_with_description_and_details() -> None:
data = {
data: LLMSData = {
'title': 'docs.apify.com',
'description': 'Apify documentation',
'details': 'This is the documentation for Apify',
Expand Down Expand Up @@ -86,9 +94,11 @@ def test_render_llms_txt_with_description_and_details() -> None:
assert render_llms_txt(data) == expected_output

def test_render_llms_txt_with_no_sections() -> None:
data = {
data: LLMSData = {
'title': 'docs.apify.com',
'description': 'Apify documentation',
'details': None,
'sections': []
}

expected_output = """# docs.apify.com
Expand Down

0 comments on commit 743b061

Please sign in to comment.