Skip to content

Commit

Permalink
updated v0.3.3 (#9)
Browse files Browse the repository at this point in the history
* .

* .
  • Loading branch information
robbiemu authored Oct 20, 2024
1 parent dee70eb commit 4be7b2d
Show file tree
Hide file tree
Showing 3 changed files with 9 additions and 7 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,7 +145,7 @@ crawl-docs https://example.com / -o output.md \
--allowed-paths "/docs/" "/api/"
```

#### Skipping Pre-Fetch and Post-Fetch URLs with Ignore Paths
#### Skipping URLs with Ignore Paths

```bash
Expand Down
12 changes: 8 additions & 4 deletions src/libcrawler/libcrawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,8 +138,8 @@ def remove_common_elements(soup, extra_remove_selectors=None):
return soup


def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True, headers={}, delay=1, delay_range=0.5,
extra_remove_selectors=None, allowed_paths=None, ignore_paths=[]):
def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
headers={}, delay=1, delay_range=0.5, extra_remove_selectors=None, allowed_paths=None, ignore_paths=None):
visited_links = set()
root = PageNode(start_url)
node_lookup = {}
Expand All @@ -166,6 +166,8 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True, head
continue
visited_links.add(current_link)

if ignore_paths is None:
ignore_paths = []
if any(ignore_path in current_link for ignore_path in ignore_paths):
logger.info(f"Skipping {current_link} matching ignore_paths")
continue
Expand Down Expand Up @@ -369,7 +371,8 @@ def crawl_and_convert(
delay_range=0.5,
extra_remove_selectors=None,
similarity_threshold=0.8,
allowed_paths=None
allowed_paths=None,
ignore_paths=None
):
# Build the tree and get page_markdowns and url_to_anchor
page_markdowns, url_to_anchor = build_tree(
Expand All @@ -381,7 +384,8 @@ def crawl_and_convert(
delay=delay,
delay_range=delay_range,
extra_remove_selectors=extra_remove_selectors,
allowed_paths=allowed_paths
allowed_paths=allowed_paths,
ignore_paths=ignore_paths
)

# Deduplicate content
Expand Down
2 changes: 0 additions & 2 deletions src/tests/test_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -523,12 +523,10 @@ def side_effect(url, headers={}):
start_url=self.start_url,
base_url=self.base_url,
output_filename=self.output_filename,
handle_robots_txt=False,
delay=0,
delay_range=0,
extra_remove_selectors=['header', 'footer', '.footer'],
similarity_threshold=0.6, # Increased threshold
allowed_paths=None,
headers=headers
)

Expand Down

0 comments on commit 4be7b2d

Please sign in to comment.