diff --git a/README.md b/README.md
index e3b2310..5c8b7be 100644
--- a/README.md
+++ b/README.md
@@ -145,7 +145,7 @@
 crawl-docs https://example.com / -o output.md \
     --allowed-paths "/docs/" "/api/"
 ```
 
-#### Skipping Pre-Fetch and Post-Fetch URLs with Ignore Paths
+#### Skipping URLs with Ignore Paths
 
 ```bash
diff --git a/src/libcrawler/libcrawler.py b/src/libcrawler/libcrawler.py
index c7ac686..ef80850 100644
--- a/src/libcrawler/libcrawler.py
+++ b/src/libcrawler/libcrawler.py
@@ -138,8 +138,8 @@ def remove_common_elements(soup, extra_remove_selectors=None):
     return soup
 
 
-def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True, headers={}, delay=1, delay_range=0.5,
-               extra_remove_selectors=None, allowed_paths=None, ignore_paths=[]):
+def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
+               headers={}, delay=1, delay_range=0.5, extra_remove_selectors=None, allowed_paths=None, ignore_paths=None):
     visited_links = set()
     root = PageNode(start_url)
     node_lookup = {}
@@ -166,6 +166,8 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True, head
             continue
         visited_links.add(current_link)
 
+        if ignore_paths is None:
+            ignore_paths = []
         if any(ignore_path in current_link for ignore_path in ignore_paths):
             logger.info(f"Skipping {current_link} matching ignore_paths")
             continue
@@ -369,7 +371,8 @@ def crawl_and_convert(
     delay_range=0.5,
     extra_remove_selectors=None,
     similarity_threshold=0.8,
-    allowed_paths=None
+    allowed_paths=None,
+    ignore_paths=None
 ):
     # Build the tree and get page_markdowns and url_to_anchor
     page_markdowns, url_to_anchor = build_tree(
@@ -381,7 +384,8 @@ def crawl_and_convert(
         delay=delay,
         delay_range=delay_range,
         extra_remove_selectors=extra_remove_selectors,
-        allowed_paths=allowed_paths
+        allowed_paths=allowed_paths,
+        ignore_paths=ignore_paths
     )
 
     # Deduplicate content
diff --git a/src/tests/test_crawler.py b/src/tests/test_crawler.py
index d361075..bf11fd7 100644
--- a/src/tests/test_crawler.py
+++ b/src/tests/test_crawler.py
@@ -523,12 +523,10 @@ def side_effect(url, headers={}):
             start_url=self.start_url,
             base_url=self.base_url,
             output_filename=self.output_filename,
-            handle_robots_txt=False,
             delay=0,
             delay_range=0,
             extra_remove_selectors=['header', 'footer', '.footer'],
             similarity_threshold=0.6,  # Increased threshold
-            allowed_paths=None,
             headers=headers
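A minimal usage sketch of the new `ignore_paths` parameter, based on the `crawl_and_convert` signature shown in the diff above. The import path, URLs, and path values here are illustrative assumptions, not part of the change itself.

```python
# Hypothetical usage sketch: the import path, URLs, and ignore paths below are
# assumptions for illustration only.
from libcrawler.libcrawler import crawl_and_convert

crawl_and_convert(
    start_url='https://example.com/docs/',
    base_url='https://example.com',
    output_filename='output.md',
    delay=1,
    delay_range=0.5,
    allowed_paths=['/docs/', '/api/'],
    # New in this change: any queued link whose URL contains one of these
    # substrings is skipped before it is fetched. ignore_paths now defaults
    # to None, which build_tree normalizes to an empty list.
    ignore_paths=['/docs/changelog/', '/docs/legacy/'],
)
```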