From 048c71e27e99d3b0b72c6e25234beb1830b3092b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Roberto=20Tom=C3=A1s=20Collins?=
Date: Sun, 20 Oct 2024 11:09:21 -0400
Subject: [PATCH] v0.3.3

- new flag: --ignore-paths
- new flag: --user-agent
- harmonized user-agent and header flags
- updated readme

---
 README.md                    | 10 ++++++
 src/libcrawler/__main__.py   | 20 +++++++++----
 src/libcrawler/libcrawler.py | 24 +++++++++++----
 src/libcrawler/version.py    |  2 +-
 src/tests/test_crawler.py    | 58 ++++++++++++++++++++++++++++++++++++
 5 files changed, 101 insertions(+), 13 deletions(-)

diff --git a/README.md b/README.md
index 038c8e9..e3b2310 100644
--- a/README.md
+++ b/README.md
@@ -9,6 +9,7 @@ This tool crawls a documentation website and converts the pages into a single Ma
 - Customizable threshold for similarity.
 - Configurable selectors to remove specific elements from pages.
 - Supports robots.txt compliance with an option to ignore it.
+- **NEW in v0.3.3**: Ability to skip URLs based on `--ignore-paths`, both pre-fetch (before requesting content) and post-fetch (after redirects).
 
 ## Installation
 
@@ -113,6 +114,8 @@ crawl-docs BASE_URL STARTING_POINT [OPTIONS]
 - `--remove-selectors SELECTOR [SELECTOR ...]`: Additional CSS selectors to remove from pages.
 - `--similarity-threshold SIMILARITY_THRESHOLD`: Similarity threshold for section comparison (default: 0.8).
 - `--allowed-paths PATH [PATH ...]`: List of URL paths to include during crawling.
+- `--ignore-paths PATH [PATH ...]`: List of URL paths to skip during crawling, either before or after fetching content.
+- `--user-agent USER_AGENT`: Specify a custom User-Agent string (which will be harmonized with any additional headers).
 - `--headers-file FILE`: Path to a JSON file containing optional headers. Only one of `--headers-file` or `--headers-json` can be used.
 - `--headers-json JSON` (JSON string): Optional headers as JSON
 
@@ -142,6 +145,13 @@ crawl-docs https://example.com / -o output.md \
     --allowed-paths "/docs/" "/api/"
 ```
 
+#### Skipping URLs with Ignore Paths (Pre- and Post-Fetch)
+
+```bash
+crawl-docs https://example.com /docs/ -o output.md \
+    --ignore-paths "/old/" "/legacy/"
+```
+
 ### Dependencies
 
 - Python 3.6 or higher
diff --git a/src/libcrawler/__main__.py b/src/libcrawler/__main__.py
index 979ef0c..3592711 100644
--- a/src/libcrawler/__main__.py
+++ b/src/libcrawler/__main__.py
@@ -22,16 +22,22 @@ def main():
                         help='Additional CSS selectors to remove from pages.')
     parser.add_argument('--similarity-threshold', type=float, default=0.6,
                         help='Similarity threshold for section comparison (default: 0.6).')
-    parser.add_argument('--allowed-paths', nargs='*',
+    parser.add_argument('--allowed-paths', nargs='*',
                         help='List of URL paths to include during crawling.')
+    parser.add_argument('--ignore-paths', nargs='*',
+                        help='List of URL paths to exclude from crawling.')
 
-    headers_group = parser.add_mutually_exclusive_group(required=True)
-    headers_group.add_argument('--headers-file', type=str, help='Path to a JSON file containing headers. Only one of --headers-file or --headers-json can be used.')
-    headers_group.add_argument('--headers-json', type=json.loads, help='Raw JSON string representing the headers. Only one of --headers-file or --headers-json can be used.')
+    parser.add_argument('--user-agent', type=str, help='Custom User-Agent string.')
+    headers_group = parser.add_mutually_exclusive_group()
+    headers_group.add_argument('--headers-file', type=str,
+                               help='Path to a JSON file containing headers.')
+    headers_group.add_argument('--headers-json', type=json.loads,
+                               help='Raw JSON string representing the headers.')
 
     args = parser.parse_args()
 
-    headers = {}
+    # Adjust logic for handling headers
+    headers = None
     if args.headers_file:
         try:
             with open(args.headers_file, 'r') as file:
@@ -48,6 +54,7 @@ def main():
 
     start_url = urljoin(args.base_url, args.starting_point)
 
+    # Adjust crawl_and_convert call to handle ignore-paths and optional headers
     crawl_and_convert(
         start_url=start_url,
         base_url=args.base_url,
@@ -59,7 +66,8 @@ def main():
         delay_range=args.delay_range,
         extra_remove_selectors=args.remove_selectors,
         similarity_threshold=args.similarity_threshold,
-        allowed_paths=args.allowed_paths
+        allowed_paths=args.allowed_paths,
+        ignore_paths=args.ignore_paths  # Pass the ignore-paths argument
     )
 
 
diff --git a/src/libcrawler/libcrawler.py b/src/libcrawler/libcrawler.py
index a1fbb44..c7ac686 100644
--- a/src/libcrawler/libcrawler.py
+++ b/src/libcrawler/libcrawler.py
@@ -53,8 +53,12 @@ def normalize_url(url):
     return normalized_url
 
 
-def fetch_content(url, headers={}):
+def fetch_content(url, user_agent=None, headers={}):
     """Fetches HTML content from a URL, following redirects."""
+    # Harmonize user-agent with headers
+    if user_agent:
+        headers.setdefault('User-Agent', user_agent)
+
     try:
         response = requests.get(url, headers=headers)
         response.raise_for_status()
@@ -134,9 +138,8 @@ def remove_common_elements(soup, extra_remove_selectors=None):
     return soup
 
 
-def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
-               headers={}, delay=1, delay_range=0.5,
-               extra_remove_selectors=None, allowed_paths=None):
+def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True, headers={}, delay=1, delay_range=0.5,
+               extra_remove_selectors=None, allowed_paths=None, ignore_paths=[]):
     visited_links = set()
     root = PageNode(start_url)
     node_lookup = {}
@@ -146,6 +149,10 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
 
     robots_parser = load_robots_txt(base_url) if handle_robots_txt else None
 
+    # Harmonize User-agent and Headers
+    if user_agent:
+        headers.setdefault('User-Agent', user_agent)
+
     # Store page content in Markdown
     page_markdowns = {}
     url_to_anchor = {}
@@ -154,10 +161,15 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
     while queue:
         current_node = queue.pop(0)
         current_link = normalize_url(current_node.url)
+
         if current_link in visited_links:
             continue
         visited_links.add(current_link)
 
+        if any(ignore_path in current_link for ignore_path in ignore_paths):
+            logger.info(f"Skipping {current_link} matching ignore_paths")
+            continue
+
         if handle_robots_txt and robots_parser:
             if not is_allowed_by_robots(current_link, user_agent, robots_parser):
                 logger.info(f"Disallowed by robots.txt: {current_link}")
@@ -165,8 +177,8 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
 
         logger.info(f'Processing {current_link}')
         page_content, page_url = fetch_content(current_node.url, headers=headers)
-        if not page_content:
-            continue  # Skip if content couldn't be fetched
+        if not page_content or (page_url and any(ignore_path in page_url for ignore_path in ignore_paths)):
+            continue
 
         soup = BeautifulSoup(page_content, 'html.parser')
         soup = remove_common_elements(soup, extra_remove_selectors=extra_remove_selectors)
diff --git a/src/libcrawler/version.py b/src/libcrawler/version.py
index bfcc44f..fef1a0d 100644
--- a/src/libcrawler/version.py
+++ b/src/libcrawler/version.py
@@ -1,2 +1,2 @@
-__version_info__ = ('0', '3', '2')
+__version_info__ = ('0', '3', '3')
 __version__ = '.'.join(__version_info__)
diff --git a/src/tests/test_crawler.py b/src/tests/test_crawler.py
index 585cead..d361075 100644
--- a/src/tests/test_crawler.py
+++ b/src/tests/test_crawler.py
@@ -71,6 +71,64 @@ def test_fetch_content_failure(self, mock_get):
         self.assertIsNone(content)
         self.assertIsNone(url)
 
+    @patch('src.libcrawler.libcrawler.requests.get')
+    def test_user_agent_harmonization(self, mock_get):
+        # Mock response setup
+        mock_response = Mock()
+        mock_response.status_code = 200
+        mock_response.text = 'Test content with headers and user-agent'
+        mock_get.return_value = mock_response
+
+        # Headers without user-agent
+        headers = {'Accept': 'text/html'}
+        user_agent = 'test-agent'
+
+        # Call the function with user-agent and headers
+        content, url = fetch_content('http://example.com/test', user_agent=user_agent, headers=headers)
+
+        # Ensure the user-agent is added to the headers
+        expected_headers = {'Accept': 'text/html', 'User-Agent': 'test-agent'}
+        mock_get.assert_called_with('http://example.com/test', headers=expected_headers)
+
+        # Assert content is fetched correctly
+        self.assertEqual(content, 'Test content with headers and user-agent')
+
+class TestIgnorePaths(unittest.TestCase):
+    def setUp(self):
+        self.start_url = 'http://example.com/start'
+        self.base_url = 'http://example.com'
+        # The ignore_paths can now contain partial matches
+        self.ignore_paths = ['/ignore-me', '/skip-this']
+
+    @patch('src.libcrawler.libcrawler.fetch_content')
+    def test_ignore_paths_pre_and_post_fetch(self, mock_fetch_content):
+        # Mock the fetch_content to simulate redirects and actual content
+        mock_fetch_content.side_effect = [
+            ('Start Page', 'http://example.com/start'),  # First URL
+            ('Ignored Page', 'http://example.com/ignore-me/page'),  # Ignored after redirect
+            ('Another Ignored Page', 'http://example.com/skip-this/page2'),  # Ignored after redirect
+            ('Allowed Page', 'http://example.com/allowed-page')  # Not ignored
+        ]
+
+        # Run the build_tree function
+        result_tree = build_tree(
+            start_url=self.start_url,
+            base_url=self.base_url,
+            ignore_paths=self.ignore_paths
+        )
+
+        # One URL is skipped pre-fetch, so fetch_content is only called 3 times
+        self.assertEqual(mock_fetch_content.call_count, 3)
+
+        # Check that the ignored URLs are not in the result tree (post-fetch)
+        for node in result_tree.values():
+            self.assertNotIn('http://example.com/ignore-me/page', node.url)
+            self.assertNotIn('http://example.com/skip-this/page2', node.url)
+
+        # Check that non-ignored URLs are present in the result tree
+        self.assertIn('http://example.com/start', result_tree)
+        self.assertIn('http://example.com/allowed-page', [node.url for node in result_tree.values()])
+
 class TestGetLinks(unittest.TestCase):
     def test_get_links_all_paths(self):
         html = '''