v0.3.3
- new flag: --ignore-paths
- new flag: --user-agent
- harmonized user-agent and header flags
- updated readme
robbiemu committed Oct 20, 2024
1 parent 25aeb20 commit 048c71e
Showing 5 changed files with 102 additions and 13 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -9,6 +9,7 @@ This tool crawls a documentation website and converts the pages into a single Ma
- Customizable threshold for similarity.
- Configurable selectors to remove specific elements from pages.
- Supports robots.txt compliance with an option to ignore it.
- **NEW in v0.3.3**: Ability to skip URLs matching `--ignore-paths`, both pre-fetch (before requesting content) and post-fetch (after following redirects).

## Installation

@@ -113,6 +114,8 @@ crawl-docs BASE_URL STARTING_POINT [OPTIONS]
- `--remove-selectors SELECTOR [SELECTOR ...]`: Additional CSS selectors to remove from pages.
- `--similarity-threshold SIMILARITY_THRESHOLD`: Similarity threshold for section comparison (default: 0.8).
- `--allowed-paths PATH [PATH ...]`: List of URL paths to include during crawling.
- `--ignore-paths PATH [PATH ...]`: List of URL paths to skip during crawling; matches are applied both before fetching a page and after any redirects.
- `--user-agent USER_AGENT`: Specify a custom User-Agent string (harmonized with any additional headers).
- `--headers-file FILE`: Path to a JSON file containing optional headers. Only one of `--headers-file` or `--headers-json` can be used.
- `--headers-json JSON`: Optional headers supplied as a raw JSON string.

@@ -142,6 +145,14 @@ crawl-docs https://example.com / -o output.md \
--allowed-paths "/docs/" "/api/"
```

#### Skipping URLs Pre-Fetch and Post-Fetch with Ignore Paths

```bash
crawl-docs https://example.com /docs/ -o output.md \
--ignore-paths "/old/" "/legacy/"
```
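
#### Combining a Custom User-Agent with Extra Headers

A hypothetical invocation that pairs the new `--user-agent` flag with `--headers-json` (the header values are only illustrative; the flags themselves are documented above):

```bash
crawl-docs https://example.com /docs/ -o output.md \
    --user-agent "my-docs-crawler/1.0" \
    --headers-json '{"Accept": "text/html"}'
```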

### Dependencies

- Python 3.6 or higher
20 changes: 14 additions & 6 deletions src/libcrawler/__main__.py
@@ -22,16 +22,22 @@ def main():
help='Additional CSS selectors to remove from pages.')
parser.add_argument('--similarity-threshold', type=float, default=0.6,
help='Similarity threshold for section comparison (default: 0.6).')
parser.add_argument('--allowed-paths', nargs='*',
help='List of URL paths to include during crawling.')
parser.add_argument('--ignore-paths', nargs='*',
help='List of URL paths to exclude from crawling.')

headers_group = parser.add_mutually_exclusive_group(required=True)
headers_group.add_argument('--headers-file', type=str, help='Path to a JSON file containing headers. Only one of --headers-file or --headers-json can be used.')
headers_group.add_argument('--headers-json', type=json.loads, help='Raw JSON string representing the headers. Only one of --headers-file or --headers-json can be used.')
parser.add_argument('--user-agent', type=str, help='Custom User-Agent string.')
headers_group = parser.add_mutually_exclusive_group()
headers_group.add_argument('--headers-file', type=str,
help='Path to a JSON file containing headers.')
headers_group.add_argument('--headers-json', type=json.loads,
help='Raw JSON string representing the headers.')

args = parser.parse_args()

headers = {}
# Adjust logic for handling headers
headers = None
if args.headers_file:
try:
with open(args.headers_file, 'r') as file:
@@ -48,6 +54,7 @@ def main():

start_url = urljoin(args.base_url, args.starting_point)

# Adjust crawl_and_convert call to handle ignore-paths and optional headers
crawl_and_convert(
start_url=start_url,
base_url=args.base_url,
@@ -59,7 +66,8 @@ def main():
delay_range=args.delay_range,
extra_remove_selectors=args.remove_selectors,
similarity_threshold=args.similarity_threshold,
allowed_paths=args.allowed_paths
allowed_paths=args.allowed_paths,
ignore_paths=args.ignore_paths # Pass the ignore-paths argument
)
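
The hunks above omit the body that turns `--headers-file` / `--headers-json` into the `headers` value handed to `crawl_and_convert`. A minimal sketch of that logic, assuming the argument names shown in this diff (the real module may handle errors differently):

```python
import json


def resolve_headers(args):
    """Return an optional headers dict from --headers-file or --headers-json.

    Hypothetical helper reconstructing the logic implied by the hunk above;
    argparse already guarantees at most one of the two flags is present.
    """
    headers = None
    if args.headers_file:
        # --headers-file points at a JSON file on disk
        with open(args.headers_file, 'r') as file:
            headers = json.load(file)
    elif args.headers_json:
        # --headers-json is parsed by argparse with type=json.loads,
        # so it already arrives as a dict
        headers = args.headers_json
    return headers
```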


24 changes: 18 additions & 6 deletions src/libcrawler/libcrawler.py
@@ -53,8 +53,12 @@ def normalize_url(url):
return normalized_url


def fetch_content(url, headers={}):
def fetch_content(url, user_agent=None, headers={}):
"""Fetches HTML content from a URL, following redirects."""
# Harmonize user-agent with headers
if user_agent:
headers.setdefault('User-Agent', user_agent)

try:
response = requests.get(url, headers=headers)
response.raise_for_status()
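
The hunk is cut off before the function's error handling and return value. A sketch of the complete, harmonized `fetch_content`, inferred from the visible lines and from the tests added below (the `except` branch is an assumption, and the sketch swaps the mutable `headers={}` default for `None`):

```python
import logging

import requests

logger = logging.getLogger(__name__)


def fetch_content(url, user_agent=None, headers=None):
    """Fetch HTML content from a URL, following redirects.

    Returns (html_text, final_url), or (None, None) when the request fails,
    which is what the tests expect.
    """
    headers = dict(headers or {})  # copy so the caller's dict is never mutated
    if user_agent:
        # Harmonize: an explicitly supplied User-Agent header wins over --user-agent
        headers.setdefault('User-Agent', user_agent)
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch {url}: {e}")
        return None, None
    # response.url reflects any redirects that were followed
    return response.text, response.url
```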
@@ -134,9 +138,8 @@ def remove_common_elements(soup, extra_remove_selectors=None):
return soup


def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
headers={}, delay=1, delay_range=0.5,
extra_remove_selectors=None, allowed_paths=None):
def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True, headers={}, delay=1, delay_range=0.5,
extra_remove_selectors=None, allowed_paths=None, ignore_paths=[]):
visited_links = set()
root = PageNode(start_url)
node_lookup = {}
@@ -146,6 +149,10 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,

robots_parser = load_robots_txt(base_url) if handle_robots_txt else None

# Harmonize User-agent and Headers
if user_agent:
headers.setdefault('User-Agent', user_agent)

# Store page content in Markdown
page_markdowns = {}
url_to_anchor = {}
@@ -154,19 +161,24 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
while queue:
current_node = queue.pop(0)
current_link = normalize_url(current_node.url)

if current_link in visited_links:
continue
visited_links.add(current_link)

if any(ignore_path in current_link for ignore_path in ignore_paths):
logger.info(f"Skipping {current_link} matching ignore_paths")
continue

if handle_robots_txt and robots_parser:
if not is_allowed_by_robots(current_link, user_agent, robots_parser):
logger.info(f"Disallowed by robots.txt: {current_link}")
continue

logger.info(f'Processing {current_link}')
page_content, page_url = fetch_content(current_node.url, headers=headers)
if not page_content:
continue # Skip if content couldn't be fetched
if not page_content or (page_url and any(ignore_path in page_url for ignore_path in ignore_paths)):
continue

soup = BeautifulSoup(page_content, 'html.parser')
soup = remove_common_elements(soup, extra_remove_selectors=extra_remove_selectors)
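
Together, the check on the queued link (pre-fetch) and the check on `page_url` (post-fetch, after redirects) are what the new `ignore_paths` parameter controls. A minimal call against the signature shown above, using the import path the tests rely on (URLs and paths are illustrative):

```python
from src.libcrawler.libcrawler import build_tree

# Crawl the docs while skipping anything under /old/ or /legacy/, whether the
# match happens on the queued link (pre-fetch) or on the URL a page redirects
# to (post-fetch).
tree = build_tree(
    start_url='https://example.com/docs/',
    base_url='https://example.com',
    user_agent='my-docs-crawler/1.0',
    handle_robots_txt=True,
    ignore_paths=['/old/', '/legacy/'],
)
```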
2 changes: 1 addition & 1 deletion src/libcrawler/version.py
@@ -1,2 +1,2 @@
__version_info__ = ('0', '3', '2')
__version_info__ = ('0', '3', '3')
__version__ = '.'.join(__version_info__)
58 changes: 58 additions & 0 deletions src/tests/test_crawler.py
@@ -71,6 +71,64 @@ def test_fetch_content_failure(self, mock_get):
self.assertIsNone(content)
self.assertIsNone(url)

@patch('src.libcrawler.libcrawler.requests.get')
def test_user_agent_harmonization(self, mock_get):
# Mock response setup
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = '<html><body>Test content with headers and user-agent</body></html>'
mock_get.return_value = mock_response

# Headers without user-agent
headers = {'Accept': 'text/html'}
user_agent = 'test-agent'

# Call the function with user-agent and headers
content, url = fetch_content('http://example.com/test', user_agent=user_agent, headers=headers)

# Ensure the user-agent is added to the headers
expected_headers = {'Accept': 'text/html', 'User-Agent': 'test-agent'}
mock_get.assert_called_with('http://example.com/test', headers=expected_headers)

# Assert content is fetched correctly
self.assertEqual(content, '<html><body>Test content with headers and user-agent</body></html>')

class TestIgnorePaths(unittest.TestCase):
def setUp(self):
self.start_url = 'http://example.com/start'
self.base_url = 'http://example.com'
# The ignore_paths can now contain partial matches
self.ignore_paths = ['/ignore-me', '/skip-this']

@patch('src.libcrawler.libcrawler.fetch_content')
def test_ignore_paths_pre_and_post_fetch(self, mock_fetch_content):
# Mock the fetch_content to simulate redirects and actual content
mock_fetch_content.side_effect = [
('<html><body>Start Page</body></html>', 'http://example.com/start'), # First URL
('<html><body>Ignored Page</body></html>', 'http://example.com/ignore-me/page'), # Ignored after redirect
('<html><body>Another Ignored Page</body></html>', 'http://example.com/skip-this/page2'), # Ignored after redirect
('<html><body>Allowed Page</body></html>', 'http://example.com/allowed-page') # Not ignored
]

# Run the build_tree function
result_tree = build_tree(
start_url=self.start_url,
base_url=self.base_url,
ignore_paths=self.ignore_paths
)

# Check that the first URL (pre-fetch) was skipped entirely
self.assertEqual(mock_fetch_content.call_count, 3)

# Check that the ignored URLs are not in the result tree (post-fetch)
for node in result_tree.values():
self.assertNotIn('http://example.com/ignore-me/page', node.url)
self.assertNotIn('http://example.com/skip-this/page2', node.url)

# Check that non-ignored URLs are present in the result tree
self.assertIn('http://example.com/start', result_tree)
self.assertIn('http://example.com/allowed-page', [node.url for node in result_tree.values()])

class TestGetLinks(unittest.TestCase):
def test_get_links_all_paths(self):
html = '''
