v0.3.3
- new flag: --ignore-paths
- new flag: --user-agent
- harmonized user-agent and header flags
- updated readme
robbiemu committed Oct 20, 2024
1 parent 25aeb20 commit 048c71e
Showing 5 changed files with 102 additions and 13 deletions.
11 changes: 11 additions & 0 deletions README.md
@@ -9,6 +9,7 @@ This tool crawls a documentation website and converts the pages into a single Ma
- Customizable threshold for similarity.
- Configurable selectors to remove specific elements from pages.
- Supports robots.txt compliance with an option to ignore it.
- **NEW in v0.3.3**: Ability to skip URLs matching `--ignore-paths`, both pre-fetch (before requesting content) and post-fetch (after following redirects).

## Installation

@@ -113,6 +114,8 @@ crawl-docs BASE_URL STARTING_POINT [OPTIONS]
- `--remove-selectors SELECTOR [SELECTOR ...]`: Additional CSS selectors to remove from pages.
- `--similarity-threshold SIMILARITY_THRESHOLD`: Similarity threshold for section comparison (default: 0.8).
- `--allowed-paths PATH [PATH ...]`: List of URL paths to include during crawling.
- `--ignore-paths PATH [PATH ...]`: List of URL paths to skip during crawling; matches are applied both before fetching a page and after any redirects.
- `--user-agent USER_AGENT`: Specify a custom User-Agent string (harmonized with any additional headers).
- `--headers-file FILE`: Path to a JSON file containing optional headers. Only one of `--headers-file` or `--headers-json` can be used.
- `--headers-json JSON`: Optional headers supplied as a raw JSON string.

@@ -142,6 +145,14 @@ crawl-docs https://example.com / -o output.md \
--allowed-paths "/docs/" "/api/"
```

#### Skipping URLs Pre-Fetch and Post-Fetch with Ignore Paths

```bash
crawl-docs https://example.com /docs/ -o output.md \
--ignore-paths "/old/" "/legacy/"
```
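
#### Combining a Custom User-Agent with Extra Headers

A hypothetical invocation that pairs the new `--user-agent` flag with `--headers-json` (the header values are only illustrative; the flags themselves are documented above):

```bash
crawl-docs https://example.com /docs/ -o output.md \
    --user-agent "my-docs-crawler/1.0" \
    --headers-json '{"Accept": "text/html"}'
```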

### Dependencies

- Python 3.6 or higher
20 changes: 14 additions & 6 deletions src/libcrawler/__main__.py
@@ -22,16 +22,22 @@ def main():
help='Additional CSS selectors to remove from pages.')
parser.add_argument('--similarity-threshold', type=float, default=0.6,
help='Similarity threshold for section comparison (default: 0.6).')
parser.add_argument('--allowed-paths', nargs='*',
help='List of URL paths to include during crawling.')
parser.add_argument('--ignore-paths', nargs='*',
help='List of URL paths to exclude from crawling.')

headers_group = parser.add_mutually_exclusive_group(required=True)
headers_group.add_argument('--headers-file', type=str, help='Path to a JSON file containing headers. Only one of --headers-file or --headers-json can be used.')
headers_group.add_argument('--headers-json', type=json.loads, help='Raw JSON string representing the headers. Only one of --headers-file or --headers-json can be used.')
parser.add_argument('--user-agent', type=str, help='Custom User-Agent string.')
headers_group = parser.add_mutually_exclusive_group()
headers_group.add_argument('--headers-file', type=str,
help='Path to a JSON file containing headers.')
headers_group.add_argument('--headers-json', type=json.loads,
help='Raw JSON string representing the headers.')

args = parser.parse_args()

headers = {}
# Adjust logic for handling headers
headers = None
if args.headers_file:
try:
with open(args.headers_file, 'r') as file:
@@ -48,6 +54,7 @@ def main():

start_url = urljoin(args.base_url, args.starting_point)

# Adjust crawl_and_convert call to handle ignore-paths and optional headers
crawl_and_convert(
start_url=start_url,
base_url=args.base_url,
@@ -59,7 +66,8 @@ def main():
delay_range=args.delay_range,
extra_remove_selectors=args.remove_selectors,
similarity_threshold=args.similarity_threshold,
allowed_paths=args.allowed_paths
allowed_paths=args.allowed_paths,
ignore_paths=args.ignore_paths # Pass the ignore-paths argument
)
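
The hunks above omit the body that turns `--headers-file` / `--headers-json` into the `headers` value handed to `crawl_and_convert`. A minimal sketch of that logic, assuming the argument names shown in this diff (the real module may handle errors differently):

```python
import json


def resolve_headers(args):
    """Return an optional headers dict from --headers-file or --headers-json.

    Hypothetical helper reconstructing the logic implied by the hunk above;
    argparse already guarantees at most one of the two flags is present.
    """
    headers = None
    if args.headers_file:
        # --headers-file points at a JSON file on disk
        with open(args.headers_file, 'r') as file:
            headers = json.load(file)
    elif args.headers_json:
        # --headers-json is parsed by argparse with type=json.loads,
        # so it already arrives as a dict
        headers = args.headers_json
    return headers
```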


24 changes: 18 additions & 6 deletions src/libcrawler/libcrawler.py
@@ -53,8 +53,12 @@ def normalize_url(url):
return normalized_url


def fetch_content(url, headers={}):
def fetch_content(url, user_agent=None, headers={}):
"""Fetches HTML content from a URL, following redirects."""
# Harmonize user-agent with headers
if user_agent:
headers.setdefault('User-Agent', user_agent)

try:
response = requests.get(url, headers=headers)
response.raise_for_status()
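
The hunk is cut off before the function's error handling and return value. A sketch of the complete, harmonized `fetch_content`, inferred from the visible lines and from the tests added below (the `except` branch is an assumption, and the sketch swaps the mutable `headers={}` default for `None`):

```python
import logging

import requests

logger = logging.getLogger(__name__)


def fetch_content(url, user_agent=None, headers=None):
    """Fetch HTML content from a URL, following redirects.

    Returns (html_text, final_url), or (None, None) when the request fails,
    which is what the tests expect.
    """
    headers = dict(headers or {})  # copy so the caller's dict is never mutated
    if user_agent:
        # Harmonize: an explicitly supplied User-Agent header wins over --user-agent
        headers.setdefault('User-Agent', user_agent)
    try:
        response = requests.get(url, headers=headers)
        response.raise_for_status()
    except requests.RequestException as e:
        logger.error(f"Failed to fetch {url}: {e}")
        return None, None
    # response.url reflects any redirects that were followed
    return response.text, response.url
```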
@@ -134,9 +138,8 @@ def remove_common_elements(soup, extra_remove_selectors=None):
return soup


def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
headers={}, delay=1, delay_range=0.5,
extra_remove_selectors=None, allowed_paths=None):
def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True, headers={}, delay=1, delay_range=0.5,
extra_remove_selectors=None, allowed_paths=None, ignore_paths=[]):
visited_links = set()
root = PageNode(start_url)
node_lookup = {}
@@ -146,6 +149,10 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,

robots_parser = load_robots_txt(base_url) if handle_robots_txt else None

# Harmonize User-agent and Headers
if user_agent:
headers.setdefault('User-Agent', user_agent)

# Store page content in Markdown
page_markdowns = {}
url_to_anchor = {}
@@ -154,19 +161,24 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
while queue:
current_node = queue.pop(0)
current_link = normalize_url(current_node.url)

if current_link in visited_links:
continue
visited_links.add(current_link)

if any(ignore_path in current_link for ignore_path in ignore_paths):
logger.info(f"Skipping {current_link} matching ignore_paths")
continue

if handle_robots_txt and robots_parser:
if not is_allowed_by_robots(current_link, user_agent, robots_parser):
logger.info(f"Disallowed by robots.txt: {current_link}")
continue

logger.info(f'Processing {current_link}')
page_content, page_url = fetch_content(current_node.url, headers=headers)
if not page_content:
continue # Skip if content couldn't be fetched
if not page_content or (page_url and any(ignore_path in page_url for ignore_path in ignore_paths)):
continue

soup = BeautifulSoup(page_content, 'html.parser')
soup = remove_common_elements(soup, extra_remove_selectors=extra_remove_selectors)
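
Together, the check on the queued link (pre-fetch) and the check on `page_url` (post-fetch, after redirects) are what the new `ignore_paths` parameter controls. A minimal call against the signature shown above, using the import path the tests rely on (URLs and paths are illustrative):

```python
from src.libcrawler.libcrawler import build_tree

# Crawl the docs while skipping anything under /old/ or /legacy/, whether the
# match happens on the queued link (pre-fetch) or on the URL a page redirects
# to (post-fetch).
tree = build_tree(
    start_url='https://example.com/docs/',
    base_url='https://example.com',
    user_agent='my-docs-crawler/1.0',
    handle_robots_txt=True,
    ignore_paths=['/old/', '/legacy/'],
)
```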
2 changes: 1 addition & 1 deletion src/libcrawler/version.py
@@ -1,2 +1,2 @@
__version_info__ = ('0', '3', '2')
__version_info__ = ('0', '3', '3')
__version__ = '.'.join(__version_info__)
58 changes: 58 additions & 0 deletions src/tests/test_crawler.py
@@ -71,6 +71,64 @@ def test_fetch_content_failure(self, mock_get):
self.assertIsNone(content)
self.assertIsNone(url)

@patch('src.libcrawler.libcrawler.requests.get')
def test_user_agent_harmonization(self, mock_get):
# Mock response setup
mock_response = Mock()
mock_response.status_code = 200
mock_response.text = '<html><body>Test content with headers and user-agent</body></html>'
mock_get.return_value = mock_response

# Headers without user-agent
headers = {'Accept': 'text/html'}
user_agent = 'test-agent'

# Call the function with user-agent and headers
content, url = fetch_content('http://example.com/test', user_agent=user_agent, headers=headers)

# Ensure the user-agent is added to the headers
expected_headers = {'Accept': 'text/html', 'User-Agent': 'test-agent'}
mock_get.assert_called_with('http://example.com/test', headers=expected_headers)

# Assert content is fetched correctly
self.assertEqual(content, '<html><body>Test content with headers and user-agent</body></html>')

class TestIgnorePaths(unittest.TestCase):
def setUp(self):
self.start_url = 'http://example.com/start'
self.base_url = 'http://example.com'
# The ignore_paths can now contain partial matches
self.ignore_paths = ['/ignore-me', '/skip-this']

@patch('src.libcrawler.libcrawler.fetch_content')
def test_ignore_paths_pre_and_post_fetch(self, mock_fetch_content):
# Mock the fetch_content to simulate redirects and actual content
mock_fetch_content.side_effect = [
('<html><body>Start Page</body></html>', 'http://example.com/start'), # First URL
('<html><body>Ignored Page</body></html>', 'http://example.com/ignore-me/page'), # Ignored after redirect
('<html><body>Another Ignored Page</body></html>', 'http://example.com/skip-this/page2'), # Ignored after redirect
('<html><body>Allowed Page</body></html>', 'http://example.com/allowed-page') # Not ignored
]

# Run the build_tree function
result_tree = build_tree(
start_url=self.start_url,
base_url=self.base_url,
ignore_paths=self.ignore_paths
)

# Check that the first URL (pre-fetch) was skipped entirely
self.assertEqual(mock_fetch_content.call_count, 3)

# Check that the ignored URLs are not in the result tree (post-fetch)
for node in result_tree.values():
self.assertNotIn('http://example.com/ignore-me/page', node.url)
self.assertNotIn('http://example.com/skip-this/page2', node.url)

# Check that non-ignored URLs are present in the result tree
self.assertIn('http://example.com/start', result_tree)
self.assertIn('http://example.com/allowed-page', [node.url for node in result_tree.values()])

class TestGetLinks(unittest.TestCase):
def test_get_links_all_paths(self):
html = '''
