v0.3 #2

Merged
merged 1 commit on Oct 9, 2024

40 changes: 40 additions & 0 deletions .github/workflows/pr-validation.yml
@@ -0,0 +1,40 @@
name: PR Validation

on:
  pull_request:
    branches:
      - main

jobs:
  validation:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.6'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt

      - name: Get version from version.py
        id: get_version
        run: echo "::set-output name=version::$(python -c "import importlib; print(importlib.import_module('src.libcrawler.version').__version__)")"

      - name: Check if branch exists for the current version
        run: |
          git ls-remote --heads origin refs/heads/${{ steps.get_version.outputs.version }} && exit 1 || true

      - name: Run tests
        run: python -m unittest discover -s src/tests

      - name: Install package and test entry point
        run: |
          pip install .
          crawl-docs --help
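For reference, a rough local equivalent of the version-lookup and branch-check steps above (the branch name shown is a hypothetical example, not a value from this PR):

```bash
# What the "Get version from version.py" step evaluates, run from the repository root
python -c "import importlib; print(importlib.import_module('src.libcrawler.version').__version__)"

# The branch check is meant to fail the job when a branch named after that version already exists
git ls-remote --heads origin "refs/heads/0.3.0"   # hypothetical version/branch name
```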
31 changes: 31 additions & 0 deletions .github/workflows/publish.yml
@@ -0,0 +1,31 @@
name: Release to PyPI

on:
  workflow_dispatch:

jobs:
  release:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout repository
        uses: actions/checkout@v3

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: '3.6'

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip setuptools wheel twine

      - name: Build package
        run: |
          python setup.py sdist bdist_wheel

      - name: Publish to PyPI
        uses: pypa/gh-action-pypi-publish@release/v1
        with:
          user: __token__
          password: ${{ secrets.PYPI_API_TOKEN }}
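Because the release is dispatched manually, the build can be rehearsed locally before running the workflow. A sketch; the `twine check` and GitHub CLI steps are assumptions and not part of this PR:

```bash
# Build exactly as the workflow does, then sanity-check the artifacts
python -m pip install --upgrade pip setuptools wheel twine
python setup.py sdist bdist_wheel
twine check dist/*

# Dispatch the release workflow from the command line (requires the GitHub CLI)
gh workflow run publish.yml
```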
4 changes: 4 additions & 0 deletions MANIFEST.in
@@ -0,0 +1,4 @@
include requirements.txt
include LICENSE
include README.md
include src/libcrawler/version.py
97 changes: 91 additions & 6 deletions README.md
@@ -1,4 +1,4 @@
-# Documentation Crawler and Converter v.0.2
+# Documentation Crawler and Converter v.0.3

This tool crawls a documentation website and converts the pages into a single Markdown document. It intelligently removes common sections that appear across multiple pages to avoid duplication, including them once at the end of the document.

@@ -10,10 +10,93 @@ This tool crawls a documentation website and converts the pages into a single Markdown document.
- Configurable selectors to remove specific elements from pages.
- Supports robots.txt compliance with an option to ignore it.

## Installation

### Prerequisites

- **Python 3.6 or higher** is required.
- (Optional) It is recommended to use a virtual environment to avoid dependency conflicts with other projects.

### 1. Installing the Package with `pip`

If you have already cloned the repository or downloaded the source code, you can install the package using `pip`:

```bash
pip install .
```

This will install the package in your current Python environment.

### 2. Installing in Editable Mode

If you are a developer or want to modify the source code and see your changes reflected immediately, you can install the package in **editable** mode. This allows you to edit the source files and test the changes without needing to reinstall the package:

```bash
pip install -e .
```
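With an editable install you can also run the test suite directly, the same way the PR-validation workflow does (a sketch, assuming the tests live under `src/tests` as in this repository):

```bash
python -m unittest discover -s src/tests
```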

### 3. Using a Virtual Environment (Recommended)

It is recommended to use a virtual environment to isolate the package and its dependencies. Follow these steps to set up a virtual environment and install the package:

1. **Create a virtual environment** (e.g., named `venv`):

```bash
python -m venv venv
```

2. **Activate the virtual environment**:

- On **macOS/Linux**:
```bash
source venv/bin/activate
```

- On **Windows**:
```bash
.\venv\Scripts\activate
```

3. **Install the package** inside the virtual environment:

```bash
pip install .
```

This ensures that all dependencies are installed within the virtual environment.

### 4. Installing from PyPI

Once the package is published on PyPI, you can install it directly using:

```bash
pip install libcrawler
```

### 5. Upgrading the Package

To upgrade the package to the latest version, use:

```bash
pip install --upgrade libcrawler
```

This will upgrade the package to the newest version available.

### 6. Verifying the Installation

You can verify that the package has been installed correctly by running:

```bash
pip show libcrawler
```

This will display information about the installed package, including the version, location, and dependencies.
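You can also confirm that the `crawl-docs` console script is available on your `PATH`; this is the same smoke test the PR-validation workflow runs:

```bash
crawl-docs --help
```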

## Usage

```bash
-python crawler_cli.py BASE_URL STARTING_POINT [OPTIONS]
+crawl-docs BASE_URL STARTING_POINT [OPTIONS]
```

### Arguments
@@ -30,30 +113,32 @@ python crawler_cli.py BASE_URL STARTING_POINT [OPTIONS]
- `--remove-selectors SELECTOR [SELECTOR ...]`: Additional CSS selectors to remove from pages.
- `--similarity-threshold SIMILARITY_THRESHOLD`: Similarity threshold for section comparison (default: 0.8).
- `--allowed-paths PATH [PATH ...]`: List of URL paths to include during crawling.
- `--headers-file FILE`: Path to a JSON file containing optional request headers. Only one of `--headers-file` or `--headers-json` can be used.
- `--headers-json JSON`: Optional request headers supplied inline as a JSON string (mutually exclusive with `--headers-file`). See the "Passing Custom Headers" example below.

### Examples

#### Basic Usage
```bash
-python crawler_cli.py https://example.com /docs/ -o output.md
+crawl-docs https://example.com /docs/ -o output.md
```

#### Adjusting Thresholds
```bash
-python crawler_cli.py https://example.com /docs/ -o output.md \
+crawl-docs https://example.com /docs/ -o output.md \
    --similarity-threshold 0.7 \
    --delay-range 0.3
```

#### Specifying Extra Selectors to Remove
```bash
-python crawler_cli.py https://example.com /docs/ -o output.md \
+crawl-docs https://example.com /docs/ -o output.md \
    --remove-selectors ".sidebar" ".ad-banner"
```

#### Limiting to Specific Paths
```bash
-python crawler_cli.py https://example.com / -o output.md \
+crawl-docs https://example.com / -o output.md \
    --allowed-paths "/docs/" "/api/"
```
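
#### Passing Custom Headers
A sketch of the new header options; the header names and token are placeholders, not required values:

```bash
crawl-docs https://example.com /docs/ -o output.md \
    --headers-json '{"User-Agent": "docs-crawler", "Authorization": "Bearer <token>"}'

# Or keep the same headers in a JSON file and reference it instead
crawl-docs https://example.com /docs/ -o output.md \
    --headers-file headers.json
```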

29 changes: 29 additions & 0 deletions build.py
@@ -0,0 +1,29 @@
import importlib.util
import os
from setuptools import setup
import sys


# Function to dynamically load the module and get its version
module_name = 'version'
module_path = os.path.join(os.path.dirname(__file__), 'src', 'libcrawler', 'version.py')

def get_version():
    spec = importlib.util.spec_from_file_location(module_name, module_path)
    module = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = module
    spec.loader.exec_module(module)
    return module.__version__

# Read the requirements from requirements.txt
def read_requirements():
    with open('requirements.txt') as req:
        content = req.readlines()
    # Remove comments and empty lines
    requirements = [line.strip() for line in content if line.strip() and not line.startswith('#')]
    return requirements

setup(
    version=get_version(),
    install_requires=read_requirements(),
)
25 changes: 25 additions & 0 deletions pyproject.toml
@@ -0,0 +1,25 @@
[build-system]
requires = [ "setuptools-scm[toml]>=8.0", "wheel" ]
build-backend = "setuptools.build_meta"

[project]
name = "libcrawler"
description = "A tool to crawl documentation and convert to Markdown."
authors = [
    { name="Robert Collins", email="[email protected]" }
]
requires-python = ">=3.6"
classifiers = [
    "Programming Language :: Python :: 3",
    "License :: OSI Approved :: GNU Lesser General Public License v3 (LGPLv3)",
    "Operating System :: OS Independent",
]

dynamic = ["version", "dependencies"]

[tool.setuptools.packages.find]
where = ["src"]
exclude = ["libcrawler.egg_info", "libcrawler.tests"]

[project.scripts]
crawl-docs = "libcrawler.__main__:main"
File renamed without changes.
26 changes: 24 additions & 2 deletions src/__main__.py → src/libcrawler/__main__.py
@@ -1,11 +1,13 @@
import argparse
import json
from urllib.parse import urljoin

-from crawler import crawl_and_convert
+from .libcrawler import crawl_and_convert
from libcrawler.version import __version__


def main():
-    parser = argparse.ArgumentParser(description='Crawl documentation and convert to Markdown.')
+    parser = argparse.ArgumentParser(description=f'Crawl documentation and convert to Markdown. v{__version__}')
    parser.add_argument('base_url', help='The base URL of the documentation site.')
    parser.add_argument('starting_point', help='The starting path of the documentation.')
    parser.add_argument('-o', '--output', default='documentation.md',
@@ -23,8 +25,27 @@ def main():
    parser.add_argument('--allowed-paths', nargs='*',
                        help='List of URL paths to include during crawling.')

    headers_group = parser.add_mutually_exclusive_group(required=True)
    headers_group.add_argument('--headers-file', type=str, help='Path to a JSON file containing headers. Only one of --headers-file or --headers-json can be used.')
    headers_group.add_argument('--headers-json', type=str, help='Raw JSON string representing the headers. Only one of --headers-file or --headers-json can be used.')

    args = parser.parse_args()

    headers = {}
    if args.headers_file:
        try:
            with open(args.headers_file, 'r') as file:
                headers = json.load(file)
        except (json.JSONDecodeError, FileNotFoundError) as e:
            print(f"Error loading headers from file: {e}")
            return
    elif args.headers_json:
        try:
            headers = json.loads(args.headers_json)
        except json.JSONDecodeError as e:
            print(f"Invalid JSON format for --headers-json: {e}")
            return

    start_url = urljoin(args.base_url, args.starting_point)

    crawl_and_convert(
@@ -33,6 +54,7 @@ def main():
        output_filename=args.output,
        user_agent=args.user_agent if hasattr(args, 'user_agent') else '*',
        handle_robots_txt=not args.no_robots,
        headers=headers,
        delay=args.delay,
        delay_range=args.delay_range,
        extra_remove_selectors=args.remove_selectors,
16 changes: 8 additions & 8 deletions src/crawler.py → src/libcrawler/libcrawler.py
@@ -3,9 +3,7 @@
Pages that are not part of the library documentation are excluded.
"""

-__version_info__ = ('0', '2', '0')
-__version__ = '.'.join(__version_info__)

+from .version import __version__

from bs4 import BeautifulSoup
from collections import defaultdict
@@ -55,10 +53,10 @@ def normalize_url(url):
    return normalized_url


-def fetch_content(url):
+def fetch_content(url, headers={}):
    """Fetches HTML content from a URL, following redirects."""
    try:
-        response = requests.get(url)
+        response = requests.get(url, headers=headers)
        response.raise_for_status()
        return response.text, response.url  # Return the final redirected URL
    except requests.exceptions.RequestException as e:
@@ -137,8 +135,8 @@ def remove_common_elements(soup, extra_remove_selectors=None):


def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
-               delay=1, delay_range=0.5, extra_remove_selectors=None,
-               allowed_paths=None):
+               headers={}, delay=1, delay_range=0.5,
+               extra_remove_selectors=None, allowed_paths=None):
    visited_links = set()
    root = PageNode(start_url)
    node_lookup = {}
@@ -166,7 +164,7 @@ def build_tree(start_url, base_url, user_agent='*', handle_robots_txt=True,
            continue

        logger.info(f'Processing {current_link}')
-        page_content, page_url = fetch_content(current_node.url)
+        page_content, page_url = fetch_content(current_node.url, headers=headers)
        if not page_content:
            continue  # Skip if content couldn't be fetched

@@ -354,6 +352,7 @@ def crawl_and_convert(
    output_filename,
    user_agent='*',
    handle_robots_txt=True,
    headers={},
    delay=1,
    delay_range=0.5,
    extra_remove_selectors=None,
@@ -366,6 +365,7 @@
        base_url=base_url,
        user_agent=user_agent,
        handle_robots_txt=handle_robots_txt,
        headers=headers,
        delay=delay,
        delay_range=delay_range,
        extra_remove_selectors=extra_remove_selectors,
2 changes: 2 additions & 0 deletions src/libcrawler/version.py
@@ -0,0 +1,2 @@
__version_info__ = ('0', '3', '0')
__version__ = '.'.join(__version_info__)