
Commit

Merge branch 'main' into 6-scheduling
vlerkin authored Nov 20, 2024
2 parents 36217d9 + b65050e commit 6840cb2
Showing 20 changed files with 335 additions and 225 deletions.
37 changes: 0 additions & 37 deletions .github/workflows/test-docker.yml

This file was deleted.

47 changes: 0 additions & 47 deletions .github/workflows/test-k8s.yml

This file was deleted.

61 changes: 0 additions & 61 deletions .github/workflows/test-manifest.yml

This file was deleted.

154 changes: 154 additions & 0 deletions .github/workflows/test.yml
@@ -0,0 +1,154 @@
name: Scrapyd-k8s CI
on:
  push:
    branches:
      - main
  pull_request:

jobs:
  test-unit:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install -r requirements-test.txt
      - name: Run tests
        run: pytest -vv --color=yes scrapyd_k8s/tests/unit/

  test-docker:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install -r requirements-test.txt
      - name: Pull example spider
        run: docker pull ghcr.io/q-m/scrapyd-k8s-spider-example

      - name: Run scrapyd-k8s
        run: |
          cp scrapyd_k8s.sample-docker.conf scrapyd_k8s.conf
          python -m scrapyd_k8s &
          while ! nc -q 1 localhost 6800 </dev/null; do sleep 1; done
          curl http://localhost:6800/daemonstatus.json
      - name: Run tests
        run: pytest -vv --color=yes scrapyd_k8s/tests/integration/

  test-manifest:
    container:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install -r requirements-test.txt
      - name: Set up Docker Buildx
        uses: docker/setup-buildx-action@v3

      - name: Build container
        uses: docker/build-push-action@v5
        with:
          context: .
          push: false
          load: true
          tags: test:latest
          cache-from: type=gha
          cache-to: type=gha,mode=max

      - name: Start minikube
        uses: medyagh/setup-minikube@master

      - name: Deploy to minikube
        run: |
          minikube image load test:latest
          # already pull image so we don't have to wait for it later
          minikube image pull ghcr.io/q-m/scrapyd-k8s-spider-example:latest
          # load manifest
          sed -i 's/\(imagePullPolicy:\s*\)\w\+/\1Never/' kubernetes.yaml
          sed -i 's/\(image:\s*\)ghcr\.io\/q-m\/scrapyd-k8s:/\1test:/' kubernetes.yaml
          sed -i 's/\(type:\s*\)ClusterIP/\1NodePort/' kubernetes.yaml
          kubectl create -f kubernetes.yaml
          # and wait for scrapyd-k8s to become ready
          kubectl wait --for=condition=Available deploy/scrapyd-k8s --timeout=60s
          curl --retry 10 --retry-delay 2 --retry-all-errors `minikube service scrapyd-k8s --url`/daemonstatus.json
      - name: Run tests
        run: |
          TEST_WITH_K8S=1 \
          TEST_BASE_URL=`minikube service scrapyd-k8s --url` \
          TEST_MAX_WAIT=60 \
          TEST_AVAILABLE_VERSIONS=latest,`skopeo list-tags docker://ghcr.io/q-m/scrapyd-k8s-spider-example | jq -r '.Tags | map(select(. != "latest" and (startswith("sha-") | not))) | join(",")'` \
          pytest -vv --color=yes scrapyd_k8s/tests/integration/
  test-k8s:
    container:
    runs-on: ubuntu-latest
    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Set up Python
        uses: actions/setup-python@v4
        with:
          python-version: 3.11
          cache: 'pip'

      - name: Install dependencies
        run: |
          pip install -r requirements.txt
          pip install -r requirements-test.txt
      - name: Start minikube
        uses: medyagh/setup-minikube@master

      - name: Prepare Kubernetes environment
        run: |
          kubectl create secret generic example-env-secret --from-literal=FOO_1=bar
          kubectl create configmap example-env-configmap --from-literal=FOO_2=baz
          # already pull image so we don't have to wait for it later
          minikube image pull ghcr.io/q-m/scrapyd-k8s-spider-example:latest
      - name: Run scrapyd-k8s
        run: |
          cp scrapyd_k8s.sample-k8s.conf scrapyd_k8s.conf
          python -m scrapyd_k8s &
          while ! nc -q 1 localhost 6800 </dev/null; do sleep 1; done
          curl http://localhost:6800/daemonstatus.json
      - name: Run tests
        run: |
          TEST_WITH_K8S=1 \
          TEST_MAX_WAIT=60 \
          TEST_AVAILABLE_VERSIONS=latest,`skopeo list-tags docker://ghcr.io/q-m/scrapyd-k8s-spider-example | jq -r '.Tags | map(select(. != "latest" and (startswith("sha-") | not))) | join(",")'` \
          pytest -vv --color=yes scrapyd_k8s/tests/integration/
71 changes: 71 additions & 0 deletions CONFIG.md
@@ -0,0 +1,71 @@
# scrapyd-k8s configuration

scrapyd-k8s is configured with the file `scrapyd_k8s.conf`. The file format is meant to
stick to [scrapyd's configuration](https://scrapyd.readthedocs.io/en/latest/config.html) where possible.

## `[scrapyd]` section

* `http_port` - defaults to `6800` ([scrapyd docs](https://scrapyd.readthedocs.io/en/latest/config.html#http-port))
* `bind_address` - defaults to `127.0.0.1` ([scrapyd docs](https://scrapyd.readthedocs.io/en/latest/config.html#bind-address))
* `max_proc` - _(implementation pending)_, if unset or `0` it will use the number of nodes in the cluster, defaults to `0` ([scrapyd docs](https://scrapyd.readthedocs.io/en/latest/config.html#max-proc))
* `repository` - Python class for accessing the image repository, defaults to `scrapyd_k8s.repository.Remote`
* `launcher` - Python class for managing jobs on the cluster, defaults to `scrapyd_k8s.launcher.K8s`
* `username` - Set this and `password` to enable basic authentication ([scrapyd docs](https://scrapyd.readthedocs.io/en/latest/config.html#username))
* `password` - Set this and `username` to enable basic authentication ([scrapyd docs](https://scrapyd.readthedocs.io/en/latest/config.html#password))
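
For orientation, a minimal sketch of a `[scrapyd]` section using the defaults listed above (illustrative only; see the sample configuration files for complete examples):

```ini
[scrapyd]
bind_address = 127.0.0.1
http_port    = 6800
repository   = scrapyd_k8s.repository.Remote
launcher     = scrapyd_k8s.launcher.K8s
```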

The Docker and Kubernetes launchers have their own additional options.

## project sections

Each project you want to be able to run gets its own section, prefixed with `project.`. For example,
an `example` spider would be defined in a `[project.example]` section, as in the snippet below.

* `repository` - container repository for the project, e.g. `ghcr.io/q-m/scrapyd-k8s-spider-example`
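
A minimal sketch (the image shown is the example spider repository used elsewhere in this document; substitute your own):

```ini
[project.example]
repository = ghcr.io/q-m/scrapyd-k8s-spider-example
```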

## Docker

This section describes Docker-specific options.
See [`scrapyd_k8s.sample-docker.conf`](scrapyd_k8s.sample-docker.conf) for an example.

* `[scrapyd]` `launcher` - set this to `scrapyd_k8s.launcher.Docker`
* `[scrapyd]` `repository` - choose between `scrapyd_k8s.repository.Local` and `scrapyd_k8s.repository.Remote` (see the sketch below)
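
A minimal sketch of the Docker-specific choices (here `Local` is picked purely for illustration; `Remote` works as well):

```ini
[scrapyd]
launcher   = scrapyd_k8s.launcher.Docker
repository = scrapyd_k8s.repository.Local
```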

TODO: explain `Local` and `Remote` repository, and how to use them

## Kubernetes

This section describes Kubernetes-specific options.
See [`scrapyd_k8s.sample-k8s.conf`](scrapyd_k8s.sample-k8s.conf) for an example.

* `[scrapyd]` `launcher` - set this to `scrapyd_k8s.launcher.K8s`
* `[scrapyd]` `repository` - set this to `scrapyd_k8s.repository.Remote`

For Kubernetes, it is important to set resource limits.

TODO: explain how to set limits, with default, project and spider specificity.


### Kubernetes API interaction

The Kubernetes event watcher is used as part of the joblogs feature and for limiting the number of
jobs running in parallel on the cluster. Neither feature is enabled by default; both can be activated if you
choose to use them.

The event watcher establishes a connection to the Kubernetes API and receives a stream of events from it. However, this
long-lived connection is inherently unstable: it can be interrupted by network issues, proxies configured to terminate
long-lived connections, and other factors. For this reason, a mechanism re-establishes the long-lived connection to the
Kubernetes API. It is controlled by three parameters: `reconnection_attempts`, `backoff_time` and `backoff_coefficient`.

#### What are these parameters about?

* `reconnection_attempts` - defines how many consecutive attempts will be made to reconnect if the connection fails;
* `backoff_time`, `backoff_coefficient` - used to gradually slow down each subsequent attempt to establish a
  connection with the Kubernetes API, preventing the API from becoming overloaded with requests.
  The `backoff_time` grows exponentially, being recalculated as `backoff_time *= self.backoff_coefficient` on every retry (see the example below).
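
For illustration, with hypothetical values `backoff_time = 5` (seconds) and `backoff_coefficient = 2`, successive
reconnection attempts would wait roughly 5, 10, 20, 40, ... seconds, until `reconnection_attempts` is exhausted.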

#### When do I need to change it in the config file?

Default values for these parameters are provided in the code and are tuned to an "average" cluster setting. If your network
requirements or other conditions are unusual, you may need to adjust these values to better suit your specific setup.
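
If you do need to override them, a sketch of what this could look like in `scrapyd_k8s.conf` follows. The section name
and the values are assumptions for illustration only; check the sample Kubernetes configuration and the code for where
these options actually live and for their real defaults.

```ini
[k8s]
# assumed section name and values, for illustration only
reconnection_attempts = 5
backoff_time = 5
backoff_coefficient = 2
```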
