From 84c8aa7f3817bb73732b023924f97913ce3b6923 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Francisco=20Calvo?= Date: Mon, 9 Sep 2024 14:48:17 +0200 Subject: [PATCH] feat: add background processing jobs (#5432) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Description This PR adds the following changes: - [x] Add `rq` to help us execute background jobs. - [x] Add a background job to update all records for a dataset when the dataset distribution strategy is updated. - [x] Change HuggingFace Dockerfile to install Redis and run `rq` workers inside honcho Procfile. - [x] Add documentation about new `ARGILLA_REDIS_URL` environment variable. - [x] Add ping to Redis so Argilla server is not started if Redis is not ready. - [x] Change Argilla docker compose file to include a container with Redis and rq workers. - [x] Update Argilla server `README.md` file adding Redis as a dependency to install. - [x] Add documentation about Redis being a new Argilla server dependency. - [x] Add `BACKGROUND_NUM_WORKERS` environment variable to specify the number of workers in the HF Space container. - [ ] ~~Modify `Dockerfile` template on HF to include the environment variable https://github.com/argilla-io/argilla/issues/5443:~~ ``` # (since: v2.2.0) Uncomment the next line to specify the number of background job workers to run (default: 2). # ENV BACKGROUND_NUM_WORKERS=2 ``` - [ ] Remove some `TODO` sections before merging. - [ ] Review K8s documentation (maybe delete it?). - [ ] If we want to persist Redis data on HF Spaces we can change our `Procfile` Redis process to the following: ``` redis: /usr/bin/redis-server --dbfilename argilla-redis.rdb --dir ${ARGILLA_HOME_PATH} ``` - [ ] Allow testing job workers synchronously (with pytest). It's not working due to asyncio limitations (running an asynchronous loop inside another one; more info here: https://github.com/rq/rq/issues/1986). 
Closes #5431 # Benchmarks The following timings were obtained updating the distribution strategy of a dataset with 100 and 10.000 records, using a basic and an upgraded CPU on HF Spaces, with and without persistent storage and measuring how much time the background job takes to complete: CPU basic: 2 vCPU, 16GB RAM CPU upgrade: 8 vCPU, 32GB RAM * CPU basic (with persistent storage): * 100 records dataset: ~8 seconds. * 10.000 records dataset: ~9 minutes. * CPU upgrade (with persistent storage): * 100 records dataset: ~5 seconds. * 10.000 records dataset: ~6 minutes. * CPU basic (no persistent storage): * 10.000 records dataset: ~101 seconds. * CPU upgrade (no persistent storage): * 10.000 records dataset: ~62 seconds. **Type of change** - New feature (non-breaking change which adds functionality) **How Has This Been Tested** - [x] Testing it on HF Spaces. **Checklist** - I added relevant documentation - I followed the style guidelines of this project - I did a self-review of my code - I made corresponding changes to the documentation - I confirm My changes generate no new warnings - I have added tests that prove my fix is effective or that my feature works - I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/) --------- Co-authored-by: Damián Pumar --- .github/workflows/argilla-server.yml | 10 +++ .../repositories/DatasetRepository.ts | 1 + argilla-server/CHANGELOG.md | 5 ++ argilla-server/README.md | 16 +++++ .../docker/argilla-hf-spaces/Dockerfile | 19 ++++-- .../docker/argilla-hf-spaces/Procfile | 2 + .../docker/argilla-hf-spaces/requirements.txt | 1 + argilla-server/docker/server/README.md | 1 - argilla-server/pdm.lock | 33 ++++++++- argilla-server/pyproject.toml | 7 +- argilla-server/src/argilla_server/_app.py | 20 ++++++ .../src/argilla_server/cli/__main__.py | 2 + .../src/argilla_server/cli/worker.py | 37 ++++++++++ .../src/argilla_server/contexts/datasets.py | 7 +- .../src/argilla_server/jobs/__init__.py | 14 ++++ 
.../src/argilla_server/jobs/dataset_jobs.py | 54 +++++++++++++++ .../src/argilla_server/jobs/queues.py | 24 +++++++ .../src/argilla_server/models/database.py | 7 -- argilla-server/src/argilla_server/settings.py | 2 + .../src/argilla_server/validators/datasets.py | 13 +--- .../v1/datasets/test_update_dataset.py | 68 ------------------- .../reference/argilla-server/configuration.md | 6 ++ docs/_source/community/developer_docs.md | 2 + .../deployments/docker/docker-compose.yaml | 64 ++++++++++++++--- 24 files changed, 308 insertions(+), 107 deletions(-) create mode 100644 argilla-server/src/argilla_server/cli/worker.py create mode 100644 argilla-server/src/argilla_server/jobs/__init__.py create mode 100644 argilla-server/src/argilla_server/jobs/dataset_jobs.py create mode 100644 argilla-server/src/argilla_server/jobs/queues.py diff --git a/.github/workflows/argilla-server.yml b/.github/workflows/argilla-server.yml index af36e1a824..6e9e7c5b04 100644 --- a/.github/workflows/argilla-server.yml +++ b/.github/workflows/argilla-server.yml @@ -51,6 +51,16 @@ jobs: ports: - 5432:5432 + redis: + image: redis + options: >- + --health-cmd "redis-cli ping" + --health-interval 10s + --health-timeout 5s + --health-retries 5 + ports: + - 6379:6379 + env: HF_HUB_DISABLE_TELEMETRY: 1 diff --git a/argilla-frontend/v1/infrastructure/repositories/DatasetRepository.ts b/argilla-frontend/v1/infrastructure/repositories/DatasetRepository.ts index 506578384c..dadc199b8c 100644 --- a/argilla-frontend/v1/infrastructure/repositories/DatasetRepository.ts +++ b/argilla-frontend/v1/infrastructure/repositories/DatasetRepository.ts @@ -100,6 +100,7 @@ export class DatasetRepository implements IDatasetRepository { ); revalidateCache(`/v1/datasets/${id}`); + revalidateCache(`/v1/datasets/${id}/progress`); return { when: data.updated_at, diff --git a/argilla-server/CHANGELOG.md b/argilla-server/CHANGELOG.md index 939937ff1b..bf82d59c68 100644 --- a/argilla-server/CHANGELOG.md +++ 
b/argilla-server/CHANGELOG.md @@ -16,6 +16,11 @@ These are the section headers that we use: ## [Unreleased]() +### Added + +- Added [`rq`](https://python-rq.org) library to process background jobs using [Redis](https://redis.io) as a dependency. ([#5432](https://github.com/argilla-io/argilla/pull/5432)) +- Added a new background job to update records status when a dataset distribution strategy is updated. ([#5432](https://github.com/argilla-io/argilla/pull/5432)) + ## [2.1.0](https://github.com/argilla-io/argilla/compare/v2.0.0...v2.1.0) ### Added diff --git a/argilla-server/README.md b/argilla-server/README.md index df92d11616..b92a177cd5 100644 --- a/argilla-server/README.md +++ b/argilla-server/README.md @@ -115,6 +115,12 @@ pdm migrate pdm server ``` +### Run RQ background workers + +```sh +pdm worker +``` + ## CLI commands This section list and describe the commands offered by the `argilla_server` Python package. If you need more information about the available @@ -271,6 +277,16 @@ The `argilla_server search-engine` group of commands offers functionality to wor - `python -m argilla_server search-engine reindex`: reindex all Argilla entities into search engine. +### Background Jobs + +Argilla uses [RQ](https://python-rq.org) as background job manager. RQ depends on [Redis](https://redis.io) to store and retrieve information about the jobs to be processed. + +Once that you have correctly installed Redis on your system, you can start the RQ worker by running the following CLI command: + +```sh +python -m argilla_server worker +``` + ## 🫱🏾‍🫲🏼 Contribute To help our community with the creation of contributions, we have created our [community](https://docs.argilla.io/latest/community/) docs. Additionally, you can always [schedule a meeting](https://calendly.com/david-berenstein-huggingface/30min) with our Developer Advocacy team so they can get you up to speed. 
diff --git a/argilla-server/docker/argilla-hf-spaces/Dockerfile b/argilla-server/docker/argilla-hf-spaces/Dockerfile index 5566231b5e..1fb2419e81 100644 --- a/argilla-server/docker/argilla-hf-spaces/Dockerfile +++ b/argilla-server/docker/argilla-hf-spaces/Dockerfile @@ -10,24 +10,30 @@ COPY scripts/start.sh /home/argilla COPY Procfile /home/argilla COPY requirements.txt /packages/requirements.txt -RUN apt-get update && apt-get install -y \ - apt-transport-https \ - gnupg \ - wget +RUN apt-get update && \ + apt-get install -y apt-transport-https gnupg wget # Install Elasticsearch signing key RUN wget -qO - https://artifacts.elastic.co/GPG-KEY-elasticsearch | gpg --dearmor -o /usr/share/keyrings/elasticsearch-keyring.gpg - # Add Elasticsearch repository RUN echo "deb [signed-by=/usr/share/keyrings/elasticsearch-keyring.gpg] https://artifacts.elastic.co/packages/8.x/apt stable main" | tee /etc/apt/sources.list.d/elastic-8.x.list +# Install Redis signing key +RUN wget -qO - https://packages.redis.io/gpg | gpg --dearmor -o /usr/share/keyrings/redis-archive-keyring.gpg +# Add Redis repository +RUN apt-get install -y lsb-release +RUN echo "deb [signed-by=/usr/share/keyrings/redis-archive-keyring.gpg] https://packages.redis.io/deb $(lsb_release -cs) main" | tee /etc/apt/sources.list.d/redis.list + RUN \ # Create a directory where Argilla will store the data mkdir /data && \ + apt-get update && \ # Install Elasticsearch and configure it - apt-get update && apt-get install -y elasticsearch=8.8.2 && \ + apt-get install -y elasticsearch=8.8.2 && \ chown -R argilla:argilla /usr/share/elasticsearch /etc/elasticsearch /var/lib/elasticsearch /var/log/elasticsearch && \ chown argilla:argilla /etc/default/elasticsearch && \ + # Install Redis + apt-get install -y redis && \ # Install image dependencies pip install -r /packages/requirements.txt && \ chmod +x /home/argilla/start.sh && \ @@ -52,6 +58,7 @@ ENV ELASTIC_CONTAINER=true ENV ES_JAVA_OPTS="-Xms1g -Xmx1g" ENV 
ARGILLA_HOME_PATH=/data/argilla +ENV BACKGROUND_NUM_WORKERS=2 ENV REINDEX_DATASETS=1 CMD ["/bin/bash", "start.sh"] diff --git a/argilla-server/docker/argilla-hf-spaces/Procfile b/argilla-server/docker/argilla-hf-spaces/Procfile index 344dfb58f0..751d36e4b4 100644 --- a/argilla-server/docker/argilla-hf-spaces/Procfile +++ b/argilla-server/docker/argilla-hf-spaces/Procfile @@ -1,2 +1,4 @@ elastic: /usr/share/elasticsearch/bin/elasticsearch +redis: /usr/bin/redis-server +worker: sleep 30; rq worker-pool --num-workers ${BACKGROUND_NUM_WORKERS} argilla: sleep 30; /bin/bash start_argilla_server.sh diff --git a/argilla-server/docker/argilla-hf-spaces/requirements.txt b/argilla-server/docker/argilla-hf-spaces/requirements.txt index a8ba17e35e..a6a81c0913 100644 --- a/argilla-server/docker/argilla-hf-spaces/requirements.txt +++ b/argilla-server/docker/argilla-hf-spaces/requirements.txt @@ -1 +1,2 @@ honcho +rq ~= 1.16.2 diff --git a/argilla-server/docker/server/README.md b/argilla-server/docker/server/README.md index 5b67a93437..66c246fd19 100644 --- a/argilla-server/docker/server/README.md +++ b/argilla-server/docker/server/README.md @@ -25,4 +25,3 @@ Besides the common environment variables defined in docs, this Docker image prov - `API_KEY`: If provided, the owner api key. When `USERNAME` and `PASSWORD` are provided and `API_KEY` is empty, a new random value will be generated (Default: `""`). - `REINDEX_DATASET`: If `true` or `1`, the datasets will be reindexed in the search engine. This is needed when some search configuration changed or data must be refreshed (Default: `0`). 
- diff --git a/argilla-server/pdm.lock b/argilla-server/pdm.lock index f52c3d30c7..6a6e00562c 100644 --- a/argilla-server/pdm.lock +++ b/argilla-server/pdm.lock @@ -5,7 +5,7 @@ groups = ["default", "postgresql", "test"] strategy = ["inherit_metadata"] lock_version = "4.5.0" -content_hash = "sha256:8a73b464d000b58444fc97b9e7ba74d8449774921f2bba250a0f155a311dead4" +content_hash = "sha256:c333424e19e30dc22ae7475a8f8cec7c965c90d6d551b7efef2a724fd7354245" [[metadata.targets]] requires_python = ">=3.8,<3.11" @@ -1964,6 +1964,22 @@ files = [ {file = "PyYAML-6.0.1.tar.gz", hash = "sha256:bfdf460b1736c775f2ba9f6a92bca30bc2095067b8a9d77876d1fad6cc3b4a43"}, ] +[[package]] +name = "redis" +version = "5.0.8" +requires_python = ">=3.7" +summary = "Python client for Redis database and key-value store" +groups = ["default"] +dependencies = [ + "async-timeout>=4.0.3; python_full_version < \"3.11.3\"", + "importlib-metadata>=1.0; python_version < \"3.8\"", + "typing-extensions; python_version < \"3.8\"", +] +files = [ + {file = "redis-5.0.8-py3-none-any.whl", hash = "sha256:56134ee08ea909106090934adc36f65c9bcbbaecea5b21ba704ba6fb561f8eb4"}, + {file = "redis-5.0.8.tar.gz", hash = "sha256:0c5b10d387568dfe0698c6fad6615750c24170e548ca2deac10c649d463e9870"}, +] + [[package]] name = "requests" version = "2.31.0" @@ -2012,6 +2028,21 @@ files = [ {file = "rich-13.7.0.tar.gz", hash = "sha256:5cb5123b5cf9ee70584244246816e9114227e0b98ad9176eede6ad54bf5403fa"}, ] +[[package]] +name = "rq" +version = "1.16.2" +requires_python = ">=3.7" +summary = "RQ is a simple, lightweight, library for creating background jobs, and processing them." 
+groups = ["default"] +dependencies = [ + "click>=5", + "redis>=3.5", +] +files = [ + {file = "rq-1.16.2-py3-none-any.whl", hash = "sha256:52e619f6cb469b00e04da74305045d244b75fecb2ecaa4f26422add57d3c5f09"}, + {file = "rq-1.16.2.tar.gz", hash = "sha256:5c5b9ad5fbaf792b8fada25cc7627f4d206a9a4455aced371d4f501cc3f13b34"}, +] + [[package]] name = "rsa" version = "4.9" diff --git a/argilla-server/pyproject.toml b/argilla-server/pyproject.toml index 10f87751ea..2df84995bd 100644 --- a/argilla-server/pyproject.toml +++ b/argilla-server/pyproject.toml @@ -47,13 +47,15 @@ dependencies = [ "httpx~=0.26.0", "oauthlib ~= 3.2.0", "social-auth-core ~= 4.5.0", + # Background processing + "rq ~= 1.16.2", # Info status "psutil >= 5.8, <5.10", # Telemetry "segment-analytics-python == 2.2.0", # For logging, tracebacks, printing, progressbars "rich != 13.1.0", - # for CLI + # For CLI "typer >= 0.6.0, < 0.10.0", # spaCy only supports typer<0.10.0 "packaging>=23.2", "psycopg2-binary>=2.9.9", @@ -169,10 +171,11 @@ _.env_file = ".env.dev" cli = { cmd = "python -m argilla_server.cli" } server = { cmd = "uvicorn argilla_server:app --port 6900 --reload" } migrate = { cmd = "alembic upgrade head" } +worker = { cmd = "python -m argilla_server worker" } server-dev.composite = [ "migrate", "cli database users create_default", - "server" + "server", ] test = { cmd = "pytest", env_file = ".env.test" } diff --git a/argilla-server/src/argilla_server/_app.py b/argilla-server/src/argilla_server/_app.py index 7833cfb995..05ad3fae04 100644 --- a/argilla-server/src/argilla_server/_app.py +++ b/argilla-server/src/argilla_server/_app.py @@ -19,6 +19,7 @@ import os import shutil import tempfile +import redis from datetime import datetime from pathlib import Path @@ -40,6 +41,7 @@ from argilla_server.search_engine import get_search_engine from argilla_server.settings import settings from argilla_server.static_rewrite import RewriteStaticFiles +from argilla_server.jobs.queues import REDIS_CONNECTION from 
argilla_server.telemetry import get_telemetry_client _LOGGER = logging.getLogger("argilla") @@ -50,7 +52,9 @@ async def app_lifespan(app: FastAPI): # See https://fastapi.tiangolo.com/advanced/events/#lifespan await configure_database() await configure_search_engine() + configure_redis() track_server_startup() + yield @@ -265,4 +269,20 @@ async def ping_search_engine(): await ping_search_engine() +def configure_redis(): + @backoff.on_exception(backoff.expo, ConnectionError, max_time=60) + def ping_redis(): + try: + REDIS_CONNECTION.ping() + except redis.exceptions.ConnectionError: + raise ConnectionError( + f"Your redis instance at {settings.redis_url} is not available or not responding.\n" + "Please make sure your redis instance is launched and correctly running and\n" + "you have the necessary access permissions. Once you have verified this, restart " + "the argilla server.\n" + ) + + ping_redis() + + app = create_server_app() diff --git a/argilla-server/src/argilla_server/cli/__main__.py b/argilla-server/src/argilla_server/cli/__main__.py index 2ac0cdb123..76eeabeb2a 100644 --- a/argilla-server/src/argilla_server/cli/__main__.py +++ b/argilla-server/src/argilla_server/cli/__main__.py @@ -17,12 +17,14 @@ from .database import app as database_app from .search_engine import app as search_engine_app from .start import start +from .worker import worker app = typer.Typer(help="Commands for Argilla server management", no_args_is_help=True) app.add_typer(database_app, name="database") app.add_typer(search_engine_app, name="search-engine") +app.command(name="worker", help="Starts rq workers")(worker) app.command(name="start", help="Starts the Argilla server")(start) diff --git a/argilla-server/src/argilla_server/cli/worker.py b/argilla-server/src/argilla_server/cli/worker.py new file mode 100644 index 0000000000..710f35422a --- /dev/null +++ b/argilla-server/src/argilla_server/cli/worker.py @@ -0,0 +1,37 @@ +# Copyright 2021-present, the Recognai S.L. team. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import typer + +from typing import List + +from argilla_server.jobs.queues import DEFAULT_QUEUE + +DEFAULT_NUM_WORKERS = 2 + + +def worker( + queues: List[str] = typer.Option([DEFAULT_QUEUE.name], help="Name of queues to listen"), + num_workers: int = typer.Option(DEFAULT_NUM_WORKERS, help="Number of workers to start"), +) -> None: + from rq.worker_pool import WorkerPool + from argilla_server.jobs.queues import REDIS_CONNECTION + + worker_pool = WorkerPool( + connection=REDIS_CONNECTION, + queues=queues, + num_workers=num_workers, + ) + + worker_pool.start() diff --git a/argilla-server/src/argilla_server/contexts/datasets.py b/argilla-server/src/argilla_server/contexts/datasets.py index e9f653251e..06668930d4 100644 --- a/argilla-server/src/argilla_server/contexts/datasets.py +++ b/argilla-server/src/argilla_server/contexts/datasets.py @@ -64,6 +64,7 @@ from argilla_server.database import get_async_db from argilla_server.enums import DatasetStatus, UserRole, RecordStatus from argilla_server.errors.future import NotUniqueError, UnprocessableEntityError +from argilla_server.jobs import dataset_jobs from argilla_server.models import ( Dataset, Field, @@ -170,7 +171,11 @@ async def publish_dataset(db: AsyncSession, search_engine: SearchEngine, dataset async def update_dataset(db: AsyncSession, dataset: Dataset, dataset_attrs: dict) -> Dataset: await DatasetUpdateValidator.validate(db, dataset, dataset_attrs) - return await 
dataset.update(db, **dataset_attrs) + dataset = await dataset.update(db, **dataset_attrs) + + dataset_jobs.update_dataset_records_status_job.delay(dataset.id) + + return dataset async def delete_dataset(db: AsyncSession, search_engine: SearchEngine, dataset: Dataset) -> Dataset: diff --git a/argilla-server/src/argilla_server/jobs/__init__.py b/argilla-server/src/argilla_server/jobs/__init__.py new file mode 100644 index 0000000000..4b6cecae7f --- /dev/null +++ b/argilla-server/src/argilla_server/jobs/__init__.py @@ -0,0 +1,14 @@ +# Copyright 2021-present, the Recognai S.L. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + diff --git a/argilla-server/src/argilla_server/jobs/dataset_jobs.py b/argilla-server/src/argilla_server/jobs/dataset_jobs.py new file mode 100644 index 0000000000..2389a315e8 --- /dev/null +++ b/argilla-server/src/argilla_server/jobs/dataset_jobs.py @@ -0,0 +1,54 @@ +# Copyright 2021-present, the Recognai S.L. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +from uuid import UUID + +from rq import Retry +from rq.decorators import job + +from sqlalchemy import func, select + +from argilla_server.models import Record, Response +from argilla_server.database import AsyncSessionLocal +from argilla_server.jobs.queues import DEFAULT_QUEUE +from argilla_server.search_engine.base import SearchEngine +from argilla_server.settings import settings +from argilla_server.contexts import distribution + +JOB_TIMEOUT_DISABLED = -1 +JOB_RECORDS_YIELD_PER = 100 + + +@job(DEFAULT_QUEUE, timeout=JOB_TIMEOUT_DISABLED, retry=Retry(max=3)) +async def update_dataset_records_status_job(dataset_id: UUID): + """This Job updates the status of all the records in the dataset when the distribution strategy changes.""" + + record_ids = [] + + async with AsyncSessionLocal() as db: + stream = await db.stream( + select(Record.id) + .join(Response) + .where(Record.dataset_id == dataset_id) + .order_by(Record.inserted_at.asc()) + .execution_options(yield_per=JOB_RECORDS_YIELD_PER) + ) + + async for record_id in stream.scalars(): + record_ids.append(record_id) + + # NOTE: We are updating the records status outside the database transaction to avoid database locks with SQLite. + async with SearchEngine.get_by_name(settings.search_engine) as search_engine: + for record_id in record_ids: + await distribution.update_record_status(search_engine, record_id) diff --git a/argilla-server/src/argilla_server/jobs/queues.py b/argilla-server/src/argilla_server/jobs/queues.py new file mode 100644 index 0000000000..0f17a63bd6 --- /dev/null +++ b/argilla-server/src/argilla_server/jobs/queues.py @@ -0,0 +1,24 @@ +# Copyright 2021-present, the Recognai S.L. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import redis + +from rq import Queue + +from argilla_server.settings import settings + + +REDIS_CONNECTION = redis.from_url(settings.redis_url) + +DEFAULT_QUEUE = Queue("default", connection=REDIS_CONNECTION) diff --git a/argilla-server/src/argilla_server/models/database.py b/argilla-server/src/argilla_server/models/database.py index 765b157983..dfd00b02f5 100644 --- a/argilla-server/src/argilla_server/models/database.py +++ b/argilla-server/src/argilla_server/models/database.py @@ -370,13 +370,6 @@ class Dataset(DatabaseModel): __table_args__ = (UniqueConstraint("name", "workspace_id", name="dataset_name_workspace_id_uq"),) - @property - async def responses_count(self) -> int: - # TODO: This should be moved to proper repository - return await async_object_session(self).scalar( - select(func.count(Response.id)).join(Record).where(Record.dataset_id == self.id) - ) - @property def is_draft(self): return self.status == DatasetStatus.draft diff --git a/argilla-server/src/argilla_server/settings.py b/argilla-server/src/argilla_server/settings.py index ac900762f6..a734ca433b 100644 --- a/argilla-server/src/argilla_server/settings.py +++ b/argilla-server/src/argilla_server/settings.py @@ -98,6 +98,8 @@ class Settings(BaseSettings): elasticsearch_ca_path: Optional[str] = None cors_origins: List[str] = ["*"] + redis_url: str = "redis://localhost:6379/0" + docs_enabled: bool = True # Analyzer configuration diff --git a/argilla-server/src/argilla_server/validators/datasets.py b/argilla-server/src/argilla_server/validators/datasets.py index c886ec4e27..1a28e1d32e 
100644 --- a/argilla-server/src/argilla_server/validators/datasets.py +++ b/argilla-server/src/argilla_server/validators/datasets.py @@ -44,15 +44,4 @@ async def _validate_name_is_not_duplicated(cls, db: AsyncSession, name: str, wor class DatasetUpdateValidator: @classmethod async def validate(cls, db: AsyncSession, dataset: Dataset, dataset_attrs: dict) -> None: - await cls._validate_distribution(dataset, dataset_attrs) - - @classmethod - async def _validate_distribution(cls, dataset: Dataset, dataset_attrs: dict) -> None: - if ( - dataset_attrs.get("distribution") is not None - and dataset.distribution != dataset_attrs.get("distribution") - and (await dataset.responses_count) > 0 - ): - raise UpdateDistributionWithExistingResponsesError( - "Distribution settings can't be modified for a dataset containing user responses" - ) + pass diff --git a/argilla-server/tests/unit/api/handlers/v1/datasets/test_update_dataset.py b/argilla-server/tests/unit/api/handlers/v1/datasets/test_update_dataset.py index 3b22878750..91c4f54f17 100644 --- a/argilla-server/tests/unit/api/handlers/v1/datasets/test_update_dataset.py +++ b/argilla-server/tests/unit/api/handlers/v1/datasets/test_update_dataset.py @@ -95,74 +95,6 @@ async def test_update_dataset_without_distribution_for_published_dataset( "min_submitted": 1, } - async def test_update_dataset_distribution_for_published_dataset_without_responses( - self, async_client: AsyncClient, owner_auth_header: dict - ): - dataset = await DatasetFactory.create(status=DatasetStatus.ready) - - response = await async_client.patch( - self.url(dataset.id), - headers=owner_auth_header, - json={ - "distribution": { - "strategy": DatasetDistributionStrategy.overlap, - "min_submitted": 4, - }, - }, - ) - - assert response.status_code == 200 - - assert dataset.distribution == { - "strategy": DatasetDistributionStrategy.overlap, - "min_submitted": 4, - } - - async def test_update_dataset_distribution_for_dataset_with_responses( - self, async_client: 
AsyncClient, owner_auth_header: dict - ): - dataset = await DatasetFactory.create(status=DatasetStatus.ready) - records = await RecordFactory.create_batch(10, dataset=dataset) - - for record in records: - await ResponseFactory.create(record=record) - - response = await async_client.patch( - self.url(dataset.id), - headers=owner_auth_header, - json={ - "distribution": { - "strategy": DatasetDistributionStrategy.overlap, - "min_submitted": 4, - }, - }, - ) - - assert response.status_code == 422 - - assert response.json() == { - "code": "update_distribution_with_existing_responses", - "message": "Distribution settings can't be modified for a dataset containing user responses", - } - - async def test_update_dataset_distribution_with_the_same_value_for_dataset_with_responses( - self, async_client: AsyncClient, owner_auth_header: dict - ): - dataset = await DatasetFactory.create(status=DatasetStatus.ready) - records = await RecordFactory.create_batch(10, dataset=dataset) - - for record in records: - await ResponseFactory.create(record=record) - - response = await async_client.patch( - self.url(dataset.id), - headers=owner_auth_header, - json={"distribution": dataset.distribution}, - ) - - assert response.status_code == 200 - assert response.json()["distribution"] == dataset.distribution - async def test_update_dataset_distribution_with_invalid_strategy( self, async_client: AsyncClient, owner_auth_header: dict ): diff --git a/argilla/docs/reference/argilla-server/configuration.md b/argilla/docs/reference/argilla-server/configuration.md index 6e1be5684f..0fbf229b1f 100644 --- a/argilla/docs/reference/argilla-server/configuration.md +++ b/argilla/docs/reference/argilla-server/configuration.md @@ -80,6 +80,12 @@ The following environment variables are useful only when PostgreSQL is used: - `ARGILLA_ELASTICSEARCH_CA_PATH`: Path to CA cert for ES host. 
For example: `/full/path/to/root-ca.pem` (Optional) +### Redis + +Redis is used by Argilla to store information about jobs to be processed on background. The following environment variables are useful to config how Argilla connects to Redis: + +- `ARGILLA_REDIS_URL`: A URL string that contains the necessary information to connect to a Redis instance (Default: `redis://localhost:6379/0`). + ### Datasets - `ARGILLA_LABEL_SELECTION_OPTIONS_MAX_ITEMS`: Set the number of maximum items to be allowed by label and multi label questions (Default: `500`). diff --git a/docs/_source/community/developer_docs.md b/docs/_source/community/developer_docs.md index 888249564e..2afff5b675 100644 --- a/docs/_source/community/developer_docs.md +++ b/docs/_source/community/developer_docs.md @@ -20,6 +20,8 @@ the data. used as the default built-in option and is deployed separately with the Argilla Server but a separate `PostgreSQL` can be used too. +- **Redis**: [Redis](https://redis.io) is used to store information about background jobs and it's a required dependency of Argilla server. + - **Vector Database**: A vector database to store the records data and perform scalable vector similarity searches and basic document searches. We currently support `ElasticSearch` and `AWS OpenSearch` and they can be deployed as separate Docker images. 
diff --git a/examples/deployments/docker/docker-compose.yaml b/examples/deployments/docker/docker-compose.yaml index fc695d74dd..12a8db3212 100644 --- a/examples/deployments/docker/docker-compose.yaml +++ b/examples/deployments/docker/docker-compose.yaml @@ -1,27 +1,65 @@ +x-common-variables: &common-variables + ARGILLA_HOME_PATH: /var/lib/argilla + ARGILLA_ELASTICSEARCH: http://elasticsearch:9200 + ARGILLA_DATABASE_URL: postgresql+asyncpg://postgres:postgres@postgres:5432/argilla + ARGILLA_REDIS_URL: redis://redis:6379/0 + services: argilla: - image: argilla/argilla-server:latest + # image: argilla/argilla-server:latest + # TODO: Change this to use latest version of Argilla Server once it's working + image: argilladev/argilla-server:develop restart: unless-stopped ports: - "6900:6900" environment: - ARGILLA_HOME_PATH: /var/lib/argilla - ARGILLA_ELASTICSEARCH: http://elasticsearch:9200 - - # HF_HUB_DISABLE_TELEMETRY: 1 # Opt-out for telemetry https://huggingface.co/docs/huggingface_hub/main/en/package_reference/utilities#huggingface_hub.utils.send_telemetry - # HF_HUB_OFFLINE: 1 # Opt-out for telemetry https://huggingface.co/docs/huggingface_hub/main/en/package_reference/utilities#huggingface_hub.utils.send_telemetry - + <<: *common-variables USERNAME: argilla PASSWORD: 12345678 API_KEY: argilla.apikey - # REINDEX_DATASETS: 1 # Uncomment this line to reindex Argilla datasets into the search engine when starting up - + # Uncomment the following line to reindex Argilla datasets into the search engine when starting up + # REINDEX_DATASETS: 1 + # Opt-out for telemetry https://huggingface.co/docs/huggingface_hub/main/en/package_reference/utilities#huggingface_hub.utils.send_telemetry + # HF_HUB_DISABLE_TELEMETRY: 1 + # Opt-out for telemetry https://huggingface.co/docs/huggingface_hub/main/en/package_reference/utilities#huggingface_hub.utils.send_telemetry + # HF_HUB_OFFLINE: 1 networks: - argilla volumes: # ARGILLA_HOME_PATH is used to define where Argilla will save 
it's application data. # If you change ARGILLA_HOME_PATH value please copy that same value to argilladata volume too. - argilladata:/var/lib/argilla + depends_on: + - postgres + - elasticsearch + - redis + + worker: + # TODO: Change this to use latest version of Argilla Server once it's working + image: argilladev/argilla-server:develop + environment: + <<: *common-variables + BACKGROUND_NUM_WORKERS: 2 + networks: + - argilla + depends_on: + - postgres + - elasticsearch + - redis + command: sh -c 'python -m argilla_server worker --num-workers $${BACKGROUND_NUM_WORKERS}' + + postgres: + image: postgres:14 + environment: + POSTGRES_USER: postgres + POSTGRES_PASSWORD: postgres + POSTGRES_DB: argilla + networks: + - argilla + ports: + - "5432:5432" + volumes: + - postgresdata:/var/lib/postgresql/data elasticsearch: image: docker.elastic.co/elasticsearch/elasticsearch:8.12.2 @@ -44,6 +82,13 @@ services: volumes: - elasticdata:/usr/share/elasticsearch/data/ + redis: + image: redis + networks: + - argilla + ports: + - "6379:6379" + networks: argilla: driver: bridge @@ -51,3 +96,4 @@ networks: volumes: argilladata: elasticdata: + postgresdata: