From 5cdd7f5067912ad9ba4b610f4d6d8d0c10bcaf17 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jos=C3=A9=20Francisco=20Calvo?= Date: Fri, 12 Jul 2024 12:23:25 +0200 Subject: [PATCH] improvement: add new `ARGILLA_DATABASE_SQLITE_TIMEOUT` environment variable setting (#5213) # Description On the distribution task testing effort meeting we found some errors raised by SQLite when the database was locked because too many writes were executed concurrently. Increasing the timeout for SQLite reduces this problem, specifically changing the connection `timeout` parameter from the default value of `5` seconds to `30` seconds make the problem totally disappear locally. In this PR I have added a new `ARGILLA_DATABASE_SQLITE_TIMEOUT` environment variable that allow us to set a value for this. With a default value of `15` seconds. The idea is to test this change on Spaces using a value of `30` seconds. Refs #5000 **Type of change** - Improvement (change adding some improvement to an existing functionality) **How Has This Been Tested** - Tested locally using SQLite and running 20 parallel response creations in bulk. **Checklist** - I added relevant documentation - I followed the style guidelines of this project - I did a self-review of my code - I made corresponding changes to the documentation - I confirm My changes generate no new warnings - I have added tests that prove my fix is effective or that my feature works - I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/) --- argilla-server/CHANGELOG.md | 4 ++++ .../src/argilla_server/constants.py | 4 ++++ argilla-server/src/argilla_server/database.py | 2 +- argilla-server/src/argilla_server/settings.py | 22 ++++++++++++++++++- .../tests/unit/commons/test_settings.py | 20 +++++++++++++---- .../configurations/server_configuration.md | 6 +++++ 6 files changed, 52 insertions(+), 6 deletions(-) diff --git a/argilla-server/CHANGELOG.md b/argilla-server/CHANGELOG.md index 7d8daf02a9..e7b2afcbfd 100644 --- a/argilla-server/CHANGELOG.md +++ b/argilla-server/CHANGELOG.md @@ -16,6 +16,10 @@ These are the section headers that we use: ## [Unreleased]() +### Added + +- Added new `ARGILLA_DATABASE_SQLITE_TIMEOUT` environment variable allowing to set transactions timeout for SQLite. ([#5213](https://github.com/argilla-io/argilla/pull/5213)) + ### Fixed - Fixed SQLite connection settings not working correctly due to a outdated conditional. ([#5149](https://github.com/argilla-io/argilla/pull/5149)) diff --git a/argilla-server/src/argilla_server/constants.py b/argilla-server/src/argilla_server/constants.py index 6bda197c27..96d8b20de8 100644 --- a/argilla-server/src/argilla_server/constants.py +++ b/argilla-server/src/argilla_server/constants.py @@ -15,6 +15,8 @@ API_KEY_HEADER_NAME = "X-Argilla-Api-Key" WORKSPACE_HEADER_NAME = "X-Argilla-Workspace" +DATABASE_SQLITE = "sqlite" + SEARCH_ENGINE_ELASTICSEARCH = "elasticsearch" SEARCH_ENGINE_OPENSEARCH = "opensearch" @@ -22,6 +24,8 @@ DEFAULT_PASSWORD = "1234" DEFAULT_API_KEY = "argilla.apikey" +DEFAULT_DATABASE_SQLITE_TIMEOUT = 15 + DEFAULT_MAX_KEYWORD_LENGTH = 128 DEFAULT_TELEMETRY_KEY = "C6FkcaoCbt78rACAgvyBxGBcMB3dM3nn" diff --git a/argilla-server/src/argilla_server/database.py b/argilla-server/src/argilla_server/database.py index e0bc4c4c95..1d52168c4c 100644 --- a/argilla-server/src/argilla_server/database.py +++ b/argilla-server/src/argilla_server/database.py @@ -51,7 +51,7 @@ def set_sqlite_pragma(dbapi_connection, connection_record): cursor.close() -async_engine = create_async_engine(settings.database_url) +async_engine = create_async_engine(settings.database_url, connect_args=settings.database_connect_args) AsyncSessionLocal = async_sessionmaker(autocommit=False, expire_on_commit=False, bind=async_engine) diff --git a/argilla-server/src/argilla_server/settings.py b/argilla-server/src/argilla_server/settings.py index d0900896fa..fc78a2cbc0 100644 --- a/argilla-server/src/argilla_server/settings.py +++ b/argilla-server/src/argilla_server/settings.py @@ -22,14 +22,16 @@ import re import warnings from pathlib import Path -from typing import List, Optional +from typing import Dict, List, Optional from urllib.parse import urlparse from argilla_server.constants import ( + DATABASE_SQLITE, DEFAULT_LABEL_SELECTION_OPTIONS_MAX_ITEMS, DEFAULT_MAX_KEYWORD_LENGTH, DEFAULT_SPAN_OPTIONS_MAX_ITEMS, DEFAULT_TELEMETRY_KEY, + DEFAULT_DATABASE_SQLITE_TIMEOUT, SEARCH_ENGINE_ELASTICSEARCH, SEARCH_ENGINE_OPENSEARCH, ) @@ -74,6 +76,10 @@ class Settings(BaseSettings): home_path: Optional[str] = Field(description="The home path where argilla related files will be stored") base_url: Optional[str] = Field(description="The default base url where server will be deployed") database_url: Optional[str] = Field(description="The database url that argilla will use as data store") + database_sqlite_timeout: Optional[int] = Field( + default=DEFAULT_DATABASE_SQLITE_TIMEOUT, + description="SQLite database connection timeout in seconds", + ) elasticsearch: str = "http://localhost:9200" elasticsearch_ssl_verify: bool = True @@ -220,6 +226,20 @@ def old_dataset_records_index_name(self) -> str: return index_name.replace("", "") return index_name.replace("", f".{ns}") + @property + def database_connect_args(self) -> Dict: + if self.database_is_sqlite: + return {"timeout": self.database_sqlite_timeout} + + return {} + + @property + def database_is_sqlite(self) -> bool: + if self.database_url is None: + return False + + return self.database_url.lower().startswith(DATABASE_SQLITE) + @property def search_engine_is_elasticsearch(self) -> bool: return self.search_engine == SEARCH_ENGINE_ELASTICSEARCH diff --git a/argilla-server/tests/unit/commons/test_settings.py b/argilla-server/tests/unit/commons/test_settings.py index f6aea4c669..3063f4c27e 100644 --- a/argilla-server/tests/unit/commons/test_settings.py +++ b/argilla-server/tests/unit/commons/test_settings.py @@ -27,6 +27,7 @@ def test_wrong_settings_namespace(monkeypatch, bad_namespace): def test_settings_namespace(monkeypatch): monkeypatch.setenv("ARGILLA_NAMESPACE", "namespace") + settings = Settings() assert settings.namespace == "namespace" @@ -38,12 +39,12 @@ def test_settings_index_replicas_with_shards_defined(monkeypatch): monkeypatch.setenv("ARGILLA_ES_RECORDS_INDEX_SHARDS", "100") monkeypatch.setenv("ARGILLA_ES_RECORDS_INDEX_REPLICAS", "2") - settings = Settings() - assert settings.es_records_index_replicas == 2 + assert Settings().es_records_index_replicas == 2 def test_settings_default_index_replicas_with_shards_defined(monkeypatch): monkeypatch.setenv("ARGILLA_ES_RECORDS_INDEX_SHARDS", "100") + settings = Settings() assert settings.es_records_index_shards == 100 @@ -54,9 +55,20 @@ def test_settings_default_database_url(monkeypatch): monkeypatch.setenv("ARGILLA_DATABASE_URL", "") settings = Settings() + assert settings.database_url == f"sqlite+aiosqlite:///{settings.home_path}/argilla.db?check_same_thread=False" +def test_settings_database_sqlite_timeout(monkeypatch): + monkeypatch.setenv("ARGILLA_DATABASE_SQLITE_TIMEOUT", "3") + + assert Settings().database_sqlite_timeout == 3 + + +def test_settings_default_database_sqlite_timeout(): + assert Settings().database_sqlite_timeout == 15 + + @pytest.mark.parametrize( "url, expected_url", [ @@ -68,5 +80,5 @@ def test_settings_default_database_url(monkeypatch): ) def test_settings_database_url(url: str, expected_url: str, monkeypatch): monkeypatch.setenv("ARGILLA_DATABASE_URL", url) - settings = Settings() - assert settings.database_url == expected_url + + assert Settings().database_url == expected_url diff --git a/docs/_source/getting_started/installation/configurations/server_configuration.md b/docs/_source/getting_started/installation/configurations/server_configuration.md index 96378dd7d0..6889aeba63 100644 --- a/docs/_source/getting_started/installation/configurations/server_configuration.md +++ b/docs/_source/getting_started/installation/configurations/server_configuration.md @@ -67,6 +67,12 @@ You can set the following environment variables to further configure your server - `ARGILLA_DATABASE_URL`: A URL string that contains the necessary information to connect to a database. Argilla uses SQLite by default, PostgreSQL is also officially supported (Default: `sqlite:///$ARGILLA_HOME_PATH/argilla.db?check_same_thread=False`). +#### SQLite + +The following environment variables are useful only when SQLite is used: + +- `ARGILLA_DATABASE_SQLITE_TIMEOUT`: How many seconds the connection should wait before raising an `OperationalError` when a table is locked. If another connection opens a transaction to modify a table, that table will be locked until the transaction is committed. (Defaut: `15` seconds). + #### Elasticsearch and Opensearch - `ARGILLA_ELASTICSEARCH`: URL of the connection endpoint of the Elasticsearch instance (Default: `http://localhost:9200`).