From c25c88c8fcdaf3d2bf0abdd20d6e793828e55c73 Mon Sep 17 00:00:00 2001 From: Paco Aranda Date: Mon, 7 Oct 2024 10:44:05 +0200 Subject: [PATCH] [Refactor] Remove settings name pattern restrictions (#5573) # Description This PR removes name pattern validation for field, question, metadata-property and vector-settings **Type of change** - Refactor (change restructuring the codebase without changing functionality) - Improvement (change adding some improvement to an existing functionality) **How Has This Been Tested** **Checklist** - I added relevant documentation - I followed the style guidelines of this project - I did a self-review of my code - I made corresponding changes to the documentation - I confirm My changes generate no new warnings - I have added tests that prove my fix is effective or that my feature works - I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/) --- .github/workflows/argilla.yml | 2 +- argilla-server/CHANGELOG.md | 7 + .../argilla_server/api/schemas/v1/fields.py | 2 - .../api/schemas/v1/metadata_properties.py | 2 - .../api/schemas/v1/questions.py | 2 - .../api/schemas/v1/vector_settings.py | 2 - .../v1/datasets/test_create_dataset_field.py | 259 +++++++++++++++ .../handlers/v1/datasets/test_questions.py | 18 - .../unit/api/handlers/v1/test_datasets.py | 314 ------------------ argilla/CHANGELOG.md | 4 + .../src/argilla/_models/_settings/_fields.py | 6 - .../argilla/_models/_settings/_metadata.py | 6 - .../_models/_settings/_questions/_base.py | 6 - .../src/argilla/_models/_settings/_vectors.py | 6 - argilla/src/argilla/datasets/_io/_hub.py | 11 - argilla/src/argilla/settings/_io/_hub.py | 6 - argilla/src/argilla/settings/_resource.py | 15 - .../tests/integration/test_import_features.py | 8 +- .../test_settings/test_settings_fields.py | 15 - .../test_settings_from_features.py | 26 -- 20 files changed, 275 insertions(+), 442 deletions(-) create mode 100644 argilla-server/tests/unit/api/handlers/v1/datasets/test_create_dataset_field.py diff --git a/.github/workflows/argilla.yml b/.github/workflows/argilla.yml index d034c88e67..685822eb4e 100644 --- a/.github/workflows/argilla.yml +++ b/.github/workflows/argilla.yml @@ -21,7 +21,7 @@ jobs: build: services: argilla-server: - image: argilladev/argilla-hf-spaces:develop + image: argilladev/argilla-hf-spaces:pr-5573 ports: - 6900:6900 env: diff --git a/argilla-server/CHANGELOG.md b/argilla-server/CHANGELOG.md index 1a9aced5db..7287d439a8 100644 --- a/argilla-server/CHANGELOG.md +++ b/argilla-server/CHANGELOG.md @@ -20,6 +20,13 @@ These are the section headers that we use: - Now it is possible to publish a dataset without required fields. Allowing being published with at least one field (required or not). ([#5569](https://github.com/argilla-io/argilla/pull/5569)) +### Removed + +- Removed name pattern restrictions for Fields. ([#5573](https://github.com/argilla-io/argilla/pull/5573)) +- Removed name pattern restrictions for Questions. ([#5573](https://github.com/argilla-io/argilla/pull/5573)) +- Removed name pattern restrictions for Metadata Properties. ([#5573](https://github.com/argilla-io/argilla/pull/5573)) +- Removed name pattern restrictions for Vector Settings. ([#5573](https://github.com/argilla-io/argilla/pull/5573)) + ## [2.3.0](https://github.com/argilla-io/argilla/compare/v2.2.0...v2.3.0) ### Added diff --git a/argilla-server/src/argilla_server/api/schemas/v1/fields.py b/argilla-server/src/argilla_server/api/schemas/v1/fields.py index 7137c93ed2..ecc7b4eb33 100644 --- a/argilla-server/src/argilla_server/api/schemas/v1/fields.py +++ b/argilla-server/src/argilla_server/api/schemas/v1/fields.py @@ -21,7 +21,6 @@ from argilla_server.pydantic_v1 import BaseModel, constr from argilla_server.pydantic_v1 import Field as PydanticField -FIELD_CREATE_NAME_REGEX = r"^(?=.*[a-z0-9])[a-z0-9_-]+$" FIELD_CREATE_NAME_MIN_LENGTH = 1 FIELD_CREATE_NAME_MAX_LENGTH = 200 @@ -30,7 +29,6 @@ FieldName = Annotated[ constr( - regex=FIELD_CREATE_NAME_REGEX, min_length=FIELD_CREATE_NAME_MIN_LENGTH, max_length=FIELD_CREATE_NAME_MAX_LENGTH, ), diff --git a/argilla-server/src/argilla_server/api/schemas/v1/metadata_properties.py b/argilla-server/src/argilla_server/api/schemas/v1/metadata_properties.py index da8265f95a..c5104305ff 100644 --- a/argilla-server/src/argilla_server/api/schemas/v1/metadata_properties.py +++ b/argilla-server/src/argilla_server/api/schemas/v1/metadata_properties.py @@ -23,7 +23,6 @@ FLOAT_METADATA_METRICS_PRECISION = 5 -METADATA_PROPERTY_CREATE_NAME_REGEX = r"^(?=.*[a-z0-9])[a-z0-9_-]+$" METADATA_PROPERTY_CREATE_NAME_MIN_LENGTH = 1 METADATA_PROPERTY_CREATE_NAME_MAX_LENGTH = 200 @@ -103,7 +102,6 @@ class FloatMetadataProperty(BaseModel): str, Field( ..., - regex=METADATA_PROPERTY_CREATE_NAME_REGEX, min_length=METADATA_PROPERTY_CREATE_NAME_MIN_LENGTH, max_length=METADATA_PROPERTY_CREATE_NAME_MAX_LENGTH, ), diff --git a/argilla-server/src/argilla_server/api/schemas/v1/questions.py b/argilla-server/src/argilla_server/api/schemas/v1/questions.py index 93106e7730..02e0ff3189 100644 --- a/argilla-server/src/argilla_server/api/schemas/v1/questions.py +++ b/argilla-server/src/argilla_server/api/schemas/v1/questions.py @@ -28,7 +28,6 @@ from typing_extensions import Annotated -QUESTION_CREATE_NAME_REGEX = r"^(?=.*[a-z0-9])[a-z0-9_-]+$" QUESTION_CREATE_NAME_MIN_LENGTH = 1 QUESTION_CREATE_NAME_MAX_LENGTH = 200 @@ -287,7 +286,6 @@ class SpanQuestionSettingsUpdate(UpdateSchema): QuestionName = Annotated[ constr( - regex=QUESTION_CREATE_NAME_REGEX, min_length=QUESTION_CREATE_NAME_MIN_LENGTH, max_length=QUESTION_CREATE_NAME_MAX_LENGTH, ), diff --git a/argilla-server/src/argilla_server/api/schemas/v1/vector_settings.py b/argilla-server/src/argilla_server/api/schemas/v1/vector_settings.py index d9d6acd506..4715f2b2ab 100644 --- a/argilla-server/src/argilla_server/api/schemas/v1/vector_settings.py +++ b/argilla-server/src/argilla_server/api/schemas/v1/vector_settings.py @@ -20,7 +20,6 @@ from argilla_server.errors.future import UnprocessableEntityError from argilla_server.pydantic_v1 import BaseModel, Field, PositiveInt, constr -VECTOR_SETTINGS_CREATE_NAME_REGEX = r"^(?=.*[a-z0-9])[a-z0-9_-]+$" VECTOR_SETTINGS_CREATE_NAME_MIN_LENGTH = 1 VECTOR_SETTINGS_CREATE_NAME_MAX_LENGTH = 200 @@ -63,7 +62,6 @@ class VectorsSettings(BaseModel): class VectorSettingsCreate(BaseModel): name: str = Field( ..., - regex=VECTOR_SETTINGS_CREATE_NAME_REGEX, min_length=VECTOR_SETTINGS_CREATE_NAME_MIN_LENGTH, max_length=VECTOR_SETTINGS_CREATE_NAME_MAX_LENGTH, description="The title of the vector settings", diff --git a/argilla-server/tests/unit/api/handlers/v1/datasets/test_create_dataset_field.py b/argilla-server/tests/unit/api/handlers/v1/datasets/test_create_dataset_field.py new file mode 100644 index 0000000000..4f6c586a75 --- /dev/null +++ b/argilla-server/tests/unit/api/handlers/v1/datasets/test_create_dataset_field.py @@ -0,0 +1,259 @@ +# Copyright 2021-present, the Recognai S.L. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from datetime import datetime +from typing import TYPE_CHECKING +from uuid import UUID, uuid4 + +import pytest +from sqlalchemy import func, select + +from argilla_server.api.schemas.v1.fields import FIELD_CREATE_NAME_MAX_LENGTH, FIELD_CREATE_TITLE_MAX_LENGTH +from argilla_server.constants import API_KEY_HEADER_NAME +from argilla_server.enums import ( + DatasetStatus, +) +from argilla_server.models import ( + Field, +) +from tests.factories import ( + AdminFactory, + AnnotatorFactory, + DatasetFactory, + FieldFactory, + WorkspaceFactory, +) + +if TYPE_CHECKING: + from httpx import AsyncClient + from sqlalchemy.ext.asyncio import AsyncSession + + +@pytest.mark.asyncio +class TestCreateDatasetField: + @pytest.mark.parametrize( + ("settings", "expected_settings"), + [ + ({"type": "text"}, {"type": "text", "use_markdown": False}), + ({"type": "text", "discarded": "value"}, {"type": "text", "use_markdown": False}), + ({"type": "text", "use_markdown": False}, {"type": "text", "use_markdown": False}), + ], + ) + async def test_create_dataset_field( + self, + async_client: "AsyncClient", + db: "AsyncSession", + owner_auth_header: dict, + settings: dict, + expected_settings: dict, + ): + dataset = await DatasetFactory.create() + field_json = {"name": "name", "title": "title", "settings": settings} + + response = await async_client.post( + f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json + ) + + assert response.status_code == 201 + assert (await db.execute(select(func.count(Field.id)))).scalar() == 1 + + response_body = response.json() + assert await db.get(Field, UUID(response_body["id"])) + assert response_body == { + "id": str(UUID(response_body["id"])), + "name": "name", + "title": "title", + "required": False, + "settings": expected_settings, + "dataset_id": str(dataset.id), + "inserted_at": datetime.fromisoformat(response_body["inserted_at"]).isoformat(), + "updated_at": datetime.fromisoformat(response_body["updated_at"]).isoformat(), + } + + async def test_create_dataset_field_without_authentication(self, async_client: "AsyncClient", db: "AsyncSession"): + dataset = await DatasetFactory.create() + field_json = { + "name": "name", + "title": "title", + "settings": {"type": "text"}, + } + + response = await async_client.post(f"/api/v1/datasets/{dataset.id}/fields", json=field_json) + + assert response.status_code == 401 + assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 + + async def test_create_dataset_field_as_admin(self, async_client: "AsyncClient", db: "AsyncSession"): + workspace = await WorkspaceFactory.create() + admin = await AdminFactory.create(workspaces=[workspace]) + dataset = await DatasetFactory.create(workspace=workspace) + field_json = { + "name": "name", + "title": "title", + "settings": {"type": "text"}, + } + + response = await async_client.post( + f"/api/v1/datasets/{dataset.id}/fields", + headers={API_KEY_HEADER_NAME: admin.api_key}, + json=field_json, + ) + + assert response.status_code == 201 + assert (await db.execute(select(func.count(Field.id)))).scalar() == 1 + + async def test_create_dataset_field_as_annotator(self, async_client: "AsyncClient", db: "AsyncSession"): + annotator = await AnnotatorFactory.create() + dataset = await DatasetFactory.create() + field_json = { + "name": "name", + "title": "title", + "settings": {"type": "text"}, + } + + response = await async_client.post( + f"/api/v1/datasets/{dataset.id}/fields", + headers={API_KEY_HEADER_NAME: annotator.api_key}, + json=field_json, + ) + + assert response.status_code == 403 + assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 + + async def test_create_dataset_field_with_invalid_max_length_name( + self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict + ): + dataset = await DatasetFactory.create() + field_json = { + "name": "a" * (FIELD_CREATE_NAME_MAX_LENGTH + 1), + "title": "title", + "settings": {"type": "text"}, + } + + response = await async_client.post( + f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json + ) + + assert response.status_code == 422 + # assert db.query(Field).count() == 0 + assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 + + async def test_create_dataset_field_with_invalid_max_length_title( + self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict + ): + dataset = await DatasetFactory.create() + field_json = { + "name": "name", + "title": "a" * (FIELD_CREATE_TITLE_MAX_LENGTH + 1), + "settings": {"type": "text"}, + } + + response = await async_client.post( + f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json + ) + + assert response.status_code == 422 + assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 + + @pytest.mark.parametrize( + "settings", + [ + {}, + None, + {"type": "wrong-type"}, + {"type": "text", "use_markdown": None}, + {"type": "rating", "options": None}, + {"type": "rating", "options": []}, + ], + ) + async def test_create_dataset_field_with_invalid_settings( + self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict, settings: dict + ): + dataset = await DatasetFactory.create() + field_json = { + "name": "name", + "title": "Title", + "settings": settings, + } + + response = await async_client.post( + f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json + ) + + assert response.status_code == 422 + assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 + + async def test_create_dataset_field_with_existent_name( + self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict + ): + field = await FieldFactory.create(name="name") + + response = await async_client.post( + f"/api/v1/datasets/{field.dataset_id}/fields", + headers=owner_auth_header, + json={ + "name": "name", + "title": "title", + "settings": {"type": "text"}, + }, + ) + + assert response.status_code == 409 + assert response.json() == { + "detail": f"Field with name `{field.name}` already exists for dataset with id `{field.dataset_id}`" + } + + assert (await db.execute(select(func.count(Field.id)))).scalar() == 1 + + async def test_create_dataset_field_with_published_dataset( + self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict + ): + dataset = await DatasetFactory.create(status=DatasetStatus.ready) + + response = await async_client.post( + f"/api/v1/datasets/{dataset.id}/fields", + headers=owner_auth_header, + json={ + "name": "name", + "title": "title", + "settings": {"type": "text"}, + }, + ) + + assert response.status_code == 422 + assert response.json() == {"detail": "Field cannot be created for a published dataset"} + + assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 + + async def test_create_dataset_field_with_nonexistent_dataset_id( + self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict + ): + dataset_id = uuid4() + + await DatasetFactory.create() + + response = await async_client.post( + f"/api/v1/datasets/{dataset_id}/fields", + headers=owner_auth_header, + json={ + "name": "text", + "title": "Text", + "settings": {"type": "text"}, + }, + ) + + assert response.status_code == 404 + assert response.json() == {"detail": f"Dataset with id `{dataset_id}` not found"} + + assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 diff --git a/argilla-server/tests/unit/api/handlers/v1/datasets/test_questions.py b/argilla-server/tests/unit/api/handlers/v1/datasets/test_questions.py index cf08a4f33f..76b5d6e457 100644 --- a/argilla-server/tests/unit/api/handlers/v1/datasets/test_questions.py +++ b/argilla-server/tests/unit/api/handlers/v1/datasets/test_questions.py @@ -329,24 +329,6 @@ async def test_create_dataset_question_as_annotator(self, async_client: "AsyncCl assert response.status_code == 403 assert (await db.execute(select(func.count(Question.id)))).scalar() == 0 - @pytest.mark.parametrize("invalid_name", ["", " ", " ", "-", "--", "_", "__", "A", "AA", "invalid_nAmE"]) - async def test_create_dataset_question_with_invalid_name( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict, invalid_name: str - ): - dataset = await DatasetFactory.create() - question_json = { - "name": invalid_name, - "title": "title", - "settings": {"type": "text"}, - } - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/questions", headers=owner_auth_header, json=question_json - ) - - assert response.status_code == 422 - assert (await db.execute(select(func.count(Question.id)))).scalar() == 0 - async def test_create_dataset_question_with_invalid_max_length_name( self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict ): diff --git a/argilla-server/tests/unit/api/handlers/v1/test_datasets.py b/argilla-server/tests/unit/api/handlers/v1/test_datasets.py index 4b57a24888..3be021701a 100644 --- a/argilla-server/tests/unit/api/handlers/v1/test_datasets.py +++ b/argilla-server/tests/unit/api/handlers/v1/test_datasets.py @@ -912,30 +912,6 @@ async def test_create_dataset_with_invalid_length_guidelines( assert response.status_code == 422 assert (await db.execute(select(func.count(Dataset.id)))).scalar() == 0 - @pytest.mark.parametrize( - "dataset_json", - [ - {"name": ""}, - {"name": "123$abc"}, - {"name": "unit@test"}, - {"name": "-test-dataset"}, - {"name": "_test-dataset"}, - {"name": "a" * (DATASET_NAME_MAX_LENGTH + 1)}, - {"name": "test-dataset", "guidelines": ""}, - {"name": "test-dataset", "guidelines": "a" * (DATASET_GUIDELINES_MAX_LENGTH + 1)}, - ], - ) - async def test_create_dataset_with_invalid_settings( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict, dataset_json: dict - ): - workspace = await WorkspaceFactory.create() - dataset_json.update({"workspace_id": str(workspace.id)}) - - response = await async_client.post("/api/v1/datasets", headers=owner_auth_header, json=dataset_json) - - assert response.status_code == 422 - assert (await db.execute(select(func.count(Dataset.id)))).scalar() == 0 - async def test_create_dataset_without_authentication(self, async_client: "AsyncClient", db: "AsyncSession"): workspace = await WorkspaceFactory.create() dataset_json = {"name": "name", "workspace_id": str(workspace.id)} @@ -1009,240 +985,6 @@ async def test_create_dataset_with_nonexistent_workspace_id( assert (await db.execute(select(func.count(Dataset.id)))).scalar() == 0 - @pytest.mark.parametrize( - ("settings", "expected_settings"), - [ - ({"type": "text"}, {"type": "text", "use_markdown": False}), - ({"type": "text", "discarded": "value"}, {"type": "text", "use_markdown": False}), - ({"type": "text", "use_markdown": False}, {"type": "text", "use_markdown": False}), - ], - ) - async def test_create_dataset_field( - self, - async_client: "AsyncClient", - db: "AsyncSession", - owner_auth_header: dict, - settings: dict, - expected_settings: dict, - ): - dataset = await DatasetFactory.create() - field_json = {"name": "name", "title": "title", "settings": settings} - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json - ) - - assert response.status_code == 201 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 1 - - response_body = response.json() - assert await db.get(Field, UUID(response_body["id"])) - assert response_body == { - "id": str(UUID(response_body["id"])), - "name": "name", - "title": "title", - "required": False, - "settings": expected_settings, - "dataset_id": str(dataset.id), - "inserted_at": datetime.fromisoformat(response_body["inserted_at"]).isoformat(), - "updated_at": datetime.fromisoformat(response_body["updated_at"]).isoformat(), - } - - async def test_create_dataset_field_without_authentication(self, async_client: "AsyncClient", db: "AsyncSession"): - dataset = await DatasetFactory.create() - field_json = { - "name": "name", - "title": "title", - "settings": {"type": "text"}, - } - - response = await async_client.post(f"/api/v1/datasets/{dataset.id}/fields", json=field_json) - - assert response.status_code == 401 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - - async def test_create_dataset_field_as_admin(self, async_client: "AsyncClient", db: "AsyncSession"): - workspace = await WorkspaceFactory.create() - admin = await AdminFactory.create(workspaces=[workspace]) - dataset = await DatasetFactory.create(workspace=workspace) - field_json = { - "name": "name", - "title": "title", - "settings": {"type": "text"}, - } - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/fields", - headers={API_KEY_HEADER_NAME: admin.api_key}, - json=field_json, - ) - - assert response.status_code == 201 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 1 - - async def test_create_dataset_field_as_annotator(self, async_client: "AsyncClient", db: "AsyncSession"): - annotator = await AnnotatorFactory.create() - dataset = await DatasetFactory.create() - field_json = { - "name": "name", - "title": "title", - "settings": {"type": "text"}, - } - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/fields", - headers={API_KEY_HEADER_NAME: annotator.api_key}, - json=field_json, - ) - - assert response.status_code == 403 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - - @pytest.mark.parametrize("invalid_name", ["", " ", " ", "-", "--", "_", "__", "A", "AA", "invalid_nAmE"]) - async def test_create_dataset_field_with_invalid_name( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict, invalid_name: str - ): - dataset = await DatasetFactory.create() - field_json = { - "name": invalid_name, - "title": "title", - "settings": {"type": "text"}, - } - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json - ) - - assert response.status_code == 422 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - - async def test_create_dataset_field_with_invalid_max_length_name( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict - ): - dataset = await DatasetFactory.create() - field_json = { - "name": "a" * (FIELD_CREATE_NAME_MAX_LENGTH + 1), - "title": "title", - "settings": {"type": "text"}, - } - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json - ) - - assert response.status_code == 422 - # assert db.query(Field).count() == 0 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - - async def test_create_dataset_field_with_invalid_max_length_title( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict - ): - dataset = await DatasetFactory.create() - field_json = { - "name": "name", - "title": "a" * (FIELD_CREATE_TITLE_MAX_LENGTH + 1), - "settings": {"type": "text"}, - } - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json - ) - - assert response.status_code == 422 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - - @pytest.mark.parametrize( - "settings", - [ - {}, - None, - {"type": "wrong-type"}, - {"type": "text", "use_markdown": None}, - {"type": "rating", "options": None}, - {"type": "rating", "options": []}, - ], - ) - async def test_create_dataset_field_with_invalid_settings( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict, settings: dict - ): - dataset = await DatasetFactory.create() - field_json = { - "name": "name", - "title": "Title", - "settings": settings, - } - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/fields", headers=owner_auth_header, json=field_json - ) - - assert response.status_code == 422 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - - async def test_create_dataset_field_with_existent_name( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict - ): - field = await FieldFactory.create(name="name") - - response = await async_client.post( - f"/api/v1/datasets/{field.dataset_id}/fields", - headers=owner_auth_header, - json={ - "name": "name", - "title": "title", - "settings": {"type": "text"}, - }, - ) - - assert response.status_code == 409 - assert response.json() == { - "detail": f"Field with name `{field.name}` already exists for dataset with id `{field.dataset_id}`" - } - - assert (await db.execute(select(func.count(Field.id)))).scalar() == 1 - - async def test_create_dataset_field_with_published_dataset( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict - ): - dataset = await DatasetFactory.create(status=DatasetStatus.ready) - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/fields", - headers=owner_auth_header, - json={ - "name": "name", - "title": "title", - "settings": {"type": "text"}, - }, - ) - - assert response.status_code == 422 - assert response.json() == {"detail": "Field cannot be created for a published dataset"} - - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - - async def test_create_dataset_field_with_nonexistent_dataset_id( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict - ): - dataset_id = uuid4() - - await DatasetFactory.create() - - response = await async_client.post( - f"/api/v1/datasets/{dataset_id}/fields", - headers=owner_auth_header, - json={ - "name": "text", - "title": "Text", - "settings": {"type": "text"}, - }, - ) - - assert response.status_code == 404 - assert response.json() == {"detail": f"Dataset with id `{dataset_id}` not found"} - - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - @pytest.mark.parametrize( ("settings", "expected_settings"), [ @@ -1410,32 +1152,6 @@ async def test_create_dataset_metadata_property_as_annotator(self, async_client: assert response.status_code == 403 assert (await db.execute(select(func.count(Question.id)))).scalar() == 0 - @pytest.mark.parametrize( - "invalid_name", - [ - None, - "", - "::", - "bad Name", - "¿pef", - "wrong:name", - "wrong.name" "**", - "a" * (METADATA_PROPERTY_CREATE_NAME_MAX_LENGTH + 1), - ], - ) - async def test_create_dataset_metadata_property_with_invalid_name( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict, invalid_name: str - ): - dataset = await DatasetFactory.create() - metadata_property_json = {"name": invalid_name, "title": "title", "settings": {"type": "terms"}} - - response = await async_client.post( - f"/api/v1/datasets/{dataset.id}/metadata-properties", headers=owner_auth_header, json=metadata_property_json - ) - - assert response.status_code == 422 - assert (await db.execute(select(func.count(Field.id)))).scalar() == 0 - async def test_create_dataset_metadata_property_with_existent_name( self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict ): @@ -1608,7 +1324,6 @@ async def test_create_dataset_vector_settings( [ {"name": "", "title": "vectors", "dimensions": 5}, {"name": "a" * (VECTOR_SETTINGS_CREATE_NAME_MAX_LENGTH + 1), "title": "vectors", "dimensions": 5}, - {"name": " invalid", "title": "vectors", "dimensions": 5}, {"name": "vectors", "title": "", "dimensions": 5}, { "name": "vectors", @@ -4771,35 +4486,6 @@ async def test_update_dataset(self, async_client: "AsyncClient", db: "AsyncSessi assert dataset.guidelines == guidelines assert dataset.allow_extra_metadata is allow_extra_metadata - @pytest.mark.parametrize( - "dataset_json", - [ - {"name": None}, - {"name": ""}, - {"name": "123$abc"}, - {"name": "unit@test"}, - {"name": "-test-dataset"}, - {"name": "_test-dataset"}, - {"name": "a" * (DATASET_NAME_MAX_LENGTH + 1)}, - {"name": "test-dataset", "guidelines": ""}, - {"name": "test-dataset", "guidelines": "a" * (DATASET_GUIDELINES_MAX_LENGTH + 1)}, - {"allow_extra_metadata": None}, - ], - ) - @pytest.mark.asyncio - async def test_update_dataset_with_invalid_settings( - self, async_client: "AsyncClient", db: "AsyncSession", owner_auth_header: dict, dataset_json: dict - ): - dataset = await DatasetFactory.create( - name="Current Name", guidelines="Current Guidelines", status=DatasetStatus.ready - ) - - response = await async_client.patch( - f"/api/v1/datasets/{dataset.id}", headers=owner_auth_header, json=dataset_json - ) - - assert response.status_code == 422 - @pytest.mark.asyncio async def test_update_dataset_with_invalid_payload(self, async_client: "AsyncClient", owner_auth_header: dict): dataset = await DatasetFactory.create() diff --git a/argilla/CHANGELOG.md b/argilla/CHANGELOG.md index 3165c12b0c..93069be9bd 100644 --- a/argilla/CHANGELOG.md +++ b/argilla/CHANGELOG.md @@ -16,6 +16,10 @@ These are the section headers that we use: ## [Unreleased]() +### Removed + +- Removed name sanitizing for dataset settings names. This may cause issues with old server versions. Especially when working with `from_hub`. ([#5574](https://github.com/argilla-io/argilla/pull/5574)) + ## [2.3.0](https://github.com/argilla-io/argilla/compare/v2.2.2...v2.3.0) ### Added diff --git a/argilla/src/argilla/_models/_settings/_fields.py b/argilla/src/argilla/_models/_settings/_fields.py index 185dbaedae..d957ca83ae 100644 --- a/argilla/src/argilla/_models/_settings/_fields.py +++ b/argilla/src/argilla/_models/_settings/_fields.py @@ -56,12 +56,6 @@ class FieldModel(ResourceModel): description: Optional[str] = None dataset_id: Optional[UUID] = None - @field_validator("name") - @classmethod - def __name_lower(cls, name): - formatted_name = name.lower().replace(" ", "_") - return formatted_name - @field_validator("title") @classmethod def __title_default(cls, title: str, info: ValidationInfo) -> str: diff --git a/argilla/src/argilla/_models/_settings/_metadata.py b/argilla/src/argilla/_models/_settings/_metadata.py index 7cc97984df..0266226791 100644 --- a/argilla/src/argilla/_models/_settings/_metadata.py +++ b/argilla/src/argilla/_models/_settings/_metadata.py @@ -105,12 +105,6 @@ class MetadataFieldModel(ResourceModel): dataset_id: Optional[UUID] = None - @field_validator("name") - @classmethod - def __name_lower(cls, name): - formatted_name = name.lower().replace(" ", "_") - return formatted_name - @field_validator("title") @classmethod def __title_default(cls, title, values): diff --git a/argilla/src/argilla/_models/_settings/_questions/_base.py b/argilla/src/argilla/_models/_settings/_questions/_base.py index e55a5557ab..e661689507 100644 --- a/argilla/src/argilla/_models/_settings/_questions/_base.py +++ b/argilla/src/argilla/_models/_settings/_questions/_base.py @@ -35,12 +35,6 @@ class QuestionBaseModel(BaseModel, validate_assignment=True): inserted_at: Optional[datetime] = None updated_at: Optional[datetime] = None - @field_validator("name") - @classmethod - def __name_lower(cls, name: str, info: ValidationInfo): - formatted_name = name.lower().replace(" ", "_") - return formatted_name - @field_validator("title", mode="before") @classmethod def __title_default(cls, title, info: ValidationInfo): diff --git a/argilla/src/argilla/_models/_settings/_vectors.py b/argilla/src/argilla/_models/_settings/_vectors.py index 80b5d01e69..6a54e064d2 100644 --- a/argilla/src/argilla/_models/_settings/_vectors.py +++ b/argilla/src/argilla/_models/_settings/_vectors.py @@ -32,12 +32,6 @@ class VectorFieldModel(ResourceModel): def serialize_id(self, value: UUID) -> str: return str(value) - @field_validator("name") - @classmethod - def _name_lower(cls, name): - formatted_name = name.lower().replace(" ", "_") - return formatted_name - @field_validator("title") @classmethod def _title_default(cls, title: str, info: ValidationInfo) -> str: diff --git a/argilla/src/argilla/datasets/_io/_hub.py b/argilla/src/argilla/datasets/_io/_hub.py index 3d58fa1638..b2496adb02 100644 --- a/argilla/src/argilla/datasets/_io/_hub.py +++ b/argilla/src/argilla/datasets/_io/_hub.py @@ -200,17 +200,6 @@ def from_hub( @staticmethod def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"): """This method extracts the responses from a Hugging Face dataset and returns a list of `Record` objects""" - # THIS IS REQUIRED SINCE THE NAME RESTRICTION IN ARGILLA. HUGGING FACE DATASET COLUMNS ARE CASE SENSITIVE - # Also, there is a logic with column names including ".responses" and ".suggestion" in the name. - columns_map = {} - for column in hf_dataset.column_names: - if ".responses" in column or ".suggestion" in column: - columns_map[column] = column.lower() - else: - columns_map[column] = dataset.settings._sanitize_settings_name(column) - - hf_dataset = hf_dataset.rename_columns(columns_map) - # Identify columns that columns that contain responses responses_columns = [col for col in hf_dataset.column_names if ".responses" in col] response_questions = defaultdict(dict) diff --git a/argilla/src/argilla/settings/_io/_hub.py b/argilla/src/argilla/settings/_io/_hub.py index c476c2acaf..ce6f8f590e 100644 --- a/argilla/src/argilla/settings/_io/_hub.py +++ b/argilla/src/argilla/settings/_io/_hub.py @@ -220,12 +220,6 @@ def _define_settings_from_features( feature_type = _map_feature_type(feature) attribute_definition = _map_attribute_type(feature_mapping.get(name)) - name = Settings._sanitize_settings_name(name) - - if not Settings._is_valid_name(name): - warnings.warn(f"Feature '{name}' has an invalid name. Skipping.") - continue - if feature_type == FeatureType.CHAT: fields.append(ChatField(name=name, required=False)) elif feature_type == FeatureType.TEXT: diff --git a/argilla/src/argilla/settings/_resource.py b/argilla/src/argilla/settings/_resource.py index 599ceb043c..727fce0dd2 100644 --- a/argilla/src/argilla/settings/_resource.py +++ b/argilla/src/argilla/settings/_resource.py @@ -14,7 +14,6 @@ import json import os -import re from functools import cached_property from pathlib import Path from typing import List, Optional, TYPE_CHECKING, Dict, Union, Iterator, Sequence, Literal @@ -417,15 +416,6 @@ def _validate_mapping(cls, mapping: Dict[str, Union[str, Sequence[str]]]) -> dic return validate_mapping - @classmethod - def _sanitize_settings_name(cls, name: str) -> str: - """Sanitize the name for the settings""" - - for char in [" ", ":", ".", "&", "?", "!"]: - name = name.replace(char, "_") - - return name.lower() - def __process_guidelines(self, guidelines): if guidelines is None: return guidelines @@ -439,11 +429,6 @@ def __process_guidelines(self, guidelines): return guidelines - @classmethod - def _is_valid_name(cls, name: str) -> bool: - """Check if the name is valid""" - return bool(re.match(r"^(?=.*[a-z0-9])[a-z0-9_-]+$", name)) - Property = Union[Field, VectorField, MetadataType, QuestionType] diff --git a/argilla/tests/integration/test_import_features.py b/argilla/tests/integration/test_import_features.py index 720652ecde..8c0aa0d21d 100644 --- a/argilla/tests/integration/test_import_features.py +++ b/argilla/tests/integration/test_import_features.py @@ -120,13 +120,13 @@ def test_import_from_hub_with_upper_case_columns(self, client: rg.Argilla, token name=dataset_name, ) - assert created_dataset.settings.fields[0].name == "text" - assert list(created_dataset.records)[0].fields["text"] == "Hello World, how are you?" + assert created_dataset.settings.fields[0].name == "Text" + assert list(created_dataset.records)[0].fields["Text"] == "Hello World, how are you?" def test_import_from_hub_with_unlabelled_classes(self, client: rg.Argilla, token: str, dataset_name: str): created_dataset = rg.Dataset.from_hub( "argilla-internal-testing/test_import_from_hub_with_unlabelled_classes", token=token, name=dataset_name ) - assert created_dataset.settings.fields[0].name == "text" - assert list(created_dataset.records)[0].fields["text"] == "Hello World, how are you?" + assert created_dataset.settings.fields[0].name == "Text" + assert list(created_dataset.records)[0].fields["Text"] == "Hello World, how are you?" diff --git a/argilla/tests/unit/test_settings/test_settings_fields.py b/argilla/tests/unit/test_settings/test_settings_fields.py index a9e3825a7d..beb55a7134 100644 --- a/argilla/tests/unit/test_settings/test_settings_fields.py +++ b/argilla/tests/unit/test_settings/test_settings_fields.py @@ -36,21 +36,6 @@ def test_init_text_field_with_title(self): assert text_field.title == mock_title assert text_field.required is True - @pytest.mark.parametrize( - "name, expected", - [ - ("prompt", "prompt"), - ("Prompt", "prompt"), - ("Prompt Name", "prompt_name"), - ("Prompt Name 2", "prompt_name_2"), - ("Prompt Name 2", "prompt_name_2"), - ], - ) - def test_name_validator(self, name, expected, mocker): - mock_use_markdown = True - text_field = rg.TextField(name=name, use_markdown=mock_use_markdown) - assert text_field.name == expected - @pytest.mark.parametrize( "title, name, expected", [ diff --git a/argilla/tests/unit/test_settings/test_settings_from_features.py b/argilla/tests/unit/test_settings/test_settings_from_features.py index fd712129a6..93b0d2ecfe 100644 --- a/argilla/tests/unit/test_settings/test_settings_from_features.py +++ b/argilla/tests/unit/test_settings/test_settings_from_features.py @@ -38,32 +38,6 @@ def test_define_settings_from_features_image(): assert settings.fields[0].name == "image_column" -@pytest.mark.parametrize( - "column,name", - [ - ("Text column", "text_column"), - ("text column", "text_column"), - ("text:column", "text_column"), - ("text.column", "text_column"), - ], -) -def test_define_settings_from_features_with_non_curated_column_name(column: str, name: str): - features = {column: {"_type": "Value", "dtype": "string"}} - - settings = _define_settings_from_features(features, feature_mapping={}) - - assert len(settings.fields) == 1 - assert settings.fields[0].name == name - - -@pytest.mark.parametrize("column", ["textcolumn", "text|column", "text]column", "text/column"]) -def test_define_settings_from_features_with_unsupported_column_name(column: str): - features = {column: {"_type": "Value", "dtype": "string"}} - - with pytest.raises(SettingsError): - _define_settings_from_features(features, feature_mapping={}) - - def test_define_settings_from_features_multiple(): features = { "text_column": {"_type": "Value", "dtype": "string"},