From e28ef13caf11aa94afa7ef89957334469564259b Mon Sep 17 00:00:00 2001
From: burtenshaw
Date: Thu, 29 Aug 2024 11:43:32 +0200
Subject: [PATCH 1/2] [REFACTOR] refactor from hub method to simplify method
 (#5420)

# Description

Closes #

**Type of change**

- Bug fix (non-breaking change which fixes an issue)
- New feature (non-breaking change which adds functionality)
- Breaking change (fix or feature that would cause existing functionality to not work as expected)
- Refactor (change restructuring the codebase without changing functionality)
- Improvement (change adding some improvement to an existing functionality)
- Documentation update

**How Has This Been Tested**

**Checklist**

- I added relevant documentation
- I followed the style guidelines of this project
- I did a self-review of my code
- I made corresponding changes to the documentation
- I confirm my changes generate no new warnings
- I have added tests that prove my fix is effective or that my feature works
- I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/)

---------

Co-authored-by: Paco Aranda
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 argilla/src/argilla/datasets/_export/_hub.py  | 72 ++++++++++---------
 .../tests/integration/test_export_dataset.py  | 60 ++++++++++++++++
 2 files changed, 100 insertions(+), 32 deletions(-)

diff --git a/argilla/src/argilla/datasets/_export/_hub.py b/argilla/src/argilla/datasets/_export/_hub.py
index 0d33f428ce..e6ec2476c6 100644
--- a/argilla/src/argilla/datasets/_export/_hub.py
+++ b/argilla/src/argilla/datasets/_export/_hub.py
@@ -16,14 +16,15 @@
 import warnings
 from collections import defaultdict
 from tempfile import TemporaryDirectory
-from typing import TYPE_CHECKING, Any, Optional, Type, Union
+from typing import TYPE_CHECKING, Any, Optional, Type, Union, Dict
 from uuid import UUID

+from datasets import DatasetDict
+from datasets.data_files import EmptyDatasetError
+
 from argilla._exceptions._api import UnprocessableEntityError
 from argilla._exceptions._records import RecordsIngestionError
 from argilla._exceptions._settings import SettingsError
-from datasets.data_files import EmptyDatasetError
-
 from argilla.datasets._export._disk import DiskImportExportMixin
 from argilla.records._mapping import IngestedRecordMapper
 from argilla.responses import Response
@@ -72,6 +73,7 @@ def to_hub(

         with TemporaryDirectory() as tmpdirname:
             config_dir = os.path.join(tmpdirname)
+
             self.to_disk(path=config_dir, with_records=False)

             if generate_card:
@@ -129,9 +131,12 @@ def from_hub(
         Returns:
             A `Dataset` loaded from the Hugging Face Hub.
         """
-        from datasets import Dataset, DatasetDict, load_dataset
+        from datasets import load_dataset
         from huggingface_hub import snapshot_download

+        if name is None:
+            name = repo_id.replace("/", "_")
+
         if settings is not None:
             dataset = cls(name=name, settings=settings)
             dataset.create()
@@ -150,31 +155,9 @@ def from_hub(

         if with_records:
             try:
-                hf_dataset: Dataset = load_dataset(path=repo_id, **kwargs)  # type: ignore
-                if isinstance(hf_dataset, DatasetDict) and "split" not in kwargs:
-                    if len(hf_dataset.keys()) > 1:
-                        raise ValueError(
-                            "Only one dataset can be loaded at a time, use `split` to select a split, available splits"
-                            f" are: {', '.join(hf_dataset.keys())}."
-                        )
-                    hf_dataset: Dataset = hf_dataset[list(hf_dataset.keys())[0]]
-                for feature in hf_dataset.features:
-                    if feature not in dataset.settings.fields or feature not in dataset.settings.questions:
-                        warnings.warn(
-                            message=f"Feature {feature} in Hugging Face dataset is not defined in dataset settings."
-                        )
-                        warnings.warn(
-                            message=f"Available fields: {dataset.settings.fields}. Available questions: {dataset.settings.questions}."
-                        )
-                try:
-                    cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
-                except (RecordsIngestionError, UnprocessableEntityError) as e:
-                    if settings is not None:
-                        raise SettingsError(
-                            message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema {hf_dataset.features}"
-                        ) from e
-                    else:
-                        raise e
+                hf_dataset = load_dataset(path=repo_id, **kwargs)  # type: ignore
+                hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, **kwargs)
+                cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
             except EmptyDatasetError:
                 warnings.warn(
                     message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
@@ -221,9 +204,7 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
         records = []
         for idx, row in enumerate(hf_dataset):
             record = mapper(row)
-            record.id = row.pop("id")
             for question_name, values in response_questions.items():
-                response_users = {}
                 response_values = values["responses"][idx]
                 response_users = values["users"][idx]
                 response_status = values["status"][idx]
@@ -240,4 +221,31 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
                 )
                 record.responses.add(response)
             records.append(record)
-        dataset.records.log(records=records)
+
+        try:
+            dataset.records.log(records=records)
+        except (RecordsIngestionError, UnprocessableEntityError) as e:
+            raise SettingsError(
+                message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema. Hugging Face dataset features: {hf_dataset.features}. Argilla dataset settings: {dataset.settings}"
+            ) from e
+
+    @staticmethod
+    def _get_dataset_split(hf_dataset: "HFDataset", split: Optional[str] = None, **kwargs: Dict) -> "HFDataset":
+        """Select a single split from a Hugging Face dataset.
+
+        Parameters:
+            hf_dataset (HFDataset): The Hugging Face dataset to select a split from.
+
+        Returns:
+            HFDataset: The selected split.
+        """
+
+        if isinstance(hf_dataset, DatasetDict) and split is None:
+            split = next(iter(hf_dataset.keys()))
+            if len(hf_dataset.keys()) > 1:
+                warnings.warn(
+                    message=f"Multiple splits found in Hugging Face dataset. Using the first split: {split}. "
+                    f"Available splits are: {', '.join(hf_dataset.keys())}."
+                )
+            hf_dataset = hf_dataset[split]
+        return hf_dataset
diff --git a/argilla/tests/integration/test_export_dataset.py b/argilla/tests/integration/test_export_dataset.py
index 92e3272ccc..041871bce1 100644
--- a/argilla/tests/integration/test_export_dataset.py
+++ b/argilla/tests/integration/test_export_dataset.py
@@ -250,10 +250,70 @@ def test_import_dataset_from_hub_using_settings(

         assert new_dataset.settings.fields[0].name == "text"
         assert new_dataset.settings.questions[0].name == "label"

+    @pytest.mark.parametrize("with_records_import", [True, False])
+    def test_import_dataset_from_hub_using_settings(
+        self,
+        token: str,
+        dataset: rg.Dataset,
+        client,
+        mock_data: List[dict[str, Any]],
+        with_records_export: bool,
+        with_records_import: bool,
+    ):
+        repo_id = (
+            f"argilla-internal-testing/test_import_dataset_from_hub_using_settings_with_records{with_records_export}"
+        )
+        mock_dataset_name = f"test_import_dataset_from_hub_using_settings_{uuid.uuid4()}"
+        dataset.records.log(records=mock_data)
+
+        dataset.to_hub(repo_id=repo_id, with_records=with_records_export, token=token)
+        settings = rg.Settings(
+            fields=[
+                rg.TextField(name="text"),
+            ],
+            questions=[
+                rg.LabelQuestion(name="label", labels=["positive", "negative"]),
+                rg.LabelQuestion(name="extra_label", labels=["extra_positive", "extra_negative"]),
+            ],
+        )
+        if with_records_import and not with_records_export:
+            with pytest.warns(
+                expected_warning=UserWarning,
+                match="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
+            ):
+                new_dataset = rg.Dataset.from_hub(
+                    repo_id=repo_id,
+                    client=client,
+                    with_records=with_records_import,
+                    token=token,
+                    settings=settings,
+                    name=mock_dataset_name,
+                )
+        else:
+            new_dataset = rg.Dataset.from_hub(
+                repo_id=repo_id,
+                client=client,
+                with_records=with_records_import,
+                token=token,
+                settings=settings,
+                name=mock_dataset_name,
+            )
+
+        if with_records_import and with_records_export:
+            for i, record in enumerate(new_dataset.records(with_suggestions=True)):
+                assert record.fields["text"] == mock_data[i]["text"]
+                assert record.suggestions["label"].value == mock_data[i]["label"]
+        else:
+            assert len(new_dataset.records.to_list()) == 0
+
+        assert new_dataset.settings.fields[0].name == "text"
+        assert new_dataset.settings.questions[0].name == "label"
         assert new_dataset.settings.questions[1].name == "extra_label"
         assert len(new_dataset.settings.questions[1].labels) == 2
         assert new_dataset.settings.questions[1].labels[0] == "extra_positive"
         assert new_dataset.settings.questions[1].labels[1] == "extra_negative"
+        assert new_dataset.name == mock_dataset_name

     def test_import_dataset_from_hub_using_wrong_settings(
         self,

From 7495136122952d59ccee32a0c52fd1026dba6e25 Mon Sep 17 00:00:00 2001
From: Natalia Elvira <126158523+nataliaElv@users.noreply.github.com>
Date: Fri, 30 Aug 2024 11:01:50 +0200
Subject: [PATCH 2/2] Docs: advanced queries dsl (#5435)

# Description

Closes #

**Type of change**

- Bug fix (non-breaking change which fixes an issue)
- New feature (non-breaking change which adds functionality)
- Breaking change (fix or feature that would cause existing functionality to not work as expected)
- Refactor (change restructuring the codebase without changing functionality)
- Improvement (change adding some improvement to an existing functionality)
- Documentation update

**How Has This Been Tested**

**Checklist**

- I added relevant documentation
- I followed the style guidelines of this project
- I did a self-review of my code
- I made corresponding changes to the documentation
- I confirm my changes generate no new warnings
- I have added tests that prove my fix is effective or that my feature works
- I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Francisco Aranda
---
 argilla/docs/how_to_guides/annotate.md |  6 ++++++
 argilla/docs/how_to_guides/query.md    | 23 ++++++++++++++++++++---
 argilla/mkdocs.yml                     |  4 +++-
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/argilla/docs/how_to_guides/annotate.md b/argilla/docs/how_to_guides/annotate.md
index ba5fdb64e9..b825b76020 100644
--- a/argilla/docs/how_to_guides/annotate.md
+++ b/argilla/docs/how_to_guides/annotate.md
@@ -136,6 +136,12 @@ The UI offers various features designed for data exploration and understanding.

 From the **control panel** at the top of the left pane, you can search by keyword across the entire dataset. If you have more than one field in your records, you may specify if the search is to be performed “All” fields or on a specific one. Matched results are highlighted in color.

+!!! note
+    If you introduce more than one keyword, the search will return results where **all** keywords have a match.
+
+!!! tip
+    For more advanced searches, take a look at the [advanced queries DSL](query.md#advanced-queries).
+
 ### Order by record semantic similarity

 You can retrieve records based on their similarity to another record if vectors have been added to the dataset.
diff --git a/argilla/docs/how_to_guides/query.md b/argilla/docs/how_to_guides/query.md
index 9d1aab1c7a..eaf2c877f3 100644
--- a/argilla/docs/how_to_guides/query.md
+++ b/argilla/docs/how_to_guides/query.md
@@ -35,7 +35,7 @@ You can search for records in your dataset by **querying** or **filtering**. The
 To search for records with terms, you can use the `Dataset.records` attribute with a query string. The search terms are used to search for records that contain the terms in the text field. You can search a single term or various terms, in the latter, all of them should appear in the record to be retrieved.

-=== "Single search term"
+=== "Single term search"

     ```python
     import argilla as rg
@@ -49,7 +49,7 @@ To search for records with terms, you can use the `Dataset.records` attribute wi
     queried_records = dataset.records(query=query).to_list(flatten=True)
     ```

-=== "Multiple search term"
+=== "Multiple terms search"

     ```python
     import argilla as rg
@@ -63,6 +63,23 @@ To search for records with terms, you can use the `Dataset.records` attribute wi
     queried_records = dataset.records(query=query).to_list(flatten=True)
     ```

+### Advanced queries
+
+If you need more complex searches, you can use [Elasticsearch's simple query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax). Here is a summary of the different available operators:
+
+| operator     | description                 | example                                                                |
+| ------------ | --------------------------- | ---------------------------------------------------------------------- |
+|`+` or `space`| **AND**: search both terms  | `argilla + distilabel` or `argilla distilabel`<br> returns records that include the terms "argilla" and "distilabel"|
+|`|`           | **OR**: search either term  | `argilla | distilabel`<br> returns records that include the term "argilla" or "distilabel"|
+|`-`           | **Negation**: exclude a term| `argilla -distilabel`<br> returns records that contain the term "argilla" and don't have the term "distilabel"|
+|`*`           | **Prefix**: search a prefix | `arg*`<br> returns records with any words starting with "arg-"|
+|`"`           | **Phrase**: search a phrase | `"argilla and distilabel"`<br> returns records that contain the phrase "argilla and distilabel"|
+|`(` and `)`   | **Precedence**: group terms | `(argilla | distilabel) rules`<br> returns records that contain either "argilla" or "distilabel" and "rules"|
+|`~N`          | **Edit distance**: search a term or phrase with an edit distance| `argilla~1`<br> returns records that contain the term "argilla" with an edit distance of 1, e.g. "argila"|
+
+!!! tip
+    To use one of these characters literally, escape it with a preceding backslash `\`, e.g. `"1 \+ 2"` would match records where the phrase "1 + 2" is found.
+
 ## Filter by conditions

 You can use the `Filter` class to define the conditions and pass them to the `Dataset.records` attribute to fetch records based on the conditions. Conditions include "==", ">=", "<=", or "in". Conditions can be combined with dot notation to filter records based on metadata, suggestions, or responses. You can use a single condition or multiple conditions to filter records.
@@ -72,7 +89,7 @@ You can use the `Filter` class to define the conditions and pass them to the `Da
 | operator | description                                               |
 | -------- | --------------------------------------------------------- |
 | `==`     | The `field` value is equal to the `value`                 |
 | `>=`     | The `field` value is greater than or equal to the `value` |
 | `<=`     | The `field` value is less than or equal to the `value`    |
-| `in`     | TThe `field` value is included in a list of values        |
+| `in`     | The `field` value is included in a list of values         |

 === "Single condition"
diff --git a/argilla/mkdocs.yml b/argilla/mkdocs.yml
index 57aef13e62..64d8f08a9c 100644
--- a/argilla/mkdocs.yml
+++ b/argilla/mkdocs.yml
@@ -121,6 +121,7 @@ plugins:
       - docs/scripts/gen_changelog.py
      - docs/scripts/gen_popular_issues.py
      # - docs/scripts/gen_ref_pages.py
+    enabled: !ENV [CI, false] # enables the plugin only during continuous integration (CI); disabled on local builds
   - literate-nav:
       nav_file: SUMMARY.md
   - section-index
@@ -148,7 +149,8 @@ plugins:
       # Signature
       separate_signature: false
       show_signature_annotations: false

-  - social
+  - social:
+      enabled: !ENV [CI, false] # enables the plugin only during continuous integration (CI); disabled on local builds
   - mknotebooks
   - material-plausible
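
A minimal end-to-end sketch of the behavior the two patches above introduce, for reviewers who want to try it locally. The server URL, API key, and Hugging Face repo id below are hypothetical placeholders; the `from_hub`, `rg.Settings`, and `dataset.records(query=...)` calls mirror the ones exercised in the test and docs diffs above.

```python
# Hypothetical usage sketch (placeholder server URL, API key, and repo id);
# assumes a running Argilla server and an existing dataset repo on the Hub.
import argilla as rg

client = rg.Argilla(api_url="http://localhost:6900", api_key="argilla.apikey")  # placeholders

settings = rg.Settings(
    fields=[rg.TextField(name="text")],
    questions=[rg.LabelQuestion(name="label", labels=["positive", "negative"])],
)

# PATCH 1/2: from_hub() now derives a default `name` from the repo id,
# falls back to the first split of a multi-split Hub dataset (with a warning)
# instead of raising, and wraps ingestion failures in SettingsError.
dataset = rg.Dataset.from_hub(
    repo_id="my-org/my-hf-dataset",  # placeholder repo id
    settings=settings,
    client=client,
    with_records=True,
)

# PATCH 2/2: the documented simple query string syntax, here using the
# OR operator from the table above.
queried_records = dataset.records(query="argilla | distilabel").to_list(flatten=True)
```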