Merge branch 'develop' into feat/add-image-field-support

jfcalvo authored Sep 3, 2024
2 parents de6c3fa + 7495136 commit b22e844
Showing 5 changed files with 129 additions and 36 deletions.
6 changes: 6 additions & 0 deletions argilla/docs/how_to_guides/annotate.md
@@ -136,6 +136,12 @@ The UI offers various features designed for data exploration and understanding.

From the **control panel** at the top of the left pane, you can search by keyword across the entire dataset. If you have more than one field in your records, you may specify whether the search is to be performed on “All” fields or on a specific one. Matched results are highlighted in color.

!!! note
    If you enter more than one keyword, the search will return results where **all** keywords have a match.

!!! tip
For more advanced searches, take a look at the [advanced queries DSL](query.md#advanced-queries).

### Order by record semantic similarity

You can retrieve records based on their similarity to another record if vectors have been added to the dataset.
23 changes: 20 additions & 3 deletions argilla/docs/how_to_guides/query.md
@@ -35,7 +35,7 @@ You can search for records in your dataset by **querying** or **filtering**. The

To search for records with terms, you can use the `Dataset.records` attribute with a query string. The search terms are used to retrieve records that contain them in the text field. You can search for a single term or for multiple terms; in the latter case, all of them must appear in the record for it to be retrieved.

=== "Single search term"
=== "Single term search"

```python
import argilla as rg
@@ -49,7 +49,7 @@ To search for records with terms, you can use the `Dataset.records` attribute wi
queried_records = dataset.records(query=query).to_list(flatten=True)
```

=== "Multiple search term"
=== "Multiple terms search"

```python
import argilla as rg
@@ -63,6 +63,23 @@ To search for records with terms, you can use the `Dataset.records` attribute wi
queried_records = dataset.records(query=query).to_list(flatten=True)
```
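A minimal sketch of the term-search pattern in full, assuming an Argilla server at `<api_url>` and an existing dataset named `my_dataset` (illustrative values):

```python
import argilla as rg

client = rg.Argilla(api_url="<api_url>", api_key="<api_key>")
dataset = client.datasets(name="my_dataset")

# Single term: retrieve records whose text field contains "argilla".
query = rg.Query(query="argilla")
queried_records = dataset.records(query=query).to_list(flatten=True)

# Multiple terms: every term must appear in the record.
query = rg.Query(query="argilla distilabel")
queried_records = dataset.records(query=query).to_list(flatten=True)
```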

### Advanced queries

If you need more complex searches, you can use [Elasticsearch's simple query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax). Here is a summary of the different available operators:

| operator | description | example |
| ------------ | --------------------------- | --------------------------------------------------------------------- |
|`+` or `space`| **AND**: search both terms | `argilla + distilabel` or `argilla distilabel` <br> return records that include the terms "argilla" and "distilabel"|
|`\|` | **OR**: search either term | `argilla \| distilabel` <br> returns records that include the term "argilla" or "distilabel"|
|`-` | **Negation**: exclude a term| `argilla -distilabel` <br> returns records that contain the term "argilla" but not the term "distilabel"|
|`*` | **Prefix**: search a prefix | `arg*` <br> returns records with any words starting with "arg"|
|`"` | **Phrase**: search a phrase | `"argilla and distilabel"` <br> returns records that contain the phrase "argilla and distilabel"|
|`(` and `)` | **Precedence**: group terms | `(argilla \| distilabel) rules` <br> returns records that contain either "argilla" or "distilabel", and "rules"|
|`~N` | **Edit distance**: search a term or phrase with an edit distance| `argilla~1` <br> returns records that contain the term "argilla" with an edit distance of 1, e.g. "argila"|

!!! tip
To use one of these characters literally, escape it with a preceding backslash `\`, e.g. `"1 \+ 2"` would match records where the phrase "1 + 2" is found.
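These operators go in the same query string as above; a minimal sketch, assuming the `dataset` from the earlier example:

```python
import argilla as rg

# Records containing "rules" together with either "argilla" or "distilabel",
# excluding any record that mentions "legacy".
query = rg.Query(query="(argilla | distilabel) rules -legacy")
queried_records = dataset.records(query=query).to_list(flatten=True)
```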

## Filter by conditions

You can use the `Filter` class to define the conditions and pass them to the `Dataset.records` attribute to fetch records based on the conditions. Conditions include "==", ">=", "<=", or "in". Conditions can be combined with dot notation to filter records based on metadata, suggestions, or responses. You can use a single condition or multiple conditions to filter records.
@@ -72,7 +89,7 @@ You can use the `Filter` class to define the conditions and pass them to the `Da
| `==` | The `field` value is equal to the `value` |
| `>=` | The `field` value is greater than or equal to the `value` |
| `<=` | The `field` value is less than or equal to the `value` |
| `in` | TThe `field` value is included in a list of values |
| `in` | The `field` value is included in a list of values |

=== "Single condition"

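A minimal sketch of a single-condition filter, assuming the same `dataset` and a `label` question that has suggestions:

```python
import argilla as rg

# Keep only records whose suggested label is "positive".
label_filter = rg.Filter(("label.suggestion", "==", "positive"))
filtered_records = dataset.records(
    query=rg.Query(filter=label_filter)
).to_list(flatten=True)
```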
4 changes: 3 additions & 1 deletion argilla/mkdocs.yml
@@ -121,6 +121,7 @@ plugins:
- docs/scripts/gen_changelog.py
- docs/scripts/gen_popular_issues.py
# - docs/scripts/gen_ref_pages.py
enabled: !ENV [CI, false] # enables the plugin only during continuous integration (CI), disabled on local build
- literate-nav:
nav_file: SUMMARY.md
- section-index
@@ -148,7 +149,8 @@ plugins:
# Signature
separate_signature: false
show_signature_annotations: false
- social
- social:
enabled: !ENV [CI, false] # enables the plugin only during continuous integration (CI), disabled on local build
- mknotebooks
- material-plausible

72 changes: 40 additions & 32 deletions argilla/src/argilla/datasets/_export/_hub.py
@@ -16,14 +16,15 @@
import warnings
from collections import defaultdict
from tempfile import TemporaryDirectory
from typing import TYPE_CHECKING, Any, Optional, Type, Union
from typing import TYPE_CHECKING, Any, Optional, Type, Union, Dict
from uuid import UUID

from datasets import DatasetDict
from datasets.data_files import EmptyDatasetError

from argilla._exceptions._api import UnprocessableEntityError
from argilla._exceptions._records import RecordsIngestionError
from argilla._exceptions._settings import SettingsError
from datasets.data_files import EmptyDatasetError

from argilla.datasets._export._disk import DiskImportExportMixin
from argilla.records._mapping import IngestedRecordMapper
from argilla.responses import Response
@@ -72,6 +73,7 @@ def to_hub(

with TemporaryDirectory() as tmpdirname:
config_dir = os.path.join(tmpdirname)

self.to_disk(path=config_dir, with_records=False)

if generate_card:
@@ -129,9 +131,12 @@
Returns:
A `Dataset` loaded from the Hugging Face Hub.
"""
from datasets import Dataset, DatasetDict, load_dataset
from datasets import load_dataset
from huggingface_hub import snapshot_download

if name is None:
name = repo_id.replace("/", "_")

if settings is not None:
dataset = cls(name=name, settings=settings)
dataset.create()
@@ -150,31 +155,9 @@

if with_records:
try:
hf_dataset: Dataset = load_dataset(path=repo_id, **kwargs) # type: ignore
if isinstance(hf_dataset, DatasetDict) and "split" not in kwargs:
if len(hf_dataset.keys()) > 1:
raise ValueError(
"Only one dataset can be loaded at a time, use `split` to select a split, available splits"
f" are: {', '.join(hf_dataset.keys())}."
)
hf_dataset: Dataset = hf_dataset[list(hf_dataset.keys())[0]]
for feature in hf_dataset.features:
if feature not in dataset.settings.fields or feature not in dataset.settings.questions:
warnings.warn(
message=f"Feature {feature} in Hugging Face dataset is not defined in dataset settings."
)
warnings.warn(
message=f"Available fields: {dataset.settings.fields}. Available questions: {dataset.settings.questions}."
)
try:
cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
except (RecordsIngestionError, UnprocessableEntityError) as e:
if settings is not None:
raise SettingsError(
message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema {hf_dataset.features}"
) from e
else:
raise e
hf_dataset = load_dataset(path=repo_id, **kwargs) # type: ignore
hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, **kwargs)
cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
except EmptyDatasetError:
warnings.warn(
message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
@@ -221,9 +204,7 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
records = []
for idx, row in enumerate(hf_dataset):
record = mapper(row)
record.id = row.pop("id")
for question_name, values in response_questions.items():
response_users = {}
response_values = values["responses"][idx]
response_users = values["users"][idx]
response_status = values["status"][idx]
Expand All @@ -240,4 +221,31 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
)
record.responses.add(response)
records.append(record)
dataset.records.log(records=records)

try:
dataset.records.log(records=records)
except (RecordsIngestionError, UnprocessableEntityError) as e:
raise SettingsError(
message=f"Failed to load records from Hugging Face dataset. Defined settings do not match the dataset schema. Hugging Face dataset features: {hf_dataset.features}. Argilla dataset settings: {dataset.settings}"
) from e

@staticmethod
def _get_dataset_split(hf_dataset: "HFDataset", split: Optional[str] = None, **kwargs: Dict) -> "HFDataset":
    """Get a single dataset split from a Hugging Face dataset.

    Parameters:
        hf_dataset (HFDataset): The Hugging Face dataset to select a split from.
        split (Optional[str]): The split to select. Defaults to the first available split.

    Returns:
        HFDataset: The selected dataset split.
    """

if isinstance(hf_dataset, DatasetDict) and split is None:
split = next(iter(hf_dataset.keys()))
if len(hf_dataset.keys()) > 1:
warnings.warn(
message=f"Multiple splits found in Hugging Face dataset. Using the first split: {split}. "
f"Available splits are: {', '.join(hf_dataset.keys())}."
)
hf_dataset = hf_dataset[split]
return hf_dataset
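A usage sketch of the reworked `from_hub` path, assuming a configured client and an illustrative repo id:

```python
import argilla as rg

client = rg.Argilla(api_url="<api_url>", api_key="<api_key>")

# `name` is optional and now defaults to the repo id with "/" replaced by "_".
# If the repo has several splits and no `split` kwarg is given, the first
# split is used and a warning lists the available ones.
dataset = rg.Dataset.from_hub(
    repo_id="argilla/my-dataset",  # hypothetical repo id
    client=client,
    with_records=True,
    split="train",  # forwarded to datasets.load_dataset
)
```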
60 changes: 60 additions & 0 deletions argilla/tests/integration/test_export_dataset.py
@@ -259,10 +259,70 @@ def test_import_dataset_from_hub_using_settings(

assert new_dataset.settings.fields[0].name == "text"
assert new_dataset.settings.questions[0].name == "label"

@pytest.mark.parametrize("with_records_export", [True, False])
@pytest.mark.parametrize("with_records_import", [True, False])
def test_import_dataset_from_hub_using_settings_with_records(
self,
token: str,
dataset: rg.Dataset,
client,
mock_data: List[dict[str, Any]],
with_records_export: bool,
with_records_import: bool,
):
repo_id = (
f"argilla-internal-testing/test_import_dataset_from_hub_using_settings_with_records{with_records_export}"
)
mock_dataset_name = f"test_import_dataset_from_hub_using_settings_{uuid.uuid4()}"
dataset.records.log(records=mock_data)

dataset.to_hub(repo_id=repo_id, with_records=with_records_export, token=token)
settings = rg.Settings(
fields=[
rg.TextField(name="text"),
],
questions=[
rg.LabelQuestion(name="label", labels=["positive", "negative"]),
rg.LabelQuestion(name="extra_label", labels=["extra_positive", "extra_negative"]),
],
)
if with_records_import and not with_records_export:
with pytest.warns(
expected_warning=UserWarning,
match="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
):
new_dataset = rg.Dataset.from_hub(
repo_id=repo_id,
client=client,
with_records=with_records_import,
token=token,
settings=settings,
name=mock_dataset_name,
)
else:
new_dataset = rg.Dataset.from_hub(
repo_id=repo_id,
client=client,
with_records=with_records_import,
token=token,
settings=settings,
name=mock_dataset_name,
)

if with_records_import and with_records_export:
for i, record in enumerate(new_dataset.records(with_suggestions=True)):
assert record.fields["text"] == mock_data[i]["text"]
assert record.suggestions["label"].value == mock_data[i]["label"]
else:
assert len(new_dataset.records.to_list()) == 0

assert new_dataset.settings.fields[0].name == "text"
assert new_dataset.settings.questions[0].name == "label"
assert new_dataset.settings.questions[1].name == "extra_label"
assert len(new_dataset.settings.questions[1].labels) == 2
assert new_dataset.settings.questions[1].labels[0] == "extra_positive"
assert new_dataset.settings.questions[1].labels[1] == "extra_negative"
assert new_dataset.name == mock_dataset_name

def test_import_dataset_from_hub_using_wrong_settings(
self,