From e28ef13caf11aa94afa7ef89957334469564259b Mon Sep 17 00:00:00 2001
From: burtenshaw
Date: Thu, 29 Aug 2024 11:43:32 +0200
Subject: [PATCH 1/2] [REFACTOR] refactor from hub method to simplify method
 (#5420)

# Description

Closes #

**Type of change**

- Bug fix (non-breaking change which fixes an issue)
- New feature (non-breaking change which adds functionality)
- Breaking change (fix or feature that would cause existing functionality to not work as expected)
- Refactor (change restructuring the codebase without changing functionality)
- Improvement (change adding some improvement to an existing functionality)
- Documentation update

**How Has This Been Tested**

**Checklist**

- I added relevant documentation
- I followed the style guidelines of this project
- I did a self-review of my code
- I made corresponding changes to the documentation
- I confirm my changes generate no new warnings
- I have added tests that prove my fix is effective or that my feature works
- I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/)

---------

Co-authored-by: Paco Aranda
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
---
 argilla/src/argilla/datasets/_export/_hub.py  | 72 ++++++++++---------
 .../tests/integration/test_export_dataset.py  | 60 ++++++++++++++++
 2 files changed, 100 insertions(+), 32 deletions(-)

diff --git a/argilla/src/argilla/datasets/_export/_hub.py b/argilla/src/argilla/datasets/_export/_hub.py
index 0d33f428ce..e6ec2476c6 100644
--- a/argilla/src/argilla/datasets/_export/_hub.py
+++ b/argilla/src/argilla/datasets/_export/_hub.py
@@ -16,14 +16,15 @@
 import warnings
 from collections import defaultdict
 from tempfile import TemporaryDirectory
-from typing import TYPE_CHECKING, Any, Optional, Type, Union
+from typing import TYPE_CHECKING, Any, Optional, Type, Union, Dict
 from uuid import UUID

+from datasets import DatasetDict
+from datasets.data_files import EmptyDatasetError
+
 from argilla._exceptions._api import UnprocessableEntityError
 from argilla._exceptions._records import RecordsIngestionError
 from argilla._exceptions._settings import SettingsError
-from datasets.data_files import EmptyDatasetError
-
 from argilla.datasets._export._disk import DiskImportExportMixin
 from argilla.records._mapping import IngestedRecordMapper
 from argilla.responses import Response
@@ -72,6 +73,7 @@ def to_hub(

         with TemporaryDirectory() as tmpdirname:
             config_dir = os.path.join(tmpdirname)
+
             self.to_disk(path=config_dir, with_records=False)

             if generate_card:
@@ -129,9 +131,12 @@ def from_hub(
         Returns:
             A `Dataset` loaded from the Hugging Face Hub.
         """
-        from datasets import Dataset, DatasetDict, load_dataset
+        from datasets import load_dataset
         from huggingface_hub import snapshot_download

+        if name is None:
+            name = repo_id.replace("/", "_")
+
         if settings is not None:
             dataset = cls(name=name, settings=settings)
             dataset.create()
@@ -150,31 +155,9 @@ def from_hub(

         if with_records:
             try:
-                hf_dataset: Dataset = load_dataset(path=repo_id, **kwargs)  # type: ignore
-                if isinstance(hf_dataset, DatasetDict) and "split" not in kwargs:
-                    if len(hf_dataset.keys()) > 1:
-                        raise ValueError(
-                            "Only one dataset can be loaded at a time, use `split` to select a split, available splits"
-                            f" are: {', '.join(hf_dataset.keys())}."
-                        )
-                    hf_dataset: Dataset = hf_dataset[list(hf_dataset.keys())[0]]
-                for feature in hf_dataset.features:
-                    if feature not in dataset.settings.fields or feature not in dataset.settings.questions:
-                        warnings.warn(
-                            message=f"Feature {feature} in Hugging Face dataset is not defined in dataset settings."
-                        )
-                        warnings.warn(
-                            message=f"Available fields: {dataset.settings.fields}. Available questions: {dataset.settings.questions}."
-                        )
-                try:
-                    cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
-                except (RecordsIngestionError, UnprocessableEntityError) as e:
-                    if settings is not None:
-                        raise SettingsError(
-                            message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema {hf_dataset.features}"
-                        ) from e
-                    else:
-                        raise e
+                hf_dataset = load_dataset(path=repo_id, **kwargs)  # type: ignore
+                hf_dataset = cls._get_dataset_split(hf_dataset=hf_dataset, **kwargs)
+                cls._log_dataset_records(hf_dataset=hf_dataset, dataset=dataset)
             except EmptyDatasetError:
                 warnings.warn(
                     message="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
@@ -221,9 +204,7 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
         records = []
         for idx, row in enumerate(hf_dataset):
             record = mapper(row)
-            record.id = row.pop("id")
             for question_name, values in response_questions.items():
-                response_users = {}
                 response_values = values["responses"][idx]
                 response_users = values["users"][idx]
                 response_status = values["status"][idx]
@@ -240,4 +221,31 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
                 )
                 record.responses.add(response)
             records.append(record)
-        dataset.records.log(records=records)
+
+        try:
+            dataset.records.log(records=records)
+        except (RecordsIngestionError, UnprocessableEntityError) as e:
+            raise SettingsError(
+                message=f"Failed to load records from Hugging Face dataset. Defined settings do not match dataset schema. Hugging Face dataset features: {hf_dataset.features}. Argilla dataset settings: {dataset.settings}"
+            ) from e
+
+    @staticmethod
+    def _get_dataset_split(hf_dataset: "HFDataset", split: Optional[str] = None, **kwargs: Dict) -> "HFDataset":
+        """Select a single split from a Hugging Face dataset.
+
+        Parameters:
+            hf_dataset (HFDataset): The Hugging Face dataset to select a split from.
+
+        Returns:
+            HFDataset: The selected split.
+        """
+
+        if isinstance(hf_dataset, DatasetDict) and split is None:
+            split = next(iter(hf_dataset.keys()))
+            if len(hf_dataset.keys()) > 1:
+                warnings.warn(
+                    message=f"Multiple splits found in Hugging Face dataset. Using the first split: {split}. "
+                    f"Available splits are: {', '.join(hf_dataset.keys())}."
+                )
+            hf_dataset = hf_dataset[split]
+        return hf_dataset
diff --git a/argilla/tests/integration/test_export_dataset.py b/argilla/tests/integration/test_export_dataset.py
index 92e3272ccc..041871bce1 100644
--- a/argilla/tests/integration/test_export_dataset.py
+++ b/argilla/tests/integration/test_export_dataset.py
@@ -250,10 +250,70 @@ def test_import_dataset_from_hub_using_settings(

         assert new_dataset.settings.fields[0].name == "text"
         assert new_dataset.settings.questions[0].name == "label"

+    @pytest.mark.parametrize("with_records_import", [True, False])
+    def test_import_dataset_from_hub_using_settings(
+        self,
+        token: str,
+        dataset: rg.Dataset,
+        client,
+        mock_data: List[dict[str, Any]],
+        with_records_export: bool,
+        with_records_import: bool,
+    ):
+        repo_id = (
+            f"argilla-internal-testing/test_import_dataset_from_hub_using_settings_with_records{with_records_export}"
+        )
+        mock_dataset_name = f"test_import_dataset_from_hub_using_settings_{uuid.uuid4()}"
+        dataset.records.log(records=mock_data)
+
+        dataset.to_hub(repo_id=repo_id, with_records=with_records_export, token=token)
+        settings = rg.Settings(
+            fields=[
+                rg.TextField(name="text"),
+            ],
+            questions=[
+                rg.LabelQuestion(name="label", labels=["positive", "negative"]),
+                rg.LabelQuestion(name="extra_label", labels=["extra_positive", "extra_negative"]),
+            ],
+        )
+        if with_records_import and not with_records_export:
+            with pytest.warns(
+                expected_warning=UserWarning,
+                match="Trying to load a dataset `with_records=True` but dataset does not contain any records.",
+            ):
+                new_dataset = rg.Dataset.from_hub(
+                    repo_id=repo_id,
+                    client=client,
+                    with_records=with_records_import,
+                    token=token,
+                    settings=settings,
+                    name=mock_dataset_name,
+                )
+        else:
+            new_dataset = rg.Dataset.from_hub(
+                repo_id=repo_id,
+                client=client,
+                with_records=with_records_import,
+                token=token,
+                settings=settings,
+                name=mock_dataset_name,
+            )
+
+        if with_records_import and with_records_export:
+            for i, record in enumerate(new_dataset.records(with_suggestions=True)):
+                assert record.fields["text"] == mock_data[i]["text"]
+                assert record.suggestions["label"].value == mock_data[i]["label"]
+        else:
+            assert len(new_dataset.records.to_list()) == 0
+
+        assert new_dataset.settings.fields[0].name == "text"
+        assert new_dataset.settings.questions[0].name == "label"
         assert new_dataset.settings.questions[1].name == "extra_label"
         assert len(new_dataset.settings.questions[1].labels) == 2
         assert new_dataset.settings.questions[1].labels[0] == "extra_positive"
         assert new_dataset.settings.questions[1].labels[1] == "extra_negative"
+        assert new_dataset.name == mock_dataset_name

     def test_import_dataset_from_hub_using_wrong_settings(
         self,

From 7495136122952d59ccee32a0c52fd1026dba6e25 Mon Sep 17 00:00:00 2001
From: Natalia Elvira <126158523+nataliaElv@users.noreply.github.com>
Date: Fri, 30 Aug 2024 11:01:50 +0200
Subject: [PATCH 2/2] Docs: advanced queries dsl (#5435)

# Description

Closes #

**Type of change**

- Bug fix (non-breaking change which fixes an issue)
- New feature (non-breaking change which adds functionality)
- Breaking change (fix or feature that would cause existing functionality to not work as expected)
- Refactor (change restructuring the codebase without changing functionality)
- Improvement (change adding some improvement to an existing functionality)
- Documentation update

**How Has This Been Tested**

**Checklist**

- I added relevant documentation
- I followed the style guidelines of this project
- I did a self-review of my code
- I made corresponding changes to the documentation
- I confirm my changes generate no new warnings
- I have added tests that prove my fix is effective or that my feature works
- I have added relevant notes to the CHANGELOG.md file (See https://keepachangelog.com/)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Francisco Aranda
---
 argilla/docs/how_to_guides/annotate.md |  6 ++++++
 argilla/docs/how_to_guides/query.md    | 23 ++++++++++++++++++++---
 argilla/mkdocs.yml                     |  4 +++-
 3 files changed, 29 insertions(+), 4 deletions(-)

diff --git a/argilla/docs/how_to_guides/annotate.md b/argilla/docs/how_to_guides/annotate.md
index ba5fdb64e9..b825b76020 100644
--- a/argilla/docs/how_to_guides/annotate.md
+++ b/argilla/docs/how_to_guides/annotate.md
@@ -136,6 +136,12 @@ The UI offers various features designed for data exploration and understanding.

 From the **control panel** at the top of the left pane, you can search by keyword across the entire dataset. If you have more than one field in your records, you may specify if the search is to be performed “All” fields or on a specific one. Matched results are highlighted in color.

+!!! note
+    If you introduce more than one keyword, the search will return results where **all** keywords have a match.
+
+!!! tip
+    For more advanced searches, take a look at the [advanced queries DSL](query.md#advanced-queries).
+
 ### Order by record semantic similarity

 You can retrieve records based on their similarity to another record if vectors have been added to the dataset.
diff --git a/argilla/docs/how_to_guides/query.md b/argilla/docs/how_to_guides/query.md
index 9d1aab1c7a..eaf2c877f3 100644
--- a/argilla/docs/how_to_guides/query.md
+++ b/argilla/docs/how_to_guides/query.md
@@ -35,7 +35,7 @@ You can search for records in your dataset by **querying** or **filtering**. The
 To search for records with terms, you can use the `Dataset.records` attribute with a query string. The search terms are used to search for records that contain the terms in the text field. You can search a single term or various terms, in the latter, all of them should appear in the record to be retrieved.

-=== "Single search term"
+=== "Single term search"

     ```python
     import argilla as rg
@@ -49,7 +49,7 @@ To search for records with terms, you can use the `Dataset.records` attribute wi
     queried_records = dataset.records(query=query).to_list(flatten=True)
     ```

-=== "Multiple search term"
+=== "Multiple terms search"

     ```python
     import argilla as rg
@@ -63,6 +63,23 @@ To search for records with terms, you can use the `Dataset.records` attribute wi
     queried_records = dataset.records(query=query).to_list(flatten=True)
     ```

+### Advanced queries
+
+If you need more complex searches, you can use [Elasticsearch's simple query string syntax](https://www.elastic.co/guide/en/elasticsearch/reference/current/query-dsl-simple-query-string-query.html#simple-query-string-syntax). Here is a summary of the different available operators:
+
+| operator     | description                 | example                                                                |
+| ------------ | --------------------------- | ---------------------------------------------------------------------- |
+|`+` or `space`| **AND**: search both terms  | `argilla + distilabel` or `argilla distilabel`<br> returns records that include the terms "argilla" and "distilabel"|
+|`|`           | **OR**: search either term  | `argilla | distilabel`<br> returns records that include the term "argilla" or "distilabel"|
+|`-`           | **Negation**: exclude a term| `argilla -distilabel`<br> returns records that contain the term "argilla" and don't have the term "distilabel"|
+|`*`           | **Prefix**: search a prefix | `arg*`<br> returns records with any words starting with "arg-"|
+|`"`           | **Phrase**: search a phrase | `"argilla and distilabel"`<br> returns records that contain the phrase "argilla and distilabel"|
+|`(` and `)`   | **Precedence**: group terms | `(argilla | distilabel) rules`<br> returns records that contain either "argilla" or "distilabel" and "rules"|
+|`~N`          | **Edit distance**: search a term or phrase with an edit distance| `argilla~1`<br> returns records that contain the term "argilla" with an edit distance of 1, e.g. "argila"|
+
+!!! tip
+    To use one of these characters literally, escape it with a preceding backslash `\`, e.g. `"1 \+ 2"` would match records where the phrase "1 + 2" is found.
+
 ## Filter by conditions

 You can use the `Filter` class to define the conditions and pass them to the `Dataset.records` attribute to fetch records based on the conditions. Conditions include "==", ">=", "<=", or "in". Conditions can be combined with dot notation to filter records based on metadata, suggestions, or responses. You can use a single condition or multiple conditions to filter records.
@@ -72,7 +89,7 @@ You can use the `Filter` class to define the conditions and pass them to the `Da
 | operator | description                                               |
 | -------- | --------------------------------------------------------- |
 | `==`     | The `field` value is equal to the `value`                 |
 | `>=`     | The `field` value is greater than or equal to the `value` |
 | `<=`     | The `field` value is less than or equal to the `value`    |
-| `in`     | TThe `field` value is included in a list of values        |
+| `in`     | The `field` value is included in a list of values         |

 === "Single condition"
diff --git a/argilla/mkdocs.yml b/argilla/mkdocs.yml
index 57aef13e62..64d8f08a9c 100644
--- a/argilla/mkdocs.yml
+++ b/argilla/mkdocs.yml
@@ -121,6 +121,7 @@ plugins:
       - docs/scripts/gen_changelog.py
      - docs/scripts/gen_popular_issues.py
      # - docs/scripts/gen_ref_pages.py
+    enabled: !ENV [CI, false] # enables the plugin only during continuous integration (CI); disabled on local builds
   - literate-nav:
       nav_file: SUMMARY.md
   - section-index
@@ -148,7 +149,8 @@ plugins:
       # Signature
       separate_signature: false
       show_signature_annotations: false

-  - social
+  - social:
+      enabled: !ENV [CI, false] # enables the plugin only during continuous integration (CI); disabled on local builds
   - mknotebooks
   - material-plausible
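
A minimal end-to-end sketch of the behavior the two patches above introduce, for reviewers who want to try it locally. The server URL, API key, and Hugging Face repo id below are hypothetical placeholders; the `from_hub`, `rg.Settings`, and `dataset.records(query=...)` calls mirror the ones exercised in the test and docs diffs above.

```python
# Hypothetical usage sketch (placeholder server URL, API key, and repo id);
# assumes a running Argilla server and an existing dataset repo on the Hub.
import argilla as rg

client = rg.Argilla(api_url="http://localhost:6900", api_key="argilla.apikey")  # placeholders

settings = rg.Settings(
    fields=[rg.TextField(name="text")],
    questions=[rg.LabelQuestion(name="label", labels=["positive", "negative"])],
)

# PATCH 1/2: from_hub() now derives a default `name` from the repo id,
# falls back to the first split of a multi-split Hub dataset (with a warning)
# instead of raising, and wraps ingestion failures in SettingsError.
dataset = rg.Dataset.from_hub(
    repo_id="my-org/my-hf-dataset",  # placeholder repo id
    settings=settings,
    client=client,
    with_records=True,
)

# PATCH 2/2: the documented simple query string syntax, here using the
# OR operator from the table above.
queried_records = dataset.records(query="argilla | distilabel").to_list(flatten=True)
```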