Skip to content

Commit

Permalink
[ENHANCEMENT] catch record with no known keys and raise detailed error (
Browse files Browse the repository at this point in the history
#5356)

# Description

This PR responds to #5281 by catching records with no known keys and
raising an ingestion error.

**Type of change**

- Improvement (change adding some improvement to an existing
functionality)

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
burtenshaw and pre-commit-ci[bot] authored Aug 1, 2024
1 parent 56d828b commit 5c12e32
Show file tree
Hide file tree
Showing 4 changed files with 47 additions and 1 deletion.
2 changes: 1 addition & 1 deletion argilla/pdm.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

19 changes: 19 additions & 0 deletions argilla/src/argilla/records/_mapping/_mapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
import re
from typing import TYPE_CHECKING, Any, Dict, List, Optional, Sequence, Union, Tuple
from uuid import UUID
import warnings

from argilla._exceptions import RecordsIngestionError
from argilla.records._resource import Record
Expand Down Expand Up @@ -83,6 +84,24 @@ def __call__(self, data: Dict[str, Any], user_id: Optional[UUID] = None) -> Reco
metadata = self._map_attributes(data=data, mapping=self.mapping.metadata)
vectors = self._map_attributes(data=data, mapping=self.mapping.vector)

unknown_keys = [key for key in data.keys() if key not in self.mapping.keys()]
if unknown_keys:
warnings.warn(f"Keys {unknown_keys} in data are not present in the mapping and will be ignored.")

if len([k for k in data if k != self.mapping.id.source]) == 0:
raise RecordsIngestionError(
message=f"Record has no data. All records must have at least one attribute. Record id: {record_id}."
)

if data and not (record_id or suggestions or responses or fields or metadata or vectors):
raise RecordsIngestionError(
message=f"""Record has no identifiable keys. If keys in source dataset
do not match the names in `dataset.settings`, you should use a
`mapping` with `dataset.records.log`.
Available keys: {self.mapping.keys()}.
Unkown keys: {unknown_keys}. """
)

return Record(
id=record_id,
fields=fields,
Expand Down
11 changes: 11 additions & 0 deletions argilla/src/argilla/records/_mapping/_routes.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,3 +111,14 @@ def add_route(self, attribute_route: AttributeRoute) -> None:
self.id = attribute_route
else:
self._get_routes_group_by_type(attribute_route.type)[attribute_route.name] = attribute_route

def keys(self) -> List[str]:
"""Utility method to get all the keys in the mapping"""
return (
list(self.suggestion.keys())
+ list(self.response.keys())
+ list(self.field.keys())
+ list(self.metadata.keys())
+ list(self.vector.keys())
+ ["id"]
)
16 changes: 16 additions & 0 deletions argilla/tests/unit/test_record_ingestion.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pytest

import argilla as rg
from argilla._exceptions import RecordsIngestionError


@pytest.fixture
Expand Down Expand Up @@ -50,6 +51,21 @@ def test_ingest_record_from_dict(dataset):
assert record.suggestions[0].value == "positive"


def test_ingest_record_from_empty_dict_raises(dataset):
with pytest.raises(RecordsIngestionError):
dataset.records._ingest_records(
records=[
{"id": "record_id"},
],
)
with pytest.raises(RecordsIngestionError):
dataset.records._ingest_records(
records=[
{},
],
)


def test_ingest_record_from_dict_with_mapped_suggestions(dataset):
mock_mapping = {
"my_prompt": "prompt",
Expand Down

0 comments on commit 5c12e32

Please sign in to comment.