Skip to content

Commit

Permalink
Merge branch 'feat/hub-integration' into docs/guide-for-exporting
Browse files (browse the repository at this point in the history)
  • Loading branch information
burtenshaw committed Jul 11, 2024
2 parents a9f21a1 + 54afe17 commit a310ba6
Showing 1 changed file with 9 additions and 7 deletions.
16 changes: 9 additions & 7 deletions argilla/src/argilla/datasets/_export/_hub.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -160,34 +160,36 @@ def _log_dataset_records(hf_dataset: "HFDataset", dataset: "Dataset"):
response_questions[question_name]["status"] = hf_dataset[col]

# Check if all user ids are known to this Argilla client
known_users_ids = [user.id for user in dataset._client.users._api.list()]
known_users_ids = [user.id for user in dataset._client.users]
unknown_user_ids = set(user_ids.keys()) - set(known_users_ids)
my_user = dataset._client.me.id
if len(unknown_user_ids) > 1:
warnings.warn(
message=f"""Found unknown user ids in dataset repo: {unknown_user_ids}.
Assigning first response for each record to current user ({dataset._client.me.username}) and discarding the rest."""
Assigning first response for each record to current user ({my_user.username}) and discarding the rest."""
)
for unknown_user_id in unknown_user_ids:
user_ids[unknown_user_id] = dataset._client.me.id
user_ids[unknown_user_id] = my_user.id

# Create a mapper to map the Hugging Face dataset to a Record object
mapping = {col: col for col in hf_dataset.column_names if ".suggestion" in col}
mapper = IngestedRecordMapper(dataset=dataset, mapping=mapping, user_id=dataset._client.me.id)
mapper = IngestedRecordMapper(dataset=dataset, mapping=mapping, user_id=my_user.id)

# Extract responses and create Record objects
records = []
for idx, row in enumerate(hf_dataset):
record = mapper(row)
record.id = row.pop("id")
for question_name, values in response_questions.items():
_user_ids = user_ids.copy()
response_users = {}
response_values = values["responses"][idx]
response_users = values["users"][idx]
response_status = values["status"][idx]
for value, user_id, status in zip(response_values, response_users, response_status):
user_id = _user_ids.pop(UUID(user_id), None)
if user_id is None:
user_id = user_ids[UUID(user_id)]
if user_id in response_users:
continue
response_users[user_id] = True
response = Response(
user_id=user_id,
question_name=question_name,
Expand Down

0 comments on commit a310ba6

Please sign in to comment.