
Commit 0a0f99c
add readme updates
davidberenstein1957 committed Dec 3, 2024
1 parent bed8333 commit 0a0f99c
Showing 4 changed files with 36 additions and 19 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -20,12 +20,12 @@ hf_oauth_scopes:

<h1 align="center">
<br>
- 🧬 Synthetic Data Generator
+ Synthetic Data Generator
<br>
</h1>
<h3 align="center">Build datasets using natural language</h3>

- ![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui.png)
+ ![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui-full.png)

<p align="center">
<a href="https://pypi.org/project/synthetic-dataset-generator/">
Binary file added assets/ui-full.png
Binary file modified assets/ui.png
51 changes: 34 additions & 17 deletions src/distilabel_dataset_generator/pipelines/eval.py
@@ -1,10 +1,8 @@
- from typing import List
-
from datasets import get_dataset_config_names, get_dataset_split_names
from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps.tasks import (
- UltraFeedback,
TextGeneration,
+ UltraFeedback,
)

from src.distilabel_dataset_generator.pipelines.base import (
@@ -21,7 +19,7 @@ def get_ultrafeedback_evaluator(aspect, is_sample):
tokenizer_id=MODEL,
api_key=_get_next_api_key(),
generation_kwargs={
"temperature": 0.7,
"temperature": 0,
"max_new_tokens": 256 if is_sample else 2048,
},
),
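
With `temperature` at 0 the judge decodes greedily, so repeated runs over the same rows should give stable ratings. Below is a minimal sketch of exercising the evaluator returned above, assuming the helper returns a loaded task (as `get_custom_evaluator` does) and using made-up instruction/response values; running it requires valid Hugging Face Inference Endpoints credentials:

```python
# Sketch only: the aspect, instruction, and generations below are illustrative.
evaluator = get_ultrafeedback_evaluator(aspect="helpfulness", is_sample=True)

# distilabel tasks can be driven standalone: process() takes a batch of dicts
# and yields output batches.
batch = next(
    evaluator.process(
        [
            {
                "instruction": "Name the capital of France.",
                "generations": ["Paris.", "It might be Lyon."],
            }
        ]
    )
)
print(batch[0])  # includes the UltraFeedback ratings and rationales
```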
@@ -39,12 +37,12 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample)
api_key=_get_next_api_key(),
structured_output={"format": "json", "schema": structured_output},
generation_kwargs={
"temperature": 0.7,
"temperature": 0,
"max_new_tokens": 256 if is_sample else 2048,
},
),
template=prompt_template,
- columns=columns
+ columns=columns,
)
custom_evaluator.load()
return custom_evaluator
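
Because the custom path pins `structured_output={"format": "json", "schema": structured_output}`, the judge's reply is constrained to the caller's JSON schema. A minimal usage sketch, with a hypothetical template, schema, and column list (none of these values appear in the repository):

```python
# Sketch only: template, schema, and columns are illustrative.
schema = {
    "type": "object",
    "properties": {
        "score": {"type": "integer"},
        "reason": {"type": "string"},
    },
    "required": ["score", "reason"],
}

evaluator = get_custom_evaluator(
    prompt_template="Rate this answer to '{{ instruction }}': {{ response }}",
    structured_output=schema,
    columns=["instruction", "response"],
    is_sample=True,
)

batch = next(
    evaluator.process(
        [{"instruction": "Define entropy.", "response": "A measure of disorder."}]
    )
)
print(batch[0]["generation"])  # a JSON string conforming to the schema
```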
@@ -81,13 +79,13 @@ def generate_ultrafeedback_pipeline_code(
tokenizer_id=MODEL,
api_key=os.environ["HF_TOKEN"],
generation_kwargs={{
"temperature": 0.7,
"temperature": 0,
"max_new_tokens": 2048,
}},
),
aspect=aspect,
)
load_the_dataset >> ultrafeedback_evaluator
if __name__ == "__main__":
@@ -113,7 +111,7 @@ def generate_ultrafeedback_pipeline_code(
load_the_dataset = LoadDataFromDicts(
data = data,
)
tasks = []
for aspect in aspects:
evaluate_responses = UltraFeedback(
@@ -124,7 +122,7 @@ def generate_ultrafeedback_pipeline_code(
tokenizer_id=MODEL,
api_key=os.environ["HF_TOKEN"],
generation_kwargs={{
"temperature": 0.7,
"temperature": 0,
"max_new_tokens": 2048,
}},
output_mappings={{
@@ -135,9 +133,9 @@ def generate_ultrafeedback_pipeline_code(
}} if aspect in ["truthfulness", "helpfulness"] else {{"rationales": f"rationales_{{aspect}}", "ratings": f"ratings_{{aspect}}"}},
)
tasks.append(evaluate_responses)
combine_outputs = CombineOutputs()
load_the_dataset >> tasks >> combine_outputs
if __name__ == "__main__":
@@ -177,14 +175,14 @@ def generate_custom_pipeline_code(
api_key=os.environ["HF_TOKEN"],
structured_output={{"format": "json", "schema": {structured_output}}},
generation_kwargs={{
"temperature": 0.7,
"temperature": 0,
"max_new_tokens": 2048,
}},
),
template=CUSTOM_TEMPLATE,
columns={columns}
)
load_the_dataset >> custom_evaluator
if __name__ == "__main__":
@@ -193,13 +191,32 @@ def generate_custom_pipeline_code(
return code


- def generate_pipeline_code(repo_id, aspects, instruction_column, response_columns, prompt_template, structured_output, num_rows, eval_type):
+ def generate_pipeline_code(
+ repo_id,
+ aspects,
+ instruction_column,
+ response_columns,
+ prompt_template,
+ structured_output,
+ num_rows,
+ eval_type,
+ ):
if repo_id is None:
subset = "default"
split = "train"
else:
subset = get_dataset_config_names(repo_id)[0]
split = get_dataset_split_names(repo_id, subset)[0]
if eval_type == "ultrafeedback":
- return generate_ultrafeedback_pipeline_code(repo_id, subset, split, aspects, instruction_column, response_columns, num_rows)
- return generate_custom_pipeline_code(repo_id, subset, split, prompt_template, structured_output, num_rows)
+ return generate_ultrafeedback_pipeline_code(
+ repo_id,
+ subset,
+ split,
+ aspects,
+ instruction_column,
+ response_columns,
+ num_rows,
+ )
+ return generate_custom_pipeline_code(
+ repo_id, subset, split, prompt_template, structured_output, num_rows
+ )
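
A minimal sketch of calling the refactored entry point on the UltraFeedback path; every argument value below is illustrative, and note that a non-None `repo_id` triggers real Hub lookups through `get_dataset_config_names` / `get_dataset_split_names`:

```python
# Sketch only: argument values are hypothetical.
code = generate_pipeline_code(
    repo_id=None,  # None short-circuits the Hub lookup to default/train
    aspects=["helpfulness", "truthfulness"],
    instruction_column="instruction",
    response_columns=["response_1", "response_2"],
    prompt_template=None,     # only used on the custom path
    structured_output=None,   # only used on the custom path
    num_rows=100,
    eval_type="ultrafeedback",
)
print(code)  # the generated distilabel pipeline script, as a string
```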
