diff --git a/README.md b/README.md
index 199dfe7..eefe374 100644
--- a/README.md
+++ b/README.md
@@ -20,12 +20,12 @@ hf_oauth_scopes:
 
 
- 🧬 Synthetic Data Generator
+ Synthetic Data Generator
 
 
 Build datasets using natural language
 
 
-![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui.png)
+![Synthetic Data Generator](https://huggingface.co/spaces/argilla/synthetic-data-generator/resolve/main/assets/ui-full.png)
 
diff --git a/assets/ui-full.png b/assets/ui-full.png
new file mode 100644
index 0000000..6f15a59
Binary files /dev/null and b/assets/ui-full.png differ
diff --git a/assets/ui.png b/assets/ui.png
index 6f15a59..20b3a10 100644
Binary files a/assets/ui.png and b/assets/ui.png differ
diff --git a/src/distilabel_dataset_generator/pipelines/eval.py b/src/distilabel_dataset_generator/pipelines/eval.py
index 60ff454..cf1d25b 100644
--- a/src/distilabel_dataset_generator/pipelines/eval.py
+++ b/src/distilabel_dataset_generator/pipelines/eval.py
@@ -1,10 +1,8 @@
-from typing import List
-
 from datasets import get_dataset_config_names, get_dataset_split_names
 from distilabel.llms import InferenceEndpointsLLM
 from distilabel.steps.tasks import (
-    UltraFeedback,
     TextGeneration,
+    UltraFeedback,
 )
 
 from src.distilabel_dataset_generator.pipelines.base import (
@@ -21,7 +19,7 @@ def get_ultrafeedback_evaluator(aspect, is_sample):
             tokenizer_id=MODEL,
             api_key=_get_next_api_key(),
             generation_kwargs={
-                "temperature": 0.7,
+                "temperature": 0,
                 "max_new_tokens": 256 if is_sample else 2048,
             },
         ),
@@ -39,12 +37,12 @@ def get_custom_evaluator(prompt_template, structured_output, columns, is_sample):
             api_key=_get_next_api_key(),
             structured_output={"format": "json", "schema": structured_output},
             generation_kwargs={
-                "temperature": 0.7,
+                "temperature": 0,
                 "max_new_tokens": 256 if is_sample else 2048,
             },
         ),
         template=prompt_template,
-        columns=columns
+        columns=columns,
     )
     custom_evaluator.load()
     return custom_evaluator
@@ -81,13 +79,13 @@ def generate_ultrafeedback_pipeline_code(
             tokenizer_id=MODEL,
             api_key=os.environ["HF_TOKEN"],
             generation_kwargs={{
-                "temperature": 0.7,
+                "temperature": 0,
                 "max_new_tokens": 2048,
             }},
         ),
         aspect=aspect,
     )
-    
+
     load_the_dataset >> ultrafeedback_evaluator
 
 if __name__ == "__main__":
@@ -113,7 +111,7 @@ def generate_ultrafeedback_pipeline_code(
     load_the_dataset = LoadDataFromDicts(
         data = data,
     )
-    
+
     tasks = []
     for aspect in aspects:
         evaluate_responses = UltraFeedback(
@@ -124,7 +122,7 @@ def generate_ultrafeedback_pipeline_code(
             tokenizer_id=MODEL,
             api_key=os.environ["HF_TOKEN"],
             generation_kwargs={{
-                "temperature": 0.7,
+                "temperature": 0,
                 "max_new_tokens": 2048,
             }},
             output_mappings={{
@@ -135,9 +133,9 @@ def generate_ultrafeedback_pipeline_code(
             }} if aspect in ["truthfulness", "helpfulness"] else {{"rationales": f"rationales_{{aspect}}", "ratings": f"ratings_{{aspect}}"}},
         )
         tasks.append(evaluate_responses)
-    
+
     combine_outputs = CombineOutputs()
-    
+
     load_the_dataset >> tasks >> combine_outputs
 
 if __name__ == "__main__":
@@ -177,14 +175,14 @@ def generate_custom_pipeline_code(
             api_key=os.environ["HF_TOKEN"],
             structured_output={{"format": "json", "schema": {structured_output}}},
             generation_kwargs={{
-                "temperature": 0.7,
+                "temperature": 0,
                 "max_new_tokens": 2048,
             }},
         ),
         template=CUSTOM_TEMPLATE,
         columns={columns}
     )
-    
+
     load_the_dataset >> custom_evaluator
 
 if __name__ == "__main__":
@@ -193,7 +191,16 @@ def generate_custom_pipeline_code(
     return code
 
 
-def generate_pipeline_code(repo_id, aspects, instruction_column, response_columns, prompt_template, structured_output, num_rows, eval_type):
+def generate_pipeline_code(
+    repo_id,
+    aspects,
+    instruction_column,
+    response_columns,
+    prompt_template,
+    structured_output,
+    num_rows,
+    eval_type,
+):
     if repo_id is None:
         subset = "default"
         split = "train"
@@ -201,5 +208,15 @@ def generate_pipeline_code(repo_id, aspects, instruction_column, response_column
         subset = get_dataset_config_names(repo_id)[0]
         split = get_dataset_split_names(repo_id, subset)[0]
     if eval_type == "ultrafeedback":
-        return generate_ultrafeedback_pipeline_code(repo_id, subset, split, aspects, instruction_column, response_columns, num_rows)
-    return generate_custom_pipeline_code(repo_id, subset, split, prompt_template, structured_output, num_rows)
+        return generate_ultrafeedback_pipeline_code(
+            repo_id,
+            subset,
+            split,
+            aspects,
+            instruction_column,
+            response_columns,
+            num_rows,
+        )
+    return generate_custom_pipeline_code(
+        repo_id, subset, split, prompt_template, structured_output, num_rows
+    )
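
The one behavioral change in this patch is dropping the sampling temperature from 0.7 to 0 for every evaluator, both in the live steps (`get_ultrafeedback_evaluator`, `get_custom_evaluator`) and in the pipeline code emitted by `generate_ultrafeedback_pipeline_code` / `generate_custom_pipeline_code`, so ratings are greedy-decoded and reproducible across runs. A minimal sketch of the resulting evaluator configuration, assuming `distilabel` is installed; the model id below is a placeholder, since the real `MODEL` constant is imported from `pipelines/base.py`, which this diff does not touch:

```python
import os

from distilabel.llms import InferenceEndpointsLLM
from distilabel.steps.tasks import UltraFeedback

# Placeholder: the actual MODEL constant lives in pipelines/base.py.
MODEL = "meta-llama/Meta-Llama-3.1-8B-Instruct"

evaluator = UltraFeedback(
    llm=InferenceEndpointsLLM(
        model_id=MODEL,
        tokenizer_id=MODEL,
        api_key=os.environ["HF_TOKEN"],
        generation_kwargs={
            # temperature 0 means greedy decoding: repeated runs over the
            # same dataset now yield identical rationales and ratings
            "temperature": 0,
            "max_new_tokens": 2048,
        },
    ),
    aspect="helpfulness",  # one of the aspects this pipeline evaluates
)
evaluator.load()
```

The remaining hunks are cosmetic: imports sorted, trailing whitespace stripped, a trailing comma added, and the long `generate_pipeline_code` signature and call sites wrapped one argument per line; none of this changes behavior.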