diff --git a/src/lemonade/tools/humaneval.py b/src/lemonade/tools/humaneval.py
index c9433a3..771a38c 100644
--- a/src/lemonade/tools/humaneval.py
+++ b/src/lemonade/tools/humaneval.py
@@ -118,6 +118,7 @@ def run(
             k_samples,
             timeout,
             model_results_dir,
+            state.device,
             first_n_samples,
         )

@@ -153,6 +154,7 @@ def _evaluate_model(
     k_samples: int,
     timeout: float,
     results_dir: str,
+    device: str,
     first_n_samples: Optional[int] = TOTAL_PROBLEMS,
 ) -> Dict[str, float]:
     """
@@ -198,7 +200,9 @@ def _evaluate_model(
         expected = dataset[task_id]["canonical_solution"]

         # Generate completion
-        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(
+            device
+        )
         completion = model.generate(
             input_ids,
             max_new_tokens=512,
diff --git a/src/lemonade/tools/mmlu.py b/src/lemonade/tools/mmlu.py
index 33abfcb..d11d327 100644
--- a/src/lemonade/tools/mmlu.py
+++ b/src/lemonade/tools/mmlu.py
@@ -150,7 +150,9 @@ def run(
             prompt = _gen_prompt(dev_df, subject, ntrain) + _format_example(
                 test_df, i, include_answer=False
             )
-            input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(
+                state.device
+            )
            response_text = _generate_response(tokenizer, model, input_ids)

            try:
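
For context, both hunks apply the same fix: the tokenized `input_ids` are moved onto the model's device before calling `generate`, so CPU-resident input tensors no longer trigger a device-mismatch error when the model lives on an accelerator. A minimal sketch of that pattern is below; the model name and `device` value are placeholders for illustration and are not part of this PR.

```python
# Minimal sketch of the device-placement pattern used in this diff,
# assuming a Hugging Face causal LM and a device string like "cpu" or "cuda".
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"      # assumption: whatever device the model was loaded onto
model_name = "gpt2"  # placeholder model, not part of this PR

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

prompt = "def add(a, b):"
# Tokenization produces CPU tensors; move them to the model's device,
# otherwise model.generate() fails with a device-mismatch runtime error.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

completion = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(completion[0], skip_special_tokens=True))
```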