diff --git a/src/lemonade/tools/humaneval.py b/src/lemonade/tools/humaneval.py
index c9433a3..771a38c 100644
--- a/src/lemonade/tools/humaneval.py
+++ b/src/lemonade/tools/humaneval.py
@@ -118,6 +118,7 @@ def run(
             k_samples,
             timeout,
             model_results_dir,
+            state.device,
             first_n_samples,
         )

@@ -153,6 +154,7 @@ def _evaluate_model(
     k_samples: int,
     timeout: float,
     results_dir: str,
+    device: str,
     first_n_samples: Optional[int] = TOTAL_PROBLEMS,
 ) -> Dict[str, float]:
     """
@@ -198,7 +200,9 @@ def _evaluate_model(
         expected = dataset[task_id]["canonical_solution"]

         # Generate completion
-        input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+        input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(
+            device
+        )
         completion = model.generate(
             input_ids,
             max_new_tokens=512,
diff --git a/src/lemonade/tools/mmlu.py b/src/lemonade/tools/mmlu.py
index 33abfcb..d11d327 100644
--- a/src/lemonade/tools/mmlu.py
+++ b/src/lemonade/tools/mmlu.py
@@ -150,7 +150,9 @@ def run(
             prompt = _gen_prompt(dev_df, subject, ntrain) + _format_example(
                 test_df, i, include_answer=False
             )
-            input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+            input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(
+                state.device
+            )
            response_text = _generate_response(tokenizer, model, input_ids)

            try:
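
For context, both hunks apply the same fix: the tokenized `input_ids` are moved onto the model's device before calling `generate`, so CPU-resident input tensors no longer trigger a device-mismatch error when the model lives on an accelerator. A minimal sketch of that pattern is below; the model name and `device` value are placeholders for illustration and are not part of this PR.

```python
# Minimal sketch of the device-placement pattern used in this diff,
# assuming a Hugging Face causal LM and a device string like "cpu" or "cuda".
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cuda"      # assumption: whatever device the model was loaded onto
model_name = "gpt2"  # placeholder model, not part of this PR

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)

prompt = "def add(a, b):"
# Tokenization produces CPU tensors; move them to the model's device,
# otherwise model.generate() fails with a device-mismatch runtime error.
input_ids = tokenizer(prompt, return_tensors="pt").input_ids.to(device)

completion = model.generate(input_ids, max_new_tokens=64)
print(tokenizer.decode(completion[0], skip_special_tokens=True))
```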