From 87d256adacaf4f3cf4fc17687b77825eec6c8588 Mon Sep 17 00:00:00 2001
From: maleksan85
Date: Fri, 17 Jan 2025 17:04:06 -0600
Subject: [PATCH] Set detokenize=False in profiling benchmarks

Disable detokenization in the profiling benchmarks so the measured time
reflects token generation rather than converting output token ids back
to text.
---
 benchmarks/profiling/benchmark_latency.py    | 1 +
 benchmarks/profiling/benchmark_throughput.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/benchmarks/profiling/benchmark_latency.py b/benchmarks/profiling/benchmark_latency.py
index 73366f22f8de8..b1d1c5602d98d 100644
--- a/benchmarks/profiling/benchmark_latency.py
+++ b/benchmarks/profiling/benchmark_latency.py
@@ -81,6 +81,7 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
         top_p=1.0,
         ignore_eos=True,
         max_tokens=args.output_len,
+        detokenize=False,
     )
     print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py
index 3bbdd7d4267ae..22a4b1028f778 100644
--- a/benchmarks/profiling/benchmark_throughput.py
+++ b/benchmarks/profiling/benchmark_throughput.py
@@ -135,6 +135,7 @@ def get_profiling_context(profile_dir: Optional[str] = None):
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=False,
             ))
 
     if args.profile_torch or args.profile_rpd:
@@ -171,6 +172,7 @@ async def run_vllm_async(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=False,
             ))
 
     generators = []
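
For reference, a minimal standalone sketch of the flag this patch enables,
assuming the public vLLM API; the model name and prompt below are
illustrative placeholders, not taken from the patch:

    # Sketch only: "facebook/opt-125m" is an arbitrary placeholder model.
    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    params = SamplingParams(
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=128,
        detokenize=False,  # skip converting output token ids back to text
    )
    out = llm.generate(["Hello, world"], params)[0].outputs[0]
    # Token ids are still produced; the text field is left empty.
    print(out.token_ids)

Skipping detokenization keeps the tokenizer's CPU-side work out of the
timed region, which matters when the benchmarks are used to profile GPU
kernel performance.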