diff --git a/benchmarks/profiling/benchmark_latency.py b/benchmarks/profiling/benchmark_latency.py
index 73366f22f8de8..b1d1c5602d98d 100644
--- a/benchmarks/profiling/benchmark_latency.py
+++ b/benchmarks/profiling/benchmark_latency.py
@@ -81,6 +81,7 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
         top_p=1.0,
         ignore_eos=True,
         max_tokens=args.output_len,
+        detokenize=False,
     )
     print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py
index 3bbdd7d4267ae..22a4b1028f778 100644
--- a/benchmarks/profiling/benchmark_throughput.py
+++ b/benchmarks/profiling/benchmark_throughput.py
@@ -135,6 +135,7 @@ def get_profiling_context(profile_dir: Optional[str] = None):
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=False,
             ))
 
     if args.profile_torch or args.profile_rpd:
@@ -171,6 +172,7 @@ async def run_vllm_async(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=False,
             ))
 
     generators = []
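
For context: `detokenize=False` tells vLLM's `SamplingParams` to skip the incremental token-to-text decoding step, so the profiled runs measure model execution rather than tokenizer overhead. A minimal standalone sketch of the effect, assuming a local vLLM install (the model name is an arbitrary placeholder, not part of this change):

```python
from vllm import LLM, SamplingParams

llm = LLM(model="facebook/opt-125m")  # placeholder model for illustration
params = SamplingParams(
    temperature=1.0,
    top_p=1.0,
    ignore_eos=True,
    max_tokens=32,
    detokenize=False,  # skip incremental token -> text decoding
)

out = llm.generate(["Hello"], params)[0].outputs[0]
print(out.token_ids)   # token IDs are still returned
print(repr(out.text))  # empty string: no text is produced when detokenize=False
```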