From 87d256adacaf4f3cf4fc17687b77825eec6c8588 Mon Sep 17 00:00:00 2001
From: maleksan85
Date: Fri, 17 Jan 2025 17:04:06 -0600
Subject: [PATCH] Set detokenize=False in profiling benchmarks

Disable detokenization in the profiling benchmarks so the measured time
reflects token generation rather than converting output token ids back
to text.
---
 benchmarks/profiling/benchmark_latency.py    | 1 +
 benchmarks/profiling/benchmark_throughput.py | 2 ++
 2 files changed, 3 insertions(+)

diff --git a/benchmarks/profiling/benchmark_latency.py b/benchmarks/profiling/benchmark_latency.py
index 73366f22f8de8..b1d1c5602d98d 100644
--- a/benchmarks/profiling/benchmark_latency.py
+++ b/benchmarks/profiling/benchmark_latency.py
@@ -81,6 +81,7 @@ def get_profiling_context(profile_result_dir: Optional[str] = None):
         top_p=1.0,
         ignore_eos=True,
         max_tokens=args.output_len,
+        detokenize=False,
     )
     print(sampling_params)
     dummy_prompt_token_ids = np.random.randint(10000,
diff --git a/benchmarks/profiling/benchmark_throughput.py b/benchmarks/profiling/benchmark_throughput.py
index 3bbdd7d4267ae..22a4b1028f778 100644
--- a/benchmarks/profiling/benchmark_throughput.py
+++ b/benchmarks/profiling/benchmark_throughput.py
@@ -135,6 +135,7 @@ def get_profiling_context(profile_dir: Optional[str] = None):
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=False,
             ))
 
     if args.profile_torch or args.profile_rpd:
@@ -171,6 +172,7 @@ async def run_vllm_async(
                 top_p=1.0,
                 ignore_eos=True,
                 max_tokens=output_len,
+                detokenize=False,
             ))
 
     generators = []
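
For reference, a minimal standalone sketch of the flag this patch enables,
assuming the public vLLM API; the model name and prompt below are
illustrative placeholders, not taken from the patch:

    # Sketch only: "facebook/opt-125m" is an arbitrary placeholder model.
    from vllm import LLM, SamplingParams

    llm = LLM(model="facebook/opt-125m")
    params = SamplingParams(
        temperature=1.0,
        top_p=1.0,
        ignore_eos=True,
        max_tokens=128,
        detokenize=False,  # skip converting output token ids back to text
    )
    out = llm.generate(["Hello, world"], params)[0].outputs[0]
    # Token ids are still produced; the text field is left empty.
    print(out.token_ids)

Skipping detokenization keeps the tokenizer's CPU-side work out of the
timed region, which matters when the benchmarks are used to profile GPU
kernel performance.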