From c75a4c446c33f16244cb6346d3001c6c107042c1 Mon Sep 17 00:00:00 2001
From: arakowsk-amd <182798202+arakowsk-amd@users.noreply.github.com>
Date: Tue, 17 Dec 2024 13:47:31 -0800
Subject: [PATCH] removing --enable-chunked-prefill

---
 docs/dev-docker/README.md | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/docs/dev-docker/README.md b/docs/dev-docker/README.md
index b3a295e914f3a..d8e1c9f68a07b 100644
--- a/docs/dev-docker/README.md
+++ b/docs/dev-docker/README.md
@@ -261,8 +261,7 @@ Benchmark Meta-Llama-3.1-405B FP8 with input 128 tokens, output 128 tokens and t
     --num-scheduler-steps 10 \
     --tensor-parallel-size 8 \
     --input-len 128 \
-    --output-len 128 \
-    --enable-chunked-prefill false
+    --output-len 128
 
 If you want to run Meta-Llama-3.1-405B FP16, please run
 
@@ -278,8 +277,7 @@ If you want to run Meta-Llama-3.1-405B FP16, please run
     --output-len 128 \
     --swap-space 16 \
     --max-model-len 8192 \
-    --max-num-batched-tokens 65536 \
-    --enable-chunked-prefill false
+    --max-num-batched-tokens 65536
 
 For fp8 quantized Llama3.18B/70B models:
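
After this patch, the README's FP8 latency benchmark command reduces to something like the sketch below. The hunks only show the trailing flags, so the script path and model argument here are assumptions, not text from the patch: a typical vLLM invocation is assumed, and the model path is a placeholder.

    # Sketch of the resulting command; script path and --model value are
    # placeholders, since only the flags below appear in the patch hunks.
    python3 /app/vllm/benchmarks/benchmark_latency.py \
        --model <path-to-Llama-3.1-405B-FP8> \
        --num-scheduler-steps 10 \
        --tensor-parallel-size 8 \
        --input-len 128 \
        --output-len 128

The FP16 command changes the same way: `--max-num-batched-tokens 65536` becomes the final flag once the trailing `--enable-chunked-prefill false` line is dropped.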