Merge branch 'habana_main' into dev/mfylcek/sampler-aware_batch_size_padding

mfylcek authored Jan 21, 2025
2 parents 3696791 + fedf706, commit e61dd86
Showing 199 changed files with 6,623 additions and 4,067 deletions.
107 changes: 107 additions & 0 deletions .buildkite/nightly-benchmarks/scripts/run-nightly-benchmarks.sh
@@ -301,6 +301,104 @@ run_serving_tests() {
     kill_gpu_processes
 }
 
+run_genai_perf_tests() {
+    # run genai-perf tests
+
+    # $1: a JSON file specifying genai-perf test cases
+    local genai_perf_test_file
+    genai_perf_test_file=$1
+
+    # Iterate over genai-perf tests
+    jq -c '.[]' "$genai_perf_test_file" | while read -r params; do
+        # get the test name, and append the GPU type back to it.
+        test_name=$(echo "$params" | jq -r '.test_name')
+
+        # if TEST_SELECTOR is set, only run the test cases that match the selector
+        if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+            echo "Skipping test case $test_name."
+            continue
+        fi
+
+        # prepend the current serving engine to the test name
+        test_name=${CURRENT_LLM_SERVING_ENGINE}_${test_name}
+
+        # get common parameters
+        common_params=$(echo "$params" | jq -r '.common_parameters')
+        model=$(echo "$common_params" | jq -r '.model')
+        tp=$(echo "$common_params" | jq -r '.tp')
+        dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+        dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+        port=$(echo "$common_params" | jq -r '.port')
+        num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+        reuse_server=$(echo "$common_params" | jq -r '.reuse_server')
+
+        # get client and server arguments
+        server_params=$(echo "$params" | jq -r ".${CURRENT_LLM_SERVING_ENGINE}_server_parameters")
+        qps_list=$(echo "$params" | jq -r '.qps_list')
+        qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+        echo "Running over qps list $qps_list"
+
+        # check that there are enough GPUs to run the test
+        if [[ $gpu_count -lt $tp ]]; then
+            echo "Test requires $tp GPUs but only $gpu_count found. Skipping test case $test_name."
+            continue
+        fi
+
+        if [[ $reuse_server == "true" ]]; then
+            echo "Reusing previous server for test case $test_name"
+        else
+            kill_gpu_processes
+            bash "$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/launch-server.sh" \
+                "$server_params" "$common_params"
+        fi
+
+        if wait_for_server; then
+            echo ""
+            echo "$CURRENT_LLM_SERVING_ENGINE server is up and running."
+        else
+            echo ""
+            echo "$CURRENT_LLM_SERVING_ENGINE failed to start within the timeout period."
+            break
+        fi
+
+        # iterate over different QPS
+        for qps in $qps_list; do
+            # map an "inf" request rate to num_prompts so genai-perf receives
+            # a finite --request-rate (jq's @sh leaves single quotes around the
+            # string "inf", hence the glob match)
+            if [[ "$qps" == *"inf"* ]]; then
+                echo "qps was $qps"
+                qps=$num_prompts
+                echo "now qps is $qps"
+            fi
+
+            new_test_name="${test_name}_qps_${qps}"
+            backend=$CURRENT_LLM_SERVING_ENGINE
+
+            # genai-perf expects the plain name "vllm" even when the serving
+            # engine label carries a suffix
+            if [[ "$backend" == *"vllm"* ]]; then
+                backend="vllm"
+            fi
+            # TODO: add output dir.
+            client_command="genai-perf profile \
+                -m $model \
+                --service-kind openai \
+                --backend $backend \
+                --endpoint-type chat \
+                --streaming \
+                --url localhost:$port \
+                --request-rate $qps \
+                --num-prompts $num_prompts \
+                "
+
+            echo "Client command: $client_command"
+
+            eval "$client_command"
+
+            # TODO: process/record outputs
+        done
+    done
+
+    kill_gpu_processes
+
+}
 
 prepare_dataset() {
 
@@ -328,12 +426,17 @@ main() {

     pip install -U transformers
 
+    pip install -r requirements-dev.txt
+    which genai-perf
+
     # check storage
     df -h
 
     ensure_installed wget
     ensure_installed curl
     ensure_installed jq
+    # genai-perf dependency
+    ensure_installed libb64-0d
 
     prepare_dataset

@@ -345,6 +448,10 @@
     # run the test
     run_serving_tests "$BENCHMARK_ROOT/tests/nightly-tests.json"
 
+    # run genai-perf tests
+    run_genai_perf_tests "$BENCHMARK_ROOT/tests/genai-perf-tests.json"
+    mv artifacts/ "$RESULTS_FOLDER/"
+
     # upload benchmark results to buildkite
     python3 -m pip install tabulate pandas
     python3 "$BENCHMARK_ROOT/scripts/summary-nightly-results.py"
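For reference, the client command assembled above expands to roughly the following for the llama8B test case defined in genai-perf-tests.json below (a sketch for qps=4; all flags come from the script, all values from the JSON):

```bash
genai-perf profile \
    -m meta-llama/Meta-Llama-3-8B-Instruct \
    --service-kind openai \
    --backend vllm \
    --endpoint-type chat \
    --streaming \
    --url localhost:8000 \
    --request-rate 4 \
    --num-prompts 500
```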
23 changes: 23 additions & 0 deletions .buildkite/nightly-benchmarks/tests/genai-perf-tests.json
@@ -0,0 +1,23 @@
+[
+    {
+        "test_name": "llama8B_tp1_genai_perf",
+        "qps_list": [4, 8, 16, 32],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "tp": 1,
+            "port": 8000,
+            "num_prompts": 500,
+            "reuse_server": false
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": "",
+            "gpu_memory_utilization": 0.9,
+            "num_scheduler_steps": 10,
+            "max_num_seqs": 512,
+            "dtype": "bfloat16"
+        },
+        "genai_perf_input_parameters": {
+        }
+    }
+]
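As a sanity check of how run_genai_perf_tests consumes this file, the same jq pipeline can be exercised by hand (hypothetical session; note that jq's @sh filter leaves numbers bare but single-quotes strings, which is why the script matches "inf" with a glob):

```bash
params=$(jq -c '.[0]' genai-perf-tests.json)

echo "$params" | jq -r '.test_name'                # llama8B_tp1_genai_perf
echo "$params" | jq -r '.common_parameters.model'  # meta-llama/Meta-Llama-3-8B-Instruct
echo "$params" | jq -r '.qps_list | .[] | @sh'     # 4, 8, 16, 32 (one per line)
```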
4 changes: 2 additions & 2 deletions .buildkite/run-cpu-test.sh
@@ -83,6 +83,6 @@ function cpu_tests() {
     tests/lora/test_qwen2vl.py"
 }
 
-# All of CPU tests are expected to be finished less than 25 mins.
+# All CPU tests are expected to finish within 40 mins.
 export -f cpu_tests
-timeout 30m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
+timeout 40m bash -c "cpu_tests $CORE_RANGE $NUMA_NODE"
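A side note on the pattern above: `timeout` runs its command in a child process, so a plain shell function would not be defined there; `export -f` is what copies `cpu_tests` into the child's environment. A minimal standalone illustration (hypothetical function name):

```bash
greet() { echo "hello from $1"; }
export -f greet                            # without this, the child bash cannot see greet
timeout 5s bash -c 'greet "child shell"'
```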
9 changes: 8 additions & 1 deletion .buildkite/test-pipeline.yaml
@@ -107,7 +107,7 @@ steps:
   source_file_dependencies:
   - vllm/
   commands:
-  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py
+  - pytest -v -s entrypoints/llm --ignore=entrypoints/llm/test_lazy_outlines.py --ignore=entrypoints/llm/test_generate.py --ignore=entrypoints/llm/test_generate_multiple_loras.py --ignore=entrypoints/llm/test_guided_generate.py --ignore=entrypoints/llm/test_collective_rpc.py
   - pytest -v -s entrypoints/llm/test_lazy_outlines.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate.py # it needs a clean process
   - pytest -v -s entrypoints/llm/test_generate_multiple_loras.py # it needs a clean process
@@ -126,11 +126,15 @@ steps:
   - tests/distributed
   - tests/spec_decode/e2e/test_integration_dist_tp4
   - tests/compile
+  - examples/offline_inference/rlhf.py
   commands:
   - pytest -v -s distributed/test_utils.py
   - pytest -v -s compile/test_basic_correctness.py
   - pytest -v -s distributed/test_pynccl.py
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp4.py
+  # TODO: create a dedicated test section for multi-GPU example tests
+  # when we have multiple distributed example tests
+  - python3 ../examples/offline_inference/rlhf.py
 
 - label: Metrics, Tracing Test # 10min
   num_gpus: 2
@@ -462,7 +466,10 @@
   - vllm/worker/worker_base.py
   - vllm/worker/worker.py
   - vllm/worker/model_runner.py
+  - entrypoints/llm/test_collective_rpc.py
   commands:
+  - pytest -v -s entrypoints/llm/test_collective_rpc.py
+  - torchrun --nproc-per-node=2 distributed/test_torchrun_example.py
   - pytest -v -s ./compile/test_basic_correctness.py
   - pytest -v -s ./compile/test_wrapper.py
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep 'Same node test passed'
1 change: 1 addition & 0 deletions .jenkins/lm-eval-harness/run-tests.sh
@@ -42,6 +42,7 @@
   export LM_EVAL_TP_SIZE=$TP_SIZE
   export PT_HPU_ENABLE_LAZY_COLLECTIVES=true
   export VLLM_SKIP_WARMUP=true
+  export TQDM_BAR_FORMAT="{desc}: {percentage:3.0f}% {bar:10} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
   RANDOM_SUFFIX=$(tr -dc A-Za-z0-9 </dev/urandom | head -c 4; echo)
   JUNIT_FAMILY=""
   JUNIT_XML=""
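The exported format presumably takes effect because recent tqdm releases (4.66 and later) read TQDM_* environment variables as parameter defaults, so every progress bar in the run is rendered with a compact 10-character bar and no rate field. A quick way to preview it (assuming tqdm >= 4.66 is installed):

```bash
export TQDM_BAR_FORMAT="{desc}: {percentage:3.0f}% {bar:10} | {n_fmt}/{total_fmt} [{elapsed}<{remaining}]"
python3 -c 'from tqdm import tqdm
for _ in tqdm(range(100), desc="demo"): pass'
```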
10 changes: 7 additions & 3 deletions .jenkins/lm-eval-harness/test_lm_eval_correctness.py
@@ -24,6 +24,9 @@
"LM_EVAL_TEST_DATA_FILE",
".jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml")

REPORT_PERFORMANCE = os.environ.get("LM_EVAL_REPORT_PERFORMANCE",
"false") in ['1', 'true']

TP_SIZE = os.environ.get("LM_EVAL_TP_SIZE", 1)


@@ -170,9 +173,10 @@ def test_lm_eval_correctness(record_xml_attribute, record_property):
                 x['resps'])))['input_ids'])) for x in samples
         ]
         tokenized_outputs_lens = [len(x) for x in tokenized_outputs]
-        report_performance(task['name'], tokenized_inputs_lens,
-                           tokenized_outputs_lens, total_time,
-                           record_property)
+        if REPORT_PERFORMANCE:
+            report_performance(task['name'], tokenized_inputs_lens,
+                               tokenized_outputs_lens, total_time,
+                               record_property)
 
     for metric in task["metrics"]:
         ground_truth = metric["value"]
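With this gate, performance reporting becomes opt-in. Note that the check accepts only the exact strings '1' and 'true'; values such as 'True' or 'yes' fall through to false. A hypothetical opt-in invocation:

```bash
LM_EVAL_REPORT_PERFORMANCE=true \
LM_EVAL_TEST_DATA_FILE=.jenkins/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct.yaml \
pytest -s .jenkins/lm-eval-harness/test_lm_eval_correctness.py
```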
6 changes: 3 additions & 3 deletions Dockerfile.cpu
@@ -26,20 +26,20 @@ RUN pip install intel_extension_for_pytorch==2.5.0

 WORKDIR /workspace
 
-COPY requirements-build.txt requirements-build.txt
 ARG PIP_EXTRA_INDEX_URL="https://download.pytorch.org/whl/cpu"
 ENV PIP_EXTRA_INDEX_URL=${PIP_EXTRA_INDEX_URL}
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-build.txt,target=requirements-build.txt \
     pip install --upgrade pip && \
     pip install -r requirements-build.txt
 
 FROM cpu-test-1 AS build
 
 WORKDIR /workspace/vllm
 
-COPY requirements-common.txt requirements-common.txt
-COPY requirements-cpu.txt requirements-cpu.txt
 RUN --mount=type=cache,target=/root/.cache/pip \
+    --mount=type=bind,src=requirements-common.txt,target=requirements-common.txt \
+    --mount=type=bind,src=requirements-cpu.txt,target=requirements-cpu.txt \
     pip install -v -r requirements-cpu.txt
 
 COPY . .
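The COPY-to-bind-mount switch keeps the requirements files out of the image layers entirely: they are visible only for the duration of each RUN step, and the pip cache mount persists across builds. BuildKit is required for --mount; a sketch of a rebuild that should hit the cache (hypothetical tag):

```bash
DOCKER_BUILDKIT=1 docker build -f Dockerfile.cpu -t vllm-cpu:dev .
# edit a source file (but not the requirements), then rebuild:
DOCKER_BUILDKIT=1 docker build -f Dockerfile.cpu -t vllm-cpu:dev .   # pip cache is reused
```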
2 changes: 2 additions & 0 deletions README.md
@@ -14,6 +14,8 @@ Easy, fast, and cheap LLM serving for everyone
 </p>
 
 ---
+> [!NOTE]
+> For Intel Gaudi specific setup instructions and examples, please refer to the [Intel® Gaudi® README](https://github.com/HabanaAI/vllm-fork/blob/habana_main/README_GAUDI.md). For Jupyter-notebook-based quickstart tutorials, refer to [Getting Started with vLLM](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/Getting_Started_with_vLLM/Getting_Started_with_vLLM.ipynb) and [Understanding vLLM on Gaudi](https://github.com/HabanaAI/Gaudi-tutorials/blob/main/PyTorch/Understanding_vLLM_on_Gaudi/Understanding_vLLM_on_Gaudi.ipynb).
 The first vLLM meetup in 2025 is happening on January 22nd, Wednesday, with Google Cloud in San Francisco! We will talk about vLLM's performant V1 architecture, Q1 roadmap, Google Cloud's innovation around vLLM: networking, Cloud Run, Vertex, and TPU! [Register Now](https://lu.ma/zep56hui)
 