diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py index b39dce2659a54..0412c5f37952d 100644 --- a/.buildkite/check-wheel-size.py +++ b/.buildkite/check-wheel-size.py @@ -1,36 +1,43 @@ import os +import sys import zipfile -MAX_SIZE_MB = 250 +# Read the VLLM_MAX_SIZE_MB environment variable, defaulting to 250 MB +VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 250)) def print_top_10_largest_files(zip_file): + """Print the top 10 largest files in the given zip file.""" with zipfile.ZipFile(zip_file, 'r') as z: file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()] file_sizes.sort(key=lambda x: x[1], reverse=True) for f, size in file_sizes[:10]: - print(f"{f}: {size/(1024*1024)} MBs uncompressed.") + print(f"{f}: {size / (1024 * 1024):.2f} MBs uncompressed.") def check_wheel_size(directory): + """Check the size of .whl files in the given directory.""" for root, _, files in os.walk(directory): - for f in files: - if f.endswith(".whl"): - wheel_path = os.path.join(root, f) - wheel_size = os.path.getsize(wheel_path) - wheel_size_mb = wheel_size / (1024 * 1024) - if wheel_size_mb > MAX_SIZE_MB: - print( - f"Wheel {wheel_path} is too large ({wheel_size_mb} MB) " - f"compare to the allowed size ({MAX_SIZE_MB} MB).") + for file_name in files: + if file_name.endswith(".whl"): + wheel_path = os.path.join(root, file_name) + wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024) + if wheel_size_mb > VLLM_MAX_SIZE_MB: + print(f"Not allowed: Wheel {wheel_path} is larger " + f"({wheel_size_mb:.2f} MB) than the limit " + f"({VLLM_MAX_SIZE_MB} MB).") print_top_10_largest_files(wheel_path) return 1 else: print(f"Wheel {wheel_path} is within the allowed size " - f"({wheel_size_mb} MB).") + f"({wheel_size_mb:.2f} MB).") return 0 if __name__ == "__main__": - import sys - sys.exit(check_wheel_size(sys.argv[1])) + if len(sys.argv) < 2: + print("Usage: python check-wheel-size.py ") + sys.exit(1) + + directory = sys.argv[1] + sys.exit(check_wheel_size(directory)) \ No newline at end of file diff --git a/.buildkite/run-amd-test.sh b/.buildkite/run-amd-test.sh old mode 100644 new mode 100755 index 5548071390aff..972c62a091aea --- a/.buildkite/run-amd-test.sh +++ b/.buildkite/run-amd-test.sh @@ -1,5 +1,5 @@ # This script runs test inside the corresponding ROCm docker container. -set -ex +set -o pipefail # Print ROCm version echo "--- Confirming Clean Initial State" @@ -70,16 +70,51 @@ HF_CACHE="$(realpath ~)/huggingface" mkdir -p ${HF_CACHE} HF_MOUNT="/root/.cache/huggingface" -docker run \ +commands=$@ +PARALLEL_JOB_COUNT=8 +# check if the command contains shard flag, we will run all shards in parallel because the host have 8 GPUs. +if [[ $commands == *"--shard-id="* ]]; then + for GPU in $(seq 0 $(($PARALLEL_JOB_COUNT-1))); do + #replace shard arguments + commands=${@//"--shard-id= "/"--shard-id=${GPU} "} + commands=${commands//"--num-shards= "/"--num-shards=${PARALLEL_JOB_COUNT} "} + docker run \ --device /dev/kfd --device /dev/dri \ --network host \ --shm-size=16gb \ --rm \ - -e HIP_VISIBLE_DEVICES=0 \ + -e HIP_VISIBLE_DEVICES=${GPU} \ -e HF_TOKEN \ -v ${HF_CACHE}:${HF_MOUNT} \ -e HF_HOME=${HF_MOUNT} \ - --name ${container_name} \ + --name ${container_name}_${GPU} \ ${image_name} \ - /bin/bash -c "${@}" - + /bin/bash -c "${commands}" \ + |& while read -r line; do echo ">>Shard $GPU: $line"; done & + PIDS+=($!) + done + #wait for all processes to finish and collect exit codes + for pid in ${PIDS[@]}; do + wait ${pid} + STATUS+=($?) 
+ done + for st in ${STATUS[@]}; do + if [[ ${st} -ne 0 ]]; then + echo "One of the processes failed with $st" + exit ${st} + fi + done +else + docker run \ + --device /dev/kfd --device /dev/dri \ + --network host \ + --shm-size=16gb \ + --rm \ + -e HIP_VISIBLE_DEVICES=0 \ + -e HF_TOKEN \ + -v ${HF_CACHE}:${HF_MOUNT} \ + -e HF_HOME=${HF_MOUNT} \ + --name ${container_name} \ + ${image_name} \ + /bin/bash -c "${commands}" +fi diff --git a/.buildkite/run-cpu-test.sh b/.buildkite/run-cpu-test.sh index 8e4be08f3aba0..ca9cf15780e25 100644 --- a/.buildkite/run-cpu-test.sh +++ b/.buildkite/run-cpu-test.sh @@ -23,7 +23,12 @@ docker exec cpu-test-avx2 bash -c "python3 examples/offline_inference.py" # Run basic model test docker exec cpu-test bash -c " pip install pytest matplotlib einops transformers_stream_generator - pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py --ignore=tests/models/test_oot_registration.py --ignore=tests/models/test_registry.py --ignore=tests/models/test_jamba.py --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported + pytest -v -s tests/models -m \"not vlm\" --ignore=tests/models/test_embedding.py \ + --ignore=tests/models/test_oot_registration.py \ + --ignore=tests/models/test_registry.py \ + --ignore=tests/models/test_fp8.py \ + --ignore=tests/models/test_jamba.py \ + --ignore=tests/models/test_danube3_4b.py" # Mamba and Danube3-4B on CPU is not supported # online inference docker exec cpu-test bash -c " diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 86eddb576c42a..d0317b2fc48c9 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -92,6 +92,7 @@ steps: - pytest -v -s entrypoints/openai - pytest -v -s entrypoints/test_chat_utils.py + - label: Distributed Tests (4 GPUs) # 10min working_dir: "/vllm-workspace/tests" num_gpus: 4 @@ -157,6 +158,7 @@ steps: - python3 offline_inference_with_prefix.py - python3 llm_engine_example.py - python3 offline_inference_vision_language.py + - python3 offline_inference_vision_language_multi_image.py - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors - python3 offline_inference_encoder_decoder.py @@ -218,9 +220,9 @@ steps: - pytest -v -s spec_decode - label: LoRA Test %N # 30min each + mirror_hardwares: [amd] source_file_dependencies: - vllm/lora - - csrc/punica - tests/lora command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py parallelism: 4 @@ -271,6 +273,15 @@ steps: - export VLLM_WORKER_MULTIPROC_METHOD=spawn - bash ./run-tests.sh -c configs/models-small.txt -t 1 +- label: OpenAI-Compatible Tool Use # 20 min + fast_check: false + mirror_hardwares: [ amd ] + source_file_dependencies: + - vllm/ + - tests/tool_use + commands: + - pytest -v -s tool_use + ##### 1 GPU test ##### ##### multi gpus test ##### @@ -358,9 +369,9 @@ steps: - label: LoRA Long Context (Distributed) # 11min # This test runs llama 13B, so it is required to run on 4 GPUs. 
num_gpus: 4 + soft_fail: true source_file_dependencies: - vllm/lora - - csrc/punica - tests/lora/test_long_context commands: # FIXIT: find out which code initialize cuda before running the test diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md new file mode 100644 index 0000000000000..f801b5f8f5513 --- /dev/null +++ b/CODE_OF_CONDUCT.md @@ -0,0 +1,128 @@ + +# vLLM Code of Conduct + +## Our Pledge + +We as members, contributors, and leaders pledge to make participation in our +community a harassment-free experience for everyone, regardless of age, body +size, visible or invisible disability, ethnicity, sex characteristics, gender +identity and expression, level of experience, education, socioeconomic status, +nationality, personal appearance, race, caste, color, religion, or sexual +identity and orientation. + +We pledge to act and interact in ways that contribute to an open, welcoming, +diverse, inclusive, and healthy community. + +## Our Standards + +Examples of behavior that contributes to a positive environment for our +community include: + +* Demonstrating empathy and kindness toward other people +* Being respectful of differing opinions, viewpoints, and experiences +* Giving and gracefully accepting constructive feedback +* Accepting responsibility and apologizing to those affected by our mistakes, + and learning from the experience +* Focusing on what is best not just for us as individuals, but for the overall + community + +Examples of unacceptable behavior include: + +* The use of sexualized language or imagery, and sexual attention or advances of + any kind +* Trolling, insulting or derogatory comments, and personal or political attacks +* Public or private harassment +* Publishing others' private information, such as a physical or email address, + without their explicit permission +* Other conduct which could reasonably be considered inappropriate in a + professional setting + +## Enforcement Responsibilities + +Community leaders are responsible for clarifying and enforcing our standards of +acceptable behavior and will take appropriate and fair corrective action in +response to any behavior that they deem inappropriate, threatening, offensive, +or harmful. + +Community leaders have the right and responsibility to remove, edit, or reject +comments, commits, code, wiki edits, issues, and other contributions that are +not aligned to this Code of Conduct, and will communicate reasons for moderation +decisions when appropriate. + +## Scope + +This Code of Conduct applies within all community spaces, and also applies when +an individual is officially representing the community in public spaces. +Examples of representing our community include using an official email address, +posting via an official social media account, or acting as an appointed +representative at an online or offline/IRL event. + +## Enforcement + +Instances of abusive, harassing, or otherwise unacceptable behavior may be +reported to the community leaders responsible for enforcement in the #code-of-conduct +channel in the [vLLM Discord](https://discord.com/invite/jz7wjKhh6g). +All complaints will be reviewed and investigated promptly and fairly. + +All community leaders are obligated to respect the privacy and security of the +reporter of any incident. + +## Enforcement Guidelines + +Community leaders will follow these Community Impact Guidelines in determining +the consequences for any action they deem in violation of this Code of Conduct: + +### 1. 
Correction + +**Community Impact**: Use of inappropriate language or other behavior deemed +unprofessional or unwelcome in the community. + +**Consequence**: A private, written warning from community leaders, providing +clarity around the nature of the violation and an explanation of why the +behavior was inappropriate. A public apology may be requested. + +### 2. Warning + +**Community Impact**: A violation through a single incident or series of +actions. + +**Consequence**: A warning with consequences for continued behavior. No +interaction with the people involved, including unsolicited interaction with +those enforcing the Code of Conduct, for a specified period of time. This +includes avoiding interactions in community spaces as well as external channels +like social media. Violating these terms may lead to a temporary or permanent +ban. + +### 3. Temporary Ban + +**Community Impact**: A serious violation of community standards, including +sustained inappropriate behavior. + +**Consequence**: A temporary ban from any sort of interaction or public +communication with the community for a specified period of time. No public or +private interaction with the people involved, including unsolicited interaction +with those enforcing the Code of Conduct, is allowed during this period. +Violating these terms may lead to a permanent ban. + +### 4. Permanent Ban + +**Community Impact**: Demonstrating a pattern of violation of community +standards, including sustained inappropriate behavior, harassment of an +individual, or aggression toward or disparagement of classes of individuals. + +**Consequence**: A permanent ban from any sort of public interaction within the +community. + +## Attribution + +This Code of Conduct is adapted from the [Contributor Covenant](https://www.contributor-covenant.org/), +version 2.1, available at +[v2.1](https://www.contributor-covenant.org/version/2/1/code_of_conduct.html). + +Community Impact Guidelines were inspired by +[Mozilla's code of conduct enforcement ladder](https://github.com/mozilla/inclusion). + +For answers to common questions about this code of conduct, see the +[Contributor Covenant FAQ](https://www.contributor-covenant.org/faq). Translations are available at +[Contributor Covenant translations](https://www.contributor-covenant.org/translations). 
+ diff --git a/Dockerfile b/Dockerfile index ec6069f605eb1..7f255e1d6e93e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -108,10 +108,17 @@ RUN --mount=type=cache,target=/root/.cache/ccache \ python3 setup.py bdist_wheel --dist-dir=dist --py-limited-api=cp38; \ fi -# check the size of the wheel, we cannot upload wheels larger than 100MB +# Check the size of the wheel if RUN_WHEEL_CHECK is true COPY .buildkite/check-wheel-size.py check-wheel-size.py -RUN python3 check-wheel-size.py dist - +# Default max size of the wheel is 250MB +ARG VLLM_MAX_SIZE_MB=250 +ENV VLLM_MAX_SIZE_MB=$VLLM_MAX_SIZE_MB +ARG RUN_WHEEL_CHECK=true +RUN if [ "$RUN_WHEEL_CHECK" = "true" ]; then \ + python3 check-wheel-size.py dist; \ + else \ + echo "Skipping wheel size check."; \ + fi #################### EXTENSION Build IMAGE #################### #################### DEV IMAGE #################### diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index e38ceaa222956..84f366bdba387 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -56,8 +56,8 @@ class BenchmarkMetrics: total_input: int total_output: int request_throughput: float - input_throughput: float output_throughput: float + total_token_throughput: float mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float @@ -283,8 +283,8 @@ def calculate_metrics( total_input=total_input, total_output=sum(actual_output_lens), request_throughput=completed / dur_s, - input_throughput=total_input / dur_s, output_throughput=sum(actual_output_lens) / dur_s, + total_token_throughput=(total_input + sum(actual_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend std_ttft_ms=np.std(ttfts or 0) * 1000, @@ -426,10 +426,10 @@ async def benchmark( metrics.total_output)) print("{:<40} {:<10.2f}".format("Request throughput (req/s):", metrics.request_throughput)) - print("{:<40} {:<10.2f}".format("Input token throughput (tok/s):", - metrics.input_throughput)) print("{:<40} {:<10.2f}".format("Output token throughput (tok/s):", metrics.output_throughput)) + print("{:<40} {:<10.2f}".format("Total Token throughput (tok/s):", + metrics.total_token_throughput)) result = { "duration": benchmark_duration, @@ -437,8 +437,8 @@ async def benchmark( "total_input_tokens": metrics.total_input, "total_output_tokens": metrics.total_output, "request_throughput": metrics.request_throughput, - "input_throughput": metrics.input_throughput, "output_throughput": metrics.output_throughput, + "total_token_throughput": metrics.total_token_throughput, "input_lens": [output.prompt_len for output in outputs], "output_lens": actual_output_lens, "ttfts": [output.ttft for output in outputs], diff --git a/benchmarks/benchmark_throughput.py b/benchmarks/benchmark_throughput.py index eaf256f7cb8c2..94549d84fb4e4 100644 --- a/benchmarks/benchmark_throughput.py +++ b/benchmarks/benchmark_throughput.py @@ -6,13 +6,16 @@ from typing import List, Optional, Tuple import torch +import uvloop from tqdm import tqdm from transformers import (AutoModelForCausalLM, AutoTokenizer, PreTrainedTokenizerBase) -from vllm.engine.arg_utils import EngineArgs +from vllm.engine.arg_utils import AsyncEngineArgs, EngineArgs +from vllm.entrypoints.openai.api_server import ( + build_async_engine_client_from_engine_args) from vllm.model_executor.layers.quantization import QUANTIZATION_METHODS -from vllm.utils import FlexibleArgumentParser +from vllm.utils import FlexibleArgumentParser, merge_async_iterators def 
sample_requests( @@ -135,6 +138,93 @@ def run_vllm( return end - start +async def run_vllm_async( + requests: List[Tuple[str, int, int]], + model: str, + tokenizer: str, + quantization: Optional[str], + tensor_parallel_size: int, + seed: int, + n: int, + use_beam_search: bool, + trust_remote_code: bool, + dtype: str, + max_model_len: Optional[int], + enforce_eager: bool, + kv_cache_dtype: str, + quantization_param_path: Optional[str], + device: str, + enable_prefix_caching: bool, + enable_chunked_prefill: bool, + max_num_batched_tokens: int, + distributed_executor_backend: Optional[str], + gpu_memory_utilization: float = 0.9, + num_scheduler_steps: int = 1, + use_v2_block_manager: bool = False, + download_dir: Optional[str] = None, + load_format: str = EngineArgs.load_format, + disable_async_output_proc: bool = False, + disable_frontend_multiprocessing: bool = False, +) -> float: + from vllm import SamplingParams + engine_args = AsyncEngineArgs( + model=model, + tokenizer=tokenizer, + quantization=quantization, + tensor_parallel_size=tensor_parallel_size, + seed=seed, + trust_remote_code=trust_remote_code, + dtype=dtype, + max_model_len=max_model_len, + gpu_memory_utilization=gpu_memory_utilization, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + quantization_param_path=quantization_param_path, + device=device, + enable_prefix_caching=enable_prefix_caching, + download_dir=download_dir, + enable_chunked_prefill=enable_chunked_prefill, + max_num_batched_tokens=max_num_batched_tokens, + distributed_executor_backend=distributed_executor_backend, + load_format=load_format, + num_scheduler_steps=num_scheduler_steps, + use_v2_block_manager=use_v2_block_manager, + disable_async_output_proc=disable_async_output_proc, + worker_use_ray=False, + engine_use_ray=False, + disable_log_requests=True, + ) + + async with build_async_engine_client_from_engine_args( + engine_args, disable_frontend_multiprocessing) as llm: + + # Add the requests to the engine. 
+ prompts: List[str] = [] + sampling_params: List[SamplingParams] = [] + for prompt, _, output_len in requests: + prompts.append(prompt) + sampling_params.append( + SamplingParams( + n=n, + temperature=0.0 if use_beam_search else 1.0, + top_p=1.0, + use_beam_search=use_beam_search, + ignore_eos=True, + max_tokens=output_len, + )) + + generators = [] + start = time.perf_counter() + for i, (prompt, sp) in enumerate(zip(prompts, sampling_params)): + generator = llm.generate(prompt, sp, request_id=f"test{i}") + generators.append(generator) + all_gens = merge_async_iterators(*generators) + async for i, res in all_gens: + pass + end = time.perf_counter() + return end - start + + def run_hf( requests: List[Tuple[str, int, int]], model: str, @@ -230,7 +320,7 @@ def main(args: argparse.Namespace): args.output_len) if args.backend == "vllm": - elapsed_time = run_vllm( + run_args = [ requests, args.model, args.tokenizer, args.quantization, args.tensor_parallel_size, args.seed, args.n, args.use_beam_search, args.trust_remote_code, args.dtype, args.max_model_len, @@ -240,7 +330,14 @@ def main(args: argparse.Namespace): args.max_num_batched_tokens, args.distributed_executor_backend, args.gpu_memory_utilization, args.num_scheduler_steps, args.use_v2_block_manager, args.download_dir, args.load_format, - args.disable_async_output_proc) + args.disable_async_output_proc + ] + + if args.async_engine: + run_args.append(args.disable_frontend_multiprocessing) + elapsed_time = uvloop.run(run_vllm_async(*run_args)) + else: + elapsed_time = run_vllm(*run_args) elif args.backend == "hf": assert args.tensor_parallel_size == 1 elapsed_time = run_hf(requests, args.model, tokenizer, args.n, @@ -426,6 +523,14 @@ def main(args: argparse.Namespace): action='store_true', default=False, help="Disable async output processor for vLLM backend.") + parser.add_argument("--async-engine", + action='store_true', + default=False, + help="Use vLLM async engine rather than LLM class.") + parser.add_argument("--disable-frontend-multiprocessing", + action='store_true', + default=False, + help="Disable decoupled async engine frontend.") args = parser.parse_args() if args.tokenizer is None: args.tokenizer = args.model diff --git a/docs/source/getting_started/debugging.rst b/docs/source/getting_started/debugging.rst index 117a9dd666481..31ecca1332e5d 100644 --- a/docs/source/getting_started/debugging.rst +++ b/docs/source/getting_started/debugging.rst @@ -21,7 +21,7 @@ If you have already taken care of the above issues, but the vLLM instance still With more logging, hopefully you can find the root cause of the issue. -If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the ``LLM`` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. +If it crashes, and the error trace shows somewhere around ``self.graph.replay()`` in ``vllm/worker/model_runner.py``, it is a cuda error inside cudagraph. To know the particular cuda operation that causes the error, you can add ``--enforce-eager`` to the command line, or ``enforce_eager=True`` to the :class:`~vllm.LLM` class, to disable the cudagraph optimization. This way, you can locate the exact cuda operation that causes the error. 
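For illustration, a minimal sketch of forcing eager mode from Python (the model name here is an arbitrary small model chosen for the example, not taken from this change):

.. code-block:: python

    from vllm import LLM, SamplingParams

    # enforce_eager=True disables CUDA graph capture/replay, so a failing
    # CUDA operation is reported at its actual call site instead of inside
    # ``self.graph.replay()``.
    llm = LLM(model="facebook/opt-125m", enforce_eager=True)
    outputs = llm.generate(["Hello, my name is"],
                           SamplingParams(temperature=0.8, max_tokens=16))
    print(outputs[0].outputs[0].text)
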
Here are some common issues that can cause hangs: diff --git a/docs/source/getting_started/quickstart.rst b/docs/source/getting_started/quickstart.rst index 89bdc247c5e8e..80b19ac672936 100644 --- a/docs/source/getting_started/quickstart.rst +++ b/docs/source/getting_started/quickstart.rst @@ -24,7 +24,9 @@ Offline Batched Inference We first show an example of using vLLM for offline batched inference on a dataset. In other words, we use vLLM to generate texts for a list of input prompts. -Import ``LLM`` and ``SamplingParams`` from vLLM. The ``LLM`` class is the main class for running offline inference with vLLM engine. The ``SamplingParams`` class specifies the parameters for the sampling process. +Import :class:`~vllm.LLM` and :class:`~vllm.SamplingParams` from vLLM. +The :class:`~vllm.LLM` class is the main class for running offline inference with vLLM engine. +The :class:`~vllm.SamplingParams` class specifies the parameters for the sampling process. .. code-block:: python @@ -42,7 +44,7 @@ Define the list of input prompts and the sampling parameters for generation. The ] sampling_params = SamplingParams(temperature=0.8, top_p=0.95) -Initialize vLLM's engine for offline inference with the ``LLM`` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. +Initialize vLLM's engine for offline inference with the :class:`~vllm.LLM` class and the `OPT-125M model `_. The list of supported models can be found at :ref:`supported models `. .. code-block:: python diff --git a/docs/source/getting_started/tpu-installation.rst b/docs/source/getting_started/tpu-installation.rst index d0c2498d8849e..217028839e347 100644 --- a/docs/source/getting_started/tpu-installation.rst +++ b/docs/source/getting_started/tpu-installation.rst @@ -59,7 +59,7 @@ First, install the dependencies: $ export DATE="20240828" $ export TORCH_VERSION="2.5.0" $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl - $ pip3 install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl + $ pip install https://storage.googleapis.com/pytorch-xla-releases/wheels/tpuvm/torch_xla-${TORCH_VERSION}.dev${DATE}-cp310-cp310-linux_x86_64.whl $ # Install JAX and Pallas. $ pip install torch_xla[tpu] -f https://storage.googleapis.com/libtpu-releases/index.html diff --git a/docs/source/models/supported_models.rst b/docs/source/models/supported_models.rst index 2c20b6e48407d..0c0a54281e3f3 100644 --- a/docs/source/models/supported_models.rst +++ b/docs/source/models/supported_models.rst @@ -194,12 +194,12 @@ Multimodal Language Models * - Architecture - Models - - Supported Modalities + - Modalities - Example HuggingFace Models - :ref:`LoRA ` * - :code:`Blip2ForConditionalGeneration` - BLIP-2 - - Image + - Image\ :sup:`E` - :code:`Salesforce/blip2-opt-2.7b`, :code:`Salesforce/blip2-opt-6.7b`, etc. - * - :code:`ChameleonForConditionalGeneration` @@ -214,40 +214,48 @@ Multimodal Language Models - * - :code:`InternVLChatModel` - InternVL2 - - Image + - Image\ :sup:`E` - :code:`OpenGVLab/InternVL2-4B`, :code:`OpenGVLab/InternVL2-8B`, etc. - * - :code:`LlavaForConditionalGeneration` - LLaVA-1.5 - - Image + - Image\ :sup:`E` - :code:`llava-hf/llava-1.5-7b-hf`, :code:`llava-hf/llava-1.5-13b-hf`, etc. 
- * - :code:`LlavaNextForConditionalGeneration` - LLaVA-NeXT - - Image + - Image\ :sup:`E+` - :code:`llava-hf/llava-v1.6-mistral-7b-hf`, :code:`llava-hf/llava-v1.6-vicuna-7b-hf`, etc. - * - :code:`PaliGemmaForConditionalGeneration` - PaliGemma - - Image + - Image\ :sup:`E` - :code:`google/paligemma-3b-pt-224`, :code:`google/paligemma-3b-mix-224`, etc. - * - :code:`Phi3VForCausalLM` - Phi-3-Vision, Phi-3.5-Vision - - Image + - Image\ :sup:`E+` - :code:`microsoft/Phi-3-vision-128k-instruct`, :code:`microsoft/Phi-3.5-vision-instruct` etc. - * - :code:`MiniCPMV` - MiniCPM-V - - Image + - Image\ :sup:`+` - :code:`openbmb/MiniCPM-V-2` (see note), :code:`openbmb/MiniCPM-Llama3-V-2_5`, :code:`openbmb/MiniCPM-V-2_6`, etc. - + * - :code:`QWenLMHeadModel` + - Qwen + - Image + - :code:`Qwen/Qwen-VL`, :code:`Qwen/Qwen-VL-Chat`, etc. + - * - :code:`UltravoxModel` - Ultravox - - Audio + - Audio\ :sup:`E+` - :code:`fixie-ai/ultravox-v0_3` - +| :sup:`E` Pre-computed embeddings can be inputted for this modality. +| :sup:`+` Multiple items can be inputted per text prompt for this modality. + .. note:: For :code:`openbmb/MiniCPM-V-2`, the official repo doesn't work yet, so we need to use a fork (:code:`HwwwH/MiniCPM-V-2`) for now. For more details, please see: https://github.com/vllm-project/vllm/pull/4087#issuecomment-2250397630 diff --git a/docs/source/models/vlm.rst b/docs/source/models/vlm.rst index 236e37b51d470..08db891665044 100644 --- a/docs/source/models/vlm.rst +++ b/docs/source/models/vlm.rst @@ -9,26 +9,23 @@ This document shows you how to run and serve these models using vLLM. .. important:: We are actively iterating on VLM support. Expect breaking changes to VLM usage and development in upcoming releases without prior deprecation. - Currently, the support for vision language models on vLLM has the following limitations: - - * Only single image input is supported per text prompt. - We are continuously improving user & developer experience for VLMs. Please `open an issue on GitHub `_ if you have any feedback or feature requests. -Offline Batched Inference -------------------------- +Offline Inference +----------------- + +Single-image input +^^^^^^^^^^^^^^^^^^ -To initialize a VLM, the aforementioned arguments must be passed to the ``LLM`` class for instantiating the engine. +The :class:`~vllm.LLM` class can be instantiated in much the same way as language-only models. .. code-block:: python llm = LLM(model="llava-hf/llava-1.5-7b-hf") -.. important:: +.. note:: We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow - the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that - internally for each model. - + the above snippet. Specifically, ``image_feature_size`` can no longer be specified as we now calculate that internally for each model. To pass an image to the model, note the following in :class:`vllm.inputs.PromptInputs`: @@ -86,61 +83,117 @@ To pass an image to the model, note the following in :class:`vllm.inputs.PromptI A code example can be found in `examples/offline_inference_vision_language.py `_. +Multi-image input +^^^^^^^^^^^^^^^^^ -Online OpenAI Vision API Compatible Inference ----------------------------------------------- +Multi-image input is only supported for a subset of VLMs, as shown :ref:`here `. -You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. 
+To enable multiple multi-modal items per text prompt, you have to set ``limit_mm_per_prompt`` for the :class:`~vllm.LLM` class. -.. note:: - Currently, vLLM supports only **single** ``image_url`` input per ``messages``. Support for multi-image inputs will be - added in the future. +.. code-block:: python -Below is an example on how to launch the same ``llava-hf/llava-1.5-7b-hf`` with vLLM API server. + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, # Required to load Phi-3.5-vision + max_model_len=4096, # Otherwise, it may not fit in smaller GPUs + limit_mm_per_prompt={"image": 2}, # The maximum number to accept + ) -.. important:: - Since OpenAI Vision API is based on `Chat `_ API, a chat template - is **required** to launch the API server if the model's tokenizer does not come with one. In this example, we use the - HuggingFace Llava chat template that you can find in the example folder `here `_. +Instead of passing in a single image, you can pass in a list of images. + +.. code-block:: python + + # Refer to the HuggingFace repo for the correct format to use + prompt = "<|user|>\n\n\nWhat is the content of each image?<|end|>\n<|assistant|>\n" + + # Load the images using PIL.Image + image1 = PIL.Image.open(...) + image2 = PIL.Image.open(...) + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [image1, image2] + }, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + +A code example can be found in `examples/offline_inference_vision_language_multi_image.py `_. + +Online Inference +---------------- + +OpenAI Vision API +^^^^^^^^^^^^^^^^^ + +You can serve vision language models with vLLM's HTTP server that is compatible with `OpenAI Vision API `_. + +Below is an example on how to launch the same ``microsoft/Phi-3.5-vision-instruct`` with vLLM's OpenAI-compatible API server. .. code-block:: bash - vllm serve llava-hf/llava-1.5-7b-hf --chat-template template_llava.jinja + vllm serve microsoft/Phi-3.5-vision-instruct --max-model-len 4096 \ + --trust-remote-code --limit-mm-per-prompt image=2 .. important:: - We have removed all vision language related CLI args in the ``0.5.1`` release. **This is a breaking change**, so please update your code to follow - the above snippet. Specifically, ``image_feature_size`` is no longer required to be specified as we now calculate that - internally for each model. + Since OpenAI Vision API is based on `Chat Completions `_ API, + a chat template is **required** to launch the API server. + + Although Phi-3.5-Vision comes with a chat template, for other models you may have to provide one if the model's tokenizer does not come with it. + The chat template can be inferred based on the documentation on the model's HuggingFace repo. + For example, LLaVA-1.5 (``llava-hf/llava-1.5-7b-hf``) requires a chat template that can be found `here `_. To consume the server, you can use the OpenAI client like in the example below: .. 
code-block:: python from openai import OpenAI + openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" + client = OpenAI( api_key=openai_api_key, base_url=openai_api_base, ) + + # Single-image input inference + image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + chat_response = client.chat.completions.create( - model="llava-hf/llava-1.5-7b-hf", + model="microsoft/Phi-3.5-vision-instruct", messages=[{ "role": "user", "content": [ # NOTE: The prompt formatting with the image token `` is not needed # since the prompt will be processed automatically by the API server. - {"type": "text", "text": "What's in this image?"}, - { - "type": "image_url", - "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", - }, - }, + {"type": "text", "text": "What’s in this image?"}, + {"type": "image_url", "image_url": {"url": image_url}}, ], }], ) - print("Chat response:", chat_response) + print("Chat completion output:", chat_response.choices[0].message.content) + + # Multi-image input inference + image_url_duck = "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg" + image_url_lion = "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg" + + chat_response = client.chat.completions.create( + model="microsoft/Phi-3.5-vision-instruct", + messages=[{ + "role": "user", + "content": [ + {"type": "text", "text": "What are the animals in these images?"}, + {"type": "image_url", "image_url": {"url": image_url_duck}}, + {"type": "image_url", "image_url": {"url": image_url_lion}}, + ], + }], + ) + print("Chat completion output:", chat_response.choices[0].message.content) + A full code example can be found in `examples/openai_vision_api_client.py `_. diff --git a/docs/source/serving/openai_compatible_server.md b/docs/source/serving/openai_compatible_server.md index b2acde390083c..eb4ea0fb5655e 100644 --- a/docs/source/serving/openai_compatible_server.md +++ b/docs/source/serving/openai_compatible_server.md @@ -110,6 +110,14 @@ directory [here](https://github.com/vllm-project/vllm/tree/main/examples/) :func: create_parser_for_docs :prog: vllm serve ``` +## Tool Calling in the Chat Completion API +### Named Function Calling +vLLM supports only named function calling in the chat completion API by default. It does so using Outlines, so this is +enabled by default, and will work with any supported model. You are guaranteed a validly-parsable function call - not a +high-quality one. + +To use a named function, you need to define the functions in the `tools` parameter of the chat completion request, and +specify the `name` of one of the tools in the `tool_choice` parameter of the chat completion request. ### Config file @@ -140,10 +148,52 @@ The order of priorities is `command line > config file values > defaults`. ## Tool calling in the chat completion API vLLM supports only named function calling in the chat completion API. The `tool_choice` options `auto` and `required` are **not yet supported** but on the roadmap. -To use a named function you need to define the function in the `tools` parameter and call it in the `tool_choice` parameter. 
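As a hedged sketch of named function calling against a running vLLM server (the model name and local port below are placeholder assumptions, not part of this change):

```python
from openai import OpenAI

client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

tools = [{
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given city",
        "parameters": {
            "type": "object",
            "properties": {"city": {"type": "string"}},
            "required": ["city"],
        },
    },
}]

# Naming the tool in `tool_choice` makes vLLM use guided decoding, so the
# returned arguments are guaranteed to parse against the JSON schema above.
response = client.chat.completions.create(
    model="NousResearch/Hermes-2-Pro-Llama-3-8B",
    messages=[{"role": "user", "content": "What's the weather in Dallas?"}],
    tools=tools,
    tool_choice={"type": "function",
                 "function": {"name": "get_current_weather"}},
)
print(response.choices[0].message.tool_calls[0].function.arguments)
```
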
- -It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. **This may change in the future.** +It is the callers responsibility to prompt the model with the tool information, vLLM will not automatically manipulate the prompt. vLLM will use guided decoding to ensure the response matches the tool parameter object defined by the JSON schema in the `tools` parameter. -Please refer to the OpenAI API reference documentation for more information. + +### Automatic Function Calling +To enable this feature, you should set the following flags: +* `--enable-auto-tool-choice` -- **mandatory** Auto tool choice. tells vLLM that you want to enable the model to generate its own tool calls when it +deems appropriate. +* `--tool-call-parser` -- select the tool parser to use - currently either `hermes` or `mistral`. Additional tool parsers +will continue to be added in the future. +* `--chat-template` -- **optional** for auto tool choice. the path to the chat template which handles `tool`-role messages and `assistant`-role messages +that contain previously generated tool calls. Hermes and Mistral models have tool-compatible chat templates in their +`tokenizer_config.json` files, but you can specify a custom template. This argument can be set to `tool_use` if your model has a tool use-specific chat +template configured in the `tokenizer_config.json`. In this case, it will be used per the `transformers` specification. More on this [here](https://huggingface.co/docs/transformers/en/chat_templating#why-do-some-models-have-multiple-templates) +from HuggingFace; and you can find an example of this in a `tokenizer_config.json` [here](https://huggingface.co/NousResearch/Hermes-2-Pro-Llama-3-8B/blob/main/tokenizer_config.json) + +If your favorite tool-calling model is not supported, please feel free to contribute a parser & tool use chat template! + +#### Hermes Models +All Nous Research Hermes-series models newer than Hermes 2 Pro should be supported. +* `NousResearch/Hermes-2-Pro-*` +* `NousResearch/Hermes-2-Theta-*` +* `NousResearch/Hermes-3-*` + + +_Note that the Hermes 2 **Theta** models are known to have degraded tool call quality & capabilities due to the merge +step in their creation_. + +Flags: `--tool-call-parser hermes` + +#### Mistral Models +Supported models: +* `mistralai/Mistral-7B-Instruct-v0.3` (confirmed) +* Additional mistral function-calling models are compatible as well. + +Known issues: +1. Mistral 7B struggles to generate parallel tool calls correctly. +2. Mistral's `tokenizer_config.json` chat template requires tool call IDs that are exactly 9 digits, which is +much shorter than what vLLM generates. Since an exception is thrown when this condition +is not met, the following additional chat templates are provided: + +* `examples/tool_chat_template_mistral.jinja` - this is the "official" Mistral chat template, but tweaked so that +it works with vLLM's tool call IDs (provided `tool_call_id` fields are truncated to the last 9 digits) +* `examples/tool_chat_template_mistral_parallel.jinja` - this is a "better" version that adds a tool-use system prompt +when tools are provided, that results in much better reliability when working with parallel tool calling. 
+ + +Recommended flags: `--tool-call-parser mistral --chat-template examples/tool_chat_template_mistral_parallel.jinja` diff --git a/examples/offline_inference_audio_language.py b/examples/offline_inference_audio_language.py index 56ce8646c20c9..1c6ac06123bbb 100644 --- a/examples/offline_inference_audio_language.py +++ b/examples/offline_inference_audio_language.py @@ -11,25 +11,33 @@ from vllm.assets.audio import AudioAsset from vllm.utils import FlexibleArgumentParser -# Input audio and question -audio_and_sample_rate = AudioAsset("mary_had_lamb").audio_and_sample_rate -question = "What is recited in the audio?" +audio_assets = [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] +question_per_audio_count = [ + "What is recited in the audio?", + "What sport and what nursery rhyme are referenced?" +] # Ultravox 0.3 -def run_ultravox(question): +def run_ultravox(question, audio_count): model_name = "fixie-ai/ultravox-v0_3" tokenizer = AutoTokenizer.from_pretrained(model_name) messages = [{ - 'role': 'user', - 'content': f"<|reserved_special_token_0|>\n{question}" + 'role': + 'user', + 'content': + "<|reserved_special_token_0|>\n" * audio_count + question }] prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True) - llm = LLM(model=model_name) + llm = LLM(model=model_name, + enforce_eager=True, + enable_chunked_prefill=False, + max_model_len=8192, + limit_mm_per_prompt={"audio": audio_count}) stop_token_ids = None return llm, prompt, stop_token_ids @@ -44,7 +52,9 @@ def main(args): if model not in model_example_map: raise ValueError(f"Model type {model} is not supported.") - llm, prompt, stop_token_ids = model_example_map[model](question) + audio_count = args.num_audios + llm, prompt, stop_token_ids = model_example_map[model]( + question_per_audio_count[audio_count - 1], audio_count) # We set temperature to 0.2 so that outputs can be different # even when all prompts are identical when running batch inference. @@ -53,23 +63,18 @@ def main(args): stop_token_ids=stop_token_ids) assert args.num_prompts > 0 - if args.num_prompts == 1: - # Single inference - inputs = { - "prompt": prompt, - "multi_modal_data": { - "audio": audio_and_sample_rate - }, - } - - else: + inputs = { + "prompt": prompt, + "multi_modal_data": { + "audio": [ + asset.audio_and_sample_rate + for asset in audio_assets[:audio_count] + ] + }, + } + if args.num_prompts > 1: # Batch inference - inputs = [{ - "prompt": prompt, - "multi_modal_data": { - "audio": audio_and_sample_rate - }, - } for _ in range(args.num_prompts)] + inputs = [inputs] * args.num_prompts outputs = llm.generate(inputs, sampling_params=sampling_params) @@ -92,6 +97,11 @@ def main(args): type=int, default=1, help='Number of prompts to run.') + parser.add_argument("--num-audios", + type=int, + default=1, + choices=[1, 2], + help="Number of audio items per prompt.") args = parser.parse_args() main(args) diff --git a/examples/offline_inference_neuron_int8_quantization.py b/examples/offline_inference_neuron_int8_quantization.py new file mode 100644 index 0000000000000..8ec17e3400953 --- /dev/null +++ b/examples/offline_inference_neuron_int8_quantization.py @@ -0,0 +1,50 @@ +import os + +from vllm import LLM, SamplingParams + +# creates XLA hlo graphs for all the context length buckets. +os.environ['NEURON_CONTEXT_LENGTH_BUCKETS'] = "128,512,1024,2048" +# creates XLA hlo graphs for all the token gen buckets. 
+os.environ['NEURON_TOKEN_GEN_BUCKETS'] = "128,512,1024,2048" +# Quantizes neuron model weight to int8 , +# The default config for quantization is int8 dtype. +os.environ['NEURON_QUANT_DTYPE'] = "s8" + +# Sample prompts. +prompts = [ + "Hello, my name is", + "The president of the United States is", + "The capital of France is", + "The future of AI is", +] +# Create a sampling params object. +sampling_params = SamplingParams(temperature=0.8, top_p=0.95) + +# Create an LLM. +llm = LLM( + model="TinyLlama/TinyLlama-1.1B-Chat-v1.0", + max_num_seqs=8, + # The max_model_len and block_size arguments are required to be same as + # max sequence length when targeting neuron device. + # Currently, this is a known limitation in continuous batching support + # in transformers-neuronx. + # TODO(liangfu): Support paged-attention in transformers-neuronx. + max_model_len=2048, + block_size=2048, + # The device can be automatically detected when AWS Neuron SDK is installed. + # The device argument can be either unspecified for automated detection, + # or explicitly assigned. + device="neuron", + quantization="neuron_quant", + override_neuron_config={ + "cast_logits_dtype": "bfloat16", + }, + tensor_parallel_size=2) +# Generate texts from the prompts. The output is a list of RequestOutput objects +# that contain the prompt, generated text, and other information. +outputs = llm.generate(prompts, sampling_params) +# Print the outputs. +for output in outputs: + prompt = output.prompt + generated_text = output.outputs[0].text + print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}") diff --git a/examples/offline_inference_vision_language.py b/examples/offline_inference_vision_language.py index 9a0e9d4bc5362..aa1580343aee7 100644 --- a/examples/offline_inference_vision_language.py +++ b/examples/offline_inference_vision_language.py @@ -159,6 +159,20 @@ def run_blip2(question): return llm, prompt, stop_token_ids +# Qwen +def run_qwen_vl(question): + + llm = LLM( + model="Qwen/Qwen-VL", + trust_remote_code=True, + max_num_seqs=5, + ) + + prompt = f"{question}Picture 1: \n" + stop_token_ids = None + return llm, prompt, stop_token_ids + + model_example_map = { "llava": run_llava, "llava-next": run_llava_next, @@ -169,6 +183,7 @@ def run_blip2(question): "minicpmv": run_minicpmv, "blip-2": run_blip2, "internvl_chat": run_internvl, + "qwen_vl": run_qwen_vl, } diff --git a/examples/offline_inference_vision_language_multi_image.py b/examples/offline_inference_vision_language_multi_image.py new file mode 100644 index 0000000000000..73543ab5da2b4 --- /dev/null +++ b/examples/offline_inference_vision_language_multi_image.py @@ -0,0 +1,95 @@ +""" +This example shows how to use vLLM for running offline inference with +multi-image input on vision language models, using the chat template defined +by the model. +""" +from argparse import Namespace +from typing import List + +from vllm import LLM +from vllm.multimodal.utils import fetch_image +from vllm.utils import FlexibleArgumentParser + +QUESTION = "What is the content of each image?" 
+IMAGE_URLS = [ + "https://upload.wikimedia.org/wikipedia/commons/d/da/2015_Kaczka_krzy%C5%BCowka_w_wodzie_%28samiec%29.jpg", + "https://upload.wikimedia.org/wikipedia/commons/7/77/002_The_lion_king_Snyggve_in_the_Serengeti_National_Park_Photo_by_Giles_Laurent.jpg", +] + + +def _load_phi3v(image_urls: List[str]): + return LLM( + model="microsoft/Phi-3.5-vision-instruct", + trust_remote_code=True, + max_model_len=4096, + limit_mm_per_prompt={"image": len(image_urls)}, + ) + + +def run_phi3v_generate(question: str, image_urls: List[str]): + llm = _load_phi3v(image_urls) + + placeholders = "\n".join(f"<|image_{i}|>" + for i, _ in enumerate(image_urls, start=1)) + prompt = f"<|user|>\n{placeholders}\n{question}<|end|>\n<|assistant|>\n" + + outputs = llm.generate({ + "prompt": prompt, + "multi_modal_data": { + "image": [fetch_image(url) for url in image_urls] + }, + }) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def run_phi3v_chat(question: str, image_urls: List[str]): + llm = _load_phi3v(image_urls) + + outputs = llm.chat([{ + "role": + "user", + "content": [ + { + "type": "text", + "text": question, + }, + *({ + "type": "image_url", + "image_url": { + "url": image_url + }, + } for image_url in image_urls), + ], + }]) + + for o in outputs: + generated_text = o.outputs[0].text + print(generated_text) + + +def main(args: Namespace): + method = args.method + + if method == "generate": + run_phi3v_generate(QUESTION, IMAGE_URLS) + elif method == "chat": + run_phi3v_chat(QUESTION, IMAGE_URLS) + else: + raise ValueError(f"Invalid method: {method}") + + +if __name__ == "__main__": + parser = FlexibleArgumentParser( + description='Demo on using vLLM for offline inference with ' + 'vision language models that support multi-image input') + parser.add_argument("--method", + type=str, + default="generate", + choices=["generate", "chat"], + help="The method to run in `vllm.LLM`.") + + args = parser.parse_args() + main(args) diff --git a/examples/openai_chat_completion_client_with_tools.py b/examples/openai_chat_completion_client_with_tools.py new file mode 100644 index 0000000000000..2bbe42b6bd2ef --- /dev/null +++ b/examples/openai_chat_completion_client_with_tools.py @@ -0,0 +1,162 @@ +""" +Set up this example by starting a vLLM OpenAI-compatible server with tool call +options enabled. For example: + +IMPORTANT: for mistral, you must use one of the provided mistral tool call +templates, or your own - the model default doesn't work for tool calls with vLLM +See the vLLM docs on OpenAI server & tool calling for more details. + +vllm serve --model mistralai/Mistral-7B-Instruct-v0.3 \ + --chat-template examples/tool_chat_template_mistral.jinja \ + --enable-auto-tool-choice --tool-call-parser mistral + +OR +vllm serve --model NousResearch/Hermes-2-Pro-Llama-3-8B \ + --chat-template examples/tool_chat_template_hermes.jinja \ + --enable-auto-tool-choice --tool-call-parser hermes +""" +import json + +from openai import OpenAI + +# Modify OpenAI's API key and API base to use vLLM's API server. 
+openai_api_key = "EMPTY" +openai_api_base = "http://localhost:8000/v1" + +client = OpenAI( + # defaults to os.environ.get("OPENAI_API_KEY") + api_key=openai_api_key, + base_url=openai_api_base, +) + +models = client.models.list() +model = models.data[0].id + +tools = [{ + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": + "string", + "description": + "The city to find the weather for, e.g. 'San Francisco'" + }, + "state": { + "type": + "string", + "description": + "the two-letter abbreviation for the state that the city is" + " in, e.g. 'CA' which would mean 'California'" + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"] + } + }, + "required": ["city", "state", "unit"] + } + } +}] + +messages = [{ + "role": "user", + "content": "Hi! How are you doing today?" +}, { + "role": "assistant", + "content": "I'm doing well! How can I help you?" +}, { + "role": + "user", + "content": + "Can you tell me what the temperate will be in Dallas, in fahrenheit?" +}] + +chat_completion = client.chat.completions.create(messages=messages, + model=model, + tools=tools) + +print("Chat completion results:") +print(chat_completion) +print("\n\n") + +tool_calls_stream = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=True) + +chunks = [] +for chunk in tool_calls_stream: + chunks.append(chunk) + if chunk.choices[0].delta.tool_calls: + print(chunk.choices[0].delta.tool_calls[0]) + else: + print(chunk.choices[0].delta) + +arguments = [] +tool_call_idx = -1 +for chunk in chunks: + + if chunk.choices[0].delta.tool_calls: + tool_call = chunk.choices[0].delta.tool_calls[0] + + if tool_call.index != tool_call_idx: + if tool_call_idx >= 0: + print( + f"streamed tool call arguments: {arguments[tool_call_idx]}" + ) + tool_call_idx = chunk.choices[0].delta.tool_calls[0].index + arguments.append("") + if tool_call.id: + print(f"streamed tool call id: {tool_call.id} ") + + if tool_call.function: + if tool_call.function.name: + print(f"streamed tool call name: {tool_call.function.name}") + + if tool_call.function.arguments: + arguments[tool_call_idx] += tool_call.function.arguments + +if len(arguments): + print(f"streamed tool call arguments: {arguments[-1]}") + +print("\n\n") + +messages.append({ + "role": "assistant", + "tool_calls": chat_completion.choices[0].message.tool_calls +}) + + +# Now, simulate a tool call +def get_current_weather(city: str, state: str, unit: 'str'): + return ("The weather in Dallas, Texas is 85 degrees fahrenheit. 
It is " + "partly cloudly, with highs in the 90's.") + + +available_tools = {"get_current_weather": get_current_weather} + +completion_tool_calls = chat_completion.choices[0].message.tool_calls +for call in completion_tool_calls: + tool_to_call = available_tools[call.function.name] + args = json.loads(call.function.arguments) + result = tool_to_call(**args) + print(result) + messages.append({ + "role": "tool", + "content": result, + "tool_call_id": call.id, + "name": call.function.name + }) + +chat_completion_2 = client.chat.completions.create(messages=messages, + model=model, + tools=tools, + stream=False) +print("\n\n") +print(chat_completion_2) diff --git a/examples/openai_vision_api_client.py b/examples/openai_vision_api_client.py index e1d4055763e5f..1ba702ef019e4 100644 --- a/examples/openai_vision_api_client.py +++ b/examples/openai_vision_api_client.py @@ -27,9 +27,10 @@ models = client.models.list() model = models.data[0].id +# Single-image input inference image_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -# Use image url in the payload +## Use image url in the payload chat_completion_from_url = client.chat.completions.create( messages=[{ "role": @@ -52,10 +53,10 @@ ) result = chat_completion_from_url.choices[0].message.content -print(f"Chat completion output:{result}") +print("Chat completion output:", result) -# Use base64 encoded image in the payload +## Use base64 encoded image in the payload def encode_image_base64_from_url(image_url: str) -> str: """Encode an image retrieved from a remote url to base64 format.""" @@ -122,4 +123,4 @@ def encode_image_base64_from_url(image_url: str) -> str: ) result = chat_completion_from_url.choices[0].message.content -print(f"Chat completion output:{result}") +print("Chat completion output:", result) diff --git a/examples/tool_chat_template_hermes.jinja b/examples/tool_chat_template_hermes.jinja new file mode 100644 index 0000000000000..b18b463032d4f --- /dev/null +++ b/examples/tool_chat_template_hermes.jinja @@ -0,0 +1,129 @@ +{%- macro json_to_python_type(json_spec) %} + {%- set basic_type_map = { + "string": "str", + "number": "float", + "integer": "int", + "boolean": "bool" +} %} + + {%- if basic_type_map[json_spec.type] is defined %} + {{- basic_type_map[json_spec.type] }} + {%- elif json_spec.type == "array" %} + {{- "list[" + json_to_python_type(json_spec|items) + "]" }} + {%- elif json_spec.type == "object" %} + {%- if json_spec.additionalProperties is defined %} + {{- "dict[str, " + json_to_python_type(json_spec.additionalProperties) + ']' }} + {%- else %} + {{- "dict" }} + {%- endif %} + {%- elif json_spec.type is iterable %} + {{- "Union[" }} + {%- for t in json_spec.type %} + {{- json_to_python_type({"type": t}) }} + {%- if not loop.last %} + {{- "," }} + {%- endif %} + {%- endfor %} + {{- "]" }} + {%- else %} + {{- "Any" }} + {%- endif %} +{%- endmacro %} + + +{{- bos_token }} +{{- "<|im_start|>system\nYou are a function calling AI model. You are provided with function signatures within XML tags. You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions. 
Here are the available tools: " }} +{%- if tools is iterable and tools | length > 0 %} + {%- for tool in tools %} + {%- if tool.function is defined %} + {%- set tool = tool.function %} + {%- endif %} + {{- '{"type": "function", "function": ' }} + {{- '{"name": "' + tool.name + '", ' }} + {{- '"description": "' + tool.name + '(' }} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {{- param_name + ": " + json_to_python_type(param_fields) }} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- ")" }} + {%- if tool.return is defined %} + {{- " -> " + json_to_python_type(tool.return) }} + {%- endif %} + {{- " - " + tool.description + "\n\n" }} + {%- for param_name, param_fields in tool.parameters.properties|items %} + {%- if loop.first %} + {{- " Args:\n" }} + {%- endif %} + {{- " " + param_name + "(" + json_to_python_type(param_fields) + "): " + param_fields.description|trim }} + {%- endfor %} + {%- if tool.return is defined and tool.return.description is defined %} + {{- "\n Returns:\n " + tool.return.description }} + {%- endif %} + {{- '"' }} + {{- ', "parameters": ' }} + {%- if tool.parameters.properties | length == 0 %} + {{- "{}" }} + {%- else %} + {{- tool.parameters|tojson }} + {%- endif %} + {{- "}" }} + {%- if not loop.last %} + {{- "\n" }} + {%- endif %} + {%- endfor %} +{%- endif %} +{{- " " }} +{{- 'Use the following pydantic model json schema for each tool call you will make: {"properties": {"name": {"title": "Name", "type": "string"}, "arguments": {"title": "Arguments", "type": "object"}}, "required": ["name", "arguments"], "title": "FunctionCall", "type": "object"}} +' }} +{{- "For each function call return a json object with function name and arguments within XML tags as follows: +" }} +{{- " +" }} +{{- '{"name": , "arguments": } +' }} +{{- '<|im_end|>' }} +{%- for message in messages %} + {%- if message.role == "user" or message.role == "system" or (message.role == "assistant" and message.tool_calls is not defined) %} + {{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }} + {%- elif message.role == "assistant" and message.tool_calls is defined %} + {{- '<|im_start|>' + message.role }} + {%- for tool_call in message.tool_calls %} + {{- '\n\n' }} + {%- if tool_call.function is defined %} + {%- set tool_call = tool_call.function %} + {%- endif %} + {{- '{' }} + {{- '"name": "' }} + {{- tool_call.name }} + {{- '"}' }} + {{- ', ' }} + {%- if tool_call.arguments is defined %} + {{- '"arguments": ' }} + {{- tool_call.arguments|tojson }} + {%- endif %} + {{- '\n' }} + {%- endfor %} + {{- '<|im_end|>\n' }} + {%- elif message.role == "tool" %} + {%- if loop.previtem and loop.previtem.role != "tool" %} + {{- '<|im_start|>tool\n' }} + {%- endif %} + {{- '\n' }} + {{- message.content }} + {%- if not loop.last %} + {{- '\n\n' }} + {%- else %} + {{- '\n' }} + {%- endif %} + {%- if not loop.last and loop.nextitem.role != "tool" %} + {{- '<|im_end|>' }} + {%- elif loop.last %} + {{- '<|im_end|>' }} + {%- endif %} + {%- endif %} +{%- endfor %} +{%- if add_generation_prompt %} + {{- '<|im_start|>assistant\n' }} +{%- endif %} diff --git a/examples/tool_chat_template_mistral.jinja b/examples/tool_chat_template_mistral.jinja new file mode 100644 index 0000000000000..49691f59c2f2c --- /dev/null +++ b/examples/tool_chat_template_mistral.jinja @@ -0,0 +1,86 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set 
loop_messages = messages %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %} + {%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} +{%- endfor %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if message["role"] == "user" %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- "[AVAILABLE_TOOLS] [" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- '{"type": "function", "function": {' }} + {%- for key, val in tool.items() if key != "return" %} + {%- if val is string %} + {{- '"' + key + '": "' + val + '"' }} + {%- else %} + {{- '"' + key + '": ' + val|tojson }} + {%- endif %} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- "}}" }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" }} + {%- endif %} + {%- endfor %} + {{- "[/AVAILABLE_TOOLS]" }} + {%- endif %} + {%- if loop.last and system_message is defined %} + {{- "[INST] " + system_message + "\n\n" + message["content"] + "[/INST]" }} + {%- else %} + {{- "[INST] " + message["content"] + "[/INST]" }} + {%- endif %} + {%- elif message["role"] == "tool_calls" or message.tool_calls is defined %} + {%- if message.tool_calls is defined %} + {%- set tool_calls = message.tool_calls %} + {%- else %} + {%- set tool_calls = message.content %} + {%- endif %} + {{- "[TOOL_CALLS] [" }} + {%- for tool_call in tool_calls %} + {%- set out = tool_call.function|tojson %} + {{- out[:-1] }} + {%- if not tool_call.id is defined or tool_call.id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (1)" + tool_call.id) }} + {%- endif %} + {{- ', "id": "' + tool_call.id[-9:] + '"}' }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" + eos_token }} + {%- endif %} + {%- endfor %} + {%- elif message["role"] == "assistant" %} + {{- " " + message["content"] + eos_token }} + {%- elif message["role"] == "tool_results" or message["role"] == "tool" %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }} + {%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! 
(2)" + message.tool_call_id) }} + {%- endif %} + {{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} diff --git a/examples/tool_chat_template_mistral_parallel.jinja b/examples/tool_chat_template_mistral_parallel.jinja new file mode 100644 index 0000000000000..a294cbfd026be --- /dev/null +++ b/examples/tool_chat_template_mistral_parallel.jinja @@ -0,0 +1,94 @@ +{%- if messages[0]["role"] == "system" %} + {%- set system_message = messages[0]["content"] %} + {%- set loop_messages = messages[1:] %} +{%- else %} + {%- set loop_messages = messages %} +{%- endif %} +{%- if not tools is defined %} + {%- set tools = none %} +{%- endif %} +{%- if tools is defined %} + {%- set parallel_tool_prompt = "You are a helpful assistant that can call tools. If you call one or more tools, format them in a single JSON array or objects, where each object is a tool call, not as separate objects outside of an array or multiple arrays. Use the format [{\"name\": tool call name, \"arguments\": tool call arguments}, additional tool calls] if you call more than one tool. If you call tools, do not attempt to interpret them or otherwise provide a response until you receive a tool call result that you can interpret for the user." %} + {%- if system_message is defined %} + {%- set system_message = parallel_tool_prompt + "\n\n" + system_message %} + {%- else %} + {%- set system_message = parallel_tool_prompt %} + {%- endif %} +{%- endif %} +{%- set user_messages = loop_messages | selectattr("role", "equalto", "user") | list %} + +{%- for message in loop_messages | rejectattr("role", "equalto", "tool") | rejectattr("role", "equalto", "tool_results") | selectattr("tool_calls", "undefined") %} + {%- if (message["role"] == "user") != (loop.index0 % 2 == 0) %} + {{- raise_exception("After the optional system message, conversation roles must alternate user/assistant/user/assistant/...") }} + {%- endif %} +{%- endfor %} + +{{- bos_token }} +{%- for message in loop_messages %} + {%- if message["role"] == "user" %} + {%- if tools is not none and (message == user_messages[-1]) %} + {{- "[AVAILABLE_TOOLS] [" }} + {%- for tool in tools %} + {%- set tool = tool.function %} + {{- '{"type": "function", "function": {' }} + {%- for key, val in tool.items() if key != "return" %} + {%- if val is string %} + {{- '"' + key + '": "' + val + '"' }} + {%- else %} + {{- '"' + key + '": ' + val|tojson }} + {%- endif %} + {%- if not loop.last %} + {{- ", " }} + {%- endif %} + {%- endfor %} + {{- "}}" }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" }} + {%- endif %} + {%- endfor %} + {{- "[/AVAILABLE_TOOLS]" }} + {%- endif %} + {%- if loop.last and system_message is defined %} + {{- "[INST] " + system_message + "\n\n" + message["content"] + "[/INST]" }} + {%- else %} + {{- "[INST] " + message["content"] + "[/INST]" }} + {%- endif %} + {%- elif message["role"] == "tool_calls" or message.tool_calls is defined %} + {%- if message.tool_calls is defined %} + {%- set tool_calls = message.tool_calls %} + {%- else %} + {%- set tool_calls = message.content %} + {%- endif %} + {{- "[TOOL_CALLS] [" }} + {%- for tool_call in tool_calls %} + {%- set out = tool_call.function|tojson %} + {{- out[:-1] }} + {%- if not tool_call.id is defined or tool_call.id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! 
(1)" + tool_call.id) }} + {%- endif %} + {{- ', "id": "' + tool_call.id[-9:] + '"}' }} + {%- if not loop.last %} + {{- ", " }} + {%- else %} + {{- "]" + eos_token }} + {%- endif %} + {%- endfor %} + {%- elif message["role"] == "assistant" %} + {{- " " + message["content"] + eos_token }} + {%- elif message["role"] == "tool_results" or message["role"] == "tool" %} + {%- if message.content is defined and message.content.content is defined %} + {%- set content = message.content.content %} + {%- else %} + {%- set content = message.content %} + {%- endif %} + {{- '[TOOL_RESULTS] {"content": ' + content|string + ", " }} + {%- if not message.tool_call_id is defined or message.tool_call_id|length < 9 %} + {{- raise_exception("Tool call IDs should be alphanumeric strings with length >= 9! (2)" + message.tool_call_id) }} + {%- endif %} + {{- '"call_id": "' + message.tool_call_id[-9:] + '"}[/TOOL_RESULTS]' }} + {%- else %} + {{- raise_exception("Only user and assistant roles are supported, with the exception of an initial optional system message!") }} + {%- endif %} +{%- endfor %} diff --git a/requirements-common.txt b/requirements-common.txt index 4c5b681a0d5ab..e430753357ca0 100644 --- a/requirements-common.txt +++ b/requirements-common.txt @@ -9,7 +9,7 @@ tokenizers >= 0.19.1 # Required for Llama 3. protobuf # Required by LlamaTokenizer. fastapi aiohttp -openai >= 1.0 # Ensure modern openai package (ensure types module present) +openai >= 1.40.0 # Ensure modern openai package (ensure types module present) uvicorn[standard] pydantic >= 2.8 # Required for OpenAI server. pillow # Required for image processing @@ -20,6 +20,7 @@ lm-format-enforcer == 0.10.6 outlines >= 0.0.43, < 0.1 # Requires torch >= 2.1.0 typing_extensions >= 4.10 filelock >= 3.10.4 # filelock starts to support `mode` argument from 3.10.4 +partial-json-parser # used for parsing partial JSON outputs pyzmq msgspec gguf == 0.9.1 diff --git a/tests/basic_correctness/test_chunked_prefill.py b/tests/basic_correctness/test_chunked_prefill.py index a63ac380e8598..9c34b2a13fd53 100644 --- a/tests/basic_correctness/test_chunked_prefill.py +++ b/tests/basic_correctness/test_chunked_prefill.py @@ -16,18 +16,6 @@ "facebook/opt-125m", "meta-llama/Llama-2-7b-hf", ] -E5M2_KV_MODELS = [ - "facebook/opt-125m", - "meta-llama/Llama-2-7b-chat-hf", -] -E4M3_KV_MODELS = [ - "meta-llama/Llama-2-7b-chat-hf", "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", - "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme" -] -KV_CACHE_QUANTIZATION_PATHS = { - "meta-llama/Llama-2-7b-chat-hf": - "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json" -} @pytest.mark.parametrize("model", MODELS) @@ -78,10 +66,10 @@ def test_models( ) -@pytest.mark.parametrize("kv_cache_dtype,model", - [("fp8_e5m2", m) - for m in E5M2_KV_MODELS] + [("fp8_e4m3", m) - for m in E4M3_KV_MODELS]) +@pytest.mark.parametrize( + "kv_cache_dtype,model", + [("fp8_e4m3", + "nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme")]) # Due to low-precision numerical divergence, we only test logprob of 4 tokens @pytest.mark.parametrize("max_tokens", [4]) @pytest.mark.parametrize("chunked_prefill_token_size", [4, 16]) @@ -104,30 +92,15 @@ def test_models_with_fp8_kv_cache( disable_async_output_proc: bool, ) -> None: """ - Only checks log probs match between chunked-prefill and - non-chunked-prefill version of vLLM model runner. - - This test is used when there is discrepancy in kernels - / numerics (e.g. when using lower-precision types like FP8). 
+ Check output logprobs match between no_chunked_prefill and chunked_prefill + with fp8 kv cache. General fp8 kv-cache tests are covered in test_fp8.py, + so here we only check chunked prefill. """ NUM_LOG_PROBS = 8 - if model == "facebook/opt-125m": - pytest.skip( - "#7378: CUDA illegal memory access (undiagnosed) facebook/opt-125m" - ) - if ((model, kv_cache_dtype, chunked_prefill_token_size) == ( - "nm-testing/Qwen2-1.5B-Instruct-FP8-K-V", "fp8_e4m3", 4)): - pytest.skip("flakey test, see: #7874 #8051") - max_num_seqs = chunked_prefill_token_size max_num_batched_tokens = chunked_prefill_token_size - extra_kwargs = {} - if model in KV_CACHE_QUANTIZATION_PATHS: - extra_kwargs["quantization_param_path"] = KV_CACHE_QUANTIZATION_PATHS[ - model] - with vllm_runner( model, tensor_parallel_size=tensor_parallel_size, @@ -135,7 +108,6 @@ def test_models_with_fp8_kv_cache( max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, ) as vllm_model: no_chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) @@ -149,7 +121,6 @@ def test_models_with_fp8_kv_cache( max_num_seqs=max_num_seqs, kv_cache_dtype=kv_cache_dtype, disable_async_output_proc=disable_async_output_proc, - **extra_kwargs, ) as vllm_model: chunked_prefill_outputs = vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, NUM_LOG_PROBS) diff --git a/tests/distributed/test_pipeline_parallel.py b/tests/distributed/test_pipeline_parallel.py index 4d54e43d5788c..637d2b30f6b1f 100644 --- a/tests/distributed/test_pipeline_parallel.py +++ b/tests/distributed/test_pipeline_parallel.py @@ -18,23 +18,26 @@ VLLM_MULTI_NODE = os.getenv("VLLM_MULTI_NODE", "0") == "1" -@pytest.mark.parametrize(("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, " - "MODEL_NAME, DIST_BACKEND"), - [ - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "mp"), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), - (1, 3, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), - (1, 4, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"), - (1, 4, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), - (2, 2, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), - (2, 2, 0, 1, "meta-llama/Meta-Llama-3-8B", "ray"), - ]) +@pytest.mark.parametrize( + ("TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, TRUST_REMOTE_CODE, " + "MODEL_NAME, DIST_BACKEND"), + [ + (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), + (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), + (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), + (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "mp"), + (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "mp"), + (1, 3, 0, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), + (1, 4, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), + (1, 4, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), + (2, 2, 1, 0, 0, "meta-llama/Meta-Llama-3-8B", "ray"), + (2, 2, 0, 1, 0, "meta-llama/Meta-Llama-3-8B", "ray"), + (2, 2, 1, 1, 1, "internlm/internlm2_5-7b-chat", "ray"), + ], +) @fork_new_process_for_each_test -def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, - DIST_BACKEND): +def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, + TRUST_REMOTE_CODE, MODEL_NAME, DIST_BACKEND): if VLLM_MULTI_NODE and DIST_BACKEND == "mp": pytest.skip("Skipping multi-node pipeline parallel test for " "multiprocessing distributed backend") 
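For reference, a minimal standalone sketch of the engine configuration the chunked-prefill fp8 KV-cache check above exercises, driven through the offline LLM API; the prompt text and token counts are illustrative assumptions, and the model is the one kept in the test parametrization.

# Minimal sketch (not part of the diff) of the knobs test_models_with_fp8_kv_cache
# exercises: an fp8 KV cache combined with chunked prefill.
from vllm import LLM, SamplingParams

llm = LLM(
    model="nm-testing/TinyLlama-1.1B-compressed-tensors-kv-cache-scheme",
    kv_cache_dtype="fp8_e4m3",       # quantize the KV cache to fp8_e4m3
    enable_chunked_prefill=True,     # split long prefills into chunks
    max_num_batched_tokens=16,       # mirrors chunked_prefill_token_size in the test
    max_num_seqs=16,
)

# Greedy decoding with logprobs, as in the test (4 tokens, 8 logprobs).
params = SamplingParams(temperature=0, max_tokens=4, logprobs=8)
outputs = llm.generate(["Hello, my name is"], params)
print(outputs[0].outputs[0].text)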
@@ -71,6 +74,9 @@ def test_compare_tp(TP_SIZE, PP_SIZE, EAGER_MODE, CHUNKED_PREFILL, MODEL_NAME, if EAGER_MODE: pp_args.append("--enforce-eager") tp_args.append("--enforce-eager") + if TRUST_REMOTE_CODE: + pp_args.append("--trust-remote-code") + tp_args.append("--trust-remote-code") pp_env = None if (DIST_BACKEND == "ray" and TP_SIZE == 2 and PP_SIZE == 2 and CHUNKED_PREFILL): diff --git a/tests/entrypoints/llm/test_generate.py b/tests/entrypoints/llm/test_generate.py index c426e9b4ee899..ef34bebbb0f8c 100644 --- a/tests/entrypoints/llm/test_generate.py +++ b/tests/entrypoints/llm/test_generate.py @@ -6,6 +6,7 @@ from vllm import LLM, RequestOutput, SamplingParams from ...conftest import cleanup +from ..openai.test_vision import TEST_IMAGE_URLS MODEL_NAME = "facebook/opt-125m" @@ -159,3 +160,36 @@ def test_chat(): ] outputs = llm.chat(messages) assert len(outputs) == 1 + + +@pytest.mark.parametrize("image_urls", + [[TEST_IMAGE_URLS[0], TEST_IMAGE_URLS[1]]]) +def test_chat_multi_image(image_urls: List[str]): + llm = LLM( + model="microsoft/Phi-3.5-vision-instruct", + dtype="bfloat16", + max_model_len=4096, + max_num_seqs=5, + enforce_eager=True, + trust_remote_code=True, + limit_mm_per_prompt={"image": 2}, + ) + + messages = [{ + "role": + "user", + "content": [ + *({ + "type": "image_url", + "image_url": { + "url": image_url + } + } for image_url in image_urls), + { + "type": "text", + "text": "What's in this image?" + }, + ], + }] + outputs = llm.chat(messages) + assert len(outputs) >= 0 diff --git a/tests/entrypoints/test_chat_utils.py b/tests/entrypoints/test_chat_utils.py index 53f99189beb1c..6ded5102c9314 100644 --- a/tests/entrypoints/test_chat_utils.py +++ b/tests/entrypoints/test_chat_utils.py @@ -1,11 +1,14 @@ import warnings +from typing import Optional import pytest from PIL import Image from vllm.assets.image import ImageAsset from vllm.config import ModelConfig -from vllm.entrypoints.chat_utils import parse_chat_messages +from vllm.entrypoints.chat_utils import (parse_chat_messages, + parse_chat_messages_futures) +from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import encode_image_base64 from vllm.transformers_utils.tokenizer_group import TokenizerGroup @@ -42,10 +45,28 @@ def image_url(): return f"data:image/jpeg;base64,{base64}" -@pytest.mark.asyncio -async def test_parse_chat_messages_with_image_url(phi3v_model_config, - phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +def _assert_mm_data_is_image_input( + mm_data: Optional[MultiModalDataDict], + image_count: int, +) -> None: + assert mm_data is not None + assert set(mm_data.keys()) == {"image"} + + image_data = mm_data.get("image") + assert image_data is not None + + if image_count == 1: + assert isinstance(image_data, Image.Image) + else: + assert isinstance(image_data, list) and len(image_data) == image_count + + +def test_parse_chat_messages_single_image( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -63,15 +84,42 @@ async def test_parse_chat_messages_with_image_url(phi3v_model_config, "role": "user", "content": "<|image_1|>\nWhat's in the image?" 
}] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert isinstance(mm_data["image"], Image.Image) + _assert_mm_data_is_image_input(mm_data, 1) @pytest.mark.asyncio -async def test_parse_chat_messages_multiple_images(phi3v_model_config, - phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +async def test_parse_chat_messages_single_image_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_future = parse_chat_messages_futures([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in the image?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": "user", + "content": "<|image_1|>\nWhat's in the image?" + }] + _assert_mm_data_is_image_input(await mm_future, 1) + + +def test_parse_chat_messages_multiple_images( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -96,15 +144,49 @@ async def test_parse_chat_messages_multiple_images(phi3v_model_config, "content": "<|image_1|>\n<|image_2|>\nWhat's in these images?" }] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert len(mm_data["image"]) == 2 + _assert_mm_data_is_image_input(mm_data, 2) @pytest.mark.asyncio -async def test_parse_chat_messages_placeholder_already_in_prompt( - phi3v_model_config, phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +async def test_parse_chat_messages_multiple_images_async( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_future = parse_chat_messages_futures([{ + "role": + "user", + "content": [{ + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "image_url", + "image_url": { + "url": image_url + } + }, { + "type": "text", + "text": "What's in these images?" + }] + }], phi3v_model_config, phi3v_tokenizer) + + assert conversation == [{ + "role": + "user", + "content": + "<|image_1|>\n<|image_2|>\nWhat's in these images?" + }] + _assert_mm_data_is_image_input(await mm_future, 2) + + +def test_parse_chat_messages_placeholder_already_in_prompt( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -131,15 +213,15 @@ async def test_parse_chat_messages_placeholder_already_in_prompt( "content": "What's in <|image_1|> and how does it compare to <|image_2|>?" }] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert len(mm_data["image"]) == 2 + _assert_mm_data_is_image_input(mm_data, 2) -@pytest.mark.asyncio -async def test_parse_chat_messages_placeholder_one_already_in_prompt( - phi3v_model_config, phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +def test_parse_chat_messages_placeholder_one_already_in_prompt( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -167,15 +249,15 @@ async def test_parse_chat_messages_placeholder_one_already_in_prompt( "<|image_2|>\nWhat's in <|image_1|> and how does it compare to the " "other one?" 
}] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert len(mm_data["image"]) == 2 + _assert_mm_data_is_image_input(mm_data, 2) -@pytest.mark.asyncio -async def test_parse_chat_messages_multiple_images_across_messages( - phi3v_model_config, phi3v_tokenizer, image_url): - conversation, mm_future = parse_chat_messages([{ +def test_parse_chat_messages_multiple_images_across_messages( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): + conversation, mm_data = parse_chat_messages([{ "role": "user", "content": [{ @@ -218,14 +300,14 @@ async def test_parse_chat_messages_multiple_images_across_messages( "content": "<|image_2|>\nWhat about this one?" }, ] - mm_data = await mm_future - assert set(mm_data.keys()) == {"image"} - assert len(mm_data["image"]) == 2 + _assert_mm_data_is_image_input(mm_data, 2) -@pytest.mark.asyncio -async def test_parse_chat_messages_rejects_too_many_images_in_one_message( - phi3v_model_config, phi3v_tokenizer, image_url): +def test_parse_chat_messages_rejects_too_many_images_in_one_message( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", @@ -259,9 +341,11 @@ async def test_parse_chat_messages_rejects_too_many_images_in_one_message( }], phi3v_model_config, phi3v_tokenizer) -@pytest.mark.asyncio -async def test_parse_chat_messages_rejects_too_many_images_across_messages( - phi3v_model_config, phi3v_tokenizer, image_url): +def test_parse_chat_messages_rejects_too_many_images_across_messages( + phi3v_model_config, + phi3v_tokenizer, + image_url, +): with warnings.catch_warnings(): warnings.filterwarnings( "ignore", diff --git a/tests/kernels/test_flashinfer.py b/tests/kernels/test_flashinfer.py index 67f12cf1ee08e..696cc0c6cdf10 100644 --- a/tests/kernels/test_flashinfer.py +++ b/tests/kernels/test_flashinfer.py @@ -445,7 +445,8 @@ def test_flashinfer_decode_with_paged_fp8_kv( head_size, block_size, "NONE", - data_type=dtype) + data_type=dtype, + q_data_type=dtype) output = wrapper.forward(query, kv_cache_fp8, logits_soft_cap=soft_cap, diff --git a/tests/lora/test_gemma.py b/tests/lora/test_gemma.py index 709246179bfe4..f7c1d4f041c12 100644 --- a/tests/lora/test_gemma.py +++ b/tests/lora/test_gemma.py @@ -1,7 +1,10 @@ from typing import List +import pytest + import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hip MODEL_PATH = "google/gemma-7b" @@ -10,7 +13,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: prompts = [ "Quote: Imagination is", "Quote: Be yourself;", - "Quote: So many books,", + "Quote: Painting is poetry that is seen rather than felt,", ] sampling_params = vllm.SamplingParams(temperature=0, max_tokens=32) outputs = llm.generate( @@ -28,6 +31,7 @@ def do_sample(llm: vllm.LLM, lora_path: str, lora_id: int) -> List[str]: return generated_texts +@pytest.mark.xfail(is_hip(), reason="There can be output mismatch on ROCm") def test_gemma_lora(gemma_lora_files): llm = vllm.LLM(MODEL_PATH, max_model_len=1024, @@ -37,7 +41,8 @@ def test_gemma_lora(gemma_lora_files): expected_lora_output = [ "more important than knowledge.\nAuthor: Albert Einstein\n", "everyone else is already taken.\nAuthor: Oscar Wilde\n", - "so little time\nAuthor: Frank Zappa\n", + "and poetry is painting that is felt rather than seen.\n" + "Author: Leonardo da Vinci\n", ] output1 = do_sample(llm, gemma_lora_files, lora_id=1) diff --git a/tests/lora/test_quant_model.py b/tests/lora/test_quant_model.py index 2370c693e9534..133e0d4514a6d 
100644 --- a/tests/lora/test_quant_model.py +++ b/tests/lora/test_quant_model.py @@ -7,6 +7,7 @@ import vllm from vllm.lora.request import LoRARequest +from vllm.utils import is_hip from .conftest import cleanup @@ -17,12 +18,23 @@ class ModelWithQuantization: quantization: str -MODELS: List[ModelWithQuantization] = [ - ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", - quantization="AWQ"), - ModelWithQuantization(model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", - quantization="GPTQ"), -] +MODELS: List[ModelWithQuantization] +#AWQ quantization is currently not supported in ROCm. +if is_hip(): + MODELS = [ + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + quantization="GPTQ"), + ] +else: + MODELS = [ + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-AWQ", + quantization="AWQ"), + ModelWithQuantization( + model_path="TheBloke/TinyLlama-1.1B-Chat-v0.3-GPTQ", + quantization="GPTQ"), + ] def do_sample(llm: vllm.LLM, diff --git a/tests/models/test_fp8.py b/tests/models/test_fp8.py index 4ab968c01da04..17acdb52322fd 100644 --- a/tests/models/test_fp8.py +++ b/tests/models/test_fp8.py @@ -3,116 +3,97 @@ Note: these tests will only pass on L4 GPU. """ import os -from typing import List +from typing import Optional import pytest -import torch -from transformers import AutoTokenizer +from tests.kernels.utils import override_backend_env_variable from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", - "meta-llama/Meta-Llama-3-8B-Instruct", -] +from ..models.utils import check_logprobs_close -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) process information in distinct ways, with both', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, nemuri no' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system made up of several basic components that work together to enable it to', - 'Zeta-5, a highly advanced robot designed for menial labor, had never experienced anything like', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya kotori wa mushi o tsuk' - ] - }, - "meta-llama/Meta-Llama-3-8B-Instruct": { - "auto": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'In the vast, sterile laboratory, Robot 3456-Alpha, or "Alpha" for short', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki wa mushi o tsukamu' - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the year 2154, robotics engineer Dr. Rachel Kim had spent years perfecting her latest', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya tori, mushi o tsukamu' - ] - }, -} +os.environ["TOKENIZERS_PARALLELISM"] = "true" -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. -# Disabled to prevent it from breaking the build -@pytest.mark.skip( - reason= - "Prevent unstable test based on golden strings from breaking the build.") @pytest.mark.skipif(not is_quant_method_supported("fp8"), reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -def test_models(example_prompts, model_name, kv_cache_dtype) -> None: - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - enforce_eager=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) +@pytest.mark.parametrize( + "kv_cache_dtype,base_model,test_model,scale_path", + [ + # Test FP8 checkpoint w. fp8_e4m3 kv-cache scaling factors. + ("fp8_e4m3", "meta-llama/Meta-Llama-3-8B-Instruct", + "nm-testing/Meta-Llama-3-8B-Instruct-FP8-KV", None), + # Test FP16 checkpoint w. fp8_e5m2 kv-cache. + ("fp8_e5m2", "meta-llama/Meta-Llama-3-8B-Instruct", + "meta-llama/Meta-Llama-3-8B-Instruct", None), + # Test FP16 checkpoint w. fp8_e4m3 kv-cache scaling factors in json. 
+ ("fp8_e4m3", "meta-llama/Llama-2-7b-chat-hf", + "meta-llama/Llama-2-7b-chat-hf", + "./tests/fp8_kv/llama2-7b-fp8-kv/kv_cache_scales.json") + ]) +# Due to low-precision numerical divergence, we only test logprob of 4 tokens +@pytest.mark.parametrize("max_tokens", [4]) +@pytest.mark.parametrize("enforce_eager", [False, True]) +@pytest.mark.parametrize("backend", ["FLASH_ATTN", "XFORMERS", "FLASHINFER"]) +# NOTE: Increasing this in this suite will fail CI because we currently cannot +# reset distributed env properly. Use a value > 1 just when you test. +@pytest.mark.parametrize("tensor_parallel_size", [1]) +# Due to low-precision numerical divergence, this test is too sensitive for +# the async postprocessor +@pytest.mark.parametrize("disable_async_output_proc", [True]) +def test_models( + vllm_runner, + example_prompts, + kv_cache_dtype: str, + base_model: str, + test_model: str, + scale_path: Optional[str], + max_tokens: int, + enforce_eager: bool, + backend: str, + tensor_parallel_size: int, + disable_async_output_proc: bool, + monkeypatch, +) -> None: + """ + Only checks log probs match to cover the discrepancy in + numerical sensitive kernels. + """ + override_backend_env_variable(monkeypatch, backend) + + MAX_MODEL_LEN = 1024 + NUM_LOG_PROBS = 8 + + with vllm_runner( + base_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype="auto", + disable_async_output_proc=disable_async_output_proc, + ) as vllm_model: + baseline_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - tokenizer = AutoTokenizer.from_pretrained(model_name) - formatted_prompts = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - tokenize=False, - add_generation_prompt=True) - for prompt in example_prompts - ] + extra_kwargs = {} + if scale_path is not None: + extra_kwargs["quantization_param_path"] = scale_path - params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] - # Note: these need to be run 1 at a time due to numerical precision, - # since the expected strs were generated this way. 
- for prompt in formatted_prompts: - outputs = model.generate(prompt, params) - generations.append(outputs[0].outputs[0].text) - del model + with vllm_runner( + test_model, + max_model_len=MAX_MODEL_LEN, + tensor_parallel_size=tensor_parallel_size, + enforce_eager=enforce_eager, + kv_cache_dtype=kv_cache_dtype, + disable_async_output_proc=disable_async_output_proc, + **extra_kwargs, + ) as vllm_model: + test_outputs = vllm_model.generate_greedy_logprobs( + example_prompts, max_tokens, NUM_LOG_PROBS) - print(model_name, kv_cache_dtype, generations) - expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] - for i in range(len(example_prompts)): - generated_str = generations[i] - expected_str = expected_strs[i] - assert expected_str == generated_str, ( - f"Test{i}:\nExpected: {expected_str!r}\nvLLM: {generated_str!r}") + check_logprobs_close( + outputs_0_lst=baseline_outputs, + outputs_1_lst=test_outputs, + name_0="fp16_kv_cache", + name_1="fp8_kv_cache", + ) diff --git a/tests/models/test_fp8kv_flashinfer.py b/tests/models/test_fp8kv_flashinfer.py deleted file mode 100644 index ff2a44162b6c3..0000000000000 --- a/tests/models/test_fp8kv_flashinfer.py +++ /dev/null @@ -1,96 +0,0 @@ -# flake8: noqa -"""Tests fp8 models against ground truth generation -This verifies the flashinfer backend with fp8 -quantization and fp8 KV Cache without scaling -factors Note: these tests will only pass on H100 GPU. -""" -import os -from typing import List - -import pytest -from transformers import AutoTokenizer - -from tests.quantization.utils import is_quant_method_supported -from vllm import LLM, SamplingParams - -os.environ["TOKENIZERS_PARALLELISM"] = "true" - -MAX_MODEL_LEN = 1024 - -MODELS = [ - "nm-testing/Meta-Llama-3-8B-Instruct-FP8", -] - -EXPECTED_STRS_MAP = { - "nm-testing/Meta-Llama-3-8B-Instruct-FP8": { - "auto": [ - 'LLaMA is a high-throughput and memory-efficient inference and serving engine for Large Language Models (', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, consisting of interconnected nodes or "ne', - 'In the sterile, metallic halls of the robotics lab, a peculiar phenomenon occurred. Zeta-5', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. The', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, mushi o', - ], - "fp8": [ - 'LLM (Large Language Model) is a type of artificial intelligence (AI) model that is trained', - 'Here are the major milestones in the development of artificial intelligence (AI) from 1950 to ', - 'Artificial intelligence (AI) and human intelligence (HI) differ significantly in how they process information.', - 'A neural network is a complex system modeled after the human brain, composed of interconnected nodes or "ne', - 'Zeta-5, a highly advanced robot designed for menial labor, whirred and beep', - 'The COVID-19 pandemic has had a profound impact on global economic structures and future business models. 
Here', - 'The Mona Lisa, painted by Leonardo da Vinci in the early 16th century, is one of', - 'Here are the translations:\n\n**Japanese:** (Haya aki no tori, guri o', - ] - } -} - - -# This test compares against golden strings for exact match since -# there is no baseline implementation to compare against -# and is unstable w.r.t specifics of the fp8 implementation or -# the hardware being run on. -# No assert to prevent it from breaking the build -@pytest.mark.skipif(not is_quant_method_supported("fp8"), - reason="fp8 is not supported on this GPU type.") -@pytest.mark.parametrize("model_name", MODELS) -@pytest.mark.parametrize("kv_cache_dtype", ["auto", "fp8"]) -@pytest.mark.parametrize("backend", ["XFORMERS", "FLASHINFER"]) -def test_models(example_prompts, model_name, kv_cache_dtype, backend) -> None: - # Note that the golden strings may not work for FLASHINFER Backend. - # The intention is to test the path - os.environ["VLLM_ATTENTION_BACKEND"] = backend - model = LLM(model=model_name, - max_model_len=MAX_MODEL_LEN, - trust_remote_code=True, - quantization="fp8", - kv_cache_dtype=kv_cache_dtype) - - tokenizer = AutoTokenizer.from_pretrained(model_name) - formatted_prompts = [ - tokenizer.apply_chat_template([{ - "role": "user", - "content": prompt - }], - tokenize=False, - add_generation_prompt=True) - for prompt in example_prompts - ] - - params = SamplingParams(max_tokens=20, temperature=0) - generations: List[str] = [] - # Note: these need to be run 1 at a time due to numerical precision, - # since the expected strs were generated this way. - for prompt in formatted_prompts: - outputs = model.generate(prompt, params) - generations.append(outputs[0].outputs[0].text) - del model - - print(f"Testing: {model_name} with kv_cache_dtype: {kv_cache_dtype}") - expected_strs = EXPECTED_STRS_MAP[model_name][kv_cache_dtype] - for i in range(len(example_prompts)): - generated_str = generations[i] - expected_str = expected_strs[i] - print(f"generated_str\n: {generated_str}") - print(f"expected_str\n: {expected_str}") diff --git a/tests/models/test_qwen.py b/tests/models/test_qwen.py index 0f974fcc1885c..05f5cbf8c3435 100644 --- a/tests/models/test_qwen.py +++ b/tests/models/test_qwen.py @@ -1,48 +1,165 @@ -from typing import Type +import pathlib +from typing import List, Optional, Type import pytest -from ..conftest import HfRunner, VllmRunner +from vllm.multimodal.utils import rescale_image_size + +from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets from .utils import check_logprobs_close -models = ["qwen/qwen-vl"] +pytestmark = pytest.mark.vlm +text_only_models = [ + "Qwen/Qwen-7B-Chat" # Has no visual component +] -@pytest.mark.parametrize("dtype", ["half"]) -@pytest.mark.parametrize("max_tokens", [32]) -@pytest.mark.parametrize("num_logprobs", [5]) -@pytest.mark.parametrize("model", models) -def test_text_only_qwen_model( +multimodal_models = ["Qwen/Qwen-VL"] + +HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({ + "stop_sign": + "Picture 1: \nWhat's the content of the image?: ", + "cherry_blossom": + "Picture 1: \nWhat is the season?: ", +}) + + +### Tests for multimodal Qwen models +def run_test( + tmp_path: pathlib.PosixPath, hf_runner: Type[HfRunner], vllm_runner: Type[VllmRunner], - example_prompts, + image_assets: _ImageAssets, model: str, *, + size_factors: List[float], dtype: str, max_tokens: int, num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, ): - # This test checks language inputs only, since the visual component - # 
for qwen-vl is still unsupported in VLLM. In the near-future, the - # implementation and this test will be extended to consider - # visual inputs as well. + """Inference result should be the same between hf and vllm. + + All the image fixtures for the test is under tests/images. + For huggingface runner, we provide the PIL images as input. + For vllm runner, we provide MultiModalDataDict objects + and corresponding MultiModalConfig as input. + Note, the text input is also adjusted to abide by vllm contract. + The text output is sanitized to be able to compare with hf. + """ + images = [asset.pil_image for asset in image_assets] + + # Export the images to a tempdir and substitute it into the hf prompt; + # the contents between / will be ignored by VLLM, but the + # transformers implementation for the visual transformer parses this to + # reload it in the forward call; the contents are treated as a URL or a + # local path. + for idx, asset in enumerate(image_assets): + image_tmp_path = tmp_path / f"{asset.name}.jpg" + asset.pil_image.save(image_tmp_path) + HF_IMAGE_PROMPTS[idx] = HF_IMAGE_PROMPTS[idx].replace( + "", f"{image_tmp_path}") + + inputs_per_image = [( + [prompt for _ in size_factors], + [rescale_image_size(image, factor) for factor in size_factors], + ) for image, prompt in zip(images, HF_IMAGE_PROMPTS)] + + # NOTE: take care of the order. run vLLM first, and then run HF. + # vLLM needs a fresh new process without cuda initialization. + # if we run HF first, the cuda initialization will be done and it + # will hurt multiprocessing backend with fork method (the default method). + + # max_model_len should be greater than image_feature_size + # Qwen encodes images into a fixed content size of 256 + with vllm_runner(model, + max_model_len=300, + max_num_seqs=1, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True) as vllm_model: + vllm_outputs_per_image = [ + vllm_model.generate_greedy_logprobs(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + with hf_runner(model, dtype=dtype) as hf_model: - hf_outputs = hf_model.generate_greedy_logprobs_limit( - example_prompts, - max_tokens, - num_logprobs=num_logprobs, + hf_outputs_per_image = [ + hf_model.generate_greedy_logprobs_limit(prompts, + max_tokens, + num_logprobs=num_logprobs, + images=images) + for prompts, images in inputs_per_image + ] + + for hf_outputs, vllm_outputs in zip(hf_outputs_per_image, + vllm_outputs_per_image): + + check_logprobs_close( + outputs_0_lst=hf_outputs, + outputs_1_lst=vllm_outputs, + name_0="hf", + name_1="vllm", ) + +@pytest.mark.parametrize("model", multimodal_models) +@pytest.mark.parametrize( + "size_factors", + [ + # No image + [], + # Single-scale + [1.0], + # Single-scale, batched + [1.0, 1.0, 1.0], + # Multi-scale + [0.25, 0.5, 1.0], + ], +) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [8]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_multimodal_models(tmp_path, hf_runner, vllm_runner, image_assets, + model, size_factors, dtype, max_tokens, + num_logprobs) -> None: + run_test( + tmp_path, + hf_runner, + vllm_runner, + image_assets, + model, + size_factors=size_factors, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) + + +# Ensure that a text-only Qwen model can still be loaded and +# used for inference in VLLM without throwing. 
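The text-only check referenced in the comment above only needs the model to load and generate without throwing; a rough standalone equivalent, with the prompt wording as an assumption, looks like this.

# Rough standalone sketch of the text-only smoke test below: load a Qwen chat
# checkpoint that has no visual component and confirm greedy generation runs.
from vllm import LLM, SamplingParams

llm = LLM(model="Qwen/Qwen-7B-Chat",
          dtype="bfloat16",
          trust_remote_code=True)   # first-generation Qwen repos ship custom code

outputs = llm.generate(
    ["Give me a short introduction to large language models."],  # assumed prompt
    SamplingParams(temperature=0, max_tokens=32, logprobs=5),
)
assert outputs and outputs[0].outputs[0].token_ids  # some tokens were produced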
+@pytest.mark.parametrize("model", text_only_models) +@pytest.mark.parametrize("dtype", ["bfloat16"]) +@pytest.mark.parametrize("max_tokens", [32]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_text_only_qwen_model_can_be_loaded_and_run( + vllm_runner: Type[VllmRunner], + example_prompts, + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, +): with vllm_runner(model, dtype=dtype) as vllm_model: - vllm_outputs = vllm_model.generate_greedy_logprobs( + vllm_model.generate_greedy_logprobs( example_prompts, max_tokens, num_logprobs=num_logprobs, ) - - check_logprobs_close( - outputs_0_lst=hf_outputs, - outputs_1_lst=vllm_outputs, - name_0="hf", - name_1="vllm", - ) diff --git a/tests/models/test_ultravox.py b/tests/models/test_ultravox.py index 23008f9b8b563..e98db9b65f484 100644 --- a/tests/models/test_ultravox.py +++ b/tests/models/test_ultravox.py @@ -16,37 +16,32 @@ AudioTuple = Tuple[np.ndarray, int] +VLLM_PLACEHOLDER = "<|reserved_special_token_0|>" +HF_PLACEHOLDER = "<|audio|>" + @pytest.fixture(scope="session") -def audio_and_sample_rate(): +def audio_assets(): from vllm.assets.audio import AudioAsset - return AudioAsset("mary_had_lamb").audio_and_sample_rate + return [AudioAsset("mary_had_lamb"), AudioAsset("winning_call")] -@pytest.fixture -def prompts_and_audios(audio_and_sample_rate): - tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) +@pytest.fixture(scope="module", params=("mary_had_lamb", "winning_call")) +def audio(request): + from vllm.assets.audio import AudioAsset + return AudioAsset(request.param) - vllm_placeholder = "<|reserved_special_token_0|>" - hf_placeholder = "<|audio|>" - question = "What's in the audio?" - vllm_prompt = tokenizer.apply_chat_template( - [{ - 'role': 'user', - 'content': f"{vllm_placeholder}\n{question}" - }], - tokenize=False, - add_generation_prompt=True) - hf_prompt = tokenizer.apply_chat_template( - [{ - 'role': 'user', - 'content': f"{hf_placeholder}\n{question}" - }], - tokenize=False, - add_generation_prompt=True) +def _get_prompt(audio_count, question, placeholder): + tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME) + placeholder = f"{placeholder}\n" * audio_count - return [(vllm_prompt, hf_prompt, audio_and_sample_rate)] + return tokenizer.apply_chat_template([{ + 'role': 'user', + 'content': f"{placeholder}{question}" + }], + tokenize=False, + add_generation_prompt=True) def vllm_to_hf_output(vllm_output: Tuple[List[int], str, @@ -134,15 +129,71 @@ def process(hf_inputs: BatchEncoding): ) +def run_multi_audio_test( + vllm_runner: Type[VllmRunner], + prompts_and_audios: List[Tuple[str, List[AudioTuple]]], + model: str, + *, + dtype: str, + max_tokens: int, + num_logprobs: int, + tensor_parallel_size: int, + distributed_executor_backend: Optional[str] = None, +): + with vllm_runner(model, + dtype=dtype, + tensor_parallel_size=tensor_parallel_size, + distributed_executor_backend=distributed_executor_backend, + enforce_eager=True, + limit_mm_per_prompt={ + "audio": + max((len(audio) for _, audio in prompts_and_audios)) + }) as vllm_model: + vllm_outputs = vllm_model.generate_greedy_logprobs( + [prompt for prompt, _ in prompts_and_audios], + max_tokens, + num_logprobs=num_logprobs, + audios=[audios for _, audios in prompts_and_audios]) + + # The HuggingFace model doesn't support multiple audios yet, so + # just assert that some tokens were generated. 
+ assert all(tokens for tokens, *_ in vllm_outputs) + + @pytest.mark.parametrize("dtype", ["half"]) @pytest.mark.parametrize("max_tokens", [128]) @pytest.mark.parametrize("num_logprobs", [5]) -def test_models(hf_runner, vllm_runner, prompts_and_audios, dtype: str, - max_tokens: int, num_logprobs: int) -> None: +def test_models(hf_runner, vllm_runner, audio, dtype: str, max_tokens: int, + num_logprobs: int) -> None: + + vllm_prompt = _get_prompt(1, "Describe the audio above.", VLLM_PLACEHOLDER) + hf_prompt = _get_prompt(1, "Describe the audio above.", HF_PLACEHOLDER) run_test( hf_runner, vllm_runner, - prompts_and_audios, + [(vllm_prompt, hf_prompt, audio.audio_and_sample_rate)], + MODEL_NAME, + dtype=dtype, + max_tokens=max_tokens, + num_logprobs=num_logprobs, + tensor_parallel_size=1, + ) + + +@pytest.mark.parametrize("dtype", ["half"]) +@pytest.mark.parametrize("max_tokens", [128]) +@pytest.mark.parametrize("num_logprobs", [5]) +def test_models_with_multiple_audios(vllm_runner, audio_assets, dtype: str, + max_tokens: int, + num_logprobs: int) -> None: + + vllm_prompt = _get_prompt(len(audio_assets), + "Describe each of the audios above.", + VLLM_PLACEHOLDER) + run_multi_audio_test( + vllm_runner, + [(vllm_prompt, [audio.audio_and_sample_rate + for audio in audio_assets])], MODEL_NAME, dtype=dtype, max_tokens=max_tokens, diff --git a/tests/tool_use/__init__.py b/tests/tool_use/__init__.py new file mode 100644 index 0000000000000..e69de29bb2d1d diff --git a/tests/tool_use/conftest.py b/tests/tool_use/conftest.py new file mode 100644 index 0000000000000..ab6a29eba1b3f --- /dev/null +++ b/tests/tool_use/conftest.py @@ -0,0 +1,32 @@ +import pytest +import pytest_asyncio +from huggingface_hub import snapshot_download + +from tests.utils import RemoteOpenAIServer + +from .utils import ARGS, CONFIGS, ServerConfig + + +# for each server config, download the model and return the config +@pytest.fixture(scope="session", params=CONFIGS.keys()) +def server_config(request): + config = CONFIGS[request.param] + # download model and tokenizer using transformers + snapshot_download(config["model"]) + yield CONFIGS[request.param] + + +# run this for each server config +@pytest.fixture(scope="session") +def server(request, server_config: ServerConfig): + model = server_config["model"] + args_for_model = server_config["arguments"] + with RemoteOpenAIServer(model, ARGS + args_for_model, + max_wait_seconds=480) as server: + yield server + + +@pytest_asyncio.fixture +async def client(server: RemoteOpenAIServer): + async with server.get_async_client() as async_client: + yield async_client diff --git a/tests/tool_use/test_chat_completions.py b/tests/tool_use/test_chat_completions.py new file mode 100644 index 0000000000000..038ff81d2b674 --- /dev/null +++ b/tests/tool_use/test_chat_completions.py @@ -0,0 +1,143 @@ +from typing import List + +import openai +import pytest + +from .utils import MESSAGES_WITHOUT_TOOLS, WEATHER_TOOL + + +# test: make sure chat completions without tools provided work even when tools +# are enabled. This makes sure tool call chat templates work, AND that the tool +# parser stream processing doesn't change the output of the model. 
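The tool-use suite below drives an OpenAI-compatible vLLM server through the official client; a condensed sketch of the request shape it checks, using the synchronous client, follows. The base URL, API key, and tool schema are assumptions, and the server is presumed to have been started with this PR's tool-calling support enabled.

# Condensed sketch (not part of the test suite) of the request shape the
# tool-use tests exercise, issued with the synchronous OpenAI client against
# a locally running vLLM server. Base URL, API key, and tool schema are
# assumptions for illustration.
from openai import OpenAI

client = OpenAI(base_url="http://localhost:8000/v1", api_key="dummy")
model = client.models.list().data[0].id

weather_tool = {
    "type": "function",
    "function": {
        "name": "get_current_weather",
        "description": "Get the current weather in a given city and state.",
        "parameters": {
            "type": "object",
            "properties": {
                "city": {"type": "string"},
                "state": {"type": "string"},
            },
            "required": ["city", "state"],
        },
    },
}

response = client.chat.completions.create(
    model=model,
    messages=[{"role": "user", "content": "What's the weather in Dallas, TX?"}],
    tools=[weather_tool],
    tool_choice="auto",
    temperature=0,
)

choice = response.choices[0]
if choice.finish_reason == "tool_calls":
    # The model asked to call a tool; arguments arrive as a JSON string.
    for call in choice.message.tool_calls:
        print(call.function.name, call.function.arguments)
else:
    # Plain chat answer, the case test_chat_completion_with_tools covers.
    print(choice.message.content)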
+@pytest.mark.asyncio +async def test_chat_completion_without_tools(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_WITHOUT_TOOLS, + temperature=0, + max_tokens=150, + model=model_name, + logprobs=False) + choice = chat_completion.choices[0] + stop_reason = chat_completion.choices[0].finish_reason + output_text = chat_completion.choices[0].message.content + + # check to make sure we got text + assert output_text is not None + assert len(output_text) > 0 + assert stop_reason != "tool_calls" + + # check to make sure no tool calls were returned + assert (choice.message.tool_calls is None + or len(choice.message.tool_calls) == 0) + + # make the same request, streaming + stream = await client.chat.completions.create( + messages=MESSAGES_WITHOUT_TOOLS, + temperature=0, + max_tokens=150, + model=model_name, + logprobs=False, + stream=True, + ) + chunks: List[str] = [] + finish_reason_count = 0 + role_sent: bool = False + + # assemble streamed chunks + async for chunk in stream: + delta = chunk.choices[0].delta + + # make sure the role is assistant + if delta.role: + assert not role_sent + assert delta.role == 'assistant' + role_sent = True + + if delta.content: + chunks.append(delta.content) + + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == choice.finish_reason + + # make sure tool call chunks aren't being streamed + assert not delta.tool_calls or len(delta.tool_calls) == 0 + + # make sure the role was sent, only 1 finish reason was sent, that chunks + # were in fact sent, and that the chunks match non-streaming + assert role_sent + assert finish_reason_count == 1 + assert len(chunks) + assert "".join(chunks) == output_text + + +# test: conversation with tools enabled and provided that should not invoke +# tools, to make sure we can still get normal chat completion responses +# and that they won't be parsed as tools +@pytest.mark.asyncio +async def test_chat_completion_with_tools(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_WITHOUT_TOOLS, + temperature=0, + max_tokens=150, + model=model_name, + tools=[WEATHER_TOOL], + logprobs=False) + choice = chat_completion.choices[0] + stop_reason = chat_completion.choices[0].finish_reason + output_text = chat_completion.choices[0].message.content + + # check to make sure we got text + assert output_text is not None + assert stop_reason != 'tool_calls' + assert len(output_text) > 0 + + # check to make sure no tool calls were returned + assert (choice.message.tool_calls is None + or len(choice.message.tool_calls) == 0) + + # make the same request, streaming + stream = await client.chat.completions.create( + messages=MESSAGES_WITHOUT_TOOLS, + temperature=0, + max_tokens=150, + model=model_name, + logprobs=False, + tools=[WEATHER_TOOL], + stream=True, + ) + + chunks: List[str] = [] + finish_reason_count = 0 + role_sent: bool = False + + # assemble streamed chunks + async for chunk in stream: + delta = chunk.choices[0].delta + + # make sure the role is assistant + if delta.role: + assert delta.role == 'assistant' + role_sent = True + + if delta.content: + chunks.append(delta.content) + + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + + # make sure tool call chunks aren't being streamed + assert 
not delta.tool_calls or len(delta.tool_calls) == 0 + + # make sure the role was sent, only 1 finish reason was sent, that chunks + # were in fact sent, and that the chunks match non-streaming + assert role_sent + assert finish_reason_count == 1 + assert chunk.choices[0].finish_reason == stop_reason + assert chunk.choices[0].finish_reason != 'tool_calls' + assert len(chunks) + assert "".join(chunks) == output_text diff --git a/tests/tool_use/test_parallel_tool_calls.py b/tests/tool_use/test_parallel_tool_calls.py new file mode 100644 index 0000000000000..b03b5a2075a6c --- /dev/null +++ b/tests/tool_use/test_parallel_tool_calls.py @@ -0,0 +1,193 @@ +import json +from typing import Dict, List, Optional + +import openai +import pytest + +from .utils import (MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, SEARCH_TOOL, + WEATHER_TOOL) + + +# test: getting the model to generate parallel tool calls (streaming/not) +# when requested. NOTE that not all models may support this, so some exclusions +# may be added in the future. e.g. llama 3.1 models are not designed to support +# parallel tool calls. +@pytest.mark.asyncio +async def test_parallel_tool_calls(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + temperature=0, + max_tokens=200, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False) + + choice = chat_completion.choices[0] + stop_reason = chat_completion.choices[0].finish_reason + non_streamed_tool_calls = chat_completion.choices[0].message.tool_calls + + # make sure 2 tool calls are present + assert choice.message.role == "assistant" + assert non_streamed_tool_calls is not None + assert len(non_streamed_tool_calls) == 2 + + for tool_call in non_streamed_tool_calls: + # make sure the tool includes a function and ID + assert tool_call.type == "function" + assert tool_call.function is not None + assert isinstance(tool_call.id, str) + assert len(tool_call.id) > 16 + + # make sure the weather tool was called correctly + assert tool_call.function.name == WEATHER_TOOL["function"]["name"] + assert isinstance(tool_call.function.arguments, str) + + parsed_arguments = json.loads(tool_call.function.arguments) + assert isinstance(parsed_arguments, Dict) + assert isinstance(parsed_arguments.get("city"), str) + assert isinstance(parsed_arguments.get("state"), str) + + assert stop_reason == "tool_calls" + + # make the same request, streaming + stream = await client.chat.completions.create( + model=model_name, + messages=MESSAGES_ASKING_FOR_PARALLEL_TOOLS, + temperature=0, + max_tokens=200, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False, + stream=True) + + role_name: Optional[str] = None + finish_reason_count: int = 0 + + tool_call_names: List[str] = [] + tool_call_args: List[str] = [] + tool_call_idx: int = -1 + tool_call_id_count: int = 0 + + async for chunk in stream: + + # if there's a finish reason make sure it's tools + if chunk.choices[0].finish_reason: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == 'tool_calls' + + # if a role is being streamed make sure it wasn't already set to + # something else + if chunk.choices[0].delta.role: + assert not role_name or role_name == 'assistant' + role_name = 'assistant' + + # if a tool call is streamed make sure there's exactly one + # (based on the request parameters + streamed_tool_calls = chunk.choices[0].delta.tool_calls + + 
if streamed_tool_calls and len(streamed_tool_calls) > 0: + + # make sure only one diff is present - correct even for parallel + assert len(streamed_tool_calls) == 1 + tool_call = streamed_tool_calls[0] + + # if a new tool is being called, set up empty arguments + if tool_call.index != tool_call_idx: + tool_call_idx = tool_call.index + tool_call_args.append("") + + # if a tool call ID is streamed, make sure one hasn't been already + if tool_call.id: + tool_call_id_count += 1 + assert (isinstance(tool_call.id, str) + and (len(tool_call.id) > 16)) + + # if parts of the function start being streamed + if tool_call.function: + # if the function name is defined, set it. it should be streamed + # IN ENTIRETY, exactly one time. + if tool_call.function.name: + assert isinstance(tool_call.function.name, str) + tool_call_names.append(tool_call.function.name) + + if tool_call.function.arguments: + # make sure they're a string and then add them to the list + assert isinstance(tool_call.function.arguments, str) + + tool_call_args[ + tool_call.index] += tool_call.function.arguments + + assert finish_reason_count == 1 + assert role_name == 'assistant' + + assert (len(non_streamed_tool_calls) == len(tool_call_names) == + len(tool_call_args)) + + for i in range(2): + assert non_streamed_tool_calls[i].function.name == tool_call_names[i] + streamed_args = json.loads(tool_call_args[i]) + non_streamed_args = json.loads( + non_streamed_tool_calls[i].function.arguments) + assert streamed_args == non_streamed_args + + +# test: providing parallel tool calls back to the model to get a response +# (streaming/not) +@pytest.mark.asyncio +async def test_parallel_tool_calls_with_results(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, + temperature=0, + max_tokens=200, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False) + + choice = chat_completion.choices[0] + + assert choice.finish_reason != "tool_calls" # "stop" or "length" + assert choice.message.role == "assistant" + assert choice.message.tool_calls is None \ + or len(choice.message.tool_calls) == 0 + assert choice.message.content is not None + assert "98" in choice.message.content # Dallas temp in tool response + assert "78" in choice.message.content # Orlando temp in tool response + + stream = await client.chat.completions.create( + messages=MESSAGES_WITH_PARALLEL_TOOL_RESPONSE, + temperature=0, + max_tokens=200, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False, + stream=True) + + chunks: List[str] = [] + finish_reason_count = 0 + role_sent: bool = False + + async for chunk in stream: + delta = chunk.choices[0].delta + + if delta.role: + assert not role_sent + assert delta.role == "assistant" + role_sent = True + + if delta.content: + chunks.append(delta.content) + + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == choice.finish_reason + + assert not delta.tool_calls or len(delta.tool_calls) == 0 + + assert role_sent + assert finish_reason_count == 1 + assert len(chunks) + assert "".join(chunks) == choice.message.content diff --git a/tests/tool_use/test_tool_calls.py b/tests/tool_use/test_tool_calls.py new file mode 100644 index 0000000000000..c3abe9e1f5060 --- /dev/null +++ b/tests/tool_use/test_tool_calls.py @@ -0,0 +1,192 @@ +import json +from typing import Dict, List, Optional + +import 
openai +import pytest + +from .utils import (MESSAGES_ASKING_FOR_TOOLS, MESSAGES_WITH_TOOL_RESPONSE, + SEARCH_TOOL, WEATHER_TOOL) + + +# test: request a chat completion that should return tool calls, so we know they +# are parsable +@pytest.mark.asyncio +async def test_tool_call_and_choice(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_ASKING_FOR_TOOLS, + temperature=0, + max_tokens=100, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False) + + choice = chat_completion.choices[0] + stop_reason = chat_completion.choices[0].finish_reason + tool_calls = chat_completion.choices[0].message.tool_calls + + # make sure a tool call is present + assert choice.message.role == 'assistant' + assert tool_calls is not None + assert len(tool_calls) == 1 + assert tool_calls[0].type == 'function' + assert tool_calls[0].function is not None + assert isinstance(tool_calls[0].id, str) + assert len(tool_calls[0].id) > 16 + + # make sure the weather tool was called (classic example) with arguments + assert tool_calls[0].function.name == WEATHER_TOOL["function"]["name"] + assert tool_calls[0].function.arguments is not None + assert isinstance(tool_calls[0].function.arguments, str) + + # make sure the arguments parse properly + parsed_arguments = json.loads(tool_calls[0].function.arguments) + assert isinstance(parsed_arguments, Dict) + assert isinstance(parsed_arguments.get("city"), str) + assert isinstance(parsed_arguments.get("state"), str) + assert parsed_arguments.get("city") == "Dallas" + assert parsed_arguments.get("state") == "TX" + + assert stop_reason == "tool_calls" + + function_name: Optional[str] = None + function_args_str: str = '' + tool_call_id: Optional[str] = None + role_name: Optional[str] = None + finish_reason_count: int = 0 + + # make the same request, streaming + stream = await client.chat.completions.create( + model=model_name, + messages=MESSAGES_ASKING_FOR_TOOLS, + temperature=0, + max_tokens=100, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False, + stream=True) + + async for chunk in stream: + assert chunk.choices[0].index == 0 + + if chunk.choices[0].finish_reason: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == 'tool_calls' + + # if a role is being streamed make sure it wasn't already set to + # something else + if chunk.choices[0].delta.role: + assert not role_name or role_name == 'assistant' + role_name = 'assistant' + + # if a tool call is streamed make sure there's exactly one + # (based on the request parameters + streamed_tool_calls = chunk.choices[0].delta.tool_calls + + if streamed_tool_calls and len(streamed_tool_calls) > 0: + assert len(streamed_tool_calls) == 1 + tool_call = streamed_tool_calls[0] + + # if a tool call ID is streamed, make sure one hasn't been already + if tool_call.id: + assert not tool_call_id + tool_call_id = tool_call.id + + # if parts of the function start being streamed + if tool_call.function: + # if the function name is defined, set it. it should be streamed + # IN ENTIRETY, exactly one time. 
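The streaming assertions in these tests rebuild a complete tool call from per-chunk deltas: the id and name each arrive once, and the arguments arrive as string fragments keyed by the tool call's index. A minimal standalone sketch of that accumulation pattern (the helper name and the plain-dict result shape are illustrative, not part of the test suite):

from collections import defaultdict
from typing import Dict


def accumulate_tool_call_deltas(chunks) -> Dict[int, dict]:
    """Rebuild full tool calls from streamed ChatCompletionChunk deltas."""
    calls: Dict[int, dict] = defaultdict(
        lambda: {"id": None, "name": None, "arguments": ""})
    for chunk in chunks:
        delta = chunk.choices[0].delta
        for tool_call in (delta.tool_calls or []):
            entry = calls[tool_call.index]
            if tool_call.id:
                entry["id"] = tool_call.id
            if tool_call.function and tool_call.function.name:
                entry["name"] = tool_call.function.name
            if tool_call.function and tool_call.function.arguments:
                entry["arguments"] += tool_call.function.arguments
    return dict(calls)
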
+ if tool_call.function.name: + assert function_name is None + assert isinstance(tool_call.function.name, str) + function_name = tool_call.function.name + if tool_call.function.arguments: + assert isinstance(tool_call.function.arguments, str) + function_args_str += tool_call.function.arguments + + assert finish_reason_count == 1 + assert role_name == 'assistant' + assert isinstance(tool_call_id, str) and (len(tool_call_id) > 16) + + # validate the name and arguments + assert function_name == WEATHER_TOOL["function"]["name"] + assert function_name == tool_calls[0].function.name + assert isinstance(function_args_str, str) + + # validate arguments + streamed_args = json.loads(function_args_str) + assert isinstance(streamed_args, Dict) + assert isinstance(streamed_args.get("city"), str) + assert isinstance(streamed_args.get("state"), str) + assert streamed_args.get("city") == "Dallas" + assert streamed_args.get("state") == "TX" + + # make sure everything matches non-streaming except for ID + assert function_name == tool_calls[0].function.name + assert choice.message.role == role_name + assert choice.message.tool_calls[0].function.name == function_name + + # compare streamed with non-streamed args Dict-wise, not string-wise + # because character-to-character comparison might not work e.g. the tool + # call parser adding extra spaces or something like that. we care about the + # dicts matching not byte-wise match + assert parsed_arguments == streamed_args + + +# test: providing tools and results back to model to get a non-tool response +# (streaming/not) +@pytest.mark.asyncio +async def test_tool_call_with_results(client: openai.AsyncOpenAI): + models = await client.models.list() + model_name: str = models.data[0].id + chat_completion = await client.chat.completions.create( + messages=MESSAGES_WITH_TOOL_RESPONSE, + temperature=0, + max_tokens=100, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False) + + choice = chat_completion.choices[0] + + assert choice.finish_reason != "tool_calls" # "stop" or "length" + assert choice.message.role == "assistant" + assert choice.message.tool_calls is None \ + or len(choice.message.tool_calls) == 0 + assert choice.message.content is not None + assert "98" in choice.message.content # the temperature from the response + + stream = await client.chat.completions.create( + messages=MESSAGES_WITH_TOOL_RESPONSE, + temperature=0, + max_tokens=100, + model=model_name, + tools=[WEATHER_TOOL, SEARCH_TOOL], + logprobs=False, + stream=True) + + chunks: List[str] = [] + finish_reason_count = 0 + role_sent: bool = False + + async for chunk in stream: + delta = chunk.choices[0].delta + + if delta.role: + assert not role_sent + assert delta.role == "assistant" + role_sent = True + + if delta.content: + chunks.append(delta.content) + + if chunk.choices[0].finish_reason is not None: + finish_reason_count += 1 + assert chunk.choices[0].finish_reason == choice.finish_reason + + assert not delta.tool_calls or len(delta.tool_calls) == 0 + + assert role_sent + assert finish_reason_count == 1 + assert len(chunks) + assert "".join(chunks) == choice.message.content diff --git a/tests/tool_use/utils.py b/tests/tool_use/utils.py new file mode 100644 index 0000000000000..8ec9b05b2c521 --- /dev/null +++ b/tests/tool_use/utils.py @@ -0,0 +1,215 @@ +from typing import Dict, List + +from openai.types.chat import (ChatCompletionMessageParam, + ChatCompletionToolParam) +from typing_extensions import TypedDict + +from tests.utils import VLLM_PATH + + +class 
ServerConfig(TypedDict): + model: str + arguments: List[str] + + +# universal args for all models go here. also good if you need to test locally +# and change type or KV cache quantization or something. +ARGS: List[str] = ["--enable-auto-tool-choice", "--max-model-len", "8096"] + +CONFIGS: Dict[str, ServerConfig] = { + "hermes": { + "model": + "NousResearch/Hermes-2-Pro-Llama-3-8B", + "arguments": [ + "--tool-call-parser", "hermes", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_hermes.jinja") + ] + }, + "mistral": { + "model": + "mistralai/Mistral-7B-Instruct-v0.3", + "arguments": [ + "--tool-call-parser", "mistral", "--chat-template", + str(VLLM_PATH / "examples/tool_chat_template_mistral.jinja"), + "--ignore-patterns=\"consolidated.safetensors\"" + ] + } +} + +WEATHER_TOOL: ChatCompletionToolParam = { + "type": "function", + "function": { + "name": "get_current_weather", + "description": "Get the current weather in a given location", + "parameters": { + "type": "object", + "properties": { + "city": { + "type": + "string", + "description": + "The city to find the weather for, " + "e.g. 'San Francisco'" + }, + "state": { + "type": + "string", + "description": + "the two-letter abbreviation for the state " + "that the city is in, e.g. 'CA' which would " + "mean 'California'" + }, + "unit": { + "type": "string", + "description": "The unit to fetch the temperature in", + "enum": ["celsius", "fahrenheit"] + } + } + } + } +} + +SEARCH_TOOL: ChatCompletionToolParam = { + "type": "function", + "function": { + "name": + "web_search", + "description": + "Search the internet and get a summary of the top " + "10 webpages. Should only be used if you don't know " + "the answer to a user query, and the results are likely" + "to be able to be found with a web search", + "parameters": { + "type": "object", + "properties": { + "search_term": { + "type": + "string", + "description": + "The term to use in the search. This should" + "ideally be keywords to search for, not a" + "natural-language question" + } + }, + "required": ["search_term"] + } + } +} + +MESSAGES_WITHOUT_TOOLS: List[ChatCompletionMessageParam] = [{ + "role": + "system", + "content": + "You are a helpful assistant with access to tools. If a tool" + " that you have would be helpful to answer a user query, " + "call the tool. Otherwise, answer the user's query directly " + "without calling a tool. DO NOT CALL A TOOL THAT IS IRRELEVANT " + "to the user's question - just respond to it normally." +}, { + "role": + "user", + "content": + "Hi! How are you?" +}, { + "role": + "assistant", + "content": + "I'm doing great! How can I assist you?" +}, { + "role": + "user", + "content": + "Can you tell me a joke please?" +}] + +MESSAGES_ASKING_FOR_TOOLS: List[ChatCompletionMessageParam] = [{ + "role": + "user", + "content": + "What is the weather in Dallas, Texas in Fahrenheit?" +}] + +MESSAGES_WITH_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{ + "role": + "user", + "content": + "What is the weather in Dallas, Texas in Fahrenheit?" +}, { + "role": + "assistant", + "tool_calls": [{ + "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295", + "type": "function", + "function": { + "name": + WEATHER_TOOL["function"]["name"], + "arguments": + '{"city": "Dallas", "state": "TX", ' + '"unit": "fahrenheit"}' + } + }] +}, { + "role": + "tool", + "tool_call_id": + "chatcmpl-tool-03e6481b146e408e9523d9c956696295", + "content": + "The weather in Dallas is 98 degrees fahrenheit, with partly" + "cloudy skies and a low chance of rain." 
+}] + +MESSAGES_ASKING_FOR_PARALLEL_TOOLS: List[ChatCompletionMessageParam] = [{ + "role": + "user", + "content": + "What is the weather in Dallas, Texas and Orlando, Florida in " + "Fahrenheit?" +}] + +MESSAGES_WITH_PARALLEL_TOOL_RESPONSE: List[ChatCompletionMessageParam] = [{ + "role": + "user", + "content": + "What is the weather in Dallas, Texas and Orlando, Florida in " + "Fahrenheit?" +}, { + "role": + "assistant", + "tool_calls": [{ + "id": "chatcmpl-tool-03e6481b146e408e9523d9c956696295", + "type": "function", + "function": { + "name": + WEATHER_TOOL["function"]["name"], + "arguments": + '{"city": "Dallas", "state": "TX", ' + '"unit": "fahrenheit"}' + } + }, { + "id": "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b", + "type": "function", + "function": { + "name": + WEATHER_TOOL["function"]["name"], + "arguments": + '{"city": "Orlando", "state": "Fl", ' + '"unit": "fahrenheit"}' + } + }] +}, { + "role": + "tool", + "tool_call_id": + "chatcmpl-tool-03e6481b146e408e9523d9c956696295", + "content": + "The weather in Dallas TX is 98 degrees fahrenheit with mostly " + "cloudy skies and a chance of rain in the evening." +}, { + "role": + "tool", + "tool_call_id": + "chatcmpl-tool-d027061e1bd21cda48bee7da829c1f5b", + "content": + "The weather in Orlando FL is 78 degrees fahrenheit with clear" + "skies." +}] diff --git a/tests/utils.py b/tests/utils.py index cd8d7b1f25905..04067ef372ac2 100644 --- a/tests/utils.py +++ b/tests/utils.py @@ -178,7 +178,12 @@ def compare_two_settings(model: str, env2: The second set of environment variables to pass to the API server. """ - tokenizer = AutoTokenizer.from_pretrained(model) + trust_remote_code = "--trust-remote-code" + if trust_remote_code in arg1 or trust_remote_code in arg2: + tokenizer = AutoTokenizer.from_pretrained(model, + trust_remote_code=True) + else: + tokenizer = AutoTokenizer.from_pretrained(model) prompt = "Hello, my name is" token_ids = tokenizer(prompt)["input_ids"] diff --git a/vllm/attention/backends/flashinfer.py b/vllm/attention/backends/flashinfer.py index aa9d4a71dbf87..7aec8203eb1e5 100644 --- a/vllm/attention/backends/flashinfer.py +++ b/vllm/attention/backends/flashinfer.py @@ -224,6 +224,7 @@ def graph_capture_get_metadata_for_batch(self, batch_size: int): query_start_loc=query_start_loc_host, device=self.runner.device, data_type=kv_cache_dtype, + q_data_type=self.runner.model_config.dtype, use_cuda_graph=True, decode_wrapper=self._graph_decode_wrapper, prefill_wrapper=None) @@ -292,6 +293,8 @@ class FlashInferMetadata(AttentionMetadata): page_size: Optional[int] = None # The data type of the paged kv cache data_type: torch.dtype = None + # The data type of the query + q_data_type: torch.dtype = None device: torch.device = torch.device("cuda") is_profile_run: bool = False @@ -353,7 +356,10 @@ def begin_forward(self): self.page_size, # Disable flashinfer's pos encoding and use vllm's rope. pos_encoding_mode="NONE", - data_type=self.data_type) + # kv-cache data type. + data_type=self.data_type, + # query data type. 
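The q_data_type plumbing added to the FlashInfer backend separates the dtype the queries are computed in from the dtype the paged KV cache is stored in, which can differ when the cache is quantized. A rough illustration of the two values being threaded through (the concrete dtypes below are hypothetical):

import torch

# Queries come out of the model in its compute dtype...
q_data_type = torch.float16               # e.g. model_config.dtype
# ...while the paged KV cache may use a narrower storage dtype.
kv_cache_data_type = torch.float8_e4m3fn  # e.g. a quantized kv-cache dtype

# Both values are now forwarded to flashinfer's begin_forward (data_type for
# the cache, q_data_type for the queries) so the kernels see the right dtypes.
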
+ q_data_type=self.q_data_type) def asdict_zerocopy(self, skip_fields: Optional[Set[str]] = None @@ -617,6 +623,7 @@ def build(self, seq_lens: List[int], query_lens: List[int], query_start_loc=query_start_loc, device=device, data_type=kv_cache_dtype, + q_data_type=self.runner.model_config.dtype, use_cuda_graph=use_captured_graph, is_profile_run=self.is_profile_run) diff --git a/vllm/config.py b/vllm/config.py index f967cdeb78a2d..e96a3f283e6a4 100644 --- a/vllm/config.py +++ b/vllm/config.py @@ -1,8 +1,8 @@ import enum import json from dataclasses import dataclass, field, fields -from typing import (TYPE_CHECKING, ClassVar, List, Mapping, Optional, Tuple, - Type, Union) +from typing import (TYPE_CHECKING, Any, ClassVar, Dict, List, Mapping, + Optional, Tuple, Type, Union) import torch from transformers import PretrainedConfig @@ -35,18 +35,20 @@ _MULTIMODAL_MODEL_MAX_NUM_BATCHED_TOKENS = 4096 _PP_SUPPORTED_MODELS = [ - "AquilaModel", "AquilaForCausalLM", + "AquilaModel", "DeepseekV2ForCausalLM", + "GPT2LMHeadModel", + "InternLM2ForCausalLM", "InternLMForCausalLM", + "InternVLChatModel", "JAISLMHeadModel", "LlamaForCausalLM", "LLaMAForCausalLM", "MistralForCausalLM", - "Phi3ForCausalLM", - "GPT2LMHeadModel", "MixtralForCausalLM", "NemotronForCausalLM", + "Phi3ForCausalLM", "Qwen2ForCausalLM", "Qwen2MoeForCausalLM", "QWenLMHeadModel", @@ -115,35 +117,39 @@ class ModelConfig: the model name will be the same as `model`. limit_mm_per_prompt: Maximum number of data instances per modality per prompt. Only applicable for multimodal models. + override_neuron_config: Initialize non default neuron config or + override default neuron config that are specific to Neuron devices, + this argument will be used to configure the neuron config that + can not be gathered from the vllm arguments. 
""" def __init__( - self, - model: str, - tokenizer: str, - tokenizer_mode: str, - trust_remote_code: bool, - dtype: Union[str, torch.dtype], - seed: int, - revision: Optional[str] = None, - code_revision: Optional[str] = None, - rope_scaling: Optional[dict] = None, - rope_theta: Optional[float] = None, - tokenizer_revision: Optional[str] = None, - max_model_len: Optional[int] = None, - spec_target_max_model_len: Optional[int] = None, - quantization: Optional[str] = None, - quantization_param_path: Optional[str] = None, - enforce_eager: Optional[bool] = None, - max_context_len_to_capture: Optional[int] = None, - max_seq_len_to_capture: Optional[int] = None, - max_logprobs: int = 20, - disable_sliding_window: bool = False, - skip_tokenizer_init: bool = False, - served_model_name: Optional[Union[str, List[str]]] = None, - limit_mm_per_prompt: Optional[Mapping[str, int]] = None, - use_async_output_proc: bool = True, - ) -> None: + self, + model: str, + tokenizer: str, + tokenizer_mode: str, + trust_remote_code: bool, + dtype: Union[str, torch.dtype], + seed: int, + revision: Optional[str] = None, + code_revision: Optional[str] = None, + rope_scaling: Optional[dict] = None, + rope_theta: Optional[float] = None, + tokenizer_revision: Optional[str] = None, + max_model_len: Optional[int] = None, + spec_target_max_model_len: Optional[int] = None, + quantization: Optional[str] = None, + quantization_param_path: Optional[str] = None, + enforce_eager: Optional[bool] = None, + max_context_len_to_capture: Optional[int] = None, + max_seq_len_to_capture: Optional[int] = None, + max_logprobs: int = 20, + disable_sliding_window: bool = False, + skip_tokenizer_init: bool = False, + served_model_name: Optional[Union[str, List[str]]] = None, + limit_mm_per_prompt: Optional[Mapping[str, int]] = None, + use_async_output_proc: bool = True, + override_neuron_config: Optional[Dict[str, Any]] = None) -> None: self.model = model self.tokenizer = tokenizer self.tokenizer_mode = tokenizer_mode @@ -227,6 +233,9 @@ def __init__( limit_mm_per_prompt) if not self.skip_tokenizer_init: self._verify_tokenizer_mode() + + self.override_neuron_config = override_neuron_config if is_neuron( + ) else None self._verify_embedding_mode() self._verify_quantization() self._verify_cuda_graph() @@ -278,6 +287,7 @@ def _verify_quantization(self) -> None: "experts_int8" ] tpu_supported_quantization = ["tpu_int8"] + neuron_supported_quantization = ["neuron_quant"] if self.quantization is not None: self.quantization = self.quantization.lower() @@ -332,6 +342,11 @@ def _verify_quantization(self) -> None: "Using AWQ quantization with ROCm, but VLLM_USE_TRITON_AWQ" " is not set, enabling VLLM_USE_TRITON_AWQ.") envs.VLLM_USE_TRITON_AWQ = True + if is_neuron( + ) and self.quantization not in neuron_supported_quantization: + raise ValueError( + f"{self.quantization} quantization is currently not " + f"supported in Neuron Backend.") def _verify_cuda_graph(self) -> None: if self.max_seq_len_to_capture is None: diff --git a/vllm/engine/arg_utils.py b/vllm/engine/arg_utils.py index 0c1db53f76917..3629e89f95d04 100644 --- a/vllm/engine/arg_utils.py +++ b/vllm/engine/arg_utils.py @@ -2,8 +2,8 @@ import dataclasses import json from dataclasses import dataclass -from typing import (TYPE_CHECKING, Dict, List, Mapping, Optional, Tuple, Type, - Union) +from typing import (TYPE_CHECKING, Any, Dict, List, Mapping, Optional, Tuple, + Type, Union) import torch @@ -149,6 +149,7 @@ class EngineArgs: otlp_traces_endpoint: Optional[str] = None 
collect_detailed_traces: Optional[str] = None disable_async_output_proc: bool = False + override_neuron_config: Optional[Dict[str, Any]] = None def __post_init__(self): if self.tokenizer is None: @@ -742,6 +743,16 @@ def add_cli_args(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: default=EngineArgs.disable_async_output_proc, help="Disable async output processing. This may result in " "lower performance.") + parser.add_argument( + '--override-neuron-config', + type=lambda configs: { + str(key): value + for key, value in + (config.split(':') for config in configs.split(',')) + }, + default=None, + help="override or set neuron device configuration.") + return parser @classmethod @@ -802,7 +813,7 @@ def create_engine_config(self) -> EngineConfig: served_model_name=self.served_model_name, limit_mm_per_prompt=self.limit_mm_per_prompt, use_async_output_proc=not self.disable_async_output_proc, - ) + override_neuron_config=self.override_neuron_config) cache_config = CacheConfig( block_size=self.block_size if self.device != "neuron" else self.max_model_len, # neuron needs block_size = max_model_len diff --git a/vllm/engine/llm_engine.py b/vllm/engine/llm_engine.py index 7da4f7b25db9e..50dcb6937eb6f 100644 --- a/vllm/engine/llm_engine.py +++ b/vllm/engine/llm_engine.py @@ -214,6 +214,7 @@ def __init__( "Initializing an LLM engine (v%s) with config: " "model=%r, speculative_config=%r, tokenizer=%r, " "skip_tokenizer_init=%s, tokenizer_mode=%s, revision=%s, " + "override_neuron_config=%s, " "rope_scaling=%r, rope_theta=%r, tokenizer_revision=%s, " "trust_remote_code=%s, dtype=%s, max_seq_len=%d, " "download_dir=%r, load_format=%s, tensor_parallel_size=%d, " @@ -232,6 +233,7 @@ def __init__( model_config.skip_tokenizer_init, model_config.tokenizer_mode, model_config.revision, + model_config.override_neuron_config, model_config.rope_scaling, model_config.rope_theta, model_config.tokenizer_revision, diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index c70c6d9330b10..f9f9536a7c160 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -1,29 +1,36 @@ import asyncio import codecs +import json +from abc import ABC, abstractmethod from collections import defaultdict -from functools import lru_cache +from functools import lru_cache, partial from pathlib import Path -from typing import (Any, Awaitable, Dict, Iterable, List, Literal, Mapping, - Optional, Tuple, Union) +from typing import (Any, Awaitable, Dict, Generic, Iterable, List, Literal, + Mapping, Optional, Tuple, TypeVar, Union, cast) # yapf conflicts with isort for this block # yapf: disable -from openai.types.chat import ChatCompletionContentPartImageParam +from openai.types.chat import (ChatCompletionAssistantMessageParam, + ChatCompletionContentPartImageParam) from openai.types.chat import ( ChatCompletionContentPartParam as OpenAIChatCompletionContentPartParam) -from openai.types.chat import ChatCompletionContentPartTextParam +from openai.types.chat import (ChatCompletionContentPartRefusalParam, + ChatCompletionContentPartTextParam) from openai.types.chat import ( ChatCompletionMessageParam as OpenAIChatCompletionMessageParam) +from openai.types.chat import (ChatCompletionMessageToolCallParam, + ChatCompletionToolMessageParam) # yapf: enable # pydantic needs the TypedDict from typing_extensions -from pydantic import ConfigDict, TypeAdapter +from pydantic import ConfigDict from typing_extensions import Required, TypeAlias, TypedDict from vllm.config import ModelConfig from 
vllm.logger import init_logger from vllm.multimodal import MultiModalDataDict from vllm.multimodal.utils import (async_get_and_parse_audio, - async_get_and_parse_image) + async_get_and_parse_image, + get_and_parse_audio, get_and_parse_image) from vllm.transformers_utils.tokenizer import AnyTokenizer logger = init_logger(__name__) @@ -52,7 +59,8 @@ class CustomChatCompletionContentPartParam(TypedDict, total=False): ChatCompletionContentPartParam: TypeAlias = Union[ OpenAIChatCompletionContentPartParam, ChatCompletionContentPartAudioParam, - CustomChatCompletionContentPartParam, ] + ChatCompletionContentPartRefusalParam, + CustomChatCompletionContentPartParam] class CustomChatCompletionMessageParam(TypedDict, total=False): @@ -70,18 +78,40 @@ class CustomChatCompletionMessageParam(TypedDict, total=False): same role. """ + tool_call_id: Optional[str] + """Tool call that this message is responding to.""" + + tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]] + """The tool calls generated by the model, such as function calls.""" + ChatCompletionMessageParam = Union[OpenAIChatCompletionMessageParam, CustomChatCompletionMessageParam] # TODO: Make fields ReadOnly once mypy supports it -class ConversationMessage(TypedDict): - role: str - content: str +class ConversationMessage(TypedDict, total=False): + role: Required[str] + """The role of the message's author.""" + + content: Optional[str] + """The contents of the message""" + + tool_call_id: Optional[str] + """Tool call that this message is responding to.""" + name: Optional[str] + """The name of the function to call""" -class MultiModalItemTracker: + tool_calls: Optional[Iterable[ChatCompletionMessageToolCallParam]] + """The tool calls generated by the model, such as function calls.""" + + +ModalityStr = Literal["image", "audio"] +_T = TypeVar("_T") + + +class BaseMultiModalItemTracker(ABC, Generic[_T]): """ Tracks multi-modal items in a given request and ensures that the number of multi-modal items in a given request does not exceed the configured @@ -89,37 +119,28 @@ class MultiModalItemTracker: """ def __init__(self, model_config: ModelConfig, tokenizer: AnyTokenizer): + super().__init__() + self._model_config = model_config self._tokenizer = tokenizer self._allowed_items = (model_config.multimodal_config.limit_per_prompt if model_config.multimodal_config else {}) self._consumed_items = {k: 0 for k in self._allowed_items} - self._futures: List[Awaitable[MultiModalDataDict]] = [] + + self._items: List[_T] = [] @staticmethod @lru_cache(maxsize=None) - def _cached_token_str(tokenizer: AnyTokenizer, token_index: int): + def _cached_token_str(tokenizer: AnyTokenizer, token_index: int) -> str: return tokenizer.decode(token_index) - def add(self, modality: Literal["image", "audio"], - mm_future: Awaitable[MultiModalDataDict]) -> Optional[str]: - """ - Adds the multi-modal item to the current prompt and returns the - placeholder string to use, if any. 
- """ - allowed_count = self._allowed_items.get(modality, 1) - current_count = self._consumed_items.get(modality, 0) + 1 - if current_count > allowed_count: - raise ValueError( - f"At most {allowed_count} {modality}(s) may be provided in " - "one request.") - - self._consumed_items[modality] = current_count - self._futures.append(mm_future) - + def _placeholder_str(self, modality: ModalityStr, + current_count: int) -> Optional[str]: # TODO: Let user specify how to insert image tokens into prompt # (similar to chat template) - model_type = self._model_config.hf_config.model_type + hf_config = self._model_config.hf_config + model_type = hf_config.model_type + if modality == "image": if model_type == "phi3_v": # Workaround since this token is not defined in the tokenizer @@ -129,10 +150,11 @@ def add(self, modality: Literal["image", "audio"], if model_type in ("blip-2", "chatglm", "fuyu", "paligemma"): # These models do not use image tokens in the prompt return None + if model_type == "qwen": + return f"Picture {current_count}: " if model_type.startswith("llava"): - return MultiModalItemTracker._cached_token_str( - self._tokenizer, - self._model_config.hf_config.image_token_index) + return self._cached_token_str(self._tokenizer, + hf_config.image_token_index) if model_type in ("chameleon", "internvl_chat"): return "" @@ -145,11 +167,11 @@ def add(self, modality: Literal["image", "audio"], raise TypeError(f"Unknown modality: {modality}") @staticmethod - async def _combine(futures: List[Awaitable[MultiModalDataDict]]): + def _combine(items: List[MultiModalDataDict]) -> MultiModalDataDict: mm_lists: Mapping[str, List[object]] = defaultdict(list) # Merge all the multi-modal items - for single_mm_data in (await asyncio.gather(*futures)): + for single_mm_data in items: for mm_key, mm_item in single_mm_data.items(): if isinstance(mm_item, list): mm_lists[mm_key].extend(mm_item) @@ -162,9 +184,113 @@ async def _combine(futures: List[Awaitable[MultiModalDataDict]]): for mm_key, mm_list in mm_lists.items() } - def all_mm_data(self) -> Optional[Awaitable[MultiModalDataDict]]: - return MultiModalItemTracker._combine( - self._futures) if self._futures else None + def add(self, modality: ModalityStr, item: _T) -> Optional[str]: + """ + Add a multi-modal item to the current prompt and returns the + placeholder string to use, if any. 
+ """ + allowed_count = self._allowed_items.get(modality, 1) + current_count = self._consumed_items.get(modality, 0) + 1 + if current_count > allowed_count: + raise ValueError( + f"At most {allowed_count} {modality}(s) may be provided in " + "one request.") + + self._consumed_items[modality] = current_count + self._items.append(item) + + return self._placeholder_str(modality, current_count) + + @abstractmethod + def create_parser(self) -> "BaseMultiModalContentParser": + raise NotImplementedError + + +class MultiModalItemTracker(BaseMultiModalItemTracker[MultiModalDataDict]): + + def all_mm_data(self) -> Optional[MultiModalDataDict]: + return self._combine(self._items) if self._items else None + + def create_parser(self) -> "BaseMultiModalContentParser": + return MultiModalContentParser(self) + + +class AsyncMultiModalItemTracker( + BaseMultiModalItemTracker[Awaitable[MultiModalDataDict]]): + + async def all_mm_data(self) -> Optional[MultiModalDataDict]: + if self._items: + items = await asyncio.gather(*self._items) + return self._combine(items) + + return None + + def create_parser(self) -> "BaseMultiModalContentParser": + return AsyncMultiModalContentParser(self) + + +class BaseMultiModalContentParser(ABC): + + def __init__(self) -> None: + super().__init__() + + # multimodal placeholder_string : count + self._placeholder_counts: Dict[str, int] = defaultdict(lambda: 0) + + def _add_placeholder(self, placeholder: Optional[str]): + if placeholder: + self._placeholder_counts[placeholder] += 1 + + def mm_placeholder_counts(self) -> Dict[str, int]: + return dict(self._placeholder_counts) + + @abstractmethod + def parse_image(self, image_url: str) -> None: + raise NotImplementedError + + @abstractmethod + def parse_audio(self, audio_url: str) -> None: + raise NotImplementedError + + +class MultiModalContentParser(BaseMultiModalContentParser): + + def __init__(self, tracker: MultiModalItemTracker) -> None: + super().__init__() + + self._tracker = tracker + + def parse_image(self, image_url: str) -> None: + image = get_and_parse_image(image_url) + + placeholder = self._tracker.add("image", image) + self._add_placeholder(placeholder) + + def parse_audio(self, audio_url: str) -> None: + audio = get_and_parse_audio(audio_url) + + placeholder = self._tracker.add("audio", audio) + self._add_placeholder(placeholder) + + +class AsyncMultiModalContentParser(BaseMultiModalContentParser): + + def __init__(self, tracker: AsyncMultiModalItemTracker) -> None: + super().__init__() + + self._tracker = tracker + + def parse_image(self, image_url: str) -> None: + image_coro = async_get_and_parse_image(image_url) + + placeholder = self._tracker.add("image", image_coro) + self._add_placeholder(placeholder) + + def parse_audio(self, audio_url: str) -> None: + audio_coro = async_get_and_parse_audio(audio_url) + + placeholder = self._tracker.add("audio", audio_coro) + self._add_placeholder(placeholder) def load_chat_template( @@ -197,10 +323,10 @@ def load_chat_template( # (similar to chat template) def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], text_prompt: str) -> str: - """Combine multimodal prompts for a multimodal language model""" + """Combine multimodal prompts for a multimodal language model.""" # Look through the text prompt to check for missing placeholders - missing_placeholders = [] + missing_placeholders: List[str] = [] for placeholder in placeholder_counts: # For any existing placeholder in the text prompt, we leave it as is @@ -219,50 +345,45 @@ def 
_get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], return "\n".join(missing_placeholders + [text_prompt]) -_TextParser = TypeAdapter(ChatCompletionContentPartTextParam) -_ImageParser = TypeAdapter(ChatCompletionContentPartImageParam) -_AudioParser = TypeAdapter(ChatCompletionContentPartAudioParam) +# No need to validate using Pydantic again +_TextParser = partial(cast, ChatCompletionContentPartTextParam) +_ImageParser = partial(cast, ChatCompletionContentPartImageParam) +_AudioParser = partial(cast, ChatCompletionContentPartAudioParam) +_RefusalParser = partial(cast, ChatCompletionContentPartRefusalParam) def _parse_chat_message_content_parts( role: str, parts: Iterable[ChatCompletionContentPartParam], - mm_tracker: MultiModalItemTracker, + mm_tracker: BaseMultiModalItemTracker, ) -> List[ConversationMessage]: texts: List[str] = [] - # multimodal placeholder_string : count - mm_placeholder_counts: Dict[str, int] = {} + mm_parser = mm_tracker.create_parser() for part in parts: part_type = part["type"] if part_type == "text": - text = _TextParser.validate_python(part)["text"] + text = _TextParser(part)["text"] texts.append(text) elif part_type == "image_url": - image_url = _ImageParser.validate_python(part)["image_url"] + image_url = _ImageParser(part)["image_url"] if image_url.get("detail", "auto") != "auto": logger.warning( "'image_url.detail' is currently not supported and " "will be ignored.") - image_coro = async_get_and_parse_image(image_url["url"]) - placeholder = mm_tracker.add("image", image_coro) - if placeholder: - mm_placeholder_counts[placeholder] = mm_placeholder_counts.get( - placeholder, 0) + 1 + mm_parser.parse_image(image_url["url"]) elif part_type == "audio_url": - audio_url = _AudioParser.validate_python(part)["audio_url"] - audio_coro = async_get_and_parse_audio(audio_url["url"]) - placeholder = mm_tracker.add("audio", audio_coro) - if placeholder: - mm_placeholder_counts[placeholder] = mm_placeholder_counts.get( - placeholder, 0) + 1 + audio_url = _AudioParser(part)["audio_url"] + + mm_parser.parse_audio(audio_url["url"]) else: raise NotImplementedError(f"Unknown part type: {part_type}") text_prompt = "\n".join(texts) + mm_placeholder_counts = mm_parser.mm_placeholder_counts() if mm_placeholder_counts: text_prompt = _get_full_multimodal_text_prompt(mm_placeholder_counts, text_prompt) @@ -270,29 +391,53 @@ def _parse_chat_message_content_parts( return [ConversationMessage(role=role, content=text_prompt)] +# No need to validate using Pydantic again +_AssistantParser = partial(cast, ChatCompletionAssistantMessageParam) +_ToolParser = partial(cast, ChatCompletionToolMessageParam) + + def _parse_chat_message_content( - message: ChatCompletionMessageParam, - mm_tracker: MultiModalItemTracker) -> List[ConversationMessage]: + message: ChatCompletionMessageParam, + mm_tracker: BaseMultiModalItemTracker, +) -> List[ConversationMessage]: role = message["role"] content = message.get("content") if content is None: - return [] - if isinstance(content, str): - return [ConversationMessage(role=role, content=content)] + content = [] + elif isinstance(content, str): + content = [ + ChatCompletionContentPartTextParam(type="text", text=content) + ] - return _parse_chat_message_content_parts( + result = _parse_chat_message_content_parts( role, content, # type: ignore mm_tracker, ) + for result_msg in result: + if role == 'assistant': + parsed_msg = _AssistantParser(message) + + if "tool_calls" in parsed_msg: + result_msg["tool_calls"] = list(parsed_msg["tool_calls"]) + 
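Replacing the Pydantic TypeAdapter parsers with partial(cast, ...) drops the redundant second validation pass: cast is a no-op at runtime and only narrows the static type. A tiny illustration of why that is safe (TextPart stands in for the real ChatCompletionContentPartTextParam):

from functools import partial
from typing import cast

from typing_extensions import TypedDict


class TextPart(TypedDict):
    type: str
    text: str


_TextParser = partial(cast, TextPart)

part = {"type": "text", "text": "hello"}
# No runtime validation happens here; the original dict is returned unchanged,
# but static type checkers now treat it as a TextPart.
assert _TextParser(part) is part
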
elif role == "tool": + parsed_msg = _ToolParser(message) + if "tool_call_id" in parsed_msg: + result_msg["tool_call_id"] = parsed_msg["tool_call_id"] + + if "name" in message and isinstance(message["name"], str): + result_msg["name"] = message["name"] + + return result + def parse_chat_messages( messages: List[ChatCompletionMessageParam], model_config: ModelConfig, tokenizer: AnyTokenizer, -) -> Tuple[List[ConversationMessage], Optional[Awaitable[MultiModalDataDict]]]: +) -> Tuple[List[ConversationMessage], Optional[MultiModalDataDict]]: conversation: List[ConversationMessage] = [] mm_tracker = MultiModalItemTracker(model_config, tokenizer) @@ -304,6 +449,22 @@ def parse_chat_messages( return conversation, mm_tracker.all_mm_data() +def parse_chat_messages_futures( + messages: List[ChatCompletionMessageParam], + model_config: ModelConfig, + tokenizer: AnyTokenizer, +) -> Tuple[List[ConversationMessage], Awaitable[Optional[MultiModalDataDict]]]: + conversation: List[ConversationMessage] = [] + mm_tracker = AsyncMultiModalItemTracker(model_config, tokenizer) + + for msg in messages: + sub_messages = _parse_chat_message_content(msg, mm_tracker) + + conversation.extend(sub_messages) + + return conversation, mm_tracker.all_mm_data() + + def apply_chat_template( tokenizer: AnyTokenizer, conversation: List[ConversationMessage], @@ -318,6 +479,20 @@ def apply_chat_template( "allowed, so you must provide a chat template if the tokenizer " "does not define one.") + # per the Transformers docs & maintainers, tool call arguments in + # assistant-role messages with tool_calls need to be dicts not JSON str - + # this is how tool-use chat templates will expect them moving forwards + # so, for messages that have tool_calls, parse the string (which we get + # from openAI format) to dict + for message in conversation: + if (message["role"] == "assistant" and "tool_calls" in message + and isinstance(message["tool_calls"], list)): + + for i in range(len(message["tool_calls"])): + args: str = message["tool_calls"][i]["function"]["arguments"] + parsed_args: Dict = json.loads(args) + message["tool_calls"][i]["function"]["arguments"] = parsed_args + prompt = tokenizer.apply_chat_template( conversation=conversation, chat_template=chat_template, diff --git a/vllm/entrypoints/llm.py b/vllm/entrypoints/llm.py index 0edd4bfaecd6a..b32c90a4df1aa 100644 --- a/vllm/entrypoints/llm.py +++ b/vllm/entrypoints/llm.py @@ -23,7 +23,7 @@ get_cached_tokenizer) from vllm.transformers_utils.tokenizer_group import TokenizerGroup from vllm.usage.usage_lib import UsageContext -from vllm.utils import Counter, deprecate_kwargs +from vllm.utils import Counter, deprecate_kwargs, is_list_of logger = init_logger(__name__) @@ -358,15 +358,18 @@ def chat( add_generation_prompt: bool = True, ) -> List[RequestOutput]: """ - Generates responses for chat messages. + Generate responses for a chat conversation. - Converts the messages to prompts using the tokenizer and calls - the :meth:`generate` method to generate the responses. + The chat conversation is converted into a text prompt using the + tokenizer and calls the :meth:`generate` method to generate the + responses. + + Multi-modal inputs can be passed in the same way you would pass them + to the OpenAI API. Args: - messages: A list of messages to generate responses for. Each - message is a list of dictionaries with 'role' and 'content' - keys. + messages: A single conversation represented as a list of messages. + Each message is a dictionary with 'role' and 'content' keys. 
sampling_params: The sampling parameters for text generation. If None, we use the default sampling parameters. When it is a single value, it is applied to every prompt. When it @@ -387,21 +390,25 @@ def chat( tokenizer = self.get_tokenizer() model_config = self.llm_engine.get_model_config() - conversations, _ = parse_chat_messages(messages, model_config, - tokenizer) + conversation, mm_data = parse_chat_messages(messages, model_config, + tokenizer) prompt = apply_chat_template( tokenizer, - conversations, + conversation, chat_template=chat_template, - add_generation_prompt=add_generation_prompt) + add_generation_prompt=add_generation_prompt, + ) inputs: PromptInputs - if isinstance(prompt, list) and isinstance(prompt[0], int): + if is_list_of(prompt, int): inputs = TokensPrompt(prompt_token_ids=prompt) else: inputs = TextPrompt(prompt=prompt) + if mm_data is not None: + inputs["multi_modal_data"] = mm_data + return self.generate( inputs, sampling_params=sampling_params, diff --git a/vllm/entrypoints/openai/api_server.py b/vllm/entrypoints/openai/api_server.py index 8e8371ef1559a..728a2e5232d9b 100644 --- a/vllm/entrypoints/openai/api_server.py +++ b/vllm/entrypoints/openai/api_server.py @@ -67,7 +67,7 @@ def model_is_embedding(model_name: str, trust_remote_code: bool, - quantization: str) -> bool: + quantization: Optional[str]) -> bool: return ModelConfig(model=model_name, tokenizer=model_name, tokenizer_mode="auto", @@ -96,13 +96,6 @@ async def _force_log(): @asynccontextmanager async def build_async_engine_client( args: Namespace) -> AsyncIterator[Optional[AsyncEngineClient]]: - """ - Create AsyncEngineClient, either: - - in-process using the AsyncLLMEngine Directly - - multiprocess using AsyncLLMEngine RPC - - Returns the Client or None if the creation failed. - """ # Context manager to handle async_engine_client lifecycle # Ensures everything is shutdown and cleaned up on error/exit @@ -112,14 +105,37 @@ async def build_async_engine_client( # Backend itself still global for the silly lil' health handler global async_engine_client + async with build_async_engine_client_from_engine_args( + engine_args, args.disable_frontend_multiprocessing) as engine: + + async_engine_client = engine # type: ignore[assignment] + yield engine + + +@asynccontextmanager +async def build_async_engine_client_from_engine_args( + engine_args: AsyncEngineArgs, + disable_frontend_multiprocessing: bool = False, +) -> AsyncIterator[Optional[AsyncEngineClient]]: + """ + Create AsyncEngineClient, either: + - in-process using the AsyncLLMEngine Directly + - multiprocess using AsyncLLMEngine RPC + + Returns the Client or None if the creation failed. + """ + # If manually triggered or embedding model, use AsyncLLMEngine in process. # TODO: support embedding model via RPC. - if (model_is_embedding(args.model, args.trust_remote_code, - args.quantization) - or args.disable_frontend_multiprocessing): - async_engine_client = AsyncLLMEngine.from_engine_args( + if (model_is_embedding(engine_args.model, engine_args.trust_remote_code, + engine_args.quantization) + or disable_frontend_multiprocessing): + engine_client = AsyncLLMEngine.from_engine_args( engine_args, usage_context=UsageContext.OPENAI_API_SERVER) - yield async_engine_client + try: + yield engine_client + finally: + engine_client.shutdown_background_loop() return # Otherwise, use the multiprocessing AsyncLLMEngine. @@ -148,7 +164,6 @@ async def build_async_engine_client( # NOTE: Actually, this is not true yet. 
We still need to support # embedding models via RPC (see TODO above) rpc_client = AsyncEngineRPCClient(rpc_path) - async_engine_client = rpc_client # type: ignore # Start RPCServer in separate process (holds the AsyncLLMEngine). context = multiprocessing.get_context("spawn") @@ -174,7 +189,7 @@ async def build_async_engine_client( yield None return - yield async_engine_client + yield rpc_client # type: ignore[misc] finally: # Ensure rpc server process was terminated rpc_server_process.terminate() @@ -218,7 +233,7 @@ def mount_metrics(app: FastAPI): metrics_route = Mount("/metrics", make_asgi_app()) # Workaround for 307 Redirect for /metrics - metrics_route.path_regex = re.compile('^/metrics(?P.*)$') + metrics_route.path_regex = re.compile("^/metrics(?P.*)$") app.routes.append(metrics_route) @@ -268,11 +283,14 @@ async def show_version(): @router.post("/v1/chat/completions") async def create_chat_completion(request: ChatCompletionRequest, raw_request: Request): + generator = await openai_serving_chat.create_chat_completion( request, raw_request) + if isinstance(generator, ErrorResponse): return JSONResponse(content=generator.model_dump(), status_code=generator.code) + elif isinstance(generator, ChatCompletionResponse): return JSONResponse(content=generator.model_dump()) @@ -407,7 +425,8 @@ async def init_app( request_logger=request_logger, chat_template=args.chat_template, return_tokens_as_token_ids=args.return_tokens_as_token_ids, - ) + enable_auto_tools=args.enable_auto_tool_choice, + tool_parser=args.tool_call_parser) openai_serving_completion = OpenAIServingCompletion( async_engine_client, model_config, diff --git a/vllm/entrypoints/openai/cli_args.py b/vllm/entrypoints/openai/cli_args.py index 94742838b421c..7ccee0b6b55b7 100644 --- a/vllm/entrypoints/openai/cli_args.py +++ b/vllm/entrypoints/openai/cli_args.py @@ -163,6 +163,24 @@ def make_arg_parser(parser: FlexibleArgumentParser) -> FlexibleArgumentParser: help="If specified, will run the OpenAI frontend server in the same " "process as the model serving engine.") + parser.add_argument( + "--enable-auto-tool-choice", + action="store_true", + default=False, + help= + "Enable auto tool choice for supported models. Use --tool-call-parser" + "to specify which parser to use") + + parser.add_argument( + "--tool-call-parser", + type=str, + choices=["mistral", "hermes"], + default=None, + help= + "Select the tool call parser depending on the model that you're using." + " This is used to parse the model-generated tool call into OpenAI API " + "format. 
Required for --enable-auto-tool-choice.") + parser = AsyncEngineArgs.add_cli_args(parser) parser.add_argument('--max-log-len', diff --git a/vllm/entrypoints/openai/protocol.py b/vllm/entrypoints/openai/protocol.py index 0954b81595ef5..ff9c3690672b6 100644 --- a/vllm/entrypoints/openai/protocol.py +++ b/vllm/entrypoints/openai/protocol.py @@ -5,8 +5,9 @@ from typing import Any, Dict, List, Literal, Optional, Union import torch +from openai.types.chat import ChatCompletionContentPartParam from pydantic import BaseModel, ConfigDict, Field, model_validator -from typing_extensions import Annotated +from typing_extensions import Annotated, Required, TypedDict from vllm.entrypoints.chat_utils import ChatCompletionMessageParam from vllm.entrypoints.openai.logits_processors import get_logits_processors @@ -35,6 +36,26 @@ assert _LONG_INFO.max == _MOCK_LONG_INFO.max +class CustomChatCompletionMessageParam(TypedDict, total=False): + """Enables custom roles in the Chat Completion API.""" + role: Required[str] + """The role of the message's author.""" + + content: Union[str, List[ChatCompletionContentPartParam]] + """The contents of the message.""" + + name: str + """An optional name for the participant. + + Provides the model information to differentiate between participants of the + same role. + """ + + tool_call_id: Optional[str] + + tool_calls: Optional[List[dict]] + + class OpenAIBaseModel(BaseModel): # OpenAI API does not allow extra fields model_config = ConfigDict(extra="forbid") @@ -145,8 +166,11 @@ class ChatCompletionRequest(OpenAIBaseModel): temperature: Optional[float] = 0.7 top_p: Optional[float] = 1.0 tools: Optional[List[ChatCompletionToolsParam]] = None - tool_choice: Optional[Union[Literal["none"], + tool_choice: Optional[Union[Literal["none"], Literal["auto"], ChatCompletionNamedToolChoiceParam]] = "none" + + # NOTE this will be ignored by VLLM -- the model determines the behavior + parallel_tool_calls: Optional[bool] = False user: Optional[str] = None # doc: begin-chat-completion-sampling-params @@ -328,6 +352,9 @@ def check_logprobs(cls, data): @model_validator(mode="before") @classmethod def check_guided_decoding_count(cls, data): + if isinstance(data, ValueError): + raise data + guide_count = sum([ "guided_json" in data and data["guided_json"] is not None, "guided_regex" in data and data["guided_regex"] is not None, @@ -339,21 +366,61 @@ def check_guided_decoding_count(cls, data): "You can only use one kind of guided decoding " "('guided_json', 'guided_regex' or 'guided_choice').") # you can only either use guided decoding or tools, not both - if guide_count > 1 and "tool_choice" in data and data[ - "tool_choice"] != "none": + if guide_count > 1 and data.get("tool_choice", + "none") not in ("none", "auto"): raise ValueError( "You can only either use guided decoding or tools, not both.") return data @model_validator(mode="before") @classmethod - def check_tool_choice(cls, data): - if "tool_choice" in data and data["tool_choice"] != "none": - if not isinstance(data["tool_choice"], dict): - raise ValueError("Currently only named tools are supported.") + def check_tool_usage(cls, data): + + # if "tool_choice" is not specified but tools are provided, + # default to "auto" tool_choice + if "tool_choice" not in data and "tools" in data: + data["tool_choice"] = "auto" + + # if "tool_choice" is specified -- validation + if "tool_choice" in data: + + # ensure that if "tool choice" is specified, tools are present if "tools" not in data or data["tools"] is None: raise ValueError( "When using 
`tool_choice`, `tools` must be set.") + + # make sure that tool choice is either a named tool + # OR that it's set to "auto" + if data["tool_choice"] != "auto" and not isinstance( + data["tool_choice"], dict): + raise ValueError( + "`tool_choice` must either be a named tool or \"auto\". " + "`tool_choice=\"none\" is not supported.") + + # ensure that if "tool_choice" is specified as an object, + # it matches a valid tool + if isinstance(data["tool_choice"], dict): + valid_tool = False + specified_function = data["tool_choice"]["function"] + if not specified_function: + raise ValueError( + "Incorrectly formatted `tool_choice`. Should be like " + "`{\"type\": \"function\"," + " \"function\": {\"name\": \"my_function\"}}`") + specified_function_name = specified_function["name"] + if not specified_function_name: + raise ValueError( + "Incorrectly formatted `tool_choice`. Should be like " + "`{\"type\": \"function\", " + "\"function\": {\"name\": \"my_function\"}}`") + for tool in data["tools"]: + if tool["function"]["name"] == specified_function_name: + valid_tool = True + break + if not valid_tool: + raise ValueError( + "The tool specified in `tool_choice` does not match any" + " of the specified `tools`") return data @@ -413,7 +480,7 @@ class CompletionRequest(OpenAIBaseModel): ) guided_json: Optional[Union[str, dict, BaseModel]] = Field( default=None, - description=("If specified, the output will follow the JSON schema."), + description="If specified, the output will follow the JSON schema.", ) guided_regex: Optional[str] = Field( default=None, @@ -633,9 +700,41 @@ class ToolCall(OpenAIBaseModel): function: FunctionCall +class DeltaFunctionCall(BaseModel): + name: Optional[str] = None + arguments: Optional[str] = None + + +# a tool call delta where everything is optional +class DeltaToolCall(OpenAIBaseModel): + id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}") + type: Literal["function"] = "function" + index: int + function: Optional[DeltaFunctionCall] = None + + +# the initial delta that gets sent once a new tool call is started; +class InitialDeltaToolCall(DeltaToolCall): + id: str = Field(default_factory=lambda: f"chatcmpl-tool-{random_uuid()}") + type: Literal["function"] = "function" + index: int + + +class ExtractedToolCallInformation(BaseModel): + # indicate if tools were called + tools_called: bool + + # extracted tool calls + tool_calls: List[ToolCall] + + # content - per OpenAI spec, content AND tool calls can be returned rarely + # But some models will do this intentionally + content: Optional[str] = None + + class ChatMessage(OpenAIBaseModel): role: str - content: str + content: Optional[str] = None tool_calls: List[ToolCall] = Field(default_factory=list) @@ -657,7 +756,9 @@ class ChatCompletionResponseChoice(OpenAIBaseModel): index: int message: ChatMessage logprobs: Optional[ChatCompletionLogProbs] = None - finish_reason: Optional[str] = None + # per OpenAI spec this is the default + finish_reason: Optional[str] = "stop" + # not part of the OpenAI spec but included in vLLM for legacy reasons stop_reason: Optional[Union[int, str]] = None @@ -674,7 +775,7 @@ class ChatCompletionResponse(OpenAIBaseModel): class DeltaMessage(OpenAIBaseModel): role: Optional[str] = None content: Optional[str] = None - tool_calls: List[ToolCall] = Field(default_factory=list) + tool_calls: List[DeltaToolCall] = Field(default_factory=list) class ChatCompletionResponseStreamChoice(OpenAIBaseModel): diff --git a/vllm/entrypoints/openai/rpc/client.py 
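With --enable-auto-tool-choice and a --tool-call-parser configured, streamed tool calls are expressed with the new delta models rather than full ToolCall objects: the initial delta for a call carries the index and an auto-generated id, the function name is streamed exactly once, and the arguments arrive as string fragments for the same index. A rough sketch of how those models compose (assuming this branch of vLLM is importable):

from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage,
                                              DeltaToolCall)

# An early chunk for a new tool call: index plus the complete function name.
first = DeltaMessage(tool_calls=[
    DeltaToolCall(index=0,
                  function=DeltaFunctionCall(name="get_current_weather"))
])

# Later chunks for the same index only append argument fragments.
fragment = DeltaMessage(tool_calls=[
    DeltaToolCall(index=0,
                  function=DeltaFunctionCall(arguments='{"city": "Dall'))
])
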
b/vllm/entrypoints/openai/rpc/client.py index c457555c54b9c..9b88db746be5c 100644 --- a/vllm/entrypoints/openai/rpc/client.py +++ b/vllm/entrypoints/openai/rpc/client.py @@ -7,6 +7,7 @@ import cloudpickle import zmq import zmq.asyncio +from zmq import Frame # type: ignore[attr-defined] from zmq.asyncio import Socket from vllm.config import (DecodingConfig, LoRAConfig, ModelConfig, @@ -214,6 +215,7 @@ async def _send_get_data_rpc_request(self, request: RPCUtilityRequest, # Await the data from the Server. frame = await socket.recv(copy=False) + assert isinstance(frame, Frame) data = pickle.loads(frame.buffer) if isinstance(data, Exception): @@ -247,6 +249,7 @@ async def do_rpc_call(socket: Socket, request: RPC_REQUEST_TYPE): f"{self._data_timeout} ms") frame = await socket.recv(copy=False) + assert isinstance(frame, Frame) return pickle.loads(frame.buffer) # Make a new socket connection. @@ -395,6 +398,7 @@ async def generate( # Stream back the results from the RPC Server. while not finished: message = await socket.recv(copy=False) + assert isinstance(message, Frame) request_output = pickle.loads(message.buffer) if isinstance(request_output, Exception): diff --git a/vllm/entrypoints/openai/serving_chat.py b/vllm/entrypoints/openai/serving_chat.py index f7576509d06c8..78f355228012f 100644 --- a/vllm/entrypoints/openai/serving_chat.py +++ b/vllm/entrypoints/openai/serving_chat.py @@ -1,6 +1,8 @@ import asyncio +import json import time -from typing import AsyncGenerator, AsyncIterator, Dict, Final, List, Optional +from typing import (AsyncGenerator, AsyncIterator, Callable, Dict, Final, List, + Optional) from typing import Sequence as GenericSequence from typing import Union @@ -11,23 +13,25 @@ from vllm.entrypoints.chat_utils import (ConversationMessage, apply_chat_template, load_chat_template, - parse_chat_messages) + parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger from vllm.entrypoints.openai.protocol import ( ChatCompletionLogProb, ChatCompletionLogProbs, ChatCompletionLogProbsContent, ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, ChatCompletionResponse, ChatCompletionResponseChoice, ChatCompletionResponseStreamChoice, - ChatCompletionStreamResponse, ChatMessage, DeltaMessage, ErrorResponse, - FunctionCall, ToolCall, UsageInfo) + ChatCompletionStreamResponse, ChatMessage, DeltaFunctionCall, DeltaMessage, + DeltaToolCall, ErrorResponse, FunctionCall, ToolCall, UsageInfo) from vllm.entrypoints.openai.serving_engine import (LoRAModulePath, OpenAIServing, PromptAdapterPath, TextTokensPrompt) +from vllm.entrypoints.openai.tool_parsers import (Hermes2ProToolParser, + MistralToolParser, + ToolParser) from vllm.inputs import TokensPrompt from vllm.logger import init_logger -from vllm.multimodal import MultiModalDataDict -from vllm.outputs import RequestOutput +from vllm.outputs import CompletionOutput, RequestOutput from vllm.sequence import Logprob from vllm.tracing import (contains_trace_headers, extract_trace_headers, log_tracing_disabled_warning) @@ -39,19 +43,19 @@ class OpenAIServingChat(OpenAIServing): - def __init__( - self, - async_engine_client: AsyncEngineClient, - model_config: ModelConfig, - served_model_names: List[str], - response_role: str, - *, - lora_modules: Optional[List[LoRAModulePath]], - prompt_adapters: Optional[List[PromptAdapterPath]], - request_logger: Optional[RequestLogger], - chat_template: Optional[str], - return_tokens_as_token_ids: bool = False, - ): + def __init__(self, + async_engine_client: AsyncEngineClient, + 
model_config: ModelConfig, + served_model_names: List[str], + response_role: str, + *, + lora_modules: Optional[List[LoRAModulePath]], + prompt_adapters: Optional[List[PromptAdapterPath]], + request_logger: Optional[RequestLogger], + chat_template: Optional[str], + return_tokens_as_token_ids: bool = False, + enable_auto_tools: bool = False, + tool_parser: Optional[str] = None): super().__init__(async_engine_client=async_engine_client, model_config=model_config, served_model_names=served_model_names, @@ -61,10 +65,27 @@ def __init__( return_tokens_as_token_ids=return_tokens_as_token_ids) self.response_role = response_role - - # If this is None we use the tokenizer's default chat template + self.use_tool_use_model_template = False self.chat_template = load_chat_template(chat_template) + # set up tool use + self.enable_auto_tools: bool = enable_auto_tools + if self.enable_auto_tools: + logger.info( + "\"auto\" tool choice has been enabled please note that while" + " the parallel_tool_calls client option is preset for " + "compatibility reasons, it will be ignored.") + + self.tool_parser: Optional[Callable[[AnyTokenizer], ToolParser]] = None + if self.enable_auto_tools: + if tool_parser == "mistral": + self.tool_parser = MistralToolParser + elif tool_parser == "hermes": + self.tool_parser = Hermes2ProToolParser + else: + raise TypeError("Error: --enable-auto-tool-choice requires " + "--tool-call-parser") + async def create_chat_completion( self, request: ChatCompletionRequest, @@ -77,11 +98,10 @@ async def create_chat_completion( for the API specification. This API mimics the OpenAI ChatCompletion API. - NOTE: Currently we do not support the following feature: - - function_call (Users should implement this by themselves) """ error_check_ret = await self._check_model(request) if error_check_ret is not None: + logger.error("Error with model %s", error_check_ret) return error_check_ret try: @@ -94,7 +114,7 @@ async def create_chat_completion( tokenizer = await self.async_engine_client.get_tokenizer( lora_request) - conversation, mm_data_future = parse_chat_messages( + conversation, mm_data_future = parse_chat_messages_futures( request.messages, model_config, tokenizer) tool_dicts = None if request.tools is None else [ @@ -114,14 +134,26 @@ async def create_chat_completion( logger.error("Error in applying chat template from request: %s", e) return self.create_error_response(str(e)) - mm_data: Optional[MultiModalDataDict] = None try: - if mm_data_future: - mm_data = await mm_data_future + mm_data = await mm_data_future except Exception as e: logger.error("Error in loading multi-modal data: %s", e) return self.create_error_response(str(e)) + # validation for OpenAI tools + # tool_choice = "required" is not supported + if request.tool_choice == "required": + return self.create_error_response( + "tool_choice = \"required\" is not supported!") + + # "auto" tools requires --enable-auto-tool-choice + # and --tool-call-parser + if request.tool_choice == "auto" and not ( + self.enable_auto_tools and self.tool_parser is not None): + return self.create_error_response( + "\"auto\" tool choice requires " + "--enable-auto-tool-choice and --tool-call-parser to be set") + request_id = f"chat-{random_uuid()}" try: guided_decode_logits_processor = ( @@ -190,6 +222,7 @@ async def create_chat_completion( if request.stream: return self.chat_completion_stream_generator( request, result_generator, request_id, conversation, tokenizer) + try: return await self.chat_completion_full_generator( request, result_generator, 
request_id, conversation, tokenizer) @@ -222,6 +255,9 @@ async def chat_completion_stream_generator( previous_num_tokens = [0] * num_choices finish_reason_sent = [False] * num_choices + tool_parser: Optional[ToolParser] = self.tool_parser( + tokenizer) if self.tool_parser else None + try: async for res in result_generator: # We need to do it here, because if there are exceptions in @@ -231,6 +267,9 @@ async def chat_completion_stream_generator( # Send first response for each request.n (index) with # the role role = self.get_chat_request_role(request) + + # NOTE num_choices defaults to 1 so this usually executes + # once per request for i in range(num_choices): choice_data = ChatCompletionResponseStreamChoice( index=i, @@ -243,14 +282,18 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + + # if usage should be included if (request.stream_options and request.stream_options.include_usage): - if (request.stream_options.continuous_usage_stats): + # if continuous usage stats are requested, add it + if request.stream_options.continuous_usage_stats: prompt_tokens = len(res.prompt_token_ids) usage = UsageInfo(prompt_tokens=prompt_tokens, completion_tokens=0, total_tokens=prompt_tokens) chunk.usage = usage + # otherwise don't else: chunk.usage = None @@ -260,7 +303,7 @@ async def chat_completion_stream_generator( # Send response to echo the input portion of the # last message if request.echo: - last_msg_content = "" + last_msg_content: Optional[str] = "" if conversation and conversation[-1].get( "content") and conversation[-1].get( "role") == role: @@ -301,6 +344,7 @@ async def chat_completion_stream_generator( first_iteration = False for output in res.outputs: + i = output.index if finish_reason_sent[i]: @@ -323,20 +367,50 @@ async def chat_completion_stream_generator( logprobs = None delta_text = output.text[len(previous_texts[i]):] - previous_texts[i] = output.text - previous_num_tokens[i] = len(output.token_ids) + delta_message: Optional[DeltaMessage] = None - if request.tool_choice and type( - request.tool_choice - ) is ChatCompletionNamedToolChoiceParam: + # handle streaming deltas for tools with named tool_choice + if (request.tool_choice and type(request.tool_choice) is + ChatCompletionNamedToolChoiceParam): delta_message = DeltaMessage(tool_calls=[ - ToolCall(function=FunctionCall( + DeltaToolCall(function=DeltaFunctionCall( name=request.tool_choice.function.name, - arguments=delta_text)) + arguments=delta_text), + index=i) ]) + + # handle streaming deltas for tools with "auto" tool choice + elif (self._should_stream_with_auto_tool_parsing(request) + and tool_parser): + delta_message = ( + tool_parser.extract_tool_calls_streaming( + previous_text=previous_texts[i], + current_text=output.text, + delta_text=delta_text, + previous_token_ids= \ + output.token_ids[ + :-1 * len(delta_token_ids) + ], + current_token_ids=output.token_ids, + delta_token_ids=delta_token_ids + ) + ) + + # handle streaming just a content delta else: delta_message = DeltaMessage(content=delta_text) + # set the previous values for the next iteration + previous_texts[i] = output.text + previous_num_tokens[i] = len(output.token_ids) + + # if the message delta is None (e.g. 
because it was a + # "control token" for tool calls or the parser otherwise + # wasn't ready to send a token, then + # get the next token without streaming a chunk + if delta_message is None: + continue + if output.finish_reason is None: # Send token-by-token response for each request.n @@ -351,6 +425,8 @@ async def chat_completion_stream_generator( created=created_time, choices=[choice_data], model=model_name) + + # handle usage stats if requested & if continuous if (request.stream_options and request.stream_options.include_usage): if (request.stream_options.continuous_usage_stats): @@ -368,14 +444,55 @@ async def chat_completion_stream_generator( data = chunk.model_dump_json(exclude_unset=True) yield f"data: {data}\n\n" + + # if the model is finished generating else: + # check to make sure we haven't "forgotten" to stream + # any tokens that were generated but previously + # matched by partial json parsing + # only happens if we are NOT using guided decoding + if tool_parser: + index = len( + tool_parser.prev_tool_call_arr) - 1 if len( + tool_parser.prev_tool_call_arr) > 0 else 0 + else: + index = 0 + + if self._should_check_for_unstreamed_tool_arg_tokens( + delta_message, output) and tool_parser: + # get the expected call based on partial JSON + # parsing which "autocompletes" the JSON + expected_call = json.dumps( + tool_parser.prev_tool_call_arr[index].get( + "arguments", {})) + + # get what we've streamed so for for arguments + # for the current tool + actual_call = tool_parser.streamed_args_for_tool[ + index] + + # check to see if there's anything left to stream + remaining_call = expected_call.replace( + actual_call, "", 1) + + # set that as a delta message + delta_message = DeltaMessage(tool_calls=[ + DeltaToolCall(index=index, + function=DeltaFunctionCall( + arguments=remaining_call). + model_dump(exclude_none=True)) + ]) + # Send the finish response for each request.n only once prompt_tokens = len(res.prompt_token_ids) choice_data = ChatCompletionResponseStreamChoice( index=i, delta=delta_message, logprobs=logprobs, - finish_reason=output.finish_reason, + finish_reason=output.finish_reason + if not (tool_parser + and len(tool_parser.prev_tool_call_arr)) + else "tool_calls", stop_reason=output.stop_reason) chunk = ChatCompletionStreamResponse( id=request_id, @@ -401,6 +518,8 @@ async def chat_completion_stream_generator( yield f"data: {data}\n\n" finish_reason_sent[i] = True + # once the final token is handled, if stream_options.include_usage + # is sent, send the usage if (request.stream_options and request.stream_options.include_usage): final_usage = UsageInfo( @@ -422,6 +541,7 @@ async def chat_completion_stream_generator( except ValueError as e: # TODO: Use a vllm-specific Validation Error + logger.error("error in chat completion stream generator: %s", e) data = self.create_streaming_error_response(str(e)) yield f"data: {data}\n\n" # Send the final done message after all response.n are finished @@ -466,8 +586,21 @@ async def chat_completion_full_generator( else: logprobs = None - if request.tool_choice and type( + # by default, tools are not used. 
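
To make the back-fill step in the streaming generator above concrete: when the model finishes, any argument text that partial-JSON parsing "auto-completed" but never reached the client is sent as one last delta. Below is a self-contained sketch of that computation; the variable names mirror the diff, but the values are invented.

```python
import json

# Hypothetical parser state: what partial-JSON parsing recovered for the last
# tool call vs. the argument text that has already been streamed to the client.
prev_tool_call_arr = [{"name": "get_weather", "arguments": {"city": "Berlin"}}]
streamed_args_for_tool = ['{"city": "Ber']

index = len(prev_tool_call_arr) - 1
expected_call = json.dumps(prev_tool_call_arr[index].get("arguments", {}))
actual_call = streamed_args_for_tool[index]

# Anything not yet sent becomes one final DeltaToolCall argument delta.
remaining_call = expected_call.replace(actual_call, "", 1)
print(remaining_call)  # 'lin"}'
```

The finish chunk then reports finish_reason="tool_calls" whenever the parser recorded at least one tool call, instead of the model's own finish reason.
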
+ tools_called = False + + # if auto tools are not enabled, and a named tool choice using + # outlines is not being used + if not (self.enable_auto_tools + or not self.tool_parser) and not isinstance( + request.tool_choice, + ChatCompletionNamedToolChoiceParam): + message = ChatMessage(role=role, content=output.text) + + # if the request uses tools and specified a tool choice + elif request.tool_choice and type( request.tool_choice) is ChatCompletionNamedToolChoiceParam: + message = ChatMessage( role=role, content="", @@ -476,14 +609,47 @@ async def chat_completion_full_generator( name=request.tool_choice.function.name, arguments=output.text)) ]) + tools_called = True + + # if the request doesn't use tool choice + # OR specifies to not use a tool elif not request.tool_choice or request.tool_choice == "none": + + message = ChatMessage(role=role, content=output.text) + + # handle when there are tools and tool choice is auto + elif request.tools and ( + request.tool_choice == "auto" + or request.tool_choice is None) and self.enable_auto_tools \ + and self.tool_parser: + + tool_parser = self.tool_parser(tokenizer) + tool_call_info = tool_parser.extract_tool_calls(output.text) + tools_called = tool_call_info.tools_called + if tool_call_info.tools_called: + message = ChatMessage(role=role, + content=tool_call_info.content, + tool_calls=tool_call_info.tool_calls) + + else: + # FOR NOW make it a chat message; we will have to detect + # the type to make it later. + message = ChatMessage(role=role, content=output.text) + + # undetermined case that is still important to handle + else: + logger.error( + "Error in chat_completion_full_generator - cannot determine" + " if tools should be extracted. Returning a standard chat " + "completion.") message = ChatMessage(role=role, content=output.text) choice_data = ChatCompletionResponseChoice( index=output.index, message=message, logprobs=logprobs, - finish_reason=output.finish_reason, + finish_reason="tool_calls" if tools_called else + output.finish_reason if output.finish_reason else "stop", stop_reason=output.stop_reason) choices.append(choice_data) @@ -491,10 +657,11 @@ async def chat_completion_full_generator( last_msg_content = "" if conversation and conversation[-1].get( "content") and conversation[-1].get("role") == role: - last_msg_content = conversation[-1]["content"] + last_msg_content = conversation[-1]["content"] or "" for choice in choices: - full_message = last_msg_content + choice.message.content + full_message = last_msg_content + (choice.message.content + or "") choice.message.content = full_message num_prompt_tokens = len(final_res.prompt_token_ids) @@ -577,3 +744,38 @@ def _create_chat_logprobs( )) return ChatCompletionLogProbs(content=logprobs_content) + + def _should_stream_with_auto_tool_parsing(self, + request: ChatCompletionRequest): + """ + Utility function to check if streamed tokens should go through the tool + call parser that was configured. + + We only want to do this IF user-provided tools are set, a tool parser + is configured, "auto" tool choice is enabled, and the request's tool + choice field indicates that "auto" tool choice should be used. + """ + return (request.tools and self.tool_parser and self.enable_auto_tools + and request.tool_choice in ['auto', None]) + + def _should_check_for_unstreamed_tool_arg_tokens( + self, + delta_message: Optional[DeltaMessage], + output: CompletionOutput, + ) -> bool: + """ + Check to see if we should check for unstreamed tool arguments tokens. 
+ This is only applicable when auto tool parsing is enabled, the delta + is a tool call with arguments. + """ + + # yapf: disable + return bool( + # if there is a delta message that includes tool calls which + # include a function that has arguments + self.enable_auto_tools and self.tool_parser and delta_message + and delta_message.tool_calls and delta_message.tool_calls[0] + and delta_message.tool_calls[0].function + and delta_message.tool_calls[0].function.arguments is not None + and output.finish_reason is not None + ) diff --git a/vllm/entrypoints/openai/serving_tokenization.py b/vllm/entrypoints/openai/serving_tokenization.py index fc9ca29e9cf86..69a5ad5b62cfa 100644 --- a/vllm/entrypoints/openai/serving_tokenization.py +++ b/vllm/entrypoints/openai/serving_tokenization.py @@ -4,7 +4,7 @@ from vllm.engine.protocol import AsyncEngineClient from vllm.entrypoints.chat_utils import (apply_chat_template, load_chat_template, - parse_chat_messages) + parse_chat_messages_futures) from vllm.entrypoints.logger import RequestLogger # yapf conflicts with isort for this block # yapf: disable @@ -43,7 +43,11 @@ def __init__( request_logger=request_logger) # If this is None we use the tokenizer's default chat template - self.chat_template = load_chat_template(chat_template) + # the list of commonly-used chat template names for HF named templates + hf_chat_templates: List[str] = ['default', 'tool_use'] + self.chat_template = chat_template \ + if chat_template in hf_chat_templates \ + else load_chat_template(chat_template) async def create_tokenize( self, @@ -65,10 +69,11 @@ async def create_tokenize( if isinstance(request, TokenizeChatRequest): model_config = self.model_config - conversation, mm_data_future = parse_chat_messages( + conversation, mm_data_future = parse_chat_messages_futures( request.messages, model_config, tokenizer) - if mm_data_future: + mm_data = await mm_data_future + if mm_data: logger.warning( "Multi-modal inputs are ignored during tokenization") diff --git a/vllm/entrypoints/openai/tool_parsers/__init__.py b/vllm/entrypoints/openai/tool_parsers/__init__.py new file mode 100644 index 0000000000000..5d5d53784fedf --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/__init__.py @@ -0,0 +1,5 @@ +from .abstract_tool_parser import ToolParser +from .hermes_tool_parser import Hermes2ProToolParser +from .mistral_tool_parser import MistralToolParser + +__all__ = ["ToolParser", "Hermes2ProToolParser", "MistralToolParser"] \ No newline at end of file diff --git a/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py new file mode 100644 index 0000000000000..b0807e6f1e782 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/abstract_tool_parser.py @@ -0,0 +1,58 @@ +from typing import Dict, List, Sequence, Union + +from vllm.entrypoints.openai.protocol import (DeltaMessage, + ExtractedToolCallInformation) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer + +logger = init_logger(__name__) + + +class ToolParser: + """ + Abstract ToolParser class that should not be used directly. Provided + properties and methods should be used in + derived classes. 
+ """ + + def __init__(self, tokenizer: AnyTokenizer): + self.prev_tool_call_arr: List[Dict] = [] + # the index of the tool call that is currently being parsed + self.current_tool_id: int = -1 + self.current_tool_name_sent: bool = False + self.current_tool_initial_sent: bool = False + self.streamed_args_for_tool: List[str] = [] + + self.model_tokenizer = tokenizer + + def extract_tool_calls(self, + model_output: str) -> ExtractedToolCallInformation: + """ + Static method that should be implemented for extracting tool calls from + a complete model-generated string. + Used for non-streaming responses where we have the entire model response + available before sending to the client. + Static because it's stateless. + """ + raise NotImplementedError( + "AbstractToolParser.extract_tool_calls has not been implemented!") + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + """ + Instance method that should be implemented for extracting tool calls + from an incomplete response; for use when handling tool calls and + streaming. Has to be an instance method because it requires state - + the current tokens/diffs, but also the information about what has + previously been parsed and extracted (see constructor) + """ + raise NotImplementedError( + "AbstractToolParser.extract_tool_calls_streaming has not been " + "implemented!") diff --git a/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py new file mode 100644 index 0000000000000..7afbca7162edf --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/hermes_tool_parser.py @@ -0,0 +1,344 @@ +import json +import re +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + InitialDeltaToolCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser) +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer + +logger = init_logger(__name__) + + +class Hermes2ProToolParser(ToolParser): + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if isinstance(self.model_tokenizer, MistralTokenizer): + logger.error( + "Detected Mistral tokenizer when using a Hermes model") + self.model_tokenizer = self.model_tokenizer.tokenizer + + self.current_tool_name_sent: bool = False + self.prev_tool_call_arr: List[Dict] = [] + self.current_tool_id: int = -1 + self.current_tool_name_sent = False + self.current_tool_initial_sent: bool = False + self.streamed_args_for_tool: List[str] = [ + ] # map what has been streamed for each tool so far to a list + + self.tool_call_start_token: str = "" + self.tool_call_end_token: str = "" + + self.tool_call_regex = re.compile( + r"(.*?)|(.*)", re.DOTALL) + self.scratch_pad_regex = re.compile( + r"(.*?)", re.DOTALL) + + if not self.model_tokenizer: + raise ValueError( + "The model tokenizer must be passed to the ToolParser " + "constructor during construction.") + self.tool_call_start_token_id: int = self.model_tokenizer.vocab[ + 
self.tool_call_start_token] + self.tool_call_end_token_id: int = self.model_tokenizer.vocab[ + self.tool_call_end_token] + if not self.tool_call_start_token_id or not self.tool_call_end_token_id: + raise RuntimeError( + "Hermes 2 Pro Tool parser could not locate tool call start/end " + "tokens in the tokenizer!") + + def extract_tool_calls(self, + model_output: str) -> ExtractedToolCallInformation: + + # sanity check; avoid unnecessary processing + if self.tool_call_start_token not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + else: + + try: + # there are two possible captures - between tags, or between a + # tag and end-of-string so the result of + # findall is an array of tuples where one is a function call and + # the other is None + function_call_tuples = ( + self.tool_call_regex.findall(model_output)) + + # load the JSON, and then use it to build the Function and + # Tool Call + raw_function_calls = [ + json.loads(match[0] if match[0] else match[1]) + for match in function_call_tuples + ] + tool_calls = [ + ToolCall( + type="function", + function=FunctionCall( + name=function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(function_call["arguments"]))) + for function_call in raw_function_calls + ] + + content = model_output[:model_output. + find(self.tool_call_start_token)] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if content else None) + + except Exception as e: + logger.error("Error in extracting tool call from response %s", + e) + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + + logger.debug("delta_text: %s", delta_text) + logger.debug("delta_token_ids: %s", delta_token_ids) + # check to see if we should be streaming a tool call - is there a + if self.tool_call_start_token_id not in current_token_ids: + logger.debug("No tool call tokens found!") + return DeltaMessage(content=delta_text) + + try: + + # figure out where we are in the parsing by counting tool call + # start & end tags + prev_tool_start_count = previous_token_ids.count( + self.tool_call_start_token_id) + prev_tool_end_count = previous_token_ids.count( + self.tool_call_end_token_id) + cur_tool_start_count = current_token_ids.count( + self.tool_call_start_token_id) + cur_tool_end_count = current_token_ids.count( + self.tool_call_end_token_id) + + # case: if we're generating text, OR rounding out a tool call + if (cur_tool_start_count == cur_tool_end_count + and prev_tool_end_count == cur_tool_end_count): + logger.debug("Generating text content! skipping tool parsing.") + if delta_text != self.tool_call_end_token: + return DeltaMessage(content=delta_text) + + # case: if tool open & close tag counts don't match, we're doing + # imaginary "else" block here + # something with tools with this diff. + # flags for partial JSON parting. 
exported constants from + # "Allow" are handled via BIT MASK + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + + # case -- we're starting a new tool call + if (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count > prev_tool_start_count): + if len(delta_token_ids) > 1: + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + else: + tool_call_portion = None + delta = None + + text_portion = None + + # set cursors and state appropriately + self.current_tool_id += 1 + self.current_tool_name_sent = False + self.current_tool_initial_sent = False + self.streamed_args_for_tool.append("") + logger.debug("Starting on a new tool %s", self.current_tool_id) + + # case -- we're updating an existing tool call + elif (cur_tool_start_count > cur_tool_end_count + and cur_tool_start_count == prev_tool_start_count): + + # get the portion of the text that's the tool call + tool_call_portion = current_text.split( + self.tool_call_start_token)[-1] + text_portion = None + + # case -- the current tool call is being closed. + elif (cur_tool_start_count == cur_tool_end_count + and cur_tool_end_count > prev_tool_end_count): + diff = self.prev_tool_call_arr[self.current_tool_id].get( + "arguments") + if diff: + diff = json.dumps(diff).replace( + self.streamed_args_for_tool[self.current_tool_id], "") + logger.debug( + "Finishing tool and found diff that had not " + "been streamed yet: %s", diff) + self.streamed_args_for_tool[self.current_tool_id] \ + += diff + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump( + exclude_none=True)) + ]) + + # case -- otherwise we're just generating text + else: + text = delta_text.replace(self.tool_call_start_token, "") + text = text.replace(self.tool_call_end_token, "") + delta = DeltaMessage(tool_calls=[], content=text) + return delta + + try: + + current_tool_call = partial_json_parser.loads( + tool_call_portion or "{}", + flags) if tool_call_portion else None + logger.debug("Parsed tool call %s", current_tool_call) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # case - we haven't sent the initial delta with the tool call ID + # (it will be sent) + if not self.current_tool_initial_sent: + self.current_tool_initial_sent = True + return DeltaMessage(tool_calls=[ + InitialDeltaToolCall( + index=self.current_tool_id).model_dump( + exclude_none=True) + ]) + + # case - we haven't sent the tool name yet. If it's available, send + # it. otherwise, wait until it's available. + elif not self.current_tool_name_sent: + function_name: Union[str, None] = current_tool_call.get("name") + if function_name: + self.current_tool_name_sent = True + return DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + else: + return None + # case -- otherwise, send the tool call delta + + # if the tool call portion is None, send the delta as text + if tool_call_portion is None: + # if there's text but not tool calls, send that - + # otherwise None to skip chunk + delta = DeltaMessage(content=delta_text) \ + if text_portion is not None else None + return delta + + # now, the nitty-gritty of tool calls + # now we have the portion to parse as tool call. 
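
As context for the bitmask above: partial_json_parser completes whatever fragment it is given, and the Allow flags decide which value types may be left partial. A rough standalone illustration follows; the fragment is invented and the exact return values depend on the library, so treat the comments as intent rather than a specification.

```python
import partial_json_parser
from partial_json_parser.core.options import Allow

fragment = '{"name": "get_wea'  # invented, half-generated tool-call JSON

# While the tool name has not been fully emitted, incomplete strings are
# disallowed, so a half-generated function name is never surfaced.
print(partial_json_parser.loads(fragment, Allow.ALL & ~Allow.STR))

# After the name has been sent, Allow.ALL lets argument strings grow
# incrementally across chunks.
print(partial_json_parser.loads(fragment, Allow.ALL))
```
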
+ + logger.debug("Trying to parse current tool call with ID %s", + self.current_tool_id) + + # if we're starting a new tool call, push an empty object in as + # a placeholder for the arguments + if len(self.prev_tool_call_arr) <= self.current_tool_id: + self.prev_tool_call_arr.append({}) + + # main logic for tool parsing here - compare prev. partially-parsed + # JSON to the current partially-parsed JSON + prev_arguments = ( + self.prev_tool_call_arr[self.current_tool_id].get("arguments")) + cur_arguments = current_tool_call.get("arguments") + + logger.debug("diffing old arguments: %s", prev_arguments) + logger.debug("against new ones: %s", cur_arguments) + + # case -- no arguments have been created yet. skip sending a delta. + if not cur_arguments and not prev_arguments: + logger.debug("Skipping text %s - no arguments", delta_text) + delta = None + + # case -- prev arguments are defined, but non are now. + # probably impossible, but not a fatal error - just keep going + elif not cur_arguments and prev_arguments: + logger.error("should be impossible to have arguments reset " + "mid-call. skipping streaming anything.") + delta = None + + # case -- we now have the first info about arguments available from + # autocompleting the JSON + elif cur_arguments and not prev_arguments: + + cur_arguments_json = json.dumps(cur_arguments) + logger.debug("finding %s in %s", delta_text, + cur_arguments_json) + + # get the location where previous args differ from current + args_delta_start_loc = cur_arguments_json.index(delta_text) \ + + len(delta_text) + + # use that to find the actual delta + arguments_delta = cur_arguments_json[:args_delta_start_loc] + logger.debug("First tokens in arguments received: %s", + arguments_delta) + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[self.current_tool_id] \ + += arguments_delta + + # last case -- we have an update to existing arguments. + elif cur_arguments and prev_arguments: + + cur_args_json = json.dumps(cur_arguments) + prev_args_json = json.dumps(prev_arguments) + logger.debug("Searching for diff between\n%s", cur_args_json) + logger.debug("and\n%s", prev_args_json) + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json) + logger.debug("got argument diff %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[self.current_tool_id] \ + += argument_diff + + # handle saving the state for the current tool into + # the "prev" list for use in diffing for the next iteration + if self.current_tool_id == len(self.prev_tool_call_arr) - 1: + self.prev_tool_call_arr[self.current_tool_id] = \ + current_tool_call + else: + self.prev_tool_call_arr.append(current_tool_call) + + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + return None # do not stream a delta. skip this token ID. 
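
The streaming logic above (and the Mistral parser that follows) leans on the string-diff helpers added in vllm/entrypoints/openai/tool_parsers/utils.py later in this diff to work out which argument characters still need to be sent. A short usage sketch: the first call is the helper's own docstring example, the second uses invented argument JSON.

```python
from vllm.entrypoints.openai.tool_parsers.utils import extract_intermediate_diff

# Docstring example: new partially-parsed JSON vs. the previous snapshot.
print(extract_intermediate_diff('{"fruit": "apple"}', '{"fruit": "ap"}'))
# -> 'ple'

# Invented arguments JSON: only the middle portion that has not been
# streamed yet is returned.
print(extract_intermediate_diff('{"city": "Berlin", "units": "me"}',
                                '{"city": "Berlin"}'))
# -> ', "units": "me'
```
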
diff --git a/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py new file mode 100644 index 0000000000000..d48770c792e98 --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/mistral_tool_parser.py @@ -0,0 +1,293 @@ +import json +import re +from typing import Dict, List, Sequence, Union + +import partial_json_parser +from partial_json_parser.core.options import Allow + +from vllm.entrypoints.openai.protocol import (DeltaFunctionCall, DeltaMessage, + DeltaToolCall, + ExtractedToolCallInformation, + FunctionCall, + InitialDeltaToolCall, ToolCall) +from vllm.entrypoints.openai.tool_parsers.abstract_tool_parser import ( + ToolParser) +from vllm.entrypoints.openai.tool_parsers.utils import ( + extract_intermediate_diff) +from vllm.logger import init_logger +from vllm.transformers_utils.tokenizer import AnyTokenizer, MistralTokenizer + +logger = init_logger(__name__) + + +class MistralToolParser(ToolParser): + """ + Tool call parser for Mistral 7B Instruct v0.3, intended for use with the + examples/tool_chat_template_mistral.jinja template. + + Used when --enable-auto-tool-choice --tool-call-parser gmistral are all set + """ + + def __init__(self, tokenizer: AnyTokenizer): + super().__init__(tokenizer) + + if isinstance(self.model_tokenizer, MistralTokenizer): + self.model_tokenizer = self.model_tokenizer.tokenizer + else: + logger.info("Non-Mistral tokenizer detected when using a Mistral " + "model...") + + # initialize properties used for state when parsing tool calls in + # streaming mode + self.prev_tool_call_arr: List[Dict] = [] + self.current_tool_id: int = -1 + self.current_tool_name_sent: bool = False + self.current_tool_initial_sent: bool = False + self.streamed_args_for_tool: List[str] = [ + ] # map what has been streamed for each tool so far to a list + self.bot_token = "[TOOL_CALLS]" + self.bot_token_id = self.model_tokenizer.vocab[self.bot_token] + self.tool_call_regex = re.compile(r"\[{.*?}\]", re.DOTALL) + + def extract_tool_calls(self, + model_output: str) -> ExtractedToolCallInformation: + """ + Extract the tool calls from a complete model response. Requires + find-and-replacing single quotes with double quotes for JSON parsing, + make sure your tool call arguments don't ever include quotes! + """ + + # case -- if a tool call token is not present, return a text response + if self.bot_token not in model_output: + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + try: + + # use a regex to find the tool call. 
remove the BOT token + # and make sure to replace single quotes with double quotes + raw_tool_call = self.tool_call_regex.findall( + model_output.replace(self.bot_token, ""))[0] + + # load the JSON, and then use it to build the Function and + # Tool Call + function_call_arr = json.loads(raw_tool_call) + tool_calls: List[ToolCall] = [ + ToolCall( + type="function", + function=FunctionCall( + name=raw_function_call["name"], + # function call args are JSON but as a string + arguments=json.dumps(raw_function_call["arguments"]))) + for raw_function_call in function_call_arr + ] + + # get any content before the tool call + content = model_output.split(self.bot_token)[0] + return ExtractedToolCallInformation( + tools_called=True, + tool_calls=tool_calls, + content=content if len(content) > 0 else None) + + except Exception as e: + logger.error("Error in extracting tool call from response: %s", e) + print("ERROR", e) + # return information to just treat the tool call as regular JSON + return ExtractedToolCallInformation(tools_called=False, + tool_calls=[], + content=model_output) + + def extract_tool_calls_streaming( + self, + previous_text: str, + current_text: str, + delta_text: str, + previous_token_ids: Sequence[int], + current_token_ids: Sequence[int], + delta_token_ids: Sequence[int], + ) -> Union[DeltaMessage, None]: + + # if the tool call token is not in the tokens generated so far, append + # output to contents since it's not a tool + if self.bot_token_id not in current_token_ids: + return DeltaMessage(content=delta_text) + + # if the tool call token ID IS in the tokens generated so far, that + # means we're parsing as tool calls now + + # handle if we detected the BOT token which means the start of tool + # calling + if (self.bot_token_id in delta_token_ids + and len(delta_token_ids) == 1): + # if it's the only token, return None, so we don't send a chat + # completion any don't send a control token + return None + + # bit mask flags for partial JSON parsing. If the name hasn't been + # sent yet, don't allow sending + # an incomplete string since OpenAI only ever (as far as I have + # seen) allows sending the entire tool/ function name at once. + flags = Allow.ALL if self.current_tool_name_sent \ + else Allow.ALL & ~Allow.STR + try: + + # replace BOT token with empty string, and convert single quotes + # to double to allow parsing as JSON since mistral uses single + # quotes instead of double for tool calls + parsable_arr = current_text.split(self.bot_token)[1] + + # tool calls are generated in an array, so do partial JSON + # parsing on the entire array + try: + tool_call_arr: List[Dict] = partial_json_parser.loads( + parsable_arr, flags) + except partial_json_parser.core.exceptions.MalformedJSON: + logger.debug('not enough tokens to parse into JSON yet') + return None + + # select as the current tool call the one we're on the state at + + current_tool_call: Dict = tool_call_arr[self.current_tool_id] \ + if len(tool_call_arr) > 0 else {} + + # case -- if no tokens have been streamed for the tool, e.g. + # only the array brackets, stream nothing + if len(tool_call_arr) == 0: + return None + + # case: we are starting a new tool in the array + # -> array has > 0 length AND length has moved past cursor + elif (len(tool_call_arr) > 0 + and len(tool_call_arr) > self.current_tool_id + 1): + + # if we're moving on to a new call, first make sure we + # haven't missed anything in the previous one that was + # auto-generated due to JSON completions, but wasn't + # streamed to the client yet. 
+ if self.current_tool_id >= 0: + diff: Union[str, None] = current_tool_call.get("arguments") + + if diff: + diff = json.dumps(diff).replace( + self.streamed_args_for_tool[self.current_tool_id], + "") + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += diff + else: + delta = None + else: + delta = None + # re-set stuff pertaining to progress in the current tool + self.current_tool_id = len(tool_call_arr) - 1 + self.current_tool_name_sent = False + self.current_tool_initial_sent = False + self.streamed_args_for_tool.append("") + logger.debug("starting on new tool %d", self.current_tool_id) + return delta + + # case: update an existing tool - this is handled below + + # if the current tool initial data incl. the id, type=function + # and idx not sent, send that + if not self.current_tool_initial_sent: + self.current_tool_initial_sent = True + delta = DeltaMessage(tool_calls=[ + InitialDeltaToolCall( + index=self.current_tool_id).model_dump( + exclude_none=True) + ]) + + # if the current tool name hasn't been sent, send if available + # - otherwise send nothing + elif not self.current_tool_name_sent: + function_name = current_tool_call.get("name") + if function_name: + + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + name=function_name).model_dump( + exclude_none=True)) + ]) + self.current_tool_name_sent = True + else: + delta = None + + # now we know we're on the same tool call and we're streaming + # arguments + else: + + prev_arguments = self.prev_tool_call_arr[ + self.current_tool_id].get("arguments") + cur_arguments = current_tool_call.get("arguments") + + new_text = delta_text.replace("\'", "\"") + + if not cur_arguments and not prev_arguments: + + delta = None + elif not cur_arguments and prev_arguments: + logger.error( + "INVARIANT - impossible to have arguments reset " + "mid-arguments") + delta = None + elif cur_arguments and not prev_arguments: + cur_arguments_json = json.dumps(cur_arguments) + logger.debug("finding %s in %s", new_text, + cur_arguments_json) + + arguments_delta = cur_arguments_json[:cur_arguments_json. + index(new_text) + + len(new_text)] + logger.debug("First tokens in arguments received: %s", + arguments_delta) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=arguments_delta). + model_dump(exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += arguments_delta + + elif cur_arguments and prev_arguments: + cur_args_json = json.dumps(cur_arguments) + prev_args_json = json.dumps(prev_arguments) + logger.debug("Searching for diff between \n%s\n%s", + cur_args_json, prev_args_json) + + argument_diff = extract_intermediate_diff( + cur_args_json, prev_args_json) + logger.debug("got arguments diff: %s", argument_diff) + delta = DeltaMessage(tool_calls=[ + DeltaToolCall(index=self.current_tool_id, + function=DeltaFunctionCall( + arguments=argument_diff).model_dump( + exclude_none=True)) + ]) + self.streamed_args_for_tool[ + self.current_tool_id] += argument_diff + else: + # try parsing it with regular JSON - if it works we're + # at the end, and we need to send the difference between + # tokens streamed so far and the valid JSON + delta = None + + # check to see if the name is defined and has been sent. 
if so, + # stream the name - otherwise keep waiting + # finish by setting old and returning None as base case + self.prev_tool_call_arr = tool_call_arr + return delta + + except Exception as e: + logger.error("Error trying to handle streaming tool call: %s", e) + logger.debug( + "Skipping chunk as a result of tool streaming extraction " + "error") + return None diff --git a/vllm/entrypoints/openai/tool_parsers/utils.py b/vllm/entrypoints/openai/tool_parsers/utils.py new file mode 100644 index 0000000000000..db7fc5259fc4e --- /dev/null +++ b/vllm/entrypoints/openai/tool_parsers/utils.py @@ -0,0 +1,87 @@ +def find_common_prefix(s1: str, s2: str) -> str: + """ + Finds a common prefix that is shared between two strings, if there is one. + Order of arguments is NOT important. + + This function is provided as a UTILITY for extracting information from JSON + generated by partial_json_parser, to help in ensuring that the right tokens + are returned in streaming, so that close-quotes, close-brackets and + close-braces are not returned prematurely. + + e.g. find_common_prefix('{"fruit": "ap"}', '{"fruit": "apple"}') -> + '{"fruit": "ap' + """ + prefix = '' + min_length = min(len(s1), len(s2)) + for i in range(0, min_length): + if s1[i] == s2[i]: + prefix += s1[i] + else: + break + return prefix + + +def find_common_suffix(s1: str, s2: str) -> str: + """ + Finds a common suffix shared between two strings, if there is one. Order of + arguments is NOT important. + Stops when the suffix ends OR it hits an alphanumeric character + + e.g. find_common_suffix('{"fruit": "ap"}', '{"fruit": "apple"}') -> '"}' + """ + suffix = '' + min_length = min(len(s1), len(s2)) + for i in range(1, min_length + 1): + if s1[-i] == s2[-i] and not s1[-i].isalnum(): + suffix = s1[-i] + suffix + else: + break + return suffix + + +def extract_intermediate_diff(curr: str, old: str) -> str: + """ + Given two strings, extract the difference in the middle between two strings + that are known to have a common prefix and/or suffix. + + This function is provided as a UTILITY for extracting information from JSON + generated by partial_json_parser, to help in ensuring that the right tokens + are returned in streaming, so that close-quotes, close-brackets and + close-braces are not returned prematurely. The order of arguments IS + important - the new version of the partially-parsed JSON must be the first + argument, and the secnod argument must be from the previous generation. + + What it returns, is tokens that should be streamed to the client. + + e.g. extract_intermediate_diff('{"fruit": "apple"}', '{"fruit": "ap"}') + -> 'ple' + + """ + suffix = find_common_suffix(curr, old) + + old = old[::-1].replace(suffix[::-1], '', 1)[::-1] + prefix = find_common_prefix(curr, old) + diff = curr + if len(suffix): + diff = diff[::-1].replace(suffix[::-1], '', 1)[::-1] + + if len(prefix): + # replace the prefix only once in case it's mirrored + diff = diff.replace(prefix, '', 1) + + return diff + + +def find_all_indices(string, substring): + """ + Find all (starting) indices of a substring in a given string. 
Useful for + tool call extraction + """ + indices = [] + index = -1 + while True: + index = string.find(substring, index + 1) + if index == -1: + break + indices.append(index) + return indices diff --git a/vllm/model_executor/guided_decoding/__init__.py b/vllm/model_executor/guided_decoding/__init__.py index f9fcdead980a2..7161e83952a3d 100644 --- a/vllm/model_executor/guided_decoding/__init__.py +++ b/vllm/model_executor/guided_decoding/__init__.py @@ -59,8 +59,9 @@ def _adapt_request_for_tool_use(request: Union[CompletionRequest, if type(request) is CompletionRequest: return request - # user has chosen to not use any tool - if request.tool_choice == "none": + # user has chosen to not use any tool, + # OR is allowing the model to choose a tool. + if request.tool_choice == "none" or request.tool_choice == "auto": return request # user has chosen to use a named tool diff --git a/vllm/model_executor/guided_decoding/outlines_decoding.py b/vllm/model_executor/guided_decoding/outlines_decoding.py index bfc658ef7d26b..e1f5b380120c5 100644 --- a/vllm/model_executor/guided_decoding/outlines_decoding.py +++ b/vllm/model_executor/guided_decoding/outlines_decoding.py @@ -8,8 +8,9 @@ from pydantic import BaseModel from transformers import PreTrainedTokenizerBase -from vllm.entrypoints.openai.protocol import (ChatCompletionRequest, - CompletionRequest) +from vllm.entrypoints.openai.protocol import ( + ChatCompletionNamedToolChoiceParam, ChatCompletionRequest, + CompletionRequest) from vllm.model_executor.guided_decoding.guided_fields import ( GuidedDecodingRequest) from vllm.model_executor.guided_decoding.outlines_logits_processors import ( @@ -101,16 +102,30 @@ def _get_guide_and_mode( request: Union[CompletionRequest, ChatCompletionRequest, GuidedDecodingRequest] ) -> Union[Tuple[str, GuidedDecodingMode], Tuple[None, None]]: + # if the request is a chat completion request, AND the tool choice is a + # named tool choice, do guided decoding + # using that tool as the JSON schema + if isinstance(request, ChatCompletionRequest) and isinstance( + request.tool_choice, ChatCompletionNamedToolChoiceParam): + # Guided generation for tools/functions parameters + if request.tool_choice.type == "function": + for tool in request.tools: + if (tool.type == "function" and tool.function.name + == request.tool_choice.function.name): + json = json_dumps(tool.function.parameters, sort_keys=True) + return json, GuidedDecodingMode.JSON + return None, None - if request.guided_json: - json = request.guided_json - if isinstance(json, dict): + elif request.guided_json: + if isinstance(request.guided_json, dict): # turn dict into hashable string - json = json_dumps(json) - elif isinstance(json, BaseModel): + json = json_dumps(request.guided_json) + elif isinstance(request.guided_json, BaseModel): # use pydantic signature so that different model classes # with the same fields will get hashed the same - json = str(json.__signature__) + json = str(request.guided_json.__signature__) + else: + json = request.guided_json return json, GuidedDecodingMode.JSON elif request.guided_regex: return request.guided_regex, GuidedDecodingMode.REGEX diff --git a/vllm/model_executor/layers/linear.py b/vllm/model_executor/layers/linear.py index cae1c5fc8571f..5de66bf209d20 100644 --- a/vllm/model_executor/layers/linear.py +++ b/vllm/model_executor/layers/linear.py @@ -26,7 +26,7 @@ "CompressedTensorsLinearMethod", "AWQMarlinLinearMethod", "AWQLinearMethod", "GPTQMarlinLinearMethod", "Fp8LinearMethod", "MarlinLinearMethod", "QQQLinearMethod", 
"GPTQMarlin24LinearMethod", - "TPUInt8LinearMethod", "GPTQLinearMethod" + "TPUInt8LinearMethod", "GPTQLinearMethod", "FBGEMMFp8LinearMethod" ] diff --git a/vllm/model_executor/layers/quantization/__init__.py b/vllm/model_executor/layers/quantization/__init__.py index 95b160f4287f9..c6fb6ca0d2e01 100644 --- a/vllm/model_executor/layers/quantization/__init__.py +++ b/vllm/model_executor/layers/quantization/__init__.py @@ -22,6 +22,8 @@ from vllm.model_executor.layers.quantization.gptq_marlin_24 import ( GPTQMarlin24Config) from vllm.model_executor.layers.quantization.marlin import MarlinConfig +from vllm.model_executor.layers.quantization.neuron_quant import ( + NeuronQuantConfig) from vllm.model_executor.layers.quantization.qqq import QQQConfig from vllm.model_executor.layers.quantization.squeezellm import SqueezeLLMConfig from vllm.model_executor.layers.quantization.tpu_int8 import Int8TpuConfig @@ -46,6 +48,7 @@ "bitsandbytes": BitsAndBytesConfig, "qqq": QQQConfig, "experts_int8": ExpertsInt8Config, + "neuron_quant": NeuronQuantConfig, } diff --git a/vllm/model_executor/layers/quantization/fbgemm_fp8.py b/vllm/model_executor/layers/quantization/fbgemm_fp8.py index 4aaa02e5e3972..0b1f6ff685200 100644 --- a/vllm/model_executor/layers/quantization/fbgemm_fp8.py +++ b/vllm/model_executor/layers/quantization/fbgemm_fp8.py @@ -15,9 +15,9 @@ from vllm.model_executor.layers.quantization.utils.quant_utils import ( is_layer_skipped) from vllm.model_executor.layers.quantization.utils.w8a8_utils import ( - apply_fp8_linear, create_per_channel_scale_param, - normalize_e4m3fn_to_e4m3fnuz) -from vllm.model_executor.utils import set_weight_attrs + apply_fp8_linear, normalize_e4m3fn_to_e4m3fnuz) +from vllm.model_executor.parameter import (ChannelQuantScaleParameter, + ModelWeightParameter) from vllm.platforms import current_platform from vllm.utils import is_hip @@ -87,6 +87,7 @@ def create_weights( params_dtype: torch.dtype, **extra_weight_attrs, ): + weight_loader = extra_weight_attrs.get("weight_loader") del input_size, output_size output_size_per_partition = sum(output_partition_sizes) @@ -97,20 +98,21 @@ def create_weights( layer.orig_dtype = params_dtype # WEIGHT - weight = Parameter(torch.empty(output_size_per_partition, - input_size_per_partition, - dtype=torch.float8_e4m3fn), - requires_grad=False) + weight = ModelWeightParameter(data=torch.empty( + output_size_per_partition, + input_size_per_partition, + dtype=torch.float8_e4m3fn), + input_dim=1, + output_dim=0, + weight_loader=weight_loader) layer.register_parameter("weight", weight) - set_weight_attrs(weight, { - "input_dim": 1, - "output_dim": 0, - **extra_weight_attrs, - }) # WEIGHT SCALE - weight_scale = create_per_channel_scale_param(output_partition_sizes, - **extra_weight_attrs) + weight_scale = ChannelQuantScaleParameter(data=torch.empty( + (sum(output_partition_sizes), 1), dtype=torch.float32), + output_dim=0, + weight_loader=weight_loader) + weight_scale[:] = torch.finfo(torch.float32).min layer.register_parameter("weight_scale", weight_scale) # INPUT SCALE UPPER BOUND @@ -120,6 +122,11 @@ def create_weights( layer.input_scale_ub = input_scale_ub def process_weights_after_loading(self, layer: Module) -> None: + # required by torch.compile + layer.weight_scale = Parameter(layer.weight_scale.data, + requires_grad=False) + layer.weight = Parameter(layer.weight.data, requires_grad=False) + weight = layer.weight if is_hip(): diff --git a/vllm/model_executor/layers/quantization/gptq_marlin.py 
b/vllm/model_executor/layers/quantization/gptq_marlin.py index 94eb3f301541a..b06ff7bd2bace 100644 --- a/vllm/model_executor/layers/quantization/gptq_marlin.py +++ b/vllm/model_executor/layers/quantization/gptq_marlin.py @@ -51,10 +51,6 @@ def __init__(self, weight_bits: int, group_size: int, desc_act: bool, self.quant_type = self.TYPE_MAP[(weight_bits, is_sym)] - # Verify supported on platform. - verify_marlin_supported(quant_type=self.quant_type, - group_size=self.group_size) - def __repr__(self) -> str: return (f"GPTQMarlinConfig(quant_type={self.quant_type}, " f"group_size={self.group_size}, " @@ -153,6 +149,10 @@ class GPTQMarlinLinearMethod(LinearMethodBase): def __init__(self, quant_config: GPTQMarlinConfig) -> None: self.quant_config = quant_config + # Verify supported on platform. + verify_marlin_supported(quant_type=self.quant_config.quant_type, + group_size=self.quant_config.group_size) + def create_weights( self, layer: torch.nn.Module, diff --git a/vllm/model_executor/layers/quantization/neuron_quant.py b/vllm/model_executor/layers/quantization/neuron_quant.py new file mode 100644 index 0000000000000..2624981f6a614 --- /dev/null +++ b/vllm/model_executor/layers/quantization/neuron_quant.py @@ -0,0 +1,67 @@ +import os +from importlib.util import find_spec +from typing import Any, Dict, List, Optional + +from torch.nn import Module + +from vllm.model_executor.layers.quantization.base_config import ( + QuantizationConfig) + +SUPPORTED_QUANT_DTYPE_LIST = ['s8', 'f8e4m3fn'] + + +class NeuronQuantConfig(QuantizationConfig): + """Int8 Quantization Config class for Neuron Backend.""" + + def __init__( + self, + dequant_dtype: str = "f16", + quantize_method: str = "vector_dynamic", + ) -> None: + self.quant_dtype = os.getenv("NEURON_QUANT_DTYPE", "s8") + if self.quant_dtype not in SUPPORTED_QUANT_DTYPE_LIST: + raise ValueError( + f"Neuron quantization datatype {self.quant_dtype} is not valid," + f"the quantization datatype should match one of the below types" + f"{SUPPORTED_QUANT_DTYPE_LIST}") + self.dequant_dtype = dequant_dtype + self.quantize_method = quantize_method + + def get_name(self) -> str: + return "neuron_quant" + + def get_supported_act_dtypes(self) -> List[str]: + return SUPPORTED_QUANT_DTYPE_LIST + + @classmethod + def get_min_capability(cls) -> int: + raise NotImplementedError( + "This function should not be called with Neuron Backend") + + @staticmethod + def get_config_filenames() -> List[str]: + return [] + + @classmethod + def from_config(cls, config: Dict[str, Any]) -> "NeuronQuantConfig": + quantize_method = cls.get_from_keys(config, ["quantize_method"]) + dequant_dtype = cls.get_from_keys(config, ["dequant_dtype"]) + return cls(dequant_dtype=dequant_dtype, + quantize_method=quantize_method) + + def get_quant_method(self, layer: Module, prefix: str) -> Optional[Any]: + if find_spec("transformers_neuronx") is not None: + return self.get_quantization_config() + else: + raise NotImplementedError( + "Neuron Quantization is only supported through" + " transformers_neuronx.") + + def get_scaled_act_names(self) -> List[str]: + return [] + + def get_quantization_config(self): + from transformers_neuronx.config import QuantizationConfig + return QuantizationConfig(quant_dtype=self.quant_dtype, + dequant_dtype=self.dequant_dtype, + quantize_method=self.quantize_method) diff --git a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py index dbab632d57549..f04db27902dbd 100644 --- 
a/vllm/model_executor/layers/quantization/utils/w8a8_utils.py +++ b/vllm/model_executor/layers/quantization/utils/w8a8_utils.py @@ -1,10 +1,8 @@ from typing import List, Optional, Tuple, Union import torch -from torch.nn import Parameter from vllm import _custom_ops as ops -from vllm.model_executor.utils import set_weight_attrs from vllm.platforms import current_platform from vllm.utils import is_hip @@ -38,31 +36,6 @@ def all_close_1d(x: torch.Tensor) -> bool: return all(torch.allclose(x[0], x[i]) for i in range(x.shape[0])) -def create_per_tensor_scale_param( - output_partition_sizes: List[int], - **extra_weight_attrs, -) -> Parameter: - scale = Parameter(torch.empty(len(output_partition_sizes), - dtype=torch.float32), - requires_grad=False) - scale[:] = torch.finfo(torch.float32).min - set_weight_attrs(scale, { - "needs_scalar_to_array": True, - **extra_weight_attrs - }) - return scale - - -def create_per_channel_scale_param(output_partition_sizes: List[int], - **extra_weight_attrs) -> Parameter: - scale = Parameter(torch.empty((sum(output_partition_sizes), 1), - dtype=torch.float32), - requires_grad=False) - scale[:] = torch.finfo(torch.float32).min - set_weight_attrs(scale, {"output_dim": 0, **extra_weight_attrs}) - return scale - - def convert_to_channelwise( weight_scale: torch.Tensor, logical_widths: List[int]) -> Tuple[torch.Tensor, torch.Tensor]: diff --git a/vllm/model_executor/layers/resampler.py b/vllm/model_executor/layers/resampler.py new file mode 100644 index 0000000000000..8cd938fc85fb2 --- /dev/null +++ b/vllm/model_executor/layers/resampler.py @@ -0,0 +1,273 @@ +# coding=utf-8 +# Adapted from +# https://github.com/huggingface/transformers/blob/v4.28.0/src/transformers/models/llama/modeling_llama.py +# https://huggingface.co/Qwen/Qwen-7B/blob/main/modeling_qwen.py +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +# +# Copyright 2023 The Qwen team. +# Copyright 2023 The vLLM team. +# Copyright 2022 EleutherAI and the HuggingFace Inc. team. All rights reserved. +# +# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX +# and OPT implementations in this library. It has been modified from its +# original forms to accommodate minor architectural differences compared +# to GPT-NeoX and OPT used by the Meta AI team that trained the model. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" +Shared resampler perceiver network used in multimodal models and +related helpers for sincos positional embeddings. 
+ +Example models: Qwen (Qwen-VL), Minicpmv2.0 +""" +import math +from functools import partial +from typing import Callable, Optional, Tuple, Union + +import numpy as np +import torch +import torch.nn.functional as F +from torch import nn +from torch.nn.init import trunc_normal_ + +from vllm.model_executor.layers.linear import ReplicatedLinear + +DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6) + + +def get_abs_pos(abs_pos: torch.Tensor, tgt_size: Union[torch.Tensor, + int]) -> torch.Tensor: + # abs_pos: L, C + # tgt_size: (H, W) + # return: M, C + src_size = int(math.sqrt(abs_pos.size(0))) + dtype = abs_pos.dtype + if isinstance(tgt_size, int): + tgt_size = (tgt_size, tgt_size) + if (src_size == tgt_size[0] and src_size == tgt_size[1]): + return abs_pos + return (F.interpolate( + abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), + size=(tgt_size[0], tgt_size[1]), + mode="bicubic", + align_corners=False, + ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)) + + +# sin/cos positional embedding helpers are adapted from: +# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 +def get_1d_sincos_pos_embed_from_grid( + embed_dim: int, pos: np.ndarray, + version: Tuple[int, int] = (2, 0)) -> torch.Tensor: + """ + embed_dim: output dimension for each position + pos: a list of positions to be encoded: size (M,) / (H, W) + out: (M, D) / (H, W, D) + """ + assert embed_dim % 2 == 0 + omega = np.arange(embed_dim // 2, dtype=np.float32) + omega /= embed_dim / 2.0 + omega = 1.0 / 10000**omega # (D/2,) + + if version == (2, 0): + pos = pos.reshape(-1) # (M,) + out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product + emb_sin = np.sin(out) # (M, D/2) + emb_cos = np.cos(out) # (M, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) + else: + out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product + emb_sin = np.sin(out) # (H, W, D/2) + emb_cos = np.cos(out) # (H, W, D/2) + emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) + return emb + + +def get_2d_sincos_pos_embed_from_grid( + embed_dim: int, grid: np.ndarray, + version: Tuple[int, int] = (2, 0)) -> torch.Tensor: + assert embed_dim % 2 == 0 + + # use half of dimensions to encode grid_h + emb_h = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[0], version) # (H*W, D/2) or (H, W, D/2) + emb_w = get_1d_sincos_pos_embed_from_grid( + embed_dim // 2, grid[1], version) # (H*W, D/2) or (H, W, D/2) + + if version == (2, 0): + emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) + else: + emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) + return emb + + +def get_2d_sincos_pos_embed( + embed_dim: int, + grid_size: Union[int, Tuple[int, int]], + cls_token: bool = False, + version: Tuple[int, int] = (2, 0), +) -> torch.Tensor: + """ + grid_size: int of the grid height and width + return: + pos_embed: [grid_size*grid_size, embed_dim] or + [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) + """ + if isinstance(grid_size, int): + grid_h_size, grid_w_size = grid_size, grid_size + else: + grid_h_size, grid_w_size = grid_size[0], grid_size[1] + + grid_h = np.arange(grid_h_size, dtype=np.float32) + grid_w = np.arange(grid_w_size, dtype=np.float32) + grid = np.meshgrid(grid_w, grid_h) # here w goes first + grid = np.stack(grid, axis=0) + assert isinstance(grid, np.ndarray) and \ + grid.shape == (2, grid_h_size, grid_w_size) + + if version == (2, 0): + grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) + pos_embed = 
get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + if cls_token: + pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], + axis=0) + else: + pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) + return pos_embed + + +class BaseResampler(nn.Module): + """ + A 2D perceiver-resampler network with one cross attention layers by + (grid_size**2) learnable queries and 2d sincos pos_emb. + Outputs: + A tensor with the shape of (grid_size**2, embed_dim) + """ + + def __init__( + self, + num_queries: int, + embed_dim: int, + num_heads: int, + kv_dim: Optional[int] = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + do_post_projection: bool = True, + ) -> None: + super().__init__() + + self.num_queries = num_queries + self.embed_dim = embed_dim + self.num_heads = num_heads + + self.query = nn.Parameter(torch.zeros(self.num_queries, embed_dim)) + trunc_normal_(self.query, std=0.02) + if kv_dim is not None and kv_dim != embed_dim: + self.kv_proj = ReplicatedLinear(kv_dim, embed_dim, bias=False) + else: + # Maintain the same return value with ReplicatedLinear.forward + self.kv_proj = lambda *args, **kwargs: ( # type: ignore # noqa + nn.Identity()(*args, **kwargs), + None, + ) + self.attn = nn.MultiheadAttention(embed_dim, num_heads) + self.ln_q = norm_layer(embed_dim) + self.ln_kv = norm_layer(embed_dim) + self.do_post_projection = do_post_projection + self.ln_post = norm_layer(embed_dim) if do_post_projection else None + self.proj = nn.Parameter( + (embed_dim**-0.5) * + torch.randn(embed_dim, embed_dim)) if do_post_projection else None + + def _init_weights(self, m: nn.Module) -> None: + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + def _repeat(self, query, N: int): + return query.unsqueeze(1).repeat(1, N, 1) + + +class Resampler2(BaseResampler): + """Resampler-perceiver network to be used for a variety of model types, + e.g., Qwen-vl / Minicpmv 2.0. The main difference is the addition of the + do_post_projection arg, which indicates whether or not there should be + a post layer normalization and projector after the attention. This is + present in minicpmv2.0, but not qwen-vl. 
+ """ + + def __init__( + self, + grid_size: int, + embed_dim: int, + num_heads: int, + kv_dim: Optional[int] = None, + norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, + adaptive: bool = False, + do_post_projection: bool = True, + ) -> None: + super().__init__(grid_size**2, + embed_dim, + num_heads, + kv_dim, + norm_layer, + do_post_projection=do_post_projection) + + self.adaptive = adaptive + pos_embed_arr = get_2d_sincos_pos_embed(embed_dim, + grid_size, + version=(2, 0)) + + self.pos_embed = nn.Parameter( + torch.from_numpy(pos_embed_arr).requires_grad_(False)) + + self.apply(self._init_weights) + + def forward( + self, + x: torch.Tensor, + tgt_sizes: Optional[torch.Tensor] = None, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + if tgt_sizes is None: + tgt_sizes = int(math.sqrt(x.size(1))) + if self.adaptive: + pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim, + tgt_sizes, + version=(2, 0)) + pos_embed = torch.from_numpy(pos_embed_arr).to(device=x.device, + dtype=x.dtype) + else: + pos_embed = get_abs_pos(self.pos_embed, + tgt_sizes).to(device=x.device, + dtype=x.dtype) + + x, _ = self.kv_proj(x) + x = self.ln_kv(x).permute(1, 0, 2) + + N = x.shape[1] + q = self.ln_q(self.query) + out = self.attn( + self._repeat(q, N) + self.pos_embed.unsqueeze(1), + x + pos_embed.unsqueeze(1), + x, + attn_mask=attn_mask, + )[0] + x = out.permute(1, 0, 2) + if self.do_post_projection: + x = self.ln_post(x) + x = x @ self.proj + return x diff --git a/vllm/model_executor/layers/rotary_embedding.py b/vllm/model_executor/layers/rotary_embedding.py index c5a0278e485d4..d323f6cc432a2 100644 --- a/vllm/model_executor/layers/rotary_embedding.py +++ b/vllm/model_executor/layers/rotary_embedding.py @@ -28,7 +28,6 @@ import torch.nn as nn from vllm.model_executor.custom_op import CustomOp -from vllm.platforms import current_platform def _rotate_neox(x: torch.Tensor) -> torch.Tensor: @@ -48,21 +47,29 @@ def _apply_rotary_emb( x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, + is_neox_style: bool, ) -> torch.Tensor: """ Args: x: [num_tokens, num_heads, head_size] cos: [num_tokens, head_size // 2] sin: [num_tokens, head_size // 2] + is_neox_style: Whether to use the Neox-style or GPT-J-style rotary + positional embeddings. """ - orig_dtype = x.dtype - x = x.float() - x1, x2 = torch.chunk(x, 2, dim=-1) - cos = cos.unsqueeze(-2) - sin = sin.unsqueeze(-2) + cos = cos.unsqueeze(-2).to(x.dtype) + sin = sin.unsqueeze(-2).to(x.dtype) + if is_neox_style: + x1, x2 = torch.chunk(x, 2, dim=-1) + else: + x1 = x[..., ::2] + x2 = x[..., 1::2] o1 = x1 * cos - x2 * sin o2 = x2 * cos + x1 * sin - return torch.cat((o1, o2), dim=-1).to(orig_dtype) + if is_neox_style: + return torch.cat((o1, o2), dim=-1) + else: + return torch.stack((o1, o2), dim=-1).flatten(-2) class RotaryEmbedding(CustomOp): @@ -87,10 +94,9 @@ def __init__( cache = self._compute_cos_sin_cache() cache = cache.to(dtype) + self.cos_sin_cache: torch.Tensor self.register_buffer("cos_sin_cache", cache, persistent=False) - self.use_native2 = current_platform.is_tpu() and is_neox_style - def _compute_inv_freq(self, base: Union[int, float]) -> torch.Tensor: """Compute the inverse frequency.""" # NOTE(woosuk): To exactly match the HF implementation, we need to @@ -119,59 +125,7 @@ def forward_native( key: torch.Tensor, offsets: Optional[torch.Tensor] = None, ) -> Tuple[torch.Tensor, torch.Tensor]: - """A PyTorch-native implementation equivalent to forward(). 
- - This method mimics the implementation of the custom CUDA kernel - used in `forward_cuda()`. - """ - query = query.view(*query.shape[:-1], -1, self.head_size) - key = key.view(*key.shape[:-1], -1, self.head_size) - - query_rot = query[..., :self.rotary_dim] - key_rot = key[..., :self.rotary_dim] - if self.rotary_dim < self.head_size: - query_pass = query[..., self.rotary_dim:] - key_pass = key[..., self.rotary_dim:] - - self.cos_sin_cache: torch.Tensor = self.cos_sin_cache.to( - positions.device, dtype=query.dtype) - cos_sin = self.cos_sin_cache[torch.add(positions, offsets) - if offsets is not None else positions] - cos, sin = cos_sin.chunk(2, dim=-1) - if self.is_neox_style: - # NOTE(woosuk): Here we assume that the positions tensor has the - # shape [batch_size, seq_len]. - cos = cos.repeat(1, 1, 2).unsqueeze(-2) - sin = sin.repeat(1, 1, 2).unsqueeze(-2) - else: - cos = cos.repeat_interleave(2, dim=-1).unsqueeze(-2) - sin = sin.repeat_interleave(2, dim=-1).unsqueeze(-2) - - rotate_fn = _rotate_neox if self.is_neox_style else _rotate_gptj - query_rot = query_rot * cos + rotate_fn(query_rot) * sin - key_rot = key_rot * cos + rotate_fn(key_rot) * sin - - if self.rotary_dim < self.head_size: - query = torch.cat((query_rot, query_pass), dim=-1) - key = torch.cat((key_rot, key_pass), dim=-1) - else: - query = query_rot - key = key_rot - query = query.flatten(-2) - key = key.flatten(-2) - return query, key - - def forward_native2( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: - """Another PyTorch-native implementation of forward(). - - This method might perform better than `forward_native()` when compiled. - """ + """A PyTorch-native implementation of forward().""" if offsets is not None: positions = positions + offsets positions = positions.flatten() @@ -183,14 +137,14 @@ def forward_native2( query = query.view(num_tokens, -1, self.head_size) query_rot = query[..., :self.rotary_dim] query_pass = query[..., self.rotary_dim:] - query_rot = _apply_rotary_emb(query_rot, cos, sin) + query_rot = _apply_rotary_emb(query_rot, cos, sin, self.is_neox_style) query = torch.cat((query_rot, query_pass), dim=-1).reshape(query_shape) key_shape = key.shape key = key.view(num_tokens, -1, self.head_size) key_rot = key[..., :self.rotary_dim] key_pass = key[..., self.rotary_dim:] - key_rot = _apply_rotary_emb(key_rot, cos, sin) + key_rot = _apply_rotary_emb(key_rot, cos, sin, self.is_neox_style) key = torch.cat((key_rot, key_pass), dim=-1).reshape(key_shape) return query, key @@ -203,7 +157,7 @@ def forward_cuda( ) -> Tuple[torch.Tensor, torch.Tensor]: from vllm import _custom_ops as ops - self.cos_sin_cache = self.cos_sin_cache.to(positions.device, + self.cos_sin_cache = self.cos_sin_cache.to(query.device, dtype=query.dtype) # ops.rotary_embedding()/batched_rotary_embedding() # are in-place operations that update the query and key tensors. 
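Review note on the hunk above: forward_native2 is removed and forward_native now routes everything through _apply_rotary_emb, whose new is_neox_style flag selects between the split-half (NeoX) and interleaved (GPT-J) rotation layouts. A minimal standalone sketch of that behaviour follows; it mirrors the math in the patch, and the helper name apply_rotary_emb_sketch plus the toy shapes are illustrative only, not part of the change.

import torch

def apply_rotary_emb_sketch(x: torch.Tensor, cos: torch.Tensor,
                            sin: torch.Tensor,
                            is_neox_style: bool) -> torch.Tensor:
    # x: [num_tokens, num_heads, head_size]
    # cos/sin: [num_tokens, head_size // 2]
    cos = cos.unsqueeze(-2).to(x.dtype)
    sin = sin.unsqueeze(-2).to(x.dtype)
    if is_neox_style:
        # NeoX layout: rotate the first and second halves of each head.
        x1, x2 = torch.chunk(x, 2, dim=-1)
    else:
        # GPT-J layout: rotate even/odd interleaved elements.
        x1 = x[..., ::2]
        x2 = x[..., 1::2]
    o1 = x1 * cos - x2 * sin
    o2 = x2 * cos + x1 * sin
    if is_neox_style:
        return torch.cat((o1, o2), dim=-1)
    # Re-interleave to restore the GPT-J element order.
    return torch.stack((o1, o2), dim=-1).flatten(-2)

num_tokens, num_heads, head_size = 4, 2, 8
x = torch.randn(num_tokens, num_heads, head_size)
angles = torch.rand(num_tokens, head_size // 2)
out_neox = apply_rotary_emb_sketch(x, angles.cos(), angles.sin(), True)
out_gptj = apply_rotary_emb_sketch(x, angles.cos(), angles.sin(), False)
assert out_neox.shape == out_gptj.shape == x.shape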
@@ -240,17 +194,6 @@ def forward_xpu( self.cos_sin_cache, self.is_neox_style) return query, key - def forward_tpu( - self, - positions: torch.Tensor, - query: torch.Tensor, - key: torch.Tensor, - offsets: Optional[torch.Tensor] = None, - ) -> Tuple[torch.Tensor, torch.Tensor]: - forward_fn = (self.forward_native2 - if self.use_native2 else self.forward_native) - return forward_fn(positions, query, key, offsets) - def extra_repr(self) -> str: s = f"head_size={self.head_size}, rotary_dim={self.rotary_dim}" s += f", max_position_embeddings={self.max_position_embeddings}" diff --git a/vllm/model_executor/model_loader/neuron.py b/vllm/model_executor/model_loader/neuron.py index 7396ac833e782..594ae442ef328 100644 --- a/vllm/model_executor/model_loader/neuron.py +++ b/vllm/model_executor/model_loader/neuron.py @@ -10,6 +10,7 @@ from vllm.config import ModelConfig, ParallelConfig, SchedulerConfig from vllm.model_executor.layers.logits_processor import LogitsProcessor +from vllm.model_executor.layers.quantization import get_quantization_config from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.sampling_metadata import SamplingMetadata @@ -81,8 +82,7 @@ def load_weights(self, model_name_or_path: str, **kwargs): neuronx_model_cls = getattr(neuronx_module, neuronx_model_cls_name) split_model_dir = f"{model_name_or_path}-split" - if os.path.isdir(os.path.join(model_name_or_path, - "pytorch_model.bin")): + if _is_pretrained_neuron_checkpoint(model_name_or_path): split_model_dir = model_name_or_path elif not os.path.exists(f"{model_name_or_path}-split"): hf_model_cls = getattr(transformers, hf_model_cls_name) @@ -97,6 +97,23 @@ def load_weights(self, model_name_or_path: str, **kwargs): self.model.to_neuron() +def _is_pretrained_neuron_checkpoint(model_name_or_path: str) -> bool: + # Checking if the neuron checkpoint is saved in the old format. + if os.path.isdir(os.path.join(model_name_or_path, "pytorch_model.bin")): + return True + # Checking if the neuron checkpoint is saved in the new format. + pretrained_split_files = ["config.json", "generation_config.json"] + pretrained_split_format = ".safetensors" + for file in pretrained_split_files: + file_path = os.path.join(model_name_or_path, file) + if not os.path.isfile(file_path): + return False + for file in os.listdir(model_name_or_path): + if file.endswith(pretrained_split_format): + return True + return False + + def _get_model_architecture(config: PretrainedConfig) -> str: architectures = getattr(config, "architectures", []) for arch in architectures: @@ -119,19 +136,51 @@ def _get_buckets(env: str, default_value: List[int]) -> List[int]: return buckets_list +def _get_default_neuron_config(model_config: ModelConfig, + parallel_config: ParallelConfig, + scheduler_config: SchedulerConfig): + from transformers_neuronx.config import ContinuousBatchingConfig + from transformers_neuronx.constants import LAYOUT_BSH + + continuous_batching_config = ContinuousBatchingConfig( + batch_size_for_shared_caches=scheduler_config.max_num_seqs) + quant_config = dict( + dequant_dtype=TORCH_DTYPE_TO_NEURON_AMP[model_config.dtype], + quantize_method="vector_dynamic") + neuron_quantization_config_builder = lambda quant: get_quantization_config( + quant).from_config(quant_config).get_quant_method(None, "") + # TODO: Add Paged attention config to the default neuron arguments. 
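+    # Defaults below use the BSH layout for attention and collectives, fuse
+    # the QKV projection, and size continuous batching to max_num_seqs;
+    # quantization and weight tiling are enabled only when
+    # model_config.quantization is set.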
+ default_neuron_args = dict( + collectives_layout=LAYOUT_BSH, + attention_layout=LAYOUT_BSH, + fuse_qkv=True, + quant=neuron_quantization_config_builder(model_config.quantization) + if model_config.quantization else None, + continuous_batching=continuous_batching_config, + weight_tiling=bool(model_config.quantization)) + return default_neuron_args + + +def _get_neuron_config_after_override(default_neuron_config, + overridden_neuron_config): + from transformers_neuronx.config import NeuronConfig + overridden_neuron_config = overridden_neuron_config or {} + default_neuron_config.update(overridden_neuron_config) + return NeuronConfig(**default_neuron_config) + + def get_neuron_model(model_config: ModelConfig, parallel_config: ParallelConfig, scheduler_config: SchedulerConfig) -> nn.Module: - from transformers_neuronx.config import (ContinuousBatchingConfig, - NeuronConfig) # Create a model instance. model = NeuronCasualLM(model_config.hf_config) - continuous_batching_config = ContinuousBatchingConfig( - batch_size_for_shared_caches=scheduler_config.max_num_seqs) - neuron_config = NeuronConfig( - continuous_batching=continuous_batching_config) + default_neuron_config_args = _get_default_neuron_config( + model_config, parallel_config, scheduler_config) + + neuron_config = _get_neuron_config_after_override( + default_neuron_config_args, model_config.override_neuron_config) context_length_estimates = _get_buckets("NEURON_CONTEXT_LENGTH_BUCKETS", [scheduler_config.max_model_len]) diff --git a/vllm/model_executor/models/__init__.py b/vllm/model_executor/models/__init__.py index e30370596496a..4db847029566f 100644 --- a/vllm/model_executor/models/__init__.py +++ b/vllm/model_executor/models/__init__.py @@ -51,7 +51,6 @@ "PhiForCausalLM": ("phi", "PhiForCausalLM"), "Phi3ForCausalLM": ("llama", "LlamaForCausalLM"), "PhiMoEForCausalLM": ("phimoe", "PhiMoEForCausalLM"), - "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), "Qwen2ForCausalLM": ("qwen2", "Qwen2ForCausalLM"), "Qwen2MoeForCausalLM": ("qwen2_moe", "Qwen2MoeForCausalLM"), "RWForCausalLM": ("falcon", "FalconForCausalLM"), @@ -88,6 +87,7 @@ "PaliGemmaForConditionalGeneration"), "Phi3VForCausalLM": ("phi3v", "Phi3VForCausalLM"), "UltravoxModel": ("ultravox", "UltravoxModel"), + "QWenLMHeadModel": ("qwen", "QWenLMHeadModel"), } _CONDITIONAL_GENERATION_MODELS = { "BartModel": ("bart", "BartForConditionalGeneration"), diff --git a/vllm/model_executor/models/internlm2.py b/vllm/model_executor/models/internlm2.py index 23669b540f561..11a8431a5e7f7 100644 --- a/vllm/model_executor/models/internlm2.py +++ b/vllm/model_executor/models/internlm2.py @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- from functools import partial -from typing import Any, Dict, Iterable, List, Optional, Tuple +from typing import Any, Dict, Iterable, List, Optional, Tuple, Union import torch from torch import nn @@ -8,7 +8,7 @@ from vllm.attention import Attention, AttentionMetadata from vllm.config import CacheConfig -from vllm.distributed import (get_tensor_model_parallel_rank, +from vllm.distributed import (get_pp_group, get_tensor_model_parallel_rank, get_tensor_model_parallel_world_size, split_tensor_along_last_dim, tensor_model_parallel_all_gather) @@ -28,6 +28,9 @@ from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.sequence import IntermediateTensors +from .utils import (is_pp_missing_parameter, + make_empty_intermediate_tensors_factory, make_layers) + class InternLM2MLP(nn.Module): @@ -234,6 +237,7 @@ def __init__( config: PretrainedConfig, cache_config: 
Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, + prefix: str = "", ) -> None: super().__init__() self.config = config @@ -243,11 +247,15 @@ def __init__( config.vocab_size, config.hidden_size, ) - self.layers = nn.ModuleList([ - InternLMDecoderLayer(config, cache_config, quant_config) - for _ in range(config.num_hidden_layers) - ]) + self.start_layer, self.end_layer, self.layers = make_layers( + config.num_hidden_layers, + lambda prefix: InternLMDecoderLayer(config, cache_config, + quant_config), + prefix=f"{prefix}.layers") self.norm = RMSNorm(config.hidden_size, eps=config.rms_norm_eps) + self.make_empty_intermediate_tensors = ( + make_empty_intermediate_tensors_factory( + ["hidden_states", "residual"], config.hidden_size)) def get_input_embeddings(self, input_ids: torch.Tensor) -> torch.Tensor: return self.tok_embeddings(input_ids) @@ -260,21 +268,31 @@ def forward( attn_metadata: AttentionMetadata, intermediate_tensors: IntermediateTensors = None, inputs_embeds: Optional[torch.Tensor] = None, - ) -> torch.Tensor: - if inputs_embeds is not None: - hidden_states = inputs_embeds + ) -> Union[torch.Tensor, IntermediateTensors]: + if get_pp_group().is_first_rank: + if inputs_embeds is not None: + hidden_states = inputs_embeds + else: + hidden_states = self.tok_embeddings(input_ids) + residual = None else: - hidden_states = self.tok_embeddings(input_ids) - residual = None - for i in range(len(self.layers)): + assert intermediate_tensors is not None + hidden_states = intermediate_tensors["hidden_states"] + residual = intermediate_tensors["residual"] + for i in range(self.start_layer, self.end_layer): layer = self.layers[i] hidden_states, residual = layer( positions, hidden_states, - kv_caches[i], + kv_caches[i - self.start_layer], attn_metadata, residual, ) + if not get_pp_group().is_last_rank: + return IntermediateTensors({ + "hidden_states": hidden_states, + "residual": residual + }) hidden_states, _ = self.norm(hidden_states, residual) return hidden_states @@ -298,6 +316,8 @@ def __init__( self.output.weight = self.model.tok_embeddings.weight self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() + self.make_empty_intermediate_tensors = ( + self.model.make_empty_intermediate_tensors) def forward( self, @@ -308,7 +328,7 @@ def forward( intermediate_tensors: IntermediateTensors, ) -> torch.Tensor: hidden_states = self.model(input_ids, positions, kv_caches, - attn_metadata) + attn_metadata, intermediate_tensors) return hidden_states def compute_logits( @@ -345,6 +365,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = param.weight_loader weight_loader(param, loaded_weight, shard_id) @@ -353,6 +375,8 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. 
if name.endswith(".bias") and name not in params_dict: continue + if is_pp_missing_parameter(name, self): + continue param = params_dict[name] weight_loader = getattr(param, "weight_loader", default_weight_loader) diff --git a/vllm/model_executor/models/internvl.py b/vllm/model_executor/models/internvl.py index 5ca8d0b6a2922..d317fdce3ba68 100644 --- a/vllm/model_executor/models/internvl.py +++ b/vllm/model_executor/models/internvl.py @@ -341,6 +341,8 @@ def __init__(self, nn.Linear(llm_hidden_size, llm_hidden_size)) self.img_context_token_id = None + self.make_empty_intermediate_tensors = ( + self.language_model.make_empty_intermediate_tensors) def pixel_shuffle(self, x, scale_factor=0.5): n, w, h, c = x.size() @@ -461,7 +463,7 @@ def forward( positions, kv_caches, attn_metadata, - None, + intermediate_tensors, inputs_embeds=inputs_embeds) return hidden_states diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index dd10729b9ffb5..f8be9490ee55d 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -26,11 +26,9 @@ from array import array from functools import partial from typing import (Any, Callable, Iterable, List, Mapping, Optional, Tuple, - TypedDict, Union) + TypedDict) -import numpy as np import torch -import torch.nn.functional as F import torch.types from PIL import Image from torch import nn @@ -44,6 +42,8 @@ from vllm.model_executor.layers.linear import ReplicatedLinear from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization import QuantizationConfig +from vllm.model_executor.layers.resampler import (Resampler2, + get_2d_sincos_pos_embed) from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ParallelLMHead from vllm.model_executor.model_loader.utils import set_default_torch_dtype @@ -98,101 +98,6 @@ class MiniCPMVImagePixelInputs(TypedDict): DEFAULT_LN = partial(nn.LayerNorm, eps=1e-6) -def get_abs_pos(abs_pos: torch.Tensor, tgt_size: torch.Tensor): - # abs_pos: L, C - # tgt_size: (H, W) - # return: M, C - src_size = int(math.sqrt(abs_pos.size(0))) - # tgt_size = int(math.sqrt(tgt_size)) - dtype = abs_pos.dtype - - return (F.interpolate( - abs_pos.float().reshape(1, src_size, src_size, -1).permute(0, 3, 1, 2), - size=(tgt_size[0], tgt_size[1]), - mode="bicubic", - align_corners=False, - ).permute(0, 2, 3, 1).flatten(0, 2).to(dtype=dtype)) - - -# https://github.com/facebookresearch/mae/blob/efb2a8062c206524e35e47d04501ed4f544c0ae8/util/pos_embed.py#L20 -def get_2d_sincos_pos_embed( - embed_dim: int, - grid_size: Union[int, Tuple[int, int]], - cls_token: bool = False, - version: Tuple[int, int] = (2, 0), -): - """ - grid_size: int of the grid height and width - return: - pos_embed: [grid_size*grid_size, embed_dim] or - [1+grid_size*grid_size, embed_dim] (w/ or w/o cls_token) - """ - if isinstance(grid_size, int): - grid_h_size, grid_w_size = grid_size, grid_size - else: - grid_h_size, grid_w_size = grid_size[0], grid_size[1] - - grid_h = np.arange(grid_h_size, dtype=np.float32) - grid_w = np.arange(grid_w_size, dtype=np.float32) - grid = np.meshgrid(grid_w, grid_h) # here w goes first - grid = np.stack(grid, axis=0) - - if version == (2, 0): - grid = grid.reshape([2, 1, grid_h_size, grid_w_size]) - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) - if cls_token: - pos_embed = np.concatenate([np.zeros([1, embed_dim]), pos_embed], - 
axis=0) - else: - pos_embed = get_2d_sincos_pos_embed_from_grid(embed_dim, grid, version) - return pos_embed - - -def get_2d_sincos_pos_embed_from_grid(embed_dim: int, - grid: np.ndarray, - version: Tuple[int, int] = (2, 0)): - assert embed_dim % 2 == 0 - - # use half of dimensions to encode grid_h - emb_h = get_1d_sincos_pos_embed_from_grid( - embed_dim // 2, grid[0], version) # (H*W, D/2) or (H, W, D/2) - emb_w = get_1d_sincos_pos_embed_from_grid( - embed_dim // 2, grid[1], version) # (H*W, D/2) or (H, W, D/2) - - if version == (2, 0): - emb = np.concatenate([emb_h, emb_w], axis=1) # (H*W, D) - else: - emb = np.concatenate([emb_h, emb_w], axis=-1) # (H, W, D) - return emb - - -def get_1d_sincos_pos_embed_from_grid(embed_dim: int, - pos: np.ndarray, - version: Tuple[int, int] = (2, 0)): - """ - embed_dim: output dimension for each position - pos: a list of positions to be encoded: size (M,) / (H, W) - out: (M, D) / (H, W, D) - """ - assert embed_dim % 2 == 0 - omega = np.arange(embed_dim // 2, dtype=np.float32) - omega /= embed_dim / 2.0 - omega = 1.0 / 10000**omega # (D/2,) - - if version == (2, 0): - pos = pos.reshape(-1) # (M,) - out = np.einsum("m,d->md", pos, omega) # (M, D/2), outer product - emb_sin = np.sin(out) # (M, D/2) - emb_cos = np.cos(out) # (M, D/2) - emb = np.concatenate([emb_sin, emb_cos], axis=1) # (M, D) - else: - out = np.einsum("hw,d->hwd", pos, omega) # (H, W, D/2), outer product - emb_sin = np.sin(out) # (H, W, D/2) - emb_cos = np.cos(out) # (H, W, D/2) - emb = np.concatenate([emb_sin, emb_cos], axis=-1) # (H, W, D) - return emb - - class BaseResampler(nn.Module): """ A 2D perceiver-resampler network with one cross attention layers by @@ -245,62 +150,6 @@ def _repeat(self, query, N: int): return query.unsqueeze(1).repeat(1, N, 1) -class Resampler2(BaseResampler): - - def __init__( - self, - grid_size: int, - embed_dim: int, - num_heads: int, - kv_dim: Optional[int] = None, - norm_layer: Callable[[int], nn.LayerNorm] = DEFAULT_LN, - adaptive: bool = False, - ) -> None: - super().__init__(grid_size**2, embed_dim, num_heads, kv_dim, - norm_layer) - - self.adaptive = adaptive - pos_embed_arr = get_2d_sincos_pos_embed(embed_dim, - grid_size, - version=(2, 0)) - self.pos_embed = nn.Parameter( - torch.from_numpy(pos_embed_arr).float()).requires_grad_(False) - - self.apply(self._init_weights) - - def forward( - self, - x: torch.Tensor, - tgt_sizes: torch.Tensor, - attn_mask: Optional[torch.Tensor] = None, - ): - if self.adaptive: - pos_embed_arr = get_2d_sincos_pos_embed(self.embed_dim, - tgt_sizes, - version=(2, 0)) - pos_embed = torch.from_numpy(pos_embed_arr).to(device=x.device, - dtype=x.dtype) - else: - pos_embed = get_abs_pos(self.pos_embed, tgt_sizes) - - x, _ = self.kv_proj(x) - x = self.ln_kv(x).permute(1, 0, 2) - - N = x.shape[1] - q = self.ln_q(self.query) - out = self.attn( - self._repeat(q, N) + self.pos_embed.unsqueeze(1), - x + pos_embed.unsqueeze(1), - x, - attn_mask=attn_mask, - )[0] - x = out.permute(1, 0, 2) - - x = self.ln_post(x) - x = x @ self.proj - return x - - class Resampler2_5(BaseResampler): def __init__( @@ -782,7 +631,8 @@ def init_resampler(self, embed_dim: int, vision_dim: int) -> nn.Module: num_heads=embed_dim // 128, grid_size=int(math.sqrt(self.config.query_num)), kv_dim=vision_dim, - adaptive=True, + adaptive=False, + do_post_projection=True, ) return resampler diff --git a/vllm/model_executor/models/qwen.py b/vllm/model_executor/models/qwen.py index 8298e3bac4465..a726ec10984c0 100644 --- a/vllm/model_executor/models/qwen.py +++ 
b/vllm/model_executor/models/qwen.py @@ -4,36 +4,402 @@ # Copyright (c) Alibaba Cloud. # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE """Inference-only QWen model compatible with HuggingFace weights.""" -from typing import Any, Dict, Iterable, List, Optional, Tuple +import math +import re +from array import array +from functools import partial +from typing import (Any, Callable, Dict, Iterable, List, Literal, Mapping, + Optional, Tuple, TypedDict, Union) + +import numpy as np import torch +from PIL import Image from torch import nn +from torchvision import transforms +from torchvision.transforms import InterpolationMode from transformers import PretrainedConfig from vllm.attention import Attention, AttentionMetadata -from vllm.config import CacheConfig +from vllm.config import CacheConfig, MultiModalConfig from vllm.distributed import get_pp_group, get_tensor_model_parallel_world_size -from vllm.model_executor.layers.activation import SiluAndMul +from vllm.inputs import INPUT_REGISTRY, InputContext, LLMInputs +from vllm.logger import init_logger +from vllm.model_executor.layers.activation import SiluAndMul, get_act_fn from vllm.model_executor.layers.layernorm import RMSNorm -from vllm.model_executor.layers.linear import (MergedColumnParallelLinear, +from vllm.model_executor.layers.linear import (ColumnParallelLinear, + MergedColumnParallelLinear, QKVParallelLinear, RowParallelLinear) from vllm.model_executor.layers.logits_processor import LogitsProcessor from vllm.model_executor.layers.quantization.base_config import ( QuantizationConfig) +from vllm.model_executor.layers.resampler import Resampler2, get_abs_pos from vllm.model_executor.layers.rotary_embedding import get_rope from vllm.model_executor.layers.sampler import Sampler, SamplerOutput from vllm.model_executor.layers.vocab_parallel_embedding import ( ParallelLMHead, VocabParallelEmbedding) from vllm.model_executor.model_loader.weight_utils import default_weight_loader +from vllm.model_executor.models.interfaces import SupportsMultiModal from vllm.model_executor.sampling_metadata import SamplingMetadata -from vllm.sequence import IntermediateTensors -from vllm.utils import print_warning_once +from vllm.multimodal import MULTIMODAL_REGISTRY +from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.utils import cached_get_tokenizer +from vllm.sequence import (VLLM_TOKEN_ID_ARRAY_TYPE, IntermediateTensors, + SequenceData) + +from .utils import flatten_bn, is_pp_missing_parameter, make_layers + +logger = init_logger(__name__) + +# NOTE: Qwen models have a few other special tags, e.g., ref, bbox, quad; +# for the time being, these tags are not considered as special at encoding +# time. This may change as VLLMs multimodal API changes in the future. +IMG_START = "" +IMG_END = "" +IMG_PAD = "" +# Image context is fixed at 256 for all images +MAX_QWEN_IMG_TOKENS = 256 +# Image normalization params +CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073) +CLIP_STD = (0.26862954, 0.26130258, 0.27577711) + + +class QwenImagePixelInputs(TypedDict): + type: Literal["pixel_values"] + data: torch.Tensor + """ + Shape: `(batch_size * num_images, 3, image_size, image_size)` + + Note that image_size is the value in the vision config to which we resize + the image to in the normalization transform. Currently multi-image support + can only be leveraged by passing image embeddings directly. 
+ """ + + +class QwenImageEmbeddingInputs(TypedDict): + type: Literal["image_embeds"] + data: torch.Tensor + """Shape: `(batch_size * num_images, 256, hidden_size)` + + `hidden_size` must match the hidden size of the language model backbone + and is stored in the visual config of the model if we have one. + """ + + +QwenImageInputs = Union[QwenImagePixelInputs, QwenImageEmbeddingInputs] + + +class VisualAttention(nn.Module): + """self-attention layer class. + Self-attention layer takes input with size [s, b, h] + and returns output of the same size. + """ + + def __init__( + self, + embed_dim: int, + num_heads: int, + bias: bool = True, + kdim: Optional[int] = None, + vdim: Optional[int] = None, + ): + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self._qkv_same_embed_dim = self.kdim == embed_dim \ + and self.vdim == embed_dim + + self.num_heads = num_heads + + # Per attention head and per partition values. + assert embed_dim % num_heads == 0 + self.hidden_size_per_attention_head = embed_dim // num_heads + self.num_attention_heads_per_partition = num_heads + self.hidden_size_per_partition = embed_dim + + # Strided linear layer. + assert self._qkv_same_embed_dim, \ + 'Visual Attention implementation only supports self-attention' + self.in_proj = nn.Linear(embed_dim, 3 * embed_dim) + self.out_proj = nn.Linear(embed_dim, embed_dim) + self.norm_factor = math.sqrt(self.hidden_size_per_attention_head) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + # query/key/value: [sq, b, h] + sq, b, _ = x.size() + mixed_x_layer = self.in_proj(x) + + # [sq, b, (np * 3 * hn)] --> [sq, b, np, 3 * hn] + new_tensor_shape = mixed_x_layer.size()[:-1] + \ + (self.num_attention_heads_per_partition, + 3 * self.hidden_size_per_attention_head) + mixed_x_layer = mixed_x_layer.view(*new_tensor_shape) + + # [sq, b, np, 3 * hn] --> 3 [sq, b, np, hn] + query_layer, key_layer, value_layer = mixed_x_layer.split( + self.hidden_size_per_attention_head, dim=-1) + + # [sq, b, np, hn] -> [sq, b * np, hn] + query_layer = query_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + # [sk, b, np, hn] -> [sk, b * np, hn] + key_layer = key_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + + q_scaled = query_layer / self.norm_factor + if attn_mask is not None: + attention_probs = torch.baddbmm(attn_mask, q_scaled, + key_layer.transpose(-2, -1)) + else: + attention_probs = torch.bmm(q_scaled, key_layer.transpose(-2, -1)) + attention_probs = attention_probs.softmax(dim=-1) + + value_layer = value_layer.view( + sq, b * self.num_attention_heads_per_partition, + self.hidden_size_per_attention_head).transpose(0, 1) + + # matmul: [b * np, sq, hn] + context_layer = torch.bmm(attention_probs, value_layer) -from .utils import is_pp_missing_parameter, make_layers + # change view [b, np, sq, hn] + context_layer = context_layer.view( + b, self.num_attention_heads_per_partition, sq, + self.hidden_size_per_attention_head) + + # [b, np, sq, hn] --> [sq, b, np, hn] + context_layer = context_layer.permute(2, 0, 1, 3).contiguous() + + # [sq, b, np, hn] --> [sq, b, hp] + new_context_layer_shape = context_layer.size()[:-2] + \ + (self.hidden_size_per_partition,) + context_layer = context_layer.view(*new_context_layer_shape) + + output = 
self.out_proj(context_layer) + + return output + + +class QwenVMLP(nn.Module): + """MLP for the visual component of the Qwen model.""" + + def __init__( + self, + hidden_size: int, + intermediate_size: int, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.c_fc = ColumnParallelLinear(hidden_size, + intermediate_size, + bias=True, + quant_config=quant_config) + self.act_fn = get_act_fn("gelu", quant_config, intermediate_size) + self.c_proj = RowParallelLinear( + intermediate_size, + hidden_size, + bias=True, + quant_config=quant_config, + ) + + def forward(self, x): + x, _ = self.c_fc(x) + x = self.act_fn(x) + x, _ = self.c_proj(x) + return x + + +class VisualAttentionBlock(nn.Module): + + def __init__( + self, + d_model: int, + n_head: int, + mlp_ratio: float = 4.0, + norm_layer: Callable = nn.LayerNorm, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + + self.ln_1 = norm_layer(d_model) + self.ln_2 = norm_layer(d_model) + mlp_width = int(d_model * mlp_ratio) + self.attn = VisualAttention(d_model, n_head) + self.mlp = QwenVMLP( + hidden_size=d_model, + intermediate_size=mlp_width, + quant_config=quant_config, + ) + + def attention( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + attn_mask = attn_mask.to(x.dtype) if attn_mask is not None else None + return self.attn(x, attn_mask=attn_mask) + + def forward( + self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None, + ) -> torch.Tensor: + x = x + self.attention(self.ln_1(x), attn_mask=attn_mask) + x = x + self.mlp(self.ln_2(x)) + return x + + +class TransformerBlock(nn.Module): + + def __init__( + self, + width: int, + layers: int, + heads: int, + mlp_ratio: float = 4.0, + norm_layer: Callable = nn.LayerNorm, + quant_config: Optional[QuantizationConfig] = None, + ): + super().__init__() + self.width = width + self.layers = layers + + self.resblocks = nn.ModuleList([ + VisualAttentionBlock(width, + heads, + mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config) + for _ in range(layers) + ]) + + def get_cast_dtype(self) -> torch.dtype: + return self.resblocks[0].mlp.c_fc.weight.dtype + + def get_cast_device(self) -> torch.device: + return self.resblocks[0].mlp.c_fc.weight.device + + def forward(self, + x: torch.Tensor, + attn_mask: Optional[torch.Tensor] = None) -> torch.Tensor: + for r in self.resblocks: + x = r(x, attn_mask=attn_mask) + return x + + +class VisionTransformer(nn.Module): + + def __init__(self, + image_size: int, + patch_size: int, + width: int, + layers: int, + heads: int, + mlp_ratio: float, + n_queries: int = 256, + output_dim: int = 512, + image_start_id: int = 151857, + quant_config: Optional[QuantizationConfig] = None, + **kwargs): + super().__init__() + image_height, image_width = self.image_size = (image_size, image_size) + patch_height, patch_width = self.patch_size = (patch_size, patch_size) + self.grid_size = (image_height // patch_height, + image_width // patch_width) + self.output_dim = output_dim + self.conv1 = nn.Conv2d(in_channels=3, + out_channels=width, + kernel_size=patch_size, + stride=patch_size, + bias=False) + + # class embeddings and positional embeddings + scale = width**-0.5 + self.positional_embedding = nn.Parameter(scale * + torch.randn(256, width)) + + norm_layer = partial(nn.LayerNorm, eps=1e-6) + + self.ln_pre = norm_layer(width) + self.transformer = TransformerBlock(width, + layers, + heads, + mlp_ratio, + norm_layer=norm_layer, + quant_config=quant_config) + + 
self.attn_pool = Resampler2( + grid_size=int(math.sqrt(n_queries)), + embed_dim=output_dim, + num_heads=output_dim // 128, + kv_dim=width, + norm_layer=norm_layer, + adaptive=False, + do_post_projection=False, + ).to( + device=self.positional_embedding.device, + dtype=self.positional_embedding.dtype, + ) + + self.ln_post = norm_layer(output_dim) + self.proj = nn.Parameter( + (output_dim**-0.5) * torch.randn(output_dim, output_dim)) + self.image_start_id = image_start_id + self.image_end_id = image_start_id + 1 + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = x.to( + dtype=self.transformer.get_cast_dtype(), + device=self.transformer.get_cast_device(), + ) + + # to patches + x = self.conv1(x) # shape = [*, width, grid, grid] + x = x.reshape(x.shape[0], x.shape[1], + -1) # shape = [*, width, grid ** 2] + x = x.permute(0, 2, 1) # shape = [*, grid ** 2, width] + + x = x + get_abs_pos(self.positional_embedding, int(math.sqrt( + x.size(1)))) + + x = self.ln_pre(x) + + x = x.permute(1, 0, 2) # NLD -> LND + x = self.transformer(x) + x = x.permute(1, 0, 2) # LND -> NLD + + x = self.attn_pool(x) + x = self.ln_post(x) + x = x @ self.proj + + return x + + def get_image_positions(self, + input_ids: torch.Tensor) -> Optional[torch.Tensor]: + """Given the input IDs, extracts start/stop points corresponding to + images. + + args: + Returns: + Optional torch tensor corresponding to start/stop pairs of images. + """ + if torch.any(input_ids == self.image_start_id): + bos_pos = torch.where(input_ids == self.image_start_id) + eos_pos = torch.where(input_ids == self.image_end_id) + return torch.stack((bos_pos[0], eos_pos[0]), dim=1) + return None class QWenMLP(nn.Module): + """MLP for the language component of the Qwen model, which contains a + MergedColumnParallelLinear merging 2 outputs via silu activation.""" def __init__( self, @@ -56,7 +422,7 @@ def __init__( "Only silu is supported for now.") self.act_fn = SiluAndMul() - def forward(self, x): + def forward(self, x: torch.Tensor) -> torch.Tensor: gate_up, _ = self.gate_up_proj(x) x = self.act_fn(gate_up) x, _ = self.c_proj(x) @@ -203,6 +569,9 @@ def __init__( lambda prefix: QWenBlock(config, cache_config, quant_config), prefix=f"{prefix}.h") self.ln_f = RMSNorm(config.hidden_size, eps=config.layer_norm_epsilon) + self.visual = VisionTransformer(**config.visual, + quant_config=quant_config) if hasattr( + config, "visual") else None def forward( self, @@ -211,9 +580,33 @@ def forward( kv_caches: List[torch.Tensor], attn_metadata: AttentionMetadata, intermediate_tensors: Optional[IntermediateTensors], + pixel_values: Optional[QwenImageInputs], ) -> torch.Tensor: + img_pos = None + # If pixel / visual embeddings are provided, this is a visual model + if pixel_values is not None and self.visual is not None: + if pixel_values["type"] != "image_embeds": + image_embeds = self.visual(pixel_values["data"]) + else: + image_embeds = pixel_values["data"] + + # features should be of shape (# images, 256, hidden_dim) + img_pos = self.visual.get_image_positions(input_ids) + if isinstance( + img_pos, + np.ndarray) and img_pos.shape[0] != image_embeds.shape[0]: + raise ValueError( + f"Number of placeholders: {img_pos.shape[0]} " + f"does not match number of images {image_embeds.shape[0]}." 
+ ) + if get_pp_group().is_first_rank: hidden_states = self.wte(input_ids) + # Merge the image embeddings into the hidden states if actually have + # visual features and the corresponding image tokens + if img_pos is not None: + for idx, (img_bos, img_eos) in enumerate(img_pos): + hidden_states[img_bos + 1:img_eos] = image_embeds[idx] residual = None else: assert intermediate_tensors is not None @@ -237,16 +630,241 @@ def forward( return hidden_states -class QWenLMHeadModel(nn.Module): +def get_image_text(image_num: int, padding: bool) -> str: + """Retrieves a placeholder text that when tokenized, will be expanded with + image pads. + + Args: + image_num: The number of the image that we want a text prompt for. + Images should be indexed starting at 1. + padding: Whether or not padding should be manually added. + + Returns: + Text placeholder prompt for the image being considered. + """ + image_start = f"Picture {image_num}: {IMG_START}" + image_end = f"{IMG_END}\n" + if not padding: + return f"{image_start}{image_end}" + return f"{image_start}{MAX_QWEN_IMG_TOKENS * IMG_PAD}{image_end}" + + +def input_processor_for_qwen(ctx: InputContext, + llm_inputs: LLMInputs) -> LLMInputs: + """Processes the inputs, which may or may not be multimodal. + Multimodal inputs will only be processed if the model has a "visual" + component in its model config, otherwise they'll be ignored. + + Args: + ctx: Context of the loaded model. + llm_inputs: LLM inputs which may have a multi_modal_data attribute. + + Returns: + If the model is language only or not multimodal inputs were provided, + returns llm_inputs unmodified. Otherwise, processes the multimodal + images / image embeddings and adds the fixed-length image placeholders. + """ + multi_modal_data = llm_inputs.get("multi_modal_data") + + # Only process images if we have multimodal data and a visual config + hf_config = ctx.get_hf_config() + if (multi_modal_data is None or "image" not in multi_modal_data + or not hasattr(hf_config, "visual")): + return llm_inputs + + prompt = llm_inputs.get("prompt") + prompt_token_ids = llm_inputs["prompt_token_ids"] + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + image_data = multi_modal_data["image"] + if isinstance(image_data, torch.Tensor): + num_dims = len(image_data.shape) + if num_dims < 2 or num_dims > 3: + raise ValueError( + f"Expected img embeds to be have 3 dimensions, got {num_dims}") + num_images = 1 if num_dims == 2 else image_data.shape[0] + else: + # TODO - handle multiple image inputs once the API is solidified + num_images = 1 + + if prompt is None: + prompt = tokenizer.decode(prompt_token_ids) + + # Drops anything between / tags; encoding with the tokenizer + # will automatically add the image pads for the context. + new_prompt, num_matched_images = re.subn( + r"(Picture \d*: ).*?(<\/img>\n)", + r"\1\2", + prompt, + ) + + if num_matched_images != num_images: + logger.warning( + "Number of matched image placeholders %s doesn't match the number " + "of expected images %s; check your placeholder formatting.", + num_matched_images, num_images) + + new_prompt_token_ids = tokenizer.encode(new_prompt) + + return LLMInputs(prompt=new_prompt, + prompt_token_ids=new_prompt_token_ids, + multi_modal_data=multi_modal_data) + + +def input_mapper_for_qwen(ctx: InputContext, data: object) -> MultiModalInputs: + """Maps the input data to its MultiModalInputs (if any). + + Args: + ctx: Context of the loaded model. 
+ data: data potentially containing image/image embeddings to be mapped + to pixel_values in .forward() for a visual QWenLMHeadModel model. + + Returns: + MultiModalInputs containing the stacked normalized images tensor or + image embeddings. + """ + # Early exit if we have provided an image to a language only Qwen model + hf_config = ctx.get_hf_config() + if not hasattr(hf_config, "visual"): + logger.warning( + "Images were provided but this model has no visual config; " + "multimodal inputs will not be forwarded to the model.") + return MultiModalInputs() + + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + image_pair_tok = tokenizer.encode(IMG_START + IMG_END, + add_special_tokens=False, + return_tensors="pt").squeeze() + image_start_id = image_pair_tok[0] + image_end_id = image_pair_tok[-1] + if (image_start_id + 1) != image_end_id: + raise ValueError( + f"Found image end ID {image_end_id}, but expected {IMG_START} + 1") + if len(image_pair_tok) != (MAX_QWEN_IMG_TOKENS + 2): + raise ValueError( + f"Expected image context length of {MAX_QWEN_IMG_TOKENS}, " + f"but got {image_pair_tok - 2}") + + hf_config = ctx.get_hf_config() + image_size = hf_config.visual["image_size"] + img_emb_size = hf_config.visual["output_dim"] + + if isinstance(data, torch.Tensor): + # It's expected that our values have already been processed + # by the visual transformer; shape is expected to be: + # (# images, 256, hidden_size) + if len(data.shape) == 2: + # Assume only one image embed was provided; unsqueeze the extra dim + data = data.unsqueeze(0) + if len(data.shape) != 3 or data.shape[ + 1] != MAX_QWEN_IMG_TOKENS or data.shape[2] != img_emb_size: + raise ValueError( + "Expected image embeds to be a tensor of shape" + f"[# images, {MAX_QWEN_IMG_TOKENS}, {img_emb_size}], but " + f"received shape [{data.shape}]") + pixel_values = data + + else: + transform = build_normalization_transform(image_size) + # TODO - handle multiple image inputs once the API is solidified + transformed_images = [transform(data)] + pixel_values = torch.stack(transformed_images, dim=0) + return MultiModalInputs({"pixel_values": pixel_values}) + + +def build_normalization_transform(image_size: int) -> transforms.Compose: + """Builds a normalization transform which can be applied to one or + more input images from which we want to extract visual features. + + Args: + image_size: size of the image to be processed for visual embeddings. + + Returns: + Callable transform for normalizing and resizing one RGB image. + """ + return transforms.Compose([ + transforms.Resize((image_size, image_size), + interpolation=InterpolationMode.BICUBIC), + transforms.ToTensor(), + transforms.Normalize(mean=CLIP_MEAN, std=CLIP_STD), + ]) + + +def dummy_data_for_qwen( + ctx: InputContext, + seq_len: int, + mm_counts: Mapping[str, int], +) -> Tuple[SequenceData, Optional[Dict]]: + """Build dummy data for warming up Qwen models; this will only contain text + matching the defaults for VLLM unless the model has a visual config. + + Args: + ctx: Context of the loaded model. + seq_len: Number of tokens in the text sequence. + mm_counts: multimodal data counts. + + Returns: + Tuple containing sequential and multimodal data. + """ + hf_config = ctx.get_hf_config() + + # The presence of a visual config indicates this is a multimodal model. + # If we don't have it, the model is considered an LLM for warmup purposes. 
+ if not hasattr(hf_config, "visual"): + seq_data = SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, [0] * seq_len)) + mm_data = None + return seq_data, mm_data + + # We have a visual component - use images to warm up + num_images = mm_counts["image"] + model_config = ctx.model_config + tokenizer = cached_get_tokenizer(model_config.tokenizer, + trust_remote_code=True) + + # Build the image prompts with no imgpads; the tokenizer will add img pads + image_prompt = ''.join( + [get_image_text(idx, False) for idx in range(1, num_images + 1)]) + toks = tokenizer.encode(image_prompt, add_special_tokens=False) + + # Make sure we actually get the fixed context size per tok padding + num_pads = toks.count(tokenizer.encode(IMG_PAD)[0]) + if num_pads != (num_images * MAX_QWEN_IMG_TOKENS): + raise ValueError( + f"Tokenized dummy data should encode {MAX_QWEN_IMG_TOKENS} pads" + f" per image, but got {num_pads} pads for {num_images} image(s)" + " in total. Are you using a qwen tokenizer?") + + # Ensure the number of tokens is at minimum the sequence length provided + if len(toks) < seq_len: + toks += [0] * (seq_len - len(toks)) + + # Build the input images; width/height doesn't actually matter here since + # the data will get resized and the # of tokens per image is constant + image = Image.new("RGB", (224, 224), color=0) + mm_data = {"image": image if num_images == 1 else [image] * num_images} + return SequenceData(array(VLLM_TOKEN_ID_ARRAY_TYPE, toks)), mm_data + + +@MULTIMODAL_REGISTRY.register_image_input_mapper(input_mapper_for_qwen) +@MULTIMODAL_REGISTRY.register_max_image_tokens(MAX_QWEN_IMG_TOKENS) +@INPUT_REGISTRY.register_dummy_data(dummy_data_for_qwen) +@INPUT_REGISTRY.register_input_processor(input_processor_for_qwen) +class QWenLMHeadModel(nn.Module, SupportsMultiModal): def __init__( self, config: PretrainedConfig, + multimodal_config: MultiModalConfig, cache_config: Optional[CacheConfig] = None, quant_config: Optional[QuantizationConfig] = None, ): super().__init__() self.config = config + self.multimodal_config = multimodal_config self.quant_config = quant_config self.transformer = QWenModel(config, cache_config, quant_config) self.lm_head = ParallelLMHead(config.vocab_size, @@ -257,16 +875,47 @@ def __init__( self.logits_processor = LogitsProcessor(config.vocab_size) self.sampler = Sampler() - def forward( - self, - input_ids: torch.Tensor, - positions: torch.Tensor, - kv_caches: List[torch.Tensor], - attn_metadata: AttentionMetadata, - intermediate_tensors: Optional[IntermediateTensors] = None, - ) -> torch.Tensor: + def _get_image_input_type( + self, + pixel_values: Optional[torch.Tensor]) -> Optional[QwenImageInputs]: + """Determines if the provided pixel_values are normalized pixel values + or image embeddings. + + Args: + pixel_values: Optional data to processed into visual embeddings. + + Returns: + None of the QwenImageInputs type used to determine whether or not + the visual transformer needs to process the pixel_values. 
+ """ + if pixel_values is not None and self.transformer.visual is not None: + pixel_values = flatten_bn(pixel_values) + if len(pixel_values.shape) == 3 and pixel_values.shape[ + 1] == MAX_QWEN_IMG_TOKENS and pixel_values.shape[ + 2] == self.config.visual["output_dim"]: + return QwenImageEmbeddingInputs( + type="image_embeds", + data=pixel_values, + ) + else: + # If we have the wrong shape, assume we still need to process + return QwenImagePixelInputs( + type="pixel_values", + data=pixel_values, + ) + return None + + def forward(self, + input_ids: torch.Tensor, + positions: torch.Tensor, + kv_caches: List[torch.Tensor], + attn_metadata: AttentionMetadata, + intermediate_tensors: Optional[IntermediateTensors] = None, + pixel_values: Optional[torch.Tensor] = None) -> torch.Tensor: + pixel_values = self._get_image_input_type(pixel_values) hidden_states = self.transformer(input_ids, positions, kv_caches, - attn_metadata, intermediate_tensors) + attn_metadata, intermediate_tensors, + pixel_values) return hidden_states def make_empty_intermediate_tensors( @@ -328,15 +977,6 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): # Skip loading extra bias for GPTQ models. if name.endswith(".bias") and name not in params_dict: continue - # Skip loading visual weights to support Qwen-VL models - # in cases with text-only inputs - # TODO: add support for Qwen-VL - if (name not in params_dict - and name.startswith("transformer.visual.")): - print_warning_once( - "Only text inputs are allowed. Images won't be handled " - "until Qwen-VL models are fully supported.") - continue # Skip layers on other devices. if is_pp_missing_parameter(name, self): continue diff --git a/vllm/model_executor/models/siglip.py b/vllm/model_executor/models/siglip.py index 114dbf09b0c53..0bee75e2f0cbb 100644 --- a/vllm/model_executor/models/siglip.py +++ b/vllm/model_executor/models/siglip.py @@ -443,14 +443,27 @@ def __init__( self.config = config embed_dim = config.hidden_size + if (num_hidden_layers_override is None + or num_hidden_layers_override == config.num_hidden_layers): + self.need_post_layernorm = True + elif num_hidden_layers_override > config.num_hidden_layers: + raise ValueError( + "num_hidden_layers_override cannot be greater than " + "num_hidden_layers") + else: + self.need_post_layernorm = False + self.embeddings = SiglipVisionEmbeddings(config) self.encoder = SiglipEncoder( config, quant_config=quant_config, num_hidden_layers_override=num_hidden_layers_override, ) - self.post_layernorm = nn.LayerNorm(embed_dim, - eps=config.layer_norm_eps) + if self.need_post_layernorm: + self.post_layernorm = nn.LayerNorm(embed_dim, + eps=config.layer_norm_eps) + else: + self.post_layernorm = nn.Identity() self.use_head = (True if not hasattr(config, "vision_use_head") else config.vision_use_head) if self.use_head: @@ -470,7 +483,6 @@ def forward( encoder_outputs = self.encoder(inputs_embeds=hidden_states) last_hidden_state = self.post_layernorm(encoder_outputs) - # TODO: add this back when pooled_output is used in inference # if self.use_head: # pooled_output = self.head(last_hidden_state) @@ -499,6 +511,10 @@ def __init__( num_hidden_layers_override=num_hidden_layers_override, ) + @property + def need_post_layernorm(self): + return self.vision_model.need_post_layernorm + def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @@ -517,6 +533,11 @@ def load_weights(self, weights: Iterable[Tuple[str, torch.Tensor]]): layer_count = len(self.vision_model.encoder.layers) 
for name, loaded_weight in weights: + # post_layernorm is optional in SiglipVisionModel + if ("vision_model.post_layernorm" in name + and not self.need_post_layernorm): + continue + # omit layers when num_hidden_layers_override is set if "vision_model.encoder.layers." in name: layer_idx = int(name.split(".")[3]) diff --git a/vllm/model_executor/models/ultravox.py b/vllm/model_executor/models/ultravox.py index 7994945c5ac39..416fabda831a2 100644 --- a/vllm/model_executor/models/ultravox.py +++ b/vllm/model_executor/models/ultravox.py @@ -29,12 +29,12 @@ from vllm.model_executor.layers.sampler import SamplerOutput from vllm.model_executor.model_loader.weight_utils import default_weight_loader from vllm.model_executor.models.interfaces import SupportsMultiModal -from vllm.model_executor.models.utils import (filter_weights, +from vllm.model_executor.models.utils import (filter_weights, flatten_bn, init_vllm_registered_model, merge_multimodal_embeddings) from vllm.model_executor.sampling_metadata import SamplingMetadata from vllm.multimodal import MULTIMODAL_REGISTRY -from vllm.multimodal.base import MultiModalInputs +from vllm.multimodal.base import MultiModalInputs, NestedTensors from vllm.multimodal.utils import (cached_get_tokenizer, repeat_and_pad_placeholder_tokens) from vllm.sequence import VLLM_TOKEN_ID_ARRAY_TYPE, SequenceData @@ -48,13 +48,14 @@ class UltravoxAudioFeatureInputs(TypedDict): type: Literal["audio_features"] - data: Union[torch.Tensor, List[torch.Tensor]] - """Shape: `(batch_size * num_audios, 80, M)""" + data: NestedTensors + """Shape: `(batch_size, num_audios, 80, M)""" class UltravoxAudioEmbeddingInputs(TypedDict): type: Literal["audio_embeds"] - data: torch.Tensor + data: NestedTensors + """Shape: `(batch_size, num_audios, audio_feature_size, hidden_size)""" UltravoxAudioInputs = Union[UltravoxAudioFeatureInputs, @@ -85,24 +86,33 @@ def dummy_data_for_ultravox( audio_count = mm_counts["audio"] - audio_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [ - _AUDIO_PLACEHOLDER_TOKEN - ]) * get_ultravox_max_audio_tokens(ctx) * audio_count + audio_placeholder = array( + VLLM_TOKEN_ID_ARRAY_TYPE, + [_AUDIO_PLACEHOLDER_TOKEN]) * get_ultravox_max_audio_tokens(ctx) + + # Add a separator between each chunk. + audio_token_ids = (audio_placeholder + + array(VLLM_TOKEN_ID_ARRAY_TYPE, [0])) * audio_count other_token_ids = array(VLLM_TOKEN_ID_ARRAY_TYPE, [0]) * (seq_len - len(audio_token_ids)) audio_and_sr = (np.array([0.0] * feature_extractor.chunk_length), 1) - mm_dict = { - "audio": - audio_and_sr if audio_count == 1 else [audio_and_sr] * audio_count - } + mm_dict = {"audio": [audio_and_sr] * audio_count} return (SequenceData(audio_token_ids + other_token_ids), mm_dict) def input_mapper_for_ultravox(ctx: InputContext, data: object): - if isinstance(data, tuple): - (audio, sr) = cast(Tuple[np.ndarray, Union[float, int]], data) + if not isinstance(data, list): + data = [data] + + audio_features = [] + for audio_input in data: + if not isinstance(audio_input, tuple): + raise NotImplementedError( + f"Unsupported data type: {type(audio_input)}") + + (audio, sr) = cast(Tuple[np.ndarray, Union[float, int]], audio_input) feature_extractor = whisper_feature_extractor(ctx) if sr != feature_extractor.sampling_rate: @@ -121,15 +131,14 @@ def input_mapper_for_ultravox(ctx: InputContext, data: object): # Not enough audio; pad it. 
audio = np.pad(audio, (0, minimum_audio_length - len(audio))) - return MultiModalInputs({ - "audio_features": - feature_extractor(audio, - sampling_rate=sr, - padding="longest", - return_tensors="pt")["input_features"] - }) + single_audio_features = feature_extractor( + audio, sampling_rate=sr, padding="longest", + return_tensors="pt")["input_features"] - raise NotImplementedError(f"Unsupported data type: {type(data)}") + # Remove the batch dimension because we're wrapping it in a list. + audio_features.append(single_audio_features.squeeze(0)) + + return MultiModalInputs({"audio_features": audio_features}) def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs): @@ -138,25 +147,31 @@ def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs): return llm_inputs feature_extractor = whisper_feature_extractor(ctx) - audio_data, sample_rate = multi_modal_data["audio"] - - audio_length = audio_data.shape[0] - if sample_rate != feature_extractor.sampling_rate: - # Account for resampling. - adjustment = feature_extractor.sampling_rate / sample_rate - audio_length = math.ceil(adjustment * audio_length) - - feature_extractor_output_length = math.ceil( - (audio_length - - (feature_extractor.hop_length - 1)) / feature_extractor.hop_length) - - uv_config = ctx.get_hf_config(UltravoxConfig) - audio_num_tokens = min( - max( - 1, - math.ceil(feature_extractor_output_length / - (uv_config.stack_factor * 2))), - get_ultravox_max_audio_tokens(ctx)) + audios = multi_modal_data["audio"] + if not isinstance(audios, list): + audios = [audios] + + audio_token_counts = [] + for audio_data, sample_rate in audios: + audio_length = audio_data.shape[0] + if sample_rate != feature_extractor.sampling_rate: + # Account for resampling. + adjustment = feature_extractor.sampling_rate / sample_rate + audio_length = math.ceil(adjustment * audio_length) + + feature_extractor_output_length = math.ceil( + (audio_length - (feature_extractor.hop_length - 1)) / + feature_extractor.hop_length) + + uv_config = ctx.get_hf_config(UltravoxConfig) + audio_num_tokens = min( + max( + 1, + math.ceil(feature_extractor_output_length / + (uv_config.stack_factor * 2))), + get_ultravox_max_audio_tokens(ctx)) + audio_token_counts.append(audio_num_tokens) + tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer) new_prompt, new_token_ids = repeat_and_pad_placeholder_tokens( @@ -164,7 +179,7 @@ def input_processor_for_ultravox(ctx: InputContext, llm_inputs: LLMInputs): llm_inputs.get("prompt"), llm_inputs["prompt_token_ids"], placeholder_token_id=_AUDIO_PLACEHOLDER_TOKEN, - repeat_count=audio_num_tokens, + repeat_count=audio_token_counts, ) # NOTE: Create a defensive copy of the original inputs @@ -338,45 +353,52 @@ def _parse_and_validate_audio_input( raise ValueError("Incorrect type of audio features. " f"Got type: {type(audio_features)}") - # Remove the N dimension until multiple audios are supported. - if isinstance(audio_features, torch.Tensor): - audio_features = audio_features.squeeze(1) - else: - audio_features = [t.squeeze(0) for t in audio_features] - return UltravoxAudioFeatureInputs(type="audio_features", data=audio_features) if audio_embeds is not None: - if not isinstance(audio_embeds, torch.Tensor): + if not isinstance(audio_embeds, (torch.Tensor, list)): raise ValueError("Incorrect type of audio embeds. " f"Got type: {type(audio_embeds)}") - # Remove the N dimension until multiple audios are supported. 
- audio_embeds = audio_embeds.squeeze(1) - return UltravoxAudioEmbeddingInputs(type="audio_embeds", data=audio_embeds) raise AssertionError("This line should be unreachable.") def _process_audio_input( - self, audio_input: UltravoxAudioInputs - ) -> Union[torch.Tensor, List[torch.Tensor]]: + self, audio_input: UltravoxAudioInputs) -> NestedTensors: if audio_input["type"] == "audio_embeds": return audio_input["data"] audio_features = audio_input["data"] - if isinstance(audio_features, list): - # TODO: Batch these through the encoder/projector instead of - # serializing them. - return [ - self._audio_features_to_embeddings( - features.unsqueeze(0)).squeeze(0) - for features in audio_features - ] - else: - return self._audio_features_to_embeddings(audio_features) + if isinstance(audio_features, torch.Tensor): + # Combine the B and N dimensions for the encoder/projector + flattened = flatten_bn(audio_features) + flattened_embeddings = self._audio_features_to_embeddings( + flattened) + + # Restore the original dimensions + embeddings = flattened_embeddings.unflatten( + 0, audio_features.shape[:2]) + return embeddings + + result = [] + # TODO: Batch heterogeneous tensors through the encoder/projector + for audio_features_item in audio_features: + if isinstance(audio_features_item, torch.Tensor): + result.append( + self._audio_features_to_embeddings(audio_features_item)) + else: + embeddings = [ + # Add a batch dimension to embed it, then remove it. + self._audio_features_to_embeddings(tensor.unsqueeze(0) + ).squeeze(0) + for tensor in audio_features_item + ] + result.append(embeddings) + + return result def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, kv_caches: List[torch.Tensor], @@ -393,7 +415,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor, with the `input_ids`. Args: - input_features: A batch of audio inputs, [1, 80, M]. + audio_features: A batch of audio inputs [B, N, 80, M]. 
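
# Illustrative sketch (not part of the diff above): the shape bookkeeping in
# the batched branch of _process_audio_input. flatten(0, 1) stands in for
# flatten_bn applied to a plain tensor, and fake_projector stands in for
# self._audio_features_to_embeddings; both stand-ins and the concrete sizes
# are assumptions for the example.
import torch

def fake_projector(features: torch.Tensor) -> torch.Tensor:
    # [B * N, 80, M] -> [B * N, T, H]; only the shapes matter here.
    bn, _, frames = features.shape
    return torch.zeros(bn, frames // 16, 4096)

audio_features = torch.randn(2, 3, 80, 160)        # [B=2, N=3, 80, M=160]
flattened = audio_features.flatten(0, 1)           # combine B and N -> [6, 80, 160]
flattened_embeddings = fake_projector(flattened)   # [6, 10, 4096]
embeddings = flattened_embeddings.unflatten(0, audio_features.shape[:2])
assert embeddings.shape == (2, 3, 10, 4096)        # one sequence per audio clip
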
""" audio_input = self._parse_and_validate_audio_input(**kwargs) if audio_input is not None: diff --git a/vllm/model_executor/models/utils.py b/vllm/model_executor/models/utils.py index 16565e1467e8f..8b80dda96db49 100644 --- a/vllm/model_executor/models/utils.py +++ b/vllm/model_executor/models/utils.py @@ -12,6 +12,7 @@ from vllm.model_executor.model_loader.loader import build_model from vllm.model_executor.models import ModelRegistry from vllm.multimodal.base import NestedTensors +from vllm.sequence import IntermediateTensors from vllm.utils import is_pin_memory_available @@ -279,3 +280,18 @@ def is_pp_missing_parameter(name: str, model: torch.nn.Module) -> bool: if name.startswith(missing_layer_name): return True return False + + +def make_empty_intermediate_tensors_factory(keys: List[str], hidden_size: int): + + def make_empty_intermediate_tensors( + batch_size: int, dtype: torch.dtype, + device: torch.device) -> IntermediateTensors: + return IntermediateTensors({ + key: torch.zeros((batch_size, hidden_size), + dtype=dtype, + device=device) + for key in keys + }) + + return make_empty_intermediate_tensors diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 4bed267e99637..b76b765bc677a 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -120,6 +120,16 @@ async def async_fetch_audio( return librosa.load(BytesIO(audio_bytes), sr=None) +def get_and_parse_audio(audio_url: str) -> MultiModalDataDict: + audio, sr = fetch_audio(audio_url) + return {"audio": (audio, sr)} + + +def get_and_parse_image(image_url: str) -> MultiModalDataDict: + image = fetch_image(image_url) + return {"image": image} + + async def async_get_and_parse_audio(audio_url: str) -> MultiModalDataDict: audio, sr = await async_fetch_audio(audio_url) return {"audio": (audio, sr)} diff --git a/vllm/transformers_utils/tokenizers/mistral.py b/vllm/transformers_utils/tokenizers/mistral.py index 23ecfc0af6be4..533a86b787325 100644 --- a/vllm/transformers_utils/tokenizers/mistral.py +++ b/vllm/transformers_utils/tokenizers/mistral.py @@ -52,12 +52,13 @@ def __init__(self, tokenizer: PublicMistralTokenizer) -> None: assert isinstance(self.tokenizer, (Tekkenizer, SentencePieceTokenizer)), type( self.tokenizer) - self._is_tekken = isinstance(self.tokenizer, Tekkenizer) - if self._is_tekken: + if (is_tekken := isinstance(self.tokenizer, Tekkenizer)): # Make sure special tokens will not raise self.tokenizer.special_token_policy = SpecialTokenPolicy.IGNORE + self._is_tekken = is_tekken + # the following attributes are set to fit VLLM's design self.is_fast = True self.chat_template = True diff --git a/vllm/version.py b/vllm/version.py index 052eb76b5873c..039f6369b8ed5 100644 --- a/vllm/version.py +++ b/vllm/version.py @@ -9,4 +9,4 @@ stacklevel=2) __commit__ = "COMMIT_HASH_PLACEHOLDER" -__version__ = "0.5.5" +__version__ = "0.6.0" diff --git a/vllm/worker/neuron_model_runner.py b/vllm/worker/neuron_model_runner.py index f3defffdfa520..0cf7445d4388d 100644 --- a/vllm/worker/neuron_model_runner.py +++ b/vllm/worker/neuron_model_runner.py @@ -1,4 +1,5 @@ from dataclasses import dataclass +from importlib.util import find_spec from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple, Union import torch @@ -76,9 +77,14 @@ def __init__( self.model: nn.Module # initialize after load_model. 
def load_model(self) -> None: - self.model = get_neuron_model(self.model_config, - parallel_config=self.parallel_config, - scheduler_config=self.scheduler_config) + if find_spec("transformers_neuronx") is not None: + self.model = get_neuron_model( + self.model_config, + parallel_config=self.parallel_config, + scheduler_config=self.scheduler_config) + else: + raise NotImplementedError( + "Supports only Transformer-NeuronX based models.") def _prepare_prompt( self, diff --git a/vllm/worker/tpu_worker.py b/vllm/worker/tpu_worker.py index 44fa3aed5816d..9e0c522cee453 100644 --- a/vllm/worker/tpu_worker.py +++ b/vllm/worker/tpu_worker.py @@ -102,8 +102,9 @@ def init_device(self) -> None: # NOTE(woosuk): Set per-rank cache path since different ranks # can have slightly different XLA graphs. world_size = self.parallel_config.world_size + rank = xr.global_ordinal() per_rank_path = os.path.join(envs.VLLM_XLA_CACHE_PATH, - f"tp{world_size}_rank{self.rank}") + f"tp{world_size}_rank{rank}") xr.initialize_cache(per_rank_path, readonly=False) def load_model(self):
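
The NeuronModelRunner change above gates model loading on the presence of the
transformers_neuronx package. A minimal standalone sketch of that guard, with
the printed messages invented for illustration:

from importlib.util import find_spec

# find_spec only searches for the top-level package; it does not import it,
# so the probe is cheap.
if find_spec("transformers_neuronx") is not None:
    print("transformers_neuronx found: Neuron model loading can proceed.")
else:
    print("transformers_neuronx missing: load_model would raise "
          "NotImplementedError.")
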