diff --git a/.buildkite/run-openvino-test.sh b/.buildkite/run-openvino-test.sh
index 70e56596c4a86..35ad5c0ddde77 100755
--- a/.buildkite/run-openvino-test.sh
+++ b/.buildkite/run-openvino-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container

 # Run the image and launch offline inference
-docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/vllm/examples/offline_inference.py
+docker run --network host --env VLLM_OPENVINO_KVCACHE_SPACE=1 --name openvino-test openvino-test python3 /workspace/examples/offline_inference.py
diff --git a/vllm/attention/backends/openvino.py b/vllm/attention/backends/openvino.py
index 6fddfc2002120..be06d16009988 100644
--- a/vllm/attention/backends/openvino.py
+++ b/vllm/attention/backends/openvino.py
@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import List, Tuple, Type
+from typing import Dict, List, Optional, Tuple, Type

 import openvino as ov
 import torch
@@ -7,6 +7,7 @@
 from vllm.attention.backends.abstract import (AttentionBackend,
                                               AttentionMetadata)
 from vllm.attention.backends.utils import CommonAttentionState
+from vllm.multimodal import MultiModalPlaceholderMap


 def copy_cache_block(src_tensor: ov.Tensor, dst_tensor: ov.Tensor,
@@ -128,3 +129,12 @@ class OpenVINOAttentionMetadata:
     # Shape: scalar
     # Type: i32
     max_context_len: torch.Tensor
+
+    # The index maps that relate multi-modal embeddings to the corresponding
+    # placeholders.
+    #
+    # N.B. These aren't really related to attention and don't belong on this
+    # type -- this is just a temporary solution to make them available to
+    # `model_executable`.
+    multi_modal_placeholder_index_maps: Optional[Dict[
+        str, MultiModalPlaceholderMap.IndexMap]]
diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py
index 07c06149f0206..522aa748f78b6 100644
--- a/vllm/model_executor/models/molmo.py
+++ b/vllm/model_executor/models/molmo.py
@@ -21,8 +21,8 @@
                               get_tensor_model_parallel_world_size,
                               split_tensor_along_last_dim,
                               tensor_model_parallel_all_gather)
-from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, InputContext,
-                         token_inputs)
+from vllm.inputs import (INPUT_REGISTRY, DecoderOnlyInputs, DummyData,
+                         InputContext, token_inputs)
 from vllm.model_executor import SamplingMetadata
 from vllm.model_executor.layers.activation import QuickGELU, SiluAndMul
 from vllm.model_executor.layers.layernorm import RMSNorm
@@ -915,7 +915,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int,
     if "image_masks" in out:
         dummy_imgdata["image_masks"] = out["image_masks"]
     dummy_imgdata["seq_len"] = torch.tensor(seq_len, dtype=torch.long)
-    return dummy_seqdata, {"image": dummy_imgdata}
+    return DummyData(dummy_seqdata, {"image": dummy_imgdata})


 def pad_images(
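
For context on the new `multi_modal_placeholder_index_maps` field, the snippet below is a minimal sketch (not part of the patch) of the idea it carries: per modality, an index map pairs rows of the stacked multi-modal embedding tensor with the placeholder-token positions in the flattened input sequence. The local `IndexMap` class and `scatter_mm_embeddings` helper are illustrative stand-ins, not the actual `MultiModalPlaceholderMap.IndexMap` type or any vLLM API; the real implementation may differ in detail.

    # Illustrative sketch only. `IndexMap` is a stand-in for
    # `MultiModalPlaceholderMap.IndexMap`; the helper name is hypothetical.
    from typing import Dict, List, NamedTuple

    import torch


    class IndexMap(NamedTuple):
        src: List[int]   # indices into the modality's embedding tensor
        dest: List[int]  # indices into the flattened input embeddings


    def scatter_mm_embeddings(
        inputs_embeds: torch.Tensor,         # (num_tokens, hidden_size)
        mm_embeds: Dict[str, torch.Tensor],  # per-modality (num_mm, hidden_size)
        index_maps: Dict[str, IndexMap],
    ) -> torch.Tensor:
        """Overwrite placeholder positions with matching multi-modal rows."""
        for modality, index_map in index_maps.items():
            embeds = mm_embeds[modality]
            inputs_embeds[torch.tensor(index_map.dest)] = \
                embeds[torch.tensor(index_map.src)]
        return inputs_embeds

Under that assumption, stashing the maps on the attention metadata (as the patch's own N.B. comment concedes, only a temporary home) is what lets `model_executable` perform this scatter without threading an extra argument through the backend interfaces.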