diff --git a/vllm/model_executor/models/molmo.py b/vllm/model_executor/models/molmo.py index a328b5a2aeea7..9f744b6918818 100644 --- a/vllm/model_executor/models/molmo.py +++ b/vllm/model_executor/models/molmo.py @@ -928,7 +928,11 @@ def image_input_mapper_for_molmo( data: object, ): if isinstance(data, list): + assert len(data) == 1, "Molmo supports only one image per prompt." data = data[0] + + # Remove unused dummy PIL image + data.pop('raw_mm_data', None) return MultiModalKwargs(data) @@ -974,6 +978,7 @@ def dummy_data_for_molmo(ctx: InputContext, seq_len: int, dummy_imgdata = { "images": out["images"], "image_input_idx": out["image_input_idx"], + "raw_mm_data": dummy_image, } if "image_masks" in out: dummy_imgdata["image_masks"] = out["image_masks"] diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py index bba71c29cc108..cb97f743b1d52 100644 --- a/vllm/v1/engine/mm_input_mapper.py +++ b/vllm/v1/engine/mm_input_mapper.py @@ -151,17 +151,31 @@ class MMHasher: def __init__(self): pass - def hash_mm_data( + def hash_dummy_mm_data( self, mm_data: Optional[MultiModalDataDict]) -> Optional[List[str]]: + """Hash user-defined dummy multimodal data used for profiling.""" + if mm_data is None: return None image_inputs = mm_data['image'] + # This is a temporary workaround for models (e.g, Molmo) that + # process multimodal data in the input processor (therefore + # image_inputs is MultiModalKwargs instead of raw input format). + # `raw_mm_data` with the original input format is expected + # in this case. + if isinstance(image_inputs, dict): + assert "raw_mm_data" in image_inputs and isinstance( + image_inputs["raw_mm_data"], PIL.Image.Image) + image_inputs = image_inputs.pop("raw_mm_data") + return self.hash_images(image_inputs) - def hash_prompt(self, prompt: PromptType) -> Optional[List[str]]: + def hash_prompt_mm_data(self, prompt: PromptType) -> Optional[List[str]]: + """Hash multimodal data in the user input prompt if they exist.""" + if "multi_modal_data" not in prompt: return None @@ -171,6 +185,7 @@ def hash_prompt(self, prompt: PromptType) -> Optional[List[str]]: return self.hash_images(image_inputs) def hash_images(self, image_inputs) -> Optional[List[str]]: + """Hash PIL image objects to strings.""" if not isinstance(image_inputs, list): image_inputs = [image_inputs] assert len(image_inputs) > 0 diff --git a/vllm/v1/engine/processor.py b/vllm/v1/engine/processor.py index ffcaa158d252d..6ee8732bc902c 100644 --- a/vllm/v1/engine/processor.py +++ b/vllm/v1/engine/processor.py @@ -79,7 +79,7 @@ def process_inputs( # Compute MM hashes (if enabled) mm_hashes = None if self.use_hash: - mm_hashes = self.mm_hasher.hash_prompt(prompt) + mm_hashes = self.mm_hasher.hash_prompt_mm_data(prompt) # Process inputs. preprocessed_inputs = self.input_preprocessor.preprocess( diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 8ec4a252d5925..cb89246db0cc9 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -638,7 +638,7 @@ def profile_run(self) -> None: # Compute MM hashes (if enabled) mm_hashes = None if self.use_hash: - mm_hashes = self.mm_hasher.hash_mm_data(dummy_mm_data) + mm_hashes = self.mm_hasher.hash_dummy_mm_data(dummy_mm_data) dummy_mm_kwargs = self.mm_input_mapper_client.process_inputs( mm_data=dummy_mm_data,