Skip to content

Commit

Permalink
fix: Qwen2-VL multi modal attr name
Browse files Browse the repository at this point in the history
Signed-off-by: imkero <[email protected]>
  • Loading branch information
imkero committed Nov 11, 2024
1 parent b739f29 commit 9022a63
Showing 1 changed file with 13 additions and 13 deletions.
26 changes: 13 additions & 13 deletions vllm/model_executor/models/qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@

class Qwen2VLImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
pixel_values: torch.Tensor
"""Shape:
`(num_patches, num_channels * patch_size * patch_size)`
"""
Expand All @@ -92,7 +92,7 @@ class Qwen2VLImagePixelInputs(TypedDict):

class Qwen2VLImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: torch.Tensor
image_embeds: torch.Tensor
"""Supported types:
- List[`torch.Tensor`]: A list of tensors holding all images' features.
Each tensor holds an image's features.
Expand All @@ -116,8 +116,8 @@ class Qwen2VLImageEmbeddingInputs(TypedDict):


class Qwen2VLVideoPixelInputs(TypedDict):
type: Literal["pixel_values_video"]
data: torch.Tensor
type: Literal["pixel_values_videos"]
pixel_values_videos: torch.Tensor
"""Shape:
`(num_patches,
num_channels * temporal_patch_size * patch_size * patch_size)`
Expand All @@ -132,7 +132,7 @@ class Qwen2VLVideoPixelInputs(TypedDict):

class Qwen2VLVideoEmbeddingInputs(TypedDict):
type: Literal["video_embeds"]
data: torch.Tensor
video_embeds: torch.Tensor
"""Supported types:
- List[`torch.Tensor`]: A list of tensors holding all videos' features.
Each tensor holds a video's features.
Expand Down Expand Up @@ -1152,7 +1152,7 @@ def _parse_and_validate_image_input(
f"Got type: {type(pixel_values)}")

return Qwen2VLImagePixelInputs(type="pixel_values",
data=pixel_values,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw)

if image_embeds is not None:
Expand All @@ -1163,7 +1163,7 @@ def _parse_and_validate_image_input(
raise ValueError("Incorrect type of image embeddings. "
f"Got type: {type(image_embeds)}")
return Qwen2VLImageEmbeddingInputs(type="image_embeds",
data=image_embeds,
image_embeds=image_embeds,
image_grid_thw=image_grid_thw)

def _parse_and_validate_video_input(
Expand All @@ -1182,8 +1182,8 @@ def _parse_and_validate_video_input(
video_grid_thw, "video grid_thw")

return Qwen2VLVideoPixelInputs(
type="pixel_values_video",
data=pixel_values_videos,
type="pixel_values_videos",
pixel_values_videos=pixel_values_videos,
video_grid_thw=video_grid_thw,
)

Expand All @@ -1195,23 +1195,23 @@ def _parse_and_validate_video_input(
raise ValueError("Incorrect type of video embeddings. "
f"Got type: {type(video_embeds)}")
return Qwen2VLVideoEmbeddingInputs(type="video_embeds",
data=video_embeds,
video_embeds=video_embeds,
video_grid_thw=video_grid_thw)

def _process_image_input(self,
image_input: Qwen2VLImageInputs) -> torch.Tensor:
if image_input["type"] == "image_embeds":
return image_input["data"].type(self.visual.dtype)
return image_input["image_embeds"].type(self.visual.dtype)

pixel_values = image_input["data"].type(self.visual.dtype)
pixel_values = image_input["pixel_values"].type(self.visual.dtype)
image_embeds = self.visual(pixel_values,
grid_thw=image_input["image_grid_thw"])
return image_embeds

def _process_video_input(self,
video_input: Qwen2VLVideoInputs) -> torch.Tensor:
if video_input["type"] == "video_embeds":
return video_input["data"].type(self.visual.dtype)
return video_input["video_embeds"].type(self.visual.dtype)

pixel_values_videos = video_input["pixel_values_videos"].type(
self.visual.dtype)
Expand Down

0 comments on commit 9022a63

Please sign in to comment.