Skip to content

Commit

Permalink
fix: Qwen2-VL multi modal attr name
Browse files Browse the repository at this point in the history
Signed-off-by: imkero <[email protected]>
  • Loading branch information
imkero committed Nov 11, 2024
1 parent b739f29 commit 9022a63
Showing 1 changed file with 13 additions and 13 deletions.
26 changes: 13 additions & 13 deletions vllm/model_executor/models/qwen2_vl.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@

class Qwen2VLImagePixelInputs(TypedDict):
type: Literal["pixel_values"]
data: torch.Tensor
pixel_values: torch.Tensor
"""Shape:
`(num_patches, num_channels * patch_size * patch_size)`
"""
Expand All @@ -92,7 +92,7 @@ class Qwen2VLImagePixelInputs(TypedDict):

class Qwen2VLImageEmbeddingInputs(TypedDict):
type: Literal["image_embeds"]
data: torch.Tensor
image_embeds: torch.Tensor
"""Supported types:
- List[`torch.Tensor`]: A list of tensors holding all images' features.
Each tensor holds an image's features.
Expand All @@ -116,8 +116,8 @@ class Qwen2VLImageEmbeddingInputs(TypedDict):


class Qwen2VLVideoPixelInputs(TypedDict):
type: Literal["pixel_values_video"]
data: torch.Tensor
type: Literal["pixel_values_videos"]
pixel_values_videos: torch.Tensor
"""Shape:
`(num_patches,
num_channels * temporal_patch_size * patch_size * patch_size)`
Expand All @@ -132,7 +132,7 @@ class Qwen2VLVideoPixelInputs(TypedDict):

class Qwen2VLVideoEmbeddingInputs(TypedDict):
type: Literal["video_embeds"]
data: torch.Tensor
video_embeds: torch.Tensor
"""Supported types:
- List[`torch.Tensor`]: A list of tensors holding all videos' features.
Each tensor holds a video's features.
Expand Down Expand Up @@ -1152,7 +1152,7 @@ def _parse_and_validate_image_input(
f"Got type: {type(pixel_values)}")

return Qwen2VLImagePixelInputs(type="pixel_values",
data=pixel_values,
pixel_values=pixel_values,
image_grid_thw=image_grid_thw)

if image_embeds is not None:
Expand All @@ -1163,7 +1163,7 @@ def _parse_and_validate_image_input(
raise ValueError("Incorrect type of image embeddings. "
f"Got type: {type(image_embeds)}")
return Qwen2VLImageEmbeddingInputs(type="image_embeds",
data=image_embeds,
image_embeds=image_embeds,
image_grid_thw=image_grid_thw)

def _parse_and_validate_video_input(
Expand All @@ -1182,8 +1182,8 @@ def _parse_and_validate_video_input(
video_grid_thw, "video grid_thw")

return Qwen2VLVideoPixelInputs(
type="pixel_values_video",
data=pixel_values_videos,
type="pixel_values_videos",
pixel_values_videos=pixel_values_videos,
video_grid_thw=video_grid_thw,
)

Expand All @@ -1195,23 +1195,23 @@ def _parse_and_validate_video_input(
raise ValueError("Incorrect type of video embeddings. "
f"Got type: {type(video_embeds)}")
return Qwen2VLVideoEmbeddingInputs(type="video_embeds",
data=video_embeds,
video_embeds=video_embeds,
video_grid_thw=video_grid_thw)

def _process_image_input(self,
image_input: Qwen2VLImageInputs) -> torch.Tensor:
if image_input["type"] == "image_embeds":
return image_input["data"].type(self.visual.dtype)
return image_input["image_embeds"].type(self.visual.dtype)

pixel_values = image_input["data"].type(self.visual.dtype)
pixel_values = image_input["pixel_values"].type(self.visual.dtype)
image_embeds = self.visual(pixel_values,
grid_thw=image_input["image_grid_thw"])
return image_embeds

def _process_video_input(self,
video_input: Qwen2VLVideoInputs) -> torch.Tensor:
if video_input["type"] == "video_embeds":
return video_input["data"].type(self.visual.dtype)
return video_input["video_embeds"].type(self.visual.dtype)

pixel_values_videos = video_input["pixel_values_videos"].type(
self.visual.dtype)
Expand Down

0 comments on commit 9022a63

Please sign in to comment.