
[Model] Add support for Qwen2-VL video embeddings input & multiple image embeddings input with varied resolutions #10221

Merged
merged 5 commits into main from feat/qwen2-vl-embedding-input on Nov 13, 2024

Conversation

imkero
Contributor

@imkero imkero commented Nov 11, 2024

Goal

  1. Add support for Qwen2-VL multiple image embeddings input with varied resolutions

    Currently, vLLM's implementation of Qwen2-VL image embedding input requires all images in a prompt to have the same resolution, while Qwen2-VL itself supports images of varied resolutions (with a correspondingly varied number of vision tokens per image). This PR fixes that; a sketch of the per-image expansion appears after this list.

    current vLLM impl:

    image_cnt = len(image_indices)
    embed_dim = image_inputs.get('image_embeds').size(0)
    assert embed_dim % image_cnt == 0
    num_pad_tokens = embed_dim // image_cnt
    for idx, token in enumerate(prompt_token_ids):
        if idx in image_indices:
            prompt_token_ids_with_image.extend([token] * num_pad_tokens)
        else:
            prompt_token_ids_with_image.append(token)
    prompt_token_ids = prompt_token_ids_with_image

    huggingface impl:

    https://github.com/huggingface/transformers/blob/187439c3fa139b2102a874483e9f8f0cfa8e5557/src/transformers/models/qwen2_vl/processing_qwen2_vl.py#L133-L153

  2. Add support for Qwen2-VL video embeddings input
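
A minimal sketch of the per-image expansion performed by the Hugging Face processor linked above (an illustration, not the exact code in this PR), operating on the same variables as the vLLM snippet quoted in item 1 and assuming Qwen2-VL's default spatial merge size of 2:

# Sketch only: expand each image placeholder by that image's own token count,
# computed from its (t, h, w) patch grid, instead of assuming every image
# contributes the same number of tokens.
merge_length = 2 ** 2  # assumption: spatial merge_size = 2 (Qwen2-VL default)
prompt_token_ids_with_image = []
image_idx = 0
for idx, token in enumerate(prompt_token_ids):
    if idx in image_indices:
        t, h, w = image_grid_thw[image_idx]
        num_pad_tokens = (t * h * w) // merge_length
        prompt_token_ids_with_image.extend([token] * num_pad_tokens)
        image_idx += 1
    else:
        prompt_token_ids_with_image.append(token)
prompt_token_ids = prompt_token_ids_with_image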

Example code

from io import BytesIO
import requests
from PIL import Image
import torch
from vllm import LLM, SamplingParams
from vllm.model_executor.models.qwen2_vl import cached_get_processor

IMAGE_PLACEHOLDER = "<|vision_start|><|image_pad|><|vision_end|>"
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"

llm = None
processor = None
model = "Qwen/Qwen2-VL-7B-Instruct"

def qwen2_vl_chat_template(*query):
    return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n"

def read_image(image_path: str):
    if image_path.startswith("https://"):
        print("Downloading image:", image_path)
        response = requests.get(image_path)
        response.raise_for_status()
        return Image.open(BytesIO(response.content))
    
    return Image.open(image_path)

def make_image_embeddings(images):
    image_processor = processor.image_processor

    preprocess_result = image_processor \
        .preprocess(images=images, return_tensors="pt") \
        .data
    pixel_values = preprocess_result["pixel_values"]
    image_grid_thw = preprocess_result["image_grid_thw"]

    with torch.no_grad():
        visual = llm.llm_engine.model_executor.driver_worker.model_runner.model.visual

        pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
        image_grid_thw_on_device = image_grid_thw.to(visual.device, dtype=torch.int64)
        image_embeds = visual(pixel_values_on_device, grid_thw=image_grid_thw_on_device)
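        # image_embeds: one row per merged vision token across all images;
        # image_grid_thw: (num_images, 3), each image's (temporal, height, width) patch grid,
        # which lets vLLM expand each placeholder by that image's own token count.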

    return {
        "image_embeds": image_embeds,
        "image_grid_thw": image_grid_thw,
    }

def make_video_embeddings(videos):
    image_processor = processor.image_processor

    preprocess_result = image_processor \
        .preprocess(images=None, videos=videos, return_tensors="pt") \
        .data
    pixel_values_video = preprocess_result["pixel_values_videos"]
    video_grid_thw = preprocess_result["video_grid_thw"]

    with torch.no_grad():
        visual = llm.llm_engine.model_executor.driver_worker.model_runner.model.visual

        pixel_values_on_device = pixel_values_video.to(visual.device, dtype=visual.dtype)
        video_grid_thw_on_device = video_grid_thw.to(visual.device, dtype=torch.int64)
        video_embeds = visual(pixel_values_on_device, grid_thw=video_grid_thw_on_device)
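        # video_embeds: one row per merged vision token across all videos;
        # video_grid_thw: (num_videos, 3), each video's (temporal, height, width) patch grid.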

    return {
        "video_embeds": video_embeds,
        "video_grid_thw": video_grid_thw,
    }

def test_image_embedding_input():
    sampling_params = SamplingParams(
        temperature=0,
        top_k=1,
        top_p=0.0001,
        repetition_penalty=1.0,
        max_tokens=128,
        stop_token_ids=[],
    )

    images = {
        # 480 * 240, with text "1 + 1 = ?" in it, 153 image tokens
        "question": read_image("https://github.com/user-attachments/assets/da771a80-a6c9-4df6-a894-4eb229e99f1a"),

        # 64 * 64, with text "4" in it, 4 image tokens
        "option1": read_image("https://github.com/user-attachments/assets/8f1380b5-bba7-4c71-b2a9-ed3067a490eb"),

        # 240 * 240, with text "2" in it, 81 image tokens
        "option2": read_image("https://github.com/user-attachments/assets/73f9b31b-b0c9-432e-b1df-5c1275d8f26e"),
    }

    mm_data = {
        "image": make_image_embeddings([
            images["question"],
            images["option1"],
            images["option2"],
        ]),
    }

    def ask(question):
        outputs = llm.generate(
            prompts={
                "prompt": qwen2_vl_chat_template(
                    "Picture 1: ",
                    IMAGE_PLACEHOLDER,
                    "Picture 2: ",
                    IMAGE_PLACEHOLDER,
                    "Picture 3: ",
                    IMAGE_PLACEHOLDER,
                    question,
                ),
                "multi_modal_data": mm_data,
            },
            sampling_params=sampling_params
        )
        return outputs[0].outputs[0].text

    print(ask("Tell the text in Picture 1"))
    # 'The text in Picture 1 is "1 + 1 = ?".'

    print(ask("Tell the text in Picture 2"))
    # 'The text in Picture 2 is "4".'

    print(ask("Tell the text in Picture 3"))
    # 'The text in Picture 3 is "2".'

    print(ask("The question is in Picture 1, "
              "and two possible answers are in Picture 2 and Picture 3, "
              "which answer is correct to the question?"))
    # 'The correct answer to the question "1 + 1 = ?" is 2. Therefore, the correct answer is in Picture 3, which shows the number 2.'

def test_video_embedding_input():
    sampling_params = SamplingParams(
        temperature=0,
        top_k=1,
        top_p=0.0001,
        repetition_penalty=1.0,
        max_tokens=128,
        stop_token_ids=[],
    )

    videos = {
        "video1": [
            read_image("https://github.com/user-attachments/assets/5804b5b8-e078-4f93-8b6d-88384e0bcf57").resize((480, 270)),
            read_image("https://github.com/user-attachments/assets/777a6ef5-1110-4f53-8c29-41bc7c9f003a").resize((480, 270)),
            read_image("https://github.com/user-attachments/assets/2f93ff66-0782-4267-8976-3d872f9caba2").resize((480, 270)),
            read_image("https://github.com/user-attachments/assets/f04c0b74-a33c-4652-b4b0-e07e4b11d5c7").resize((480, 270)),
        ],
    }

    mm_data = {
        "video": make_video_embeddings([
            videos["video1"],
        ]),
    }

    def ask(question):
        outputs = llm.generate(
            prompts={
                "prompt": qwen2_vl_chat_template(
                    VIDEO_PLACEHOLDER,
                    question,
                ),
                "multi_modal_data": mm_data,
            },
            sampling_params=sampling_params
        )
        return outputs[0].outputs[0].text

    print(ask("Describe this video shortly"))

if __name__ == "__main__":
    llm = LLM(
        model=model,
        limit_mm_per_prompt={"image": 4, "video": 2},
        max_model_len=6400,
        max_num_seqs=1,
    )
    processor = cached_get_processor(model)

    test_image_embedding_input()
    test_video_embedding_input()
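
For reference, the embeddings above are produced by the model's own vision encoder, and the tests added in this PR compare that path against ordinary raw-image input. A minimal sketch of the raw-input equivalent (standard vLLM multimodal usage, not code from this PR), reusing the images dict from test_image_embedding_input:

mm_data_raw = {
    # Pass PIL images directly; vLLM runs the processor and vision encoder itself.
    "image": [images["question"], images["option1"], images["option2"]],
}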


👋 Hi! Thank you for contributing to the vLLM project.
Just a reminder: PRs do not trigger a full CI run by default. Instead, only the fastcheck CI runs, which covers a small, essential subset of CI tests to quickly catch errors. You can run other CI tests on top of those by going to your fastcheck build on the Buildkite UI (linked in the PR checks section) and unblocking them. If you do not have permission to unblock, ping simon-mo or khluu to add you to our Buildkite org.

Once the PR is approved and ready to go, your PR reviewer(s) can run CI to test the changes comprehensively before merging.

To run CI, PR reviewers can do one of these:

  • Add ready label to the PR
  • Enable auto-merge.

🚀

@mergify mergify bot added the documentation (Improvements or additions to documentation) label on Nov 11, 2024
…age embeddings input with varied resolutions

Signed-off-by: imkero <[email protected]>
@imkero imkero force-pushed the feat/qwen2-vl-embedding-input branch from 9022a63 to 79ca0e8 on November 11, 2024 12:08
@DarkLight1337
Member

Thanks for adding this! Are you able to add embedding input tests for Qwen2-VL?

@imkero
Contributor Author

imkero commented Nov 11, 2024

Thanks for adding this! Are you able to add embedding input tests for Qwen2-VL?

Sure! I will update this PR with related tests soon.

@imkero imkero force-pushed the feat/qwen2-vl-embedding-input branch from 39d61d2 to 84ec384 on November 12, 2024 11:41
@DarkLight1337
Member

It would be great if you could incorporate this into test_models.py!

@imkero
Contributor Author

imkero commented Nov 12, 2024

It would be great if you could incorporate this into test_models.py!

I have noticed test_models.py with VLMTestType.CUSTOM_INPUTS in it, but I still think it would be better to implement a separate test, because:

  • test_models.py aims at comparing vLLM's output against Hugging Face's output for the same model with (almost) the same input, while the tests added in this PR compare vLLM's output given direct image/video input against vLLM's output given the same images'/videos' embeddings as input for Qwen2-VL
  • tests in test_models.py are driven by static definitions, but obtaining image/video embeddings requires runtime inference

So I think it should be OK to add a separate test file for this feature, maybe?

@DarkLight1337
Member

DarkLight1337 commented Nov 12, 2024

  • the tests added in this PR compare vLLM's output given direct image/video input against vLLM's output given the same images'/videos' embeddings as input for Qwen2-VL
  • tests in test_models.py are driven by static definitions, but obtaining image/video embeddings requires runtime inference

So I think it should be OK to add a separate test file for this feature, maybe?

That's a good point. Let's keep the tests in a separate file then.

Can you mark these tests with core_models so that they are regularly tested in CI?
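
A hypothetical illustration of such a tag (assuming vLLM exposes a core_model pytest marker matching the name above; check the project's pytest configuration for the exact marker name):

import pytest

@pytest.mark.core_model  # assumed marker name, per the request above
def test_qwen2_vl_image_embeddings_input():
    ...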

@imkero imkero force-pushed the feat/qwen2-vl-embedding-input branch from 8b8db2e to ee2344b on November 13, 2024 04:30
@imkero imkero requested a review from DarkLight1337 November 13, 2024 05:57
Member

@DarkLight1337 DarkLight1337 left a comment


The tests have passed, so LGTM. Thanks for implementing this!

@DarkLight1337 DarkLight1337 enabled auto-merge (squash) November 13, 2024 05:59
@github-actions github-actions bot added the ready (ONLY add when PR is ready to merge/full CI is needed) label on Nov 13, 2024
@DarkLight1337 DarkLight1337 merged commit 3945c82 into vllm-project:main Nov 13, 2024
63 checks passed
rickyyx pushed a commit to rickyyx/vllm that referenced this pull request Nov 13, 2024
…age embeddings input with varied resolutions (vllm-project#10221)

Signed-off-by: imkero <[email protected]>
sumitd2 pushed a commit to sumitd2/vllm that referenced this pull request Nov 14, 2024
…age embeddings input with varied resolutions (vllm-project#10221)

Signed-off-by: imkero <[email protected]>
Signed-off-by: Sumit Dubey <[email protected]>
KuntaiDu pushed a commit to KuntaiDu/vllm that referenced this pull request Nov 20, 2024
…age embeddings input with varied resolutions (vllm-project#10221)

Signed-off-by: imkero <[email protected]>
mfournioux pushed a commit to mfournioux/vllm that referenced this pull request Nov 20, 2024
…age embeddings input with varied resolutions (vllm-project#10221)

Signed-off-by: imkero <[email protected]>
Signed-off-by: Maxime Fournioux <[email protected]>
tlrmchlsmth pushed a commit to neuralmagic/vllm that referenced this pull request Nov 23, 2024
…age embeddings input with varied resolutions (vllm-project#10221)

Signed-off-by: imkero <[email protected]>
Signed-off-by: Tyler Michael Smith <[email protected]>
sleepwalker2017 pushed a commit to sleepwalker2017/vllm that referenced this pull request Dec 13, 2024
…age embeddings input with varied resolutions (vllm-project#10221)

Signed-off-by: imkero <[email protected]>