From 3bc91631e726eb67b494927e306a33f192c7df61 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 2 Oct 2024 18:50:49 +0400 Subject: [PATCH 01/16] Add model type to vlm config --- src/cpp/include/openvino/genai/vlm_config.hpp | 2 ++ src/cpp/src/vlm_config.cpp | 1 + 2 files changed, 3 insertions(+) diff --git a/src/cpp/include/openvino/genai/vlm_config.hpp b/src/cpp/include/openvino/genai/vlm_config.hpp index dd22e422bf..02d0f7c36a 100644 --- a/src/cpp/include/openvino/genai/vlm_config.hpp +++ b/src/cpp/include/openvino/genai/vlm_config.hpp @@ -12,6 +12,8 @@ namespace ov::genai { /// change VLMPipeline's behavior. Corresponds to config.json. class OPENVINO_GENAI_EXPORTS VLMConfig { public: + /// @brief A string denoting model type. + std::string model_type = ""; /// @brief A size of a single embedding returned by a resampler. /// Used to initialize positional embeddings for resampler input. size_t hidden_size = 2304; diff --git a/src/cpp/src/vlm_config.cpp b/src/cpp/src/vlm_config.cpp index 36d997ecbe..a13a0da702 100644 --- a/src/cpp/src/vlm_config.cpp +++ b/src/cpp/src/vlm_config.cpp @@ -10,6 +10,7 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) { OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config"); nlohmann::json parsed = nlohmann::json::parse(stream); using ov::genai::utils::read_json_param; + read_json_param(parsed, "model_type", model_type); // TODO Consider checking supported model type here instead of VisionEncoder constructor read_json_param(parsed, "hidden_size", hidden_size); read_json_param(parsed, "scale_emb", scale_emb); read_json_param(parsed, "query_num", query_num); From 870128869813e2818a7fed2142d2bae86276b425 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 2 Oct 2024 19:49:09 +0400 Subject: [PATCH 02/16] Add llava specific config params to processor config --- .../openvino/genai/processor_config.hpp | 8 ++++++++ src/cpp/src/processor_config.cpp | 18 +++++++++++++++++- 2 files changed, 25 insertions(+), 1 deletion(-) diff --git a/src/cpp/include/openvino/genai/processor_config.hpp b/src/cpp/include/openvino/genai/processor_config.hpp index 9a70d1f3ae..b42f08db40 100644 --- a/src/cpp/include/openvino/genai/processor_config.hpp +++ b/src/cpp/include/openvino/genai/processor_config.hpp @@ -33,6 +33,14 @@ class OPENVINO_GENAI_EXPORTS ProcessorConfig { /// Applied after norm_mean. /// llava calls it image_std. std::array norm_std{1.0f, 1.0f, 1.0f}; + + // llava specific config params + std::array image_mean{0.0f, 0.0f, 0.0f}; + std::array image_std{1.0f, 1.0f, 1.0f}; + size_t crop_size_height = 336; + size_t crop_size_width = 336; + size_t size_shortest_edge = 336; + /// @brief Default constructor ProcessorConfig() = default; /// @brief Construct ProcessorConfig from values in json_path. diff --git a/src/cpp/src/processor_config.cpp b/src/cpp/src/processor_config.cpp index 33673f7e79..cea7f98fd4 100644 --- a/src/cpp/src/processor_config.cpp +++ b/src/cpp/src/processor_config.cpp @@ -10,7 +10,7 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config"); nlohmann::json parsed = nlohmann::json::parse(stream); using ov::genai::utils::read_json_param; - read_json_param(parsed, "patch_size", patch_size); + read_json_param(parsed, "patch_size", patch_size); // For llava - stored in config.json vision_config read_json_param(parsed, "scale_resolution", scale_resolution); read_json_param(parsed, "max_slice_nums", max_slice_nums); if (parsed.contains("norm_mean")) { @@ -19,4 +19,20 @@ ov::genai::ProcessorConfig::ProcessorConfig(const std::filesystem::path& json_pa if (parsed.contains("norm_std")) { norm_std = parsed.at("norm_std").get>(); } + + // Setting llava config params + if (parsed.contains("image_mean")) { + image_mean = parsed.at("image_mean").get>(); + } + if (parsed.contains("image_std")) { + image_std = parsed.at("image_std").get>(); + } + + if (parsed.contains("crop_size")) { + crop_size_height = parsed.at("crop_size").at("height"); + crop_size_width = parsed.at("crop_size").at("width"); + } + if (parsed.contains("size")) { + size_shortest_edge = parsed.at("size").at("shortest_edge"); + } } From 5b4f1455a477047e9f98f9335062147aa33bf747 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 2 Oct 2024 20:25:39 +0400 Subject: [PATCH 03/16] Add model type to vision encoder, separate encode methods for llava and minicpm --- .../include/openvino/genai/vision_encoder.hpp | 17 ++- src/cpp/src/vision_encoder.cpp | 114 ++++++++++++++++-- 2 files changed, 118 insertions(+), 13 deletions(-) diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/include/openvino/genai/vision_encoder.hpp index 7370b7f8aa..3fe80bf24f 100644 --- a/src/cpp/include/openvino/genai/vision_encoder.hpp +++ b/src/cpp/include/openvino/genai/vision_encoder.hpp @@ -41,11 +41,16 @@ struct EncodedImage { /// ov::InferRequest and configured by ProcessorConfig. class OPENVINO_GENAI_EXPORTS VisionEncoder { public: + /// @brief A string denoting model type. + std::string model_type; /// @brief A model for image encoding. ov::InferRequest m_encoder; /// @brief A config to follow. ProcessorConfig m_processor_config; + // LLaVa specific members + ov::InferRequest m_vision_embeddings; + /// @brief Construct from an already compiled model and a config. /// @param encoder Compiled model. /// @param processor_config Initial config. @@ -65,7 +70,8 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder { const std::filesystem::path& model_dir, const std::string& device="CPU", const ov::AnyMap device_config={}, - ov::Core core=ov::Core{} + ov::Core core=ov::Core{}, + std::string model_type="" ); /// @brief Compute embeddings of an image. @@ -117,5 +123,14 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder { image, AnyMap{std::forward(properties)...} ); } + +private: + EncodedImage encode_minicpm( + const ov::Tensor& image, const ProcessorConfig& config + ); + + EncodedImage encode_llava( + const ov::Tensor& image, const ProcessorConfig& config + ); }; } diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index a35a5d8db7..f513b433d3 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -291,23 +291,94 @@ EncodedImage llava_image_embed_make_with_bytes_slice(clip_ctx& ctx_clip, const o } return {resized_source, resized_source_size, encoded_slices, sliced_sizes}; } + + +ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig& config) { + bool do_resize = true; + bool do_center_crop = true; + + // ov::Tensor to clip_image_u8 + clip_image_u8 input_image{ + int(image.get_shape().at(3)), + int(image.get_shape().at(2)), + {image.data(), image.data() + image.get_size()} + }; + + // Resize + clip_image_u8 resized_image; + if (do_resize) { + int target_size = config.size_shortest_edge; + float scale = static_cast(target_size) / std::min(input_image.nx, input_image.ny); + int new_width = static_cast(input_image.nx * scale); + int new_height = static_cast(input_image.ny * scale); + bicubic_resize(input_image, resized_image, new_width, new_height); + } else { + resized_image = input_image; + } + + // Center crop + clip_image_u8 cropped_image; + if (do_center_crop) { + int crop_height = config.crop_size_height; + int crop_width = config.crop_size_width; + int start_x = (resized_image.nx - crop_width) / 2; + int start_y = (resized_image.ny - crop_height) / 2; + + cropped_image.nx = crop_width; + cropped_image.ny = crop_height; + cropped_image.buf.resize(3 * crop_width * crop_height); + + for (int y = 0; y < crop_height; ++y) { + for (int x = 0; x < crop_width; ++x) { + for (int c = 0; c < 3; ++c) { + cropped_image.buf[(y * crop_width + x) * 3 + c] = + resized_image.buf[((start_y + y) * resized_image.nx + (start_x + x)) * 3 + c]; + } + } + } + } else { + cropped_image = resized_image; + } + + // Normalize + clip_ctx ctx; + std::copy(config.image_mean.begin(), config.image_mean.end(), ctx.image_mean); + std::copy(config.image_std.begin(), config.image_std.end(), ctx.image_std); + + clip_image_f32 normalized_image = clip_image_preprocess(ctx, cropped_image); + + // Convert clip_image_f32 to ov::Tensor + ov::Tensor result( + ov::element::f32, + {1, 3, size_t(normalized_image.ny), size_t(normalized_image.nx)}, + (void*)(normalized_image.buf.data()) + ); + + return result; +} } -VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core) : - VisionEncoder{ - core.compile_model( - model_dir / "image_encoder.xml", device, device_config - ).create_infer_request(), - ov::genai::utils::from_config_json_if_exists( +VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core, std::string model_type) : + model_type(model_type) { + if (model_type == "minicpmv") { + m_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); + } else if (model_type == "llava") { + // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel + m_vision_embeddings = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); + } else { + OPENVINO_THROW("Unsupported model type: " + model_type); + } + m_processor_config = ov::genai::utils::from_config_json_if_exists( model_dir, "preprocessor_config.json" - ) - } {} + ); +} EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) { - clip_ctx ctx_clip; - std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean); - std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std); - return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); + if (model_type == "minicpmv") { + return encode_minicpm(image, config); + } else if (model_type == "llava") { + return encode_llava(image, config); + } } EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& config_map) { @@ -315,3 +386,22 @@ EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ov::AnyMap& co config_map, m_processor_config )); } + +EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const ProcessorConfig& config) { + clip_ctx ctx_clip; + std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean); + std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std); + return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); +} + +EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const ProcessorConfig& config) { + ov::Tensor preprocessed_image = preprocess_image_llava(image, config); + + m_vision_embeddings.set_tensor("pixel_values", preprocessed_image); + m_vision_embeddings.infer(); + + ov::Tensor image_features = m_vision_embeddings.get_output_tensor(); + HeightWidth resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size}; + + return {image_features, resized_source_size}; +} From be3fab4233ed2b1dac6a8e4b2e34a5cc18474699 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 2 Oct 2024 20:28:13 +0400 Subject: [PATCH 04/16] Enable llava model in vlm pipeline, separate preparing inputs embeds for llava and minicpm --- .../include/openvino/genai/vlm_pipeline.hpp | 3 + src/cpp/src/vlm_pipeline.cpp | 91 ++++++++++++++++--- 2 files changed, 81 insertions(+), 13 deletions(-) diff --git a/src/cpp/include/openvino/genai/vlm_pipeline.hpp b/src/cpp/include/openvino/genai/vlm_pipeline.hpp index 85ea9dd661..44754d4b6c 100644 --- a/src/cpp/include/openvino/genai/vlm_pipeline.hpp +++ b/src/cpp/include/openvino/genai/vlm_pipeline.hpp @@ -162,6 +162,9 @@ class OPENVINO_GENAI_EXPORTS VLMPipeline { private: class VLMPipelineImpl; std::unique_ptr m_pimpl; + + ov::Tensor get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images); + ov::Tensor get_inputs_embeds_llava(const std::string& prompt, const std::vector& images); }; /* diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 89eb535aa7..f640c9814e 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -312,20 +312,33 @@ VLMPipeline::VLMPipeline( ) }, m_tokenizer{tokenizer}, - m_vision_encoder(model_dir, device, device_config, core), - m_resampler{core.compile_model( - model_dir / "resampler.xml", device, device_config - ).create_infer_request()}, - m_embedding{core.compile_model( - model_dir / "embed_tokens.xml", device, device_config - ).create_infer_request()}, - m_language{core.compile_model( - model_dir / "language_model.xml", device, device_config - ).create_infer_request()}, - m_pos_embed_cache{ - get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}) - }, + m_vision_encoder(model_dir, device, device_config, core, m_vlm_config.model_type), m_is_chat_conversation{false} { + if (m_vlm_config.model_type == "minicpmv") { + m_resampler = core.compile_model( + model_dir / "resampler.xml", device, device_config + ).create_infer_request(); + + m_embedding = core.compile_model( + model_dir / "embed_tokens.xml", device, device_config + ).create_infer_request(); + + m_language = core.compile_model( + model_dir / "language_model.xml", device, device_config + ).create_infer_request(); + + m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); + } else if (m_vlm_config.model_type == "llava") { + m_language = core.compile_model( + model_dir / "openvino_language_model.xml", device, device_config + ).create_infer_request(); + + // Reusing the same m_embedding for llava text_embeddings model + m_embedding = core.compile_model( + model_dir / "openvino_text_embeddings_model.xml", device, device_config + ).create_infer_request(); + } + m_language.get_tensor("attention_mask").set_shape({1, 0}); } @@ -448,12 +461,21 @@ DecodedResults VLMPipeline::generate( } } } + + // if (m_vlm_config.model_type == "minicpmv") { + // inputs_embeds = get_inputs_embeds_minicpm(prompt, images); + // } else if (m_vlm_config.model_type == "llava") { + // inputs_embeds = get_inputs_embeds_llava(prompt, images); + // } + m_language.set_tensor("inputs_embeds", inputs_embeds); size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1); m_language.get_tensor("attention_mask").set_shape({1, history_len + inputs_embeds.get_shape()[1]}); std::fill_n(m_language.get_tensor("attention_mask").data(), m_language.get_tensor("attention_mask").get_size(), 1); + m_language.get_tensor("position_ids").set_shape({1, inputs_embeds.get_shape().at(1)}); std::iota(m_language.get_tensor("position_ids").data(), m_language.get_tensor("position_ids").data() + m_language.get_tensor("position_ids").get_size(), history_len); + m_language.get_tensor("beam_idx").set_shape({ BATCH_SIZE }); m_language.get_tensor("beam_idx").data()[0] = 0; @@ -586,3 +608,46 @@ GenerationConfig VLMPipeline::get_generation_config() const { void VLMPipeline::set_generation_config(const GenerationConfig& new_config) { m_generation_config = new_config; } + +ov::Tensor VLMPipeline::get_inputs_embeds_llava(const std::string& prompt, const std::vector& images) { + std::string image_token = ""; // TODO Consider getting from vlm_config or json + std::string formatted_prompt = "USER: " + (images.empty() ? prompt : image_token + "\n" + prompt) + " ASSISTANT:"; + ov::Tensor input_ids = m_tokenizer.encode(formatted_prompt).input_ids; + if (images.empty()) { + return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + } else { + OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); + EncodedImage encoded_image = m_vision_encoder.encode(images.at(0)); + ov::Tensor image_embeds = encoded_image.resized_source; + + ov::Tensor text_embeds = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + + int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json + + return merge_text_and_image_embeddings(input_ids, text_embeds, image_embeds, image_token_index); + } +} + +ov::Tensor VLMPipeline::get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images) { + std::string wrapped = images.empty() ? + "<用户>" + prompt + "" : prompt + ""; + ov::Tensor input_ids = m_tokenizer.encode(wrapped).input_ids; + + if (images.empty()) { + //<用户> + prompt + LLM first input + return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + } else { + OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); + EncodedImage embeds = m_vision_encoder.encode(images.at(0)); + ov::Tensor imgEmbedTensor = get_image_embedding(embeds, m_tokenizer, m_embedding, *this); + + ov::Shape img_embed_shape = imgEmbedTensor.get_shape(); + OPENVINO_ASSERT( + m_vlm_config.hidden_size == img_embed_shape.at(2), + "Unexpected embedding size"); + + //<用户> + image embedding + prompt + LLM first input + ov::Tensor prompt_tensor = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); + return concatenate_mid_dim(imgEmbedTensor, prompt_tensor); + } +} From 2ec5ef8bc381edda1eb81bfaaefdae97f2155d0a Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 4 Oct 2024 15:46:38 +0400 Subject: [PATCH 05/16] Add test for vlm sample with llava model --- .github/workflows/causal_lm_cpp.yml | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 3feb7c8563..bbccf0d22d 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -681,7 +681,7 @@ jobs: diff pred2.txt ref.txt echo "Chat sample python" passed - py-vlm_chat_sample-ubuntu: + cpp-vlm_chat_sample-ubuntu: runs-on: ubuntu-22.04-16-cores steps: - uses: actions/checkout@v4 @@ -700,17 +700,31 @@ jobs: source ./ov/setupvars.sh cmake -DCMAKE_BUILD_TYPE=Release -S ./ -B ./build/ cmake --build ./build/ --config Release --target visual_language_chat -j - - name: Download and convert a model and an image + - name: Download and convert MiniCPM-V-2_6 model and an image run: | source ./ov/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python ./samples/cpp/visual_language_chat/export_MiniCPM-V-2_6.py ./miniCPM-V-2_6/ wget https://github.com/openvinotoolkit/openvino_notebooks/assets/29454499/d5fbbd1a-d484-415c-88cb-9986625b7b11 - - run: > + - name: Run visual_language_chat sample - MiniCPM-V-2_6 + run: > source ./ov/setupvars.sh && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./miniCPM-V-2_6/ d5fbbd1a-d484-415c-88cb-9986625b7b11 <<< $'What is on the image?\nWhat is special on the image?' + - name: Download and convert LLaVa 1.5 model and an image + run: | + source ./ov/setupvars.sh + python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly + python -m pip install git+https://github.com/huggingface/optimum-intel.git + optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/ + wget https://llava-vl.github.io/static/images/monalisa.jpg + - name: Run visual_language_chat sample - LLaVa 1.5 + run: > + source ./ov/setupvars.sh + && timeout 120s ./build/samples/cpp/visual_language_chat/visual_language_chat ./llava_1_5_7b_ov/ monalisa.jpg + <<< $'Who drew this painting?\nWhen did the painter live?' cpp-continuous-batching-ubuntu: runs-on: ubuntu-20.04-8-cores From 49447b93a7b8467f31f1cea621fad237e7254694 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Tue, 8 Oct 2024 20:42:46 +0400 Subject: [PATCH 06/16] Restore function for merging text and image embeds for llava --- src/cpp/src/vlm_pipeline.cpp | 50 +++++++++++++++++++++++++++++++++++- 1 file changed, 49 insertions(+), 1 deletion(-) diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index f640c9814e..23f16d1602 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -294,6 +294,54 @@ ov::Tensor resample(VLMPipeline& pipe, const ov::Tensor& encoded_image, const st pipe.m_resampler.infer(); return pipe.m_resampler.get_output_tensor(); // [N, query_num, new_hidden_size] } + +ov::Tensor merge_text_and_image_embeddings_llava( + const ov::Tensor& input_ids, + const ov::Tensor& text_embeds, + const ov::Tensor& image_embeds, + int64_t image_token_index +) { + auto text_embeds_shape = text_embeds.get_shape(); + auto image_embeds_shape = image_embeds.get_shape(); + + OPENVINO_ASSERT( + text_embeds_shape[2] == image_embeds_shape[2], + "Incompatible shapes between text_embeds and image_embeds" + ); + + size_t text_embeds_seq_length = text_embeds_shape[1]; + size_t hidden_size = text_embeds_shape[2]; + size_t image_embeds_seq_length = image_embeds_shape[1]; + + size_t merged_seq_length = text_embeds_seq_length + (image_embeds_seq_length - 1); + + ov::Tensor merged_embeds(text_embeds.get_element_type(), {BATCH_SIZE, merged_seq_length, hidden_size}); + + const int64_t* input_ids_data = input_ids.data(); + const float* text_embeds_data = text_embeds.data(); + const float* image_embeds_data = image_embeds.data(); + float* merged_data = merged_embeds.data(); + + + size_t merged_idx = 0; + for (size_t s = 0; s < text_embeds_seq_length; ++s) { + if (input_ids_data[s] == image_token_index) { + for (size_t i = 0; i < image_embeds_seq_length; ++i) { + std::copy_n(image_embeds_data + i * hidden_size, + hidden_size, + merged_data + merged_idx * hidden_size); + merged_idx++; + } + } else { + std::copy_n(text_embeds_data + s * hidden_size, + hidden_size, + merged_data + merged_idx * hidden_size); + merged_idx++; + } + } + + return merged_embeds; +} } class ov::genai::VLMPipeline::VLMPipelineImpl { @@ -624,7 +672,7 @@ ov::Tensor VLMPipeline::get_inputs_embeds_llava(const std::string& prompt, const int64_t image_token_index = 32000; // TODO Consider getting from m_vlm_config.image_token_index or config.json - return merge_text_and_image_embeddings(input_ids, text_embeds, image_embeds, image_token_index); + return merge_text_and_image_embeddings_llava(input_ids, text_embeds, image_embeds, image_token_index); } } From 790981fdfee00a1585fd9f4cdc8bfd525da023a1 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Tue, 8 Oct 2024 21:01:37 +0400 Subject: [PATCH 07/16] Move getting input embeds for minicpm to separate method --- src/cpp/src/vlm_pipeline.cpp | 253 ++++++++++++++++------------------- 1 file changed, 117 insertions(+), 136 deletions(-) diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 23f16d1602..4b1d13e984 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -398,123 +398,12 @@ DecodedResults VLMPipeline::generate( const GenerationConfig& generation_config, const StreamerVariant& streamer ) { - std::string images_prompt; - EncodedImage embeds; - if (!rgbs.empty()) { - OPENVINO_ASSERT(1 == rgbs.size(), "TODO: Only a single image allowed"); - embeds = m_vision_encoder.encode(rgbs.at(0)); - if (m_vlm_config.use_image_id) { - images_prompt = m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; - ++image_id; - } - std::string unk64; - for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { - unk64 += m_vlm_config.unk; - } - images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; - if (embeds.slices) { - ov::Shape slices_shape = embeds.slices.get_shape(); - for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { - for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { - images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; - } - images_prompt += '\n'; - } - } - if ('\n' != *(images_prompt.end() - 1)) { - // Image wasn't sliced, add \n to the end of image anyway. - // Strangely, \n isn't placed between . - images_prompt += '\n'; - } - } - images_prompt += prompt; - std::string new_templated_chat_history; - if (m_is_chat_conversation) { - // KV cache in model already contains prompts and answers from previous iterations. - // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns - // token_ids = {, ...}. So if tokenizer applies only to the new prompt, - // will be inserted on every iteration. - // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt - // and takes only the difference between them. - // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but - // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. - m_history.push_back({{"role", "user"}, {"content", images_prompt}}); - constexpr bool add_generation_prompt = true; - new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + ov::Tensor inputs_embeds; + if (m_vlm_config.model_type == "minicpmv") { + inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs); + } else if (m_vlm_config.model_type == "llava") { + inputs_embeds = get_inputs_embeds_llava(prompt, rgbs); } - ov::Tensor special_tokens = m_tokenizer.encode( - m_vlm_config.im_start - + m_vlm_config.im_end - + m_vlm_config.slice_start - + m_vlm_config.slice_end - ).input_ids; - OPENVINO_ASSERT( - 4 == special_tokens.get_shape().at(1), - "Every special token must be represented with a single int." - ); - size_t im_start_id = special_tokens.data()[0]; - size_t im_end_id = special_tokens.data()[1]; - size_t slice_start_id = special_tokens.data()[2]; - size_t slice_end_id = special_tokens.data()[3]; - ov::Tensor input_ids = m_tokenizer.encode(new_templated_chat_history).input_ids; - m_embedding.set_input_tensor(input_ids); - m_embedding.infer(); - ov::Tensor inputs_embeds = m_embedding.get_output_tensor(); - OPENVINO_ASSERT( - m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), - "Unexpected embedding size" - ); - if (!rgbs.empty()) { - int64_t* ids = input_ids.data(); - const ov::Tensor& resampled_source = resample(*this, embeds.resized_source, {embeds.resized_source_size}); - float* emb = resampled_source.data(); - bool replacing = false; - for (size_t token_idx = 0; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { - if (im_start_id == ids[token_idx]) { - replacing = true; - } - if (replacing) { - std::copy_n(emb, resampled_source.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); - token_idx += resampled_source.get_shape().at(1); - replacing = false; - break; - } - } - if (embeds.slices) { - size_t token_idx = 0; - const ov::Shape& slices_shape = embeds.slices.get_shape(); - const std::vector& sliced_sizes = embeds.slices_sizes; - for (size_t i = 0; i < slices_shape.at(0); ++i) { - for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { - size_t d2 = slices_shape.at(2); - size_t d3 = slices_shape.at(3); - ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, embeds.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; - const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {sliced_sizes.at(i * slices_shape.at(1) + ja)}); - for (; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { - if (slice_start_id == ids[token_idx]) { - replacing = true; - } - if (slice_end_id == ids[token_idx]) { - replacing = false; - break; - } - if (replacing) { - std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); - token_idx += vision_embed_tensor_i_j.get_shape().at(1); - replacing = false; - break; - } - } - } - } - } - } - - // if (m_vlm_config.model_type == "minicpmv") { - // inputs_embeds = get_inputs_embeds_minicpm(prompt, images); - // } else if (m_vlm_config.model_type == "llava") { - // inputs_embeds = get_inputs_embeds_llava(prompt, images); - // } m_language.set_tensor("inputs_embeds", inputs_embeds); size_t history_len = m_language.get_tensor("attention_mask").get_shape().at(1); @@ -677,25 +566,117 @@ ov::Tensor VLMPipeline::get_inputs_embeds_llava(const std::string& prompt, const } ov::Tensor VLMPipeline::get_inputs_embeds_minicpm(const std::string& prompt, const std::vector& images) { - std::string wrapped = images.empty() ? - "<用户>" + prompt + "" : prompt + ""; - ov::Tensor input_ids = m_tokenizer.encode(wrapped).input_ids; - - if (images.empty()) { - //<用户> + prompt + LLM first input - return process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); - } else { - OPENVINO_ASSERT(1 == images.size(), "Only a single image allowed"); - EncodedImage embeds = m_vision_encoder.encode(images.at(0)); - ov::Tensor imgEmbedTensor = get_image_embedding(embeds, m_tokenizer, m_embedding, *this); - - ov::Shape img_embed_shape = imgEmbedTensor.get_shape(); - OPENVINO_ASSERT( - m_vlm_config.hidden_size == img_embed_shape.at(2), - "Unexpected embedding size"); - - //<用户> + image embedding + prompt + LLM first input - ov::Tensor prompt_tensor = process_prompt(m_embedding, input_ids, m_vlm_config.scale_emb); - return concatenate_mid_dim(imgEmbedTensor, prompt_tensor); + std::string images_prompt; + EncodedImage embeds; + if (!images.empty()) { + OPENVINO_ASSERT(1 == images.size(), "TODO: Only a single image allowed"); + embeds = m_vision_encoder.encode(images.at(0)); + if (m_vlm_config.use_image_id) { + images_prompt = m_vlm_config.im_id_start + std::to_string(image_id) + m_vlm_config.im_id_end; + ++image_id; + } + std::string unk64; + for (size_t idx = 0; idx < m_vlm_config.query_num; ++idx) { + unk64 += m_vlm_config.unk; + } + images_prompt += m_vlm_config.im_start + unk64 + m_vlm_config.im_end; + if (embeds.slices) { + ov::Shape slices_shape = embeds.slices.get_shape(); + for (size_t row_idx = 0; row_idx < slices_shape.at(0); ++row_idx) { + for (size_t col_idx = 0; col_idx < slices_shape.at(1); ++col_idx) { + images_prompt += m_vlm_config.slice_start + unk64 + m_vlm_config.slice_end; + } + images_prompt += '\n'; + } + } + if ('\n' != *(images_prompt.end() - 1)) { + // Image wasn't sliced, add \n to the end of image anyway. + // Strangely, \n isn't placed between . + images_prompt += '\n'; + } + } + images_prompt += prompt; + std::string new_templated_chat_history; + if (m_is_chat_conversation) { + // KV cache in model already contains prompts and answers from previous iterations. + // So only new prompt wrapped into chat template to be sent into model. Tokenizer always returns + // token_ids = {, ...}. So if tokenizer applies only to the new prompt, + // will be inserted on every iteration. + // So actual pipeline calculates input_ids for whole chat history + for whole chat history without the new prompt + // and takes only the difference between them. + // The chat history cannot be saved as already encoded tokens because generate call doesn't return token, but + // KV cache contains it. So we have to add it manually or get it by tokenization all chat history. + m_history.push_back({{"role", "user"}, {"content", images_prompt}}); + constexpr bool add_generation_prompt = true; + new_templated_chat_history = m_tokenizer.apply_chat_template(m_history, add_generation_prompt); + } + ov::Tensor special_tokens = m_tokenizer.encode( + m_vlm_config.im_start + + m_vlm_config.im_end + + m_vlm_config.slice_start + + m_vlm_config.slice_end + ).input_ids; + OPENVINO_ASSERT( + 4 == special_tokens.get_shape().at(1), + "Every special token must be represented with a single int." + ); + size_t im_start_id = special_tokens.data()[0]; + size_t im_end_id = special_tokens.data()[1]; + size_t slice_start_id = special_tokens.data()[2]; + size_t slice_end_id = special_tokens.data()[3]; + ov::Tensor input_ids = m_tokenizer.encode(new_templated_chat_history).input_ids; + m_embedding.set_input_tensor(input_ids); + m_embedding.infer(); + ov::Tensor inputs_embeds = m_embedding.get_output_tensor(); + OPENVINO_ASSERT( + m_vlm_config.hidden_size == inputs_embeds.get_shape().at(2), + "Unexpected embedding size" + ); + if (!images.empty()) { + int64_t* ids = input_ids.data(); + const ov::Tensor& resampled_source = resample(*this, embeds.resized_source, {embeds.resized_source_size}); + float* emb = resampled_source.data(); + bool replacing = false; + for (size_t token_idx = 0; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { + if (im_start_id == ids[token_idx]) { + replacing = true; + } + if (replacing) { + std::copy_n(emb, resampled_source.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); + token_idx += resampled_source.get_shape().at(1); + replacing = false; + break; + } + } + if (embeds.slices) { + size_t token_idx = 0; + const ov::Shape& slices_shape = embeds.slices.get_shape(); + const std::vector& sliced_sizes = embeds.slices_sizes; + for (size_t i = 0; i < slices_shape.at(0); ++i) { + for (size_t ja = 0; ja < slices_shape.at(1); ++ja) { + size_t d2 = slices_shape.at(2); + size_t d3 = slices_shape.at(3); + ov::Tensor encoded_view{ov::element::f32, {1, d2, d3}, embeds.slices.data() + (i * slices_shape.at(1) + ja) * d2 * d3}; + const ov::Tensor& vision_embed_tensor_i_j = resample(*this, encoded_view, {sliced_sizes.at(i * slices_shape.at(1) + ja)}); + for (; token_idx < inputs_embeds.get_shape().at(1); ++token_idx) { + if (slice_start_id == ids[token_idx]) { + replacing = true; + } + if (slice_end_id == ids[token_idx]) { + replacing = false; + break; + } + if (replacing) { + std::copy_n(vision_embed_tensor_i_j.data(), vision_embed_tensor_i_j.get_size(), inputs_embeds.data() + token_idx * m_vlm_config.hidden_size); + token_idx += vision_embed_tensor_i_j.get_shape().at(1); + replacing = false; + break; + } + } + } + } + } } + + return inputs_embeds; } From 1b5435c5511470d164d07e44b9e2c28871ebfc19 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 9 Oct 2024 15:12:31 +0400 Subject: [PATCH 08/16] Add vlm model type enum class --- .../include/openvino/genai/vision_encoder.hpp | 9 +++--- src/cpp/include/openvino/genai/vlm_config.hpp | 5 +-- .../include/openvino/genai/vlm_model_type.hpp | 31 +++++++++++++++++++ src/cpp/src/vision_encoder.cpp | 12 +++---- src/cpp/src/vlm_config.cpp | 2 +- src/cpp/src/vlm_pipeline.cpp | 10 +++--- 6 files changed, 50 insertions(+), 19 deletions(-) create mode 100644 src/cpp/include/openvino/genai/vlm_model_type.hpp diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/include/openvino/genai/vision_encoder.hpp index 3fe80bf24f..e1c2be0102 100644 --- a/src/cpp/include/openvino/genai/vision_encoder.hpp +++ b/src/cpp/include/openvino/genai/vision_encoder.hpp @@ -5,6 +5,7 @@ #include "openvino/genai/processor_config.hpp" #include +#include "vlm_model_type.hpp" namespace ov::genai { /// @brief A pair describing image size. @@ -41,8 +42,8 @@ struct EncodedImage { /// ov::InferRequest and configured by ProcessorConfig. class OPENVINO_GENAI_EXPORTS VisionEncoder { public: - /// @brief A string denoting model type. - std::string model_type; + /// @brief A enum denoting model type. + VLMModelType model_type; /// @brief A model for image encoding. ov::InferRequest m_encoder; /// @brief A config to follow. @@ -68,10 +69,10 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder { /// @param core ov::Core to be used to compile the model. explicit VisionEncoder( const std::filesystem::path& model_dir, + const VLMModelType model_type, const std::string& device="CPU", const ov::AnyMap device_config={}, - ov::Core core=ov::Core{}, - std::string model_type="" + ov::Core core=ov::Core{} ); /// @brief Compute embeddings of an image. diff --git a/src/cpp/include/openvino/genai/vlm_config.hpp b/src/cpp/include/openvino/genai/vlm_config.hpp index 02d0f7c36a..46983c080a 100644 --- a/src/cpp/include/openvino/genai/vlm_config.hpp +++ b/src/cpp/include/openvino/genai/vlm_config.hpp @@ -6,14 +6,15 @@ #include "openvino/genai/visibility.hpp" #include #include +#include "vlm_model_type.hpp" namespace ov::genai { /// @brief A Configuration class passed to VLMPipeline and used to /// change VLMPipeline's behavior. Corresponds to config.json. class OPENVINO_GENAI_EXPORTS VLMConfig { public: - /// @brief A string denoting model type. - std::string model_type = ""; + /// @brief A enum denoting model type. + VLMModelType model_type; /// @brief A size of a single embedding returned by a resampler. /// Used to initialize positional embeddings for resampler input. size_t hidden_size = 2304; diff --git a/src/cpp/include/openvino/genai/vlm_model_type.hpp b/src/cpp/include/openvino/genai/vlm_model_type.hpp new file mode 100644 index 0000000000..9e35d543a7 --- /dev/null +++ b/src/cpp/include/openvino/genai/vlm_model_type.hpp @@ -0,0 +1,31 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#include +#include + +#include "openvino/genai/visibility.hpp" +#include + +namespace ov::genai { + +enum class OPENVINO_GENAI_EXPORTS VLMModelType { + MINICPM, + LLAVA, +}; + +inline VLMModelType to_vlm_model_type(const std::string& value) { + static const std::unordered_map model_types_map = { + {"minicpm", VLMModelType::MINICPM}, + {"llava", VLMModelType::LLAVA} + }; + + auto it = model_types_map.find(value); + if (it != model_types_map.end()) { + return it->second; + } + OPENVINO_THROW("Unsupported '", value, "' VLM model type"); +} +} \ No newline at end of file diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index f513b433d3..02f7793297 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -358,15 +358,13 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig } } -VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std::string& device, const ov::AnyMap device_config, ov::Core core, std::string model_type) : +VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) : model_type(model_type) { - if (model_type == "minicpmv") { + if (model_type == VLMModelType::MINICPM) { m_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); - } else if (model_type == "llava") { + } else if (model_type == VLMModelType::LLAVA) { // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel m_vision_embeddings = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); - } else { - OPENVINO_THROW("Unsupported model type: " + model_type); } m_processor_config = ov::genai::utils::from_config_json_if_exists( model_dir, "preprocessor_config.json" @@ -374,9 +372,9 @@ VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const std:: } EncodedImage VisionEncoder::encode(const ov::Tensor& image, const ProcessorConfig& config) { - if (model_type == "minicpmv") { + if (model_type == VLMModelType::MINICPM) { return encode_minicpm(image, config); - } else if (model_type == "llava") { + } else if (model_type == VLMModelType::LLAVA) { return encode_llava(image, config); } } diff --git a/src/cpp/src/vlm_config.cpp b/src/cpp/src/vlm_config.cpp index a13a0da702..8d7585f2bb 100644 --- a/src/cpp/src/vlm_config.cpp +++ b/src/cpp/src/vlm_config.cpp @@ -10,7 +10,7 @@ ov::genai::VLMConfig::VLMConfig(const std::filesystem::path& json_path) { OPENVINO_ASSERT(stream.is_open(), "Failed to open '" + json_path.string() + "' with processor config"); nlohmann::json parsed = nlohmann::json::parse(stream); using ov::genai::utils::read_json_param; - read_json_param(parsed, "model_type", model_type); // TODO Consider checking supported model type here instead of VisionEncoder constructor + model_type = to_vlm_model_type(parsed.at("model_type")); read_json_param(parsed, "hidden_size", hidden_size); read_json_param(parsed, "scale_emb", scale_emb); read_json_param(parsed, "query_num", query_num); diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp index 4b1d13e984..8160bd4e6a 100644 --- a/src/cpp/src/vlm_pipeline.cpp +++ b/src/cpp/src/vlm_pipeline.cpp @@ -360,9 +360,9 @@ VLMPipeline::VLMPipeline( ) }, m_tokenizer{tokenizer}, - m_vision_encoder(model_dir, device, device_config, core, m_vlm_config.model_type), + m_vision_encoder(model_dir, m_vlm_config.model_type, device, device_config, core), m_is_chat_conversation{false} { - if (m_vlm_config.model_type == "minicpmv") { + if (m_vlm_config.model_type == VLMModelType::MINICPM) { m_resampler = core.compile_model( model_dir / "resampler.xml", device, device_config ).create_infer_request(); @@ -376,7 +376,7 @@ VLMPipeline::VLMPipeline( ).create_infer_request(); m_pos_embed_cache = get_2d_sincos_pos_embed(m_vlm_config.hidden_size, {70, 70}); - } else if (m_vlm_config.model_type == "llava") { + } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { m_language = core.compile_model( model_dir / "openvino_language_model.xml", device, device_config ).create_infer_request(); @@ -399,9 +399,9 @@ DecodedResults VLMPipeline::generate( const StreamerVariant& streamer ) { ov::Tensor inputs_embeds; - if (m_vlm_config.model_type == "minicpmv") { + if (m_vlm_config.model_type == VLMModelType::MINICPM) { inputs_embeds = get_inputs_embeds_minicpm(prompt, rgbs); - } else if (m_vlm_config.model_type == "llava") { + } else if (m_vlm_config.model_type == VLMModelType::LLAVA) { inputs_embeds = get_inputs_embeds_llava(prompt, rgbs); } From 8912b56d529175fd95b2ab2e7727356eb70237c2 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Wed, 9 Oct 2024 16:03:06 +0400 Subject: [PATCH 09/16] Fix typo in minicpm model type --- src/cpp/include/openvino/genai/vlm_model_type.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpp/include/openvino/genai/vlm_model_type.hpp b/src/cpp/include/openvino/genai/vlm_model_type.hpp index 9e35d543a7..0f811a116a 100644 --- a/src/cpp/include/openvino/genai/vlm_model_type.hpp +++ b/src/cpp/include/openvino/genai/vlm_model_type.hpp @@ -18,7 +18,7 @@ enum class OPENVINO_GENAI_EXPORTS VLMModelType { inline VLMModelType to_vlm_model_type(const std::string& value) { static const std::unordered_map model_types_map = { - {"minicpm", VLMModelType::MINICPM}, + {"minicpmv", VLMModelType::MINICPM}, {"llava", VLMModelType::LLAVA} }; From 729b06386b598105ab0c30efe040eaa66b5151e3 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 13:58:33 +0400 Subject: [PATCH 10/16] Add llava to supported models --- src/docs/SUPPORTED_MODELS.md | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index 1232a081dd..bc57e2863f 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -167,14 +167,23 @@ The pipeline can work with other similar topologies produced by `optimum-intel` Example HuggingFace Models - MiniCPM-V-2_6 MiniCPMV + MiniCPM-V-2_6 + + LLaVA + LLaVA-v1.5 + + + + From 18b49c72f3d4a6cbcce27036c07ed2ed12db1c57 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 14:53:42 +0400 Subject: [PATCH 11/16] Switch to optimum-intel from git in requirements --- README.md | 3 +-- samples/requirements.txt | 2 +- tests/python_tests/requirements.txt | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9a4d73802b..163768b18e 100644 --- a/README.md +++ b/README.md @@ -40,10 +40,9 @@ Continuous batching functionality is used within OpenVINO Model Server (OVMS) to # Install optimum-intel to be able to download, convert and optimize LLMs from Hugging Face # Optimum is not required to run models, only to convert and compress - pip install optimum[openvino] + pip install optimum-intel@git+https://github.com/huggingface/optimum-intel.git # (Optional) Install (TBD) to be able to download models from Model Scope - #pip install optimum[openvino] ``` ## Performing text generation diff --git a/samples/requirements.txt b/samples/requirements.txt index 4821d6dbef..a61fb6d68f 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum[openvino]==1.22.0 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diffusers==0.30.3 diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index 8c49f7c1e6..eab7f0f4c3 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,5 +1,5 @@ --extra-index-url https://download.pytorch.org/whl/cpu -optimum[openvino]==1.22.0 +optimum-intel @ git+https://github.com/huggingface/optimum-intel.git onnx==1.16.1 pytest llm_bench/python/who_what_benchmark From 4592cd6623ec28760bc27a318749bf25c1a2b282 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 15:00:42 +0400 Subject: [PATCH 12/16] Remove redundant optimum install --- .github/workflows/causal_lm_cpp.yml | 1 - 1 file changed, 1 deletion(-) diff --git a/.github/workflows/causal_lm_cpp.yml b/.github/workflows/causal_lm_cpp.yml index 23437b6f67..b8fbe397d2 100644 --- a/.github/workflows/causal_lm_cpp.yml +++ b/.github/workflows/causal_lm_cpp.yml @@ -715,7 +715,6 @@ jobs: source ./ov/setupvars.sh python -m pip install ./thirdparty/openvino_tokenizers/[transformers] --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly python -m pip install --upgrade-strategy eager -r ./samples/requirements.txt --pre --extra-index-url https://storage.openvinotoolkit.org/simple/wheels/nightly - python -m pip install git+https://github.com/huggingface/optimum-intel.git optimum-cli export openvino --model llava-hf/llava-1.5-7b-hf ./llava_1_5_7b_ov/ wget https://llava-vl.github.io/static/images/monalisa.jpg - name: Run visual_language_chat sample - LLaVa 1.5 From 8f304288a24c79c02fefc2292c55a7eeb702029a Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 15:34:04 +0400 Subject: [PATCH 13/16] Reorder supported vlm models --- src/docs/SUPPORTED_MODELS.md | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/docs/SUPPORTED_MODELS.md b/src/docs/SUPPORTED_MODELS.md index bc57e2863f..fb6df36950 100644 --- a/src/docs/SUPPORTED_MODELS.md +++ b/src/docs/SUPPORTED_MODELS.md @@ -167,20 +167,20 @@ The pipeline can work with other similar topologies produced by `optimum-intel` Example HuggingFace Models - MiniCPMV - MiniCPM-V-2_6 + LLaVA + LLaVA-v1.5 - LLaVA - LLaVA-v1.5 + MiniCPMV + MiniCPM-V-2_6 From 04a0014834dbdab131c5af7611058a8db9fc1f37 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 15:44:11 +0400 Subject: [PATCH 14/16] Reuse m_vision_encoder --- src/cpp/include/openvino/genai/vision_encoder.hpp | 7 ++----- src/cpp/src/vision_encoder.cpp | 12 ++++++------ 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/cpp/include/openvino/genai/vision_encoder.hpp b/src/cpp/include/openvino/genai/vision_encoder.hpp index 5a5d0c37b0..902557d316 100644 --- a/src/cpp/include/openvino/genai/vision_encoder.hpp +++ b/src/cpp/include/openvino/genai/vision_encoder.hpp @@ -45,20 +45,17 @@ class OPENVINO_GENAI_EXPORTS VisionEncoder { /// @brief A enum denoting model type. VLMModelType model_type; /// @brief A model for image encoding. - ov::InferRequest m_encoder; + ov::InferRequest m_vision_encoder; /// @brief A config to follow. ProcessorConfig m_processor_config; - // LLaVa specific members - ov::InferRequest m_vision_embeddings; - /// @brief Construct from an already compiled model and a config. /// @param encoder Compiled model. /// @param processor_config Initial config. explicit VisionEncoder( const ov::InferRequest& encoder, const ProcessorConfig& processor_config=ProcessorConfig{} - ) : m_encoder{encoder}, m_processor_config{processor_config} {} + ) : m_vision_encoder{encoder}, m_processor_config{processor_config} {} /// @brief Construct the encoder from model_dir. /// @param model_dir A folder containing openvino_embedding.xml and diff --git a/src/cpp/src/vision_encoder.cpp b/src/cpp/src/vision_encoder.cpp index 856db0b96e..6c926e0ed8 100644 --- a/src/cpp/src/vision_encoder.cpp +++ b/src/cpp/src/vision_encoder.cpp @@ -432,10 +432,10 @@ ov::Tensor preprocess_image_llava(const ov::Tensor& image, const ProcessorConfig VisionEncoder::VisionEncoder(const std::filesystem::path& model_dir, const VLMModelType model_type, const std::string& device, const ov::AnyMap device_config, ov::Core core) : model_type(model_type) { if (model_type == VLMModelType::MINICPM) { - m_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); + m_vision_encoder = core.compile_model(model_dir / "image_encoder.xml", device, device_config).create_infer_request(); } else if (model_type == VLMModelType::LLAVA) { // Vision embeddings model is merged with multi modal projector at model export stage by optimum-intel - m_vision_embeddings = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); + m_vision_encoder = core.compile_model(model_dir / "openvino_vision_embeddings_model.xml", device, device_config).create_infer_request(); } m_processor_config = ov::genai::utils::from_config_json_if_exists( model_dir, "preprocessor_config.json" @@ -462,16 +462,16 @@ EncodedImage VisionEncoder::encode_minicpm(const ov::Tensor& image, const Proces ctx_clip.image_size = m_processor_config.image_size; std::copy(config.norm_mean.begin(), config.norm_mean.end(), ctx_clip.image_mean); std::copy(config.norm_std.begin(), config.norm_std.end(), ctx_clip.image_std); - return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); + return llava_image_embed_make_with_bytes_slice(ctx_clip, image, m_vision_encoder, config.max_slice_nums, config.scale_resolution, config.patch_size, 0 == config.max_slice_nums); } EncodedImage VisionEncoder::encode_llava(const ov::Tensor& image, const ProcessorConfig& config) { ov::Tensor preprocessed_image = preprocess_image_llava(image, config); - m_vision_embeddings.set_tensor("pixel_values", preprocessed_image); - m_vision_embeddings.infer(); + m_vision_encoder.set_tensor("pixel_values", preprocessed_image); + m_vision_encoder.infer(); - ov::Tensor image_features = m_vision_embeddings.get_output_tensor(); + ov::Tensor image_features = m_vision_encoder.get_output_tensor(); ImageSize resized_source_size{config.crop_size_height / config.patch_size, config.crop_size_width / config.patch_size}; return {image_features, resized_source_size}; From d9feaead7a1c3e474417a8c5e54b551dd6bec9b0 Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 16:21:28 +0400 Subject: [PATCH 15/16] Fix samples requirements with lowering numpy for macos --- samples/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/samples/requirements.txt b/samples/requirements.txt index a61fb6d68f..df71d0cbb1 100644 --- a/samples/requirements.txt +++ b/samples/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +numpy<2.0.0; sys_platform == 'darwin' einops==0.8.0 # For Qwen transformers_stream_generator==0.0.5 # For Qwen diffusers==0.30.3 From 8f1e347da687f52e0ab4e5ed7c85035e8be86b5c Mon Sep 17 00:00:00 2001 From: yatarkan Date: Fri, 11 Oct 2024 17:17:56 +0400 Subject: [PATCH 16/16] Fix python tests requirements with numpy for macos --- tests/python_tests/requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/python_tests/requirements.txt b/tests/python_tests/requirements.txt index eab7f0f4c3..372d3ac950 100644 --- a/tests/python_tests/requirements.txt +++ b/tests/python_tests/requirements.txt @@ -1,5 +1,6 @@ --extra-index-url https://download.pytorch.org/whl/cpu optimum-intel @ git+https://github.com/huggingface/optimum-intel.git +numpy<2.0.0; sys_platform == 'darwin' onnx==1.16.1 pytest llm_bench/python/who_what_benchmark