diff --git a/samples/cpp/visual_language_chat/CMakeLists.txt b/samples/cpp/visual_language_chat/CMakeLists.txt
index 9a1b21632f..bcf171f3a8 100644
--- a/samples/cpp/visual_language_chat/CMakeLists.txt
+++ b/samples/cpp/visual_language_chat/CMakeLists.txt
@@ -8,6 +8,11 @@ find_package(OpenVINOGenAI REQUIRED
     NO_CMAKE_FIND_ROOT_PATH
 )
 
+FetchContent_Declare(cxxopts
+    URL https://github.com/jarro2783/cxxopts/archive/refs/tags/v3.1.1.tar.gz
+    URL_HASH SHA256=523175f792eb0ff04f9e653c90746c12655f10cb70f1d5e6d6d9491420298a08)
+FetchContent_MakeAvailable(cxxopts)
+
 file(DOWNLOAD
     https://raw.githubusercontent.com/nothings/stb/f75e8d1cad7d90d72ef7a4661f1b994ef78b4e31/stb_image.h
     ${CMAKE_BINARY_DIR}/stb_image.h
@@ -15,7 +20,7 @@ file(DOWNLOAD
 
 add_executable(visual_language_chat visual_language_chat.cpp load_image.cpp)
 target_include_directories(visual_language_chat PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}" "${CMAKE_BINARY_DIR}")
-target_link_libraries(visual_language_chat PRIVATE openvino::genai)
+target_link_libraries(visual_language_chat PRIVATE openvino::genai cxxopts::cxxopts)
 set_target_properties(visual_language_chat PROPERTIES
     COMPILE_PDB_NAME visual_language_chat
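The sample below switches from positional arguments to cxxopts, fetched above via FetchContent. For reference, a minimal sketch of the cxxopts 3.x parsing pattern it relies on; the program name, option set, and defaults here are illustrative and not part of the patch:

```cpp
// Minimal cxxopts 3.x usage sketch (names and defaults are illustrative).
#include <cxxopts.hpp>
#include <cstdlib>
#include <iostream>
#include <string>

int main(int argc, char* argv[]) {
    cxxopts::Options options("demo", "cxxopts parsing sketch");
    options.add_options()
        ("m,model", "Path to the model directory", cxxopts::value<std::string>()->default_value(""))
        ("h,help", "Print usage");

    cxxopts::ParseResult result;
    try {
        result = options.parse(argc, argv);
    } catch (const cxxopts::exceptions::exception& e) {
        // Unknown option, missing value, etc.
        std::cerr << e.what() << "\n" << options.help() << std::endl;
        return EXIT_FAILURE;
    }
    if (result.count("help")) {
        std::cout << options.help() << std::endl;
        return EXIT_SUCCESS;
    }
    std::cout << "model: " << result["model"].as<std::string>() << "\n";
    return EXIT_SUCCESS;
}
```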
diff --git a/samples/cpp/visual_language_chat/visual_language_chat.cpp b/samples/cpp/visual_language_chat/visual_language_chat.cpp
index b1522903ef..4486376f48 100644
--- a/samples/cpp/visual_language_chat/visual_language_chat.cpp
+++ b/samples/cpp/visual_language_chat/visual_language_chat.cpp
@@ -5,23 +5,84 @@
 #include "load_image.hpp"
 #include <openvino/genai/vlm_pipeline.hpp>
+#include <filesystem>
+#include <string>
+namespace fs = std::filesystem;
+
+#include <cxxopts.hpp>
+
 bool print_subword(std::string&& subword) {
     return !(std::cout << subword << std::flush);
 }
 
 int main(int argc, char* argv[]) try {
-    if (3 != argc) {
-        throw std::runtime_error(std::string{"Usage "} + argv[0] + " <MODEL_DIR> <IMAGE_FILE>");
+
+    cxxopts::Options options("visual_language_chat", "Help command");
+
+    options.add_options()
+    ("m,model", "Path to model and tokenizers base directory", cxxopts::value<std::string>()->default_value(""))
+    ("i,inputs", "Path to image or to directory with images", cxxopts::value<std::string>()->default_value(""))
+    ("d,device", "Target device to run the model", cxxopts::value<std::string>()->default_value("CPU"))
+    ("s,sampling", "Sampling method: [greedy|multinomial|beam_search]. Optional, 'greedy' by default.", cxxopts::value<std::string>()->default_value("greedy"))
+    ("h,help", "Print usage");
+
+    cxxopts::ParseResult result;
+    try {
+        result = options.parse(argc, argv);
+    } catch (const cxxopts::exceptions::exception& e) {
+        std::cout << e.what() << "\n\n";
+        std::cout << options.help() << std::endl;
+        return EXIT_FAILURE;
+    }
+
+    if (result.count("help")) {
+        std::cout << options.help() << std::endl;
+        return EXIT_SUCCESS;
     }
-    ov::Tensor image = utils::load_image(argv[2]);
-    std::string device = "CPU";  // GPU can be used as well
+
+    const std::string model_path = result["model"].as<std::string>();
+    const std::string device = result["device"].as<std::string>();
+    const std::string input_path = result["inputs"].as<std::string>();
+    const std::string sampling_method = result["sampling"].as<std::string>();
+
+    ov::AnyMap properties;
+    if (sampling_method == "greedy") {
+        properties.insert(ov::genai::generation_config(ov::genai::greedy()));
+        properties.insert(ov::genai::streamer(print_subword));
+    } else if (sampling_method == "beam_search") {
+        properties.insert(ov::genai::generation_config(ov::genai::beam_search()));
+    } else if (sampling_method == "multinomial") {
+        properties.insert(ov::genai::generation_config(ov::genai::multinomial()));
+        properties.insert(ov::genai::streamer(print_subword));
+    } else {
+        throw std::runtime_error("Sampling method must be one of [greedy|multinomial|beam_search], or empty, in which case greedy decoding is used.");
+    }
+
+    std::vector<ov::Tensor> images;
+    if (!input_path.empty() && fs::exists(input_path)) {
+        if (fs::is_directory(input_path)) {
+            for (const auto& dir_entry : fs::directory_iterator(input_path)) {
+                ov::Tensor image = utils::load_image(dir_entry.path());
+                images.push_back(std::move(image));
+            }
+        } else if (fs::is_regular_file(input_path)) {
+            ov::Tensor image = utils::load_image(input_path);
+            images.push_back(std::move(image));
+        }
+    }
+
+    if (images.empty())
+        throw std::runtime_error("No images found at path " + input_path);
+    else
+        properties.insert(images.size() == 1 ? ov::genai::image(images.at(0)) : ov::genai::images(images));
+
     ov::AnyMap enable_compile_cache;
     if ("GPU" == device) {
         // Cache compiled models on disk for GPU to save time on the
         // next run. It's not beneficial for CPU.
         enable_compile_cache.insert({ov::cache_dir("vlm_cache")});
     }
-    ov::genai::VLMPipeline pipe(argv[1], device, enable_compile_cache);
+    ov::genai::VLMPipeline pipe(model_path, device, enable_compile_cache);
 
     std::string prompt;
 
     pipe.start_chat();
@@ -29,22 +90,17 @@ int main(int argc, char* argv[]) try {
     if (!std::getline(std::cin, prompt)) {
         throw std::runtime_error("std::cin failed");
     }
-    pipe.generate(
-        prompt,
-        // ov::genai::image(std::move(image)),
-        ov::genai::generation_config(ov::genai::beam_search()),
-        // ov::genai::generation_config(ov::genai::greedy()),
-        // ov::genai::generation_config(ov::genai::multinomial()),
-        ov::genai::streamer(print_subword)
-    );
+    auto results = pipe.generate(prompt, properties);
+    if (sampling_method == "beam_search") {
+        std::cout << results.texts.at(0) << std::endl;
+    }
     std::cout << "\n----------\n"
         "question:\n";
     while (std::getline(std::cin, prompt)) {
-        pipe.generate(prompt,
-                      ov::genai::generation_config(ov::genai::beam_search()),
-                      // ov::genai::generation_config(ov::genai::greedy()),
-                      // ov::genai::generation_config(ov::genai::multinomial()),
-                      ov::genai::streamer(print_subword));
+        results = pipe.generate(prompt, properties);
+        if (sampling_method == "beam_search") {
+            std::cout << results.texts.at(0) << std::endl;
+        }
         std::cout << "\n----------\n"
             "question:\n";
     }
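The sample above loads every entry of the inputs directory as an image. A stand-alone sketch of that std::filesystem walk which additionally filters by extension; the is_image_file() helper and its extension list are illustrative additions, not something the sample does:

```cpp
// Sketch: collecting image file paths from a file or directory path.
#include <filesystem>
#include <string>
#include <vector>

namespace fs = std::filesystem;

static bool is_image_file(const fs::path& p) {
    const std::string ext = p.extension().string();
    return ext == ".png" || ext == ".jpg" || ext == ".jpeg" || ext == ".bmp";
}

std::vector<fs::path> collect_images(const fs::path& input_path) {
    std::vector<fs::path> files;
    if (fs::is_directory(input_path)) {
        // Non-recursive walk: only direct children of the directory.
        for (const auto& entry : fs::directory_iterator(input_path)) {
            if (entry.is_regular_file() && is_image_file(entry.path())) {
                files.push_back(entry.path());
            }
        }
    } else if (fs::is_regular_file(input_path)) {
        files.push_back(input_path);
    }
    return files;
}
```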
diff --git a/src/cpp/src/sampler.cpp b/src/cpp/src/sampler.cpp
index aa3e2b1d5a..7fa36a9b04 100644
--- a/src/cpp/src/sampler.cpp
+++ b/src/cpp/src/sampler.cpp
@@ -577,13 +577,13 @@ std::vector<int64_t> Sampler::_try_finish_generation(SequenceGroup::Ptr & sequence_group) {
 }
 
-std::vector<int32_t> Sampler::get_beam_idxs(uint64_t request_id) {
-    std::vector<int32_t> beams;
-    if (m_beam_search_info.find(request_id) != m_beam_search_info.end()) {
-        GroupBeamSearcher beam_searcher = m_beam_search_info.at(request_id);
-        std::vector<int32_t> beams = beam_searcher.get_beam_idxs();
+std::vector<int32_t> Sampler::get_beam_idxs(SequenceGroup::CPtr request) {
+    uint64_t request_id = request->get_request_id();
+    auto beam_searcher = m_beam_search_info.find(request_id);
+    if (beam_searcher == m_beam_search_info.end()) {
+        return std::vector<int32_t>(request->num_running_seqs(), 0);
     }
-    return beams;
+    return beam_searcher->second.get_beam_idxs();
 }
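The rewritten get_beam_idxs() above replaces the old silent empty-vector fallback with an explicit default: when no beam-search state exists for the request (greedy or multinomial decoding), every running sequence maps to beam 0. A stand-alone sketch of that lookup-or-default pattern, using simplified stand-in types rather than the real Sampler and SequenceGroup classes:

```cpp
// Sketch of the lookup-or-default behavior behind get_beam_idxs().
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

struct BeamState {
    std::vector<int32_t> beam_idxs;  // stand-in for GroupBeamSearcher state
};

std::vector<int32_t> beam_idxs_or_default(const std::map<uint64_t, BeamState>& info,
                                          uint64_t request_id,
                                          size_t num_running_seqs) {
    auto it = info.find(request_id);
    if (it == info.end()) {
        // No beam search ran for this request: every sequence stays on beam 0.
        return std::vector<int32_t>(num_running_seqs, 0);
    }
    return it->second.beam_idxs;
}
```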
diff --git a/src/cpp/src/sampler.hpp b/src/cpp/src/sampler.hpp
index f664bc16dc..e66fc4b700 100644
--- a/src/cpp/src/sampler.hpp
+++ b/src/cpp/src/sampler.hpp
@@ -61,7 +61,7 @@ class Sampler {
     SamplerOutput sample(std::vector<SequenceGroup::Ptr> & sequence_groups, ov::Tensor logits);
     void set_seed(size_t seed) { rng_engine.seed(seed); }
     void clear_beam_search_info(uint64_t request_id);
-    std::vector<int32_t> get_beam_idxs(uint64_t request_id);
+    std::vector<int32_t> get_beam_idxs(SequenceGroup::CPtr request);
 };
 
 class Sampler::GroupBeamSearcher {
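The indices returned by get_beam_idxs() are what ultimately feed the language model's "beam_idx" input in the pipeline changes that follow. A small sketch of packing them into an i32 tensor, using only the ov::Tensor calls that already appear in this patch; the fill_beam_idx() helper itself is illustrative:

```cpp
// Sketch: packing per-sequence beam indices into a "beam_idx" input tensor.
#include <openvino/openvino.hpp>
#include <cstdint>
#include <vector>

ov::Tensor fill_beam_idx(const std::vector<int32_t>& beam_idxs) {
    ov::Tensor beam_idx(ov::element::i32, {beam_idxs.size()});
    int32_t* data = beam_idx.data<int32_t>();
    for (size_t i = 0; i < beam_idxs.size(); ++i) {
        // Which source beam each running sequence continues from.
        data[i] = beam_idxs[i];
    }
    return beam_idx;
}
```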
diff --git a/src/cpp/src/vlm_pipeline.cpp b/src/cpp/src/vlm_pipeline.cpp
index 0d24ddffe7..be73306416 100644
--- a/src/cpp/src/vlm_pipeline.cpp
+++ b/src/cpp/src/vlm_pipeline.cpp
@@ -4,6 +4,7 @@
 #include "openvino/genai/vlm_pipeline.hpp"
 #include "openvino/genai/tokenizer.hpp"
 #include "vlm_sampling.hpp"
+#include "sampler.hpp"
 #include "clip.hpp"
 #include <openvino/openvino.hpp>
 #include "../src/text_callback_streamer.hpp"
@@ -11,10 +12,6 @@
 #include <optional>
 #include <random>
 
-#include "sampler.hpp"
-
-#include "debug_utils.hpp"
-
 using namespace ov::genai;
 
 namespace {
@@ -298,90 +295,6 @@ ov::Tensor resample(VLMPipeline& pipe, const ov::Tensor& encoded_image, const std::vector<HeightWidth>& target_sizes) {
     pipe.m_resampler.infer();
     return pipe.m_resampler.get_output_tensor();  // [N, query_num, new_hidden_size]
 }
-}
-
-
-void forward_embedings_and_lm(SequenceGroup::CPtr sequence_group, ov::InferRequest& embedding, ov::InferRequest& language, const VLMConfig m_vlm_config, const std::shared_ptr<Sampler> sampler) {
-    // compute aggregated values
-    size_t num_sequences = sequence_group->num_running_seqs();
-    size_t batch_size_in_sequences = num_sequences;
-    size_t total_num_tokens = sequence_group->get_num_scheduled_tokens() * num_sequences;
-    size_t total_num_blocks = sequence_group->get_num_blocks() * num_sequences;
-    size_t max_context_len_val = std::max(max_context_len_val, sequence_group->get_context_len());
-
-    ov::Tensor
-        input_ids(ov::element::i64, {total_num_tokens, 1}),
-        position_ids(ov::element::i64, {total_num_tokens, 1}),
-        beam_idx(ov::element::i32, { total_num_tokens });
-
-    // get raw pointers to copy to
-    int64_t
-        * input_ids_data = input_ids.data<int64_t>(),
-        * position_ids_data = position_ids.data<int64_t>();
-    int32_t
-        * beam_idx_data = beam_idx.data<int32_t>();
-
-    std::vector<Sequence::CPtr> running_sequences = sequence_group->get_running_sequences();
-    size_t num_running_sequences = running_sequences.size();
-    size_t num_scheduled_tokens = sequence_group->get_num_scheduled_tokens();
-    size_t group_position_id = sequence_group->get_num_processed_tokens();
-
-    // spec: In case of multiple input tokens for current sequence (prompt_len > 1),
-    // context_len corresponds to first token within subgroup of scheduled tokens
-    size_t group_context_len = group_position_id;
-
-    for (size_t seq_id = 0; seq_id < num_running_sequences; ++seq_id) {
-        Sequence::CPtr sequence = running_sequences[seq_id];
-
-        for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) {
-            // compute token for current sequence
-            input_ids_data[token_id] = position_id < sequence_group->get_prompt_len() ?
-                sequence_group->get_prompt_ids()[position_id] :
-                sequence->get_generated_ids()[position_id - sequence_group->get_prompt_len()];
-
-            position_ids_data[token_id] = position_id;
-        }
-
-        // apply strides to shift to a next sequence
-        input_ids_data += num_scheduled_tokens;
-        position_ids_data += num_scheduled_tokens;
-    }
-
-    embedding.set_input_tensor(input_ids);
-
-    embedding.infer();
-    const ov::Tensor& embed_prompt_tensor = embedding.get_output_tensor();
-    float* embed_data = embed_prompt_tensor.data<float>();
-    for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) {
-        embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb;
-    }
-
-    language.set_tensor("inputs_embeds", embed_prompt_tensor);
-
-    language.get_tensor("attention_mask").set_shape({ total_num_tokens, language.get_tensor("attention_mask").get_shape()[1] + 1 });
-    std::fill_n(language.get_tensor("attention_mask").data<int64_t>(), language.get_tensor("attention_mask").get_size(), 1);
-
-    language.set_tensor("position_ids", position_ids);
-    std::vector<int32_t> beam_idxs = sampler->get_beam_idxs(sequence_group->get_request_id());
-    if (beam_idxs.empty()) {
-        for (size_t i = 0; i < num_sequences; i++) {
-            beam_idx_data[i] = 0;
-        }
-    } else {
-        for (size_t i = 0; i < beam_idxs.size(); i++) {
-            beam_idx_data[i] = beam_idxs.at(i);
-        }
-    }
-    language.set_tensor("beam_idx", beam_idx);
-
-    // print_tensor("input_ids", input_ids);
-    // print_tensor("position_ids", position_ids);
-    // print_tensor("attention_mask", language.get_tensor("attention_mask"));
-    // print_tensor("beam_idx", beam_idx);
-
-    language.infer();
-}
-
 
 EncodedGenerationResult get_lm_encoded_results(
     ov::InferRequest& language,
@@ -389,7 +302,7 @@
     ov::Tensor inputs_embeds,
     const VLMConfig m_vlm_config,
     const std::shared_ptr<StreamerBase> streamer_ptr,
-    const std::shared_ptr<Sampler> sampler,
+    Sampler& sampler,
     std::vector<SequenceGroup::Ptr> requests
 ) {
     SequenceGroup::Ptr request = requests.back();
@@ -412,26 +325,75 @@
     int64_t sequence_len = language.get_tensor("logits").get_shape().at(1);
     request->schedule_tokens(sequence_len);
 
-    SamplerOutput sampler_output = sampler->sample(requests, language.get_tensor("logits"));
+    SamplerOutput sampler_output = sampler.sample(requests, language.get_tensor("logits"));
 
     language.get_tensor("inputs_embeds").set_shape({BATCH_SIZE, 1, m_vlm_config.hidden_size});
     language.get_tensor("position_ids").set_shape({ BATCH_SIZE, 1 });
-
     while (!request->has_finished()) {
         request->schedule_tokens(1);
+        size_t num_sequences = request->num_running_seqs();
+        size_t total_num_tokens = request->get_num_scheduled_tokens() * num_sequences;
+
+        ov::Tensor
+            input_ids(ov::element::i64, {total_num_tokens, 1}),
+            position_ids(ov::element::i64, {total_num_tokens, 1}),
+            beam_idx(ov::element::i32, { total_num_tokens });
+
+        int64_t
+            * input_ids_data = input_ids.data<int64_t>(),
+            * position_ids_data = position_ids.data<int64_t>();
+
+        size_t num_scheduled_tokens = request->get_num_scheduled_tokens();
+        size_t group_position_id = request->get_num_processed_tokens();
+        for (Sequence::Ptr& sequence : request->get_running_sequences()) {
+            for (size_t token_id = 0, position_id = group_position_id; token_id < num_scheduled_tokens; ++token_id, ++position_id) {
+                // compute token for current sequence
+                input_ids_data[token_id] = position_id < request->get_prompt_len() ?
+                    request->get_prompt_ids()[position_id] :
+                    sequence->get_generated_ids()[position_id - request->get_prompt_len()];
+
+                position_ids_data[token_id] = position_id;
+            }
+            // apply strides to shift to a next sequence
+            input_ids_data += num_scheduled_tokens;
+            position_ids_data += num_scheduled_tokens;
+        }
+
+        embedding.set_input_tensor(input_ids);
+
+        embedding.infer();
+        const ov::Tensor& embed_prompt_tensor = embedding.get_output_tensor();
+        float* embed_data = embed_prompt_tensor.data<float>();
+        for (auto idx = 0; idx < embed_prompt_tensor.get_size(); idx++) {
+            embed_data[idx] = embed_data[idx] * m_vlm_config.scale_emb;
+        }
 
-        forward_embedings_and_lm(request, embedding, language, m_vlm_config, sampler);
+        language.set_tensor("inputs_embeds", embed_prompt_tensor);
+
+        language.get_tensor("attention_mask").set_shape({ total_num_tokens, language.get_tensor("attention_mask").get_shape()[1] + 1 });
+        std::fill_n(language.get_tensor("attention_mask").data<int64_t>(), language.get_tensor("attention_mask").get_size(), 1);
+
+        language.set_tensor("position_ids", position_ids);
+
+        std::vector<int32_t> beam_idxs = sampler.get_beam_idxs(request);
+        int32_t *beam_idx_data = beam_idx.data<int32_t>();
+        for (size_t i = 0; i < beam_idxs.size(); i++) {
+            beam_idx_data[i] = beam_idxs.at(i);
+        }
+        language.set_tensor("beam_idx", beam_idx);
+
+        language.infer();
 
         if (streamer_ptr) {
-            // first sequences
+            // first sequence
            int64_t out_token = request.get()->operator[](0)->get_generated_ids().back();
             if (streamer_ptr->put(out_token)) {
                 break;
             }
         }
 
-        sampler_output = sampler->sample(requests, language.get_tensor("logits"));
+        sampler_output = sampler.sample(requests, language.get_tensor("logits"));
     }
 
     if (streamer_ptr) {
@@ -455,6 +417,7 @@ EncodedGenerationResult get_lm_encoded_results(
 
     return result;
 }
+} // anonymous
 
 class ov::genai::VLMPipeline::VLMPipelineImpl {
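The loop above fills input_ids by taking tokens from the prompt while the position is still inside it, and from the sequence's generated tail afterwards. A stand-alone sketch of that indexing rule with plain vectors; the helper name and types are illustrative, not part of the patch:

```cpp
// Sketch of the position -> token rule used when filling input_ids.
#include <cstddef>
#include <cstdint>
#include <vector>

int64_t token_at(size_t position_id,
                 const std::vector<int64_t>& prompt_ids,
                 const std::vector<int64_t>& generated_ids) {
    // Positions inside the prompt come from prompt_ids;
    // later positions index into the generated tokens.
    return position_id < prompt_ids.size()
        ? prompt_ids[position_id]
        : generated_ids[position_id - prompt_ids.size()];
}
```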
@@ -608,11 +571,11 @@ DecodedResults VLMPipeline::generate(
             }
         }
     }
-
-    std::shared_ptr<Sampler> sampler = std::make_shared<Sampler>(m_tokenizer);
+
+    Sampler sampler = Sampler(m_tokenizer);
 
     std::vector<SequenceGroup::Ptr> requests;
-    auto attention_size = m_language.get_tensor("attention_mask").get_size();  // request_id, input_ids, generation_config, block_size, enable_prefix_caching
+    // request_id, input_ids, generation_config, block_size, enable_prefix_caching
     // now we have one prompt as input, so we need one request
     SequenceGroup::Ptr sequence_group = std::make_shared<SequenceGroup>(0, encoded_input, generation_config, 1, false);
     sequence_group->set_sequence_group_ptr(sequence_group);
@@ -632,6 +595,10 @@
         },
     }, streamer);
 
+    if ((!(generation_config.is_greedy_decoding() || generation_config.is_multinomial())) && streamer_ptr) {
+        OPENVINO_THROW("Currently streaming is possible only for greedy or multinomial decoding");
+    }
+
     EncodedGenerationResult encoded_result = get_lm_encoded_results(m_language, m_embedding, inputs_embeds, m_vlm_config, streamer_ptr, sampler, requests);
 
     DecodedResults decoded;
@@ -679,10 +646,6 @@ DecodedResults VLMPipeline::generate(
     if (config.eos_token_id == -1)
         config.set_eos_token_id(m_tokenizer.get_eos_token_id());
 
-    // if (is_chat_conversation && config.num_return_sequences > 1) {
-    //     config.num_return_sequences = 1;
-    // }
-
     return generate(
         prompt,
         rgbs,
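For completeness, a hedged sketch of how a caller might drive the reworked pipeline with beam search, mirroring the sample earlier in this patch. The helper function is illustrative, the image tensor is assumed to be loaded by the caller, and streaming is deliberately omitted because the check added above only permits it for greedy or multinomial decoding:

```cpp
// Sketch: one beam-search chat turn against the VLM pipeline.
#include <openvino/genai/vlm_pipeline.hpp>
#include <openvino/openvino.hpp>
#include <iostream>
#include <string>

void beam_search_chat_turn(ov::genai::VLMPipeline& pipe,
                           const ov::Tensor& image,
                           const std::string& prompt) {
    ov::AnyMap properties;
    properties.insert(ov::genai::generation_config(ov::genai::beam_search()));
    properties.insert(ov::genai::image(image));

    auto results = pipe.generate(prompt, properties);
    // Beam search is not streamed, so print the top candidate explicitly.
    std::cout << results.texts.at(0) << std::endl;
}
```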