diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp index 0fa1207bd9935a..a274c8d1c1cae6 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/config.hpp @@ -74,6 +74,11 @@ struct OptionParser<int32_t> final { static int32_t parse(std::string_view val); }; +template <> +struct OptionParser<uint32_t> final { + static uint32_t parse(std::string_view val); +}; + template <> struct OptionParser<int64_t> final { static int64_t parse(std::string_view val); @@ -167,6 +172,25 @@ struct OptionPrinter final { } }; +template <typename K, typename V> +struct OptionPrinter<std::map<K, V>> final { + static std::string toString(const std::map<K, V>& val) { + std::stringstream ss; + std::size_t counter = 0; + std::size_t size = val.size(); + for (auto& [key, value] : val) { + std::string key_str = OptionPrinter<K>::toString(key); + std::string value_str = OptionPrinter<V>::toString(value); + ss << key_str << ":" << value_str; + if (counter < size - 1) { + ss << ","; + } + ++counter; + } + return ss.str(); + } +}; + // NB: boolean config option has values YES for true, NO for false template <> struct OptionPrinter<bool> final { diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp index 6d865ad5e4edf3..927b234df8ba15 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/config/npuw.hpp @@ -17,6 +17,7 @@ namespace intel_npu { // void registerNPUWOptions(OptionsDesc& desc); +void registerNPUWLLMOptions(OptionsDesc& desc); #define DEFINE_OPT(Name, Type, DefaultValue, PropertyKey, Mode) \ struct Name final : OptionBase<Name, Type> { \ @@ -66,4 +67,110 @@ DEFINE_OPT(NPUW_DUMP_SUBS, std::string, "", npuw::dump::subgraphs, CompileTime); 
DEFINE_OPT(NPUW_DUMP_SUBS_ON_FAIL, std::string, "", npuw::dump::subgraphs_on_fail, CompileTime);
 DEFINE_OPT(NPUW_DUMP_IO, std::string, "", npuw::dump::inputs_outputs, RunTime);
 DEFINE_OPT(NPUW_DUMP_IO_ITERS, bool, false, npuw::dump::io_iters, RunTime);
+DEFINE_OPT(NPUW_LLM, bool, false, npuw::llm::enabled, CompileTime);
+DEFINE_OPT(NPUW_LLM_MAX_PROMPT_LEN, uint32_t, 1024, npuw::llm::max_prompt_len, CompileTime);
+DEFINE_OPT(NPUW_LLM_MIN_RESPONSE_LEN, uint32_t, 128, npuw::llm::min_response_len, CompileTime);
+
+namespace npuw {
+namespace llm {
+// Description of the LLM, carried by the NPUW_LLM_MODEL_DESC option.
+struct ModelDesc {
+    std::string type;
+    std::string name_or_path;
+    // Initialized so that a default-constructed description never holds an indeterminate value.
+    int num_key_value_heads = 0;
+};
+enum class GenerateHint { FAST_COMPILE, BEST_PERF };
+}  // namespace llm
+}  // namespace npuw
+
+// Option wrapper that exposes ModelDesc under the npuw::llm::model_desc property key.
+struct NPUW_LLM_MODEL_DESC final : OptionBase<NPUW_LLM_MODEL_DESC, ::intel_npu::npuw::llm::ModelDesc> {
+    static std::string_view key() {
+        return ov::intel_npu::npuw::llm::model_desc.name();
+    }
+
+    static constexpr std::string_view getTypeName() {
+        return "::intel_npu::npuw::llm::ModelDesc";
+    }
+
+    static ::intel_npu::npuw::llm::ModelDesc defaultValue() {
+        return {};
+    }
+
+    // Builds a ModelDesc from the key/value pairs returned by the generic map parser.
+    // A key that is absent leaves its field value-initialized; an absent or empty
+    // "num_key_value_heads" is no longer fed to std::stoi (which would throw std::invalid_argument
+    // for an empty string). A present but non-numeric value still throws from std::stoi.
+    static ::intel_npu::npuw::llm::ModelDesc parse(std::string_view val) {
+        ::intel_npu::npuw::llm::ModelDesc res{};
+        const std::map<std::string, std::string> res_map =
+            OptionParser<std::map<std::string, std::string>>::parse(val);
+        // Read-only lookup: unlike operator[], it does not insert missing keys into the map.
+        const auto field = [&res_map](const char* name) {
+            const auto it = res_map.find(name);
+            return it != res_map.end() ? it->second : std::string{};
+        };
+        res.type = field("type");
+        res.name_or_path = field("name_or_path");
+        if (const std::string heads = field("num_key_value_heads"); !heads.empty()) {
+            res.num_key_value_heads = std::stoi(heads);
+        }
+        return res;
+    }
+
+    // Serializes the description as a string-to-string map through OptionPrinter.
+    static std::string toString(const ::intel_npu::npuw::llm::ModelDesc& val) {
+        std::map<std::string, std::string> res_map;
+        res_map["type"] = val.type;
+        res_map["name_or_path"] = val.name_or_path;
+        res_map["num_key_value_heads"] = std::to_string(val.num_key_value_heads);
+        return OptionPrinter<std::map<std::string, std::string>>::toString(res_map);
+    }
+
+    static OptionMode mode() {
+        return OptionMode::CompileTime;
+    }
+
+    static bool isPublic() { 
return true; + } +}; + +struct NPUW_LLM_GENERATE_HINT final : OptionBase<NPUW_LLM_GENERATE_HINT, ::intel_npu::npuw::llm::GenerateHint> { + static std::string_view key() { + return ov::intel_npu::npuw::llm::generate_hint.name(); + } + + static constexpr std::string_view getTypeName() { + return "::intel_npu::npuw::llm::GenerateHint"; + } + + static ::intel_npu::npuw::llm::GenerateHint defaultValue() { + return ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE; + } + + static ::intel_npu::npuw::llm::GenerateHint parse(std::string_view val) { + ::intel_npu::npuw::llm::GenerateHint res; + + if (val == "FAST_COMPILE") { + res = ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE; + } else if (val == "BEST_PERF") { + res = ::intel_npu::npuw::llm::GenerateHint::BEST_PERF; + } else { + OPENVINO_THROW("Unsupported \"GENERATE_HINT\" provided: ", + val, + ". Please select either \"FAST_COMPILE\" or \"BEST_PERF\"."); + } + return res; + } + + static std::string toString(const ::intel_npu::npuw::llm::GenerateHint& val) { + std::string res; + switch (val) { + case ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE: + res = "FAST_COMPILE"; + break; + case ::intel_npu::npuw::llm::GenerateHint::BEST_PERF: + res = "BEST_PERF"; + break; + default: + OPENVINO_THROW("Can't convert provided \"GENERATE_HINT\" : ", int(val), " to string."); + } + return res; + } + + static OptionMode mode() { + return OptionMode::CompileTime; + } + + static bool isPublic() { + return true; + } +}; } // namespace intel_npu diff --git a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp index af4a17988f451e..a416ca51233893 100644 --- a/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp +++ b/src/plugins/intel_npu/src/al/include/intel_npu/npuw_private_properties.hpp @@ -378,6 +378,51 @@ static constexpr ov::Property<std::string> inputs_outputs{"NPUW_DUMP_IO"}; static constexpr 
ov::Property<std::string> io_iters{"NPUW_DUMP_IO_ITERS"}; } // namespace dump +namespace llm { +/** + * @brief + * Type: bool. + * Tell NPUW that you want to pass dynamic stateful LLM model. + * Default value: false. + */ +static constexpr ov::Property<bool> enabled{"NPUW_LLM"}; + +/** + * @brief + * Type: std::map<std::string, std::string>. + * Tell NPUW about your LLM model. Use following structure for that: + * "type:<type>,name_or_path:<name_or_path>,num_key_value_heads:<number>". + * Default value: empty structure defined above. + */ +static constexpr ov::Property<std::string> model_desc{"NPUW_LLM_MODEL_DESC"}; + +/** + * @brief + * Type: uint32_t. + * Tell NPUW your desirable max prompt length. + * Default value: 1024. + */ +static constexpr ov::Property<uint32_t> max_prompt_len{"NPUW_LLM_MAX_PROMPT_LEN"}; + +/** + * @brief + * Type: uint32_t. + * Tell NPUW your desirable min response length. + * Default value: 128. + */ +static constexpr ov::Property<uint32_t> min_response_len{"NPUW_LLM_MIN_RESPONSE_LEN"}; + +/** + * @brief + * Type: std::string. + * Tell NPUW the preferrable hint for generation stage, that leads to usage of optimal configuration for it. + * Possible values: "FAST_COMPILE", "BEST_PERF". + * Default value: "FAST_COMPILE". + */ +static constexpr ov::Property<std::string> generate_hint{"NPUW_LLM_GENERATE_HINT"}; + +} // namespace llm + } // namespace npuw } // namespace intel_npu } // namespace ov diff --git a/src/plugins/intel_npu/src/al/src/config/config.cpp b/src/plugins/intel_npu/src/al/src/config/config.cpp index 9d4c600351afa6..a4e2b515b8e3f6 100644 --- a/src/plugins/intel_npu/src/al/src/config/config.cpp +++ b/src/plugins/intel_npu/src/al/src/config/config.cpp @@ -50,6 +50,14 @@ int32_t OptionParser<int32_t>::parse(std::string_view val) { } } +uint32_t OptionParser<uint32_t>::parse(std::string_view val) { + try { + return std::stoul(val.data()); + } catch (...) 
{ + OPENVINO_THROW("Value '%s' is not a valid UINT32 option", val.data()); + } +} + int64_t OptionParser<int64_t>::parse(std::string_view val) { try { return std::stoll(val.data()); diff --git a/src/plugins/intel_npu/src/al/src/config/npuw.cpp b/src/plugins/intel_npu/src/al/src/config/npuw.cpp index 0c7978845c690c..4ee9e392406452 100644 --- a/src/plugins/intel_npu/src/al/src/config/npuw.cpp +++ b/src/plugins/intel_npu/src/al/src/config/npuw.cpp @@ -54,3 +54,11 @@ void intel_npu::registerNPUWOptions(OptionsDesc& desc) { desc.add<NPUW_DUMP_IO_ITERS>(); #endif } + +void intel_npu::registerNPUWLLMOptions(OptionsDesc& desc) { + desc.add<NPUW_LLM>(); + desc.add<NPUW_LLM_MODEL_DESC>(); + desc.add<NPUW_LLM_MAX_PROMPT_LEN>(); + desc.add<NPUW_LLM_MIN_RESPONSE_LEN>(); + desc.add<NPUW_LLM_GENERATE_HINT>(); +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp index f9573cb78f21ec..aa02ca8681e80f 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.cpp @@ -28,6 +28,7 @@ #include "intel_npu/config/config.hpp" #include "intel_npu/config/npuw.hpp" #include "intel_npu/npuw_private_properties.hpp" +#include "llm_compiled_model.hpp" #include "openvino/runtime/device_id_parser.hpp" #include "openvino/runtime/internal_properties.hpp" #include "openvino/runtime/properties.hpp" @@ -85,10 +86,33 @@ ov::npuw::DeviceProperties get_properties_per_device(const std::shared_ptr<const } // namespace npuw } // namespace ov +std::shared_ptr<ov::npuw::ICompiledModel> ov::npuw::ICompiledModel::create( + const std::shared_ptr<ov::Model>& model, + const std::shared_ptr<const ov::IPlugin>& plugin, + const ov::AnyMap& properties) { + LOG_INFO("Choosing which NPUW CompiledModel to create"); + LOG_BLOCK(); + std::shared_ptr<ov::npuw::ICompiledModel> compiled_model; + auto use_llm_key = ov::intel_npu::npuw::llm::enabled.name(); + if 
(properties.count(use_llm_key) && properties.at(use_llm_key).as<bool>() == true) { + LOG_INFO("ov::npuw::LLMCompiledModel will be created."); + compiled_model = std::make_shared<ov::npuw::LLMCompiledModel>(model, plugin, properties); + } else { + LOG_INFO("ov::npuw::CompiledModel will be created."); + compiled_model = std::make_shared<ov::npuw::CompiledModel>(model, plugin, properties); + } + LOG_INFO("Done"); + return compiled_model; +} + +ov::npuw::ICompiledModel::ICompiledModel(const std::shared_ptr<ov::Model>& model, + const std::shared_ptr<const ov::IPlugin>& plugin) + : ov::ICompiledModel(model, plugin) {} + ov::npuw::CompiledModel::CompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin, const ov::AnyMap& properties) - : ov::ICompiledModel(model, plugin), + : ov::npuw::ICompiledModel(model, plugin), m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), m_cfg(m_options_desc), m_name(model->get_friendly_name()), diff --git a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp index 8ccb1f83349e47..0e728570eda8d5 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/compiled_model.hpp @@ -22,10 +22,16 @@ class Plugin; namespace ov { namespace npuw { +class ICompiledModel : public ov::ICompiledModel { +public: + static std::shared_ptr<ov::npuw::ICompiledModel> create(const std::shared_ptr<ov::Model>& model, + const std::shared_ptr<const ov::IPlugin>& plugin, + const ov::AnyMap& properties); + ICompiledModel(const std::shared_ptr<ov::Model>& model, const std::shared_ptr<const ov::IPlugin>& plugin); +}; class InferRequest; - -class CompiledModel : public ov::ICompiledModel { +class CompiledModel : public ov::npuw::ICompiledModel { using DevList = std::vector<std::string>; using GetPropertiesMap = std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const 
::intel_npu::Config&)>>>; diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp new file mode 100644 index 00000000000000..e18b098969eb79 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp @@ -0,0 +1,346 @@ +// Copyright (C) 2023-2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// +#include "llm_compiled_model.hpp" + +#include "llm_infer_request.hpp" +#include "logging.hpp" +#include "openvino/pass/stateful_to_stateless.hpp" +#include "openvino/runtime/iasync_infer_request.hpp" + +namespace { +uint32_t align_to(uint32_t value, uint32_t alignment) { + return (value + alignment - 1) & ~(alignment - 1); +} + +std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model) { + const auto kStartOutputKVCacheLayers = 1u; + for (std::size_t i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) { + auto kvout = model->output(i); + auto kvrslt = kvout.get_node(); + auto kvcat = kvrslt->inputs()[0].get_source_output().get_node(); + auto kvval = kvcat->inputs()[1].get_source_output(); + kvval.set_names({kvout.get_any_name()}); + kvrslt->inputs()[0].replace_source_output(kvval); + } + model->validate_nodes_and_infer_types(); + return model; +} + +std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) { + ov::preprocess::PrePostProcessor ppp(model); + + for (auto tensor : model->inputs()) { + if (tensor.get_any_name().find("past_key") != std::string::npos) { + ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16); + } + } + + for (auto tensor : model->outputs()) { + if (tensor.get_any_name().find("present") != std::string::npos) { + ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16); + } + } + + return ppp.build(); +} + +struct KVAxesPosition { + uint32_t batch; + uint32_t seq_len; +}; + +void 
reshape_to_static(std::shared_ptr<ov::Model> model, + const uint32_t input_size, + const uint32_t kvcache_size, + const KVAxesPosition& kv_axes_position) { + std::map<std::string, ov::PartialShape> new_shapes; + for (auto input : model->inputs()) { + const auto& input_name = input.get_any_name(); + ov::PartialShape new_shape; + if (input_name.find("input_ids") != std::string::npos) { + new_shape = ov::PartialShape({1, input_size}); + } else if (input_name.find("attention_mask") != std::string::npos) { + new_shape = ov::PartialShape({1, kvcache_size}); + } else if (input_name.find("position_ids") != std::string::npos) { + new_shape = ov::PartialShape({1, input_size}); + } else { + const auto& partial_shape = input.get_partial_shape(); + new_shape = partial_shape; + new_shape[kv_axes_position.batch] = 1; + new_shape[kv_axes_position.seq_len] = kvcache_size - input_size; + } + new_shapes.emplace(input_name, new_shape); + } + model->reshape(new_shapes); +} + +KVAxesPosition get_kv_axes(const std::string& model_type) { + KVAxesPosition axes; + if (model_type == "chatglm") { + axes.batch = 1u; + axes.seq_len = 0u; + } else if (model_type == "qwen") { + // Note, qwen2 does not fall into this category and conforms to default layout + axes.batch = 0u; + axes.seq_len = 1u; + } else { + axes.batch = 0u; + axes.seq_len = 2u; + } + return axes; +} + +bool is_cw_compressed(const std::shared_ptr<ov::Model>& model) { + std::vector<std::string> rt_info_path = {"nncf", "weight_compression", "group_size"}; + if (!model->has_rt_info(rt_info_path)) { + // NB: Model isn't compressed by NNCF - skip + return false; + } + auto group_size = model->get_rt_info<int>(rt_info_path); + if (group_size == -1) { + // NB: Enable DQ for CW quantized models + return true; + } + return false; +} + +struct NPUDesc { + std::string arch; + int64_t max_tiles; +}; + +std::optional<NPUDesc> extract_npu_descriptor(const std::shared_ptr<const ov::IPlugin>& plugin) { + const ov::Any arch = 
plugin->get_property(ov::device::architecture.name(), ov::AnyMap{}); + const ov::Any max_tiles = plugin->get_property(ov::intel_npu::max_tiles.name(), ov::AnyMap{}); + return std::make_optional(NPUDesc{arch.as<std::string>(), max_tiles.as<int64_t>()}); +} + +std::optional<ov::Any> pop_option(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + std::optional<ov::Any> found = std::make_optional(it->second); + config.erase(it); + return found; + } + return std::nullopt; +} + +template <typename T> +std::optional<T> get_option(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + return std::make_optional(it->second.as<T>()); + } + return std::nullopt; +} + +template <typename T> +T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) { + auto anyopt = pop_option(config, key); + if (anyopt.has_value()) { + return anyopt.value().as<T>(); + } + return default_value; +} + +ov::AnyMap get_baseline_common_config() { + ov::AnyMap config = { + {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}, + {"NPUW_DEVICES", "NPU"}, + {"NPU_USE_NPUW", "YES"}, + {"NPUW_FOLD", "YES"}, + {"NPUW_DCOFF_TYPE", "f16"}, + {"NPUW_DCOFF_SCALE", "YES"}, + {"NPUW_WEIGHTS_BANK", "shared"}, + {"NPUW_SLICE_OUT", "YES"}, + {"NPUW_FUNCALL_ASYNC", "YES"}}; + return config; +} + +ov::AnyMap get_default_common_config(const std::shared_ptr<ov::Model>& model) { + auto config = get_baseline_common_config(); + const char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0"); + if (npu_l0 && std::atoi(npu_l0) == 1) { + config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU"); + } else { + config.emplace("NPUW_FUNCALL_FOR_ALL", "YES"); + } + return config; +} + +ov::AnyMap get_default_prefill_config(const std::shared_ptr<ov::Model>& model, const std::optional<NPUDesc>& npudesc) { + auto config = 
get_default_common_config(model); + if (is_cw_compressed(model)) { + config.emplace("NPUW_DQ", "YES"); + } else { + config.emplace("NPUW_PMM", "NO"); + } + if (npudesc.has_value() && npudesc->arch == "4000" && npudesc->max_tiles != -1) { + config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles); + } + return config; +} + +ov::AnyMap get_default_generate_config(const std::shared_ptr<ov::Model>& model, + const std::optional<NPUDesc>& npudesc, + const ::intel_npu::npuw::llm::GenerateHint hint) { + auto config = get_default_common_config(model); + if (hint == ::intel_npu::npuw::llm::GenerateHint::BEST_PERF) { + config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); + } + // NB: Unconditionally set for generation model + config.emplace("NPUW_DQ", "YES"); + if (npudesc.has_value() && npudesc->arch == "4000") { + config.emplace("NPU_DPU_GROUPS", 4); + } + return config; +} + +void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { + for (const auto& [key, value] : rhs) { + // NB: Overwrite the value if key already exists + if (auto it = lhs.find(key); it != lhs.end()) { + it->second = value; + } else { + lhs.emplace(key, value); + } + } +} + +void drop_cache_dir(ov::AnyMap& config) { + if (config.count("NPU_USE_NPUW") != 0u) { + pop_option(config, "CACHE_DIR"); + } +} + +void split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) { + for (auto it = properties.begin(); it != properties.end(); ++it) { + if (it->first.find("NPUW_LLM") != it->first.npos) { + llm_properties.insert(*it); + } else { + other_properties.insert(*it); + } + } +} + +std::map<std::string, std::string> any_copy(const ov::AnyMap& params) { + std::map<std::string, std::string> result; + for (auto&& value : params) { + result.emplace(value.first, value.second.as<std::string>()); + } + return result; +} +} // namespace + +ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr<ov::Model>& model, + const std::shared_ptr<const ov::IPlugin>& 
plugin, + const ov::AnyMap& properties) + : ov::npuw::ICompiledModel(model, plugin), + m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), + m_cfg(m_options_desc) { + LOG_DEBUG("Creating LLMCompiledModel"); + LOG_BLOCK(); + + ::intel_npu::registerNPUWLLMOptions(*m_options_desc); + + std::map<std::string, ov::Any> npuw_llm_props; + std::map<std::string, ov::Any> other_props; + split_llm_properties(properties, npuw_llm_props, other_props); + m_cfg.update(any_copy(npuw_llm_props)); + + LOG_DEBUG("1. Creating kvcache model as clone of passed one."); + auto kvcache_model = model->clone(); + LOG_DEBUG("2. Transform kvcache model from stateful to stateless."); + ov::pass::StatefulToStateless().run_on_model(kvcache_model); + + LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one."); + auto prefill_model = kvcache_model->clone(); + prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); + LOG_DEBUG("4. Converting KV-cache in prefill model to FP16."); + prefill_model = cvt_kvcache_to_fp16(prefill_model); + + LOG_DEBUG("5. Optimize kvcache kvcache model to output key/values for new token."); + kvcache_model = redirect_new_kv_to_output(kvcache_model); + LOG_DEBUG("6. Converting KV-cache in kvcache model to FP16."); + kvcache_model = cvt_kvcache_to_fp16(kvcache_model); + + const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u); + const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); + const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>(); + KVAxesPosition axes = get_kv_axes(model_desc.type); + m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len}; + LOG_DEBUG("7. Make prefill model with static shapes"); + reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); + LOG_DEBUG("8. 
Make kvcache model with static shapes"); + reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes); + + auto npudesc = extract_npu_descriptor(plugin); + + ov::AnyMap properties_copy = other_props; + auto prefill_config = get_default_prefill_config(model, npudesc); + // NB: GENERATE_HINT is only applicable for default generate config! + const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>(); + LOG_DEBUG("9. Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint))); + auto generate_config = get_default_generate_config(model, npudesc, generate_hint); + merge_config_with(prefill_config, properties_copy); + merge_config_with(generate_config, properties_copy); + // FIXME: Drop CACHE_DIR option if NPUW is enabled + drop_cache_dir(prefill_config); + drop_cache_dir(generate_config); + + m_kvcache_compiled = std::make_shared<ov::npuw::CompiledModel>(kvcache_model, plugin, generate_config); + m_prefill_compiled = std::make_shared<ov::npuw::CompiledModel>(prefill_model, plugin, prefill_config); + + implement_properties(); + LOG_DEBUG("Done"); +} + +void ov::npuw::LLMCompiledModel::export_model(std::ostream& model) const { + OPENVINO_NOT_IMPLEMENTED; +} + +std::shared_ptr<const ov::Model> ov::npuw::LLMCompiledModel::get_runtime_model() const { + OPENVINO_NOT_IMPLEMENTED; +} + +void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) { + OPENVINO_NOT_IMPLEMENTED; +} + +ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const { + OPENVINO_SUPPRESS_DEPRECATED_START + auto&& configIterator = m_prop_to_opt.find(name); + if (configIterator != m_prop_to_opt.cend()) { + return std::get<1>(configIterator->second)(m_cfg); + } else { + return m_prefill_compiled->get_property(name); + } + OPENVINO_SUPPRESS_DEPRECATED_END +} + +std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_sync_infer_request() const { + auto* 
non_const_this = const_cast<ov::npuw::LLMCompiledModel*>(this); // because of const in API + return non_const_this->create_llm_infer_request(); +} + +std::shared_ptr<ov::ISyncInferRequest> ov::npuw::LLMCompiledModel::create_llm_infer_request() { + auto this_sptr = std::static_pointer_cast<ov::npuw::LLMCompiledModel>(shared_from_this()); + return std::make_shared<ov::npuw::LLMInferRequest>(this_sptr, m_kvcache_desc); +} + +void ov::npuw::LLMCompiledModel::implement_properties() { +#define BIND(N, T, GETTER) \ + { \ + ov::intel_npu::N.name(), { \ + ov::PropertyMutability::RW, [](const ::intel_npu::Config& config) -> ov::Any { \ + return config.GETTER<::intel_npu::T>(); \ + } \ + } \ + } + + m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get), + BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString), + BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get), + BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get), + BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)}); +#undef BIND +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp new file mode 100644 index 00000000000000..1a748997fd48fa --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.hpp @@ -0,0 +1,54 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <memory> + +#include "compiled_model.hpp" + +namespace ov { +namespace npuw { + +class LLMInferRequest; +class LLMCompiledModel : public ov::npuw::ICompiledModel { + using GetPropertiesMap = + std::map<std::string, std::tuple<ov::PropertyMutability, std::function<ov::Any(const ::intel_npu::Config&)>>>; + +public: + struct KVCacheDesc { + uint32_t max_prompt_size = 0u; + uint32_t total_size = 0u; + uint32_t num_stored_tokens = 0u; + uint32_t dim = 0u; + }; + + LLMCompiledModel(const std::shared_ptr<ov::Model>& model, + const 
std::shared_ptr<const ov::IPlugin>& plugin, + const ov::AnyMap& properties); + LLMCompiledModel() = delete; + void export_model(std::ostream& model) const override; + std::shared_ptr<const ov::Model> get_runtime_model() const override; + + void set_property(const ov::AnyMap& properties) override; + ov::Any get_property(const std::string& name) const override; + +private: + friend class LLMInferRequest; + + std::shared_ptr<ov::ISyncInferRequest> create_llm_infer_request(); + std::shared_ptr<ov::ISyncInferRequest> create_sync_infer_request() const override; + void implement_properties(); + + std::shared_ptr<::intel_npu::OptionsDesc> m_options_desc; + ::intel_npu::Config m_cfg; + GetPropertiesMap m_prop_to_opt; + + KVCacheDesc m_kvcache_desc; + std::shared_ptr<ov::npuw::CompiledModel> m_kvcache_compiled; + std::shared_ptr<ov::npuw::CompiledModel> m_prefill_compiled; +}; + +} // namespace npuw +} // namespace ov diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp new file mode 100644 index 00000000000000..a8c90884d3d926 --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.cpp @@ -0,0 +1,193 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#include "llm_infer_request.hpp" + +#include <regex> + +#include "llm_compiled_model.hpp" +#include "logging.hpp" +#include "openvino/runtime/iasync_infer_request.hpp" + +namespace { +template <typename T> +void fill_tensor(ov::SoPtr<ov::ITensor> tensor, T fill_val, size_t offset = 0u) { + T* tensor_data = tensor->data<T>(); + std::fill(tensor_data + offset, tensor_data + tensor->get_size(), fill_val); +} + +ov::SoPtr<ov::ITensor> make_tensor_slice(ov::SoPtr<ov::ITensor> tensor, + uint32_t dim, + uint32_t start_pos, + uint32_t end_pos) { + ov::Shape start_shape(std::vector<size_t>(tensor->get_shape().size(), 0u)); + start_shape[dim] = start_pos; + ov::Shape end_shape = tensor->get_shape(); 
+ end_shape[dim] = end_pos; + return ov::get_tensor_impl(ov::Tensor(ov::make_tensor(tensor), start_shape, end_shape)); +} +} // anonymous namespace + +ov::npuw::LLMInferRequest::LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model, + const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc) + : ov::ISyncInferRequest(compiled_model), + m_kvcache_desc(kvcache_desc) { + m_kvcache_request = compiled_model->m_kvcache_compiled->create_infer_request(); + m_prefill_request = compiled_model->m_prefill_compiled->create_infer_request(); + + for (auto input_port : m_prefill_request->get_compiled_model()->inputs()) { + m_prefill_in_ports.emplace(input_port.get_any_name(), input_port); + } + for (auto output_port : m_prefill_request->get_compiled_model()->outputs()) { + m_prefill_out_ports.emplace(output_port.get_any_name(), output_port); + } + + for (auto input_port : m_kvcache_request->get_compiled_model()->inputs()) { + m_kvcache_in_ports.emplace(input_port.get_any_name(), input_port); + } + for (auto output_port : m_kvcache_request->get_compiled_model()->outputs()) { + m_kvcache_out_ports.emplace(output_port.get_any_name(), output_port); + } +} + +void ov::npuw::LLMInferRequest::prepare_for_new_conversation() { + // FIXME: for input_ids it must be padding from tokenizer that not available from here + // Get it from NPUW options + fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")), 0u); + fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")), 0u); + fill_tensor<int64_t>(m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")), 0u); + fill_tensor<int64_t>(m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")), 0u); + m_kvcache_desc.num_stored_tokens = 0u; +} + +void ov::npuw::LLMInferRequest::infer_prefill(ov::SoPtr<ov::ITensor> input_ids, + ov::SoPtr<ov::ITensor> attention_mask, + ov::SoPtr<ov::ITensor> position_ids) { + LOG_DEBUG("Calling 
inference for prefill model..."); + LOG_BLOCK(); + + prepare_for_new_conversation(); + + auto padded_input_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at("input_ids")); + const size_t offset = padded_input_ids->get_size() - input_ids->get_size(); + std::copy_n(input_ids->data<int64_t>(), input_ids->get_size(), padded_input_ids->data<int64_t>() + offset); + + auto padded_attention_mask = m_prefill_request->get_tensor(m_prefill_in_ports.at("attention_mask")); + std::copy_n(attention_mask->data<int64_t>(), + attention_mask->get_size(), + padded_attention_mask->data<int64_t>() + offset); + + auto padded_position_ids = m_prefill_request->get_tensor(m_prefill_in_ports.at("position_ids")); + std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), padded_position_ids->data<int64_t>() + offset); + + m_prefill_request->infer(); + m_kvcache_desc.num_stored_tokens += static_cast<uint32_t>(input_ids->get_size()); + m_need_copy_kvcache = true; + + m_logits = m_prefill_request->get_tensor(m_prefill_out_ports.at("logits")); + + LOG_DEBUG("Done"); +} + +void ov::npuw::LLMInferRequest::infer_generate(ov::SoPtr<ov::ITensor> input_ids, + ov::SoPtr<ov::ITensor> attention_mask, + ov::SoPtr<ov::ITensor> position_ids) { + LOG_DEBUG("Calling inference for generate model..."); + LOG_BLOCK(); + + // NB: KV-cache is full, further generation is impossible + if (m_kvcache_desc.num_stored_tokens == m_kvcache_desc.total_size) { + OPENVINO_THROW("KV-Cache is full."); + } + + if (m_need_copy_kvcache) { + LOG_DEBUG("Copying kv-cache from prefill to generate model."); + const std::size_t kStartOutputKVCacheLayers = 1u; + const auto& kvcache_compiled = m_kvcache_request->get_compiled_model(); + for (std::size_t i = 0; i < kvcache_compiled->outputs().size() - 1; ++i) { + const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + auto prefill_out_tensor = m_prefill_request->get_tensor(m_prefill_out_ports.at(output_name)); + + const 
auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values"); + auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name)); + + // FIXME: We don't need to fill whole tensor with 0s, but only tensor.size() - num_stored_tokens + // taking into account kvcache dimension. + fill_tensor<ov::float16>(kvcache_in_tensor, 0); + + auto prefill_out_slice = + make_tensor_slice(prefill_out_tensor, + m_kvcache_desc.dim, + m_kvcache_desc.max_prompt_size - m_kvcache_desc.num_stored_tokens, + m_kvcache_desc.max_prompt_size); + + auto kvcache_in_slice = + make_tensor_slice(kvcache_in_tensor, m_kvcache_desc.dim, 0u, m_kvcache_desc.num_stored_tokens); + + prefill_out_slice->copy_to(kvcache_in_slice._ptr); + } + LOG_DEBUG("Prepare attention mask pattern."); + auto* attention_mask_data = + m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask"))->data<int64_t>(); + attention_mask_data[m_kvcache_desc.total_size - 1] = 1; + + m_need_copy_kvcache = false; + } + + // FIXME: these tensors should be shared between the parent & child models + auto kv_input_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("input_ids")); + std::copy_n(input_ids->data<int64_t>(), input_ids->get_size(), kv_input_ids->data<int64_t>()); + + auto kv_attn_mask = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("attention_mask")); + std::copy_n(attention_mask->data<int64_t>(), attention_mask->get_size(), kv_attn_mask->data<int64_t>()); + + auto kv_pos_ids = m_kvcache_request->get_tensor(m_kvcache_in_ports.at("position_ids")); + std::copy_n(position_ids->data<int64_t>(), position_ids->get_size(), kv_pos_ids->data<int64_t>()); + + m_kvcache_request->infer(); + m_logits = m_kvcache_request->get_tensor(m_kvcache_out_ports.at("logits")); + m_kvcache_desc.num_stored_tokens += 1; + + LOG_DEBUG("Write KV-cache for the new token to the correct input position for next iteration."); + const std::size_t kStartOutputKVCacheLayers = 1u; + const 
auto& kvcache_compiled = m_kvcache_request->get_compiled_model(); + for (std::size_t i = 0; i < kvcache_compiled->outputs().size() - 1; ++i) { + const auto& output_name = kvcache_compiled->outputs()[kStartOutputKVCacheLayers + i].get_any_name(); + const auto& input_name = std::regex_replace(output_name, std::regex("present"), "past_key_values"); + auto kvcache_in_tensor = m_kvcache_request->get_tensor(m_kvcache_in_ports.at(input_name)); + auto kvcache_in_slice = make_tensor_slice(kvcache_in_tensor, + m_kvcache_desc.dim, + m_kvcache_desc.num_stored_tokens - 1, + m_kvcache_desc.num_stored_tokens); + auto kvcache_out_tensor = m_kvcache_request->get_tensor(m_kvcache_out_ports.at(output_name)); + kvcache_out_tensor->copy_to(kvcache_in_slice._ptr); + } + LOG_DEBUG("Done"); +} + +void ov::npuw::LLMInferRequest::infer() { + const auto& inputs = get_inputs(); + + auto input_ids = get_tensor(inputs[0]); + auto attention_mask = get_tensor(inputs[1]); + auto position_ids = get_tensor(inputs[2]); + + OPENVINO_ASSERT(ov::element::i64 == input_ids->get_element_type()); + OPENVINO_ASSERT(ov::element::i64 == attention_mask->get_element_type()); + OPENVINO_ASSERT(ov::element::i64 == position_ids->get_element_type()); + + if (input_ids->get_size() != 1) { + infer_prefill(input_ids, attention_mask, position_ids); + } else { + infer_generate(input_ids, attention_mask, position_ids); + } +} + +ov::SoPtr<ov::ITensor> ov::npuw::LLMInferRequest::get_tensor(const ov::Output<const ov::Node>& port) const { + // NB: If asked for logits... 
+ if (port == get_outputs()[0]) { + return m_logits; + } + return ov::ISyncInferRequest::get_tensor(port); +} diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp new file mode 100644 index 00000000000000..fbc6c702c4b62a --- /dev/null +++ b/src/plugins/intel_npu/src/plugin/npuw/llm_infer_request.hpp @@ -0,0 +1,58 @@ +// Copyright (C) 2024 Intel Corporation +// SPDX-License-Identifier: Apache-2.0 +// + +#pragma once + +#include <memory> + +#include "llm_compiled_model.hpp" +#include "openvino/core/descriptor/output.hpp" +#include "openvino/runtime/isync_infer_request.hpp" + +namespace ov { +namespace npuw { + +class LLMInferRequest final : public ov::ISyncInferRequest { +public: + explicit LLMInferRequest(const std::shared_ptr<ov::npuw::LLMCompiledModel>& compiled_model, + const ov::npuw::LLMCompiledModel::KVCacheDesc& kvcache_desc); + + void infer() override; + + ov::SoPtr<ov::ITensor> get_tensor(const ov::Output<const ov::Node>& port) const override; + + void check_tensors() const override{}; + + std::vector<ov::ProfilingInfo> get_profiling_info() const override { + return {}; + } + std::vector<ov::SoPtr<ov::IVariableState>> query_state() const override { + return {}; + } + +private: + void prepare_for_new_conversation(); + + void infer_prefill(ov::SoPtr<ov::ITensor> input_ids, + ov::SoPtr<ov::ITensor> attention_mask, + ov::SoPtr<ov::ITensor> position_ids); + + void infer_generate(ov::SoPtr<ov::ITensor> input_ids, + ov::SoPtr<ov::ITensor> attention_mask, + ov::SoPtr<ov::ITensor> position_ids); + + std::shared_ptr<ov::IAsyncInferRequest> m_kvcache_request; + std::shared_ptr<ov::IAsyncInferRequest> m_prefill_request; + LLMCompiledModel::KVCacheDesc m_kvcache_desc; + ov::SoPtr<ov::ITensor> m_logits; + bool m_need_copy_kvcache = false; + + std::unordered_map<std::string, ov::Output<const ov::Node>> m_prefill_in_ports; + std::unordered_map<std::string, ov::Output<const ov::Node>> 
m_prefill_out_ports; + std::unordered_map<std::string, ov::Output<const ov::Node>> m_kvcache_in_ports; + std::unordered_map<std::string, ov::Output<const ov::Node>> m_kvcache_out_ports; +}; + +} // namespace npuw +} // namespace ov diff --git a/src/plugins/intel_npu/src/plugin/npuw/logging.hpp b/src/plugins/intel_npu/src/plugin/npuw/logging.hpp index b258e3e6e6bfe9..95c9a742db7842 100644 --- a/src/plugins/intel_npu/src/plugin/npuw/logging.hpp +++ b/src/plugins/intel_npu/src/plugin/npuw/logging.hpp @@ -62,3 +62,7 @@ void dump_failure(const std::shared_ptr<ov::Model>& model, const std::string& de OPENVINO_THROW("NPUW: Assertion " #expr " failed"); \ } \ } while (0) + +#ifdef _MSC_VER +# define __PRETTY_FUNCTION__ __FUNCSIG__ +#endif diff --git a/src/plugins/intel_npu/src/plugin/src/plugin.cpp b/src/plugins/intel_npu/src/plugin/src/plugin.cpp index a57628c2e45510..da425d5d01a5c3 100644 --- a/src/plugins/intel_npu/src/plugin/src/plugin.cpp +++ b/src/plugins/intel_npu/src/plugin/src/plugin.cpp @@ -7,6 +7,7 @@ #include <fstream> #include "compiled_model.hpp" +#include "npuw/compiled_model.hpp" #include "driver_compiler_adapter.hpp" #include "intel_npu/common/device_helpers.hpp" #include "intel_npu/common/igraph.hpp" @@ -16,7 +17,6 @@ #include "intel_npu/config/npuw.hpp" #include "intel_npu/config/runtime.hpp" #include "intel_npu/utils/zero/zero_init.hpp" -#include "npuw/compiled_model.hpp" #include "openvino/op/constant.hpp" #include "openvino/op/parameter.hpp" #include "openvino/runtime/intel_npu/properties.hpp" @@ -637,7 +637,7 @@ std::shared_ptr<ov::ICompiledModel> Plugin::compile_model(const std::shared_ptr< if (localProperties.count(ov::cache_dir.name()) || !_globalConfig.get<CACHE_DIR>().empty()) { OPENVINO_THROW("Option 'CACHE_DIR' is not supported with NPU_USE_NPUW!"); } - return std::make_shared<ov::npuw::CompiledModel>(model->clone(), shared_from_this(), localProperties); + return ov::npuw::ICompiledModel::create(model->clone(), shared_from_this(), 
localProperties); } else { // NPUW is disabled, remove the key from the properties localProperties.erase(useNpuwKey);