From 0d3107663920f25eb5a454268cdb31cc5f5abee0 Mon Sep 17 00:00:00 2001
From: Eugene Smirnov
Date: Thu, 9 Jan 2025 09:45:33 +0100
Subject: [PATCH] vtensors transpose for llama3

---
 .../src/plugin/npuw/llm_compiled_model.cpp | 633 +++++++++---------
 1 file changed, 311 insertions(+), 322 deletions(-)

diff --git a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
index 853c839ff6faa1..cc163e533a4770 100644
--- a/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
+++ b/src/plugins/intel_npu/src/plugin/npuw/llm_compiled_model.cpp
@@ -1,20 +1,74 @@
-// Copyright (C) 2023-2025 Intel Corporation
+// Copyright (C) 2023-2024 Intel Corporation
 // SPDX-License-Identifier: Apache-2.0
 //
 #include "llm_compiled_model.hpp"
 #include "llm_infer_request.hpp"
 #include "logging.hpp"
-#include "openvino/op/ops.hpp"
+#include "openvino/pass/stateful_to_stateless.hpp"
+#include "openvino/runtime/iasync_infer_request.hpp"
 #include "openvino/openvino.hpp"
-#include "openvino/opsets/opset13.hpp"
-#include "openvino/pass/graph_rewrite.hpp"
+#include "openvino/pass/validate.hpp"
 #include "openvino/pass/matcher_pass.hpp"
+#include "openvino/pass/graph_rewrite.hpp"
 #include "openvino/pass/pattern/op/wrap_type.hpp"
-#include "openvino/pass/stateful_to_stateless.hpp"
-#include "openvino/pass/validate.hpp"
-#include "openvino/runtime/iasync_infer_request.hpp"
-#include "serialization.hpp"
+#include "openvino/op/ops.hpp"
+#include "openvino/pass/pattern/op/optional.hpp"
+#include "openvino/opsets/opset13.hpp"
+
+namespace {
+uint32_t align_to(uint32_t value, uint32_t alignment) {
+    return (value + alignment - 1) & ~(alignment - 1);
+}
+
+std::shared_ptr<ov::Model> cvt_kvcache_to_fp16(const std::shared_ptr<ov::Model>& model) {
+    ov::preprocess::PrePostProcessor ppp(model);
+
+    for (auto tensor : model->inputs()) {
+        if (tensor.get_any_name().find("past_key") != std::string::npos) {
+            ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+        }
+    }
+
+    for (auto tensor : model->outputs()) {
+        if (tensor.get_any_name().find("present") != std::string::npos) {
+            ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16);
+        }
+    }
+
+    return ppp.build();
+}
+
+void align_u4_zp_constants(const std::shared_ptr<ov::Model>& model) {
+    for (auto op : model->get_ops()) {
+        if (ov::op::util::is_constant(op)) {
+            auto cst_op = std::dynamic_pointer_cast<ov::op::v0::Constant>(op);
+            const auto cst_op_out = cst_op->output(0);
+            if (cst_op_out.get_element_type() == ov::element::u4 && ov::shape_size(cst_op_out.get_shape()) == 1u) {
+                ov::Tensor cst_tensor(ov::element::u4, cst_op_out.get_shape());
+                *static_cast<uint8_t*>(cst_tensor.data()) = cst_op->get_vector<uint8_t>()[0] & 0x0f;
+                auto new_cst_op = std::make_shared<ov::op::v0::Constant>(cst_tensor);
+                for (auto target_input : cst_op_out.get_target_inputs()) {
+                    target_input.replace_source_output(new_cst_op);
+                }
+            }
+        }
+    }
+}
+
+std::shared_ptr<ov::Model> redirect_new_kv_to_output(const std::shared_ptr<ov::Model>& model) {
+    const auto kStartOutputKVCacheLayers = 1u;
+    for (std::size_t i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) {
+        auto kvout = model->output(i);
+        auto kvrslt = kvout.get_node();
+        auto kvcat = kvrslt->inputs()[0].get_source_output().get_node();
+        auto kvval = kvcat->inputs()[1].get_source_output();
+        kvval.set_names({kvout.get_any_name()});
+        kvrslt->inputs()[0].replace_source_output(kvval);
+    }
+    model->validate_nodes_and_infer_types();
+    return model;
+}
 
 namespace opp = ov::pass::pattern;
 class TransposeValueTensors : public
ov::pass::MatcherPass { @@ -25,8 +79,106 @@ class TransposeValueTensors : public ov::pass::MatcherPass { using Ref = std::reference_wrapper; }; - OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::TransposeValueTensors"); TransposeValueTensors(Context::Ref ctx) { + register_matcher_llama2(ctx); + register_matcher_llama3(ctx); + } + +private: + // llama3.2, mistral, etc + void register_matcher_llama3(Context::Ref ctx) { + auto param = opp::wrap_type(); + auto transpose = opp::wrap_type({opp::any_input(), opp::any_input()}); + auto concat = opp::wrap_type({param, transpose}); + + // only difference is that broadcast wrapped into unsquese/reshape, while transposed tensor didn't change + const auto unsqueeze_axes = opp::wrap_type(); + auto unsqueeze = opp::wrap_type({concat, unsqueeze_axes}); + auto broadcast = opp::wrap_type({unsqueeze, opp::any_input()}); + auto reshape = opp::wrap_type({broadcast, opp::any_input()}); + + auto softmax = opp::wrap_type({opp::any_input()}); + auto matmul = opp::wrap_type({softmax, reshape}); + + auto callback = [=](ov::pass::pattern::Matcher& m) { + auto& node_to_output = m.get_pattern_value_map(); + + auto matched_node_param = node_to_output.at(param).get_node_shared_ptr(); + auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); + auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr(); + auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr(); + auto matched_node_unsqueeze = node_to_output.at(unsqueeze).get_node_shared_ptr(); + auto matched_node_unsqueeze_axes = node_to_output.at(unsqueeze_axes).get_node_shared_ptr(); + auto matched_node_broadcast = node_to_output.at(broadcast).get_node_shared_ptr(); + auto matched_node_reshape = node_to_output.at(reshape).get_node_shared_ptr(); + + + auto matched_param = std::static_pointer_cast(matched_node_param); + auto matched_concat = std::static_pointer_cast(matched_node_concat); + auto matched_transpose = std::static_pointer_cast(matched_node_transpose); + auto matched_matmul = std::static_pointer_cast(matched_node_matmul); + auto matched_unsqueeze = std::static_pointer_cast(matched_node_unsqueeze); + auto matched_broadcast = std::static_pointer_cast(matched_node_broadcast); + auto matched_reshape = std::static_pointer_cast(matched_node_reshape); + + auto shape_broadcast = matched_broadcast->get_output_shape(0); + OPENVINO_ASSERT(shape_broadcast.size() == 5u); + std::swap(shape_broadcast[3], shape_broadcast[4]); + + LOG_DEBUG("shape_broadcast for: "<< matched_broadcast->get_friendly_name() <<", shape=" << shape_broadcast); + + const auto broadcast_axes_node = std::make_shared(ov::element::i32, ov::Shape{5}, shape_broadcast); + broadcast_axes_node->set_friendly_name(matched_broadcast->get_friendly_name() + "/new_broadcast_shape"); + matched_broadcast->input(1).replace_source_output(broadcast_axes_node); + + auto shape_reshape = matched_reshape->get_output_shape(0); + OPENVINO_ASSERT(shape_reshape.size() == 4u); + std::swap(shape_reshape[2], shape_reshape[3]); + + LOG_DEBUG("shape_reshape for: "<< matched_reshape->get_friendly_name() <<", shape=" << shape_reshape); + + const auto reshape_axes_node = std::make_shared(ov::element::i32, ov::Shape{4}, shape_reshape); + reshape_axes_node->set_friendly_name(matched_reshape->get_friendly_name() + "/new_reshape_shape"); + matched_reshape->input(1).replace_source_output(reshape_axes_node); + + auto param_shape = matched_param->get_partial_shape(); + OPENVINO_ASSERT(param_shape.size() == 4u); + // NB: Transpose 
Parameter that correspond to V-tensor it will + // speed-up its multiplication with attention scores + std::swap(param_shape[2], param_shape[3]); + auto new_param = std::make_shared(matched_param->get_element_type(), param_shape); + new_param->set_friendly_name(matched_param->get_friendly_name()); + new_param->outputs().begin()->get_tensor().set_names(matched_param->outputs().begin()->get_tensor().get_names()); + ov::replace_node(matched_param, new_param); + // NB: Save in order to add/remove to the model later on + ctx.get().new_params.push_back(new_param); + ctx.get().old_params.push_back(matched_param); + + auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1}); + auto new_transpose = std::make_shared(matched_transpose->input_value(0), + order_cst->output(0)); + new_transpose->set_friendly_name(matched_transpose->get_friendly_name()); + ov::replace_node(matched_transpose, new_transpose); + + auto new_concat = std::make_shared( + ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u); + new_concat->set_friendly_name(matched_concat->get_friendly_name()); + ov::replace_node(matched_concat, new_concat); + + //------ update output dims + matched_unsqueeze->validate_and_infer_types(); + matched_broadcast->validate_and_infer_types(); + matched_reshape->validate_and_infer_types(); + + matched_matmul->set_transpose_b(true); + + return true; + }; + register_matcher(std::make_shared(matmul, "TransposeValueTensors_llama3"), std::move(callback)); + } + + // llama2, phi3, etc + void register_matcher_llama2(Context::Ref ctx) { auto param = opp::wrap_type(); auto transpose = opp::wrap_type({opp::any_input(), opp::any_input()}); auto concat = opp::wrap_type({param, transpose}); @@ -36,15 +188,15 @@ class TransposeValueTensors : public ov::pass::MatcherPass { auto callback = [=](ov::pass::pattern::Matcher& m) { auto& node_to_output = m.get_pattern_value_map(); - auto matched_node_param = node_to_output.at(param).get_node_shared_ptr(); - auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); + auto matched_node_param = node_to_output.at(param).get_node_shared_ptr(); + auto matched_node_concat = node_to_output.at(concat).get_node_shared_ptr(); auto matched_node_transpose = node_to_output.at(transpose).get_node_shared_ptr(); - auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr(); + auto matched_node_matmul = node_to_output.at(matmul).get_node_shared_ptr(); - auto matched_param = std::static_pointer_cast(matched_node_param); - auto matched_concat = std::static_pointer_cast(matched_node_concat); + auto matched_param = std::static_pointer_cast(matched_node_param); + auto matched_concat = std::static_pointer_cast(matched_node_concat); auto matched_transpose = std::static_pointer_cast(matched_node_transpose); - auto matched_matmul = std::static_pointer_cast(matched_node_matmul); + auto matched_matmul = std::static_pointer_cast(matched_node_matmul); auto shape = matched_param->get_partial_shape(); OPENVINO_ASSERT(shape.size() == 4u); @@ -53,22 +205,20 @@ class TransposeValueTensors : public ov::pass::MatcherPass { std::swap(shape[2], shape[3]); auto new_param = std::make_shared(matched_param->get_element_type(), shape); new_param->set_friendly_name(matched_param->get_friendly_name()); - new_param->outputs().begin()->get_tensor().set_names( - matched_param->outputs().begin()->get_tensor().get_names()); + new_param->outputs().begin()->get_tensor().set_names(matched_param->outputs().begin()->get_tensor().get_names()); 
ov::replace_node(matched_param, new_param); // NB: Save in order to add/remove to the model later on ctx.get().new_params.push_back(new_param); ctx.get().old_params.push_back(matched_param); auto order_cst = ov::op::v0::Constant::create(ov::element::i32, ov::Shape{4}, {0, 2, 3, 1}); - auto new_transpose = - std::make_shared(matched_transpose->input_value(0), order_cst->output(0)); + auto new_transpose = std::make_shared(matched_transpose->input_value(0), + order_cst->output(0)); new_transpose->set_friendly_name(matched_transpose->get_friendly_name()); ov::replace_node(matched_transpose, new_transpose); - auto new_concat = - std::make_shared(ov::OutputVector{new_param->output(0), new_transpose->output(0)}, - 3u); + auto new_concat = std::make_shared( + ov::OutputVector{new_param->output(0), new_transpose->output(0)}, 3u); new_concat->set_friendly_name(matched_concat->get_friendly_name()); ov::replace_node(matched_concat, new_concat); @@ -76,20 +226,20 @@ class TransposeValueTensors : public ov::pass::MatcherPass { return true; }; - register_matcher(std::make_shared(matmul, "TransposeValueTensors"), std::move(callback)); + register_matcher(std::make_shared(matmul, "TransposeValueTensors_llama2"), std::move(callback)); } }; class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { public: - OPENVINO_MATCHER_PASS_RTTI("npuw::LLMCompiledModel::ScaledDotProductAttentionDecomposition"); + OPENVINO_RTTI("ScaledDotProductAttentionDecomposition", "0"); ScaledDotProductAttentionDecomposition() { auto pattern_node = ov::pass::pattern::wrap_type(); ov::matcher_pass_callback callback = [=](ov::pass::pattern::Matcher& m) { auto& pattern_to_output = m.get_pattern_value_map(); auto node = ov::as_type_ptr( - pattern_to_output.at(pattern_node).get_node_shared_ptr()); + pattern_to_output.at(pattern_node).get_node_shared_ptr()); if (node == nullptr || transformation_callback(node)) { return false; @@ -135,7 +285,7 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { k_rank = register_new_node(k_rank, zero_i); auto minus_inf = register_new_node(v0::Constant::create(element::f32, Shape{}, {-std::numeric_limits::infinity()})) - ->output(0); + ->output(0); auto keep_dim_last = register_new_node(k_next_dim, zero_i); auto k_dims_before_transpose = register_new_node(zero_i, keep_dim_last, one_i, element::i32); @@ -148,9 +298,9 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { if (!node->get_causal()) { mask = node->input_value(3); - // two types of masks are supported. A boolean mask where a value of True indicates that the element - // should take part in attention. A float mask of the same type as query, key, value that is added to - // the attention score. + // two types of masks are supported. A boolean mask where a value of True indicates that the element should + // take part in attention. A float mask of the same type as query, key, value that is added to the attention + // score. 
if (mask.get_element_type() == element::boolean) { atten_mask = register_new_node(mask, scaled_atten); auto inv_mask = register_new_node(mask); @@ -165,8 +315,7 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { auto tsl = register_new_node(target_s_len, zero_i); auto mask_shape = register_new_node(OutputVector{tsl, ssl}, 0); mask = register_new_node(minus_inf, mask_shape); - auto horizontal_range = - register_new_node(zero_i, source_s_len, one_i, element::i32)->output(0); + auto horizontal_range = register_new_node(zero_i, source_s_len, one_i, element::i32)->output(0); horizontal_range = register_new_node(horizontal_range, zero_i); auto stop = register_new_node(target_s_len, one_i); auto vertical_range = register_new_node(one_i, stop, one_i, element::i32)->output(0); @@ -177,7 +326,9 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { scaled_atten = register_new_node(scaled_atten, atten_mask); } - scaled_atten = register_new_node(scaled_atten, -1); + auto smax = register_new_node(scaled_atten, -1); + scaled_atten = smax; + smax->set_friendly_name(smax->get_friendly_name() + "/why-softmax"); auto result = register_new_node(scaled_atten, value); result->set_friendly_name(node->get_friendly_name()); copy_runtime_info(node, get_new_nodes()); @@ -185,43 +336,6 @@ class ScaledDotProductAttentionDecomposition : public ov::pass::MatcherPass { } }; -namespace { -uint32_t align_to(uint32_t value, uint32_t alignment) { - return (value + alignment - 1) & ~(alignment - 1); -} - -std::shared_ptr cvt_kvcache_to_fp16(const std::shared_ptr& model) { - ov::preprocess::PrePostProcessor ppp(model); - - for (const auto& tensor : model->inputs()) { - if (tensor.get_any_name().find("past_key") != std::string::npos) { - ppp.input(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16); - } - } - - for (const auto& tensor : model->outputs()) { - if (tensor.get_any_name().find("present") != std::string::npos) { - ppp.output(tensor.get_any_name()).tensor().set_element_type(ov::element::Type_t::f16); - } - } - - return ppp.build(); -} - -std::shared_ptr redirect_new_kv_to_output(const std::shared_ptr& model) { - const auto kStartOutputKVCacheLayers = 1u; - for (std::size_t i = kStartOutputKVCacheLayers; i < model->outputs().size(); ++i) { - auto kvout = model->output(i); - auto kvrslt = kvout.get_node(); - auto kvcat = kvrslt->inputs()[0].get_source_output().get_node(); - auto kvval = kvcat->inputs()[1].get_source_output(); - kvval.set_names({kvout.get_any_name()}); - kvrslt->inputs()[0].replace_source_output(kvval); - } - model->validate_nodes_and_infer_types(); - return model; -} - std::shared_ptr cvt_value_tensors_layout(std::shared_ptr model) { ov::preprocess::PrePostProcessor ppp(model); for (auto tensor : model->outputs()) { @@ -234,6 +348,15 @@ std::shared_ptr cvt_value_tensors_layout(std::shared_ptr m return ppp.build(); } +bool optimize_value_tensors1(std::shared_ptr model) { + ov::pass::GraphRewrite rewr; + rewr.add_matcher(); + rewr.run_on_model(model); + + ov::pass::Validate().run_on_model(model); + return false; +} + bool optimize_value_tensors(std::shared_ptr model) { ov::pass::GraphRewrite rewr; rewr.add_matcher(); @@ -247,6 +370,8 @@ bool optimize_value_tensors(std::shared_ptr model) { } ov::pass::Validate().run_on_model(model); + ov::npuw::dump_failure(model, "", ""); + // NB: if new_params is not empty - pass has been applied return !ctx.new_params.empty(); } @@ -261,7 +386,7 @@ void reshape_to_static(std::shared_ptr 
model, const uint32_t kvcache_size, const KVAxesPosition& kv_axes_position) { std::map new_shapes; - for (const auto& input : model->inputs()) { + for (auto input : model->inputs()) { const auto& input_name = input.get_any_name(); ov::PartialShape new_shape; if (input_name.find("input_ids") != std::string::npos) { @@ -276,11 +401,28 @@ void reshape_to_static(std::shared_ptr model, new_shape[kv_axes_position.batch] = 1; new_shape[kv_axes_position.seq_len] = kvcache_size - input_size; } + LOG_DEBUG("static shape: " << input_name << ", shape=" << new_shape); new_shapes.emplace(input_name, new_shape); } model->reshape(new_shapes); } +KVAxesPosition get_kv_axes(const std::string& model_type) { + KVAxesPosition axes; + if (model_type == "chatglm") { + axes.batch = 1u; + axes.seq_len = 0u; + } else if (model_type == "qwen") { + // Note, qwen2 does not fall into this category and conforms to default layout + axes.batch = 0u; + axes.seq_len = 1u; + } else { + axes.batch = 0u; + axes.seq_len = 2u; + } + return axes; +} + bool is_cw_compressed(const std::shared_ptr& model) { std::vector rt_info_path = {"nncf", "weight_compression", "group_size"}; if (!model->has_rt_info(rt_info_path)) { @@ -295,28 +437,47 @@ bool is_cw_compressed(const std::shared_ptr& model) { return false; } + void split_npuw_properties(const ov::AnyMap& properties, + ov::AnyMap& npu_properties, + ov::AnyMap& npuw_properties) { + for (auto it = properties.begin(); it != properties.end(); ++it) { + if (it->first.find("NPUW") != it->first.npos) { + npuw_properties.insert(*it); + } else { + npu_properties.insert(*it); + } + } + } + struct NPUDesc { std::string arch; int64_t max_tiles; - bool compiler_dq; }; -std::optional extract_npu_descriptor(const std::shared_ptr& plugin) { +std::optional extract_npu_descriptor(const std::shared_ptr& plugin, const ov::AnyMap& properties) { const auto all_devices = plugin->get_core()->get_available_devices(); if (std::find(all_devices.begin(), all_devices.end(), "NPU") == all_devices.end()) { return std::nullopt; } - const std::string arch = plugin->get_property(ov::device::architecture.name(), ov::AnyMap{}).as(); - const int64_t max_tiles = plugin->get_property(ov::intel_npu::max_tiles.name(), ov::AnyMap{}).as(); - bool compiler_dq = false; - const auto supported_properties = - plugin->get_property(ov::supported_properties.name(), ov::AnyMap{}).as>(); - if (std::find(supported_properties.begin(), supported_properties.end(), "NPU_COMPILER_DYNAMIC_QUANTIZATION") != - supported_properties.end()) { - compiler_dq = true; + ov::AnyMap npuw_prop, npu_prop; + split_npuw_properties(properties, npu_prop, npuw_prop); + + std::cout << "extract_npu_descriptor: npu_config="; + for (auto &key : npu_prop) { + std::cout << "CONFIG: " << key.first << " = " << key.second.as() << std::endl; } - return std::make_optional(NPUDesc{arch, max_tiles, compiler_dq}); + std::cout << std::endl; + + std::cout << "npuw_config="; + for (auto &key : npuw_prop) { + std::cout << "CONFIG: " << key.first << " = " << key.second.as() << std::endl; + } + std::cout << std::endl; + + const ov::Any arch = plugin->get_property(ov::device::architecture.name(), npu_prop); + const ov::Any max_tiles = plugin->get_property(ov::intel_npu::max_tiles.name(), npu_prop); + return std::make_optional(NPUDesc{arch.as(), max_tiles.as()}); } std::optional pop_option(ov::AnyMap& config, const std::string& option_name) { @@ -328,7 +489,24 @@ std::optional pop_option(ov::AnyMap& config, const std::string& option_ return std::nullopt; } -ov::AnyMap 
get_baseline_common_config(const std::optional& npudesc) { +template +std::optional get_option(ov::AnyMap& config, const std::string& option_name) { + if (auto it = config.find(option_name); it != config.end()) { + return std::make_optional(it->second.as()); + } + return std::nullopt; +} + +template +T pop_or_default(ov::AnyMap& config, const std::string& key, const T& default_value) { + auto anyopt = pop_option(config, key); + if (anyopt.has_value()) { + return anyopt.value().as(); + } + return default_value; +} + +ov::AnyMap get_baseline_common_config() { ov::AnyMap config = { {"NPU_COMPILATION_MODE_PARAMS", "compute-layers-with-higher-precision=Sqrt,Power,ReduceMean,Add_RMSNorm"}, {"NPUW_DEVICES", "NPU"}, @@ -339,19 +517,11 @@ ov::AnyMap get_baseline_common_config(const std::optional& npudesc) { {"NPUW_WEIGHTS_BANK", "shared"}, {"NPUW_SLICE_OUT", "YES"}, {"NPUW_FUNCALL_ASYNC", "YES"}}; - // FIXME: this config logic is getting more and more complex - if (npudesc.has_value() && npudesc->compiler_dq) { - config.emplace("NPUW_DQ", "YES"); - config.emplace("NPUW_DQ_FULL", "NO"); - config.emplace("NPU_COMPILER_DYNAMIC_QUANTIZATION", "YES"); - config.erase("NPUW_DCOFF_TYPE"); - config.erase("NPUW_DCOFF_SCALE"); - } return config; } -ov::AnyMap get_default_common_config(const std::shared_ptr& model, const std::optional& npudesc) { - auto config = get_baseline_common_config(npudesc); +ov::AnyMap get_default_common_config(const std::shared_ptr& model) { + auto config = get_baseline_common_config(); const char* npu_l0 = std::getenv("DISABLE_OPENVINO_GENAI_NPU_L0"); if (npu_l0 && std::atoi(npu_l0) == 1) { config.emplace("NPUW_WEIGHTS_BANK_ALLOC", "CPU"); @@ -362,38 +532,30 @@ ov::AnyMap get_default_common_config(const std::shared_ptr& model, co } ov::AnyMap get_default_prefill_config(const std::shared_ptr& model, const std::optional& npudesc) { - auto config = get_default_common_config(model, npudesc); + auto config = get_default_common_config(model); + if (is_cw_compressed(model)) { + config.emplace("NPUW_DQ", "YES"); + } else { + config.emplace("NPUW_PMM", "NO"); + } if (npudesc.has_value() && npudesc->arch == "4000" && npudesc->max_tiles != -1) { config.emplace("NPU_DPU_GROUPS", npudesc->max_tiles); } - // Specify NPUW DQ if Compiler DQ is not enabled - if (!npudesc.has_value() || !npudesc->compiler_dq) { - if (is_cw_compressed(model)) { - config.emplace("NPUW_DQ", "YES"); - } else { - config.emplace("NPUW_PMM", "NO"); - } - } return config; } ov::AnyMap get_default_generate_config(const std::shared_ptr& model, const std::optional& npudesc, const ::intel_npu::npuw::llm::GenerateHint hint) { - auto config = get_default_common_config(model, npudesc); + auto config = get_default_common_config(model); if (hint == ::intel_npu::npuw::llm::GenerateHint::BEST_PERF) { config.emplace("NPUW_ONLINE_PIPELINE", "NONE"); } + // NB: Unconditionally set for generation model + config.emplace("NPUW_DQ", "YES"); if (npudesc.has_value() && npudesc->arch == "4000") { config.emplace("NPU_DPU_GROUPS", 4); } - if (hint == ::intel_npu::npuw::llm::GenerateHint::FAST_COMPILE) { - config.emplace("NPUW_UNFOLD_IREQS", "YES"); - } - // Specify NPUW DQ if Compiler DQ is not enabled - if (!npudesc.has_value() || !npudesc->compiler_dq) { - config.emplace("NPUW_DQ", "YES"); - } return config; } @@ -408,6 +570,12 @@ void merge_config_with(ov::AnyMap& lhs, const ov::AnyMap& rhs) { } } +void drop_cache_dir(ov::AnyMap& config) { + if (config.count("NPU_USE_NPUW") != 0u) { + pop_option(config, "CACHE_DIR"); + } +} + void 
split_llm_properties(const ov::AnyMap& properties, ov::AnyMap& llm_properties, ov::AnyMap& other_properties) { for (auto it = properties.begin(); it != properties.end(); ++it) { if (it->first.find("NPUW_LLM") != it->first.npos) { @@ -427,11 +595,11 @@ std::map any_copy(const ov::AnyMap& params) { } } // namespace + ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& model, const std::shared_ptr& plugin, const ov::AnyMap& properties) : ov::npuw::ICompiledModel(model, plugin), - m_name(model->get_friendly_name()), m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), m_cfg(m_options_desc) { LOG_DEBUG("Creating LLMCompiledModel"); @@ -439,240 +607,68 @@ ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& m ::intel_npu::registerNPUWLLMOptions(*m_options_desc); - std::map npuw_llm_props; - std::map other_props; + ov::AnyMap npuw_llm_props; + ov::AnyMap other_props; split_llm_properties(properties, npuw_llm_props, other_props); - - // Remove "NPUW_LLM_PREFILL_CONFIG", "NPUW_LLM_GENERATE_CONFIG" from map, - // to not pass them into ::intel_npu::Config object, as we don't need to - // preserve them somewhere. - auto prefill_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_PREFILL_CONFIG")); - auto generate_config_opt = pop_option(npuw_llm_props, std::string("NPUW_LLM_GENERATE_CONFIG")); - auto prefill_config_addition = pop_option(npuw_llm_props, std::string("++NPUW_LLM_PREFILL_CONFIG")); - auto generate_config_addition = pop_option(npuw_llm_props, std::string("++NPUW_LLM_GENERATE_CONFIG")); - m_cfg.update(any_copy(npuw_llm_props)); LOG_DEBUG("1. Creating kvcache model as clone of passed one."); auto kvcache_model = model->clone(); LOG_DEBUG("2. Transform kvcache model from stateful to stateless."); ov::pass::StatefulToStateless().run_on_model(kvcache_model); - LOG_DEBUG("3. Creating prefill model as clone of transformed kvcache one."); + LOG_DEBUG("3. Align u4 ZP constants."); + align_u4_zp_constants(kvcache_model); + LOG_DEBUG("4. Creating prefill model as clone of transformed kvcache one."); auto prefill_model = kvcache_model->clone(); prefill_model->set_friendly_name(kvcache_model->get_friendly_name() + "_prefill"); - const uint32_t batch_dim = m_cfg.get<::intel_npu::NPUW_LLM_BATCH_DIM>(); - const uint32_t seq_len_dim = m_cfg.get<::intel_npu::NPUW_LLM_SEQ_LEN_DIM>(); - KVAxesPosition axes{batch_dim, seq_len_dim}; - const uint32_t max_prompt_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u); - const uint32_t min_response_len = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); - - m_kvcache_desc = KVCacheDesc{max_prompt_len, max_prompt_len + min_response_len, 0u, seq_len_dim}; - LOG_DEBUG("4. Make prefill model with static shapes"); + const ::intel_npu::npuw::llm::ModelDesc model_desc = m_cfg.get<::intel_npu::NPUW_LLM_MODEL_DESC>(); + const uint32_t kMaxPromptLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MAX_PROMPT_LEN>(), 64u); + const uint32_t kMinResponseLen = align_to(m_cfg.get<::intel_npu::NPUW_LLM_MIN_RESPONSE_LEN>(), 64u); + KVAxesPosition axes = get_kv_axes(model_desc.type); + m_kvcache_desc = KVCacheDesc{kMaxPromptLen, kMaxPromptLen + kMinResponseLen, 0u, axes.seq_len}; + LOG_DEBUG("5. Make prefill model with static shapes"); reshape_to_static(prefill_model, m_kvcache_desc.max_prompt_size, m_kvcache_desc.max_prompt_size, axes); - LOG_DEBUG("5. Make kvcache model with static shapes"); + LOG_DEBUG("6. 
Make kvcache model with static shapes");
     reshape_to_static(kvcache_model, 1u, m_kvcache_desc.total_size, axes);
-    LOG_DEBUG("6.Check and apply opt layout if applicable.");
-
-    const bool optimize_v_tensors = m_cfg.get<::intel_npu::NPUW_LLM_OPTIMIZE_V_TENSORS>();
+    LOG_DEBUG("7. Check and apply opt layout if applicable.");
     // NB: Try to apply opt transpose only for Llama-2-7b-chat-hf model
-    if (optimize_v_tensors) {
+    if (model_desc.name_or_path == "meta-llama/Llama-2-7b-chat-hf" ||
+        (model_desc.type == "llama" && model_desc.num_key_value_heads == 32)) {
         if (optimize_value_tensors(kvcache_model)) {
             // NB: Check if TransposeValueTensors transformation was applied
             m_kvcache_desc.v_tensors_transposed = true;
             prefill_model = cvt_value_tensors_layout(prefill_model);
         }
     }
-    LOG_DEBUG("7. Optimize kvcache model to output key/values for new token.");
+    LOG_DEBUG("8. Optimize kvcache model to output key/values for new token.");
     kvcache_model = redirect_new_kv_to_output(kvcache_model);
-    LOG_DEBUG("8. Converting KV-cache in kvcache model to FP16.");
+    LOG_DEBUG("9. Converting KV-cache in kvcache model to FP16.");
     kvcache_model = cvt_kvcache_to_fp16(kvcache_model);
-    LOG_DEBUG("9. Converting KV-cache in prefill model to FP16.");
+    LOG_DEBUG("10. Converting KV-cache in prefill model to FP16.");
     prefill_model = cvt_kvcache_to_fp16(prefill_model);
-    auto npudesc = extract_npu_descriptor(plugin);
-    auto prefill_config =
-        prefill_config_opt.value_or(get_default_prefill_config(prefill_model, npudesc)).as<ov::AnyMap>();
+    auto npudesc = extract_npu_descriptor(plugin, properties);
+    ov::AnyMap properties_copy = other_props;
+    auto prefill_config = get_default_prefill_config(model, npudesc);
     // NB: GENERATE_HINT is only applicable for default generate config!
-    if (generate_config_opt.has_value() && npuw_llm_props.count(ov::intel_npu::npuw::llm::generate_hint.name())) {
-        OPENVINO_THROW("GENERATE_HINT only works with default generate config!");
-    }
     const ::intel_npu::npuw::llm::GenerateHint generate_hint = m_cfg.get<::intel_npu::NPUW_LLM_GENERATE_HINT>();
+    LOG_DEBUG("11.
Passed GENERATE_HINT: " << std::string(::intel_npu::NPUW_LLM_GENERATE_HINT::toString(generate_hint))); + auto generate_config = get_default_generate_config(model, npudesc, generate_hint); - implement_properties(); + merge_config_with(prefill_config, properties_copy); + merge_config_with(generate_config, properties_copy); - LOG_DEBUG("Done"); -} + m_kvcache_compiled = std::make_shared(kvcache_model, plugin, generate_config); + m_prefill_compiled = std::make_shared(prefill_model, plugin, prefill_config); -ov::npuw::LLMCompiledModel::LLMCompiledModel(const std::shared_ptr& model, - const std::shared_ptr& plugin, - const bool serialized) - : ov::npuw::ICompiledModel(model, plugin), - m_name(model->get_friendly_name()), - m_options_desc(std::make_shared<::intel_npu::OptionsDesc>()), - m_cfg(m_options_desc) { - NPUW_ASSERT(serialized && "This constructor should only be utilized during deserialization!"); - ::intel_npu::registerNPUWLLMOptions(*m_options_desc); - LOG_DEBUG("LLMCompiledModel is being deserialized, skipping the full constructor flow..."); -} - -void ov::npuw::LLMCompiledModel::export_model(std::ostream& stream) const { - LOG_INFO("Serializing LLMCompiledModel..."); - LOG_BLOCK(); - - using namespace ov::npuw::s11n; - - // Serialize magic number first - write(stream, NPUW_SERIALIZATION_INDICATOR); - - // Serialize general meta info - write(stream, OPENVINO_VERSION_MAJOR); - write(stream, OPENVINO_VERSION_MINOR); - write(stream, OPENVINO_VERSION_PATCH); - write(stream, std::string(NPUW_SERIALIZATION_VERSION)); - - // Serialize name - write(stream, m_name); - - // Serialize inputs and outputs - write(stream, inputs()); - write(stream, outputs()); - - // Serialize LLMCompiledModel-specific data - write(stream, m_kvcache_desc.max_prompt_size); - write(stream, m_kvcache_desc.total_size); - write(stream, m_kvcache_desc.num_stored_tokens); - write(stream, m_kvcache_desc.dim); - - // Write config - write(stream, m_cfg); - - // Serialize CompiledModels - m_kvcache_compiled->serialize(stream); - m_prefill_compiled->serialize(stream); - - // Serialize weights bank (if required) - const auto& kv_bank = m_kvcache_compiled->m_weights_bank; - const auto& p_bank = m_prefill_compiled->m_weights_bank; - NPUW_ASSERT(kv_bank && p_bank && kv_bank == p_bank && "Prefill and KVCache models' weight bank should be shared!"); - // FIXME: support weightless flow - write(stream, kv_bank->get_name()); - kv_bank->serialize(stream); - - LOG_INFO("Done."); + implement_properties(); + LOG_DEBUG("Done"); } -std::shared_ptr ov::npuw::LLMCompiledModel::deserialize( - std::istream& stream, - const std::shared_ptr& plugin) { - LOG_INFO("Deserializing LLMCompiledModel..."); - LOG_BLOCK(); - - using namespace ov::npuw::s11n; - - // Sanity check magic number - std::array serialization_indicator; - read(stream, serialization_indicator); - NPUW_ASSERT(serialization_indicator == NPUW_SERIALIZATION_INDICATOR && "This blob wasn't serialized via NPUW!"); - - // Deserialize general meta info - int vmajor, vminor, vpatch; - std::string s11n_version; - read(stream, vmajor); - read(stream, vminor); - read(stream, vpatch); - read(stream, s11n_version); - - if (vmajor != OPENVINO_VERSION_MAJOR || vminor != OPENVINO_VERSION_MINOR || vpatch != OPENVINO_VERSION_PATCH || - s11n_version != std::string(NPUW_SERIALIZATION_VERSION)) { - OPENVINO_THROW("This blobs was serialized with different OV version!", - " Serialized by OV ", - vmajor, - '.', - vminor, - '.', - vpatch, - " Current OV version ", - OPENVINO_VERSION_MAJOR, - '.', - 
OPENVINO_VERSION_MINOR, - '.', - OPENVINO_VERSION_PATCH, - " NPUW serialized by version ", - s11n_version, - " NPUW current serialization version ", - NPUW_SERIALIZATION_VERSION); - } - - // Deserialize model name first - std::string model_name; - read(stream, model_name); - - // Create a dummy CompiledModel with an empty ov::Model - this will skip the constructor flow - // to continue deserialization - ov::ParameterVector parameters; - ov::NodeVector results; - - read(stream, parameters); - read(stream, results); - - auto ov_model = std::make_shared(results, parameters, model_name); - - auto compiled = std::make_shared(ov_model, plugin, true); - - // Deserialize LLMCompiledModel-specific data - read(stream, compiled->m_kvcache_desc.max_prompt_size); - read(stream, compiled->m_kvcache_desc.total_size); - read(stream, compiled->m_kvcache_desc.num_stored_tokens); - read(stream, compiled->m_kvcache_desc.dim); - - // Deserialize config - read(stream, compiled->m_cfg); - - // Deserialize CompiledModels - compiled->m_kvcache_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin); - compiled->m_prefill_compiled = ov::npuw::CompiledModel::deserialize(stream, plugin); - - // Deserialize weights bank (if required) - std::string bank_name; - read(stream, bank_name); - auto bank = ov::npuw::weights::Bank::deserialize(stream, compiled->get_plugin()->get_core(), bank_name); - - // FIXME: support weightless option - compiled->m_kvcache_compiled->m_weights_bank = bank; - compiled->m_prefill_compiled->m_weights_bank = bank; - - // After bank deserialization - reconstruct NPU closures from the bank - compiled->m_kvcache_compiled->reconstruct_closure(); - compiled->m_prefill_compiled->reconstruct_closure(); - - LOG_INFO("Done."); - return compiled; +void ov::npuw::LLMCompiledModel::export_model(std::ostream& model) const { + OPENVINO_NOT_IMPLEMENTED; } std::shared_ptr ov::npuw::LLMCompiledModel::get_runtime_model() const { @@ -685,11 +681,6 @@ void ov::npuw::LLMCompiledModel::set_property(const ov::AnyMap& properties) { ov::Any ov::npuw::LLMCompiledModel::get_property(const std::string& name) const { OPENVINO_SUPPRESS_DEPRECATED_START - if (name == ov::intel_npu::npuw::llm::prefill_config.name() || - name == ov::intel_npu::npuw::llm::generate_config.name()) { - OPENVINO_THROW(name, " is write-only option!"); - } - auto&& configIterator = m_prop_to_opt.find(name); if (configIterator != m_prop_to_opt.cend()) { return std::get<1>(configIterator->second)(m_cfg); @@ -706,7 +697,7 @@ std::shared_ptr ov::npuw::LLMCompiledModel::create_sync_i std::shared_ptr ov::npuw::LLMCompiledModel::create_llm_infer_request() { auto this_sptr = std::static_pointer_cast(shared_from_this()); - return std::make_shared(this_sptr); + return std::make_shared(this_sptr, m_kvcache_desc); } void ov::npuw::LLMCompiledModel::implement_properties() { @@ -720,11 +711,9 @@ void ov::npuw::LLMCompiledModel::implement_properties() { } m_prop_to_opt.insert({BIND(npuw::llm::enabled, NPUW_LLM, get), - BIND(npuw::llm::batch_dim, NPUW_LLM_BATCH_DIM, get), - BIND(npuw::llm::batch_dim, NPUW_LLM_SEQ_LEN_DIM, get), + BIND(npuw::llm::model_desc, NPUW_LLM_MODEL_DESC, getString), BIND(npuw::llm::max_prompt_len, NPUW_LLM_MAX_PROMPT_LEN, get), BIND(npuw::llm::min_response_len, NPUW_LLM_MIN_RESPONSE_LEN, get), - BIND(npuw::llm::optimize_v_tensors, NPUW_LLM_OPTIMIZE_V_TENSORS, get), BIND(npuw::llm::generate_hint, NPUW_LLM_GENERATE_HINT, getString)}); #undef BIND }
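
Note on the value-tensor transpose performed by the patch: the TransposeValueTensors pass stores the V-cache parameter with its last two axes swapped, transposes the freshly computed value tensor with order {0, 2, 3, 1}, concatenates it along the last (sequence) axis, and flips the attention MatMul to transpose_b = true, so no extra transpose is needed before the multiplication. A shape-only sketch of that layout change; the concrete dimensions below are illustrative and not taken from the patch:

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main() {
    // Illustrative value-cache shape [batch, heads, seq_len, head_dim].
    std::vector<std::size_t> past_value{1, 32, 1024, 128};

    // The pass swaps the last two axes of the V-cache parameter,
    // so past values are kept as [batch, heads, head_dim, seq_len]...
    std::swap(past_value[2], past_value[3]);            // {1, 32, 128, 1024}

    // ...and each newly produced value row is appended along the last
    // (sequence) axis of that transposed layout.
    const std::size_t new_tokens = 1;
    std::vector<std::size_t> present_value = past_value;
    present_value[3] += new_tokens;                     // {1, 32, 128, 1025}

    // With this layout the attention MatMul consumes the values with
    // transpose_b = true instead of inserting a separate Transpose node.
    std::cout << present_value[2] << " x " << present_value[3] << "\n";
}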
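Note on the static KV-cache sizing in the constructor: NPUW_LLM_MAX_PROMPT_LEN and NPUW_LLM_MIN_RESPONSE_LEN are each rounded up to a multiple of 64 by align_to(), and the cache window is their sum. A minimal standalone sketch of that arithmetic; the 1000/150 inputs are illustrative values, not defaults of the plugin:

#include <cstdint>
#include <iostream>

// Same rounding as the align_to() helper in the patch: round 'value' up to the
// next multiple of 'alignment' (alignment must be a power of two).
static uint32_t align_to(uint32_t value, uint32_t alignment) {
    return (value + alignment - 1) & ~(alignment - 1);
}

int main() {
    const uint32_t kMaxPromptLen   = align_to(1000u, 64u);        // -> 1024
    const uint32_t kMinResponseLen = align_to(150u, 64u);         // -> 192
    // The static KV-cache covers the whole window: prompt + response.
    const uint32_t total_size = kMaxPromptLen + kMinResponseLen;  // -> 1216
    std::cout << kMaxPromptLen << " + " << kMinResponseLen << " = " << total_size << "\n";
}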