openvinotoolkit · xczhai · Jan 7, 2025
@@ -619,14 +619,20 @@ void ReduceAdd2bh::generate() {
             vmovups(zmm3, ptr[src1 + loop_i * 4 + 16 * 4]);
             vaddps(zmm0, zmm0, zmm1);
             vaddps(zmm2, zmm2, zmm3);
-            if (m_to_f16) {
+            if (m_output_type == ov::element::f32) {
+                vmovups(ptr[dst + loop_i * 4], zmm0);
+                vmovups(ptr[dst + loop_i * 4 + 64], zmm2);
+                prefetchwt1(ptr[prefetch_dst + loop_i * 2]);                
+            } else if (m_output_type == ov::element::f16) {
                 vcvtps2ph(ptr[dst + loop_i * 2], zmm0, 0x4);
                 vcvtps2ph(ptr[dst + loop_i * 2 + 32], zmm2, 0x4);
                 prefetchwt1(ptr[prefetch_dst + loop_i * 2]);
-            } else {
+            } else if (m_output_type == ov::element::bf16) {
                 vcvtne2ps2bf16(zmm4, zmm2, zmm0);
                 prefetchwt1(ptr[prefetch_dst + loop_i * 2]);
                 vmovups(ptr[dst + loop_i * 2], zmm4);
+            } else {
+                OPENVINO_THROW("ReduceAdd2hb cannot be generated with precision " + m_output_type.to_string());
             }
         }
         add(loop_i, 32);
@@ -650,14 +656,20 @@ void ReduceAdd2bh::generate() {
         {
             vmovups(zmm0, ptr[src0 + loop_i * 4]);
             vmovups(zmm2, ptr[src0 + loop_i * 4 + 16 * 4]);
-            if (m_to_f16) {
+            if (m_output_type == ov::element::f32) {
+                vmovups(ptr[dst + loop_i * 4], zmm0);
+                vmovups(ptr[dst + loop_i * 4 + 64], zmm2);
+                prefetchwt1(ptr[prefetch_dst + loop_i * 2]);
+            } else if (m_output_type == ov::element::f16) {
                 vcvtps2ph(ptr[dst + loop_i * 2], zmm0, 0x4);
                 vcvtps2ph(ptr[dst + loop_i * 2 + 32], zmm2, 0x4);
                 prefetchwt1(ptr[prefetch_dst + loop_i * 2]);
-            } else {
+            } else if (m_output_type == ov::element::bf16) {
                 vcvtne2ps2bf16(zmm4, zmm2, zmm0);
                 prefetchwt1(ptr[prefetch_dst + loop_i * 2]);
                 vmovups(ptr[dst + loop_i * 2], zmm4);
+            } else {
+                OPENVINO_THROW("ReduceAdd2hb cannot be generated with precision " + m_output_type.to_string());
             }
         }
         add(loop_i, 32);

@@ -512,33 +512,50 @@ class ReduceAdd2bh : public dnnl::impl::cpu::x64::jit_generator {
     DECLARE_CPU_JIT_AUX_FUNCTIONS(ReduceAdd2bh)
 
     const bool m_do_reduce2;
-    const bool m_to_f16;
-    ReduceAdd2bh(bool do_reduce2, bool to_f16) : jit_generator(jit_name()), m_do_reduce2(do_reduce2), m_to_f16(to_f16) {
+    const ov::element::Type m_output_type;
+    // Stores the configuration and builds the kernel through create_kernel().
+    //  - do_reduce2:  kept in m_do_reduce2. NOTE(review): presumably selects the two-source (src0 + src1)
+    //                 variant of the kernel over the single-source one - confirm in generate().
+    //  - output_type: kept in m_output_type; generate() emits stores for f32, f16 and bf16 and throws for
+    //                 anything else, which includes the default ov::element::undefined.
+    ReduceAdd2bh(bool do_reduce2, const ov::element::Type output_type = ov::element::undefined)
+        : jit_generator(jit_name()), m_do_reduce2(do_reduce2), m_output_type(output_type) {
         create_kernel();
     }
 
     void generate() override;
 
     // add two float input eltwise and convert to bf16 : ConvertFP32toBF16(src0 + src1)
+    // The element type that is actually written follows m_output_type: f32 rows are addressed through a
+    // float*, bf16/f16 rows through an int16_t*; any other type throws.
+    // src_stride and dst_stride are applied with typed pointer arithmetic, i.e. they are counted in elements
+    // (float for the sources, output elements for the destination), not in bytes.
+    // The kernel is invoked once per row with (src0, src1, dst, prefetch_dst, num_cols).
     void
-    call(float* src0, float* src1, size_t src_stride, void* pf16_dst, size_t dst_stride, int num_rows, int num_cols) {
-        auto* dst = reinterpret_cast<int16_t*>(pf16_dst);
-        for (int m = 0; m < num_rows; m++, src0 += src_stride, src1 += src_stride, dst += dst_stride) {
-            // the prefetch distance is increased to ensure by the time store happens
-            // prefetch has done and no HW prefetcher is triggered
-            auto* prefetch_dst = (m + 2 < num_rows) ? (dst + 2 * dst_stride) : (dst);
-            (*this)(src0, src1, dst, prefetch_dst, num_cols);
+    call(float* src0, float* src1, size_t src_stride, void* out_dst, size_t dst_stride, int num_rows, int num_cols) {
+        if (m_output_type == ov::element::f32) {
+            auto* dst = reinterpret_cast<float*>(out_dst);
+            for (int m = 0; m < num_rows; m++, src0 += src_stride, src1 += src_stride, dst += dst_stride) {
+                // prefetch two rows ahead of the row being stored (the current row when fewer than two rows
+                // remain); the distance is enlarged so that the prefetch has completed by the time the
+                // store happens and no HW prefetcher is triggered
+                auto* prefetch_dst = (m + 2 < num_rows) ? (dst + 2 * dst_stride) : (dst);
+                (*this)(src0, src1, dst, prefetch_dst, num_cols);
+            }
+        } else if (one_of(m_output_type, ov::element::bf16, ov::element::f16)) {
+            // both 16-bit formats share the same addressing, hence the common int16_t* view
+            auto* dst = reinterpret_cast<int16_t*>(out_dst);
+            for (int m = 0; m < num_rows; m++, src0 += src_stride, src1 += src_stride, dst += dst_stride) {
+                // prefetch two rows ahead of the row being stored (the current row when fewer than two rows
+                // remain); the distance is enlarged so that the prefetch has completed by the time the
+                // store happens and no HW prefetcher is triggered
+                auto* prefetch_dst = (m + 2 < num_rows) ? (dst + 2 * dst_stride) : (dst);
+                (*this)(src0, src1, dst, prefetch_dst, num_cols);
+            }
+        } else {
+            OPENVINO_THROW("ReduceAdd2bh call with precision " + m_output_type.to_string());
         }
     }
 
     // convert tensor to bf16: ConvertFP32toBF16(src0)
+    // Single-source overload: only bf16 and f16 values of m_output_type are accepted here, any other type
+    // (f32 included) throws. src_stride and dst_stride are applied with typed pointer arithmetic, i.e. they
+    // are counted in elements (float for src0, 16-bit elements for the destination), not in bytes.
+    // The kernel is invoked once per row with (src0, dst, prefetch_dst, num_cols).
     void call(float* src0, size_t src_stride, void* pf16_dst, size_t dst_stride, int num_rows, int num_cols) {
-        auto* dst = reinterpret_cast<int16_t*>(pf16_dst);
-        for (int m = 0; m < num_rows; m++, src0 += src_stride, dst += dst_stride) {
-            // the prefetch distance is increased to ensure by the time store happens
-            // prefetch has done and no HW prefetcher is triggered
-            auto* prefetch_dst = (m + 2 < num_rows) ? (dst + 2 * dst_stride) : (dst);
-            (*this)(src0, dst, prefetch_dst, num_cols);
+        if (one_of(m_output_type, ov::element::bf16, ov::element::f16)) {
+            auto* dst = reinterpret_cast<int16_t*>(pf16_dst);
+            for (int m = 0; m < num_rows; m++, src0 += src_stride, dst += dst_stride) {
+                // prefetch two rows ahead of the row being stored (the current row when fewer than two rows
+                // remain); the distance is enlarged so that the prefetch has completed by the time the
+                // store happens and no HW prefetcher is triggered
+                auto* prefetch_dst = (m + 2 < num_rows) ? (dst + 2 * dst_stride) : (dst);
+                (*this)(src0, dst, prefetch_dst, num_cols);
+            }
+        } else {
+            OPENVINO_THROW("ReduceAdd2bh call with precision " + m_output_type.to_string());
         }
     }
 };

@@ -27,7 +27,7 @@ namespace node {
 
 #if defined(OPENVINO_ARCH_X86_64)
 
-template <typename T>
+template <typename T, typename U>
 class LinearKsplit2 {
 public:
     std::vector<Work> works;
@@ -120,12 +120,13 @@ class LinearKsplit2 {
     void run(uint8_t* pA,
              int strideA,
              int M,
-             T* dstC,
+             U* dstC,
              int strideC,
              const LLMMLPNode::Config& config,
              MatrixDynQuantPerRow& src_dq,
-             float* w_scale) {
-        static ReduceAdd2bh jit_reduce2cvt(true, std::is_same<T, ov::float16>::value);
+             float* w_scale,
+             ov::element::Type output_type) {
+        static ReduceAdd2bh jit_reduce2cvt(true, output_type);
 
         ov::parallel_nt_static(m_threads_num, [&](const size_t ithr, const size_t nthr) {
             auto& work = works[ithr];
@@ -310,7 +311,7 @@ class LinearGateUp {
     int m_threads_num = 0;
 };
 
-template <typename T>
+template <typename T, typename U>
 struct LLMMLP::Executor : public LLMMLP::ExecutorBase {
     LLMMLP* m_pnode;
     const LLMMLPNode::Config m_config;
@@ -319,7 +320,7 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase {
     uint8_t* m_scratch_base = nullptr;
 
     LinearGateUp<T> gate_up;
-    LinearKsplit2<T> down;
+    LinearKsplit2<T, U> down;
     int m_N;
     int m_M = 0;
 
@@ -437,9 +438,10 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase {
         int M = shape_size(ishape) / ishape[ishape.size() - 1];
 
         auto output = m_pnode->getDstMemoryAtPort(0);
-        auto* dstC = output->getDataAs<T>();
+        auto outPrecision = output->getPrecision();
+        auto* dstC = output->getDataAs<U>();
         const auto& dstStrides = output->getDescWithType<BlockedMemoryDesc>()->getStrides();
-        int strideC = dstStrides[dstStrides.size() - 2] * sizeof(T);
+        int strideC = dstStrides[dstStrides.size() - 2] * sizeof(U);
 
         float* p_w_scale_down = nullptr;
         if (m_config.down_quantized) {
@@ -477,19 +479,19 @@ struct LLMMLP::Executor : public LLMMLP::ExecutorBase {
                 stride_up_act = m_quant_up_act.stride();
             }
 
-            down.run(p_up_act, stride_up_act, BM, dstC, strideC, m_config, m_quant_up_act, p_w_scale_down);
+            down.run(p_up_act, stride_up_act, BM, dstC, strideC, m_config, m_quant_up_act, p_w_scale_down, outPrecision);
 
             m += BM;
             pA += BM * strideA_in_bytes;
-            dstC += BM * strideC / sizeof(T);
+            dstC += BM * strideC / sizeof(U);
         }
     }
 
 private:
     size_t m_threads_num = 0lu;
 };
 #else
-template <typename T>
+template <typename T, typename U>
 struct LLMMLP::Executor : public LLMMLP::ExecutorBase {
     Executor(LLMMLP*, const LLMMLPNode::Config&, const DnnlScratchPadPtr&) {}
     void execute() {}
@@ -515,6 +517,7 @@ void LLMMLP::initSupportedPrimitiveDescriptors() {
     std::vector<PortConfigurator> outPortConfigs;
 
     auto rtPrecision = getOriginalInputPrecisionAtPort(0);
+    auto outPrecision = getOriginalOutputPrecisionAtPort(0);
 
     if (rtPrecision == ov::element::f32) {
         // fallback to supported precision if possible
@@ -559,7 +562,7 @@ void LLMMLP::initSupportedPrimitiveDescriptors() {
                                        -1);  // down_weight scales per OC
 
         // initialize output port
-        outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1);
+        outPortConfigs.emplace_back(LayoutType::ncsp, outPrecision, getOutputShapeAtPort(0), false, -1);
     } else {
         auto weightPrecision = ov::element::f16;
 
@@ -570,22 +573,31 @@ void LLMMLP::initSupportedPrimitiveDescriptors() {
         inPortConfigs.emplace_back(LayoutType::ncsp, weightPrecision, getInputShapeAtPort(3), false, -1);  // down
 
         // initialize output port
-        outPortConfigs.emplace_back(LayoutType::ncsp, rtPrecision, getOutputShapeAtPort(0), false, -1);
+        outPortConfigs.emplace_back(LayoutType::ncsp, outPrecision, getOutputShapeAtPort(0), false, -1);
     }
     addSupportedPrimDesc(inPortConfigs, outPortConfigs, impl_desc_type::ref_any);
 }
 
+// Selects the Executor<T, U> specialization for this node: T follows the precision of input port 0 and U the
+// precision of output port 0. Executors are only created when OPENVINO_ARCH_X86_64 is defined, for the pairs
+// bf16 -> f32, bf16 -> bf16, f16 -> f32 and f16 -> f16. For every other combination nothing is assigned
+// here, and the final check throws if m_executor is still empty.
 void LLMMLP::createPrimitive() {
     auto rtPrecision = getInputPrecisions()[0];
+    auto outPrecision = getOutputPrecisions()[0];
 #ifdef OPENVINO_ARCH_X86_64
     if (rtPrecision == ov::element::bf16) {
-        m_executor = std::make_shared<Executor<ov::bfloat16>>(this, m_mlp_config, context->getScratchPad());
+        // bf16 runtime precision: output is either widened to f32 or kept in bf16
+        if (outPrecision == ov::element::f32) {
+            m_executor = std::make_shared<Executor<ov::bfloat16, float>>(this, m_mlp_config, context->getScratchPad());
+        } else if (outPrecision == ov::element::bf16) {
+            m_executor = std::make_shared<Executor<ov::bfloat16, ov::bfloat16>>(this, m_mlp_config, context->getScratchPad());
+        }
     } else if (rtPrecision == ov::element::f16) {
-        m_executor = std::make_shared<Executor<ov::float16>>(this, m_mlp_config, context->getScratchPad());
+        // f16 runtime precision: output is either widened to f32 or kept in f16
+        if (outPrecision == ov::element::f32) {
+            m_executor = std::make_shared<Executor<ov::float16, float>>(this, m_mlp_config, context->getScratchPad());
+        } else if (outPrecision == ov::element::f16) {
+            m_executor = std::make_shared<Executor<ov::float16, ov::float16>>(this, m_mlp_config, context->getScratchPad());
+        }
     }
 #endif
     if (!m_executor) {
-        OPENVINO_THROW("LLMMLP Executor creation fails with precision " + rtPrecision.to_string());
+        OPENVINO_THROW("LLMMLP Executor creation fails with runtime precision " + rtPrecision.to_string() + ", output precision " + outPrecision.to_string());
     }
 }
 

@@ -39,7 +39,7 @@ class LLMMLP : public Node {
         virtual ~ExecutorBase() = default;
     };
     std::shared_ptr<ExecutorBase> m_executor;
-    template <typename T>
+    template <typename T, typename U>
     struct Executor;
     LLMMLPNode::Config m_mlp_config;
 };

@@ -67,11 +67,21 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase {
 
     WeightBuffer wbuffer;
 
+    ov::element::Type output_type = ov::element::undefined;
+
     Executor(QKVProjection* pnode, DnnlScratchPadPtr scrachPad) : m_node(pnode), m_scrachPad(std::move(scrachPad)) {
         PlainTensor w0(pnode->getSrcMemoryAtPort(1));
         PlainTensor w1(pnode->getSrcMemoryAtPort(2));
         PlainTensor w2(pnode->getSrcMemoryAtPort(3));
 
+        if (std::is_same<T, ov::float16>::value) {
+            output_type = ov::element::f16;
+        } else if (std::is_same<T, ov::bfloat16>::value) {
+            output_type = ov::element::bf16;
+        } else {
+            OPENVINO_THROW("QKVProjection Executor creation fails with output precision " + std::string(typeid(T).name()));
+        }
+
         // in quantized mode, weights are already quantized in per-OC mode into INT8
         // and activations will be dynamically per-token quantized and using AMX-INT8 to get the result
         bool quantized_int8 = m_node->m_config.quantized;
@@ -200,7 +210,7 @@ struct QKVProjection::Executor : public QKVProjection::ExecutorBase {
     }
 
     void execute() override {
-        static ReduceAdd2bh jit_cvt(false, std::is_same<T, ov::float16>::value);
+        static ReduceAdd2bh jit_cvt(false, output_type);
 
         auto input = m_node->getSrcMemoryAtPort(0);
         const auto& ishape = input->getStaticDims();

@@ -0,0 +1,61 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "fc_convert_fusion.hpp"
+
+#include <utils/general_utils.h>
+
+#include <openvino/core/rt_info.hpp>
+#include <openvino/pass/pattern/op/wrap_type.hpp>
+#include <transformations/utils/utils.hpp>
+
+#include "itt.hpp"
+#include "ov_ops/fully_connected.hpp"
+
+namespace ov {
+namespace intel_cpu {
+
+// Registers a matcher that folds `FullyConnected -> Convert` into a single FullyConnected.
+//
+// Pattern: a two-input FullyConnected whose output has exactly one consumer, followed by a Convert whose
+// output element type is f32. On a match the Convert is replaced by a new FullyConnected built from the same
+// two inputs and carrying the Convert's output type; the new node takes over the Convert's friendly name and
+// the runtime info of all matched nodes.
+FcConvertFusion::FcConvertFusion() {
+    MATCHER_SCOPE(FcConvertFusion);
+    using namespace ov::pass::pattern;
+
+    auto a = any_input();
+    auto b = any_input();
+    auto fc = wrap_type<ov::op::internal::FullyConnected>({a, b}, consumers_count(1));
+    auto convert = wrap_type<ov::op::v0::Convert>({fc}, type_matches(ov::element::f32));
+
+    ov::matcher_pass_callback callback = [=](Matcher& m) {
+        const auto& pattern_map = m.get_pattern_value_map();
+
+        // Keep the matched inputs as Output<Node> (node + port index). Going through get_node_shared_ptr()
+        // drops the port index, so an input taken from a non-default output of a multi-output producer would
+        // be rewired to the producer's default output when the new FullyConnected is built.
+        const auto& m_a = pattern_map.at(a);
+        const auto& m_b = pattern_map.at(b);
+        const auto& m_fc = pattern_map.at(fc).get_node_shared_ptr();
+
+        // The fusion is rejected only when NEITHER input has a floating-point type (f16 / bf16 / f32).
+        // NOTE(review): with `&&` a single floating-point input is enough to proceed; if both inputs are
+        // meant to be floating-point this should be `||` - confirm the intent.
+        if (!one_of(m_a.get_element_type(), ov::element::f16, ov::element::bf16, ov::element::f32) &&
+            !one_of(m_b.get_element_type(), ov::element::f16, ov::element::bf16, ov::element::f32)) {
+            return false;
+        }
+
+        // The FullyConnected must have a single output that feeds only the matched Convert; otherwise
+        // changing its output type would affect other consumers.
+        const auto out = m_fc->outputs();
+        const bool has_only_child = (out.size() == 1) && (out[0].get_target_inputs().size() == 1);
+        if (!has_only_child) {
+            return false;
+        }
+
+        const auto& m_convert = pattern_map.at(convert).get_node_shared_ptr();
+        auto output_type = m_convert->get_output_element_type(0);
+        auto new_fc = std::make_shared<ov::op::internal::FullyConnected>(m_a, m_b, output_type);
+
+        new_fc->set_friendly_name(m_convert->get_friendly_name());
+        copy_runtime_info(m.get_matched_nodes(), new_fc);
+        replace_node(m_convert, new_fc);
+        return true;
+    };
+
+    auto m = std::make_shared<ov::pass::pattern::Matcher>(convert, matcher_name);
+    this->register_matcher(m, callback);
+}
+
+}  // namespace intel_cpu
+}  // namespace ov
@@ -0,0 +1,18 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <openvino/pass/graph_rewrite.hpp>
+
+namespace ov {
+namespace intel_cpu {
+// Matcher pass of the CPU plugin that fuses a FullyConnected followed by a Convert to f32 into a single
+// FullyConnected producing the converted type directly. The pattern and the rewrite are set up in the
+// constructor.
+class FcConvertFusion : public ov::pass::MatcherPass {
+public:
+    OPENVINO_RTTI("FcConvertFusion", "0");
+    FcConvertFusion();
+};
+
+}  // namespace intel_cpu
+}  // namespace ov
@@ -8,6 +8,7 @@
 #include "common/pass/convert_to_leaky_relu.hpp"
 #include "common/pass/convert_to_power_static.hpp"
 #include "common/pass/convert_to_swish_cpu.hpp"
+#include "common/pass/fc_convert_fusion.hpp"
 #include "common/pass/fc_bias_fusion.hpp"
 #include "common/pass/move_fc_reshape_to_weights.hpp"
 #include "common/pass/move_readvalue_inputs_to_subgraph.hpp"
@@ -53,6 +54,7 @@ inline void ConvertToCPUSpecificOpset(std::shared_ptr<ov::Model>& model, const C
     CPU_REGISTER_PASS_X64(manager, pass::ConvertFCToFCQuantizedLegacy);
     CPU_REGISTER_PASS_COMMON(manager, MoveFCReshapeToWeights);
     CPU_REGISTER_PASS_COMMON(manager, ov::pass::Validate);
+    CPU_REGISTER_PASS_X64(manager, FcConvertFusion);
     CPU_REGISTER_PASS_COMMON(manager, AlignMatMulInputRanks);
     CPU_REGISTER_PASS_COMMON(manager, ConvertTileToSeqTiles);
     CPU_REGISTER_PASS_COMMON(manager, ConvertToPowerStatic);

@@ -58,7 +58,8 @@ void LLMMLPNode::validate_and_infer_types() {
 
     auto oshape = ishape;
     oshape[oshape.size() - 1] = w_down_shape[0];
-    set_output_type(0, itype, oshape);
+    auto otype = m_output_type == ov::element::undefined ? itype : m_output_type;
+    set_output_type(0, otype, oshape);
 }
 
 std::shared_ptr<Node> LLMMLPNode::clone_with_new_inputs(const ov::OutputVector& new_args) const {