From 3caf7b21f1e9cbc5e164cd8ed1ce999e0d901e8c Mon Sep 17 00:00:00 2001
From: Alexandra Sidorova
Date: Wed, 15 Jan 2025 11:39:03 +0400
Subject: [PATCH] [CPU] Transpose insertion before FC in ConvertMatMulToFC (#28401)

### Details (Updated):
- *Previously, if a `MatMul` had `transposed_b=false` and a decompression `Convert` on weights, the `ConvertMatMulToFC` pass inserted a `Transpose` before this `Convert`. This means that if the `Convert` had another consumer (a `Result`, or even another `MatMul` with `transposed_b=true`), the inserted `Transpose` could break the shapes of the `Convert` consumers (please see details in the mentioned ticket). The current PR inserts the `Transpose` after the existing `Convert` and updates the CPU graph pass `FuseFCAndConvertOnWeights` accordingly.*

### Tickets:
- *160215*
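### Reproducer (illustrative):
A minimal sketch of the affected topology, written against the public OpenVINO C++ API. It is not part of this patch: the `make_repro` helper, the shapes (taken from the new `smoke_FC_2D_FP16_3` test case below), and the `transformations/rt_info/decompression.hpp` include path (as used inside the OpenVINO source tree) are assumptions for illustration:

```cpp
// Sketch of the failing topology: an f16 weights Constant is decompressed by a
// shared Convert that feeds both a MatMul (transpose_b=false) and a Result.
// Before this patch, ConvertMatMulToFC put the weights Transpose above the
// Convert, so the Result consumer observed a transposed {64, 32} shape
// instead of the original {32, 64}.
#include "openvino/core/model.hpp"
#include "openvino/opsets/opset1.hpp"
#include "transformations/rt_info/decompression.hpp"  // ov::mark_as_decompression

std::shared_ptr<ov::Model> make_repro() {
    auto activations = std::make_shared<ov::opset1::Parameter>(ov::element::f32, ov::Shape{1, 16, 32});
    auto weights = ov::opset1::Constant::create(ov::element::f16, ov::Shape{32, 64}, {1});
    // Decompression Convert shared by two consumers
    auto convert = std::make_shared<ov::opset1::Convert>(weights, ov::element::f32);
    ov::mark_as_decompression(convert);
    // transpose_b=false: ConvertMatMulToFC has to insert a weights Transpose
    auto matmul = std::make_shared<ov::opset1::MatMul>(activations, convert, false, false);
    auto mm_result = std::make_shared<ov::opset1::Result>(matmul);
    auto convert_result = std::make_shared<ov::opset1::Result>(convert);  // must stay {32, 64}
    return std::make_shared<ov::Model>(ov::ResultVector{mm_result, convert_result},
                                       ov::ParameterVector{activations});
}
```

With this fix, the `Transpose` is inserted between the `Convert` and the `FullyConnected` weights input, so the extra `Result` keeps the original `{32, 64}` shape.
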
---
 src/plugins/intel_cpu/src/graph_optimizer.cpp |  41 +++--
 .../common/pass/convert_matmul_to_fc.cpp      |  14 +-
 .../src/common/matmul_decompress_convert.cpp  | 149 +++++++++++++++++-
 .../transformations/convert_matmul_test.cpp   |  12 +-
 4 files changed, 181 insertions(+), 35 deletions(-)

diff --git a/src/plugins/intel_cpu/src/graph_optimizer.cpp b/src/plugins/intel_cpu/src/graph_optimizer.cpp
index 10f7b485bc0a16..cb1324e7435703 100644
--- a/src/plugins/intel_cpu/src/graph_optimizer.cpp
+++ b/src/plugins/intel_cpu/src/graph_optimizer.cpp
@@ -712,27 +712,50 @@ void GraphOptimizer::FuseFCAndConvertOnWeights(Graph& graph) {
     // This optimization fuses Convert (fp16 -> bf16/fp32) on weights directly to FC input to allow precision conversion
     // handling based on internal logic (e.g. fuse conversion with weights reordering)
+
+    auto isSuitableTranspose = [](const NodePtr& node) {
+        return node->getType() == Type::Transpose && node->getChildEdges().size() == 1 && node->isConstant();
+    };
+    auto isSuitableConvert = [&](const NodePtr& node) {
+        return node->getType() == Type::Convert && node->isConstant() &&
+               one_of(node->getOriginalInputPrecisionAtPort(0), ov::element::f16, ov::element::bf16) &&
+               one_of(node->getOriginalOutputPrecisionAtPort(0), ov::element::f32, ov::element::bf16);
+    };
+
     auto& graphNodes = graph.GetNodes();
     for (const auto& fullyConnected : graphNodes) {
         if (fullyConnected->getType() != Type::FullyConnected) {
             continue;
         }
-        const auto convert = fullyConnected->getParentEdgeAt(1)->getParent();
-        if (convert->getType() != Type::Convert ||
-            !one_of(convert->getOriginalInputPrecisionAtPort(0), ov::element::f16, ov::element::bf16) ||
-            !one_of(convert->getOriginalOutputPrecisionAtPort(0), ov::element::f32, ov::element::bf16) ||
-            !convert->isConstant()) {
-            continue;
+
+        NodePtr transpose = nullptr;
+        auto parent = fullyConnected->getParentEdgeAt(1)->getParent();
+        if (parent->getType() == Type::Transpose) {
+            if (!isSuitableTranspose(parent))
+                continue;
+
+            transpose = parent;
+            parent = transpose->getParentEdgeAt(0)->getParent();
         }
+
+        const auto convert = parent;
+        if (!isSuitableConvert(convert))
+            continue;
+
         const auto weights = convert->getParentEdgeAt(0)->getParent();
         const auto weights_out_edge = weights->getChildEdges()[0].lock();
-        const auto fc_weights_path_edge = fullyConnected->getParentEdgeAt(1);
+        const auto fc_weights_path_edge =
+            transpose ? transpose->getParentEdgeAt(0) : fullyConnected->getParentEdgeAt(1);
         const auto inNum = weights_out_edge->getInputNum();
         const auto outNum = fc_weights_path_edge->getOutputNum();
-        fullyConnected->setOriginalInputPrecisionAtPort(1, convert->getOriginalInputPrecisionAtPort(0));
+        const auto originalPrecision = convert->getOriginalInputPrecisionAtPort(0);
+        fullyConnected->setOriginalInputPrecisionAtPort(1, originalPrecision);
+        if (transpose) {
+            transpose->setOriginalInputPrecisionAtPort(0, originalPrecision);
+            transpose->setOriginalOutputPrecisionAtPort(0, originalPrecision);
+        }
         graph.RemoveEdge(fc_weights_path_edge);
-        graph.CreateEdge(weights, fullyConnected, inNum, outNum);
+        graph.CreateEdge(weights, transpose ? transpose : fullyConnected, inNum, outNum);
         if (convert->getChildEdges().empty()) {
             graph.DropNode(convert);
         }
diff --git a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp
index bf9fb16f8dab7c..0f2252fd5d256f 100644
--- a/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp
+++ b/src/plugins/intel_cpu/src/transformations/cpu_opset/common/pass/convert_matmul_to_fc.cpp
@@ -36,12 +36,8 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() {
         // So in case of adding new operations that takes matmul inputs we need keep update fc_input_a and fc_input_b.
         auto fc_input_a = pattern_map.at(activations_m);
         auto fc_input_b = pattern_map.at(weights_m);
-        bool is_convert = false;
         if (auto convert_node = ov::as_type_ptr<ov::op::v0::Convert>(fc_input_b.get_node_shared_ptr())) {
-            if (is_decompression(convert_node)) {
-                is_convert = true;
-                fc_input_b = convert_node->get_input_node_shared_ptr(0);
-            } else {
+            if (!is_decompression(convert_node)) {
                 return false;
             }
         }
@@ -151,14 +147,6 @@ ov::intel_cpu::ConvertMatMulToFC::ConvertMatMulToFC() {
             fc_input_a = create_transpose(fc_input_a, matmul->get_friendly_name() + "/transpose_a");
         }
 
-        // Connect Convert to new input if needed
-        if (is_convert) {
-            auto convert = pattern_map.at(weights_m).get_node_shared_ptr();
-            convert->input(0).replace_source_output(fc_input_b);
-            convert->validate_and_infer_types();
-            fc_input_b = convert;
-        }
-
         auto bias = std::make_shared<ov::op::v0::Constant>(element::undefined, Shape{0});
         new_ops.push_back(bias);
diff --git a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/matmul_decompress_convert.cpp b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/matmul_decompress_convert.cpp
index 383385e9e5c1db..aa68ca17db7375 100644
--- a/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/matmul_decompress_convert.cpp
+++ b/src/plugins/intel_cpu/tests/functional/custom/subgraph_tests/src/common/matmul_decompress_convert.cpp
@@ -222,17 +222,18 @@ class MatMulDecompressConvertTest : public testing::WithParamInterface<MatMulDecompressConvertParams>,
-using MatMulDecompressConvertParams = std::tuple<std::vector<InputShape>,  // input shapes
-                                                 std::pair<bool, bool>,    // transposeA, transposeB
-                                                 ElementType,              // weights precision
-                                                 ov::AnyMap,               // additional property
-                                                 CPUSpecificParams>;
 class MatMulDecompressConvertTest2 : public MatMulDecompressConvertTest {
 protected:
@@ -519,5 +515,144 @@ INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_2,
 } // namespace
 
+
+/* This test covers the NNCF case when the decompression convert has not only a MatMul consumer.
+ * Graph before:
+
+      ------------                 ---------------
+      |Input(f32)|                 |Constant(f16)|
+      ------------                 ---------------
+           |                              |
+           |            ---------------------------------
+           |            |Convert(decompression f16->f32)|
+           |            ---------------------------------
+           |                    |                |
+   -----------------------------          ------------
+   |MatMul (transposed_b=false)|          |  Result  |
+   -----------------------------          ------------
+                 |
+           ------------
+           |  Result  |
+           ------------
+
+ * Exec graph:
+
+      ------------        -----------------------------
+      |Input(f32)|        |       Constant(f16)       |
+      ------------        -----------------------------
+           |                   |                 |
+           |             -------------   ---------------------
+           |             | Transpose |   | Convert(f16->f32) |
+           |             -------------   ---------------------
+           |                   |                 |
+   -----------------------------          ------------
+   |      FullyConnected       |          |  Result  |
+   -----------------------------          ------------
+                 |
+           ------------
+           |  Result  |
+           ------------
+*/
+
+class MatMulDecompressConvertTest3 : public MatMulDecompressConvertTest {
+protected:
+    void SetUp() override {
+        targetDevice = ov::test::utils::DEVICE_CPU;
+
+        std::vector<InputShape> inputShapes;
+        std::pair<bool, bool> transpose;
+        ElementType weiConstElemType;
+        ov::AnyMap additionalConfig;
+        CPUSpecificParams cpuParams;
+
+        std::tie(inputShapes, transpose, weiConstElemType, additionalConfig, cpuParams) = this->GetParam();
+        std::tie(inFmts, outFmts, priority, selectedType) = cpuParams;
+
+        init_input_shapes(inputShapes);
+
+        bool transpA = transpose.first;
+        bool transpB = transpose.second;
+
+        if (transpA)
+            transposeCount++;
+        if (!transpB)
+            transposeCount++;
+
+        if (transpA) {
+            transpose_shape(inputDynamicShapes[0]);
+            for (auto& shapes : targetStaticShapes) {
+                transpose_shape(shapes[0]);
+            }
+        }
+        if (transpB) {
+            transpose_shape(inputDynamicShapes[1]);
+            for (auto& shapes : targetStaticShapes) {
+                transpose_shape(shapes[1]);
+            }
+        }
+
+        const auto& inShapeA = inputDynamicShapes[0];
+        const auto& inShapeB = inputDynamicShapes[1];
+
+        configuration.insert(additionalConfig.begin(), additionalConfig.end());
+
+        ElementType netType = ElementType::f32;
+        ElementType convertOutType = ElementType::f32;
+        inType = outType = netType;
+
+        std::string cpuNodeType = "FullyConnected";
+        selectedType = makeSelectedTypeStr(selectedType, outType);
+
+        ov::ParameterVector params{std::make_shared<ov::op::v0::Parameter>(inType, inShapeA)};
+        std::shared_ptr<ov::Node> inputB = ov::test::utils::make_constant(weiConstElemType, inShapeB.get_shape());
+        inputB = std::make_shared<ov::op::v0::Convert>(inputB, convertOutType);
+        mark_as_decompression(inputB);
+        expectedWeiConstElemType = weiConstElemType;
+        convertCount = 1;
+
+        auto matMul = std::make_shared<ov::op::v0::MatMul>(params[0], inputB, transpA, transpB);
+        auto result0 = std::make_shared<ov::op::v0::Result>(matMul);
+        auto result1 = std::make_shared<ov::op::v0::Result>(inputB);
+        result1->set_friendly_name("ConstantResult");
+
+        modifyGraph(netType, params, matMul);
+        function = std::make_shared<ov::Model>(ov::ResultVector{result0, result1}, params, "MatMulDecompressed3");
+    }
+
+    void check_execution_graph() override {
+        MatMulDecompressConvertTest::check_execution_graph();
+
+        // Check that the Result has the correct shape: the same as the original Constant
+        const auto results = compiledModel.outputs();
+        const auto result_it = std::find_if(results.cbegin(), results.cend(),
+                                            [](const ov::Output<const ov::Node>& out) {
+                                                return out.get_node()->get_friendly_name() == "ConstantResult";
+                                            });
+        ASSERT_NE(result_it, results.cend()) << "Target Result has not been found!";
+        ASSERT_EQ(result_it->get_partial_shape(), inputDynamicShapes[1])
+            << "Target Result does not have the original shape. It has: " << result_it->get_partial_shape()
+            << " but should have the original: " << inputDynamicShapes[1];
It has: " << result_it->get_partial_shape() << " but should have origin: " << inputDynamicShapes[1]; + } +}; + +TEST_P(MatMulDecompressConvertTest3, CompareWithRefs) { + SKIP_IF_CURRENT_TEST_IS_DISABLED(); + run(); + check_execution_graph(); +} + +namespace { +const auto testParams2D_FP16_3_smoke = + ::testing::Combine(::testing::Values(static_shapes_to_test_representation({{1, 16, 32}, {32, 64}})), + ::testing::Values(std::pair{false, false}), + ::testing::Values(ElementType::f16), + ::testing::Values(emptyConfig), + ::testing::ValuesIn(filter_specific_params(false))); + +INSTANTIATE_TEST_SUITE_P(smoke_FC_2D_FP16_3, + MatMulDecompressConvertTest3, + testParams2D_FP16_3_smoke, + MatMulDecompressConvertTest3::getTestCaseName); + +} // namespace + } // namespace test } // namespace ov diff --git a/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp b/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp index 3d2ab245d54c22..a7ed7296281c8f 100644 --- a/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp +++ b/src/plugins/intel_cpu/tests/unit/transformations/convert_matmul_test.cpp @@ -461,13 +461,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_decompress_convert_0) { auto input1 = std::make_shared(ov::element::f32, ov::Shape{3, 2, 2}); auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); + auto convert = std::make_shared(input2, ov::element::f32); auto transpose_constant = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); - auto transpose = std::make_shared(input2, transpose_constant); - auto convert = std::make_shared(transpose, ov::element::f32); + auto transpose = std::make_shared(convert, transpose_constant); auto matmul = std::make_shared( input1, - convert, + transpose, std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1}); @@ -491,13 +491,13 @@ TEST_F(TransformationTestsF, ConvertMatMulToFCTest_decompress_convert_1) { auto transpose1 = std::make_shared(input1, transpose_constant1); auto input2 = ov::opset1::Constant::create(ov::element::f16, ov::Shape{1, 2, 2}, {1}); + auto convert = std::make_shared(input2, ov::element::f32); auto transpose_constant2 = ov::opset1::Constant::create(ov::element::i32, ov::Shape{3}, {0, 2, 1}); - auto transpose2 = std::make_shared(input2, transpose_constant2); - auto convert = std::make_shared(transpose2, ov::element::f32); + auto transpose2 = std::make_shared(convert, transpose_constant2); auto matmul = std::make_shared( transpose1, - convert, + transpose2, std::make_shared(ov::element::undefined, ov::Shape{0})); model_ref = std::make_shared(ov::NodeVector{matmul}, ov::ParameterVector{input1});