From 0d261dbf8311067f4dbc948588a38ebee8918cbf Mon Sep 17 00:00:00 2001
From: "Min, Byungil" <byungil.min@intel.com>
Date: Thu, 5 Jan 2023 15:44:12 +0900
Subject: [PATCH] Add decomposing Reduce for Bugfix of byx reduction (#14449)

+ Add transformation to fix accuracy issue of oneDNN reduction : DecomposeReduceForFalseKeepdims
+ Add Reshape to modify output of Reduce and update keep_dims to true : reduce-reshape
+ Add exception logic for unsupported reduce mode by byx conversion

Signed-off-by: Min, Byungil <byungil.min@intel.com>
---
 .../intel_gpu/src/graph/layout_optimizer.cpp  |  16 +-
 .../decompose_reduce_for_false_keepdims.cpp   | 119 ++++++++++
 .../decompose_reduce_for_false_keepdims.hpp   |  28 +++
 .../src/plugin/transformations_pipeline.cpp   |   2 +
 ...compose_reduce_for_false_keepdims_test.cpp | 214 ++++++++++++++++++
 5 files changed, 374 insertions(+), 5 deletions(-)
 create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_for_false_keepdims.cpp
 create mode 100644 src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_for_false_keepdims.hpp
 create mode 100644 src/plugins/intel_gpu/tests/transformations/decompose_reduce_for_false_keepdims_test.cpp
diff --git a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
index cadd0f9173956c..06c90ed217d375 100644
--- a/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
+++ b/src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
@@ -99,14 +99,20 @@ static bool is_reduce_blocked_axes(reduce_node const& node) {
     auto num_spatial = format::spatial_num(node.get_output_layout().format);
     auto dims = node.get_output_layout().format.dimension();
 
+    // Check if it reduces all spatial axes
+    bool feature_axis_is_only_remaining = true;
+    for (size_t idx_spatial = (dims - num_spatial); idx_spatial < dims; idx_spatial++) {
+        if (count(reduce_axes.begin(), reduce_axes.end(), idx_spatial) == 0) {
+            feature_axis_is_only_remaining = false;
+            break;
+        }
+    }
 
     if (input_layout.is_static() &&
         (count(reduce_axes.begin(), reduce_axes.end(), 1) > 0 ||
-        (count(reduce_axes.begin(), reduce_axes.end(), 0) > 0 && input_layout.batch() > 1))) {
-        for (size_t idx_spatial = dims - num_spatial ; idx_spatial < dims ; idx_spatial++) {
-            if (count(reduce_axes.begin(), reduce_axes.end(), idx_spatial) == 0)
-                return true;
-        }
+        (count(reduce_axes.begin(), reduce_axes.end(), 0) > 0))) {
+        if (!feature_axis_is_only_remaining)
+            return true;
     }
 
     return false;
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_for_false_keepdims.cpp b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_for_false_keepdims.cpp
new file mode 100644
index 00000000000000..087f083b7c8f1d
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_for_false_keepdims.cpp
@@ -0,0 +1,119 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "decompose_reduce_for_false_keepdims.hpp"
+
+#include <algorithm>
+#include <cassert>
+#include <memory>
+#include <ngraph/opsets/opset10.hpp>
+#include <ngraph/pattern/op/wrap_type.hpp>
+#include <ngraph/rt_info.hpp>
+#include <vector>
+
+namespace ov {
+namespace intel_gpu {
+
+DecomposeReduceForFalseKeepDims::DecomposeReduceForFalseKeepDims() {
+    // Single pattern for ReduceSum/Mean/Prod/Min/Max: static-shape data input, Constant axes, static-shape reduce
+    auto reduce_pattern = ngraph::pattern::wrap_type<ngraph::opset10::ReduceSum,
+                                                     ngraph::opset10::ReduceMean,
+                                                     ngraph::opset10::ReduceProd,
+                                                     ngraph::opset10::ReduceMin,
+                                                     ngraph::opset10::ReduceMax>(
+        {ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
+         ngraph::pattern::wrap_type<ngraph::opset10::Constant>()},
+        ngraph::pattern::has_static_shape());
+
+    // Callback run on a matched reduce; returns true only when the reduce was replaced by Reduce + Reshape
+    ov::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
+        const auto& pattern_map = m.get_pattern_value_map();
+        auto reduce =
+            as_type_ptr<op::util::ArithmeticReductionKeepDims>(pattern_map.at(reduce_pattern).get_node_shared_ptr());
+        if (!reduce)
+            return false;
+
+        auto input = reduce->input_value(0);
+        const auto input_shape = input.get_shape();
+        const auto reduce_shape = reduce->output(0).get_shape();
+        const auto input_rank = input.get_partial_shape().rank().get_length();
+
+        auto axes_vector = reduce->get_reduction_axes().to_vector();
+        std::sort(axes_vector.begin(), axes_vector.end());
+
+        if (!reduce->get_keep_dims() &&
+            need_transformation_for_reordered_axes(axes_vector, input_rank, (input_rank - 2)) &&
+            input_shape.size() < 6) {
+            ngraph::NodeVector new_ops;
+
+            // Reduce: the new node uses the same (sorted) reduction axes but is created with keep_dims = true
+            auto reduce_const =
+                ngraph::opset10::Constant::create(ngraph::element::i64, ngraph::Shape{axes_vector.size()}, axes_vector);
+
+            // Re-create the reduce with its original mode (the modes supported by oneDNN); bail out on any other type
+            if (ngraph::is_type<ngraph::opset10::ReduceSum>(reduce))
+                input = std::make_shared<ngraph::opset10::ReduceSum>(input, reduce_const, true);
+            else if (ngraph::is_type<ngraph::opset10::ReduceMean>(reduce))
+                input = std::make_shared<ngraph::opset10::ReduceMean>(input, reduce_const, true);
+            else if (ngraph::is_type<ngraph::opset10::ReduceMin>(reduce))
+                input = std::make_shared<ngraph::opset10::ReduceMin>(input, reduce_const, true);
+            else if (ngraph::is_type<ngraph::opset10::ReduceMax>(reduce))
+                input = std::make_shared<ngraph::opset10::ReduceMax>(input, reduce_const, true);
+            else if (ngraph::is_type<ngraph::opset10::ReduceProd>(reduce))
+                input = std::make_shared<ngraph::opset10::ReduceProd>(input, reduce_const, true);
+            else
+                return false;
+
+            input.get_node_shared_ptr()->set_friendly_name(reduce->get_friendly_name());
+            new_ops.push_back(input.get_node_shared_ptr());
+
+            // Reshape: bring the keep_dims = true result back to the 1D shape of the original reduce output
+            auto reshape_shape = ngraph::Shape((input_rank - axes_vector.size()), 1);
+            // Only the feature axis is expected to stay un-reduced unless a new case is added to this decomposition.
+            assert(reshape_shape.size() == 1);
+            reshape_shape[0] = reduce_shape[0];
+            input = std::make_shared<ngraph::opset10::Reshape>(
+                input,
+                ngraph::opset10::Constant::create(ngraph::element::i64,
+                                                  ngraph::Shape{reshape_shape.size()},
+                                                  reshape_shape),
+                false);
+
+            input.get_node_shared_ptr()->set_friendly_name(reduce->get_friendly_name() + "_reshape_false_keepdims");
+            new_ops.push_back(input.get_node_shared_ptr());
+
+            ngraph::copy_runtime_info(reduce, new_ops);
+            reduce->output(0).replace(input);
+            return true;
+        }
+
+        return false;
+    };
+
+    auto m = std::make_shared<ngraph::pattern::Matcher>(reduce_pattern, "DecomposeReduceForFalseKeepDims");
+    register_matcher(m, callback);
+}
+
+bool DecomposeReduceForFalseKeepDims::need_transformation_for_reordered_axes(std::vector<int64_t> reduce_axes,
+                                                                             size_t num_dim,
+                                                                             size_t num_spatial) {
+    // Tells whether the given axis index is listed in reduce_axes.
+    const auto is_reduced = [&reduce_axes](size_t axis) {
+        return std::count(reduce_axes.begin(), reduce_axes.end(), axis) != 0;
+    };
+
+    // Candidate case: several axes are reduced, the batch axis (0) among them, the feature axis (1) is kept.
+    if (reduce_axes.size() <= 1 || !is_reduced(0) || is_reduced(1))
+        return false;
+
+    // All spatial axes, i.e. the last num_spatial of the num_dim axes, have to be reduced as well.
+    for (size_t spatial_axis = num_dim - num_spatial; spatial_axis < num_dim; ++spatial_axis) {
+        if (!is_reduced(spatial_axis))
+            return false;
+    }
+    return true;
+}
+
+}  // namespace intel_gpu
+}  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_for_false_keepdims.hpp b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_for_false_keepdims.hpp
new file mode 100644
index 00000000000000..3f2f294d2564f7
--- /dev/null
+++ b/src/plugins/intel_gpu/src/plugin/transformations/decompose_reduce_for_false_keepdims.hpp
@@ -0,0 +1,28 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include <ngraph/pass/graph_rewrite.hpp>
+#include <transformations_visibility.hpp>
+
+namespace ov {
+namespace intel_gpu {
+
+/**
+ * @brief Replace Reduce(keep_dims=false) with Reduce(keep_dims=true) followed by Reshape : reduce-reshape
+ *        A clDNN Reduce reorders un-reduced axes of its output tensor to b-f and spatial order when keep_dims is false.
+ *        oneDNN reduction does not allow this, and clDNN execution shows a huge perf drop for blocked formats.
+ */
+class DecomposeReduceForFalseKeepDims : public ngraph::pass::MatcherPass {
+public:
+    // Registers the matcher: a reduce is decomposed if keep_dims is false and it reduces batch and all spatial axes
+    DecomposeReduceForFalseKeepDims();
+
+    // Returns true if reduce_axes has the batch axis (0) and the last num_spatial of num_dim axes, but not feature (1)
+    bool need_transformation_for_reordered_axes(std::vector<int64_t> reduce_axes, size_t num_dim, size_t num_spatial);
+};
+
+}  // namespace intel_gpu
+}  // namespace ov
diff --git a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
index 1062aaa168a0da..ac31397dbc94a7 100644
--- a/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
+++ b/src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
@@ -28,6 +28,7 @@
 
 #include "transformations/einsum_decomposition.hpp"
 #include "transformations/convert_pooling_to_reduce.hpp"
+#include "transformations/decompose_reduce_for_false_keepdims.hpp"
 
 #include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
 #include <transformations/opset_conversions/convert_opset2_to_opset1.hpp>
@@ -266,6 +267,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
             pass_config->disable<ngraph::pass::ConvertReduceMeanToPooling>();
             pass_config->disable<ngraph::pass::ConvertReduceMaxToPooling>();
             manager.register_pass<ConvertAvgPoolingToReduce>();
+            manager.register_pass<DecomposeReduceForFalseKeepDims>();
         } else {
             pass_config->set_callback<ngraph::pass::ConvertReduceSumToPooling>(
             [](const_node_ptr &node) -> bool {
diff --git a/src/plugins/intel_gpu/tests/transformations/decompose_reduce_for_false_keepdims_test.cpp b/src/plugins/intel_gpu/tests/transformations/decompose_reduce_for_false_keepdims_test.cpp
new file mode 100644
index 00000000000000..71c69fb315d287
--- /dev/null
+++ b/src/plugins/intel_gpu/tests/transformations/decompose_reduce_for_false_keepdims_test.cpp
@@ -0,0 +1,214 @@
+// Copyright (C) 2018-2022 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include <gtest/gtest.h>
+
+#include <memory>
+#include <ngraph/function.hpp>
+#include <ngraph/opsets/opset10.hpp>
+#include <ngraph/pass/manager.hpp>
+#include <plugin/transformations/decompose_reduce_for_false_keepdims.hpp>
+#include <string>
+#include <transformations/init_node_info.hpp>
+#include <transformations/utils/utils.hpp>
+#include <tuple>
+
+#include "intel_gpu/primitives/reduce.hpp"
+#include "ngraph/type/element_type.hpp"
+#include "openvino/core/descriptor/tensor.hpp"
+#include "test_utils.h"
+
+using namespace testing;
+using namespace ::tests;
+
+using InputShape = ngraph::PartialShape;
+using KeepDims = bool;
+using ReduceAxes = std::vector<int64_t>;
+using ReduceType = cldnn::reduce_mode;
+using ReshapeShape = std::vector<size_t>;
+using NeedDecompose = bool;
+
+class ReduceDecomposeTests
+    : public ::testing::Test,
+      public testing::WithParamInterface<
+          std::tuple<ReduceType, InputShape, ReduceAxes, KeepDims, NeedDecompose, ReshapeShape>> {
+public:
+    std::shared_ptr<ngraph::Function> fc;  // single-reduce model built in SetUp() from the test parameters
+    bool need_decompose;                   // expectation: the pass is supposed to insert a Reshape
+    ReshapeShape result_shape;             // expectation: shape of the model's Result after the pass
+
+    void SetUp() override {  // unpack the parameter tuple and build the model under test
+        const auto& reduce_type = std::get<0>(GetParam());
+        const auto& input_shape = std::get<1>(GetParam());
+        const auto& axes = std::get<2>(GetParam());
+        const auto& keep_dims = std::get<3>(GetParam());
+        need_decompose = std::get<4>(GetParam());
+        result_shape = std::get<5>(GetParam());
+
+        fc = get_transformed_function(input_shape, axes, reduce_type, keep_dims);
+    }
+
+    static std::shared_ptr<ngraph::Function> get_transformed_function(const ngraph::PartialShape& input_shape,
+                                                                      const std::vector<int64_t>& axes,
+                                                                      const ReduceType& reduce_type,
+                                                                      const bool keep_dim) {
+        auto param = std::make_shared<ngraph::opset10::Parameter>(ngraph::element::f32, input_shape);
+        if (reduce_type == reduce_mode::logical_or || reduce_type == reduce_mode::logical_and)
+            param = std::make_shared<ngraph::opset10::Parameter>(ngraph::element::boolean, input_shape);
+
+        ngraph::Output<ngraph::Node> input = param->output(0);
+
+        auto axes_const = ngraph::opset10::Constant::create(ngraph::element::i64, ngraph::Shape{axes.size()}, axes);
+
+        if (reduce_type == reduce_mode::sum)  // map the cldnn reduce mode onto the matching opset10 Reduce op
+            input = std::make_shared<ngraph::opset10::ReduceSum>(input, axes_const, keep_dim);
+        else if (reduce_type == reduce_mode::mean)
+            input = std::make_shared<ngraph::opset10::ReduceMean>(input, axes_const, keep_dim);
+        else if (reduce_type == reduce_mode::min)
+            input = std::make_shared<ngraph::opset10::ReduceMin>(input, axes_const, keep_dim);
+        else if (reduce_type == reduce_mode::max)
+            input = std::make_shared<ngraph::opset10::ReduceMax>(input, axes_const, keep_dim);
+        else if (reduce_type == reduce_mode::prod)
+            input = std::make_shared<ngraph::opset10::ReduceProd>(input, axes_const, keep_dim);
+        else if (reduce_type == reduce_mode::logical_or)
+            input = std::make_shared<ngraph::opset10::ReduceLogicalOr>(input, axes_const, keep_dim);
+        else if (reduce_type == reduce_mode::logical_and)
+            input = std::make_shared<ngraph::opset10::ReduceLogicalAnd>(input, axes_const, keep_dim);
+        else
+            throw std::runtime_error("Invalid reduce type for this test-case.");
+
+        return std::make_shared<ngraph::Function>(ngraph::NodeVector{input.get_node_shared_ptr()},
+                                                  ngraph::ParameterVector{param});
+    }
+};
+
+TEST_P(ReduceDecomposeTests, CompareFunctions) {
+    // Run the pass under test (after node-info initialization) on the model built in SetUp().
+    ngraph::pass::Manager manager;
+    manager.set_per_pass_validation(false);
+    manager.register_pass<ngraph::pass::InitNodeInfo>();
+    manager.register_pass<ov::intel_gpu::DecomposeReduceForFalseKeepDims>();
+    manager.run_passes(fc);
+
+    // A Reshape in the resulting graph is taken as evidence that the decomposition was applied.
+    bool has_reshape = false;
+    ov::Shape model_output_shape;
+    for (auto& node : fc->get_ops()) {
+        const std::string node_type(node->get_type_name());
+        if (node_type.find("Reshape") != std::string::npos)
+            has_reshape = true;
+        else if (node_type.find("Result") != std::string::npos)
+            model_output_shape = node->get_shape();
+    }
+
+    ASSERT_TRUE(has_reshape == need_decompose);
+    ASSERT_TRUE(model_output_shape == result_shape);
+}
+
+INSTANTIATE_TEST_SUITE_P(ReduceDecomposeForFalseKeepdims,  // keep_dims=false, axes {0, 2, 3}: Reshape expected
+                         ReduceDecomposeTests,
+                         testing::Values(std::make_tuple(reduce_mode::prod,
+                                                         InputShape{32, 32, 32, 32},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{false},
+                                                         true,
+                                                         ReshapeShape{32}),
+                                         std::make_tuple(reduce_mode::sum,
+                                                         InputShape{16, 3, 32, 32},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{false},
+                                                         true,
+                                                         ReshapeShape{3}),
+                                         std::make_tuple(reduce_mode::mean,
+                                                         InputShape{16, 3, 32, 32},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{false},
+                                                         true,
+                                                         ReshapeShape{3}),
+                                         std::make_tuple(reduce_mode::min,
+                                                         InputShape{16, 3, 32, 32},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{false},
+                                                         true,
+                                                         ReshapeShape{3}),
+                                         std::make_tuple(reduce_mode::max,
+                                                         InputShape{16, 3, 32, 32},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{false},
+                                                         true,
+                                                         ReshapeShape{3}),
+                                         std::make_tuple(reduce_mode::max,
+                                                         InputShape{8, 3, 64, 64},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{false},
+                                                         true,
+                                                         ReshapeShape{3})));
+
+INSTANTIATE_TEST_SUITE_P(ReduceDecomposeForFalseKeepdimsNotCase,  // keep_dims=false cases expecting no Reshape
+                         ReduceDecomposeTests,
+                         testing::Values(std::make_tuple(reduce_mode::max,
+                                                         InputShape{32, 32, 32, 32},
+                                                         ReduceAxes{0, 2},
+                                                         KeepDims{false},
+                                                         false,
+                                                         ReshapeShape{32, 32}),
+                                         std::make_tuple(reduce_mode::max,
+                                                         InputShape{1, 3, 64, 64},
+                                                         ReduceAxes{0, 3},
+                                                         KeepDims{false},
+                                                         false,
+                                                         ReshapeShape{3, 64}),
+                                         std::make_tuple(reduce_mode::max,
+                                                         InputShape{32, 32, 32, 32},
+                                                         ReduceAxes{0},
+                                                         KeepDims{false},
+                                                         false,
+                                                         ReshapeShape{32, 32, 32}),
+                                         std::make_tuple(reduce_mode::logical_and,
+                                                         InputShape{16, 3, 32, 32},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{false},
+                                                         false,
+                                                         ReshapeShape{3}),
+                                         std::make_tuple(reduce_mode::logical_or,
+                                                         InputShape{16, 3, 32, 32},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{false},
+                                                         false,
+                                                         ReshapeShape{3}),
+                                         std::make_tuple(reduce_mode::max,
+                                                         InputShape{1, 3, 64, 64},
+                                                         ReduceAxes{0},
+                                                         KeepDims{false},
+                                                         false,
+                                                         ReshapeShape{3, 64, 64})));
+
+INSTANTIATE_TEST_SUITE_P(ReduceDecomposeForTrueKeepdims,  // keep_dims=true: no Reshape, reduced dims stay as 1
+                         ReduceDecomposeTests,
+                         testing::Values(std::make_tuple(reduce_mode::max,
+                                                         InputShape{32, 32, 32, 32},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{true},
+                                                         false,
+                                                         ReshapeShape{1, 32, 1, 1}),
+                                         std::make_tuple(reduce_mode::max,
+                                                         InputShape{1, 3, 64, 64},
+                                                         ReduceAxes{0, 2, 3},
+                                                         KeepDims{true},
+                                                         false,
+                                                         ReshapeShape{1, 3, 1, 1}),
+                                         std::make_tuple(reduce_mode::max,
+                                                         InputShape{32, 32, 32, 32},
+                                                         ReduceAxes{0, 2},
+                                                         KeepDims{true},
+                                                         false,
+                                                         ReshapeShape{1, 32, 1, 32})));
+
+TEST(DecomposeReduceForFalseKeepDims, Negative) {
+    const auto dynamic_shape = ngraph::PartialShape::dynamic();
+    auto model = ReduceDecomposeTests::get_transformed_function(dynamic_shape, {3}, reduce_mode::max, true);
+    ngraph::pass::Manager pass_manager;
+    pass_manager.register_pass<ov::intel_gpu::DecomposeReduceForFalseKeepDims>();
+    ASSERT_NO_THROW(pass_manager.run_passes(model));  // the pass must not throw on a dynamic-shape input
+}