Skip to content

Commit

Permalink
Add decomposing Reduce for Bugfix of byx reduction (#14449)
Browse files Browse the repository at this point in the history
+ Add DecomposeReduceForFalseKeepdims transformation to fix an accuracy issue in oneDNN reduction
+ Add a Reshape to modify the output of Reduce and update keep_dims to true : reduce-reshape
+ Add exception logic for reduce modes not supported by the byx conversion

Signed-off-by: Min, Byungil <[email protected]>
  • Loading branch information
byungilm authored Jan 5, 2023
1 parent 9427623 commit 0d261db
Show file tree
Hide file tree
Showing 5 changed files with 374 additions and 5 deletions.
16 changes: 11 additions & 5 deletions src/plugins/intel_gpu/src/graph/layout_optimizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -99,14 +99,20 @@ static bool is_reduce_blocked_axes(reduce_node const& node) {
auto num_spatial = format::spatial_num(node.get_output_layout().format);
auto dims = node.get_output_layout().format.dimension();

// Check if it reduces all spatial axes
bool feature_axis_is_only_remaining = true;
for (size_t idx_spatial = (dims - num_spatial); idx_spatial < dims; idx_spatial++) {
if (count(reduce_axes.begin(), reduce_axes.end(), idx_spatial) == 0) {
feature_axis_is_only_remaining = false;
break;
}
}

if (input_layout.is_static() &&
(count(reduce_axes.begin(), reduce_axes.end(), 1) > 0 ||
(count(reduce_axes.begin(), reduce_axes.end(), 0) > 0 && input_layout.batch() > 1))) {
for (size_t idx_spatial = dims - num_spatial ; idx_spatial < dims ; idx_spatial++) {
if (count(reduce_axes.begin(), reduce_axes.end(), idx_spatial) == 0)
return true;
}
(count(reduce_axes.begin(), reduce_axes.end(), 0) > 0))) {
if (!feature_axis_is_only_remaining)
return true;
}

return false;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "decompose_reduce_for_false_keepdims.hpp"

#include <algorithm>
#include <cassert>
#include <memory>
#include <ngraph/opsets/opset10.hpp>
#include <ngraph/pattern/op/wrap_type.hpp>
#include <ngraph/rt_info.hpp>
#include <vector>

namespace ov {
namespace intel_gpu {

// Registers a matcher that rewrites Reduce(keep_dims=false) which reduces the batch
// and all spatial axes (leaving only the feature axis) into the equivalent
// Reduce(keep_dims=true) followed by a Reshape to the original 1-D output shape.
DecomposeReduceForFalseKeepDims::DecomposeReduceForFalseKeepDims() {
    // Get one MatcherPass for all modes: match any arithmetic Reduce op with a
    // static input shape, a constant axes input, and a static output shape.
    auto reduce_pattern = ngraph::pattern::wrap_type<ngraph::opset10::ReduceSum,
                                                     ngraph::opset10::ReduceMean,
                                                     ngraph::opset10::ReduceProd,
                                                     ngraph::opset10::ReduceMin,
                                                     ngraph::opset10::ReduceMax>(
        {ngraph::pattern::any_input(ngraph::pattern::has_static_shape()),
         ngraph::pattern::wrap_type<ngraph::opset10::Constant>()},
        ngraph::pattern::has_static_shape());

    // register callback
    ov::matcher_pass_callback callback = [=](ngraph::pattern::Matcher& m) {
        const auto& pattern_map = m.get_pattern_value_map();
        auto reduce =
            as_type_ptr<op::util::ArithmeticReductionKeepDims>(pattern_map.at(reduce_pattern).get_node_shared_ptr());
        if (!reduce)
            return false;

        auto input = reduce->input_value(0);
        const auto input_shape = input.get_shape();
        const auto reduce_shape = reduce->output(0).get_shape();
        const auto input_rank = input.get_partial_shape().rank().get_length();

        // Reduction axes, sorted ascending so spatial axes can be checked by index.
        auto axes_vector = reduce->get_reduction_axes().to_vector();
        std::sort(axes_vector.begin(), axes_vector.end());

        // Rewrite only when keep_dims is false, the batch and all spatial axes are
        // reduced while the feature axis is kept (num_spatial == input_rank - 2),
        // and the rank is below 6 (larger ranks are not handled by this pass).
        if (!reduce->get_keep_dims() &&
            need_transformation_for_reordered_axes(axes_vector, input_rank, (input_rank - 2)) &&
            input_shape.size() < 6) {
            ngraph::NodeVector new_ops;

            // Reduce: same op and axes as the original, but with keep_dims = true.
            auto reduce_const =
                ngraph::opset10::Constant::create(ngraph::element::i64, ngraph::Shape{axes_vector.size()}, axes_vector);

            // Add each reduce mode supported by oneDNN
            if (ngraph::is_type<ngraph::opset10::ReduceSum>(reduce))
                input = std::make_shared<ngraph::opset10::ReduceSum>(input, reduce_const, true);
            else if (ngraph::is_type<ngraph::opset10::ReduceMean>(reduce))
                input = std::make_shared<ngraph::opset10::ReduceMean>(input, reduce_const, true);
            else if (ngraph::is_type<ngraph::opset10::ReduceMin>(reduce))
                input = std::make_shared<ngraph::opset10::ReduceMin>(input, reduce_const, true);
            else if (ngraph::is_type<ngraph::opset10::ReduceMax>(reduce))
                input = std::make_shared<ngraph::opset10::ReduceMax>(input, reduce_const, true);
            else if (ngraph::is_type<ngraph::opset10::ReduceProd>(reduce))
                input = std::make_shared<ngraph::opset10::ReduceProd>(input, reduce_const, true);
            else
                // Unsupported reduce mode: leave the graph untouched.
                return false;

            input.get_node_shared_ptr()->set_friendly_name(reduce->get_friendly_name());
            new_ops.push_back(input.get_node_shared_ptr());

            // Reshape: squeeze the keep_dims=true output back to the original
            // keep_dims=false output shape.
            auto reshape_shape = ngraph::Shape((input_rank - axes_vector.size()), 1);
            // Expected that the feature axis is the only un-reduced axis unless a new
            // case for this decomposition is added, so the output is 1-D.
            assert(reshape_shape.size() == 1);
            reshape_shape[0] = reduce_shape[0];  // the remaining (feature) dimension
            input = std::make_shared<ngraph::opset10::Reshape>(
                input,
                ngraph::opset10::Constant::create(ngraph::element::i64,
                                                  ngraph::Shape{reshape_shape.size()},
                                                  reshape_shape),
                false);

            input.get_node_shared_ptr()->set_friendly_name(reduce->get_friendly_name() + "_reshape_false_keepdims");
            new_ops.push_back(input.get_node_shared_ptr());

            // Preserve runtime info and redirect all consumers to the Reshape output.
            ngraph::copy_runtime_info(reduce, new_ops);
            reduce->output(0).replace(input);
            return true;
        }

        return false;
    };

    auto m = std::make_shared<ngraph::pattern::Matcher>(reduce_pattern, "DecomposeReduceForFalseKeepDims");
    register_matcher(m, callback);
}

// Returns true when the decomposition is required: more than one axis is reduced,
// the batch axis (0) is reduced, the feature axis (1) is kept, and every spatial
// axis is reduced — i.e. the feature axis is the only remaining axis.
bool DecomposeReduceForFalseKeepDims::need_transformation_for_reordered_axes(std::vector<int64_t> reduce_axes,
                                                                             size_t num_dim,
                                                                             size_t num_spatial) {
    const auto begin = reduce_axes.begin();
    const auto end = reduce_axes.end();

    // Case to reduce batch axis and spatial axes: batch must be reduced and
    // feature must be kept, with at least one more reduced axis besides batch.
    const bool reduces_batch = std::find(begin, end, 0) != end;
    const bool keeps_feature = std::find(begin, end, 1) == end;
    if (reduce_axes.size() <= 1 || !reduces_batch || !keeps_feature)
        return false;

    // Check if it reduces all spatial axes; if any spatial axis survives,
    // the feature axis is not the only remaining one.
    for (size_t axis = num_dim - num_spatial; axis < num_dim; ++axis) {
        if (std::find(begin, end, static_cast<int64_t>(axis)) == end)
            return false;
    }

    return true;
}

} // namespace intel_gpu
} // namespace ov
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
// Copyright (C) 2018-2022 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include <ngraph/pass/graph_rewrite.hpp>
#include <transformations_visibility.hpp>

namespace ov {
namespace intel_gpu {

/**
 * @brief Add Reshape to modify output of Reduce and modify keep_dims to true : reduce-reshape
 * A clDNN Reduce reorders un-reduced axes of its output tensor to b-f and spatial order when keep_dims is false.
 * oneDNN reduction does not allow this. And clDNN execution shows a huge perf drop for blocked formats.
 */
class DecomposeReduceForFalseKeepDims : public ngraph::pass::MatcherPass {
public:
    // Decompose Reduce into Reduce(keep_dims=true) + Reshape when keep_dims is
    // false and it reduces the batch and all spatial axes (feature axis remains).
    DecomposeReduceForFalseKeepDims();

    // Returns true if the reduction axes include the batch axis (with the feature
    // axis kept) and all spatial axes — the case the decomposition must handle.
    bool need_transformation_for_reordered_axes(std::vector<int64_t> reduce_axes, size_t num_dim, size_t num_spatial);
};

} // namespace intel_gpu
} // namespace ov
2 changes: 2 additions & 0 deletions src/plugins/intel_gpu/src/plugin/transformations_pipeline.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,7 @@

#include "transformations/einsum_decomposition.hpp"
#include "transformations/convert_pooling_to_reduce.hpp"
#include "transformations/decompose_reduce_for_false_keepdims.hpp"

#include <transformations/opset_conversions/convert_opset3_to_opset2.hpp>
#include <transformations/opset_conversions/convert_opset2_to_opset1.hpp>
Expand Down Expand Up @@ -266,6 +267,7 @@ void TransformationsPipeline::apply(std::shared_ptr<ov::Model> func) {
pass_config->disable<ngraph::pass::ConvertReduceMeanToPooling>();
pass_config->disable<ngraph::pass::ConvertReduceMaxToPooling>();
manager.register_pass<ConvertAvgPoolingToReduce>();
manager.register_pass<DecomposeReduceForFalseKeepDims>();
} else {
pass_config->set_callback<ngraph::pass::ConvertReduceSumToPooling>(
[](const_node_ptr &node) -> bool {
Expand Down
Loading

0 comments on commit 0d261db

Please sign in to comment.