diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/reduce.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/reduce.cpp
new file mode 100644
index 00000000000000..80bd72f74528b2
--- /dev/null
+++ b/src/plugins/intel_gpu/src/graph/impls/cpu/reduce.cpp
@@ -0,0 +1,189 @@
+// Copyright (C) 2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "register.hpp"
+#include "reduce_inst.h"
+#include "implementation_map.hpp"
+
+#include "openvino/op/reduce_max.hpp"
+#include "openvino/op/reduce_sum.hpp"
+#include "openvino/op/reduce_prod.hpp"
+#include "openvino/op/reduce_l1.hpp"
+#include "openvino/op/reduce_l2.hpp"
+#include "openvino/op/reduce_logical_and.hpp"
+#include "openvino/op/reduce_logical_or.hpp"
+#include "openvino/op/reduce_mean.hpp"
+#include "openvino/op/reduce_min.hpp"
+
+namespace cldnn {
+namespace cpu {
+
+namespace {
+
+template <typename T>
+std::shared_ptr<ov::op::Op> make_reduce(bool keep_dims) {
+    auto op = std::make_shared<T>();
+    op->set_keep_dims(keep_dims);
+    return op;
+}
+}  // namespace
+
+struct reduce_impl : public typed_primitive_impl<reduce> {
+    using parent = typed_primitive_impl<reduce>;
+    using parent::parent;
+
+    reduce_mode mode = reduce_mode::sum;
+    std::vector<int64_t> axes = {};
+    bool keep_dims = false;
+
+    std::shared_ptr<ov::op::Op> op;
+
+    DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::reduce_impl)
+
+    std::unique_ptr<primitive_impl> clone() const override {
+        return make_unique<reduce_impl>(*this);
+    }
+
+    reduce_impl() : parent("reduce_cpu_impl") {}
+
+    explicit reduce_impl(const reduce_node& outer) {
+        set_node_params(outer);
+    }
+
+    void set_node_params(const program_node& arg) override {
+        OPENVINO_ASSERT(arg.is_type<reduce>(), "[GPU] Incorrect program_node type");
+        const auto& node = arg.as<reduce>();
+        mode = node.get_primitive()->mode;
+        axes = node.get_primitive()->axes;
+        keep_dims = node.get_primitive()->keep_dims;
+    }
+
+    void save(BinaryOutputBuffer& ob) const override {
+        parent::save(ob);
+        ob << make_data(&mode, sizeof(reduce_mode));
+        ob << axes;
+        ob << keep_dims;
+    }
+
+    void load(BinaryInputBuffer& ib) override {
+        parent::load(ib);
+        ib >> make_data(&mode, sizeof(reduce_mode));
+        ib >> axes;
+        ib >> keep_dims;
+    }
+
+    event::ptr execute_impl(const std::vector<event::ptr>& events, reduce_inst& instance) override {
+        OV_ITT_SCOPED_TASK(ov::intel_gpu::itt::domains::intel_gpu_plugin, "reduce::execute_impl");
+        auto& stream = instance.get_network().get_stream();
+
+        const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph();
+
+        if (!pass_through_events) {
+            for (auto e : events) {
+                e->wait();
+            }
+        }
+
+        auto params = instance.get_impl_params();
+
+        ov::TensorVector input_host_tensors;
+        ov::TensorVector output_host_tensors;
+
+        if (!op) {
+            switch (mode) {
+            case reduce_mode::max:
+                op = make_reduce<ov::op::v1::ReduceMax>(keep_dims);
+                break;
+            case reduce_mode::min:
+                op = make_reduce<ov::op::v1::ReduceMin>(keep_dims);
+                break;
+            case reduce_mode::mean:
+                op = make_reduce<ov::op::v1::ReduceMean>(keep_dims);
+                break;
+            case reduce_mode::prod:
+                op = make_reduce<ov::op::v1::ReduceProd>(keep_dims);
+                break;
+            case reduce_mode::sum:
+                op = make_reduce<ov::op::v1::ReduceSum>(keep_dims);
+                break;
+            case reduce_mode::logical_and:
+                op = make_reduce<ov::op::v1::ReduceLogicalAnd>(keep_dims);
+                break;
+            case reduce_mode::logical_or:
+                op = make_reduce<ov::op::v1::ReduceLogicalOr>(keep_dims);
+                break;
+            case reduce_mode::l1:
+                op = make_reduce<ov::op::v4::ReduceL1>(keep_dims);
+                break;
+            case reduce_mode::l2:
+                op = make_reduce<ov::op::v4::ReduceL2>(keep_dims);
+                break;
+            default:
+                OPENVINO_THROW("[GPU] Couldn't create reduce operation: unsupported reduce mode (", static_cast<size_t>(mode), ")");
+            }
+        }
+
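+        // Lock the input and output buffers on the host and wrap them as ov::Tensor objects,
+        // so the reference implementation of the selected reduce op can evaluate directly on this memory.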
+        cldnn::mem_lock<uint8_t, mem_lock_type::write> output_lock(instance.output_memory_ptr(), stream);
+        cldnn::mem_lock<uint8_t, mem_lock_type::read> input_lock(instance.dep_memory_ptr(0), stream);
+
+        input_host_tensors.push_back(make_tensor(params->input_layouts[0], input_lock.data()));
+        input_host_tensors.push_back(ov::Tensor(ov::element::i64, ov::Shape{axes.size()}, static_cast<void*>(axes.data())));
+
+        output_host_tensors.push_back(make_tensor(params->output_layouts[0], output_lock.data()));
+
+        OPENVINO_ASSERT(op->evaluate(output_host_tensors, input_host_tensors),
+                        "[GPU] Couldn't execute reduce primitive with id ", instance.id());
+
+        if (pass_through_events) {
+            if (events.size() > 1) {
+                return stream.group_events(events);
+            } else if (events.size() == 1) {
+                return events[0];
+            }
+        }
+
+        return stream.create_user_event(true);
+    }
+
+    void init_kernels(const kernels_cache& , const kernel_impl_params&) override {}
+
+    void update_dispatch_data(const kernel_impl_params& impl_param) override {}
+
+public:
+    static std::unique_ptr<primitive_impl> create(const reduce_node& arg, const kernel_impl_params& impl_param) {
+        return make_unique<reduce_impl>();
+    }
+};
+
+
+namespace detail {
+
+attach_reduce_impl::attach_reduce_impl() {
+    auto formats = {
+        format::bfyx,
+        format::bfzyx,
+        format::bfwzyx,
+        format::bfuwzyx,
+        format::bfvuwzyx,
+    };
+
+    auto types = {
+        data_types::f32,
+        data_types::f16,
+        data_types::i32,
+        data_types::i64,
+        data_types::i8,
+        data_types::u8,
+    };
+
+    implementation_map<reduce>::add(impl_types::cpu, shape_types::static_shape, reduce_impl::create, types, formats);
+    implementation_map<reduce>::add(impl_types::cpu, shape_types::dynamic_shape, reduce_impl::create, types, formats);
+}
+
+} // namespace detail
+} // namespace cpu
+} // namespace cldnn
+
+BIND_BINARY_BUFFER_WITH_TYPE(cldnn::cpu::reduce_impl)
diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp
index b2868fa2aff6e3..c70b39cc9de7f1 100644
--- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp
+++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp
@@ -29,6 +29,7 @@ void register_implementations() {
     REGISTER_CPU(broadcast);
     REGISTER_CPU(tile);
     REGISTER_CPU(select);
+    REGISTER_CPU(reduce);
 }
 
 } // namespace cpu
diff --git a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp
index 0f845498027adf..aaa56678d08ca1 100644
--- a/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp
+++ b/src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp
@@ -22,6 +22,7 @@
 #include "intel_gpu/primitives/broadcast.hpp"
 #include "intel_gpu/primitives/tile.hpp"
 #include "intel_gpu/primitives/select.hpp"
+#include "intel_gpu/primitives/reduce.hpp"
 
 namespace cldnn {
 namespace cpu {
@@ -53,6 +54,7 @@ REGISTER_CPU(reorder);
 REGISTER_CPU(broadcast);
 REGISTER_CPU(tile);
 REGISTER_CPU(select);
+REGISTER_CPU(reduce);
 
 #undef REGISTER_CPU
diff --git a/src/plugins/intel_gpu/src/graph/include/scatter_nd_update_inst.h b/src/plugins/intel_gpu/src/graph/include/scatter_nd_update_inst.h
index b39cc07d234946..4718bf367d3b55 100644
--- a/src/plugins/intel_gpu/src/graph/include/scatter_nd_update_inst.h
+++ b/src/plugins/intel_gpu/src/graph/include/scatter_nd_update_inst.h
@@ -9,6 +9,18 @@
 
 namespace cldnn {
 
+template <>
+struct typed_program_node<scatter_nd_update> : public typed_program_node_base<scatter_nd_update> {
+private:
+    using parent = typed_program_node_base<scatter_nd_update>;
+
+public:
+    using parent::parent;
+    program_node& input(std::size_t i = 0) const { return get_dependency(i); }
+
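+    // Shape inference does not need to read any input values on the host
+    // (the output shape follows the data input), so no shape-infer dependencies are reported.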
+    std::vector<size_t> get_shape_infer_dependencies() const override { return {}; }
+};
+
 using scatter_nd_update_node = typed_program_node<scatter_nd_update>;
 
 template <>
diff --git a/src/plugins/intel_gpu/src/graph/scatter_nd_update.cpp b/src/plugins/intel_gpu/src/graph/scatter_nd_update.cpp
index 2cfff52d4e1559..8f66889c25b884 100644
--- a/src/plugins/intel_gpu/src/graph/scatter_nd_update.cpp
+++ b/src/plugins/intel_gpu/src/graph/scatter_nd_update.cpp
@@ -29,9 +29,9 @@ layout scatter_nd_update_inst::calc_output_layout(scatter_nd_update_node const&
 
 template<typename ShapeType>
 std::vector<layout> scatter_nd_update_inst::calc_output_layouts(scatter_nd_update_node const& /*node*/, const kernel_impl_params& impl_param) {
-    auto input0_layout = impl_param.get_input_layout(0);
-    auto input1_layout = impl_param.get_input_layout(1);
-    auto input2_layout = impl_param.get_input_layout(2);
+    const auto& input0_layout = impl_param.get_input_layout(0);
+    const auto& input1_layout = impl_param.get_input_layout(1);
+    const auto& input2_layout = impl_param.get_input_layout(2);
 
     std::vector<ShapeType> input_shapes = {
         input0_layout.get<ShapeType>(),   // inputs_shape
diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp
index a85db18a475f7d..d0c3dcca089c90 100644
--- a/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp
+++ b/src/plugins/intel_gpu/tests/unit/test_cases/reduce_gpu_test.cpp
@@ -1652,6 +1652,38 @@ TEST(reduce_gpu, common_bfwzyx_log_sum_exp_keepdims) {
     }
 }
 
+TEST(reduce_gpu, cpu_impl_int32) {
+    auto& engine = get_test_engine();
+    auto input = engine.allocate_memory({{4}, data_types::i32, format::bfyx});
+
+    set_values(input, {1, 2, 3, 4});
+
+    topology topology;
+    topology.add(input_layout("input", input->get_layout()));
+    topology.add(reduce("reduce", input_info("input"), reduce_mode::prod, {0}, true));
+
+    auto config = get_test_default_config(engine);
+    config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{{"reduce", {format::bfyx, "", impl_types::cpu}}}));
+    network network(engine, topology, config);
+
+    network.set_input_data("input", input);
+
+    auto outputs = network.execute();
+
+    ASSERT_EQ(outputs.size(), size_t(1));
+    ASSERT_EQ(outputs.begin()->first, "reduce");
+
+    auto output = outputs.at("reduce").get_memory();
+
+    std::vector<int32_t> ref_data = {24};
+
+    cldnn::mem_lock<int32_t> output_ptr(output, get_test_stream());
+
+    for (size_t i = 0; i < ref_data.size(); ++i) {
+        ASSERT_EQ(ref_data[i], output_ptr[i]);
+    }
+}
+
 TEST(reduce_gpu, dynamic) {
     auto& engine = get_test_engine();
     auto input = engine.allocate_memory({data_types::f32, format::bfwzyx, {2, 3, 1, 1, 1, 1}});