From 1b06d0be45f19fd7a15165d04f590cfad4c8e737 Mon Sep 17 00:00:00 2001 From: Steve Yoo Date: Thu, 14 Nov 2024 13:32:02 +0000 Subject: [PATCH 1/7] [GPU] Skip crop fusing when it has padding and its user is reorder --- .../src/graph/graph_optimizer/prepare_buffer_fusing.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 65acb0beb66ba0..71c7e0a3a9e4c9 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -843,6 +843,8 @@ void prepare_buffer_fusing::run(program& p) { crop_params->input_offsets[0], node.get_primitive()->axis, false); + if (static_cast(crop_layout.data_padding) && node.get_users().front()->is_type()) + return; if (user_info.first) { node.get_users().front()->set_output_layout(user_info.second); } From ff3a6c75d7a3a3416929c6aa328ade7f3bad0d3d Mon Sep 17 00:00:00 2001 From: Steve Yoo Date: Fri, 15 Nov 2024 13:27:15 +0000 Subject: [PATCH 2/7] [GPU] Skip reorder opt when its dependency is crop --- .../src/graph/graph_optimizer/prepare_buffer_fusing.cpp | 2 -- .../src/graph/graph_optimizer/remove_redundant_reorders.cpp | 3 +++ 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp index 71c7e0a3a9e4c9..65acb0beb66ba0 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/prepare_buffer_fusing.cpp @@ -843,8 +843,6 @@ void prepare_buffer_fusing::run(program& p) { crop_params->input_offsets[0], node.get_primitive()->axis, false); - if (static_cast(crop_layout.data_padding) && node.get_users().front()->is_type()) - return; if (user_info.first) { 
node.get_users().front()->set_output_layout(user_info.second); } diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 1e5f943600fc05..3c6311e01218c5 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -295,6 +295,9 @@ void remove_redundant_reorders::run(program& p) { auto o_layout = r_node.get_output_layout(); const auto& i_layout = r_node.get_input_layout(0); + if (r_node.get_dependency(0).is_type()) + continue; + // Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. In this case we can reinterpret buffer, // but pads need to be handled correctly. if (i_layout.format == format::b_fs_yx_fsv16 && o_layout.format == format::bfyx && !r_node.is_output() && From b7b1e7fd3991d032bead2004b2b716da65e4d914 Mon Sep 17 00:00:00 2001 From: Steve Yoo Date: Wed, 20 Nov 2024 21:10:34 +0000 Subject: [PATCH 3/7] [GPU] Add unit test --- .../unit/test_cases/reorder_gpu_test.cpp | 93 +++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp index 8ade3b6c8e0f31..0f9f119f275a78 100644 --- a/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp @@ -2467,6 +2467,99 @@ TEST(reorder_gpu_f32, bfzyx_to_bsv16_fsv16_padded) } } +TEST(reorder_gpu_f32, bfzyx_to_bfyx_padded) { + tests::random_generator rg(GET_SUITE_NAME); + auto& engine = get_test_engine(); + + const int32_t b_in = 1024; + const int32_t f_in = 64; + const int32_t x_in = 72; + const int32_t y_in = 2; + const int32_t z_in = 3; + + const int32_t b_crop = 1024; + const int32_t f_crop = 64; + const int32_t x_crop = 72; + const int32_t y_crop = 2; + const 
int32_t z_crop = 1; + + const int32_t z0_off = 0; + const int32_t z1_off = 1; + const int32_t z2_off = 2; + + auto input = engine.allocate_memory({ data_types::f32,format::bfzyx,{ b_in, f_in, x_in, y_in, z_in } }); + + topology topology; + topology.add(input_layout("input", input->get_layout())); + topology.add(crop("crop0", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z0_off })); + topology.add(crop("crop1", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z1_off })); + topology.add(crop("crop2", input_info("input"), { b_crop, f_crop, x_crop, y_crop, z_crop }, { 0, 0, 0, 0, z2_off })); + topology.add(reorder("reorder0", input_info("crop0"), format::bfyx, data_types::f32)); + topology.add(reorder("reorder1", input_info("crop1"), format::bfyx, data_types::f32)); + topology.add(reorder("reorder2", input_info("crop2"), format::bfyx, data_types::f32)); + topology.add(reshape("reshape0", input_info("reorder0"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in)))); + topology.add(reshape("reshape1", input_info("reorder1"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in)))); + topology.add(reshape("reshape2", input_info("reorder2"), tensor(batch(b_in), feature(y_in), spatial(x_in, f_in)))); + + std::vector input_vec = rg.generate_random_1d(input->count(), -10, 10); + set_values(input, input_vec); + + ExecutionConfig config = get_test_default_config(engine); + config.set_property(ov::intel_gpu::optimize_data(true)); + network network(engine, topology, config); + + network.set_input_data("input", input); + auto outputs = network.execute(); + auto output0 = outputs.at("reshape0").get_memory(); + auto output1 = outputs.at("reshape1").get_memory(); + auto output2 = outputs.at("reshape2").get_memory(); + + cldnn::mem_lock output_ptr0(output0, get_test_stream()); + for (int b = 0; b < b_crop; ++b) { + for (int f = 0; f < f_crop; ++f) { + for (int z = 0; z < z_crop; ++z) { + for (int y = 0; y < y_crop; 
++y) { + for (int x = 0; x < x_crop; ++x) { + int linear_id = x + x_in * (y + y_in * (z + z0_off + z_in * (f + f_in * b))); + int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b))); + ASSERT_EQ(output_ptr0[output_linear_id], input_vec[linear_id]); + } + } + } + } + } + + cldnn::mem_lock output_ptr1(output1, get_test_stream()); + for (int b = 0; b < b_crop; ++b) { + for (int f = 0; f < f_crop; ++f) { + for (int z = 0; z < z_crop; ++z) { + for (int y = 0; y < y_crop; ++y) { + for (int x = 0; x < x_crop; ++x) { + int linear_id = x + x_in * (y + y_in * (z + z1_off + z_in * (f + f_in * b))); + int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b))); + ASSERT_EQ(output_ptr1[output_linear_id], input_vec[linear_id]); + } + } + } + } + } + + cldnn::mem_lock output_ptr2(output2, get_test_stream()); + for (int b = 0; b < b_crop; ++b) { + for (int f = 0; f < f_crop; ++f) { + for (int z = 0; z < z_crop; ++z) { + for (int y = 0; y < y_crop; ++y) { + for (int x = 0; x < x_crop; ++x) { + int linear_id = x + x_in * (y + y_in * (z + z2_off + z_in * (f + f_in * b))); + int output_linear_id = x + x_crop * (y + y_crop * (z + z_crop * (f + f_crop * b))); + ASSERT_EQ(output_ptr2[output_linear_id], input_vec[linear_id]); + } + } + } + } + } +} + TEST(reorder_gpu_f32, b_fs_yx_fsv16_to_bfyx_opt_allowed) { auto& engine = get_test_engine(); From 4165b03e27eca88bc439799899e5fe7b45bf9ba5 Mon Sep 17 00:00:00 2001 From: Steve Yoo Date: Thu, 21 Nov 2024 12:25:52 +0000 Subject: [PATCH 4/7] [GPU] Fixed unit tests affected by skipping reorder opt --- .../tests/unit/passes/add_required_reorders_test.cpp | 4 ++-- .../tests/unit/passes/prepare_buffer_fusing_test.cpp | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp index 9a4cb71450a53c..0eb425b4dc1119 100644 --- 
a/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/add_required_reorders_test.cpp @@ -192,9 +192,9 @@ TEST(add_required_reorders, skip_adding_reorder_batch_axis_padding) { crop_prim = network.get_primitive("crop2"); ASSERT_EQ(crop_prim->can_be_optimized(), true); auto reorder_prim = network.get_primitive("crop1_reorder"); - ASSERT_EQ(reorder_prim->can_be_optimized(), true); + ASSERT_EQ(reorder_prim->can_be_optimized(), false); reorder_prim = network.get_primitive("crop2_reorder"); - ASSERT_EQ(reorder_prim->can_be_optimized(), true); + ASSERT_EQ(reorder_prim->can_be_optimized(), false); auto concate = network.get_primitive("concat"); ASSERT_EQ(concate->can_be_optimized(), false); } diff --git a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp index 456fab4ae0286a..1eb11c662608e0 100644 --- a/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp +++ b/src/plugins/intel_gpu/tests/unit/passes/prepare_buffer_fusing_test.cpp @@ -1224,7 +1224,7 @@ TEST(prepare_buffer_fusing, test_implicit_crop_and_outerpadding) { auto reorder_prim = network.get_primitive("gather1_reorder"); ASSERT_EQ(reorder_prim->can_be_optimized(), true); reorder_prim = network.get_primitive("gather2_reorder"); - ASSERT_EQ(reorder_prim->can_be_optimized(), true); + ASSERT_EQ(reorder_prim->can_be_optimized(), false); auto reshape_prim = network.get_primitive("reshape1"); ASSERT_EQ(reshape_prim->can_be_optimized(), true); } From 6a7e5824b85c549bdddd6082b664f579405b6eb4 Mon Sep 17 00:00:00 2001 From: Steve Yoo Date: Thu, 5 Dec 2024 08:50:01 +0000 Subject: [PATCH 5/7] [GPU] Reinforce dependency check condition --- .../src/graph/graph_optimizer/remove_redundant_reorders.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp 
b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 3c6311e01218c5..6cf4a571481a62 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -295,7 +295,7 @@ void remove_redundant_reorders::run(program& p) { auto o_layout = r_node.get_output_layout(); const auto& i_layout = r_node.get_input_layout(0); - if (r_node.get_dependency(0).is_type()) + if (!update_implementations || (r_node.get_dependency(0).is_type() && r_node.get_dependency(0).can_be_optimized())) continue; // Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. In this case we can reinterpret buffer, From a097204d9543975270afe3f3263c49952108b430 Mon Sep 17 00:00:00 2001 From: Steve Yoo Date: Thu, 5 Dec 2024 21:08:51 +0000 Subject: [PATCH 6/7] [GPU] Confine update implementation condition to crop --- .../src/graph/graph_optimizer/remove_redundant_reorders.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index 6cf4a571481a62..f771300a160167 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -295,7 +295,8 @@ void remove_redundant_reorders::run(program& p) { auto o_layout = r_node.get_output_layout(); const auto& i_layout = r_node.get_input_layout(0); - if (!update_implementations || (r_node.get_dependency(0).is_type() && r_node.get_dependency(0).can_be_optimized())) + if ((!update_implementations && r_node.get_dependency(0).is_type()) || + (r_node.get_dependency(0).is_type() && r_node.get_dependency(0).can_be_optimized())) continue; // Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. 
In this case we can reinterpret buffer, From ba177e81bc84859720d3b6ad60f7451497bdbc90 Mon Sep 17 00:00:00 2001 From: Steve Yoo Date: Thu, 12 Dec 2024 12:41:46 +0000 Subject: [PATCH 7/7] [GPU] Confine reorder skipping opt condition to different i/o rank --- .../src/graph/graph_optimizer/remove_redundant_reorders.cpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp index f771300a160167..ac7810c6e9154c 100644 --- a/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp +++ b/src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp @@ -295,8 +295,10 @@ void remove_redundant_reorders::run(program& p) { auto o_layout = r_node.get_output_layout(); const auto& i_layout = r_node.get_input_layout(0); - if ((!update_implementations && r_node.get_dependency(0).is_type()) || - (r_node.get_dependency(0).is_type() && r_node.get_dependency(0).can_be_optimized())) + auto is_r_node_rank_changed = r_node.get_output_layout().get_rank() != r_node.get_dependency(0).get_output_layout().get_rank(); + if (is_r_node_rank_changed && + ((!update_implementations && r_node.get_dependency(0).is_type()) || + (r_node.get_dependency(0).is_type() && r_node.get_dependency(0).can_be_optimized()))) continue; // Optimize reorder b_fs_yx_fsv16 -> bfyx when spatials are equal to 1. In this case we can reinterpret buffer,