Skip to content

Commit

Permalink
[GPU] Enable b_fs_yx_fsv16 format for shape-agnostic quantize and reorder kernels (openvinotoolkit#26025)
Browse files Browse the repository at this point in the history

### Details:
- Currently, b_fs_yx_fsv16 blocked format is enabled for convolution
operations, but the lack of it for reorder and quantize operations
causes runtime static version kernel recompilation. This change enables
support for the b_fs_yx_fsv16 format to allow shape-agnostic kernel
selection.

### Tickets:
 - [CVS-145296](https://jira.devtools.intel.com/browse/CVS-145296)
  • Loading branch information
sshlyapn authored Aug 13, 2024
1 parent d6bb880 commit 42ac61b
Show file tree
Hide file tree
Showing 5 changed files with 181 additions and 1 deletion.
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,9 @@ void compile_graph::run(program& p) {

if (node->is_dynamic() && !is_planar) {
if (!(node->is_type<convolution>() && node->get_output_layout().format == cldnn::format::b_fs_yx_fsv16) &&
!(node->is_type<group_normalization>() && node->get_output_layout().format == cldnn::format::b_fs_yx_fsv16)) {
!(node->is_type<group_normalization>() && node->get_output_layout().format == cldnn::format::b_fs_yx_fsv16) &&
!(node->is_type<reorder>() && node->get_output_layout().format == cldnn::format::b_fs_yx_fsv16) &&
!(node->is_type<quantize>() && node->get_output_layout().format == cldnn::format::b_fs_yx_fsv16)) {
can_select_impl = false;
}
}
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/quantize.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ attach_quantize_impl::attach_quantize_impl() {
format::bfwzyx,
format::bfuwzyx,
format::bfvuwzyx,
format::b_fs_yx_fsv16,
};

auto keys = implementation_map<quantize>::combine(types, formats);
Expand Down
1 change: 1 addition & 0 deletions src/plugins/intel_gpu/src/graph/impls/ocl/reorder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -181,6 +181,7 @@ attach_reorder_impl::attach_reorder_impl() {
format::bfyx,
format::bfzyx,
format::bfwzyx,
format::b_fs_yx_fsv16
};
implementation_map<reorder>::add(impl_types::ocl, shape_types::dynamic_shape, reorder_impl::create, types, formats);

Expand Down
103 changes: 103 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/quantize_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -749,6 +749,109 @@ TEST(quantize_gpu, dynamic) {
}
}

// Verifies that a quantize primitive with a dynamic input shape can run in the
// blocked b_fs_yx_fsv16 layout using a shape-agnostic (dynamic) OCL impl,
// i.e. without falling back to a static-shape kernel recompilation.
// Pipeline: bfyx input -> reorder(b_fs_yx_fsv16) -> quantize(u8) -> reorder(bfyx).
TEST(quantize_gpu, dynamic_fsv16) {
auto& engine = get_test_engine();

// Static memory for the actual data; shape 1x16x2x2 so that the feature
// dimension exactly fills one fsv16 block (16 features).
auto input = engine.allocate_memory({ { 1, 16, 2, 2 }, data_types::f32, format::bfyx });
// Per-channel quantization ranges: one low/high pair per each of the 16 features.
auto input_low = engine.allocate_memory({ { 1, 16, 1, 1 }, data_types::f32, format::bfyx });
auto input_high = engine.allocate_memory({ { 1, 16, 1, 1 }, data_types::f32, format::bfyx });
// Scalar output range [0, 255] shared by all channels (full u8 range).
auto output_low = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });
auto output_high = engine.allocate_memory({ { 1, 1, 1, 1 }, data_types::f32, format::bfyx });

// The network input is declared fully dynamic (rank 4) so that a dynamic
// impl must be selected at build time.
layout in_dyn_layout { ov::PartialShape::dynamic(4), data_types::f32, format::bfyx };

// 16 channels x 4 spatial values each; each 4-line group below is 4 channels.
set_values(input, { -1.0f, 2.1f, 3.0f, 4.0f,
5.0f, 2.0f, 2.0f, 3.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,

1.0f, 1.0f, 1.0f, 1.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f,

1.0f, 2.0f, 3.0f, 4.0f,
5.0f, 2.0f, 2.0f, 3.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,

1.0f, 1.0f, 1.0f, 1.0f,
4.0f, 6.0f, 3.0f, 3.0f,
3.0f, 5.0f, 1.0f, 1.0f,
1.0f, 1.0f, 1.0f, 1.0f });

// One input_low/input_high value per channel (16 values each).
set_values(input_low, { 0.0f, 1.0f, 2.0f, 3.0f,
4.0f, 5.0f, 6.0f, 7.0f,
7.0f, 6.0f, 5.0f, 4.0f,
3.0f, 2.0f, 1.0f, 0.0f });
set_values(input_high, { 10.0f, 21.0f, 32.0f, 43.0f,
54.0f, 65.0f, 76.0f, 87.0f,
87.0f, 76.0f, 65.0f, 54.0f,
43.0f, 32.0f, 21.0f, 10.0f });

set_values(output_low, { 0.0f });
set_values(output_high, { 255.0f });

// Expected u8 result per element, laid out in planar bfyx order (the final
// reorder converts back from fsv16). Values follow the FakeQuantize formula
// with 255 levels: round((clamp(x, in_lo, in_hi) - in_lo) / (in_hi - in_lo) * 255),
// e.g. channel 0: round((3.0 - 0) / 10 * 255) = 77.
std::vector<uint8_t> ref_data = {
0, 54, 77, 102,
51, 13, 13, 26,
17, 34, 8, 8,
0, 13, 0, 0,

0, 0, 0, 0,
0, 4, 0, 0,
0, 0, 0, 0,
0, 0, 0, 0,

0, 0, 0, 0,
0, 0, 0, 0,
0, 4, 0, 0,
0, 5, 0, 0,

0, 0, 0, 0,
17, 34, 8, 8,
26, 51, 0, 0,
26, 26, 26, 26
};

topology topology;
topology.add(
input_layout("input", in_dyn_layout),
data("input_low", input_low),
data("input_high", input_high),
data("output_low", output_low),
data("output_high", output_high),
// Switch to the blocked layout before quantize so the quantize kernel
// itself runs on b_fs_yx_fsv16 (the configuration under test).
reorder("reorder", input_info("input"), format::b_fs_yx_fsv16, data_types::f32),
quantize("quantize", input_info("reorder"), input_info("input_low"), input_info("input_high"), input_info("output_low"), input_info("output_high"), 255, data_types::u8),
reorder("output_reorder", input_info("quantize"), format::bfyx, data_types::u8)
);

ExecutionConfig config = get_test_default_config(engine);
// Dynamic shapes require the new shape-inference path.
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
network network(engine, topology, config);
network.set_input_data("input", input);

// Core assertion of this test: a dynamic (shape-agnostic) impl must have
// been selected for the quantize node, not a static fallback.
auto inst = network.get_primitive("quantize");
auto impl = inst->get_impl();
ASSERT_TRUE(impl != nullptr);
ASSERT_TRUE(impl->is_dynamic());

auto outputs = network.execute();

auto output = outputs.at("output_reorder").get_memory();
cldnn::mem_lock<uint8_t> output_ptr(output, get_test_stream());

// Check that layout and memory contains logical size of tensor
ASSERT_EQ(output->count(), (size_t)64);
ASSERT_EQ(output->get_layout().count(), (size_t)64);

ASSERT_EQ(output->size(), ref_data.size() * sizeof(uint8_t));

// Tolerance of 1 ULP of u8 allows for rounding-mode differences in the kernel.
for (size_t i = 0; i < ref_data.size(); ++i) {
ASSERT_NEAR(output_ptr[i], ref_data[i], 1) << " index = " << i;
}
}

struct quantize_random_test_params {
data_types input_type;
data_types output_type;
Expand Down
73 changes: 73 additions & 0 deletions src/plugins/intel_gpu/tests/unit/test_cases/reorder_gpu_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1525,6 +1525,79 @@ TEST(reorder_gpu_f32, dynamic_bfyx_to_bfzyx) {
}
}

// Checks that a reorder into the blocked b_fs_yx_fsv16 layout with a dynamic
// input shape picks a shape-agnostic (dynamic) impl for both the blocking and
// the un-blocking reorder, and that the data survives the round trip
// bfyx(f16) -> b_fs_yx_fsv16 -> relu -> bfyx(f32) unchanged (modulo relu).
TEST(reorder_gpu_f32, dynamic_bfyx_to_fsv16) {
    auto& engine = get_test_engine();

    const ov::Shape in_shape{ 1, 2, 4, 2 };
    // Network input is rank-4 dynamic; the concrete shape arrives with the data.
    layout in_layout{ ov::PartialShape::dynamic(in_shape.size()), data_types::f16, format::bfyx };
    auto input_mem = engine.allocate_memory({ ov::PartialShape(in_shape), data_types::f16, format::bfyx });

    set_values<ov::float16>(input_mem, {
        ov::float16(1.f), ov::float16(0.f),
        ov::float16(5.f), ov::float16(1.5f),

        ov::float16(2.f), ov::float16(0.f),
        ov::float16(6.f), ov::float16(5.2f),

        ov::float16(3.f), ov::float16(0.5f),
        ov::float16(7.f), ov::float16(12.f),

        ov::float16(4.f), ov::float16(-0.5f),
        ov::float16(8.f), ov::float16(8.f)
    });

    topology topology(
        input_layout("input", in_layout),
        reorder("reorder", input_info("input"), format::b_fs_yx_fsv16, data_types::f16),
        activation("relu", input_info("reorder"), activation_func::relu),
        reorder("output_reorder", input_info("relu"), format::bfyx, data_types::f32));

    ExecutionConfig config = get_test_default_config(engine);
    config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
    network network(engine, topology, config);

    // Both reorders must have been assigned dynamic (shape-agnostic) impls.
    auto to_blocked_inst = network.get_primitive("reorder");
    auto to_blocked_impl = to_blocked_inst->get_impl();
    ASSERT_TRUE(to_blocked_impl != nullptr);
    ASSERT_TRUE(to_blocked_impl->is_dynamic());

    auto to_planar_inst = network.get_primitive("output_reorder");
    auto to_planar_impl = to_planar_inst->get_impl();
    ASSERT_TRUE(to_planar_impl != nullptr);
    ASSERT_TRUE(to_planar_impl->is_dynamic());

    network.set_input_data("input", input_mem);

    auto outputs = network.execute();
    ASSERT_EQ(outputs.size(), size_t(1));
    ASSERT_EQ(outputs.begin()->first, "output_reorder");

    auto output_mem = outputs.begin()->second.get_memory();
    ASSERT_TRUE(output_mem->get_layout().format == format::bfyx);
    auto out_layout = output_mem->get_layout();
    auto expected_shape = ov::PartialShape(in_shape);
    ASSERT_EQ(out_layout.get_partial_shape(), expected_shape);

    // Input values after relu (the single -0.5 becomes 0), in planar bfyx order.
    const float expected[16] = {
        1.f, 0.f,
        5.f, 1.5f,

        2.f, 0.f,
        6.f, 5.2f,

        3.f, 0.5f,
        7.f, 12.f,

        4.f, 0.f,
        8.f, 8.f
    };

    cldnn::mem_lock<float> output_ptr(output_mem, get_test_stream());
    // f16 round trip: compare with a loose 1e-2 tolerance.
    for (int idx = 0; idx < 16; ++idx) {
        ASSERT_NEAR(expected[idx], output_ptr[idx], 1e-2f);
    }
}

TEST(reorder_gpu_f32, basic_yxfb_to_bfzyx)
{
// Input : yxfb:2x2x2x2
Expand Down

0 comments on commit 42ac61b

Please sign in to comment.