From 1522455f01d52072854516ba790fa0a35274321d Mon Sep 17 00:00:00 2001
From: Chon Ming Lee
Date: Tue, 14 Jan 2025 15:09:17 +0800
Subject: [PATCH] [GPU] group normalization optimization to reduce 5 kernels
 to 3 kernels (#28339)

### Details:
The two extra kernels used to calculate variance can be eliminated by
computing variance as `mean(pow(input_data, 2)) - pow(mean, 2)`, i.e.
Var(x) = E[x^2] - (E[x])^2. This avoids reading the input buffer twice and
performing nearly identical calculations for mean and variance, and achieves
about a 30% performance improvement.

### Tickets:
CVS-158816
---
 .../group_normalization_gpu_b_fs_yx_fsv16.cl  | 125 +++--------
 .../group_normalization_gpu_bfyx_opt.cl       | 208 +++----------------
 ...oup_normalization_kernel_b_fs_yx_fsv16.cpp |  68 +------
 .../group_normalization_kernel_bfyx_opt.cpp   |  68 +------
 4 files changed, 83 insertions(+), 386 deletions(-)

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_b_fs_yx_fsv16.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_b_fs_yx_fsv16.cl
index 475b067d85380f..4ccfd62e775720 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_b_fs_yx_fsv16.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_b_fs_yx_fsv16.cl
@@ -5,12 +5,13 @@
 #include "include/batch_headers/fetch_data.cl"
 #include "include/batch_headers/sub_group_block_read.cl"
 
-#ifdef GROUP_NORM_KERNEL_FEATURE_MEAN
+#ifdef GROUP_NORM_KERNEL_FEATURE_MEAN_SQR_MEAN
 REQD_SUB_GROUP_SIZE(SIMD)
-KERNEL(calc_mean_per_feature)(
+KERNEL(calc_mean_sqr_mean_per_feature)(
     OPTIONAL_SHAPE_INFO_ARG
     const __global INPUT0_TYPE* input,
-    __global ACCUMULATOR_TYPE* internal_mean
+    __global ACCUMULATOR_TYPE* internal_mean,
+    __global ACCUMULATOR_TYPE* internal_variance
 ) {
     const uint data_set_idx = get_global_id(1);     // batch * feature split
     const uint in_data_set_idx = get_global_id(0);
@@ -25,117 +26,49 @@ KERNEL(calc_mean_per_feature)(
     const uint data_set_offset = INPUT0_GET_INDEX(b, f_base, 0, 0);
     const uint my_data_offset = data_set_offset + in_data_set_idx;
 
-    __local ACCUMULATOR_TYPE mean_per_feature[SLM_SIZE];
+    __local ACCUMULATOR_TYPE sum_per_feature[SLM_SIZE];
+    __local ACCUMULATOR_TYPE sqr_sum_per_feature[SLM_SIZE];
 
-    ACCUMULATOR_TYPE mean = ACCUMULATOR_VAL_ZERO;
+    ACCUMULATOR_TYPE sum = ACCUMULATOR_VAL_ZERO;
+    ACCUMULATOR_TYPE sqr_sum = ACCUMULATOR_VAL_ZERO;
 
     for (uint i = 0; i < items_num; ++i) {
-        mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]);
+        ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]);
+        sum += data;
+        sqr_sum += data * data;
     }
 
     if (in_data_set_idx < leftovers) {
-        mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]);
+        ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]);
+        sum += data;
+        sqr_sum += data * data;
     }
 
-    mean_per_feature[in_data_set_idx] = mean;
+    sum_per_feature[in_data_set_idx] = sum;
+    sqr_sum_per_feature[in_data_set_idx] = sqr_sum;
     const uint num_local_workers = LWS0;
     const uint worker_block_idx = in_data_set_idx / FSV;
     uint reduce_add_level = 1;
     while ((SLM_SIZE / FSV) > reduce_add_level) {
         barrier(CLK_LOCAL_MEM_FENCE);
         if (worker_block_idx % (reduce_add_level * 2) == 0 && (in_data_set_idx + FSV * reduce_add_level) < num_local_workers) {
-            mean_per_feature[in_data_set_idx] += mean_per_feature[in_data_set_idx + FSV * reduce_add_level];
+            sum_per_feature[in_data_set_idx] += sum_per_feature[in_data_set_idx + FSV * reduce_add_level];
+            sqr_sum_per_feature[in_data_set_idx] += sqr_sum_per_feature[in_data_set_idx + FSV * reduce_add_level];
         }
         reduce_add_level *= 2;
     }
 
     if (worker_block_idx == 0 && (f_base + in_data_set_idx) < INPUT0_FEATURE_NUM) {
-        mean = mean_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
+        ACCUMULATOR_TYPE mean = sum_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
+        ACCUMULATOR_TYPE variance = sqr_sum_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
         uint bf = b * INPUT0_FEATURE_NUM + f_base + in_data_set_idx;
         internal_mean[bf] = mean;
-    }
-}
-#elif GROUP_NORM_KERNEL_GROUP_MEAN
-KERNEL(calc_mean_per_group)(
-    __global ACCUMULATOR_TYPE* internal_mean
-) {
-    const uint data_idx = get_global_id(0) + get_global_id(1) * GWS0;
-    const uint num_workers = LWS0;
-    const uint group_size = GWS0 / NUM_GROUPS;
-    const uint items_num = group_size / num_workers;
-
-    if ((data_idx % group_size) < num_workers) {
-        ACCUMULATOR_TYPE my_sum = ACCUMULATOR_VAL_ZERO;
-        for (uint i = 0; i < items_num; ++i) {
-            my_sum += internal_mean[data_idx + num_workers * i];
-        }
-
-        ACCUMULATOR_TYPE mean = work_group_reduce_add(my_sum);
-        mean /= TO_ACCUMULATOR_TYPE(group_size);
-        for (uint i = 0; i < items_num; ++i) {
-            internal_mean[data_idx + num_workers * i] = mean;
-        }
-    }
-}
-#elif GROUP_NORM_KERNEL_FEATURE_VAR
-REQD_SUB_GROUP_SIZE(SIMD)
-KERNEL(calc_var_per_feature)(
-    OPTIONAL_SHAPE_INFO_ARG
-    const __global INPUT0_TYPE* input,
-    const __global ACCUMULATOR_TYPE* internal_mean,
-    __global ACCUMULATOR_TYPE* internal_variance
-) {
-    const uint data_set_idx = get_global_id(1);     // batch * feature split
-    const uint in_data_set_idx = get_global_id(0);
-    const uint workers_per_dataset = LWS0 / FSV;    // 16 datasets are handled by one local workgroup
-    const uint data_set_size = INPUT0_SIZE_X * INPUT0_SIZE_Y;
-    const uint items_num = data_set_size / workers_per_dataset;
-    const uint leftovers = data_set_size - (items_num * workers_per_dataset);
-
-    const uint INPUT0_ALIGNED_FEATURE_NUM = ALIGN(INPUT0_FEATURE_NUM, FSV);
-    const uint b = (data_set_idx * FSV) / INPUT0_ALIGNED_FEATURE_NUM;
-    const uint f_base = (data_set_idx * FSV) % INPUT0_ALIGNED_FEATURE_NUM;
-    const uint data_set_offset = INPUT0_GET_INDEX(b, f_base, 0, 0);
-    const uint my_data_offset = data_set_offset + in_data_set_idx;
-
-    __local ACCUMULATOR_TYPE var_per_feature[SLM_SIZE];
-
-    uint bf = b * INPUT0_FEATURE_NUM + f_base + get_sub_group_local_id();
-
-    ACCUMULATOR_TYPE mean = internal_mean[bf];
-    ACCUMULATOR_TYPE variance = ACCUMULATOR_VAL_ZERO;
-
-    for (uint i = 0; i < items_num; ++i) {
-        ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]);
-        tmp -= mean;
-        variance = fma(tmp, tmp, variance);
-    }
-
-    if (in_data_set_idx < leftovers) {
-        ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]);
-        tmp -= mean;
-        variance = fma(tmp, tmp, variance);
-    }
-
-    var_per_feature[in_data_set_idx] = variance;
-    const uint num_local_workers = LWS0;
-    const uint worker_block_idx = in_data_set_idx / FSV;
-    uint reduce_add_level = 1;
-    while ((SLM_SIZE / FSV) > reduce_add_level) {
-        barrier(CLK_LOCAL_MEM_FENCE);
-        if (worker_block_idx % (reduce_add_level * 2) == 0 && (in_data_set_idx + FSV * reduce_add_level) < num_local_workers) {
-            var_per_feature[in_data_set_idx] += var_per_feature[in_data_set_idx + FSV * reduce_add_level];
-        }
-
-        reduce_add_level *= 2;
-    }
-
-    if (worker_block_idx == 0 && (f_base + get_sub_group_local_id()) < INPUT0_FEATURE_NUM) {
-        variance = var_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
         internal_variance[bf] = variance;
     }
 }
-#elif GROUP_NORM_KERNEL_GROUP_VAR
-KERNEL(calc_var_per_group)(
+#elif GROUP_NORM_KERNEL_GROUP_MEAN_VARIANCE
+KERNEL(calc_mean_variance_per_group)(
+    __global ACCUMULATOR_TYPE* internal_mean,
     __global ACCUMULATOR_TYPE* internal_variance
 ) {
     const uint data_idx = get_global_id(0) + get_global_id(1) * GWS0;
@@ -144,15 +77,21 @@ KERNEL(calc_var_per_group)(
     const uint items_num = group_size / num_workers;
 
     if ((data_idx % group_size) < num_workers) {
-        ACCUMULATOR_TYPE my_variance = ACCUMULATOR_VAL_ZERO;
+        ACCUMULATOR_TYPE mean_sum = ACCUMULATOR_VAL_ZERO;
+        ACCUMULATOR_TYPE variance_sum = ACCUMULATOR_VAL_ZERO;
         for (uint i = 0; i < items_num; ++i) {
-            my_variance += internal_variance[data_idx + num_workers * i];
+            mean_sum += internal_mean[data_idx + num_workers * i];
+            variance_sum += internal_variance[data_idx + num_workers * i];
         }
 
-        ACCUMULATOR_TYPE variance = work_group_reduce_add(my_variance);
+        ACCUMULATOR_TYPE mean = work_group_reduce_add(mean_sum);
+        ACCUMULATOR_TYPE variance = work_group_reduce_add(variance_sum);
+        mean /= TO_ACCUMULATOR_TYPE(group_size);
         variance /= TO_ACCUMULATOR_TYPE(group_size);
+        variance -= mean * mean;
         variance = native_powr(variance + TO_ACCUMULATOR_TYPE(EPSILON), -0.5f);
         for (uint i = 0; i < items_num; ++i) {
+            internal_mean[data_idx + num_workers * i] = mean;
             internal_variance[data_idx + num_workers * i] = variance;
         }
     }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_bfyx_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_bfyx_opt.cl
index 96a91bdc60b7d7..983632957b4d8e 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_bfyx_opt.cl
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/group_normalization_gpu_bfyx_opt.cl
@@ -5,14 +5,15 @@
 #include "include/batch_headers/fetch_data.cl"
 #include "include/batch_headers/sub_group_block_read.cl"
 
-#ifdef GROUP_NORM_KERNEL_FEATURE_MEAN
+#ifdef GROUP_NORM_KERNEL_FEATURE_MEAN_SQR_MEAN
 #if !IS_DYNAMIC
 __attribute__((reqd_work_group_size(LWS0, LWS1, LWS2)))
 #endif
-KERNEL(calc_mean_per_feature)(
+KERNEL(calc_mean_sqr_mean_per_feature)(
     OPTIONAL_SHAPE_INFO_ARG
     const __global INPUT0_TYPE* input,
-    __global ACCUMULATOR_TYPE* internal_mean
+    __global ACCUMULATOR_TYPE* internal_mean,
+    __global ACCUMULATOR_TYPE* internal_sqr_mean
 ) {
 #if INPUT0_DIMS == 5
     const uint bf = get_global_id(2) / LWS2;     // batch * feature
@@ -40,6 +41,7 @@ KERNEL(calc_mean_per_feature)(
     const uint x_leftover = INPUT0_SIZE_X - x_num_workers * x_block_size;
 
     ACCUMULATOR_TYPE mean = ACCUMULATOR_VAL_ZERO;
+    ACCUMULATOR_TYPE sqr_mean = ACCUMULATOR_VAL_ZERO;
 
 #if INPUT0_DIMS == 5
     for (uint z = z_base; z < (z_base + z_block_size); ++z) {
@@ -51,7 +53,9 @@ KERNEL(calc_mean_per_feature)(
             uint my_data_offset = INPUT0_GET_INDEX(b, f, y, x_base);
 #endif
             for (uint i = 0; i < x_block_size; ++i) {
-                mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
+                ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
+                mean += data;
+                sqr_mean += data * data;
             }
         }
 #if INPUT0_DIMS == 5
@@ -63,7 +67,9 @@ KERNEL(calc_mean_per_feature)(
         for (uint y = y_base; y < (y_base + y_block_size); ++y) {
             uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(2) + z_num_workers * z_block_size), y, x_base);
             for (uint i = 0; i < x_block_size; ++i) {
-                mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
+                ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
+                mean += data;
+                sqr_mean += data * data;
             }
         }
     }
@@ -77,7 +83,9 @@ KERNEL(calc_mean_per_feature)(
             uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(1) + y_num_workers * y_block_size), x_base);
 #endif
             for (uint i = 0; i < x_block_size; ++i) {
-                mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
+                ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
+                mean += data;
+                sqr_mean += data * data;
             }
 #if INPUT0_DIMS == 5
         }
@@ -94,7 +102,9 @@ KERNEL(calc_mean_per_feature)(
 #else
             uint my_data_offset = INPUT0_GET_INDEX(b, f, y, (get_local_id(0) + x_num_workers * x_block_size));
 #endif
-            mean += TO_ACCUMULATOR_TYPE(input[my_data_offset]);
+            ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset]);
+            mean += data;
+            sqr_mean += data * data;
         }
 #if INPUT0_DIMS == 5
     }
@@ -111,7 +121,9 @@ KERNEL(calc_mean_per_feature)(
         uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(1) + y_num_workers * y_block_size),
                                                      (get_local_id(0) + x_num_workers * x_block_size));
 #endif
-        mean += TO_ACCUMULATOR_TYPE(input[my_data_offset]);
+        ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset]);
+        mean += data;
+        sqr_mean += data * data;
     }
 
@@ -122,177 +134,21 @@ KERNEL(calc_mean_per_feature)(
     const uint worker_idx = get_local_linear_id();
 
     mean = work_group_reduce_add(mean);
+    sqr_mean = work_group_reduce_add(sqr_mean);
 
     if (worker_idx == 0) {
         mean = mean / TO_ACCUMULATOR_TYPE(INPUT0_SIZE_Z * INPUT0_SIZE_Y * INPUT0_SIZE_X);
+        sqr_mean = sqr_mean / TO_ACCUMULATOR_TYPE(INPUT0_SIZE_Z * INPUT0_SIZE_Y * INPUT0_SIZE_X);
         internal_mean[bf] = mean;
+        internal_sqr_mean[bf] = sqr_mean;
     }
 }
-#elif GROUP_NORM_KERNEL_GROUP_MEAN
+#elif GROUP_NORM_KERNEL_GROUP_MEAN_VARIANCE
 #if !IS_DYNAMIC
 __attribute__((reqd_work_group_size(LWS0, LWS1, LWS2)))
 #endif
-KERNEL(calc_mean_per_group)(
-    __global ACCUMULATOR_TYPE* internal_mean
-) {
-    const uint data_idx = get_global_id(0) + get_global_id(1) * GWS0;
-    const uint num_workers = LWS0;
-    const uint group_size = GWS0 / NUM_GROUPS;
-    const uint items_num = group_size / num_workers;
-
-    if ((data_idx % group_size) < num_workers) {
-        ACCUMULATOR_TYPE my_sum = ACCUMULATOR_VAL_ZERO;
-        for (uint i = 0; i < items_num; ++i) {
-            my_sum += internal_mean[data_idx + num_workers * i];
-        }
-
-        ACCUMULATOR_TYPE mean = work_group_reduce_add(my_sum);
-        mean /= TO_ACCUMULATOR_TYPE(group_size);
-        for (uint i = 0; i < items_num; ++i) {
-            internal_mean[data_idx + num_workers * i] = mean;
-        }
-    }
-}
-#elif GROUP_NORM_KERNEL_FEATURE_VAR
-#if !IS_DYNAMIC
-__attribute__((reqd_work_group_size(LWS0, LWS1, LWS2)))
-#endif
-KERNEL(calc_var_per_feature)(
-    OPTIONAL_SHAPE_INFO_ARG
-    const __global INPUT0_TYPE* input,
-    const __global ACCUMULATOR_TYPE* internal_mean,
-    __global ACCUMULATOR_TYPE* internal_variance
-) {
-#if INPUT0_DIMS == 5
-    const uint bf = get_global_id(2) / LWS2;     // batch * feature
-#else
-    const uint bf = get_global_id(2);     // batch * feature
-#endif
-    const uint b = bf / INPUT0_FEATURE_NUM;
-    const uint f = bf % INPUT0_FEATURE_NUM;
-#if INPUT0_DIMS == 5
-    const uint z_num_workers = LWS2;
-#endif
-    const uint y_num_workers = LWS1;
-    const uint x_num_workers = LWS0;
-#if INPUT0_DIMS == 5
-    const uint z_block_size = INPUT0_SIZE_Z / z_num_workers;
-    const uint z_base = get_local_id(2) * z_block_size;
-    const uint z_leftover = INPUT0_SIZE_Z - z_num_workers * z_block_size;
-#endif
-    const uint y_block_size = INPUT0_SIZE_Y / y_num_workers;
-    const uint y_base = get_local_id(1) * y_block_size;
-    const uint y_leftover = INPUT0_SIZE_Y - y_num_workers * y_block_size;
-
-    const uint x_block_size = INPUT0_SIZE_X / x_num_workers;
-    const uint x_base = get_local_id(0);
-    const uint x_leftover = INPUT0_SIZE_X - x_num_workers * x_block_size;
-
-    const ACCUMULATOR_TYPE mean = internal_mean[bf];
-    ACCUMULATOR_TYPE variance = ACCUMULATOR_VAL_ZERO;
-
-#if INPUT0_DIMS == 5
-    for (uint z = z_base; z < (z_base + z_block_size); ++z) {
-#endif
-    for (uint y = y_base; y < (y_base + y_block_size); ++y) {
-#if INPUT0_DIMS == 5
-        uint my_data_offset = INPUT0_GET_INDEX(b, f, z, y, x_base);
-#else
-        uint my_data_offset = INPUT0_GET_INDEX(b, f, y, x_base);
-#endif
-        for (uint i = 0; i < x_block_size; ++i) {
-            ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
-            tmp -= mean;
-            variance = fma(tmp, tmp, variance);
-        }
-    }
-#if INPUT0_DIMS == 5
-    }
-#endif
-
-#if INPUT0_DIMS == 5
-    if (get_local_id(2) < z_leftover) {
-        for (uint y = y_base; y < (y_base + y_block_size); ++y) {
-            uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(2) + z_num_workers * z_block_size), y, x_base);
-            for (uint i = 0; i < x_block_size; ++i) {
-                ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
-                tmp -= mean;
-                variance = fma(tmp, tmp, variance);
-            }
-        }
-    }
-#endif
-
-    if (get_local_id(1) < y_leftover) {
-#if INPUT0_DIMS == 5
-        for (uint z = z_base; z < (z_base + z_block_size); ++z) {
-            uint my_data_offset = INPUT0_GET_INDEX(b, f, z, (get_local_id(1) + y_num_workers * y_block_size), x_base);
-#else
-        uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(1) + y_num_workers * y_block_size), x_base);
-#endif
-        for (uint i = 0; i < x_block_size; ++i) {
-            ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * x_num_workers]);
-            tmp -= mean;
-            variance = fma(tmp, tmp, variance);
-        }
-#if INPUT0_DIMS == 5
-        }
-#endif
-    }
-
-    if (get_local_id(0) < x_leftover) {
-#if INPUT0_DIMS == 5
-        for (uint z = z_base; z < (z_base + z_block_size); ++z) {
-#endif
-        for (uint y = y_base; y < (y_base + y_block_size); ++y) {
-#if INPUT0_DIMS == 5
-            uint my_data_offset = INPUT0_GET_INDEX(b, f, z, y, (get_local_id(0) + x_num_workers * x_block_size));
-#else
-            uint my_data_offset = INPUT0_GET_INDEX(b, f, y, (get_local_id(0) + x_num_workers * x_block_size));
-#endif
-            ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset]);
-            tmp -= mean;
-            variance = fma(tmp, tmp, variance);
-        }
-#if INPUT0_DIMS == 5
-        }
-#endif
-    }
-
-#if INPUT0_DIMS == 5
-    if (get_local_id(2) < z_leftover && get_local_id(1) < y_leftover && get_local_id(0) < x_leftover) {
-        uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(2) + z_num_workers * z_block_size),
-                                               (get_local_id(1) + y_num_workers * y_block_size),
-                                               (get_local_id(0) + x_num_workers * x_block_size));
-#else
-    if (get_local_id(1) < y_leftover && get_local_id(0) < x_leftover) {
-        uint my_data_offset = INPUT0_GET_INDEX(b, f, (get_local_id(1) + y_num_workers * y_block_size),
-                                               (get_local_id(0) + x_num_workers * x_block_size));
-#endif
-        ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset]);
-        tmp -= mean;
-        variance = fma(tmp, tmp, variance);
-    }
-
-#if INPUT0_DIMS == 5
-    const uint num_local_workers = z_num_workers * y_num_workers * x_num_workers;
-#else
-    const uint num_local_workers = y_num_workers * x_num_workers;
-#endif
-    const uint worker_idx = get_local_linear_id();
-
-    variance = work_group_reduce_add(variance);
-
-    if (worker_idx == 0) {
-        variance = variance / TO_ACCUMULATOR_TYPE(INPUT0_SIZE_Z * INPUT0_SIZE_Y * INPUT0_SIZE_X);
-        internal_variance[bf] = variance;
-    }
-}
-#elif GROUP_NORM_KERNEL_GROUP_VAR
-#if !IS_DYNAMIC
-__attribute__((reqd_work_group_size(LWS0, LWS1, LWS2)))
-#endif
-KERNEL(calc_var_per_group)(
+KERNEL(calc_mean_variance_per_group)(
+    __global ACCUMULATOR_TYPE* internal_mean,
     __global ACCUMULATOR_TYPE* internal_variance
 ) {
     const uint data_idx = get_global_id(0) + get_global_id(1) * GWS0;
@@ -301,15 +157,21 @@ KERNEL(calc_var_per_group)(
     const uint items_num = group_size / num_workers;
 
     if ((data_idx % group_size) < num_workers) {
-        ACCUMULATOR_TYPE my_variance = ACCUMULATOR_VAL_ZERO;
+        ACCUMULATOR_TYPE mean_sum = ACCUMULATOR_VAL_ZERO;
+        ACCUMULATOR_TYPE variance_sum = ACCUMULATOR_VAL_ZERO;
         for (uint i = 0; i < items_num; ++i) {
-            my_variance += internal_variance[data_idx + num_workers * i];
+            mean_sum += internal_mean[data_idx + num_workers * i];
+            variance_sum += internal_variance[data_idx + num_workers * i];
         }
 
-        ACCUMULATOR_TYPE variance = work_group_reduce_add(my_variance);
+        ACCUMULATOR_TYPE mean = work_group_reduce_add(mean_sum);
+        ACCUMULATOR_TYPE variance = work_group_reduce_add(variance_sum);
+        mean /= TO_ACCUMULATOR_TYPE(group_size);
         variance /= TO_ACCUMULATOR_TYPE(group_size);
+        variance -= mean * mean;
         variance = native_powr(variance + TO_ACCUMULATOR_TYPE(EPSILON), -0.5f);
         for (uint i = 0; i < items_num; ++i) {
+            internal_mean[data_idx + num_workers * i] = mean;
             internal_variance[data_idx + num_workers * i] = variance;
         }
     }
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.cpp
index 0e018c542808ce..3ead6ab5eb5306 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_b_fs_yx_fsv16.cpp
@@ -153,18 +153,10 @@ void GroupNormalizationKernel_b_fs_yx_fsv16::GetUpdateDispatchDataFunc(KernelDat
         kd.kernels[1].params.workGroups.local = dispatchData.stage_2.lws;
         kd.kernels[1].skip_execution = KernelData::SkipKernelExecution(prim_params, 1);
 
-        kd.kernels[2].params.workGroups.global = dispatchData.stage_1.gws;
-        kd.kernels[2].params.workGroups.local = dispatchData.stage_1.lws;
+        kd.kernels[2].params.workGroups.global = dispatchData.stage_final.gws;
+        kd.kernels[2].params.workGroups.local = dispatchData.stage_final.lws;
         kd.kernels[2].skip_execution = KernelData::SkipKernelExecution(prim_params, 2);
 
-        kd.kernels[3].params.workGroups.global = dispatchData.stage_2.gws;
-        kd.kernels[3].params.workGroups.local = dispatchData.stage_2.lws;
-        kd.kernels[3].skip_execution = KernelData::SkipKernelExecution(prim_params, 3);
-
-        kd.kernels[4].params.workGroups.global = dispatchData.stage_final.gws;
-        kd.kernels[4].params.workGroups.local = dispatchData.stage_final.lws;
-        kd.kernels[4].skip_execution = KernelData::SkipKernelExecution(prim_params, 4);
-
         kd.internalBufferSizes.clear();
         kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4);
         kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4);
@@ -203,7 +195,7 @@ KernelsData GroupNormalizationKernel_b_fs_yx_fsv16::GetKernelsData(const Params
 
     MultiDispatchData dispatchData = SetDefault(prim_params);
 
-    KernelData kd = KernelData::Default(params, 5);
+    KernelData kd = KernelData::Default(params, 3);
     kd.internalBufferDataType = GetAccumulatorType(prim_params);
 
     GetUpdateDispatchDataFunc(kd);
@@ -213,7 +205,7 @@ KernelsData GroupNormalizationKernel_b_fs_yx_fsv16::GetKernelsData(const Params
     {
         // Mean first stage
         auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_1);
-        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_MEAN", 1));
+        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_MEAN_SQR_MEAN", 1));
         auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
         auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
         auto& kernel = kd.kernels[0];
@@ -231,14 +223,16 @@ KernelsData GroupNormalizationKernel_b_fs_yx_fsv16::GetKernelsData(const Params
                          0,
                          prim_params.is_shape_agnostic);
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
         if (!prim_params.has_dynamic_tensors()) {
             kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4);
+            kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4);
         }
     }
     {
         // Mean second stage
         auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_2);
-        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_MEAN", 1));
+        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_MEAN_VARIANCE", 1));
         auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
         auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
         auto& kernel = kd.kernels[1];
@@ -255,52 +249,6 @@ KernelsData GroupNormalizationKernel_b_fs_yx_fsv16::GetKernelsData(const Params
                          0);
         kernel.params.arguments.clear();
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
-    }
-    {
-        // Variance first stage
-        auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_1);
-        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_VAR", 1));
-        auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
-        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
-        auto& kernel = kd.kernels[2];
-        FillCLKernelData(kernel,
-                         dispatchData.stage_1,
-                         params.engineInfo,
-                         finalKernelName,
-                         jit,
-                         entry_point,
-                         "",
-                         false,
-                         false,
-                         1,
-                         0,
-                         0,
-                         prim_params.is_shape_agnostic);
-        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
-        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
-        if (!prim_params.has_dynamic_tensors()) {
-            kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * Align(prim_params.outputs[0].Feature().v, fsv) * 4);
-        }
-    }
-    {
-        // Variance second stage
-        auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_2);
-        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_VAR", 1));
-        auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
-        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
-        auto& kernel = kd.kernels[3];
-        FillCLKernelData(kernel,
-                         dispatchData.stage_2,
-                         params.engineInfo,
-                         finalKernelName,
-                         jit,
-                         entry_point,
-                         "",
-                         false,
-                         false,
-                         0,
-                         0);
-        kernel.params.arguments.clear();
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
     }
     {
@@ -309,7 +257,7 @@ KernelsData GroupNormalizationKernel_b_fs_yx_fsv16::GetKernelsData(const Params
         cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FINAL", 1));
         auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
         auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
-        auto& kernel = kd.kernels[4];
+        auto& kernel = kd.kernels[2];
         FillCLKernelData(kernel,
                          dispatchData.stage_final,
                          params.engineInfo,
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.cpp
index ab65d1ea192c22..0ba841551bdd32 100644
--- a/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.cpp
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/group_normalization/group_normalization_kernel_bfyx_opt.cpp
@@ -172,18 +172,10 @@ void GroupNormalizationKernelBfyx::GetUpdateDispatchDataFunc(KernelData& kd) con
         kd.kernels[1].params.workGroups.local = dispatchData.stage_2.lws;
         kd.kernels[1].skip_execution = KernelData::SkipKernelExecution(prim_params, 1);
 
-        kd.kernels[2].params.workGroups.global = dispatchData.stage_1.gws;
-        kd.kernels[2].params.workGroups.local = dispatchData.stage_1.lws;
+        kd.kernels[2].params.workGroups.global = dispatchData.stage_final.gws;
+        kd.kernels[2].params.workGroups.local = dispatchData.stage_final.lws;
         kd.kernels[2].skip_execution = KernelData::SkipKernelExecution(prim_params, 2);
 
-        kd.kernels[3].params.workGroups.global = dispatchData.stage_2.gws;
-        kd.kernels[3].params.workGroups.local = dispatchData.stage_2.lws;
-        kd.kernels[3].skip_execution = KernelData::SkipKernelExecution(prim_params, 3);
-
-        kd.kernels[4].params.workGroups.global = dispatchData.stage_final.gws;
-        kd.kernels[4].params.workGroups.local = dispatchData.stage_final.lws;
-        kd.kernels[4].skip_execution = KernelData::SkipKernelExecution(prim_params, 4);
-
         kd.internalBufferSizes.clear();
         kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4);
         kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4);
@@ -200,7 +192,7 @@ KernelsData GroupNormalizationKernelBfyx::GetKernelsData(const Params &params) c
 
     MultiDispatchData dispatchData = SetDefault(prim_params);
 
-    KernelData kd = KernelData::Default(params, 5);
+    KernelData kd = KernelData::Default(params, 3);
     kd.internalBufferDataType = GetAccumulatorType(prim_params);
 
     GetUpdateDispatchDataFunc(kd);
@@ -210,7 +202,7 @@ KernelsData GroupNormalizationKernelBfyx::GetKernelsData(const Params &params) c
     {
         // Mean first stage
         auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_1);
-        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_MEAN", 1));
+        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_MEAN_SQR_MEAN", 1));
         auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
         auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
         auto& kernel = kd.kernels[0];
@@ -228,14 +220,16 @@ KernelsData GroupNormalizationKernelBfyx::GetKernelsData(const Params &params) c
                          0,
                          prim_params.is_shape_agnostic);
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
+        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
         if (!prim_params.has_dynamic_tensors()) {
             kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4);
+            kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4);
         }
     }
     {
         // Mean second stage
         auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_2);
-        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_MEAN", 1));
+        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_MEAN_VARIANCE", 1));
         auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
         auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
         auto& kernel = kd.kernels[1];
@@ -252,52 +246,6 @@ KernelsData GroupNormalizationKernelBfyx::GetKernelsData(const Params &params) c
                          0);
         kernel.params.arguments.clear();
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
-    }
-    {
-        // Variance first stage
-        auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_1);
-        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FEATURE_VAR", 1));
-        auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
-        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
-        auto& kernel = kd.kernels[2];
-        FillCLKernelData(kernel,
-                         dispatchData.stage_1,
-                         params.engineInfo,
-                         finalKernelName,
-                         jit,
-                         entry_point,
-                         "",
-                         false,
-                         false,
-                         1,
-                         0,
-                         0,
-                         prim_params.is_shape_agnostic);
-        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
-        kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
-        if (!prim_params.has_dynamic_tensors()) {
-            kd.internalBufferSizes.push_back(prim_params.outputs[0].Batch().v * prim_params.outputs[0].Feature().v * 4);
-        }
-    }
-    {
-        // Variance second stage
-        auto cldnn_jit = GetJitConstants(prim_params, dispatchData.stage_2);
-        cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_GROUP_VAR", 1));
-        auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
-        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
-        auto& kernel = kd.kernels[3];
-        FillCLKernelData(kernel,
-                         dispatchData.stage_2,
-                         params.engineInfo,
-                         finalKernelName,
-                         jit,
-                         entry_point,
-                         "",
-                         false,
-                         false,
-                         0,
-                         0);
-        kernel.params.arguments.clear();
         kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
     }
     {
@@ -306,7 +254,7 @@ KernelsData GroupNormalizationKernelBfyx::GetKernelsData(const Params &params) c
         cldnn_jit.AddConstant(MakeJitConstant("GROUP_NORM_KERNEL_FINAL", 1));
         auto entry_point = GetEntryPoint(finalKernelName, prim_params.layerID, params, entry_part_id++);
        auto jit = CreateJit(finalKernelName, cldnn_jit, entry_point);
-        auto& kernel = kd.kernels[4];
+        auto& kernel = kd.kernels[2];
         FillCLKernelData(kernel,
                          dispatchData.stage_final,
                          params.engineInfo,
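---

Reviewer note: below is a minimal, standalone C++ sketch (not part of the patch; the data values and `eps` are illustrative) of the identity the fused kernels rely on. It accumulates `sum` and `sqr_sum` in a single pass, as `calc_mean_sqr_mean_per_feature` now does per work-item, then derives variance as `E[x^2] - (E[x])^2` and applies the same inverse-sqrt step the OpenCL code performs with `native_powr(variance + EPSILON, -0.5f)`. A two-pass reference is included to show both formulas agree.

```cpp
#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const std::vector<float> data = {0.5f, -1.25f, 2.0f, 0.75f, -0.5f, 1.5f};
    const float eps = 1e-5f;  // plays the role of EPSILON in the kernels

    // Single pass: accumulate sum and sum of squares (one read of the input).
    float sum = 0.0f, sqr_sum = 0.0f;
    for (float v : data) {
        sum += v;
        sqr_sum += v * v;
    }
    const float n = static_cast<float>(data.size());
    const float mean = sum / n;
    const float variance = sqr_sum / n - mean * mean;  // E[x^2] - (E[x])^2

    // Reference: classic two-pass variance, which the old 5-kernel version
    // effectively computed by re-reading the input once the mean was known.
    float two_pass = 0.0f;
    for (float v : data) {
        two_pass += (v - mean) * (v - mean);
    }
    two_pass /= n;

    // The final normalization kernel multiplies by rsqrt(variance + eps),
    // the value calc_mean_variance_per_group stores in internal_variance.
    const float inv_std = 1.0f / std::sqrt(variance + eps);
    std::printf("mean=%f one-pass var=%f two-pass var=%f inv_std=%f\n",
                mean, variance, two_pass, inv_std);
    return 0;
}
```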