[GPU] group normalization optimization to reduce 5 kernels to 3 kernels (#28339)

### Details:
The two extra kernels used to calculate variance can be eliminated by computing the variance as mean(pow(input_data, 2)) - pow(mean, 2). This avoids reading the input buffer twice and running a nearly identical calculation separately for mean and variance.

Achieves about a 30% performance improvement.
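
Below is a minimal, standalone C sketch (not part of the commit) of the formula the reworked kernel relies on: accumulate the sum and the sum of squares in a single read of the data, then derive the variance as mean(x^2) - mean(x)^2.

```c
/* One-pass mean/variance sketch: variance = E[x^2] - (E[x])^2.
 * Illustrative only; the actual kernels accumulate per work-item and
 * reduce across the work-group in local memory. */
#include <stdio.h>

int main(void) {
    const float data[] = {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f};
    const int n = (int)(sizeof(data) / sizeof(data[0]));

    float sum = 0.f, sqr_sum = 0.f;
    for (int i = 0; i < n; ++i) {        /* single read of the input */
        sum     += data[i];
        sqr_sum += data[i] * data[i];
    }

    const float mean     = sum / n;
    const float variance = sqr_sum / n - mean * mean;  /* E[x^2] - mean^2 */

    printf("mean = %f, variance = %f\n", mean, variance);  /* 4.5, 5.25 */
    return 0;
}
```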

### Tickets:
CVS-158816
clee30 authored Jan 13, 2025
1 parent 22922a2 commit e5426ec
Showing 4 changed files with 83 additions and 386 deletions.
@@ -5,12 +5,13 @@
#include "include/batch_headers/fetch_data.cl"
#include "include/batch_headers/sub_group_block_read.cl"

#ifdef GROUP_NORM_KERNEL_FEATURE_MEAN
#ifdef GROUP_NORM_KERNEL_FEATURE_MEAN_SQR_MEAN
REQD_SUB_GROUP_SIZE(SIMD)
KERNEL(calc_mean_per_feature)(
KERNEL(calc_mean_sqr_mean_per_feature)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
__global ACCUMULATOR_TYPE* internal_mean
__global ACCUMULATOR_TYPE* internal_mean,
__global ACCUMULATOR_TYPE* internal_variance
) {
const uint data_set_idx = get_global_id(1); // batch * feature split
const uint in_data_set_idx = get_global_id(0);
@@ -25,117 +26,49 @@ KERNEL(calc_mean_per_feature)(
const uint data_set_offset = INPUT0_GET_INDEX(b, f_base, 0, 0);
const uint my_data_offset = data_set_offset + in_data_set_idx;

__local ACCUMULATOR_TYPE mean_per_feature[SLM_SIZE];
__local ACCUMULATOR_TYPE sum_per_feature[SLM_SIZE];
__local ACCUMULATOR_TYPE sqr_sum_per_feature[SLM_SIZE];

ACCUMULATOR_TYPE mean = ACCUMULATOR_VAL_ZERO;
ACCUMULATOR_TYPE sum = ACCUMULATOR_VAL_ZERO;
ACCUMULATOR_TYPE sqr_sum = ACCUMULATOR_VAL_ZERO;

for (uint i = 0; i < items_num; ++i) {
mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]);
ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]);
sum += data;
sqr_sum += data * data;
}

if (in_data_set_idx < leftovers) {
mean += TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]);
ACCUMULATOR_TYPE data = TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]);
sum += data;
sqr_sum += data * data;
}

mean_per_feature[in_data_set_idx] = mean;
sum_per_feature[in_data_set_idx] = sum;
sqr_sum_per_feature[in_data_set_idx] = sqr_sum;
const uint num_local_workers = LWS0;
const uint worker_block_idx = in_data_set_idx / FSV;
uint reduce_add_level = 1;
while ((SLM_SIZE / FSV) > reduce_add_level) {
barrier(CLK_LOCAL_MEM_FENCE);
if (worker_block_idx % (reduce_add_level * 2) == 0 && (in_data_set_idx + FSV * reduce_add_level) < num_local_workers) {
mean_per_feature[in_data_set_idx] += mean_per_feature[in_data_set_idx + FSV * reduce_add_level];
sum_per_feature[in_data_set_idx] += sum_per_feature[in_data_set_idx + FSV * reduce_add_level];
sqr_sum_per_feature[in_data_set_idx] += sqr_sum_per_feature[in_data_set_idx + FSV * reduce_add_level];
}
reduce_add_level *= 2;
}

if (worker_block_idx == 0 && (f_base + in_data_set_idx) < INPUT0_FEATURE_NUM) {
mean = mean_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
ACCUMULATOR_TYPE mean = sum_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
ACCUMULATOR_TYPE variance = sqr_sum_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
uint bf = b * INPUT0_FEATURE_NUM + f_base + in_data_set_idx;
internal_mean[bf] = mean;
}
}
#elif GROUP_NORM_KERNEL_GROUP_MEAN
KERNEL(calc_mean_per_group)(
__global ACCUMULATOR_TYPE* internal_mean
) {
const uint data_idx = get_global_id(0) + get_global_id(1) * GWS0;
const uint num_workers = LWS0;
const uint group_size = GWS0 / NUM_GROUPS;
const uint items_num = group_size / num_workers;

if ((data_idx % group_size) < num_workers) {
ACCUMULATOR_TYPE my_sum = ACCUMULATOR_VAL_ZERO;
for (uint i = 0; i < items_num; ++i) {
my_sum += internal_mean[data_idx + num_workers * i];
}

ACCUMULATOR_TYPE mean = work_group_reduce_add(my_sum);
mean /= TO_ACCUMULATOR_TYPE(group_size);
for (uint i = 0; i < items_num; ++i) {
internal_mean[data_idx + num_workers * i] = mean;
}
}
}
#elif GROUP_NORM_KERNEL_FEATURE_VAR
REQD_SUB_GROUP_SIZE(SIMD)
KERNEL(calc_var_per_feature)(
OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
const __global ACCUMULATOR_TYPE* internal_mean,
__global ACCUMULATOR_TYPE* internal_variance
) {
const uint data_set_idx = get_global_id(1); // batch * feature split
const uint in_data_set_idx = get_global_id(0);
const uint workers_per_dataset = LWS0 / FSV; // 16 datasets are handled by one local workgroup
const uint data_set_size = INPUT0_SIZE_X * INPUT0_SIZE_Y;
const uint items_num = data_set_size / workers_per_dataset;
const uint leftovers = data_set_size - (items_num * workers_per_dataset);

const uint INPUT0_ALIGNED_FEATURE_NUM = ALIGN(INPUT0_FEATURE_NUM, FSV);
const uint b = (data_set_idx * FSV) / INPUT0_ALIGNED_FEATURE_NUM;
const uint f_base = (data_set_idx * FSV) % INPUT0_ALIGNED_FEATURE_NUM;
const uint data_set_offset = INPUT0_GET_INDEX(b, f_base, 0, 0);
const uint my_data_offset = data_set_offset + in_data_set_idx;

__local ACCUMULATOR_TYPE var_per_feature[SLM_SIZE];

uint bf = b * INPUT0_FEATURE_NUM + f_base + get_sub_group_local_id();

ACCUMULATOR_TYPE mean = internal_mean[bf];
ACCUMULATOR_TYPE variance = ACCUMULATOR_VAL_ZERO;

for (uint i = 0; i < items_num; ++i) {
ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + i * workers_per_dataset * FSV]);
tmp -= mean;
variance = fma(tmp, tmp, variance);
}

if (in_data_set_idx < leftovers) {
ACCUMULATOR_TYPE tmp = TO_ACCUMULATOR_TYPE(input[my_data_offset + items_num * workers_per_dataset * FSV + in_data_set_idx]);
tmp -= mean;
variance = fma(tmp, tmp, variance);
}

var_per_feature[in_data_set_idx] = variance;
const uint num_local_workers = LWS0;
const uint worker_block_idx = in_data_set_idx / FSV;
uint reduce_add_level = 1;
while ((SLM_SIZE / FSV) > reduce_add_level) {
barrier(CLK_LOCAL_MEM_FENCE);
if (worker_block_idx % (reduce_add_level * 2) == 0 && (in_data_set_idx + FSV * reduce_add_level) < num_local_workers) {
var_per_feature[in_data_set_idx] += var_per_feature[in_data_set_idx + FSV * reduce_add_level];
}
reduce_add_level *= 2;
}

if (worker_block_idx == 0 && (f_base + get_sub_group_local_id()) < INPUT0_FEATURE_NUM) {
variance = var_per_feature[in_data_set_idx] / TO_ACCUMULATOR_TYPE(data_set_size);
internal_variance[bf] = variance;
}
}
#elif GROUP_NORM_KERNEL_GROUP_VAR
KERNEL(calc_var_per_group)(
#elif GROUP_NORM_KERNEL_GROUP_MEAN_VARIANCE
KERNEL(calc_mean_variance_per_group)(
__global ACCUMULATOR_TYPE* internal_mean,
__global ACCUMULATOR_TYPE* internal_variance
) {
const uint data_idx = get_global_id(0) + get_global_id(1) * GWS0;
@@ -144,15 +77,21 @@ KERNEL(calc_var_per_group)(
const uint items_num = group_size / num_workers;

if ((data_idx % group_size) < num_workers) {
ACCUMULATOR_TYPE my_variance = ACCUMULATOR_VAL_ZERO;
ACCUMULATOR_TYPE mean_sum = ACCUMULATOR_VAL_ZERO;
ACCUMULATOR_TYPE variance_sum = ACCUMULATOR_VAL_ZERO;
for (uint i = 0; i < items_num; ++i) {
my_variance += internal_variance[data_idx + num_workers * i];
mean_sum += internal_mean[data_idx + num_workers * i];
variance_sum += internal_variance[data_idx + num_workers * i];
}

ACCUMULATOR_TYPE variance = work_group_reduce_add(my_variance);
ACCUMULATOR_TYPE mean = work_group_reduce_add(mean_sum);
ACCUMULATOR_TYPE variance = work_group_reduce_add(variance_sum);
mean /= TO_ACCUMULATOR_TYPE(group_size);
variance /= TO_ACCUMULATOR_TYPE(group_size);
variance -= mean * mean;
variance = native_powr(variance + TO_ACCUMULATOR_TYPE(EPSILON), -0.5f);
for (uint i = 0; i < items_num; ++i) {
internal_mean[data_idx + num_workers * i] = mean;
internal_variance[data_idx + num_workers * i] = variance;
}
}
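
For context, a hedged sketch (not part of this diff) of how the values written above would be consumed: calc_mean_variance_per_group stores rsqrt(variance + EPSILON) into internal_variance, so a final normalization kernel, assuming the standard group-norm formula, only needs a subtract and multiplies per element instead of a divide and a square root.

```c
/* Assumed downstream use of the stored mean and inverse standard deviation.
 * The values below are made-up examples; scale/bias stand in for the
 * per-channel gamma/beta of group normalization. */
#include <math.h>
#include <stdio.h>

int main(void) {
    const float mean = 4.5f, variance = 5.25f, epsilon = 1e-5f;
    const float scale = 1.0f, bias = 0.0f;

    /* what the kernel above stores: (variance + EPSILON)^(-0.5) */
    const float inv_std = 1.0f / sqrtf(variance + epsilon);

    const float x = 3.0f;
    const float y = (x - mean) * inv_std * scale + bias;  /* normalization */

    printf("normalized value = %f\n", y);
    return 0;
}
```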