Skip to content

Commit

Permalink
Add missing code in dynamic fc impl (#28026)
Browse files Browse the repository at this point in the history
### Details:
 - *Add acc_tmp in general calc in fc function in common include file*

### Tickets:
 - *158460*
  • Loading branch information
ahnyoung-paul authored Dec 23, 2024
1 parent 9f17ebf commit f93d051
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -601,8 +601,10 @@ inline void FUNC(fc_bf_tiled_kernel_default)(
#endif
#if TILE_OFM > 1
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds;
acc_tmp[bi][fi] = 0;
#else
acc[bi] += acc_tmp[bi] * ds;
acc_tmp[bi] = 0;
#endif
}
}
Expand Down Expand Up @@ -972,7 +974,7 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
// =====================================================================================================================================
// Main computation loop
const uint iterations = MAIN_LOOP_ELEMENTS_COUNT / TILE_IFM_ELEMENTS_SIZE; // TILE_IFM_ELEMENTS_SIZE : (TILE_IFM * SIMD)
// Each sub-group loads 2 Batch
// Each sub-group loads 2 Batch
uint idx_sglid = (sglid * TILE_K) % TILE_IFM_ELEMENTS_SIZE; // same index for sglid 0~7 : to tile_k direction
uint batch_sglid = (sglid * TILE_K) / TILE_IFM_ELEMENTS_SIZE; // 0 to 1 : to batch direction

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -147,9 +147,7 @@ inline void (FUNC_NAME)(
// NOTE: Manually unrolling multiplication loop leads to lower register pressure and allows for bigger block sizes,
// but significantly degrades readability and generality of code.
// It also doesn't show noticeable performance improvement on tested configurations.
#if DECOMPRESSION_SCALE_POST_OP
ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { };
#endif
ACCUMULATOR_VEC_TYPE acc_tmp[FORCED_TILE_B] = { };

unroll_for(uint ki = 0; ki < (TILE_IFM * SIMD) / TILE_K; ++ki) {
#if COMPRESSED_WEIGHTS_INT4
Expand Down Expand Up @@ -201,11 +199,7 @@ inline void (FUNC_NAME)(
unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) {
INPUT0_TYPE in_val = _sub_group_shuffle(((INPUT0_TYPE*)(&in_0[bi]))[total_k / SIMD], total_k % SIMD);
unroll_for (uint fi = 0; fi < TILE_OFM; ++fi) {
#if DECOMPRESSION_SCALE_POST_OP
((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX];
#else
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += in_val * ((ACCUMULATOR_TYPE*)(&wei))[W_IDX];
#endif
}
}
}
Expand Down Expand Up @@ -240,9 +234,20 @@ inline void (FUNC_NAME)(
ACCUMULATOR_TYPE ds = d_scales[fi % DECOMPRESSION_SCALE_LENGTH];
#endif
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi] * ds;
acc_tmp[bi][fi] = 0;
}
}
#endif

#if !DECOMPRESSION_SCALE_POST_OP
unroll_for (uint bi = 0; bi < FORCED_TILE_B; ++bi) {
unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
((ACCUMULATOR_TYPE*)(&acc[bi]))[fi] += ((ACCUMULATOR_TYPE*)(&acc_tmp[bi]))[fi];
}
}
#endif


}
// =====================================================================================================================================
// Leftovers
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4137,6 +4137,10 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input
this->test_compressed_int4_scale_dyn_quan(false, true, 511, true);
}

// Runs the fixture helper test_compressed_int4_scale_dyn_quan_weight_i4 with the
// arguments (true, 1, 2048, 3072).
// NOTE(review): going by the test name, the second argument is presumably the batch
// size (1) and the last two the input/output feature sizes — confirm against the
// helper's signature, which is not visible here.
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_batch_1) {
this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 1, 2048, 3072);
}

// Runs the fixture helper test_compressed_int4_scale_dyn_quan_weight_i4 with the
// arguments (true, 359, 1536, 2560).
// NOTE(review): the "edge case" is presumably the second argument (359) being a batch
// size that does not divide evenly into the kernel's batch tile — confirm against the
// helper's signature and the kernel's tiling parameters.
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_edge_case) {
this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 359, 1536, 2560);
}
Expand Down

0 comments on commit f93d051

Please sign in to comment.