Skip to content

Commit

Permalink
Improve performance and fix weight zero-point (zp) handling
Browse files (browse the repository at this point in the history)
Signed-off-by: Min, Byungil <[email protected]>
  • Loading branch information
byungilm committed Dec 4, 2024
1 parent dd5b960 commit 2812544
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 3 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ KERNEL(quantize_input)(
const uint input_offset = offset * QUANTIZE_GROUP_SIZE;
const uint quantize_block = QUANTIZE_GROUP_SIZE / 4;
MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE) quantized_value;
float max_value = 0.0001f;
INPUT0_TYPE max_value = 0.0001h;

MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_buff;
for (uint i = 0 ; i < quantize_block ; ++i) {
Expand All @@ -40,7 +40,7 @@ KERNEL(quantize_input)(
max_value = fmax(max, max_value);
}

float quan_scale = max_value / 127.f;
float quan_scale = (float)max_value / 127.f;
#if COMPRESSED_WEIGHTS_INT8
int quantized_sum = 0;
#endif
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
}

size_t scale_group_size = get_scale_group_size(params);
size_t zp_group_num = params.decompression_zero_point.Feature().v;
size_t zp_group_size = 0;
if (params.has_decompression_zp)
const size_t zp_group_size = params.weights.IFM().v / params.decompression_zero_point.Feature().v;
Expand All @@ -118,9 +119,14 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
if ((scale_group_size % min_quantize_grp_size) == 0 && scale_group_size > min_quantize_grp_size) {
dynamic_quantization_group_size = scale_group_size;

if (is_dyn_quan_8bit_asym(params) && params.has_decompression_zp &&
dynamic_quantization_group_size < zp_group_size && (zp_group_size % min_quantize_grp_size) == 0) {
dynamic_quantization_group_size = zp_group_size;
}

GPU_DEBUG_TRACE_DETAIL << "FC dyn-quantize by per-token. Actual dyn_quan_group_size(" << dynamic_quantization_group_size
<< ") : From scale_group_size (" << scale_group_size << ", zp_group_size(" << zp_group_size
<< "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl;
<< "), zp_group_num(" << zp_group_num << "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl;
return (size_t)dynamic_quantization_group_size;
}
}
Expand Down

0 comments on commit 2812544

Please sign in to comment.