diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl index 24f71f2b936928..83866b88c66225 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl +++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl @@ -31,7 +31,7 @@ KERNEL(quantize_input)( const uint input_offset = offset * QUANTIZE_GROUP_SIZE; const uint quantize_block = QUANTIZE_GROUP_SIZE / 4; MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE) quantized_value; - float max_value = 0.0001f; + INPUT0_TYPE max_value = 0.0001h; MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_buff; for (uint i = 0 ; i < quantize_block ; ++i) { @@ -40,7 +40,7 @@ KERNEL(quantize_input)( max_value = fmax(max, max_value); } - float quan_scale = max_value / 127.f; + float quan_scale = (float)max_value / 127.f; #if COMPRESSED_WEIGHTS_INT8 int quantized_sum = 0; #endif diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp index 46d9ea818ee78b..ee2fc68c710d76 100644 --- a/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp +++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp @@ -108,6 +108,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para } size_t scale_group_size = get_scale_group_size(params); + size_t zp_group_num = params.decompression_zero_point.Feature().v; size_t zp_group_size = 0; if (params.has_decompression_zp) const size_t zp_group_size = params.weights.IFM().v / params.decompression_zero_point.Feature().v; @@ -118,9 +119,14 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para if ((scale_group_size % min_quantize_grp_size) == 0 && scale_group_size > min_quantize_grp_size) { dynamic_quantization_group_size = scale_group_size; + if (is_dyn_quan_8bit_asym(params) && params.has_decompression_zp && + dynamic_quantization_group_size < zp_group_size && (zp_group_size % min_quantize_grp_size) == 0) { + dynamic_quantization_group_size = zp_group_size; + } + GPU_DEBUG_TRACE_DETAIL << "FC dyn-quantize by per-token. Actual dyn_quan_group_size(" << dynamic_quantization_group_size << ") : From scale_group_size (" << scale_group_size << ", zp_group_size(" << zp_group_size - << "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl; + << "), zp_group_num(" << zp_group_num << "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl; return (size_t)dynamic_quantization_group_size; } }