Improve performance and fix weight zero-point (zp) handling

Signed-off-by: Min, Byungil <[email protected]>
openvinotoolkit · Dec 4, 2024 · 2812544 · 2812544
1 parent dd5b960
commit 2812544
Show file tree

Hide file tree

Showing 2 changed files with 9 additions and 3 deletions.
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -31,7 +31,7 @@ KERNEL(quantize_input)(
     const uint input_offset = offset * QUANTIZE_GROUP_SIZE;
     const uint quantize_block = QUANTIZE_GROUP_SIZE / 4;
     MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE) quantized_value;
-    float max_value = 0.0001f;
+    INPUT0_TYPE max_value = 0.0001h;
 
     MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_buff;
     for (uint i = 0 ; i < quantize_block ; ++i) {
@@ -40,7 +40,7 @@ KERNEL(quantize_input)(
         max_value = fmax(max, max_value);
     }
 
-    float quan_scale = max_value / 127.f;
+    float quan_scale = (float)max_value / 127.f;
     #if COMPRESSED_WEIGHTS_INT8
         int quantized_sum = 0;
     #endif

diff --git a/...intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/...intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -108,6 +108,7 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
     }
 
     size_t scale_group_size = get_scale_group_size(params);
+    size_t zp_group_num = params.decompression_zero_point.Feature().v;
     size_t zp_group_size = 0;
     if (params.has_decompression_zp)
         const size_t zp_group_size = params.weights.IFM().v / params.decompression_zero_point.Feature().v;
@@ -118,9 +119,14 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
         if ((scale_group_size % min_quantize_grp_size) == 0 && scale_group_size > min_quantize_grp_size) {
             dynamic_quantization_group_size = scale_group_size;
 
+            if (is_dyn_quan_8bit_asym(params) && params.has_decompression_zp &&
+                dynamic_quantization_group_size < zp_group_size && (zp_group_size % min_quantize_grp_size) == 0) {
+                dynamic_quantization_group_size = zp_group_size;
+            }
+
             GPU_DEBUG_TRACE_DETAIL << "FC dyn-quantize by per-token. Actual dyn_quan_group_size(" << dynamic_quantization_group_size
                                     << ") : From scale_group_size (" << scale_group_size << ", zp_group_size("  << zp_group_size
-                                    <<  "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl;
+                                    << "), zp_group_num(" << zp_group_num << "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl;
             return (size_t)dynamic_quantization_group_size;
         }
     }