Clear unused lines

Signed-off-by: Min, Byungil <[email protected]>
openvinotoolkit · Dec 4, 2024 · dd5b960 · dd5b960
1 parent 369dbf4
commit dd5b960
Show file tree

Hide file tree

Showing 2 changed files with 15 additions and 110 deletions.
diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl
@@ -20,15 +20,8 @@
 #define INPUT_LOAD_SIZE                     4
 
 #if FC_KERNEL_DYNAMIC_QUANTIZE
-//#define VLOAD_N CAT(vload, VEC_SIZE)
-//#define VSTORE_N CAT(vstore, VEC_SIZE)
-//#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE)
-//#define AS_TYPE_N_(type, n, x) as_##type##n(x)
-//#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
-//#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x)
 
 KERNEL(quantize_input)(
-    // OPTIONAL_SHAPE_INFO_ARG
     const __global INPUT0_TYPE* input,
     __global DQ_TYPE* quantized_input,
     __global float* quan_var
@@ -37,37 +30,21 @@ KERNEL(quantize_input)(
 
     const uint input_offset = offset * QUANTIZE_GROUP_SIZE;
     const uint quantize_block = QUANTIZE_GROUP_SIZE / 4;
-    // MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_0[quantize_block];
     MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE) quantized_value;
-    // INPUT0_TYPE  max[quantize_block];
-    INPUT0_TYPE max_value = 0.001f;
+    float max_value = 0.0001f;
 
     MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_buff;
     for (uint i = 0 ; i < quantize_block ; ++i) {
-        // input_0[i] = vload4(0, &input[input_offset + i * 4]);
-        // max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), fmax(fabs(input_0[i][2]), fabs(input_0[i][3])));
         input_buff = vload4(0, &input[input_offset + i * 4]);
         INPUT0_TYPE max = fmax(fmax(fabs(input_buff[0]), fabs(input_buff[1])), fmax(fabs(input_buff[2]), fabs(input_buff[3])));
         max_value = fmax(max, max_value);
     }
-    // unroll_for (uint i = 0 ; i < quantize_block ; ++i) {
-    //     input_0[i] = vload4(0, &input[input_offset + i * 4]);
-    //     max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), fmax(fabs(input_0[i][2]), fabs(input_0[i][3])));
-    // }
-
-    // float max_value = 0.001f;
-    // for (uint i = 0 ; i < quantize_block ; i+=8) {
-    //     INPUT0_TYPE temp = fmax(fmax(fmax(max[i], max[i+1]), fmax(max[i+2], max[i+3])),
-    //                             fmax(fmax(max[i+4], max[i+5]), fmax(max[i+6], max[i+7])));
-    //     max_value = fmax((float)(temp), max_value);
-    // }
-
-    float quan_scale = (float)max_value / 127.f;
+
+    float quan_scale = max_value / 127.f;
     #if COMPRESSED_WEIGHTS_INT8
         int quantized_sum = 0;
     #endif
     for (uint i = 0 ; i < quantize_block ; ++i) {
-        // float4 buff = (convert_float4)(input_0[i]) / (float4)quan_scale;
         float4 buff = (convert_float4)(vload4(0, &input[input_offset + i * 4])) / (float4)quan_scale;
 
         quantized_value = CAT(CAT(convert_, MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE)), _rte)(buff);

diff --git a/...intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp b/...intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp
@@ -107,29 +107,20 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
         }
     }
 
-    const size_t scale_group_size = get_scale_group_size(params);
+    size_t scale_group_size = get_scale_group_size(params);
+    size_t zp_group_size = 0;  // stays 0 when the weights carry no decompression zero-point
+    if (params.has_decompression_zp)  // assign the outer variable: re-declaring it here would shadow it and leave it 0
+        zp_group_size = params.weights.IFM().v / params.decompression_zero_point.Feature().v;
 
     // Per-token dyn-quan
-    if (dynamic_quantization_group_size >= min_quantize_grp_size && is_per_token_dynamic_quantize(params) &&
-        scale_group_size != 0) {
-        // if (is_dyn_quan_8bit_asym(params)) {
-        //     // Should calculate activation sum by scale_group_size for post-operation
-        //     dynamic_quantization_group_size = scale_group_size;
-        //     // printf("!!!! per token dyn-quan(%s) : scale_group_size(%u) input_f(%d) get_input_bf_size(params).second(%u)\n",
-        //     //         ((is_dyn_quan_8bit_asym(params) == true) ? "Y" : "N"),
-        //     //         scale_group_size, (int)get_input_bf_size(params).second, dynamic_quantization_group_size);
-        // } else {
-        //     dynamic_quantization_group_size = get_input_bf_size(params).second;
-        // }
-
-        auto selected_size = scale_group_size;
-        // auto selected_size = scale_group_size / 2;
-        if ((scale_group_size % min_quantize_grp_size) == 0 && selected_size > min_quantize_grp_size) {
-            dynamic_quantization_group_size = selected_size;
-
-            // printf("!!!! per token dyn-quan(%s) : scale_group_size(%u) / input_f(%d) dynamic_quantization_group_size(%u)\n",
-            //     ((is_per_token_dynamic_quantize(params) == true) ? "Y" : "N"),
-            //     scale_group_size, (int)get_input_bf_size(params).second, dynamic_quantization_group_size);
+    if (dynamic_quantization_group_size >= min_quantize_grp_size && is_per_token_dynamic_quantize(params)) {
+        // Validate size to fit dyn-quan group to the size of weight-scale and weight-zp
+        if ((scale_group_size % min_quantize_grp_size) == 0 && scale_group_size > min_quantize_grp_size) {
+            dynamic_quantization_group_size = scale_group_size;
+
+            GPU_DEBUG_TRACE_DETAIL << "FC dyn-quantize by per-token. Actual dyn_quan_group_size(" << dynamic_quantization_group_size
+                                    << ") : From scale_group_size(" << scale_group_size << "), zp_group_size(" << zp_group_size
+                                    << "), ifm_size(" << get_input_bf_size(params).second << ")" << std::endl;  // each value in its own balanced (...)
             return (size_t)dynamic_quantization_group_size;
         }
     }
@@ -148,15 +139,10 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
                 dynamic_quantization_group_size = scale_group_size;
             }
 
-            // printf("!!!! per token dyn-quan(N) : scale_group_size(%u) / input_f(%d) dynamic_quantization_group_size(%u)\n",
-            //         scale_group_size, (int)get_input_bf_size(params).second, dynamic_quantization_group_size);
             return (size_t)dynamic_quantization_group_size;
         }
     }
 
-    // printf("!!!! per token dyn-quan(N) : scale_group_size(%u) / input_f(%d) dynamic_quantization_group_size(%u)\n",
-    //         scale_group_size, (int)get_input_bf_size(params).second, dynamic_quantization_group_size);
-
     return 0;
 }
 
@@ -734,27 +720,6 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
         jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
-        // jit.AddConstant(MakeJitConstant("VEC_SIZE", quantize_grp_size/16));
-
-        // [TEST]
-        // {
-        //     auto vec_size = get_match_vector_size(params);
-        //     auto bf_size = get_input_bf_size(params);
-        //     auto total_block_num = bf_size.second / (simd * vec_size);
-        //     size_t aligned_block_num = (total_block_num > 32) ? Align(total_block_num, 32) : total_block_num;
-        //     size_t block_num = (total_block_num > 32) ? 32 : total_block_num;
-
-        //     jit.AddConstant(MakeJitConstant("VEC_SIZE", vec_size));
-        //     jit.AddConstant(MakeJitConstant("SIMD", simd));
-        //     jit.AddConstant(MakeJitConstant("TOTAL_BLOCK_NUM", total_block_num));
-        //     jit.AddConstant(MakeJitConstant("ALIGNED_BLOCK_NUM", aligned_block_num));
-        //     jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num));
-        //     jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0]));
-        // }
-
-        // if (is_per_token_dynamic_quantize(params) && quantize_grp_size == get_input_bf_size(params).second) {
-        //     jit.AddConstant(MakeJitConstant("PER_TOKEN_QUANTIZE_SIZE", 1));
-        // }
     } else {
         if (add_decompress_scale_post_op)
             jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
@@ -894,12 +859,6 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
                     size_t input_size = input_f * dispatchData.tile_m * dispatchData.gws[2];
                     size_t quan_var_size = (input_size / quantize_grp_size) * 4 * 2;
 
-                    // printf(">>>> Update-intr-buffer(%s) : input_b(%u) input_f(%u) input_size(%u) quan_group_size(%u) \
-                    //         GWS[0](%d) per-token-GWS(%d) NUM_LOOP_IN_DYN_QUAN_GROUP(%d)\n",
-                    //         (kd.internalBufferSizes[0] < input_size) ? "Y" : "N",
-                    //         get_input_bf_size(prim_params).first, input_f, input_size, quantize_grp_size,
-                    //         (int)(input_size / quantize_grp_size), (int)(input_size / input_f), (int)(quantize_grp_size / (dispatchData.tile_mk * simd)));
-
                     if (kd.internalBufferSizes[0] < input_size ||
                         kd.internalBufferSizes[1] < quan_var_size || true) {
                         kd.internalBufferSizes.clear();
@@ -911,17 +870,6 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
 
                     kd.kernels[0].params.workGroups.global = {(std::max((input_size / quantize_grp_size), (size_t)1)), 1, 1};
                     kd.kernels[0].params.workGroups.local = {1, 1, 1};
-                    // [TEST]
-                    // {
-                    //     auto vec_size = get_match_vector_size(prim_params);
-                    //     auto bf_size = get_input_bf_size(prim_params);
-                    //     size_t total_block_num = bf_size.second / (simd * vec_size);
-                    //     size_t batch = get_input_bf_size(prim_params).first;
-                    //     size_t block_num = (total_block_num > 32) ? 32 : total_block_num;
-
-                    //     kd.kernels[0].params.workGroups.global = {simd, block_num, batch};
-                    //     kd.kernels[0].params.workGroups.local = {simd, block_num, 1};
-                    // }
                 }
             }
         };
@@ -1097,18 +1045,6 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
         dyn_quan_dispatch.gws = {(input_size / quantize_grp_size), 1, 1};
         dyn_quan_dispatch.lws = {1, 1, 1};
 
-        // [TEST]
-        // {
-        //     auto vec_size = get_match_vector_size(fc_params);
-        //     auto bf_size = get_input_bf_size(fc_params);
-        //     size_t total_block_num = bf_size.second / (simd * vec_size);
-        //     size_t batch = get_input_bf_size(fc_params).first;
-        //     size_t block_num = (total_block_num > 32) ? 32 : total_block_num;
-
-        //     dyn_quan_dispatch.gws = {simd, block_num, batch};
-        //     dyn_quan_dispatch.lws = {simd, block_num, 1};
-        // }
-
         quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws;
         quan_kernel.params.workGroups.local = dyn_quan_dispatch.lws;
         quan_kernel.skip_execution = false;
@@ -1133,7 +1069,6 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
                         fc_params.is_shape_agnostic);
 
         quan_kernel.params.arguments.clear();  // Clear original output argument
-        // quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0});
         quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
         quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
         quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
@@ -1142,13 +1077,6 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
         // float type of de_quan_scale and activation sum for each quantized group
         kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 4 * 2);
         kernel_number++;
-
-        // printf(">>>> Set-buffer : input_b(%u) input_f(%u) input_size(%u) quan_group_size(%u) \
-        //         GWS[0](%d) per-token-GWS(%d) NUM_LOOP_IN_DYN_QUAN_GROUP(%d)\n",
-        //         get_input_bf_size(fc_params).first, get_input_bf_size(fc_params).second, input_size, quantize_grp_size,
-        //         (int)(quan_kernel.params.workGroups.global[0]), (int)(input_size / get_input_bf_size(fc_params).second),
-        //         (int)(quantize_grp_size / (dispatchData.tile_mk * simd)));
-
     }
     // kd.internalBufferDataType = Datatype::F16;
     kd.internalBufferDataType = Datatype::F32;