Skip to content

Commit

Permalink
Clear unused lines
Browse files Browse the repository at this point in the history
Signed-off-by: Min, Byungil <[email protected]>
  • Loading branch information
byungilm committed Dec 4, 2024
1 parent 369dbf4 commit dd5b960
Show file tree
Hide file tree
Showing 2 changed files with 15 additions and 110 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,15 +20,8 @@
#define INPUT_LOAD_SIZE 4

#if FC_KERNEL_DYNAMIC_QUANTIZE
//#define VLOAD_N CAT(vload, VEC_SIZE)
//#define VSTORE_N CAT(vstore, VEC_SIZE)
//#define CONVERT_CHAR_N CAT(convert_char, VEC_SIZE)
//#define AS_TYPE_N_(type, n, x) as_##type##n(x)
//#define AS_TYPE_N(type, n, x) AS_TYPE_N_(type, n, x)
//#define AS_INPUT_TYPE_N(x) AS_TYPE_N(INPUT0_TYPE, VEC_SIZE, x)

KERNEL(quantize_input)(
// OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
__global DQ_TYPE* quantized_input,
__global float* quan_var
Expand All @@ -37,37 +30,21 @@ KERNEL(quantize_input)(

const uint input_offset = offset * QUANTIZE_GROUP_SIZE;
const uint quantize_block = QUANTIZE_GROUP_SIZE / 4;
// MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_0[quantize_block];
MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE) quantized_value;
// INPUT0_TYPE max[quantize_block];
INPUT0_TYPE max_value = 0.001f;
float max_value = 0.0001f;

MAKE_VECTOR_TYPE(INPUT0_TYPE, INPUT_LOAD_SIZE) input_buff;
for (uint i = 0 ; i < quantize_block ; ++i) {
// input_0[i] = vload4(0, &input[input_offset + i * 4]);
// max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), fmax(fabs(input_0[i][2]), fabs(input_0[i][3])));
input_buff = vload4(0, &input[input_offset + i * 4]);
INPUT0_TYPE max = fmax(fmax(fabs(input_buff[0]), fabs(input_buff[1])), fmax(fabs(input_buff[2]), fabs(input_buff[3])));
max_value = fmax(max, max_value);
}
// unroll_for (uint i = 0 ; i < quantize_block ; ++i) {
// input_0[i] = vload4(0, &input[input_offset + i * 4]);
// max[i] = fmax(fmax(fabs(input_0[i][0]), fabs(input_0[i][1])), fmax(fabs(input_0[i][2]), fabs(input_0[i][3])));
// }

// float max_value = 0.001f;
// for (uint i = 0 ; i < quantize_block ; i+=8) {
// INPUT0_TYPE temp = fmax(fmax(fmax(max[i], max[i+1]), fmax(max[i+2], max[i+3])),
// fmax(fmax(max[i+4], max[i+5]), fmax(max[i+6], max[i+7])));
// max_value = fmax((float)(temp), max_value);
// }

float quan_scale = (float)max_value / 127.f;

float quan_scale = max_value / 127.f;
#if COMPRESSED_WEIGHTS_INT8
int quantized_sum = 0;
#endif
for (uint i = 0 ; i < quantize_block ; ++i) {
// float4 buff = (convert_float4)(input_0[i]) / (float4)quan_scale;
float4 buff = (convert_float4)(vload4(0, &input[input_offset + i * 4])) / (float4)quan_scale;

quantized_value = CAT(CAT(convert_, MAKE_VECTOR_TYPE(DQ_TYPE, INPUT_LOAD_SIZE)), _rte)(buff);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -107,29 +107,20 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
}
}

const size_t scale_group_size = get_scale_group_size(params);
size_t scale_group_size = get_scale_group_size(params);
size_t zp_group_size = 0;
if (params.has_decompression_zp)
const size_t zp_group_size = params.weights.IFM().v / params.decompression_zero_point.Feature().v;

// Per-token dyn-quan
if (dynamic_quantization_group_size >= min_quantize_grp_size && is_per_token_dynamic_quantize(params) &&
scale_group_size != 0) {
// if (is_dyn_quan_8bit_asym(params)) {
// // Should calculate activation sum by scale_group_size for post-operation
// dynamic_quantization_group_size = scale_group_size;
// // printf("!!!! per token dyn-quan(%s) : scale_group_size(%u) input_f(%d) get_input_bf_size(params).second(%u)\n",
// // ((is_dyn_quan_8bit_asym(params) == true) ? "Y" : "N"),
// // scale_group_size, (int)get_input_bf_size(params).second, dynamic_quantization_group_size);
// } else {
// dynamic_quantization_group_size = get_input_bf_size(params).second;
// }

auto selected_size = scale_group_size;
// auto selected_size = scale_group_size / 2;
if ((scale_group_size % min_quantize_grp_size) == 0 && selected_size > min_quantize_grp_size) {
dynamic_quantization_group_size = selected_size;

// printf("!!!! per token dyn-quan(%s) : scale_group_size(%u) / input_f(%d) dynamic_quantization_group_size(%u)\n",
// ((is_per_token_dynamic_quantize(params) == true) ? "Y" : "N"),
// scale_group_size, (int)get_input_bf_size(params).second, dynamic_quantization_group_size);
if (dynamic_quantization_group_size >= min_quantize_grp_size && is_per_token_dynamic_quantize(params)) {
// Validate that the dyn-quan group size fits the group sizes of the weight scale and the weight zero-point
if ((scale_group_size % min_quantize_grp_size) == 0 && scale_group_size > min_quantize_grp_size) {
dynamic_quantization_group_size = scale_group_size;

GPU_DEBUG_TRACE_DETAIL << "FC dyn-quantize by per-token. Actual dyn_quan_group_size(" << dynamic_quantization_group_size
<< ") : From scale_group_size (" << scale_group_size << ", zp_group_size(" << zp_group_size
<< "), ifm_size (" << get_input_bf_size(params).second << ")" << std::endl;
return (size_t)dynamic_quantization_group_size;
}
}
Expand All @@ -148,15 +139,10 @@ static size_t get_dynamic_quantize_group_size(const fully_connected_params& para
dynamic_quantization_group_size = scale_group_size;
}

// printf("!!!! per token dyn-quan(N) : scale_group_size(%u) / input_f(%d) dynamic_quantization_group_size(%u)\n",
// scale_group_size, (int)get_input_bf_size(params).second, dynamic_quantization_group_size);
return (size_t)dynamic_quantization_group_size;
}
}

// printf("!!!! per token dyn-quan(N) : scale_group_size(%u) / input_f(%d) dynamic_quantization_group_size(%u)\n",
// scale_group_size, (int)get_input_bf_size(params).second, dynamic_quantization_group_size);

return 0;
}

Expand Down Expand Up @@ -734,27 +720,6 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
jit.AddConstant(MakeJitConstant("DQ_DECOMPRESSION_SCALE_POST_OP", 1));
jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
// jit.AddConstant(MakeJitConstant("VEC_SIZE", quantize_grp_size/16));

// [TEST]
// {
// auto vec_size = get_match_vector_size(params);
// auto bf_size = get_input_bf_size(params);
// auto total_block_num = bf_size.second / (simd * vec_size);
// size_t aligned_block_num = (total_block_num > 32) ? Align(total_block_num, 32) : total_block_num;
// size_t block_num = (total_block_num > 32) ? 32 : total_block_num;

// jit.AddConstant(MakeJitConstant("VEC_SIZE", vec_size));
// jit.AddConstant(MakeJitConstant("SIMD", simd));
// jit.AddConstant(MakeJitConstant("TOTAL_BLOCK_NUM", total_block_num));
// jit.AddConstant(MakeJitConstant("ALIGNED_BLOCK_NUM", aligned_block_num));
// jit.AddConstant(MakeJitConstant("BLOCK_NUM", block_num));
// jit.Merge(GetTensorFriendlyWorkGroupsJit(params.outputs[0]));
// }

// if (is_per_token_dynamic_quantize(params) && quantize_grp_size == get_input_bf_size(params).second) {
// jit.AddConstant(MakeJitConstant("PER_TOKEN_QUANTIZE_SIZE", 1));
// }
} else {
if (add_decompress_scale_post_op)
jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
Expand Down Expand Up @@ -894,12 +859,6 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
size_t input_size = input_f * dispatchData.tile_m * dispatchData.gws[2];
size_t quan_var_size = (input_size / quantize_grp_size) * 4 * 2;

// printf(">>>> Update-intr-buffer(%s) : input_b(%u) input_f(%u) input_size(%u) quan_group_size(%u) \
// GWS[0](%d) per-token-GWS(%d) NUM_LOOP_IN_DYN_QUAN_GROUP(%d)\n",
// (kd.internalBufferSizes[0] < input_size) ? "Y" : "N",
// get_input_bf_size(prim_params).first, input_f, input_size, quantize_grp_size,
// (int)(input_size / quantize_grp_size), (int)(input_size / input_f), (int)(quantize_grp_size / (dispatchData.tile_mk * simd)));

if (kd.internalBufferSizes[0] < input_size ||
kd.internalBufferSizes[1] < quan_var_size || true) {
kd.internalBufferSizes.clear();
Expand All @@ -911,17 +870,6 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {

kd.kernels[0].params.workGroups.global = {(std::max((input_size / quantize_grp_size), (size_t)1)), 1, 1};
kd.kernels[0].params.workGroups.local = {1, 1, 1};
// [TEST]
// {
// auto vec_size = get_match_vector_size(prim_params);
// auto bf_size = get_input_bf_size(prim_params);
// size_t total_block_num = bf_size.second / (simd * vec_size);
// size_t batch = get_input_bf_size(prim_params).first;
// size_t block_num = (total_block_num > 32) ? 32 : total_block_num;

// kd.kernels[0].params.workGroups.global = {simd, block_num, batch};
// kd.kernels[0].params.workGroups.local = {simd, block_num, 1};
// }
}
}
};
Expand Down Expand Up @@ -1097,18 +1045,6 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
dyn_quan_dispatch.gws = {(input_size / quantize_grp_size), 1, 1};
dyn_quan_dispatch.lws = {1, 1, 1};

// [TEST]
// {
// auto vec_size = get_match_vector_size(fc_params);
// auto bf_size = get_input_bf_size(fc_params);
// size_t total_block_num = bf_size.second / (simd * vec_size);
// size_t batch = get_input_bf_size(fc_params).first;
// size_t block_num = (total_block_num > 32) ? 32 : total_block_num;

// dyn_quan_dispatch.gws = {simd, block_num, batch};
// dyn_quan_dispatch.lws = {simd, block_num, 1};
// }

quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws;
quan_kernel.params.workGroups.local = dyn_quan_dispatch.lws;
quan_kernel.skip_execution = false;
Expand All @@ -1133,7 +1069,6 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
fc_params.is_shape_agnostic);

quan_kernel.params.arguments.clear(); // Clear original output argument
// quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::SHAPE_INFO, 0});
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
Expand All @@ -1142,13 +1077,6 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
// Two float values per quantized group: de_quan_scale and the activation sum
kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 4 * 2);
kernel_number++;

// printf(">>>> Set-buffer : input_b(%u) input_f(%u) input_size(%u) quan_group_size(%u) \
// GWS[0](%d) per-token-GWS(%d) NUM_LOOP_IN_DYN_QUAN_GROUP(%d)\n",
// get_input_bf_size(fc_params).first, get_input_bf_size(fc_params).second, input_size, quantize_grp_size,
// (int)(quan_kernel.params.workGroups.global[0]), (int)(input_size / get_input_bf_size(fc_params).second),
// (int)(quantize_grp_size / (dispatchData.tile_mk * simd)));

}
// kd.internalBufferDataType = Datatype::F16;
kd.internalBufferDataType = Datatype::F32;
Expand Down

0 comments on commit dd5b960

Please sign in to comment.