From c64444969ea80f25a33f42a217527c08c599736f Mon Sep 17 00:00:00 2001
From: fishbell
Date: Mon, 13 Jan 2025 14:01:05 +0800
Subject: [PATCH] initial trial for perf evaluation

Signed-off-by: fishbell
---
 .../quantize_gpu_scale_shift_vload8_opt.cl    | 142 +++++++++++++++
 ...quantize_kernel_scale_shift_vload8_opt.cpp | 168 ++++++++++++++++++
 .../quantize_kernel_scale_shift_vload8_opt.h  |  25 +++
 3 files changed, 335 insertions(+)
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
 create mode 100644 src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.h

diff --git a/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
new file mode 100644
index 00000000000000..efed489fbcf736
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl
@@ -0,0 +1,142 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "include/batch_headers/fetch_data.cl"
+
+#define TO_OUTPUT_TYPE CAT(convert_, OUTPUT_TYPE)
+#define INPUT0_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
+#define INPUT1_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, 8)
+#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)
+
+#define TO_VECTOR_TYPE_IMPL_8(elem_type) CAT(convert_##elem_type, 8)
+#define TO_VECTOR_TYPE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_, size)(elem_type)
+
+#define TO_VECTOR_TYPE_IMPL_SAT_8(elem_type) CAT(convert_##elem_type, 8##_sat)
+#define TO_VECTOR_TYPE_IMPL_SAT_RTE_8(elem_type) CAT(convert_##elem_type, 8##_sat_rte)
+#define TO_VECTOR_TYPE_SAT(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_, size)(elem_type)
+#define TO_VECTOR_TYPE_SAT_RTE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_RTE_, size)(elem_type)
+#define VLOAD_DECLS vload8(global_id, input)
+
+#ifdef SUB_GROUP_SIZE
+REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
+#endif
+#ifndef IS_DYNAMIC
+__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2)))
+#endif
+KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
+    const __global INPUT0_TYPE* input,
+    const __global INPUT1_TYPE* input_low,
+    const __global INPUT2_TYPE* input_high,
+    const __global INPUT3_TYPE* output_low,
+    const __global INPUT4_TYPE* output_high,
+    const __global INPUT5_TYPE* input_scale,
+    const __global INPUT6_TYPE* input_shift,
+    const __global INPUT7_TYPE* output_scale,
+    const __global INPUT8_TYPE* output_shift,
+    __global OUTPUT_TYPE* output)
+{
+    const int global_id = get_global_id(0);
+
+    const INPUT0_VEC_TYPE in0 = VLOAD_DECLS;
+
+    OUTPUT_VEC_TYPE res;
+
+    INPUT1_TYPE input_scale_val = IN_SCALE_VAL;
+    INPUT1_TYPE input_shift_val = IN_SHIFT_VAL;
+    INPUT1_TYPE output_scale_val = OUT_SCALE_VAL;
+    INPUT1_TYPE output_shift_val = OUT_SHIFT_VAL;
+
+#if HAS_CLAMP
+#if CAN_USE_OUTPUT_RANGE
+    INPUT1_TYPE output_low_val = OUT_LO_VAL;
+    INPUT1_TYPE output_high_val = OUT_HI_VAL;
+#else
+    INPUT1_TYPE input_low_val = IN_LO_VAL;
+    INPUT1_TYPE input_high_val = IN_HI_VAL;
+#endif // CAN_USE_OUTPUT_RANGE
+#endif // HAS_CLAMP
+
+// ************************************************************* //
+// Calculations for optimized branch with the output range usage //
+// ************************************************************* //
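+
+// Both branches below compute the same per-element fake-quantize transform:
+//     out = clamp(round(in * IN_SCALE_VAL + IN_SHIFT_VAL) * OUT_SCALE_VAL + OUT_SHIFT_VAL,
+//                 OUT_LO_VAL, OUT_HI_VAL)
+// with every *_VAL constant baked in at JIT time, since Validate() only admits
+// per-tensor quantization params. (For integral outputs, rounding may instead
+// happen in the final saturating convert.) Worked example with illustrative
+// values only: IN_SCALE_VAL = 0.5, IN_SHIFT_VAL = 0, OUT_SCALE_VAL = 1,
+// OUT_SHIFT_VAL = 0 and an int8 range [-128, 127] map an input of 9.3f to
+// round(4.65) = 5.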
+#if CAN_USE_OUTPUT_RANGE
+
+#if HAS_PRE_SHIFT
+    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val + input_shift_val;
+#else
+    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val;
+#endif
+
+#if HAS_OUTPUT_RANGE_ROUND
+    val = round(val);
+#endif
+
+#if HAS_POST_SCALE
+    val *= output_scale_val;
+#endif
+
+#if HAS_POST_SHIFT
+    val += output_shift_val;
+#endif
+
+#if HAS_CLAMP
+#if HAS_MIN_CLAMP && HAS_MAX_CLAMP
+    val = clamp(val, output_low_val, output_high_val);
+#elif HAS_MIN_CLAMP
+    val = max(val, output_low_val);
+#else // HAS_MAX_CLAMP
+    val = min(val, output_high_val);
+#endif
+#endif // HAS_CLAMP
+
+// ************************************************************** //
+// Calculations for alternative branch with the input range usage //
+// ************************************************************** //
+
+#else // CAN_USE_OUTPUT_RANGE
+
+#if HAS_CLAMP
+    INPUT1_VEC_TYPE val = clamp(TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0), input_low_val, input_high_val);
+#else
+    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0);
+#endif
+
+#if HAS_PRE_SHIFT
+    val = round(val * input_scale_val + input_shift_val);
+#else
+    val = round(val * input_scale_val);
+#endif
+
+#if HAS_POST_SCALE
+    val *= output_scale_val;
+#endif
+
+#if HAS_POST_SHIFT
+    val += output_shift_val;
+#endif
+
+#endif // CAN_USE_OUTPUT_RANGE
+
+// *********************************** //
+// Common section with results writing //
+// *********************************** //
+
+    // Feature-aligned blocked layouts are guaranteed by Validate(), so no
+    // per-feature tail guard is needed before the conversion and store below.
+#if OUTPUT_IS_FP
+    res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
+#else
+    res = TO_VECTOR_TYPE_SAT_RTE(OUTPUT_TYPE, 8)(val);
+#endif
+
+    vstore8(res, global_id, output);
+}
+
+#undef TO_OUTPUT_TYPE
+#undef INPUT0_VEC_TYPE
+#undef INPUT1_VEC_TYPE
+#undef OUTPUT_VEC_TYPE
+#undef TO_VECTOR_TYPE_IMPL_8
+#undef TO_VECTOR_TYPE
+#undef TO_VECTOR_TYPE_IMPL_SAT_8
+#undef TO_VECTOR_TYPE_IMPL_SAT_RTE_8
+#undef TO_VECTOR_TYPE_SAT
+#undef TO_VECTOR_TYPE_SAT_RTE
+#undef VLOAD_DECLS
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
new file mode 100644
index 00000000000000..a99ec9f8b05069
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp
@@ -0,0 +1,168 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#include "quantize_kernel_scale_shift_vload8_opt.h"
+
+#include <algorithm>
+#include <cassert>
+
+#include "kernel_selector_utils.h"
+
+static const size_t sub_group_size = 32;
+
+namespace kernel_selector {
+ParamsKey QuantizeKernelScaleShift_vload8::GetSupportedKey() const {
+    ParamsKey k;
+    k.EnableInputDataType(Datatype::UINT8);
+    k.EnableInputDataType(Datatype::INT8);
+    k.EnableInputDataType(Datatype::F16);
+    k.EnableInputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::F16);
+    k.EnableOutputDataType(Datatype::F32);
+    k.EnableOutputDataType(Datatype::UINT8);
+    k.EnableOutputDataType(Datatype::INT8);
+    k.EnableAllInputLayout();
+    k.EnableAllOutputLayout();
+    k.EnableTensorOffset();
+    k.EnableTensorPitches();
+    k.EnableBatching();
+    k.EnableDifferentTypes();
+    k.EnableQuantizeScaleShiftOpt();
+    k.EnableDynamicShapesSupport();
+    return k;
+}
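+
+// Dispatch note: each work-item consumes 8 elements through vload8/vstore8, so the
+// global work size below is LogicalSize() / 8. For example, a static 1x32x56x56
+// output (100352 elements) dispatches 12544 work-items. This assumes the flattened
+// size is a multiple of 8, which Validate() enforces for static shapes.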
+
+CommonDispatchData QuantizeKernelScaleShift_vload8::SetDefault(const quantize_params& params) const {
+    CommonDispatchData dispatchData;
+    // TODO: blocked formats may need dedicated dispatch handling; for now every
+    // layout is dispatched as a flat array of 8-element vectors.
+    dispatchData.gws[0] = std::max(params.outputs[0].LogicalSize() / 8, (size_t)1);
+    dispatchData.gws[1] = 1;
+    dispatchData.gws[2] = 1;
+    dispatchData.lws = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]},
+                                                     params.engineInfo);
+    return dispatchData;
+}
+
+JitConstants QuantizeKernelScaleShift_vload8::GetJitConstants(const quantize_params& params,
+                                                              const CommonDispatchData& dispatchData) const {
+    JitConstants jit = Parent::GetJitConstants(params, dispatchData);
+
+    if (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
+        params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 ||
+        params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32 ||
+        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
+        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv32 ||
+        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 ||
+        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 ||
+        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16 ||
+        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv32 ||
+        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv16 ||
+        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv32) {
+        jit.AddConstant(MakeJitConstant("FEATURE_BLOCKED_FORMAT", true));
+        jit.AddConstant(MakeJitConstant("GWS_BATCH", 2));
+        jit.AddConstant(MakeJitConstant("GWS_FEATURE", 1));
+        jit.AddConstant(MakeJitConstant("GWS_YX", 0));
+        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
+    } else {
+        auto tensor_jits = GetTensorFriendlyWorkGroupsJit(params.outputs[0]);
+        jit.Merge(tensor_jits);
+    }
+
+    auto can_use_output_range = params.per_tensor_output_range && params.out_lo < params.out_hi;
+    auto has_output_range_round =
+        !(params.outputs[0].GetDType() == Datatype::INT8 || params.outputs[0].GetDType() == Datatype::UINT8);
+
+    jit.AddConstant(MakeJitConstant("HAS_POST_SCALE", params.has_post_scale));
+    jit.AddConstant(MakeJitConstant("HAS_POST_SHIFT", params.has_post_shift));
+    jit.AddConstant(MakeJitConstant("HAS_PRE_SHIFT", params.has_pre_shift));
+    jit.AddConstant(MakeJitConstant("HAS_CLAMP", params.has_clamp));
+    jit.AddConstant(MakeJitConstant("HAS_MIN_CLAMP", params.has_min_clamp));
+    jit.AddConstant(MakeJitConstant("HAS_MAX_CLAMP", params.has_max_clamp));
+    jit.AddConstant(MakeJitConstant("IN_LO_VAL", params.in_lo));
+    jit.AddConstant(MakeJitConstant("IN_HI_VAL", params.in_hi));
+    jit.AddConstant(MakeJitConstant("OUT_LO_VAL", params.out_lo));
+    jit.AddConstant(MakeJitConstant("OUT_HI_VAL", params.out_hi));
+    jit.AddConstant(MakeJitConstant("IN_SCALE_VAL", params.in_scale));
+    jit.AddConstant(MakeJitConstant("IN_SHIFT_VAL", params.in_shift));
+    jit.AddConstant(MakeJitConstant("OUT_SCALE_VAL", params.out_scale));
+    jit.AddConstant(MakeJitConstant("OUT_SHIFT_VAL", params.out_shift));
+    jit.AddConstant(MakeJitConstant("CAN_USE_OUTPUT_RANGE", can_use_output_range));
+    jit.AddConstant(MakeJitConstant("HAS_OUTPUT_RANGE_ROUND", has_output_range_round));
+
+    return jit;
+}
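+
+// CAN_USE_OUTPUT_RANGE selects the kernel branch that clamps directly to
+// [OUT_LO_VAL, OUT_HI_VAL] instead of pre-clamping the input to [IN_LO_VAL, IN_HI_VAL].
+// HAS_OUTPUT_RANGE_ROUND is disabled for (U)INT8 outputs because the kernel's final
+// convert_<type>8_sat_rte conversion already rounds (ties-to-even) and saturates.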
+
+bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
+    const quantize_params& params = static_cast<const quantize_params&>(p);
+    if (params.inputs.size() != 9)
+        return false;
+
+    // This kernel is optimized for per-tensor quantization params only, for now.
+    if (!params.per_tensor_input_range || !params.per_tensor_output_range || !params.per_tensor_input_scale ||
+        !params.per_tensor_output_scale || !params.per_tensor_output_shift ||
+        (params.has_pre_shift && !params.per_tensor_input_shift))
+        return false;
+
+    // vload8/vstore8 process 8 elements per work-item, so a static output whose
+    // flattened size is not a multiple of 8 would leave a tail unprocessed.
+    if (!params.has_dynamic_tensors() && params.outputs[0].LogicalSize() % 8 != 0)
+        return false;
+
+    // TBD: do we really need this strict block-size checking to support blocked formats?
+    for (size_t i = 0; i < params.inputs.size(); i++) {
+        const auto input_layout = params.inputs[i].GetLayout();
+        const auto batch_size = params.inputs[i].Batch().v;
+        const auto feature_size = params.inputs[i].Feature().v;
+        if ((input_layout == DataLayout::b_fs_yx_fsv16 && feature_size % 16 != 0) ||
+            (input_layout == DataLayout::b_fs_yx_fsv32 && feature_size % 32 != 0) ||
+            (input_layout == DataLayout::b_fs_zyx_fsv16 && feature_size % 16 != 0) ||
+            (input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
+            input_layout == DataLayout::fs_b_yx_fsv32 ||
+            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 && (feature_size % 16 != 0 || batch_size % 32 != 0)) ||
+            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 && (feature_size % 32 != 0 || batch_size % 32 != 0)))
+            return false;
+    }
+    if ((params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
+        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 && params.outputs[0].Feature().v % 32 != 0) ||
+        (params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
+        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4 && params.outputs[0].Feature().v % 8 != 0) ||
+        params.outputs[0].GetLayout() == DataLayout::fs_b_yx_fsv32 ||
+        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
+         (params.outputs[0].Feature().v % 16 != 0 || params.outputs[0].Batch().v % 32 != 0)) ||
+        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 &&
+         (params.outputs[0].Feature().v % 32 != 0 || params.outputs[0].Batch().v % 32 != 0)))
+        return false;
+    // TBD: maybe stricter checks are needed?
+    return true;
+}
+
+KernelsData QuantizeKernelScaleShift_vload8::GetKernelsData(const Params& params) const {
+    assert(params.GetType() == KernelType::QUANTIZE);
+
+    KernelData kd = KernelData::Default<quantize_params>(params);
+    quantize_params& nparams = *static_cast<quantize_params*>(kd.params.get());
+
+    if (!Validate(params)) {
+        return {};
+    }
+
+    auto dispatchData = SetDefault(nparams);
+    auto entry_point = GetEntryPoint(kernelName, nparams.layerID, params);
+    auto cldnn_jit = GetJitConstants(nparams, dispatchData);
+    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);
+
+    GetUpdateDispatchDataFunc(kd);
+
+    auto& kernel = kd.kernels[0];
+
+    kernel.params.workGroups.global = dispatchData.gws;
+    kernel.params.workGroups.local = dispatchData.lws;
+    kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT);
+    kernel.params.arguments =
+        GetArgsDesc(static_cast<uint32_t>(nparams.inputs.size()), false, false, 0, 1, nparams.has_dynamic_tensors());
+
+    return {kd};
+}
+
+KernelsPriority QuantizeKernelScaleShift_vload8::GetKernelsPriority(const Params& /*params*/) const {
+    return FORCE_PRIORITY_8;
+}
+}  // namespace kernel_selector
diff --git a/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.h b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.h
new file mode 100644
index 00000000000000..37a4e569bb1810
--- /dev/null
+++ b/src/plugins/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.h
@@ -0,0 +1,25 @@
+// Copyright (C) 2018-2024 Intel Corporation
+// SPDX-License-Identifier: Apache-2.0
+//
+
+#pragma once
+
+#include "quantize_kernel_base.h"
+
+namespace kernel_selector {
+
+class QuantizeKernelScaleShift_vload8 : public QuantizeKernelBase {
+public:
+    using Parent = QuantizeKernelBase;
+
+    QuantizeKernelScaleShift_vload8() : QuantizeKernelBase("quantize_gpu_scale_shift_vload8_opt") {}
+    virtual ~QuantizeKernelScaleShift_vload8() {}
+    CommonDispatchData SetDefault(const quantize_params& params) const override;
+    KernelsPriority GetKernelsPriority(const Params& params) const override;
+    ParamsKey GetSupportedKey() const override;
+    KernelsData GetKernelsData(const Params& params) const override;
+
+protected:
+    bool Validate(const Params& p) const override;
+    JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
+};
+}  // namespace kernel_selector
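
A quick way to sanity-check the kernel is a scalar reference of the same transform on
the host. The sketch below is illustrative only (it is not part of the patch, and
quantize_scale_shift_ref is a hypothetical helper name); it mirrors the
CAN_USE_OUTPUT_RANGE branch with all optional stages enabled, leaving out the final
saturating convert that the kernel applies for integral outputs:

    #include <algorithm>
    #include <cmath>

    // Scalar model of one element of the vload8 kernel's output-range branch.
    // Parameters mirror the per-tensor JIT constants (IN_SCALE_VAL, IN_SHIFT_VAL, ...).
    float quantize_scale_shift_ref(float in,
                                   float in_scale, float in_shift,    // HAS_PRE_SHIFT stage
                                   float out_scale, float out_shift,  // HAS_POST_SCALE / HAS_POST_SHIFT
                                   float out_lo, float out_hi) {      // HAS_CLAMP bounds
        float val = in * in_scale + in_shift;
        // HAS_OUTPUT_RANGE_ROUND: like OpenCL round(), std::round breaks ties away from zero.
        val = std::round(val);
        val = val * out_scale + out_shift;
        return std::min(std::max(val, out_lo), out_hi);
    }

Comparing this against device results for a handful of values, for both FP and (U)INT8
outputs (where the kernel additionally rounds and saturates during the conversion), is
usually enough to catch scale/shift wiring mistakes.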