forked from openvinotoolkit/openvino

Commit
Signed-off-by: fishbell <[email protected]>
Showing 3 changed files with 335 additions and 0 deletions.
142 changes: 142 additions & 0 deletions
src/plugins/intel_gpu/src/kernel_selector/cl_kernels/quantize_gpu_scale_shift_vload8_opt.cl

@@ -0,0 +1,142 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/fetch_data.cl"

#define TO_OUTPUT_TYPE  CAT(convert_, OUTPUT_TYPE)
#define INPUT0_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
#define INPUT1_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, 8)
#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)

#define TO_VECTOR_TYPE_IMPL_8(elem_type) CAT(convert_##elem_type, 8)
#define TO_VECTOR_TYPE(elem_type, size)  CAT(TO_VECTOR_TYPE_IMPL_, size)(elem_type)

#define TO_VECTOR_TYPE_IMPL_SAT_8(elem_type)     CAT(convert_##elem_type, 8##_sat)
#define TO_VECTOR_TYPE_IMPL_SAT_RTE_8(elem_type) CAT(convert_##elem_type, 8##_sat_rte)
#define TO_VECTOR_TYPE_SAT(elem_type, size)      CAT(TO_VECTOR_TYPE_IMPL_SAT_, size)(elem_type)
#define TO_VECTOR_TYPE_SAT_RTE(elem_type, size)  CAT(TO_VECTOR_TYPE_IMPL_SAT_RTE_, size)(elem_type)

#define VLOAD_DECLS vload8(global_id, input)
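// Each work item handles one 8-wide vector: vload8(global_id, input) reads the
// 8 consecutive INPUT0_TYPE elements starting at element offset global_id * 8.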

#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
#endif
#ifndef IS_DYNAMIC
__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2)))
#endif
KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
                                            const __global INPUT0_TYPE* input,
                                            const __global INPUT1_TYPE* input_low,
                                            const __global INPUT2_TYPE* input_high,
                                            const __global INPUT3_TYPE* output_low,
                                            const __global INPUT4_TYPE* output_high,
                                            const __global INPUT5_TYPE* input_scale,
                                            const __global INPUT6_TYPE* input_shift,
                                            const __global INPUT7_TYPE* output_scale,
                                            const __global INPUT8_TYPE* output_shift,
                                            __global OUTPUT_TYPE* output)
{
    const int global_id = get_global_id(0);

    const INPUT0_VEC_TYPE in0 = VLOAD_DECLS;

    OUTPUT_VEC_TYPE res;

    INPUT1_TYPE input_scale_val  = IN_SCALE_VAL;
    INPUT1_TYPE input_shift_val  = IN_SHIFT_VAL;
    INPUT1_TYPE output_scale_val = OUT_SCALE_VAL;
    INPUT1_TYPE output_shift_val = OUT_SHIFT_VAL;

#if HAS_CLAMP
#if CAN_USE_OUTPUT_RANGE
    INPUT1_TYPE output_low_val  = OUT_LO_VAL;
    INPUT1_TYPE output_high_val = OUT_HI_VAL;
#else
    INPUT1_TYPE input_low_val  = IN_LO_VAL;
    INPUT1_TYPE input_high_val = IN_HI_VAL;
#endif // CAN_USE_OUTPUT_RANGE
#endif // HAS_CLAMP

    // ************************************************************* //
    // Calculations for optimized branch with the output range usage //
    // ************************************************************* //

#if CAN_USE_OUTPUT_RANGE

#if HAS_PRE_SHIFT
    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val + input_shift_val;
#else
    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val;
#endif

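    // Note: round() in OpenCL rounds halfway cases away from zero; the final
    // conversion for integer outputs below uses _sat_rte (round-to-nearest-even).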
#if HAS_OUTPUT_RANGE_ROUND
    val = round(val);
#endif

#if HAS_POST_SCALE
    val *= output_scale_val;
#endif

#if HAS_POST_SHIFT
    val += output_shift_val;
#endif

#if HAS_CLAMP
#if HAS_MIN_CLAMP && HAS_MAX_CLAMP
    val = clamp(val, output_low_val, output_high_val);
#elif HAS_MIN_CLAMP
    val = max(val, output_low_val);
#else // HAS_MAX_CLAMP
    val = min(val, output_high_val);
#endif
#endif // HAS_CLAMP

    // ************************************************************** //
    // Calculations for alternative branch with the input range usage //
    // ************************************************************** //

#else // CAN_USE_OUTPUT_RANGE

#if HAS_CLAMP
    INPUT1_VEC_TYPE val = clamp(TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0), input_low_val, input_high_val);
#else
    INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0);
#endif

#if HAS_PRE_SHIFT
    val = round(val * input_scale_val + input_shift_val);
#else
    val = round(val * input_scale_val);
#endif

#if HAS_POST_SCALE
    val *= output_scale_val;
#endif

#if HAS_POST_SHIFT
    val += output_shift_val;
#endif

#endif // CAN_USE_OUTPUT_RANGE

    // *********************************** //
    // Common section with results writing //
    // *********************************** //

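    // NOTE: 'of' is not defined anywhere in this kernel; the FEATURE_BLOCKED_FORMAT
    // guard below appears to rely on a feature index that still needs to be computed.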
#if FEATURE_BLOCKED_FORMAT
    if (of < OUTPUT_FEATURE_NUM)
#endif
#if OUTPUT_IS_FP
        res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
#else
        res = TO_VECTOR_TYPE_SAT_RTE(OUTPUT_TYPE, 8)(val);
#endif

    vstore8(res, global_id, output);
}

#undef TO_OUTPUT_TYPE
#undef INPUT0_VEC_TYPE
#undef INPUT1_VEC_TYPE
#undef OUTPUT_VEC_TYPE
#undef TO_VECTOR_TYPE_IMPL_8
#undef TO_VECTOR_TYPE
#undef TO_VECTOR_TYPE_IMPL_SAT_8
#undef TO_VECTOR_TYPE_IMPL_SAT_RTE_8
#undef TO_VECTOR_TYPE_SAT
#undef TO_VECTOR_TYPE_SAT_RTE
#undef VLOAD_DECLS
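For reference, here is a scalar C++ sketch of what the CAN_USE_OUTPUT_RANGE branch computes per element when all HAS_* flags are enabled. The function name and the plain-float signature are illustrative only, not part of the patch; in the kernel the constants arrive as JIT defines (IN_SCALE_VAL, IN_SHIFT_VAL, OUT_SCALE_VAL, OUT_SHIFT_VAL, OUT_LO_VAL, OUT_HI_VAL).

#include <algorithm>
#include <cmath>

// Scalar model of the kernel's output-range branch (all HAS_* flags on).
float quantize_scale_shift_ref(float x,
                               float in_scale, float in_shift,
                               float out_scale, float out_shift,
                               float out_lo, float out_hi) {
    float val = x * in_scale + in_shift;     // HAS_PRE_SHIFT
    val = std::round(val);                   // HAS_OUTPUT_RANGE_ROUND
    val *= out_scale;                        // HAS_POST_SCALE
    val += out_shift;                        // HAS_POST_SHIFT
    return std::clamp(val, out_lo, out_hi);  // HAS_CLAMP (min and max)
}

The alternative branch differs in that it clamps in the input domain before scaling and always rounds after the scale/shift.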
168 changes: 168 additions & 0 deletions
...intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.cpp

@@ -0,0 +1,168 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "quantize_kernel_scale_shift_vload8_opt.h"

#include <string>

#include "kernel_selector_utils.h"

static const size_t sub_group_size = 32;

namespace kernel_selector {
ParamsKey QuantizeKernelScaleShift_vload8::GetSupportedKey() const {
    ParamsKey k;
    k.EnableInputDataType(Datatype::UINT8);
    k.EnableInputDataType(Datatype::INT8);
    k.EnableInputDataType(Datatype::F16);
    k.EnableInputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::F16);
    k.EnableOutputDataType(Datatype::F32);
    k.EnableOutputDataType(Datatype::UINT8);
    k.EnableOutputDataType(Datatype::INT8);
    k.EnableAllInputLayout();
    k.EnableAllOutputLayout();
    k.EnableTensorOffset();
    k.EnableTensorPitches();
    k.EnableBatching();
    k.EnableDifferentTypes();
    k.EnableQuantizeScaleShiftOpt();
    k.EnableDynamicShapesSupport();
    return k;
}

CommonDispatchData QuantizeKernelScaleShift_vload8::SetDefault(const quantize_params& params) const {
    CommonDispatchData dispatchData;
    // TODO: blocked formats may need special dispatch handling.
    dispatchData.gws[0] = std::max(params.outputs[0].LogicalSize() / 8, (size_t)1);
    dispatchData.gws[1] = 1;
    dispatchData.gws[2] = 1;
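    // e.g. a 1x64x56x56 output has 200,704 elements, so gws = {25088, 1, 1};
    // each work item consumes one 8-element vector via vload8/vstore8.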
    dispatchData.lws = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]},
                                                     params.engineInfo);
    return dispatchData;
}

JitConstants QuantizeKernelScaleShift_vload8::GetJitConstants(const quantize_params& params,
                                                              const CommonDispatchData& dispatchData) const {
    JitConstants jit = Parent::GetJitConstants(params, dispatchData);

    if (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
        params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32 ||
        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv32 ||
        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 ||
        params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 ||
        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16 ||
        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv32 ||
        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv16 ||
        params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv32) {
        jit.AddConstant(MakeJitConstant("FEATURE_BLOCKED_FORMAT", true));
        jit.AddConstant(MakeJitConstant("GWS_BATCH", 2));
        jit.AddConstant(MakeJitConstant("GWS_FEATURE", 1));
        jit.AddConstant(MakeJitConstant("GWS_YX", 0));
        jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
    } else {
        auto tensor_jits = GetTensorFriendlyWorkGroupsJit(params.outputs[0]);
        jit.Merge(tensor_jits);
    }

    auto can_use_output_range = params.per_tensor_output_range && params.out_lo < params.out_hi;
    auto has_output_range_round =
        !(params.outputs[0].GetDType() == Datatype::INT8 || params.outputs[0].GetDType() == Datatype::UINT8);

    jit.AddConstant(MakeJitConstant("HAS_POST_SCALE", params.has_post_scale));
    jit.AddConstant(MakeJitConstant("HAS_POST_SHIFT", params.has_post_shift));
    jit.AddConstant(MakeJitConstant("HAS_PRE_SHIFT", params.has_pre_shift));
    jit.AddConstant(MakeJitConstant("HAS_CLAMP", params.has_clamp));
    jit.AddConstant(MakeJitConstant("HAS_MIN_CLAMP", params.has_min_clamp));
    jit.AddConstant(MakeJitConstant("HAS_MAX_CLAMP", params.has_max_clamp));
    jit.AddConstant(MakeJitConstant("IN_LO_VAL", params.in_lo));
    jit.AddConstant(MakeJitConstant("IN_HI_VAL", params.in_hi));
    jit.AddConstant(MakeJitConstant("OUT_LO_VAL", params.out_lo));
    jit.AddConstant(MakeJitConstant("OUT_HI_VAL", params.out_hi));
    jit.AddConstant(MakeJitConstant("IN_SCALE_VAL", params.in_scale));
    jit.AddConstant(MakeJitConstant("IN_SHIFT_VAL", params.in_shift));
    jit.AddConstant(MakeJitConstant("OUT_SCALE_VAL", params.out_scale));
    jit.AddConstant(MakeJitConstant("OUT_SHIFT_VAL", params.out_shift));
    jit.AddConstant(MakeJitConstant("CAN_USE_OUTPUT_RANGE", can_use_output_range));
    jit.AddConstant(MakeJitConstant("HAS_OUTPUT_RANGE_ROUND", has_output_range_round));

    return jit;
}

bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
    const quantize_params& params = static_cast<const quantize_params&>(p);
    if (params.inputs.size() != 9)
        return false;

    // This kernel is optimized for per-tensor quantization params for now.
    if (!params.per_tensor_input_range || !params.per_tensor_output_range || !params.per_tensor_input_scale ||
        !params.per_tensor_output_scale || !params.per_tensor_output_shift ||
        (params.has_pre_shift && !params.per_tensor_input_shift))
        return false;
    // TBD: do we really need this strict block-size checking to support blocked formats?
    for (size_t i = 0; i < params.inputs.size(); i++) {
        const auto input_layout = params.inputs[i].GetLayout();
        const auto batch_size = params.inputs[i].Batch().v;
        const auto feature_size = params.inputs[i].Feature().v;
        if ((input_layout == DataLayout::b_fs_yx_fsv16 && feature_size % 16 != 0) ||
            (input_layout == DataLayout::b_fs_yx_fsv32 && feature_size % 32 != 0) ||
            (input_layout == DataLayout::b_fs_zyx_fsv16 && feature_size % 16 != 0) ||
            (input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
            input_layout == DataLayout::fs_b_yx_fsv32 ||
            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 && (feature_size % 16 != 0 || batch_size % 32 != 0)) ||
            (input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 && (feature_size % 32 != 0 || batch_size % 32 != 0)))
            return false;
    }
    if ((params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 && params.outputs[0].Feature().v % 32 != 0) ||
        (params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
        (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4 && params.outputs[0].Feature().v % 8 != 0) ||
        params.outputs[0].GetLayout() == DataLayout::fs_b_yx_fsv32 ||
        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
         (params.outputs[0].Feature().v % 16 != 0 || params.outputs[0].Batch().v % 32 != 0)) ||
        (params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 &&
         (params.outputs[0].Feature().v % 32 != 0 || params.outputs[0].Batch().v % 32 != 0)))
        return false;
    // TBD: maybe stricter checks are needed?
    return true;
}

KernelsData QuantizeKernelScaleShift_vload8::GetKernelsData(const Params& params) const {
    assert(params.GetType() == KernelType::QUANTIZE);

    if (!Validate(params)) {
        return {};
    }

    KernelData kd = KernelData::Default<quantize_params>(params);
    quantize_params& nparams = *static_cast<quantize_params*>(kd.params.get());

    auto dispatchData = SetDefault(nparams);
    auto entry_point = GetEntryPoint(kernelName, nparams.layerID, params);
    auto cldnn_jit = GetJitConstants(nparams, dispatchData);
    auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

    GetUpdateDispatchDataFunc(kd);

    auto& kernel = kd.kernels[0];

    kernel.params.workGroups.global = dispatchData.gws;
    kernel.params.workGroups.local = dispatchData.lws;
    kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT);
    kernel.params.arguments =
        GetArgsDesc(static_cast<int>(nparams.inputs.size()), false, false, 0, 1, nparams.has_dynamic_tensors());

    return {kd};
}

KernelsPriority QuantizeKernelScaleShift_vload8::GetKernelsPriority(const Params& /*params*/) const {
    return FORCE_PRIORITY_8;
}
}  // namespace kernel_selector
25 changes: 25 additions & 0 deletions
...s/intel_gpu/src/kernel_selector/kernels/quantize/quantize_kernel_scale_shift_vload8_opt.h

@@ -0,0 +1,25 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "quantize_kernel_base.h"

namespace kernel_selector {

class QuantizeKernelScaleShift_vload8 : public QuantizeKernelBase {
public:
    using Parent = QuantizeKernelBase;

    QuantizeKernelScaleShift_vload8() : QuantizeKernelBase("quantize_gpu_scale_shift_vload8_opt") {}
    virtual ~QuantizeKernelScaleShift_vload8() {}

    CommonDispatchData SetDefault(const quantize_params& params) const override;
    KernelsPriority GetKernelsPriority(const Params& params) const override;
    ParamsKey GetSupportedKey() const override;
    KernelsData GetKernelsData(const Params& params) const override;

protected:
    bool Validate(const Params& p) const override;
    JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
};
}  // namespace kernel_selector