initial trial for perf evaluation
Signed-off-by: fishbell <[email protected]>
songbell committed Jan 13, 2025
1 parent 65f6ce8 commit c644449
Showing 3 changed files with 335 additions and 0 deletions.
quantize_gpu_scale_shift_vload8_opt.cl
@@ -0,0 +1,142 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "include/batch_headers/fetch_data.cl"

#define TO_OUTPUT_TYPE CAT(convert_, OUTPUT_TYPE)
#define INPUT0_VEC_TYPE MAKE_VECTOR_TYPE(INPUT0_TYPE, 8)
#define INPUT1_VEC_TYPE MAKE_VECTOR_TYPE(INPUT1_TYPE, 8)
#define OUTPUT_VEC_TYPE MAKE_VECTOR_TYPE(OUTPUT_TYPE, 8)

#define TO_VECTOR_TYPE_IMPL_8(elem_type) CAT(convert_##elem_type, 8)
#define TO_VECTOR_TYPE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_, size)(elem_type)

#define TO_VECTOR_TYPE_IMPL_SAT_8(elem_type) CAT(convert_##elem_type, 8##_sat)
#define TO_VECTOR_TYPE_IMPL_SAT_RTE_8(elem_type) CAT(convert_##elem_type, 8##_sat_rte)
#define TO_VECTOR_TYPE_SAT(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_, size)(elem_type)
#define TO_VECTOR_TYPE_SAT_RTE(elem_type, size) CAT(TO_VECTOR_TYPE_IMPL_SAT_RTE_, size)(elem_type)
#define VLOAD_DECLS vload8(global_id, input)
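// e.g. with OUTPUT_TYPE == uchar, TO_VECTOR_TYPE_SAT_RTE(uchar, 8) expands to
// convert_uchar8_sat_rte, the OpenCL built-in that converts an 8-wide vector
// to uchar8 with round-to-nearest-even rounding and saturation.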
#ifdef SUB_GROUP_SIZE
REQD_SUB_GROUP_SIZE(SUB_GROUP_SIZE)
#endif
#ifndef IS_DYNAMIC
__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2)))
#endif
KERNEL(quantize_gpu_scale_shift_vload8_opt)(OPTIONAL_SHAPE_INFO_ARG
const __global INPUT0_TYPE* input,
const __global INPUT1_TYPE* input_low,
const __global INPUT2_TYPE* input_high,
const __global INPUT3_TYPE* output_low,
const __global INPUT4_TYPE* output_high,
const __global INPUT5_TYPE* input_scale,
const __global INPUT6_TYPE* input_shift,
const __global INPUT7_TYPE* output_scale,
const __global INPUT8_TYPE* output_shift,
__global OUTPUT_TYPE* output)
{
const int global_id = get_global_id(0);

const INPUT0_VEC_TYPE in0 = VLOAD_DECLS;

OUTPUT_VEC_TYPE res;

INPUT1_TYPE input_scale_val = IN_SCALE_VAL;
INPUT1_TYPE input_shift_val = IN_SHIFT_VAL;
INPUT1_TYPE output_scale_val = OUT_SCALE_VAL;
INPUT1_TYPE output_shift_val = OUT_SHIFT_VAL;

#if HAS_CLAMP
#if CAN_USE_OUTPUT_RANGE
INPUT1_TYPE output_low_val = OUT_LO_VAL;
INPUT1_TYPE output_high_val = OUT_HI_VAL;
#else
INPUT1_TYPE input_low_val = IN_LO_VAL;
INPUT1_TYPE input_high_val = IN_HI_VAL;
#endif // CAN_USE_OUTPUT_RANGE
#endif // HAS_CLAMP

// ************************************************************* //
// Calculations for optimized branch with the output range usage //
// ************************************************************* //

#if CAN_USE_OUTPUT_RANGE

#if HAS_PRE_SHIFT
INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val + input_shift_val;
#else
INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0) * input_scale_val;
#endif

#if HAS_OUTPUT_RANGE_ROUND
val = round(val);
#endif

#if HAS_POST_SCALE
val *= output_scale_val;
#endif

#if HAS_POST_SHIFT
val += output_shift_val;
#endif

#if HAS_CLAMP
#if HAS_MIN_CLAMP && HAS_MAX_CLAMP
val = clamp(val, output_low_val, output_high_val);
#elif HAS_MIN_CLAMP
val = max(val, output_low_val);
#else // HAS_MAX_CLAMP
val = min(val, output_high_val);
#endif
#endif // HAS_CLAMP

// ************************************************************** //
// Calculations for alternative branch with the input range usage //
// ************************************************************** //

#else // CAN_USE_OUTPUT_RANGE

#if HAS_CLAMP
INPUT1_VEC_TYPE val = clamp(TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0), input_low_val, input_high_val);
#else
INPUT1_VEC_TYPE val = TO_VECTOR_TYPE(INPUT1_TYPE, 8)(in0);
#endif

#if HAS_PRE_SHIFT
val = round(val * input_scale_val + input_shift_val);
#else
val = round(val * input_scale_val);
#endif

#if HAS_POST_SCALE
val *= output_scale_val;
#endif

#if HAS_POST_SHIFT
val += output_shift_val;
#endif

#endif // CAN_USE_OUTPUT_RANGE

// *********************************** //
// Common section with results writing //
// *********************************** //

#if FEATURE_BLOCKED_FORMAT
// For blocked formats, skip padded output features ('of' is the output
// feature index; deriving it from the 1D global id is still TODO here).
if (of < OUTPUT_FEATURE_NUM)
#endif
#if OUTPUT_IS_FP
res = TO_VECTOR_TYPE_SAT(OUTPUT_TYPE, 8)(val);
#else
res = TO_VECTOR_TYPE_SAT_RTE(OUTPUT_TYPE, 8)(val);
#endif

vstore8(res, global_id, output);
}

#undef TO_OUTPUT_TYPE
#undef TO_VECTOR_TYPE_SAT_RTE
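For reference, in the CAN_USE_OUTPUT_RANGE branch above each work item loads 8 inputs with vload8, applies scale/shift, an optional round, post-scale/shift and clamping, then stores 8 results with vstore8. Below is a minimal scalar C++ sketch of the per-element math, assuming an FP32 input, an INT8 output (so HAS_OUTPUT_RANGE_ROUND is off, per GetJitConstants in the next file) and all other optional stages enabled; the function name and sample values are illustrative only.

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Scalar model of one element of the CAN_USE_OUTPUT_RANGE branch;
// the kernel applies the same arithmetic to 8 elements at once.
static int8_t quantize_scale_shift(float in,
                                   float in_scale, float in_shift,    // IN_SCALE_VAL / IN_SHIFT_VAL
                                   float out_scale, float out_shift,  // OUT_SCALE_VAL / OUT_SHIFT_VAL
                                   float out_lo, float out_hi) {      // OUT_LO_VAL / OUT_HI_VAL
    float val = in * in_scale + in_shift;           // HAS_PRE_SHIFT
    // HAS_OUTPUT_RANGE_ROUND would insert val = round(val); here for non-integer outputs.
    val = val * out_scale + out_shift;              // HAS_POST_SCALE / HAS_POST_SHIFT
    val = std::max(out_lo, std::min(val, out_hi));  // HAS_CLAMP with both bounds
    // Integer outputs go through convert_<type>8_sat_rte: round-to-nearest-even
    // with saturation; the clamp above already bounds the value.
    return static_cast<int8_t>(std::lrintf(val));
}

int main() {
    // Hypothetical per-tensor parameters mapping FP32 activations into [-128, 127].
    printf("%d\n", quantize_scale_shift(0.37f, 255.0f, -128.0f, 1.0f, 0.0f, -128.0f, 127.0f));
    return 0;
}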
quantize_kernel_scale_shift_vload8_opt.cpp
@@ -0,0 +1,168 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#include "quantize_kernel_scale_shift_vload8_opt.h"

#include <iostream>
#include <string>

#include "kernel_selector_utils.h"

static const size_t sub_group_size = 32;
static const size_t feature_size = 32;

namespace kernel_selector {
ParamsKey QuantizeKernelScaleShift_vload8::GetSupportedKey() const {
ParamsKey k;
k.EnableInputDataType(Datatype::UINT8);
k.EnableInputDataType(Datatype::INT8);
k.EnableInputDataType(Datatype::F16);
k.EnableInputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::F16);
k.EnableOutputDataType(Datatype::F32);
k.EnableOutputDataType(Datatype::UINT8);
k.EnableOutputDataType(Datatype::INT8);
k.EnableAllInputLayout();
k.EnableAllOutputLayout();
k.EnableTensorOffset();
k.EnableTensorPitches();
k.EnableBatching();
k.EnableDifferentTypes();
k.EnableQuantizeScaleShiftOpt();
k.EnableDynamicShapesSupport();
return k;
}

CommonDispatchData QuantizeKernelScaleShift_vload8::SetDefault(const quantize_params& params) const {
CommonDispatchData dispatchData;
// TODO: blocked formats may need special handling here.
dispatchData.gws[0] = std::max(params.outputs[0].LogicalSize() / 8, (size_t)1);
dispatchData.gws[1] = 1;
dispatchData.gws[2] = 1;
dispatchData.lws = GetOptimalLocalWorkGroupSizes({dispatchData.gws[0], dispatchData.gws[1], dispatchData.gws[2]},
params.engineInfo);
return dispatchData;
}

JitConstants QuantizeKernelScaleShift_vload8::GetJitConstants(const quantize_params& params,
const CommonDispatchData& dispatchData) const {
JitConstants jit = Parent::GetJitConstants(params, dispatchData);

if (params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 ||
params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 ||
params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv32 ||
params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv16 ||
params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv16_fsv32 ||
params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 ||
params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 ||
params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv16 ||
params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv16_fsv32 ||
params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv16 ||
params.outputs[0].GetLayout() == DataLayout::bs_fs_zyx_bsv32_fsv32) {
jit.AddConstant(MakeJitConstant("FEATURE_BLOCKED_FORMAT", true));
jit.AddConstant(MakeJitConstant("GWS_BATCH", 2));
jit.AddConstant(MakeJitConstant("GWS_FEATURE", 1));
jit.AddConstant(MakeJitConstant("GWS_YX", 0));
jit.AddConstant(MakeJitConstant("SUB_GROUP_SIZE", sub_group_size));
} else {
auto tensor_jits = GetTensorFriendlyWorkGroupsJit(params.outputs[0]);
jit.Merge(tensor_jits);
}

auto can_use_output_range = params.per_tensor_output_range && params.out_lo < params.out_hi;
auto has_output_range_round =
!(params.outputs[0].GetDType() == Datatype::INT8 || params.outputs[0].GetDType() == Datatype::UINT8);

jit.AddConstant(MakeJitConstant("HAS_POST_SCALE", params.has_post_scale));
jit.AddConstant(MakeJitConstant("HAS_POST_SHIFT", params.has_post_shift));
jit.AddConstant(MakeJitConstant("HAS_PRE_SHIFT", params.has_pre_shift));
jit.AddConstant(MakeJitConstant("HAS_CLAMP", params.has_clamp));
jit.AddConstant(MakeJitConstant("HAS_MIN_CLAMP", params.has_min_clamp));
jit.AddConstant(MakeJitConstant("HAS_MAX_CLAMP", params.has_max_clamp));
jit.AddConstant(MakeJitConstant("IN_LO_VAL", params.in_lo));
jit.AddConstant(MakeJitConstant("IN_HI_VAL", params.in_hi));
jit.AddConstant(MakeJitConstant("OUT_LO_VAL", params.out_lo));
jit.AddConstant(MakeJitConstant("OUT_HI_VAL", params.out_hi));
jit.AddConstant(MakeJitConstant("IN_SCALE_VAL", params.in_scale));
jit.AddConstant(MakeJitConstant("IN_SHIFT_VAL", params.in_shift));
jit.AddConstant(MakeJitConstant("OUT_SCALE_VAL", params.out_scale));
jit.AddConstant(MakeJitConstant("OUT_SHIFT_VAL", params.out_shift));
jit.AddConstant(MakeJitConstant("CAN_USE_OUTPUT_RANGE", can_use_output_range));
jit.AddConstant(MakeJitConstant("HAS_OUTPUT_RANGE_ROUND", has_output_range_round));

return jit;
}

bool QuantizeKernelScaleShift_vload8::Validate(const Params& p) const {
const quantize_params& params = static_cast<const quantize_params&>(p);
if (params.inputs.size() != 9)
return false;

// this kernel is optimized for per-tensor quantization params for now
if (!params.per_tensor_input_range || !params.per_tensor_output_range || !params.per_tensor_input_scale ||
!params.per_tensor_output_scale || !params.per_tensor_output_shift ||
(params.has_pre_shift && !params.per_tensor_input_shift))
return false;
// TBD: do we really need the strict block_size checking to support blocked formats?
for (size_t i = 0; i < params.inputs.size(); i++) {
const auto input_layout = params.inputs[i].GetLayout();
const auto batch_size = params.inputs[i].Batch().v;
const auto feature_size = params.inputs[i].Feature().v;
if ((input_layout == DataLayout::b_fs_yx_fsv16 && feature_size % 16 != 0) ||
(input_layout == DataLayout::b_fs_yx_fsv32 && feature_size % 32 != 0) ||
(input_layout == DataLayout::b_fs_zyx_fsv16 && feature_size % 16 != 0) ||
(input_layout == DataLayout::b_fs_yx_fsv4 && feature_size % 8 != 0) ||
input_layout == DataLayout::fs_b_yx_fsv32 ||
(input_layout == DataLayout::bs_fs_yx_bsv32_fsv16 && (feature_size % 16 != 0 || batch_size % 32 != 0)) ||
(input_layout == DataLayout::bs_fs_yx_bsv32_fsv32 && (feature_size % 32 != 0 || batch_size % 32 != 0)))
return false;
}
if ((params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
(params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv32 && params.outputs[0].Feature().v % 32 != 0) ||
(params.outputs[0].GetLayout() == DataLayout::b_fs_zyx_fsv16 && params.outputs[0].Feature().v % 16 != 0) ||
(params.outputs[0].GetLayout() == DataLayout::b_fs_yx_fsv4 && params.outputs[0].Feature().v % 8 != 0) ||
params.outputs[0].GetLayout() == DataLayout::fs_b_yx_fsv32 ||
(params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv16 &&
(params.outputs[0].Feature().v % 16 != 0 || params.outputs[0].Batch().v % 32 != 0)) ||
(params.outputs[0].GetLayout() == DataLayout::bs_fs_yx_bsv32_fsv32 &&
(params.outputs[0].Feature().v % 32 != 0 || params.outputs[0].Batch().v % 32 != 0)))
return false;
// TBD: maybe stricter checks are needed?
return true;
}

KernelsData QuantizeKernelScaleShift_vload8::GetKernelsData(const Params& params) const {
assert(params.GetType() == KernelType::QUANTIZE);

KernelData kd = KernelData::Default<quantize_params>(params);
quantize_params& nparams = *static_cast<quantize_params*>(kd.params.get());

if (!Validate(params)) {
return {};
}

auto dispatchData = SetDefault(nparams);
auto entry_point = GetEntryPoint(kernelName, nparams.layerID, params);
auto cldnn_jit = GetJitConstants(nparams, dispatchData);
auto jit = CreateJit(kernelName, cldnn_jit, entry_point);

GetUpdateDispatchDataFunc(kd);

auto& kernel = kd.kernels[0];

kernel.params.workGroups.global = dispatchData.gws;
kernel.params.workGroups.local = dispatchData.lws;
kernel.code.kernelString = GetKernelString(kernelName, jit, entry_point, params.engineInfo, EXE_MODE_DEFAULT);
kernel.params.arguments =
GetArgsDesc(static_cast<int>(nparams.inputs.size()), false, false, 0, 1, nparams.has_dynamic_tensors());

return {kd};
}

KernelsPriority QuantizeKernelScaleShift_vload8::GetKernelsPriority(const Params& /*params*/) const {
return FORCE_PRIORITY_8;
}
} // namespace kernel_selector
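As a sanity check on SetDefault above: one work item covers 8 output elements, so the flattened output size divided by 8 becomes the global work size. A standalone C++ sketch with a hypothetical tensor shape (the real values come from params.outputs[0]):

#include <algorithm>
#include <cstddef>
#include <cstdio>

int main() {
    // Hypothetical 1x32x56x56 output tensor.
    const size_t logical_size = 1 * 32 * 56 * 56;               // 100352 elements
    // Mirrors SetDefault: one work item per 8 elements, at least one work item.
    const size_t gws0 = std::max(logical_size / 8, (size_t)1);  // 12544
    printf("gws = { %zu, 1, 1 }\n", gws0);
    return 0;
}

Note that Validate does not require LogicalSize() to be a multiple of 8, so a tail of up to 7 elements would fall outside the vload8/vstore8 range; presumably the shapes targeted by this trial are all 8-aligned.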
quantize_kernel_scale_shift_vload8_opt.h
@@ -0,0 +1,25 @@
// Copyright (C) 2018-2024 Intel Corporation
// SPDX-License-Identifier: Apache-2.0
//

#pragma once

#include "quantize_kernel_base.h"

namespace kernel_selector {

class QuantizeKernelScaleShift_vload8 : public QuantizeKernelBase {
public:
using Parent = QuantizeKernelBase;

QuantizeKernelScaleShift_vload8() : QuantizeKernelBase("quantize_gpu_scale_shift_vload8_opt") {}
virtual ~QuantizeKernelScaleShift_vload8() {}
CommonDispatchData SetDefault(const quantize_params& params) const override;
KernelsPriority GetKernelsPriority(const Params& params) const override;
ParamsKey GetSupportedKey() const override;
KernelsData GetKernelsData(const Params& params) const override;
protected:
bool Validate(const Params& p) const override;
JitConstants GetJitConstants(const quantize_params& params, const CommonDispatchData& dispatchData) const override;
};
} // namespace kernel_selector
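For the selector to ever pick this implementation, the new class presumably also has to be attached to the quantize kernel selector; that registration is not part of this three-file diff. A sketch assuming the usual Attach<> pattern used elsewhere in kernel_selector (the file name and the pre-existing entry are assumptions):

// quantize_kernel_selector.cpp (sketch; not part of this commit)
#include "quantize_kernel_selector.h"

#include "quantize_kernel_ref.h"
#include "quantize_kernel_scale_shift_vload8_opt.h"

namespace kernel_selector {
quantize_kernel_selector::quantize_kernel_selector() {
    Attach<QuantizeKernelRef>();                // assumed existing reference kernel
    Attach<QuantizeKernelScaleShift_vload8>();  // new vload8-optimized kernel
}
}  // namespace kernel_selector

With FORCE_PRIORITY_8 (smaller FORCE_PRIORITY_* values win), the kernel outranks lower-priority fallbacks whenever Validate accepts the shapes and per-tensor parameters.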
