From 216389c8e32010b15895b4def1a76c3eae209c04 Mon Sep 17 00:00:00 2001
From: dijopaul
Date: Wed, 23 Oct 2024 06:51:40 -0700
Subject: [PATCH 01/18] Adding mean and where ops optimized on HiFi

---
 backends/cadence/aot/functions_hifi.yaml      |   7 +-
 backends/cadence/hifi/kernels/CMakeLists.txt  |   2 +
 backends/cadence/hifi/kernels/kernels.h       |  28 +
 .../cadence/hifi/operators/CMakeLists.txt     |  12 +-
 backends/cadence/hifi/operators/op_mean.cpp   | 170 ++++
 backends/cadence/hifi/operators/op_where.cpp  | 176 ++++
 .../nnlib/xa_nn_elm_where_f32xf32_f32.c       | 838 ++++++++++++++++++
 .../third-party/nnlib/xa_nn_reduce_32_32.c    | 647 ++++++++++++++
 8 files changed, 1870 insertions(+), 10 deletions(-)
 create mode 100644 backends/cadence/hifi/operators/op_mean.cpp
 create mode 100644 backends/cadence/hifi/operators/op_where.cpp
 create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
 create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c

diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml
index 70b2dd0207..8cf0c2de0a 100644
--- a/backends/cadence/aot/functions_hifi.yaml
+++ b/backends/cadence/aot/functions_hifi.yaml
@@ -62,6 +62,11 @@
     - arg_meta: null
       kernel_name: torch::executor::full_out
 
+- op: mean.out
+  kernels:
+    - arg_meta: null
+      kernel_name: cadence::impl::HiFi::mean_dim_out
+ 
 - op: mul.out
   kernels:
     - arg_meta: null
@@ -105,7 +110,7 @@
 - op: where.self_out
   kernels:
     - arg_meta: null
-      kernel_name: torch::executor::where_out
+      kernel_name: cadence::impl::HiFi::where_out
 
 # custom ops
 - func: cadence::quantize_per_tensor.out(Tensor input, float scale, int zero_point, int quant_min, int quant_max, ScalarType dtype, *, Tensor(a!) out) -> Tensor(a!)
diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt
index 8fee7e8536..9321cc544e 100644
--- a/backends/cadence/hifi/kernels/CMakeLists.txt
+++ b/backends/cadence/hifi/kernels/CMakeLists.txt
@@ -13,6 +13,8 @@ add_library(
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c
   ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c
+  ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c
 )
 # Let files say "include <executorch/path/to/header.h>".
 set(_common_include_directories ${EXECUTORCH_ROOT}/..)
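
[Editor's illustration, not part of the patch: the where and mean kernels registered in the YAML hunk above are implemented later in this patch; xa_nn_elm_where_f32xf32_f32 selects between two float inputs per element, and xa_nn_reduce_mean_4D_f32_f32 accumulates a reduce-sum into a scratch buffer and scales by the reciprocal of the reduced element count. The short C program below is a scalar reference model of the where contract (out[i] = cond[i] ? a[i] : b[i]) that the vectorized HiFi path further down can be checked against; the buffer contents are made-up example values.]

#include <stdio.h>

/* Scalar reference for the element-wise where kernel:
   out[i] = cond[i] ? a[i] : b[i] over float buffers. */
static void where_ref(float *out, const float *a, const float *b,
                      const unsigned char *cond, int n) {
  for (int i = 0; i < n; i++) {
    out[i] = cond[i] ? a[i] : b[i];
  }
}

int main(void) {
  float a[4] = {1.0f, 2.0f, 3.0f, 4.0f};
  float b[4] = {10.0f, 20.0f, 30.0f, 40.0f};
  unsigned char cond[4] = {1, 0, 1, 0};
  float out[4];
  where_ref(out, a, b, cond, 4);
  for (int i = 0; i < 4; i++) {
    printf("%.1f\n", out[i]); /* expected: 1.0 20.0 3.0 40.0 */
  }
  return 0;
}
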
diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index a206635a28..2087c9761b 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -55,6 +55,34 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); +extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + const unsigned char* __restrict__ p_condition, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape, + const unsigned char* __restrict__ p_condition, + const WORD32* const p_condition_shape); + +extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp, + const WORD32* const p_inp_shape, + const WORD32* __restrict__ p_axis, + WORD32 num_out_dims, + WORD32 num_inp_dims, + WORD32 num_axis_dims, + void* __restrict__ p_scratch_in); + namespace cadence { namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index cbbb279e5d..dbe5867550 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -22,19 +22,12 @@ endif() set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/copy_ops_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/index_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/kernel_ops_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" @@ -57,6 +50,7 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/matmul_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/reduce_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/repeat_util.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/slice_util.cpp" ) add_library(aten_ops_cadence ${_aten_ops__srcs}) target_link_libraries(aten_ops_cadence PUBLIC executorch) diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp new file mode 100644 index 0000000000..478e10da71 --- /dev/null +++ 
b/backends/cadence/hifi/operators/op_mean.cpp @@ -0,0 +1,170 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::ArrayRef; +using torch::executor::Error; +using torch::executor::optional; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +int prepare_data( + const Tensor& in, + Tensor& out, + optional> dim_list, + int* inp_shape, + int* out_shape, + int* p_axis, + int num_inp_dims, + int num_out_dims) { + for (int i = 0; i < num_inp_dims; i++) { + inp_shape[i] = in.size(i); + } + + for (int i = 0; i < num_out_dims; i++) { + out_shape[i] = out.size(i); + } + + int num_axis_dims = 0; + for (const auto& d : dim_list.value()) { + if (d < 0) { + p_axis[num_axis_dims] = num_inp_dims + d; + num_axis_dims++; + } else { + p_axis[num_axis_dims] = d; + num_axis_dims++; + } + } + + return num_axis_dims; +} + +Tensor& mean_dim_out( + RuntimeContext& ctx, + const Tensor& in, + optional> dim_list, + bool keepdim, + optional dtype, + Tensor& out) { + ET_KERNEL_CHECK( + ctx, + torch::executor::check_mean_dim_args(in, dim_list, keepdim, dtype, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_reduction_out(in, dim_list, keepdim, out) == + Error::Ok, + InvalidArgument, + out); + + constexpr auto name = "mean.out"; + constexpr int kNnlibMaxDim = 4; + + bool optimized = 1; + + if (out.scalar_type() != ScalarType::Float) + optimized = 0; + + if (in.dim() > kNnlibMaxDim) + optimized = 0; + + if (optimized) { + float* __restrict__ p_out = out.mutable_data_ptr(); + const float* __restrict__ p_inp = + (const float* __restrict__)in.const_data_ptr(); + + int num_elm = in.numel(); + + int num_inp_dims = in.dim(); + int num_out_dims = out.dim(); + + int inp_shape[kNnlibMaxDim]; + int out_shape[kNnlibMaxDim]; + int p_axis[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp_shape[i] = 1; + p_axis[i] = 1; + } + + int num_axis_dims = prepare_data( + in, + out, + dim_list, + inp_shape, + out_shape, + p_axis, + num_inp_dims, + num_out_dims); + + if (num_axis_dims == num_inp_dims) { + num_out_dims = 1; + out_shape[0] = 1; + } + + int scratch_size = xa_nn_reduce_getsize_nhwc( + -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1); + + void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size); + + xa_nn_reduce_mean_4D_f32_f32( + p_out, + out_shape, + p_inp, + inp_shape, + p_axis, + num_out_dims, + num_inp_dims, + num_axis_dims, + p_scratch_in); + + return out; + } + + ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { + CTYPE_OUT* out_data = out.mutable_data_ptr(); + const size_t num = torch::executor::get_reduced_dim_product(in, dim_list); + + for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { + CTYPE_OUT sum = 0; + if (in.numel() > 0) { + sum = torch::executor::map_reduce_over_dim_list( + [](CTYPE_IN v) { return static_cast(v); }, + [](CTYPE_OUT outv, CTYPE_OUT acc) { return acc + outv; }, + in, + dim_list, + out_ix); + } + out_data[out_ix] = sum / static_cast(num); + } + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace 
impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp new file mode 100644 index 0000000000..06bd0bc3c9 --- /dev/null +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -0,0 +1,176 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using torch::executor::Error; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +Tensor& where_out( + RuntimeContext& ctx, + const Tensor& cond, + const Tensor& a, + const Tensor& b, + Tensor& out) { + ScalarType cond_type = cond.scalar_type(); + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = executorch::runtime::promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + torch::executor::resize_to_broadcast_target_size(a, b, cond, out) == + Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + constexpr auto name = "where.self_out"; + + ET_CHECK_MSG( + cond_type == ScalarType::Bool || cond_type == ScalarType::Byte, + "Unhandled dtype %s for where.self_out", + torch::executor::toString(cond_type)); + + int a_dim = a.dim(), b_dim = b.dim(), con_dim = cond.dim(), + out_dim = out.dim(); + bool optimized = 1; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool cond_is_broadcasted = !out.sizes().equals(cond.sizes()); + const bool broadcast = + (a_is_broadcasted || b_is_broadcasted || cond_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = cond.dim() > max_dim ? cond.dim() : max_dim; + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = 0; + + if ((a_dim == 0) || (b_dim == 0) || (con_dim == 0)) + optimized = 0; + + if ((broadcast == 1) && (max_dim > kNnlibMaxDim)) + optimized = 0; + + if (optimized) { + const float* a_data = a.const_data_ptr(); + const float* b_data = b.const_data_ptr(); + float* out_data = out.mutable_data_ptr(); + const unsigned char* con = cond.const_data_ptr(); + + if (broadcast == 1) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + int con_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + con_shape[i] = 1; + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + int off_c = kNnlibMaxDim - cond.dim(); + + for (int i = 0; i < out.dim(); i++) + out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + for (int i = 0; i < cond.dim(); i++) + con_shape[i + off_c] = cond.size(i); + + if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] || + con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) { + void* p_scratch = + malloc(out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]); + const unsigned char* p_brd_cond = (const unsigned char*)p_scratch; + xa_nn_broadcast_8_8( + (WORD8* __restrict__)p_brd_cond, + out_shape, + (const WORD8* __restrict__)con, + con_shape, + 4); + + for (int i = 0; i < 4; i++) { + con_shape[i] = out_shape[i]; + } + xa_nn_elm_where_broadcast_4D_f32xf32_f32( + out_data, + out_shape, + a_data, + inp1_shape, + b_data, + inp2_shape, + p_brd_cond, + con_shape); + free(p_scratch); + } else { + xa_nn_elm_where_broadcast_4D_f32xf32_f32( + out_data, + out_shape, + a_data, + inp1_shape, + b_data, + inp2_shape, + con, + con_shape); + } + } else { + xa_nn_elm_where_f32xf32_f32(out_data, a_data, b_data, con, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_OUT = + typename torch::executor::promote_types::type; + torch::executor:: + apply_ternary_elementwise_fn( + [](const CTYPE_A val_a, + const CTYPE_B val_b, + const uint8_t val_c) { + CTYPE_OUT a_casted = static_cast(val_a); + CTYPE_OUT b_casted = static_cast(val_b); + return val_c ? a_casted : b_casted; + }, + a, + b, + cond, + out); + }); + }); + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c new file mode 100644 index 0000000000..6a7f6d0f77 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c @@ -0,0 +1,838 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. 
+* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + +******************************************************************************/ +#include "xa_type_def.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" +#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_where_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + const unsigned char *__restrict__ condition, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_where_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + const unsigned char *__restrict__ p_condition, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + unsigned char *condition = p_condition; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder 
Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + con1 = XT_L8UI(condition, 0); + xtbool s = AE_MOVBA(con1); + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + XT_MOVT_S(a, a1, s); + XT_MOVF_S(a, a2, s); + XT_SSI(a, (xtfloat *)out, 0); + } +} + +static void internal_elm_where_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + const unsigned char * __restrict__ p_condition, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + unsigned char *condition = p_condition; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + /* For out = condition ? inp2 :inp1 */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + x1 = XT_LSI((xtfloat *)p_a, 0); + + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if((((unsigned)p_c)&7) == 0) + { + for(i=0; i> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + /* For out = condition ? inp2 :inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + condition = &p_condition[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x2, con); + XT_MOVF_SX2 (y, x1, con); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x2, con); + XT_MOVF_SX2 (y, x1, con); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, 0); + XT_LSIP(b0, (xtfloat *)p_b, 0); + con1 = XT_L8UI(condition, 0); + xtbool s = AE_MOVBA(con1); + XT_MOVT_S(c0, b0, s); + XT_MOVF_S(c0, a0, s); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For out = condition ? 
inp1 :inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + condition = &p_condition[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, 0); + XT_LSIP(b0, (xtfloat *)p_b, 0); + con1 = XT_L8UI(condition, 0); + xtbool s = AE_MOVBA(con1); + XT_MOVT_S(c0, a0, s); + XT_MOVF_S(c0, b0, s); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_where_broadcast_both_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + const unsigned char * __restrict__ p_condition, + WORD32 out_lc, + WORD32 in_lc) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + unsigned char *condition = p_condition; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)p_inp1; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + condition = &p_condition[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + con1 = XT_L8UI(condition, 0); + condition++; + con2 = XT_L8UI(condition, 0); + condition++; + con = AE_MOVBA1X2(con1, con2); + XT_MOVT_SX2 (y, x1, con); + XT_MOVF_SX2 (y, x2, con); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, 0); + XT_LSIP(b0, (xtfloat *)p_b, 0); + con1 = XT_L8UI(condition, 0); + xtbool s = AE_MOVBA(con1); + XT_MOVT_S(c0, a0, s); + XT_MOVF_S(c0, b0, s); + XT_SSI(c0, (xtfloat *)p_c, 0); + 
} + } +} + +WORD32 xa_nn_elm_where_broadcast_4D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp1, + const WORD32 *const p_inp1_shape, + const FLOAT32 * __restrict__ p_inp2, + const WORD32 *const p_inp2_shape, + const unsigned char *__restrict__ p_condition, + const WORD32 *const p_condition_shape + ) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + XA_NNLIB_ARG_CHK_PTR(p_condition, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_condition_shape, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_condition, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_condition_shape, sizeof(WORD32), -1); + + /* Check shapes */ + int i; + xtbool sign_flag; + for(i = 0; i < 4; i++) + { + if((p_inp1_shape[i] != p_inp2_shape[i]) && ((p_inp1_shape[i] != 1) && (p_inp2_shape[i] != 1))) + { + return -1; + } + } + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] == 1) + { + inp1_strides[i] = 0; + need_broadcast = 1; + } + else + { + inp1_const &= 0; + } + if(p_inp2_shape[i] == 1) + { + inp2_strides[i] = 0; + need_broadcast = 1; + } + else + { + inp2_const &= 0; + } + } + + int itr0, itr1, itr2; + FLOAT32 *p_out_tmp = p_out; + const unsigned char *__restrict p_condition_temp = p_condition; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_where_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + p_condition, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if((inp1_strides[3] == 1)&& (inp2_strides[3] == 1)) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if((inp1_strides[2] == 0) && (inp2_strides[2] == 0)) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_where_broadcast_both_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + p_condition_temp, + out_lc, + in_lc); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + p_condition_temp += in_lc * out_lc; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else + { + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + 
sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_where_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + p_condition_temp, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + p_condition_temp += in_lc * out_lc; + } + + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + if((inp1_const == 1)&&(inp2_const == 1)) + { + internal_elm_where_broadcast_both_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_condition_temp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3]); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_where_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_condition_temp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + } + else + { + sign_flag = 0; + if((inp1_strides[3] == 0) && (inp2_strides[3] == 0)) + { + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_where_broadcast_both_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_condition_temp, + p_out_shape[3]); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + p_condition_temp += p_out_shape[3]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else + { + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_where_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_condition_temp, + p_out_shape[3], + sign_flag); + } + 
p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + p_condition_temp += p_out_shape[3]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + } + return 0; +} + +#endif \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c new file mode 100644 index 0000000000..5978a92d26 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c @@ -0,0 +1,647 @@ +#include "xa_nnlib_common.h" +#include +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_common_macros.h" + +#define ALIGNMENT_8 8 + +#define ALIGN_PTR(x, bytes) ((((unsigned)(x))+(bytes-1))&(~(bytes-1))) + +static void vecmean16_inpx3(const xtfloatx2 *p_src1, const xtfloat* p_src2, const xtfloat* p_src3, xtfloatx2 *p_dst, int N){ + int i = 0; + ae_valign align_src1, align_dst; + ae_valign align_src2, align_src3; + align_src1 = AE_LA64_PP(p_src1); + align_src2 = AE_LA64_PP(p_src2); + align_src3 = AE_LA64_PP(p_src3); + align_dst = AE_ZALIGN64(); + + for(i=0; i < (N >> 2); i++) + { + xtfloatx2 j1_h, j1_l, j2_h, j2_l; + + xtfloatx2 wout1, wout2; + XT_LASX2IP(wout1, align_src1, p_src1); + XT_LASX2IP(wout2, align_src1, p_src1); + + XT_LASX2IP(j1_h, align_src2, (xtfloatx2 *)p_src2); + XT_LASX2IP(j1_l, align_src2, (xtfloatx2 *)p_src2); + XT_LASX2IP(j2_h, align_src3, (xtfloatx2 *)p_src3); + XT_LASX2IP(j2_l, align_src3, (xtfloatx2 *)p_src3); + + j1_h = XT_ADD_SX2(j1_h, j2_h); + j1_l = XT_ADD_SX2(j1_l, j2_l); + wout1 = XT_ADD_SX2(wout1, j1_h); + wout2 = XT_ADD_SX2(wout2, j1_l); + + XT_SASX2IP(wout1, align_dst, p_dst); + XT_SASX2IP(wout2, align_dst, p_dst); + } + AE_SA64POS_FP(align_dst, p_dst); // finalize the stream + + //Remainder Loop + for(i=0; i < (N & 3); i++) + { + xtfloat j1, j2; + xtfloat wout1; + XT_LSXP(wout1, (xtfloat *)p_src1, sizeof(xtfloat)); + j1 = (xtfloat) *(p_src2 + i); + j2 = (xtfloat) *(p_src3 + i); + + j1 = XT_ADD_S(j1, j2); + wout1 = XT_ADD_S(wout1, j1); + XT_SSXP(wout1, (xtfloat *)p_dst, sizeof(xtfloat)); + } +} + +static void vecmean16_inpx2(const xtfloatx2 *p_src1, const xtfloat* p_src2, xtfloatx2 *p_dst, int N){ + ae_valign align_src1, align_dst; + ae_valign align_src2; + align_src1 = AE_LA64_PP(p_src1); + align_src2 = AE_LA64_PP(p_src2); + align_dst = AE_ZALIGN64(); + + int i = 0; + for(i=0; i < (N >> 2); i++) + { + xtfloatx2 j1, j2; + xtfloatx2 wout1, wout2; + XT_LASX2IP(wout1, align_src1, p_src1); + XT_LASX2IP(wout2, align_src1, p_src1); + + XT_LASX2IP(j1, align_src2, (xtfloatx2 *)p_src2); + XT_LASX2IP(j2, align_src2, (xtfloatx2 *)p_src2); + + wout1 = XT_ADD_SX2(wout1, j1); + wout2 = XT_ADD_SX2(wout2, j2); + + XT_SASX2IP(wout1, align_dst, p_dst); + XT_SASX2IP(wout2, align_dst, p_dst); + } + AE_SA64POS_FP(align_dst, p_dst); // finalize the stream + + //Remainder Loop + for(i=0; i < (N & 3); i++) + { + xtfloat j1; + xtfloat wout1; + XT_LSXP(wout1, (xtfloat *)p_src1, sizeof(xtfloat)); + j1 = (xtfloat) *(p_src2 + i); + wout1 = XT_ADD_S(wout1, j1); + XT_SSXP(wout1, (xtfloat *)p_dst, sizeof(xtfloat)); + } +} + +static void vecmean32_inpx3(const xtfloatx2* p_src1, const xtfloatx2* p_wsrc2, const xtfloatx2* p_wsrc3, xtfloatx2 *p_dst, int N){ + ae_valign align_src1, align_src2, align_src3, align_dst; + align_src1 = AE_LA64_PP(p_src1); + align_src2 = AE_LA64_PP(p_wsrc2); + align_src3 = AE_LA64_PP(p_wsrc3); + align_dst = AE_ZALIGN64(); + + int i = 0; + 
for(i=0; i < (N >> 2); i++) + { + xtfloatx2 j1, j2, j3, j4; + xtfloatx2 wj1, wj2; + xtfloatx2 wout1, wout2; + XT_LASX2IP(wout1, align_src1, p_src1); + XT_LASX2IP(wout2, align_src1, p_src1); + XT_LASX2IP(j1, align_src2, p_wsrc2); + XT_LASX2IP(j2, align_src3, p_wsrc3); + XT_LASX2IP(j3, align_src2, p_wsrc2); + XT_LASX2IP(j4, align_src3, p_wsrc3); + + wj1 = XT_ADD_SX2(j1, j2); + wj2 = XT_ADD_SX2(j3, j4); + wout1 = XT_ADD_SX2(wout1, wj1); + wout2 = XT_ADD_SX2(wout2, wj2); + XT_SASX2IP(wout1, align_dst, p_dst); + XT_SASX2IP(wout2, align_dst, p_dst); + } + AE_SA64POS_FP(align_dst, p_dst); // finalize the stream + + //Remainder Loop + for(i=0; i < (N & 3); i++) + { + xtfloat j1, j2; + xtfloat wj1; + xtfloat wout1; + XT_LSXP(wout1, (xtfloat *)p_src1, 4); + XT_LSXP(j1, (xtfloat *)p_wsrc2, 4); + XT_LSXP(j2, (xtfloat *)p_wsrc3, 4); + wj1 = XT_ADD_S(j1, j2); + wout1 = XT_ADD_S(wout1, wj1); + XT_SSXP(wout1, (xtfloat *)p_dst, sizeof(xtfloat)); + } +} + +static void vecmean32_inpx2(const xtfloatx2* p_src1, const xtfloatx2* p_wsrc2, xtfloatx2 *p_dst, int N){ + ae_valign align_src1, align_src2, align_dst; + align_src1 = AE_LA64_PP(p_src1); + align_src2 = AE_LA64_PP(p_wsrc2); + align_dst = AE_ZALIGN64(); + + int i = 0; + for(i=0; i < (N >> 2); i++) + { + xtfloatx2 j1, j2; + xtfloatx2 wout1, wout2; + XT_LASX2IP(wout1, align_src1, p_src1); + XT_LASX2IP(wout2, align_src1, p_src1); + XT_LASX2IP(j1, align_src2, p_wsrc2); + XT_LASX2IP(j2, align_src2, p_wsrc2); + wout1 = XT_ADD_SX2(wout1, j1); + wout2 = XT_ADD_SX2(wout2, j2); + XT_SASX2IP(wout1, align_dst, p_dst); + XT_SASX2IP(wout2, align_dst, p_dst); + } + AE_SA64POS_FP(align_dst, p_dst); // finalize the stream + + //Remainder Loop + for(i=0; i < (N & 3); i++) + { + xtfloat j1; + xtfloat wout1; + XT_LSXP(wout1, (xtfloat *)p_src1, 4); + XT_LSXP(j1, (xtfloat *)p_wsrc2, 4); + wout1 = XT_ADD_S(wout1, j1); + XT_SSXP(wout1, (xtfloat *)p_dst, sizeof(WORD32)); + } +} + +static inline void xa_nn_reduce_sum_4D_f32_f32(const FLOAT32 * __restrict__ p_inp + ,const WORD32 *const p_4D_inp_shape + ,const WORD32 * __restrict__ p_axis_data + ,WORD32 num_inp_dims + ,WORD32 num_axis_dims + ,pVOID p_scratch_in) +{ + xtfloat *p_in = (xtfloat *)(p_inp); + xtfloat *p_scratch = (xtfloat *)(p_scratch_in); + + int temp_inp_n = p_4D_inp_shape[0]; + int temp_inp_h = p_4D_inp_shape[1]; + int temp_inp_w = p_4D_inp_shape[2]; + int temp_inp_c = p_4D_inp_shape[3]; + + int itr_axis = 0, itr_n = 0, itr_h = 0, itr_w = 0, itr_c = 0; + xtfloat *p_src2, *p_src3; + xtfloatx2 *p_src1; + xtfloatx2 * p_dst; + ae_valign align_src2; + + int axis_dims_count = num_axis_dims; + if(axis_dims_count) + { + switch(p_axis_data[itr_axis]) + { + case 0: { + int plane_size = temp_inp_h * temp_inp_w * temp_inp_c; + for(itr_n=0; itr_n < (temp_inp_n & ~(2 - 1)); itr_n += 2) + { + p_src1 = (xtfloatx2 *)p_scratch; + p_src2 = p_in + itr_n * plane_size; + p_src3 = p_in + (itr_n + 1) * plane_size; + p_dst = (xtfloatx2 *)p_scratch; + vecmean16_inpx3(p_src1, p_src2, p_src3, p_dst, plane_size); + } + + if(temp_inp_n & 1) + { + p_src1 = (xtfloatx2 *)p_scratch; + p_src2 = (p_in + itr_n * plane_size); + p_dst = (xtfloatx2 *)p_scratch; + vecmean16_inpx2(p_src1, p_src2, p_dst, plane_size); + } + temp_inp_n = 1; + }break; + case 1: { + int plane_size = temp_inp_h * temp_inp_w * temp_inp_c; + int wc_plane_size = temp_inp_w * temp_inp_c; + for(itr_n=0; itr_n < (temp_inp_n); itr_n++) + { + p_src1 = (xtfloatx2 *)(p_scratch + (itr_n * wc_plane_size)); + for(itr_h=0; itr_h < (temp_inp_h & ~(2 - 1)); itr_h += 2) + { + p_src2 = p_in + (itr_n * 
plane_size) + (itr_h * wc_plane_size); + p_src3 = p_in + (itr_n * plane_size) + ((itr_h + 1) * wc_plane_size); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * wc_plane_size)); + vecmean16_inpx3(p_src1, p_src2, p_src3, p_dst, wc_plane_size); + p_src1 = (xtfloatx2 *)(p_scratch + (itr_n * wc_plane_size)); + } + + if(temp_inp_h & 1) + { + p_src2 = p_in + (itr_n * plane_size) + (itr_h * wc_plane_size); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * wc_plane_size)); + vecmean16_inpx2(p_src1, p_src2, p_dst, wc_plane_size); + } + } + temp_inp_h = 1; + }break; + case 2:{ + int plane_size = temp_inp_h * temp_inp_w * temp_inp_c; + int wc_plane_size = temp_inp_w * temp_inp_c; + int hc_plane_size = temp_inp_h * temp_inp_c; + + for(itr_n=0; itr_n < (temp_inp_n); itr_n++) + { + for(itr_h=0; itr_h < (temp_inp_h); itr_h++) + { + p_src1 = (xtfloatx2 *)(p_scratch + (((itr_n * hc_plane_size) + itr_h * temp_inp_c))); + for(itr_w=0; itr_w < (temp_inp_w & ~(2 - 1)); itr_w += 2) + { + p_src2 = p_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + (itr_w * temp_inp_c); + p_src3 = p_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + ((itr_w + 1) * temp_inp_c); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * hc_plane_size) + itr_h * temp_inp_c); + vecmean16_inpx3(p_src1, p_src2, p_src3, p_dst, temp_inp_c); + p_src1 = (xtfloatx2 *)(p_scratch + (itr_n * hc_plane_size) + (itr_h * temp_inp_c)); + } + + if(temp_inp_w & 1) + { + p_src2 = p_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + (itr_w * temp_inp_c); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * hc_plane_size) + itr_h * temp_inp_c); + vecmean16_inpx2(p_src1, p_src2, p_dst, temp_inp_c); + } + } + } + temp_inp_w = 1; + }break; + case 3: { + int plane_size = temp_inp_h * temp_inp_w * temp_inp_c; + int wc_plane_size = temp_inp_w * temp_inp_c; + int hw_plane_size = temp_inp_h * temp_inp_w; + int rem_c = (temp_inp_c & 7); + + for(itr_n=0; itr_n < (temp_inp_n); itr_n++) + { + for(itr_h=0; itr_h < (temp_inp_h); itr_h++) + { + for(itr_w=0; itr_w < (temp_inp_w); itr_w++) + { + p_src1 = (xtfloatx2 *)(p_scratch + (((itr_n * hw_plane_size) + (itr_h * temp_inp_w) + itr_w))); + p_src2 = p_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + (itr_w * temp_inp_c); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * hw_plane_size) + (itr_h * temp_inp_w) + itr_w); + align_src2 = AE_LA64_PP(p_src2); + + for(itr_c=0; itr_c < (temp_inp_c >> 3); itr_c++) + { + xtfloatx2 j11, j12, j21, j22, i1; + i1 = XT_LSX((xtfloat *)p_src1, 0); + XT_LASX2IP(j11, align_src2, (xtfloatx2 *)p_src2); + XT_LASX2IP(j12, align_src2, (xtfloatx2 *)p_src2); + XT_LASX2IP(j21, align_src2, (xtfloatx2 *)p_src2); + XT_LASX2IP(j22, align_src2, (xtfloatx2 *)p_src2); + + j11 = XT_ADD_SX2(j11, j12); + j21 = XT_ADD_SX2(j21, j22); + + xtfloatx2 t1 = XT_SEL32_HH_SX2(j11, j11); + xtfloatx2 t2 = XT_SEL32_HH_SX2(j21, j21); + + j11 = XT_ADD_SX2(j11, t1); + j21 = XT_ADD_SX2(j21, t2); + + j11 = XT_ADD_SX2(j11, j21); + i1 = XT_ADD_SX2(i1, j11); + + XT_SSX(i1, (xtfloat *)p_dst, 0); + + p_src1 = p_dst; + } + //Remainder Loop + for(itr_c=0; itr_c < rem_c ; itr_c++) + { + xtfloat j1; + xtfloat i1; + i1 = XT_LSX((xtfloat *)p_src1, 0); + j1 = *p_src2++; + + i1 = XT_ADD_S(i1, j1); + XT_SSX(i1, (xtfloat *)p_dst, 0); + } + } + } + } + temp_inp_c = 1; + }break; + default: + break; + } + + axis_dims_count--; + itr_axis++; + } + + while(axis_dims_count) + { + ae_valign align_src; + xtfloat *p_scr_in = p_scratch; + xtfloatx2 *p_wsrc2, *p_wsrc3; + switch(p_axis_data[itr_axis]) + { + case 0: { + int plane_size = temp_inp_h * temp_inp_w * temp_inp_c; 
+ for(itr_n=1; itr_n < ((temp_inp_n -1) & ~(2 - 1)); itr_n += 2) + { + p_src1 = (xtfloatx2 *)p_scratch; + p_wsrc2 = (xtfloatx2 *)(p_scr_in + itr_n * plane_size); + p_wsrc3 = (xtfloatx2 *)(p_scr_in + (itr_n + 1) * plane_size); + p_dst = (xtfloatx2 *)p_scratch; + vecmean32_inpx3(p_src1, p_wsrc2, p_wsrc3, p_dst, plane_size); + } + + if((temp_inp_n - 1) & 1) + { + p_src1 = (xtfloatx2 *)p_scratch; + p_wsrc2 = (xtfloatx2 *)(p_scr_in + itr_n * plane_size); + p_dst = (xtfloatx2 *)p_scratch; + vecmean32_inpx2(p_src1, p_wsrc2, p_dst, plane_size); + } + temp_inp_n = 1; + }break; + case 1: { + int plane_size = temp_inp_h * temp_inp_w * temp_inp_c; + int wc_plane_size = temp_inp_w * temp_inp_c; + for(itr_n=0; itr_n < (temp_inp_n); itr_n++) + { + p_src1 = (xtfloatx2 *)(p_scratch + + (itr_n * plane_size)); + for(itr_h = 1; itr_h < ((temp_inp_h - 1) & ~(2 - 1)); itr_h += 2) + { + p_wsrc2 = (xtfloatx2 *)(p_scr_in + (itr_n * plane_size) + (itr_h * wc_plane_size)); + p_wsrc3 = (xtfloatx2 *)(p_scr_in + (itr_n * plane_size) + ((itr_h + 1) * wc_plane_size)); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * wc_plane_size)); + vecmean32_inpx3(p_src1, p_wsrc2, p_wsrc3, p_dst, wc_plane_size); + p_src1 = (xtfloatx2 *)(p_scratch + (itr_n * wc_plane_size)); + } + + if((temp_inp_h - 1) & 1) + { + p_wsrc2 = (xtfloatx2 *)(p_scr_in + (itr_n * plane_size) + (itr_h * wc_plane_size)); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * wc_plane_size)); + vecmean32_inpx2(p_src1, p_wsrc2, p_dst, plane_size); + } + } + temp_inp_h = 1; + }break; + case 2:{ + int plane_size = temp_inp_h * temp_inp_w * temp_inp_c; + int wc_plane_size = temp_inp_w * temp_inp_c; + int hc_plane_size = temp_inp_h * temp_inp_c; + for(itr_n=0; itr_n < (temp_inp_n); itr_n++) + { + for(itr_h=0; itr_h < (temp_inp_h); itr_h++) + { + p_src1 = (xtfloatx2 *)(p_scratch + ((itr_n * plane_size) + (itr_h * wc_plane_size))); + for(itr_w = 1; itr_w < ((temp_inp_w - 1) & ~(2 - 1)); itr_w += 2) + { + p_wsrc2 = (xtfloatx2 *)(p_scr_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + (itr_w * temp_inp_c)); + p_wsrc3 = (xtfloatx2 *)(p_scr_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + ((itr_w + 1) * temp_inp_c)); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * hc_plane_size) + itr_h * temp_inp_c); + vecmean32_inpx3(p_src1, p_wsrc2, p_wsrc3, p_dst, temp_inp_c); + p_src1 = (xtfloatx2 *)(p_scratch + (itr_n * hc_plane_size) + (itr_h * temp_inp_c)); + } + + if((temp_inp_w - 1) & 1) + { + p_wsrc2 = (xtfloatx2 *)(p_scr_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + (itr_w * temp_inp_c)); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * hc_plane_size) + itr_h * temp_inp_c); + vecmean32_inpx2(p_src1, p_wsrc2, p_dst, temp_inp_c); + } + } + } + temp_inp_w = 1; + }break; + case 3: { + int plane_size = temp_inp_h * temp_inp_w * temp_inp_c; + int wc_plane_size = temp_inp_w * temp_inp_c; + int hw_plane_size = temp_inp_h * temp_inp_w; + int rem_c = ((temp_inp_c) & 3); + for(itr_n=0; itr_n < (temp_inp_n); itr_n++) + { + for(itr_h=0; itr_h < (temp_inp_h); itr_h++) + { + for(itr_w=0; itr_w < (temp_inp_w); itr_w++) + { + p_wsrc2 = (xtfloatx2 *)(p_scr_in + (itr_n * plane_size) + (itr_h * wc_plane_size) + (itr_w * temp_inp_c)); + p_dst = (xtfloatx2 *)(p_scratch + (itr_n * hw_plane_size) + (itr_h * temp_inp_w) + itr_w); + align_src = AE_LA64_PP(p_wsrc2); + xtfloatx2 i1 = AE_MOVXTFLOATX2_FROMF32X2(AE_MOVDA32(0)); + for(itr_c = 0; itr_c < (temp_inp_c >> 2); itr_c++) + { + xtfloatx2 j1, j2; + XT_LASX2IP(j1, align_src, p_wsrc2); + XT_LASX2IP(j2, align_src, p_wsrc2); + + xtfloatx2 t1 = 
XT_SEL32_HH_SX2(j1, j1); + xtfloatx2 t2 = XT_SEL32_HH_SX2(j2, j2); + + j1 = XT_ADD_SX2(t1, j1); + j2 = XT_ADD_SX2(t2, j2); + + i1 = XT_ADD_SX2(i1, j1); + i1 = XT_ADD_SX2(i1, j2); + } + + //Remainder Loop + for(itr_c=0; itr_c < rem_c; itr_c++) + { + xtfloat j1; + XT_LSXP(j1, (xtfloat *)p_wsrc2, sizeof(xtfloat)); + i1 = XT_ADD_S(i1, j1); + } + XT_SSX(i1, (xtfloat *)p_dst, 0); + } + } + } + temp_inp_c = 1; + }break; + default: + break; + } + axis_dims_count--; + itr_axis++; + } +} + +WORD32 xa_nn_reduce_mean_4D_f32_f32( + FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp, + const WORD32 *const p_inp_shape, + const WORD32 * __restrict__ p_axis, + WORD32 num_out_dims, + WORD32 num_inp_dims, + WORD32 num_axis_dims, + void * __restrict__ p_scratch_in) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp, -1); + XA_NNLIB_ARG_CHK_PTR(p_axis, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1); + + /* Invalid input checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 4)), -1); + XA_NNLIB_ARG_CHK_COND(((num_out_dims <= 0) || (num_out_dims > 4)), -1); + XA_NNLIB_ARG_CHK_COND(((num_axis_dims < 0) || (num_axis_dims > 4)), -1); + + int axis_itr = 0, inp_itr = 0, out_itr = 0; + int num_elm_in_axis = 1; + int current, past = -1; + for(axis_itr=0; axis_itr < num_axis_dims; axis_itr++) + { + current = p_axis[axis_itr]; + XA_NNLIB_ARG_CHK_COND(((current < 0) || (current > (num_inp_dims - 1))), -1); + XA_NNLIB_ARG_CHK_COND((p_inp_shape[current] > 1024), -1); + + /* Avoid calculation in case of repeated axis dims*/ + if(current != past) + { + num_elm_in_axis *= p_inp_shape[current]; + past = current; + } + } + + for(inp_itr=0; inp_itr < num_inp_dims; inp_itr++) + { + XA_NNLIB_ARG_CHK_COND((p_inp_shape[inp_itr] <= 0), -1); + } + + int out_length = 1; + for(out_itr=0; out_itr < num_out_dims; out_itr++) + { + XA_NNLIB_ARG_CHK_COND((p_out_shape[out_itr] <= 0), -1); + out_length *= p_out_shape[out_itr]; + } + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_axis, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1); + + FLOAT32 *p_in = (FLOAT32 *)(p_inp); + WORD32 *p_scratch = (WORD32 *)(ALIGN_PTR(p_scratch_in, ALIGNMENT_8)); + + // Changing order of axis data so that reduce max will be first computed + // across largest inp shape dim in axis. This is required to + // minimize the scratch usage. + int inp_length = 1, p_axis_data[4] = {0}, inp_shape_max; + if(num_axis_dims) + { + inp_shape_max = p_inp_shape[p_axis[0]]; + axis_itr = 1; + int max_axis_itr = 0; + int temp_p_axis_0 = p_axis[0]; + for(axis_itr = 0; axis_itr < num_axis_dims; axis_itr++) + { + p_axis_data[axis_itr] = p_axis[axis_itr]; + } + for(axis_itr = 1; axis_itr < num_axis_dims; axis_itr++) + { + if(p_inp_shape[p_axis[axis_itr]] > inp_shape_max) + { + inp_shape_max = p_inp_shape[p_axis[axis_itr]]; + max_axis_itr = axis_itr; + } + } + p_axis_data[0] = p_axis_data[max_axis_itr]; + p_axis_data[max_axis_itr] = temp_p_axis_0; + + inp_itr = 0; + for(inp_itr=0; inp_itr < num_inp_dims; inp_itr++) + { + inp_length *= p_inp_shape[inp_itr]; + } + + memset(p_scratch, 0, ((inp_length / inp_shape_max) * sizeof(WORD32))); //TODO: Alternate approach for memset? + } + + // Promoting lesser dim tensors to 4D tensors. 
Also modifying axis + // data accordingly. + int p_4D_inp_shape[4] = {1, 1, 1, 1}; + int itr = num_inp_dims - 1; + int count = 3; + while(itr >= 0) + { + p_4D_inp_shape[count] = p_inp_shape[itr]; + itr--; + count--; + } + for(itr = 0; itr < num_axis_dims; itr++) + { + p_axis_data[itr] = p_axis_data[itr] + (4 - num_inp_dims); + } + ae_valign align_out = AE_ZALIGN64(); + + if(num_axis_dims) + { + if(num_elm_in_axis > 1) + { + xa_nn_reduce_sum_4D_f32_f32(p_in, + p_4D_inp_shape, + p_axis_data, + num_inp_dims, + num_axis_dims, + p_scratch); + itr = 0; + xtfloatx2 *p_src1 = (xtfloatx2 *)(p_scratch); + + float div = 1; + + for(int i = 0; i < num_axis_dims; i++) + { + div = div * (float)p_4D_inp_shape[p_axis_data[i]]; + } + + float mul = 1 / div; + + xtfloatx2 multiplier = XT_LSX((xtfloat *)&mul, 0); + + for(itr = 0; itr < (out_length >> 3); itr++) + { + xtfloatx2 temp1, temp2, temp3, temp4; + + temp2 = XT_LSX2X(p_src1, 8); + temp3 = XT_LSX2X(p_src1, 16); + temp4 = XT_LSX2X(p_src1, 24); + XT_LSX2XP(temp1, p_src1, 32); + + temp1 = XT_MUL_SX2(temp1, multiplier); + temp2 = XT_MUL_SX2(temp2, multiplier); + temp3 = XT_MUL_SX2(temp3, multiplier); + temp4 = XT_MUL_SX2(temp4, multiplier); + + XT_SASX2IP(temp1, align_out, (xtfloatx2 *)p_out); + XT_SASX2IP(temp2, align_out, (xtfloatx2 *)p_out); + XT_SASX2IP(temp3, align_out, (xtfloatx2 *)p_out); + XT_SASX2IP(temp4, align_out, (xtfloatx2 *)p_out); + } + AE_SA64POS_FP(align_out, p_out); + + for(itr = 0; itr < (out_length & 7); itr++) + { + xtfloat temp1; + XT_LSXP(temp1, (xtfloat *)p_src1, 4); + temp1 = XT_MUL_S(temp1, multiplier); + XT_SSXP(temp1, (xtfloat *)p_out, 4); + } + } + else + { + + memcpy(p_out, p_inp, inp_length * sizeof(FLOAT32)); + } + } + else + { + memcpy(p_out, p_inp, inp_length * sizeof(FLOAT32)); + } + + return 0; +} From 9b71aeda0388c73c4c607911c0a7c581f107dc17 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Wed, 6 Nov 2024 09:21:43 -0800 Subject: [PATCH 02/18] Adding quantized linear optimized versions for int8 and uint8 --- .../hifi/operators/quantized_linear_out.cpp | 90 ++++++++++++------- 1 file changed, 57 insertions(+), 33 deletions(-) diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 8944a24ddb..b2eb680f8d 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -21,50 +21,74 @@ using executorch::runtime::getLeadingDims; using executorch::runtime::KernelRuntimeContext; void quantized_linear_out( - KernelRuntimeContext& ctx, + __ET_UNUSED KernelRuntimeContext& ctx, const Tensor& src, const Tensor& weight, const Tensor& bias, int64_t src_zero_point, - const Tensor& weight_zero_point, + const Tensor& weight_zero_point_t, const Tensor& out_multiplier, const Tensor& out_shift, int64_t out_zero_point, - const executorch::aten::optional& offset, + __ET_UNUSED const executorch::aten::optional& offset, Tensor& out) { - // input comes in shape [leading_dims, in_dim] - // weight comes in shape [out_dim, in_dim] - // output comes in empty with shape [leading_dims, out_dim] - // Perform matrix multiply (M x N) x (N x P)' => M x P int64_t leading_dims = getLeadingDims(src, src.dim() - 1); - int64_t out_dim = weight.size(0); // = out_dim - int64_t in_dim = weight.size(1); // = in_dim + int64_t out_dim = weight.size(0); + int64_t in_dim = weight.size(1); - const uint8_t* __restrict__ in_data = src.const_data_ptr(); - const uint8_t* __restrict__ weight_data = weight.const_data_ptr(); - 
const int32_t* __restrict__ bias_data = bias.const_data_ptr(); - uint8_t* __restrict__ out_data = out.mutable_data_ptr(); + if (out.scalar_type() == executorch::aten::ScalarType::Byte) { + const uint8_t* __restrict__ in_data = src.const_data_ptr(); + const uint8_t* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + uint8_t* __restrict__ out_data = out.mutable_data_ptr(); - // The nnlib kernel to compute quantized linear via matmul. - int32_t ret = cadence::impl::HiFi::kernels::matmul_asym8uxasym8u_asym8u( - out_data, // p_out - weight_data, // p_mat1, - in_data, // p_mat2, - bias_data, // p_bias - out_dim, // rows of p_mat1 - in_dim, // cols of p_mat1 - in_dim, // row_stride of p_mat1 - leading_dims, // vec_count, i.e., rows of p_mat2 - in_dim, // vec_offset of p_mat2. - out_dim, // out_offset, i.e., offset of next output element written - 1, // out_stride, i.e., stride to go to next output row - -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias - -src_zero_point, // mat2_zero_bias - out_multiplier.const_data_ptr(), // out_multiplier - out_shift.const_data_ptr(), // out_shift - out_zero_point, // out_zero_bias - false); // per channel quantization - ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); + // The nnlib kernel to compute quantized linear via matmul. + xa_nn_matmul_asym8uxasym8u_asym8u( + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, + -weight_zero_point_t.const_data_ptr()[0], + -src_zero_point, + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); + } else if (out.scalar_type() == executorch::aten::ScalarType::Char) { + const int8_t* __restrict__ in_data = src.const_data_ptr(); + const int8_t* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + int8_t* __restrict__ out_data = out.mutable_data_ptr(); + + xa_nn_matmul_asym8sxasym8s_asym8s( + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, + -weight_zero_point_t.const_data_ptr()[0], + -src_zero_point, + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(src.scalar_type())); + } } }; // namespace native From 07743ab46071d1ca4b8546c343ce74407aeedae8 Mon Sep 17 00:00:00 2001 From: nishpoonia <94543206+nishpoonia@users.noreply.github.com> Date: Thu, 7 Nov 2024 20:51:49 +0530 Subject: [PATCH 03/18] adding pow, remainder, minimum, maximum operators (#33) * adding pow, remainder, minimum, maximum operators * adding pow, remainder, minimum, maximum operators --- backends/cadence/aot/functions_hifi.yaml | 32 +- backends/cadence/hifi/kernels/CMakeLists.txt | 3 + backends/cadence/hifi/kernels/kernels.h | 42 + .../cadence/hifi/operators/CMakeLists.txt | 4 + .../cadence/hifi/operators/op_maximum.cpp | 172 +++ .../cadence/hifi/operators/op_minimum.cpp | 171 +++ backends/cadence/hifi/operators/op_pow.cpp | 351 +++++ backends/cadence/hifi/operators/op_rsqrt.cpp | 53 + .../third-party/nnlib/xa_nn_broadcast_32.c | 313 +++++ .../third-party/nnlib/xa_nn_broadcast_32_32.c | 313 +++++ .../nnlib/xa_nn_elm_minimum_maximum_f32.c | 847 ++++++++++++ .../third-party/nnlib/xa_nn_elm_pow_f32.c | 1151 +++++++++++++++++ 12 files changed, 3451 insertions(+), 1 deletion(-) create mode 100644 
backends/cadence/hifi/operators/op_maximum.cpp create mode 100644 backends/cadence/hifi/operators/op_minimum.cpp create mode 100644 backends/cadence/hifi/operators/op_pow.cpp create mode 100644 backends/cadence/hifi/operators/op_rsqrt.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 8cf0c2de0a..bd1102ab0b 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -62,11 +62,21 @@ - arg_meta: null kernel_name: torch::executor::full_out +- op: maximum.out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::maximum_out + - op: mean.out kernels: - arg_meta: null kernel_name: cadence::impl::HiFi::mean_dim_out - + +- op: minimum.out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::minimum_out + - op: mul.out kernels: - arg_meta: null @@ -77,6 +87,26 @@ - arg_meta: null kernel_name: torch::executor::permute_copy_out +- op: pow.Scalar_out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::pow_Scalar_out + +- op: pow.Tensor_Scalar_out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::pow_Tensor_Scalar_out + +- op: pow.Tensor_Tensor_out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::pow_Tensor_Tensor_out + +- op: rsqrt.out + kernels: + - arg_meta: null + kernel_name: impl::HiFi::rsqrt_out + - op: sigmoid.out kernels: - arg_meta: null diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 9321cc544e..3d321443f8 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -9,10 +9,13 @@ add_library( cadence_kernels kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c ) diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 2087c9761b..7233fe6c29 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -15,6 +15,14 @@ #include "xa_nnlib_kernels_api.h" /* Potential NNLIB function/APIs */ + +extern "C" WORD32 xa_nn_broadcast_32_32( + WORD32* __restrict__ p_out, + const int* const out_shape, + WORD32* __restrict__ p_in, + const int* const in_shape, + int num_dims); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -47,6 +55,34 @@ extern "C" WORD32 
xa_nn_elm_div_mode_broadcast_4D_f32xf32_f32( const WORD32* const p_inp2_shape, WORD32 mode); +extern "C" WORD32 xa_nn_elm_maximum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + +extern "C" WORD32 xa_nn_elm_minimum_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -55,6 +91,12 @@ extern "C" WORD32 xa_nn_elm_mul_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); +extern "C" void xa_nn_elm_pow_f32( + FLOAT32* restrict z, + const FLOAT32* restrict x, + const FLOAT32* restrict y, + WORD32 N); + extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp1, diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index dbe5867550..6d21c4b49a 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -22,8 +22,12 @@ endif() set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp new file mode 100644 index 0000000000..97578765cf --- /dev/null +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -0,0 +1,172 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::max_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MaximumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& maximum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_maximum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "maximum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "maximum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "maximum.out", CTYPE_OUT, [&]() { + MaximumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp new file mode 100644 index 0000000000..fd9cfe4f95 --- /dev/null +++ b/backends/cadence/hifi/operators/op_minimum.cpp @@ -0,0 +1,171 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = + torch::executor::native::utils::min_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct MinimumInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& minimum_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool optimized = true; + /*find broadcast*/ + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + float* a_data = a.mutable_data_ptr(); + float* b_data = b.mutable_data_ptr(); + float* out_data = out.mutable_data_ptr(); + + if (broadcast == true) { + int out_shape[kNnlibMaxDim]; + int inp1_shape[kNnlibMaxDim]; + int inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp1_shape[i] = 1; + inp2_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) { + out_shape[i + off_o] = out.size(i); + } + + for (int i = 0; i < a.dim(); i++) + inp1_shape[i + off_a] = a.size(i); + + for (int i = 0; i < b.dim(); i++) + inp2_shape[i + off_b] = b.size(i); + + xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + out_data, out_shape, a_data, inp1_shape, b_data, inp2_shape); + } else { + xa_nn_elm_minimum_f32xf32_f32(out_data, a_data, b_data, out.numel()); + } + return out; + } + ET_SWITCH_REALHB_TYPES(a_type, ctx, "minimum.out", CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, "minimum.out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALHB_TYPES(out_type, ctx, "minimum.out", CTYPE_OUT, [&]() { + MinimumInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp new file mode 100644 index 0000000000..04533b290b --- /dev/null +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -0,0 +1,351 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::can_cast; +using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::promoteTypes; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace impl { +namespace HiFi { +namespace native { + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + torch::executor::apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct PowInner + : public ReportCanCastBug {}; + +} // namespace + +Tensor& pow_Tensor_Tensor_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK( + ctx, common_type != exec_aten::ScalarType::Bool, InvalidArgument, out); + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Tensor_out"; + constexpr int kNnlibMaxDim = 16; + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = true; + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted && b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + if (out_type != ScalarType::Float) + optimized = false; + + if (max_dim > kNnlibMaxDim) + optimized = false; + + WORD32 num_elm = out.numel(); + + if (optimized) { + if (broadcast) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + WORD32 p_inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + for (int i = 0; i < b_dim; i++) + p_inp2_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + free(ptr2); + } else if (a_is_broadcasted && (!b_is_broadcasted)) { + FLOAT32* __restrict__ ptr1 = + (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = + (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + + xa_nn_broadcast_32_32( + (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else if (b_is_broadcasted && (!a_is_broadcasted)) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < b_dim; i++) + p_inp1_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1; + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else { + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); + } + return out; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + 
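+        // Fallback path note: this dispatch is reached only when the NNLIB
+        // fast path above was skipped, i.e. when out is not Float or the
+        // tensor rank exceeds kNnlibMaxDim. CTYPE_IN is the promoted compute
+        // type; for example (assuming the usual promotion rules) an Int base
+        // with a Float exponent promotes to Float, and each element is
+        // computed as
+        //   std::pow(static_cast<CTYPE_IN>(a_i), static_cast<CTYPE_IN>(b_i))
+        // before being cast to CTYPE_OUT on store.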
ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + PowInner< + !std::is_same::value && + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +Tensor& pow_Tensor_Scalar_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = torch::executor::native::utils::get_scalar_dtype(b); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + a_type, b, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Tensor_Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_B val_b = 0; + torch::executor::native::utils::extract_scalar(b, &val_b); + torch::executor::apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +Tensor& pow_Scalar_out( + KernelRuntimeContext& ctx, + const Scalar& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, b.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = torch::executor::native::utils::get_scalar_dtype(a); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = + torch::executor::native::utils::promote_type_with_scalar( + b_type, a, /*half_to_float*/ false); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + constexpr auto name = "pow.Scalar_out"; + if (common_type == ScalarType::Half) { + common_type = ScalarType::Float; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + CTYPE_A val_a = 0; + torch::executor::native::utils::extract_scalar(a, &val_a); + + torch::executor::apply_unary_map_fn( + [val_a](const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = std::pow(a_casted, b_casted); + return static_cast(value); + }, + b.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp new file mode 100644 index 0000000000..c94800aef1 --- /dev/null +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) Meta 
Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include + +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; + +namespace impl { +namespace HiFi { +namespace native { +namespace { + +double rsqrt(double x) { + return 1.0 / std::sqrt(x); +} + +} // namespace + +Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { + bool optimized = true; + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (optimized) { + WORD32 num_elm = out.numel(); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp = + (const FLOAT32* __restrict__)in.const_data_ptr(); + + xa_nn_elm_rsqrt_f32_f32(p_out, p_inp, num_elm); + return out; + } + + return torch::executor::native::internal:: + unary_ufunc_realhbbf16_to_floathbf16(rsqrt, ctx, in, out); +} + +} // namespace native +} // namespace HiFi +} // namespace impl diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c new file mode 100644 index 0000000000..cad3f1a25b --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_8_8.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c new file mode 100644 index 0000000000..34a7111ee7 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32_32.c @@ -0,0 +1,313 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +/* + * xa_nn_broadcast_32_32.c + */ + +#include "xa_nnlib_common.h" +//#include "xa_nn_basic_state.h" + +#include +#include + +#include "stdio.h" + +/* + * This file is sourced from ../hifi5/xa_nn_broadcast_8_8.c + */ + +#define NUMDIMS_MAX 8 + +typedef struct bcast_expansion_struct_{ + size_t load_num_elem; + int replicate_loadedElm_times; + int repeat_operation; +} bcast_expansion_rule ; + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src); + +void *xa_nn_memcpy(void * dest1,const void *src1, size_t n1) +{ + char *dest = (char *)dest1; + char *src = (char *)src1; + int n = (int)n1; + ae_int16x4 * __restrict d_align_addr, * __restrict s_align_addr; + int i; + void *orig_dest = dest; + + if (n < 32) { + return memcpy(dest, src, n); + } + + if ( !(((int) dest) %8) && !(((int) src) %8)) { // 64-bit aligned + s_align_addr = (ae_int16x4 *) src; + d_align_addr = (ae_int16x4 *) dest; + for (i=0; i>3; i++) { + d_align_addr[i] = s_align_addr[i]; + } + + for (i=(n&~7); i>3; i++) { + AE_LA16X4_IP(t, s_align, s_align_addr); + AE_LA16X4_IP(t2, s_align, s_align_addr); + AE_SA16X4_IP(t, d_align, d_align_addr); + AE_SA16X4_IP(t2, d_align, d_align_addr); + } + AE_SA64POS_FP(d_align, d_align_addr); + ae_int16 *s_src = (ae_int16 *) src; + ae_int16 *s_dest = (ae_int16 *) dest; + for (i=8*i; i8, -1); + + int i = 0; + + /* Check for valid IO shapes */ + for(i=0; i=0){ + + /* Find the sub-matrix size */ + while(in_shape[dim] != 1 && dim>=0){ + num_elem_load *= out_shape[dim]; + dim--; + } + + /* Find the number of times this sub-matrix needs to be copied */ + num_copy_times = 1; + while(in_shape[dim] == 1 && dim>=0){ + num_copy_times *= out_shape[dim]; + dim--; + } + + /* Find the number of times the above copy needs to be repeated */ + num_repeat = 1; + while(in_shape[dim] != 1 && dim>=0){ + num_repeat *= 1 * out_shape[dim]; + dim--; + } + + bcast_expansion_steps[k].load_num_elem = num_elem_load; + bcast_expansion_steps[k].replicate_loadedElm_times = num_copy_times; + bcast_expansion_steps[k].repeat_operation = num_repeat; + k++; + + num_elem_load = num_elem_load * num_copy_times * num_repeat; + } + + res = broadcast_node_32(bcast_expansion_steps, num_dims-1, + p_out, p_in); + (void)res; /* Unused return value */ + + return 0; +} + +WORD32* broadcast_node_32(bcast_expansion_rule *steps, unsigned int step_id, + WORD32 *dst, WORD32 *src) { + int step_itr=0, rep_itr=0; + int i=0, j=0, k=0; + bcast_expansion_rule *step = NULL; + + // ignore steps that are null + while(steps[step_id].repeat_operation == 0 && step_id>0){ + step_id--; + } + + // step is now the parent node for this iteration + step = &steps[step_id]; + size_t numLoadedElm = step->load_num_elem; + + WORD32 *cp_dst = dst; + WORD32 *cp_src = src; + WORD32 *cp_src_temp=NULL; + WORD32 *cp_dst_temp=NULL; + + if(numLoadedElm>32){ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, (void*)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += 
step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; jrepeat_operation; j++){ + for(i=0; ireplicate_loadedElm_times; i++){ + xa_nn_memcpy(cp_dst, cp_src, 4 * numLoadedElm); + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } + else{ + if(step_id > 0){ + for(step_itr=0; step_itrrepeat_operation; step_itr++){ + src = broadcast_node_32(steps, step_id-1, dst, src); + cp_src = dst; + cp_dst = dst + numLoadedElm; + for(rep_itr=1; rep_itrreplicate_loadedElm_times; rep_itr++){ + for(k=0; k<(int)numLoadedElm; k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + } + cp_dst += numLoadedElm; + } + dst = cp_dst; + } + return src; + } else { + if(numLoadedElm == 1){ + for(j=0; jrepeat_operation; j++){ +// memset((void*)cp_dst, *(WORD32 *)cp_src, 4 * step->replicate_loadedElm_times); + for(i = 0; i < step->replicate_loadedElm_times; i++) + cp_dst[i] = cp_src[0]; + cp_dst += step->replicate_loadedElm_times; + cp_src++; + } + } else { + for(j=0; j < step->repeat_operation; j++){ + for(i=0; i < step->replicate_loadedElm_times; i++){ + for(k=0; k<(int)(numLoadedElm); k++){ + cp_src_temp = cp_src; + cp_dst_temp = cp_dst; + cp_dst_temp[k] = cp_src_temp[k]; + + } + cp_dst += numLoadedElm; + } + cp_src += numLoadedElm; + } + } + return cp_src; + } + } +} diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c new file mode 100644 index 0000000000..3af93fc00c --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c @@ -0,0 +1,847 @@ +/******************************************************************************* +* Copyright (c) 2018-2024 Cadence Design Systems, Inc. +* +* Permission is hereby granted, free of charge, to any person obtaining +* a copy of this software and associated documentation files (the +* "Software"), to use this Software with Cadence processor cores only and +* not with any other processors and platforms, subject to +* the following conditions: +* +* The above copyright notice and this permission notice shall be included +* in all copies or substantial portions of the Software. +* +* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +* IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY +* CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, +* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE +* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
+ +******************************************************************************/ +#include "nnlib-hifi4/xa_nnlib/include/xa_type_def.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" +#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" +#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" +#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_maximum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_maximum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MAX_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_maximum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MAX_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } 
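+      /* Aligned fast path note: this branch is taken only when p_a, p_b and
+       * p_c are all 8-byte aligned, so each iteration loads a pair of floats
+       * from both inputs with XT_LSX2IP, takes the element-wise maximum with
+       * XT_MAX_SX2 and stores the pair with XT_SSX2IP. The else-branch below
+       * handles unaligned pointers with ae_valign priming loads/stores, and
+       * any odd trailing element is covered by the scalar XT_MAX_S epilogue
+       * after the loop. */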
+ } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MAX_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MAX_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_maximum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_maximum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 
1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_maximum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_minimum_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_minimum_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + unsigned char con1, con2; + xtbool2 con = int32_rtor_xtbool2(0x00000003); + + if(((((unsigned)p_out)&7) == 0) && ((((unsigned)p_inp1)&7) == 0) && ((((unsigned)p_inp2)&7) == 0)) + { + for(i=0;i < num_elm>>1;i++) + { + XT_LSX2IP(x1, inp1, 2*sizeof(FLOAT32)); + XT_LSX2IP(x2, inp2, 2*sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_MIN_S(a1, a2); + XT_SSI(a, (xtfloat *)out, 0); + } + 
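+  /* At this point every element has been written: num_elm >> 1 paired SIMD
+   * iterations plus at most one scalar tail element when num_elm is odd.
+   * A minimal, hypothetical caller sketch (illustration only, not part of
+   * this patch):
+   *
+   *   float a[5]   = {1.f, 4.f, 2.f, 8.f, 5.f};
+   *   float b[5]   = {3.f, 3.f, 3.f, 3.f, 3.f};
+   *   float out[5];
+   *   xa_nn_elm_minimum_f32xf32_f32(out, a, b, 5);
+   *   // out == {1, 3, 2, 3, 3}; the last element comes from the scalar tail.
+   */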
return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_minimum_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_MIN_SX2(x2, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_MIN_SX2(x2, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_MIN_S(b0, a0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +static void internal_elm_minimum_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? 
p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_minimum_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ 
p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_minimum_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c new file mode 100644 index 0000000000..4dcec52f97 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c @@ -0,0 +1,1151 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ("Cadence */ +/* Libraries") are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* DSP Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2015-2018 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +/* + NatureDSP Signal Processing Library. Vector mathematics + Vector operations + code optimized for HiFi4 core + IntegrIT, 2006-2018 +*/ + +#include "../include/NatureDSP_Signal_math.h" +#include "NatureDSP_types.h" +#include "xa_nn_common.h" + +/* Common helper macros. */ +#include "xa_nnlib_common_fpu.h" + +#include "xa_nnlib_common.h" +/* Constant tables. */ + +const union ufloat32uint32 ALIGN(8) xa_nnlib_pow2f_coef[] = +{ + { 0x39222a65 }, + { 0x3aaf931c }, + { 0x3c1d94fc }, + { 0x3d63578a }, + { 0x3e75fdf0 }, + { 0x3f317218 }, + { 0x3f800000 } + + //{ 0x3aaf931b }, + //{ 0x3c1e7220 }, + //{ 0x3d63578a }, + //{ 0x3e75fcc9 }, + //{ 0x3f317218 }, + //{ 0x3f800000 } + +}; + +const union ufloat32uint32 ALIGN(8) xa_nnlib_log2f_coef[] = +{ + { 0x3d726a49 }, + { 0x3dd91c88 }, + { 0x3ddde76c }, + { 0x3de21e63 }, + { 0x3dfe600b }, + { 0x3e124679 }, + { 0x3e2ab2f1 }, + { 0x3e4ccd1b }, + { 0x3e7fffde }, + { 0x3eaaaaaa }, + { 0x3f000000 }, + { 0x3f800000 }, + /* log2(e) */ + { 0x3fb8aa3b }, /* 1.4426950216 */ + { 0x32a57060 } /* 1.9259629891e-008 */ +}; + +const union ufloat32uint32 xa_nnlib_pow_plusInff ={0x7f800000}; + +const union ufloat32uint32 xa_nnlib_pow_qNaNf = { 0x7fc00000 }; + +#define MIN(a,b) ( (a)<(b) ? 
(a) : (b) ) +#define MAX(a,b) ( (a)>(b) ? (a) : (b) ) + +/*------------------------------------------------------------------------- + Power function + These routines calculate power function for 32-bit fixed-point numbers or + floating point numbers. + For the fixed point API, The base is represented in Q31, the exponent + is represented in Q6.25. Results are represented as normalized fixed point + number with separate mantissa in Q31 and exponent. + + Precision: + 32x32 32-bit inputs, 32-bit outputs + f floating point input, floating point output + + Accuracy: + 2 ULP for fixed point API + 2 ULP under condition that |y|<=100 + + Notes: +1. Scalar floating point raise to a power functions conform to ANSI C requirements on + standard math library functions in respect to treatment of errno and floating- + point exceptions. Vectorized function does not touch errno and may raise or not raise + floating point exceptions. +2. For floating point API, If x<0 is finite, y is finite and not an integer value, + then the respective result z is set to NaN +3. For fixed point API, function returns zero for all non-positive x. Fixed point + functions never touch errno + + Special cases: + x | y | Result | Extra Conditions + --------+--------+--------+--------------------- + floating point API + --------+--------+--------+--------------------- + +/-0 | y | +/-inf | odd y<0 + +/-0 | y | +inf | even y<0 + +/-0 | y | +/-0 | odd y>0 + +/-0 | y | 0 | even y>0 + +/-1 | +/-inf | 1 | + 1 | y | 1 | any y including NaN + x | +/-0 | 1 | any x including NaN + x | y | NaN | finite x<0 and finite + | | | non-integer y (see + | | | note 2) + x | -inf | +inf | |x|<1 + x | -inf | 0 | |x|>1 + x | +inf | 0 | |x|<1 + x | +inf | +inf | |x|>1 + -inf | y | -0 | y an odd integer <0 + -inf | y | 0 | y<0 and not an odd + | | | integer + -inf | y | -inf | y an odd integer >0 + -inf | y | +inf | y>0 and not an odd + | | | integer + +inf | y | 0 | y<0 + +inf | y | +inf | y>0 + --------+--------+--------+--------------------- + fixed point API + --------+--------+--------+--------------------- + x | y | 0 | x<=0 + --------+--------+--------+--------------------- + + Input: + x[N] input data,Q0.31 or floating point + y[N] input data,Q6.25 or floating point + N length of vectors + Output (fixed point API): + m[N] mantissa of output, Q31 + e[N] exponent of output + Output (floating point API): + z[N] results: floating point + + Restriction: + z,x,y,m should not overlap +-------------------------------------------------------------------------*/ + +#if !HAVE_VFPU && !HAVE_FPU +DISCARD_FUN(void, xa_nn_elm_pow_f32, (FLOAT32 * restrict z, const FLOAT32 * restrict y, const FLOAT32 * restrict x, WORD32 N)) +#elif HAVE_VFPU +#define sz_f32 (int)sizeof(FLOAT32) +static void mypowf(FLOAT32 * scr, + FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + WORD32 N ) +{ + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloatx2 * pX; + const xtfloatx2 * pY; + + const xtfloatx2 * restrict S_rd; + xtfloatx2 * restrict S_wr; + xtfloatx2 * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloatx2 x0, y0, z0, t0, t1, ef0; + xtfloatx2 c2f, 
c3f, c4f; + xtfloatx2 _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c7i, c8i; + ae_int32x2 e0, xi0, yi0, ex0; + xtbool2 bsx, bsy, bdenorm, bsmall; + ae_valign aX, aY, aZ; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3*sz_f32); + + + if (N <= 0) return; + + NASSERT(N % 2 == 0); + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloatx2*)x; + S_wr = (xtfloatx2*)scr; + aX = AE_LA64_PP(pX); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LASX2IP(x0, aX, pX); + + x0 = XT_ABS_SX2(x0); + c0i = AE_L32_I(TBL, 0 * 4); /*-126*/ + c1i = AE_L32_I(TBL, 1 * 4); /*-150*/ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_SX2(x0, c2f); + t0 = XT_MUL_SX2(x0, c3f); + XT_MOVT_SX2(x0, t0, bdenorm); + e0 = c0i; + AE_MOVT32X2(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_SX2(x0, c4f); + t0 = XT_ADD_SX2(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_SX2(x0, t0, bsmall); + AE_MOVT32X2(e0, ex0, bsmall); + x0 = XT_SUB_SX2(_1, x0); //!!! + ef0 = XT_FLOAT_SX2(e0, 0); //!!! 
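+        /* Note (added comment): x0 now holds (1 - m) with the mantissa m
+           normalized to [sqrt(0.5), sqrt(2)), and ef0 holds the exponent as
+           a float; both are staged in the interleaved scratch buffer for the
+           log2 polynomial pass below. */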
+ XT_SSX2IP(x0, S_wr, 2 * sz_f32); + XT_SSX2IP(ef0, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloatx2 p10, p11, p12, p13; + xtfloatx2 t2, w0, w1; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(x0, S_rd, 3*2 * sz_f32); + //XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_SX2(p1, x0, p0); + XT_MADD_SX2(p2, x0, p1); + XT_MADD_SX2(p3, x0, p2); + XT_MADD_SX2(p4, x0, p3); + XT_MADD_SX2(p5, x0, p4); + XT_MADD_SX2(p6, x0, p5); + XT_MADD_SX2(p7, x0, p6); + XT_MADD_SX2(p8, x0, p7); + XT_MADD_SX2(p9, x0, p8); + t2 = p9; + XT_SSX2IP(t2, S_wr, 3*2 * sz_f32); + } + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + for (n = 0; n<(blkLen >> 1); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSX2IP(x0, S_rd, 2 * sz_f32); + XT_LSX2IP(ef0, S_rd, 2 * sz_f32); + XT_LSX2IP(t2, S_rd, 2 * sz_f32); + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_SX2(x0, t2); t1 = t0; + XT_MSUB_SX2(t1, x0, t2); + w0 = XT_ADD_SX2(t0, p10); + w1 = XT_SUB_SX2(w0, p10); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + w0 = XT_ADD_SX2(t0, p11); + w1 = XT_SUB_SX2(w0, p11); + w1 = XT_SUB_SX2(t0, w1); + w1 = XT_SUB_SX2(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_SX2(x0); + w0 = XT_MUL_SX2(x0, t0); w1 = w0; + XT_MSUB_SX2(w1, x0, t0); t0 = w0; + XT_MSUB_SX2(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_SX2(t0, p12); w1 = w0; + XT_MSUB_SX2(w1, t0, p12); + XT_MADD_SX2(w1, t1, p12); + XT_MSUB_SX2(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_SX2(t0, ef0); + w1 = XT_SUB_SX2(w0, ef0); + w1 = XT_SUB_SX2(t0, w1); + t1 = XT_SUB_SX2(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSX2IP(t0, S_wr, 2 * sz_f32); + XT_SSX2IP(t1, S_wr, 2*2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloatx2 xy, dxy, c0, c1; + xtfloatx2 p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloatx2*)scr+2; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2*2 * sz_f32); + + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_SX2(p1, dxy, p0); + XT_MADD_SX2(p2, dxy, p1); + XT_MADD_SX2(p3, dxy, p2); + XT_MADD_SX2(p4, dxy, p3); + XT_SSX2IP(p4, S_wr, 3*2 * sz_f32); + } + __Pragma("no_reorder"); + S_wr = (xtfloatx2*)scr; + S_rd = (const xtfloatx2*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloatx2*)y; + aY = AE_LA64_PP(pY); + for (n = 0; n<(blkLen >> 1); n++) + { + + XT_LSX2IP(t0, S_rd, 2 * sz_f32); + XT_LSX2IP(t1, S_rd, 2 * sz_f32); + XT_LSX2IP(p4, S_rd, 2 * sz_f32); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + XT_LASX2IP(y0, aY, pY); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FIROUND_SX2(XT_MUL_SX2(y0, t0)); + dxy = XT_NEG_SX2(xy); + XT_MADD_SX2(dxy, y0, t0); + XT_MADD_SX2(dxy, y0, t1); + dxy = XT_MIN_SX2(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_SX2(dxy, (xtfloatx2)-1.0f); + XT_MADD_SX2(p5, dxy, p4); + XT_MADD_SX2(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_SX2(z0, c1); + z0 = XT_MUL_SX2(z0, c0); //!!!!!!!!!!!! + XT_SSX2IP(z0, S_wr, 2 * sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool2 b_yint, b_e0, b0, b_notspec; + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloatx2 xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloatx2*)scr; + pY = (const xtfloatx2*)y; + pX = (const xtfloatx2*)x; + pZ = ( xtfloatx2*)z; + aY = AE_LA64_PP(pY); + aX = AE_LA64_PP(pX); + aZ = AE_ZALIGN64(); + for (n = 0; n<(blkLen >> 1); n++) + { + XT_LSX2IP(z0, S_rd, 2 * sz_f32); + XT_LASX2IP(x0, aX, pX); + XT_LASX2IP(y0, aY, pY); + /* Take sign of x and y */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + yi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(y0); + bsx = XT_OLT_SX2(xi0, (xtfloatx2)0.0f); + bsy = XT_OLT_SX2(yi0, (xtfloatx2)0.0f); + + xabs = XT_ABS_SX2(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_SX2(y0); + b_yint = XT_OEQ_SX2(t0, y0); + + /* check if y is odd */ + e0 = XT_TRUNC_SX2(y0, 0); //temp0 + b_e0 = AE_EQ32(e0, MAX_INT32);//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF32X2(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_SX2((xtfloatx2)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_SX2(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_SX2(x0, (xtfloatx2)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_SX2(xabs, (xtfloatx2)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_SX2(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB2(b_yint); + yeqz = AE_MOVAB2(b_yeqz); + yinf = AE_MOVAB2(b_yinf); + xeqz = AE_MOVAB2(b_xeqz); + xeq1 = AE_MOVAB2(b_xeq1); + xinf = AE_MOVAB2(b_xinf); + sx = AE_MOVAB2(bsx); + sy = AE_MOVAB2(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* 
x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloatx2)xa_nnlib_pow_qNaNf.f; + XT_MOVF_SX2(spec, half, b_NaN1); + XT_MOVT_SX2(spec, _0, b_zero); + XT_MOVT_SX2(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_SX2(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_SX2(spec, _1, b_one); + + b_notspec = XT_OEQ_SX2(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_SX2(z0, spec, b_notspec); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + XT_SASX2IP(z0, aZ, pZ); + } + } + XT_SASX2POSFP(aZ, pZ); + } +} /* mypowf() */ +void xa_nn_elm_pow_f32( FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N ) +{ + const int blkSize = MAX_ALLOCA_SZ/sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + FLOAT32 ALIGN(16) scr[blkSize]; + int M; + if ( N<=0 ) return; + M=N&~1; + if ( M ) + { + mypowf(scr,z,x,y,M); + y += M; + x += M; + z += M; + N&=1; + } + if (N) + { // processing the tail + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + xtfloat x0, y0, t0, ef0, t1, t2; + xtfloat xy, dxy, z0, c0, c1; + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13, w0, w1; + xtbool bdenorm, bsmall; + ae_int32 e0, xi0, ex0; + x0=XT_LSI((const xtfloat*)x,0); + + x0 = XT_ABS_S(x0); + + /* process denormalized values */ + bdenorm = xtbool2_extract_0(XT_OLE_S(x0, XT_LSI((xtfloat*)c_tbl, 2 * 4))); + t0 = XT_MUL_S(x0, XT_LSI((xtfloat*)c_tbl, 3 * 4)); + XT_MOVT_S(x0, t0, (bdenorm)); + e0 = AE_L32_I((ae_int32 *)c_tbl, 0 * 4);; + AE_MOVT_32(e0, AE_L32_I((ae_int32 *)c_tbl, 1 * 4), (bdenorm)); + /* extract exponent */ + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(XT_LSI((xtfloat*)c_tbl, 2 * 4));/* load mantissa mask */ //!!!!!!!!!!!!! + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, AE_L32_I((ae_int32 *)c_tbl, 5 * 4)); + x0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + + bsmall = xtbool2_extract_0(XT_OLT_S(x0, XT_LSI((xtfloat*)c_tbl, 4 * 4))); + + + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(1.0f, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
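+    /* Note (added comment): same decomposition as the vector loop above,
+       applied to the single tail element: x0 = 1 - m with m in
+       [sqrt(0.5), sqrt(2)), ef0 = exponent as float. */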
+ + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 0 * 4); + p1 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 1 * 4); + p2 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 2 * 4); + p3 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 3 * 4); + p4 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 4 * 4); + p5 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 5 * 4); + p6 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 6 * 4); + p7 = XT_LSI((const xtfloat *)xa_nnlib_log2f_coef, 7 * 4); + p8 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 8 * 4); + p9 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 9 * 4); + + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + + + p10 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 10 * 4); + p11 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 11 * 4); + p12 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 12 * 4); + p13 = XT_LSX((const xtfloat *)xa_nnlib_log2f_coef, 13 * 4); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + + /* compute y*log2(x) and separate result into integer and fractional parts */ + y0 = XT_LSI((const xtfloat*)y, 0); + xy = XT_FIROUND_S(XT_MUL_S(y0, t0)); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + dxy = XT_MIN_S(dxy, (xtfloatx2)1.0f); + dxy = XT_MAX_S(dxy, (xtfloatx2)-1.0f); + /* compute 2^fract */ + p0 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 0 * 4); + p1 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 1 * 4); + p2 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 2 * 4); + p3 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 3 * 4); + p4 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 4 * 4); + p5 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 5 * 4); + p6 = XT_LSI( (const xtfloat *)xa_nnlib_pow2f_coef, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_SX2(xy, 0); + e0 = AE_MAX32(e0, AE_L32_I((ae_int32 *)c_tbl, 7 * 4)); + e0 = AE_MIN32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + e0 = AE_ADD32(e0, AE_L32_X((ae_int32 *)c_tbl, 8 * 4)); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + c0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(e0); + c1 = XT_AE_MOVXTFLOATX2_FROMINT32X2(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + + + /* Take sign of x and y */ + { + xtbool2 bsx, bsy, b_yint, b_e0, b0, b_notspec; + + xtbool2 b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool2 b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32 sgn, zi0; + + x0 = XT_LSI((const xtfloat*)x, 0); + y0 = XT_LSI((const xtfloat*)y, 0); + xi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(x0); + bsx = (XT_OLT_S(x0, (xtfloat)0.0f)); + bsy = (XT_OLT_S(y0, (xtfloat)0.0f)); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + t0 = XT_FITRUNC_S(y0); + b_yint = (XT_OEQ_S(t0, y0)); + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = (AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB2(b_e0); + b1i = AE_MOVAB2(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA2(b0i); + AE_MOVF_32(e0, AE_ZERO32(), xtbool2_extract_0(b0)); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = (XT_OEQ_S((xtfloatx2)0.0f, y0)); /* y ==0 */ + b_yinf = (XT_OEQ_S(XT_ABS_SX2(y0), xa_nnlib_pow_plusInff.f)); /* |y|==Inf */ + b_xeqz = (XT_OEQ_S(x0, (xtfloatx2)0.0f)); /* x ==0 */ + b_xeq1 = (XT_OEQ_S(xabs, (xtfloatx2)1.0f)); /* |x|==1 */ + b_xinf = (XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f)); /* |x|==INF */ + + yint = AE_MOVAB2 (b_yint); + yeqz = AE_MOVAB2 (b_yeqz); + yinf = AE_MOVAB2 (b_yinf); + xeqz = AE_MOVAB2 (b_xeqz); + xeq1 = AE_MOVAB2 (b_xeq1); + xinf = AE_MOVAB2 (b_xinf); + sx = AE_MOVAB2 (bsx); + sy = AE_MOVAB2 (bsy); + + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA2(NaN1); + b_NaN2 = XT_UN_SX2(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA2(one); + b_Inf = AE_MOVBA2(Inf); + b_zero = AE_MOVBA2(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, 0.5f, xtbool2_extract_0(b_NaN1)); + XT_MOVT_S(spec, 0.0f, xtbool2_extract_0(b_zero)); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, xtbool2_extract_0(b_Inf)); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, xtbool2_extract_0(b_NaN2)); + XT_MOVT_S(spec, 1.0f, xtbool2_extract_0(b_one)); + + b_notspec = XT_OEQ_S(spec, 0.5f); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, xtbool2_extract_0(b_notspec)); + /* Restore sign and store result */ + zi0 = XT_AE_MOVINT32X2_FROMXTFLOATX2(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_AE_MOVXTFLOATX2_FROMINT32X2(zi0); + + XT_SSI(z0,(xtfloat*)z,0); + + } + } + +} /* vec_powf() */ +#else 
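+/* Note (added comment): no vector FPU available in this build — scalar-FPU
+   fallback that implements the same pow algorithm one element at a time. */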
+#define sz_f32 (int)sizeof(FLOAT32) +void xa_nn_elm_pow_f32(FLOAT32 * restrict z, + const FLOAT32 * restrict x, + const FLOAT32 * restrict y, + int N) +{ + + const int blkSizef = MAX_ALLOCA_SZ / sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + float ALIGN(16) scr[blkSizef]; + /* Table of different constants used in computations */ + static const int32_t c_tbl[] = + { + -126, + -150, + (int32_t)0x007FFFFF,/* max denormalized floating-point number / mantissa mask */ + (int32_t)0x4B800000,/* 2^24 */ + (int32_t)0x3F3504F3,/* sqrt(0.5) */ + (int32_t)0x3F000000,/* 0.5 */ + (int32_t)0xBF000000,/* -0.5 */ + -252, + 254 + }; + int n; + const xtfloat * pX; + const xtfloat * pY; + + const xtfloat * restrict S_rd; + xtfloat * restrict S_wr; + xtfloat * restrict pZ; + const ae_int32 * restrict TBL; + const xtfloat * restrict TBL_LOG2; + const xtfloat * restrict TBL_POW2; + xtfloat x0, y0, z0, t0, t1, ef0; + xtfloat c2f, c3f, c4f; + xtfloat _0, _1, half; + ae_int32x2 c0i, c1i, c5i, c6i, c7i, c8i; + ae_int32 e0, xi0, yi0, ex0; + xtbool bsx, bsy, bdenorm, bsmall; + + /* overall number of blocks; number of values in the current block */ + int blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / (3 * sz_f32); + + + if (N <= 0) return; + + NASSERT_ALIGN16(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + blkLen = 0; + TBL = (const ae_int32 *)c_tbl; + for (; N>0; N -= blkLen, x += blkSize, y += blkSize, z += blkSize) + { + blkLen = XT_MIN(N, blkSize); + _0 = 0.0f; + _1 = (1.0f); + half = (0.5f); + { + pX = (const xtfloat*)x; + S_wr = ( xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, pX, sz_f32); + + x0 = XT_ABS_S(x0); + c0i = AE_L32_I(TBL, 0 * 4); /* -126 */ + c1i = AE_L32_I(TBL, 1 * 4); /* -150 */ + c2f = XT_LSI((xtfloat*)TBL, 2 * 4); + c3f = XT_LSI((xtfloat*)TBL, 3 * 4); + /* process denormalized values */ + bdenorm = XT_OLE_S(x0, c2f); + t0 = XT_MUL_S(x0, c3f); + XT_MOVT_S(x0, t0, bdenorm); + e0 = c0i; + + AE_MOVT_32(e0, c1i, bdenorm); + /* extract exponent */ + xi0 = XT_RFR(x0); + ex0 = AE_SRLI32(xi0, 23); + e0 = AE_ADD32(e0, ex0); + /* extract mantissa */ + ex0 = XT_RFR(c2f);/* load mantissa mask */ //!!!!!!!!!!!!! + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + xi0 = AE_AND32(xi0, ex0); + xi0 = AE_OR32(xi0, c5i); + x0 = XT_WFR(xi0); + /* adjust the mantissa to range [ sqrt(0.5) ; sqrt(2.0) ) */ + c4f = XT_LSI((xtfloat*)TBL, 4 * 4); + bsmall = XT_OLT_S(x0, c4f); + t0 = XT_ADD_S(x0, x0); + ex0 = AE_SUB32(e0, 1); + XT_MOVT_S(x0, t0, bsmall); + AE_MOVT_32(e0, ex0, bsmall); + x0 = XT_SUB_S(_1, x0); //!!! + ef0 = XT_FLOAT_S(e0, 0); //!!! 
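+      /* Note (added comment): stage (1 - mantissa) and the float exponent in
+         the scratch buffer; they are consumed by the log2 polynomial loop
+         below. */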
+ XT_SSIP(x0, S_wr, sz_f32); + XT_SSIP(ef0, S_wr, 2 * sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat p0, p1, p2, p3, p4, p5, p6, p7, p8, p9; + xtfloat p10, p11, p12, p13; + xtfloat t2, w0, w1; + S_wr = ( xtfloat*)scr + 2; + S_rd = (const xtfloat*)scr; + TBL_LOG2 = (const xtfloat *)xa_nnlib_log2f_coef; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(x0, S_rd, 3*sz_f32); + + /* evaluate polynomial approximation */ + /* Load table of coefficients */ + + p0 = XT_LSI(TBL_LOG2, 0 * 4); + p1 = XT_LSI(TBL_LOG2, 1 * 4); + p2 = XT_LSI(TBL_LOG2, 2 * 4); + p3 = XT_LSI(TBL_LOG2, 3 * 4); + p4 = XT_LSI(TBL_LOG2, 4 * 4); + p5 = XT_LSI(TBL_LOG2, 5 * 4); + p6 = XT_LSI(TBL_LOG2, 6 * 4); + p7 = XT_LSI(TBL_LOG2, 7 * 4); + p8 = XT_LSX(TBL_LOG2, 8 * 4); + p9 = XT_LSX(TBL_LOG2, 9 * 4); + + XT_MADD_S(p1, x0, p0); + XT_MADD_S(p2, x0, p1); + XT_MADD_S(p3, x0, p2); + XT_MADD_S(p4, x0, p3); + XT_MADD_S(p5, x0, p4); + XT_MADD_S(p6, x0, p5); + XT_MADD_S(p7, x0, p6); + XT_MADD_S(p8, x0, p7); + XT_MADD_S(p9, x0, p8); + t2 = p9; + XT_SSIP(t2, S_wr, 3 * sz_f32); + } + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + + for (n = 0; n<(blkLen); n++) + { + p10 = XT_LSX(TBL_LOG2, 10 * 4); + p11 = XT_LSX(TBL_LOG2, 11 * 4); + p12 = XT_LSX(TBL_LOG2, 12 * 4); + p13 = XT_LSX(TBL_LOG2, 13 * 4); + + XT_LSIP(x0, S_rd, sz_f32); + XT_LSIP(ef0, S_rd, sz_f32); + XT_LSIP(t2, S_rd, sz_f32); + + /* next coefficients are computed in extended precision */ + t0 = XT_MUL_S(x0, t2); t1 = t0; + XT_MSUB_S(t1, x0, t2); + w0 = XT_ADD_S(t0, p10); + w1 = XT_SUB_S(w0, p10); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + w0 = XT_ADD_S(t0, p11); + w1 = XT_SUB_S(w0, p11); + w1 = XT_SUB_S(t0, w1); + w1 = XT_SUB_S(w1, t1); + t0 = w0; t1 = w1; + x0 = XT_NEG_S(x0); + w0 = XT_MUL_S(x0, t0); w1 = w0; + XT_MSUB_S(w1, x0, t0); t0 = w0; + XT_MSUB_S(w1, x0, t1); t1 = w1; + /* multiply by log2(e) */ + w0 = XT_MUL_S(t0, p12); w1 = w0; + XT_MSUB_S(w1, t0, p12); + XT_MADD_S(w1, t1, p12); + XT_MSUB_S(w1, t0, p13); + t0 = w0; t1 = w1; + /* add exponent */ + w0 = XT_ADD_S(t0, ef0); + w1 = XT_SUB_S(w0, ef0); + w1 = XT_SUB_S(t0, w1); + t1 = XT_SUB_S(w1, t1);//!!!! + t0 = w0; // !!!!! + XT_SSIP(t0, S_wr, sz_f32); + XT_SSIP(t1, S_wr, sz_f32); + } + } + __Pragma("no_reorder"); + /* */ + { + xtfloat xy, dxy, c0, c1, _m1;; + xtfloat p0, p1, p2, p3, p4, p5, p6; + S_wr = ( xtfloat*)scr; + S_rd = (const xtfloat*)scr; + TBL_POW2 = (const xtfloat *)xa_nnlib_pow2f_coef; + pY = (const xtfloat*)y; + _m1 = -1.0f; + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(t0, S_rd, sz_f32); + XT_LSIP(t1, S_rd, sz_f32); + XT_LSIP(y0, pY, sz_f32); + /* compute y*log2(x) and separate result into integer and fractional parts */ + xy = XT_FLOAT_S(XT_ROUND_S(XT_MUL_S(y0, t0), 0), 0); + dxy = XT_NEG_S(xy); + XT_MADD_S(dxy, y0, t0); + XT_MADD_S(dxy, y0, t1); + c5i = AE_L32_I(TBL, 5 * 4);/* 0.5 */ + c6i = AE_L32_I(TBL, 6 * 4);/* -0.5 */ + dxy = XT_MIN_S(dxy, _1); + dxy = XT_MAX_S(dxy, _m1); + /* compute 2^fract */ + p0 = XT_LSI(TBL_POW2, 0 * 4); + p1 = XT_LSI(TBL_POW2, 1 * 4); + p2 = XT_LSI(TBL_POW2, 2 * 4); + p3 = XT_LSI(TBL_POW2, 3 * 4); + p4 = XT_LSI(TBL_POW2, 4 * 4); + p5 = XT_LSI(TBL_POW2, 5 * 4); + p6 = XT_LSI(TBL_POW2, 6 * 4); + /* NOTE: do not change the order of computations and way of polynomial decomposition ! 
*/ + XT_MADD_S(p1, dxy, p0); + XT_MADD_S(p2, dxy, p1); + XT_MADD_S(p3, dxy, p2); + XT_MADD_S(p4, dxy, p3); + XT_MADD_S(p5, dxy, p4); + XT_MADD_S(p6, dxy, p5); + z0 = p6; + /* apply integer part */ + e0 = XT_TRUNC_S(xy, 0); + c7i = AE_L32_I(TBL, 7 * 4);/* -252 */ + c8i = AE_L32_X(TBL, 8 * 4);/* 254 */ + e0 = AE_MAX32(e0, c7i); + e0 = AE_MIN32(e0, c8i); + e0 = AE_ADD32(e0, c8i); + ex0 = AE_SRAI32(e0, 1); + e0 = AE_SUB32(e0, ex0); + ex0 = AE_SLLI32(ex0, 23); + e0 = AE_SLLI32(e0, 23); + + c0 = XT_WFR(e0); + c1 = XT_WFR(ex0); + z0 = XT_MUL_S(z0, c1); + z0 = XT_MUL_S(z0, c0); //!!!!!!!!!!!! + XT_SSIP(z0, S_wr, sz_f32); + + } + } + __Pragma("no_reorder"); + /* */ + { + xtbool b_yint, b_e0, b0, b_notspec; + xtbool b_yeqz, b_yinf, b_xeqz, b_xeq1, b_xinf; + xtbool b_NaN1, b_NaN2, b_one, b_Inf, b_zero; + uint32_t b0i, b1i; + uint32_t yeqz, yinf, xeqz, xeq1, xinf, sx, sy, yint; + uint32_t one, NaN1, Inf, zero; + xtfloat xabs, spec; + ae_int32x2 sgn, zi0; + + S_rd = (const xtfloat*)scr; + pY = (const xtfloat*)y; + pX = (const xtfloat*)x; + pZ = (xtfloat*)z; + + for (n = 0; n<(blkLen); n++) + { + XT_LSIP(z0, S_rd, sz_f32); + XT_LSIP(x0, pX, sz_f32); + XT_LSIP(y0, pY, sz_f32); + + /* Take sign of x and y */ + xi0 = XT_RFR(x0); + yi0 = XT_RFR(y0); + bsx = XT_OLT_S(x0, (xtfloat)0.0f); + bsy = XT_OLT_S(y0, (xtfloat)0.0f); + + xabs = XT_ABS_S(x0); + /* check if y is integer */ + { /* validate if y is integral - all numbers bigger than 2^23 are assumed as integral */ + xtfloat t, c; + t = XT_ABS_S((xtfloat)y0); + c = 8388608.f; + XT_MOVT_S(c, t, XT_ULT_S(t, 8388608.f)); + t = c; + t0 = XT_FLOAT_S(XT_TRUNC_S(t, 0), 0); + b_yint = XT_OEQ_S(XT_FLOAT_S(XT_TRUNC_S(t, 0), 0), t); + } + + /* check if y is odd */ + e0 = XT_TRUNC_S(y0, 0); //temp0 + b_e0 = xtbool2_extract_0(AE_EQ32(e0, MAX_INT32));//~b_tmp0 + b0i = AE_MOVAB(b_e0); + b1i = AE_MOVAB(b_yint); + b0i = b1i&(~b0i); + b0 = AE_MOVBA(b0i); + AE_MOVF_32(e0, AE_ZERO32(), b0); + e0 = AE_SLLI32(e0, 31); + sgn = AE_AND32(e0, xi0); + /* process special numbers */ + b_yeqz = XT_OEQ_S((xtfloat)0.0f, y0); /* y ==0 */ + b_yinf = XT_OEQ_S(XT_ABS_S(y0), xa_nnlib_pow_plusInff.f); /* |y|==Inf */ + b_xeqz = XT_OEQ_S(x0, (xtfloat)0.0f); /* x ==0 */ + b_xeq1 = XT_OEQ_S(xabs, (xtfloat)1.0f); /* |x|==1 */ + b_xinf = XT_OEQ_S(xabs, xa_nnlib_pow_plusInff.f); /* |x|==INF */ + + yint = AE_MOVAB(b_yint); + yeqz = AE_MOVAB(b_yeqz); + yinf = AE_MOVAB(b_yinf); + xeqz = AE_MOVAB(b_xeqz); + xeq1 = AE_MOVAB(b_xeq1); + xinf = AE_MOVAB(b_xinf); + sx = AE_MOVAB(bsx); + sy = AE_MOVAB(bsy); + one = xeq1 & (yinf | (~sx)); /* |x|==1 && ( |y|==Inf || x>0 ) */ + one = one | yeqz; /* ( |x|==1 && ( |y|==Inf || x>0 ) ) || y==0 --> z=1.0 */ + NaN1 = sx&(~yint); /* x<0 && y is not an integer --> z=NaN */ + Inf = xinf&(~sy); /* x==INF && y>0 --> z=INF */ + Inf = Inf | (xeqz & sy); /* x==0 && y<0 --> z=INF */ + zero = xeqz &(~sy); /* x==0 && y>0 --> z=0.0 */ + zero = zero | (xinf & sy); /* x==INF && y<0 --> z=0.0 */ + + b_NaN1 = AE_MOVBA(NaN1); + b_NaN2 = XT_UN_S(x0, y0); /* isnan(x) || isnan(y) --> z=NaN */ + b_one = AE_MOVBA(one); + b_Inf = AE_MOVBA(Inf); + b_zero = AE_MOVBA(zero); + + /* Save special numbers and mask for special numbers */ + spec = (xtfloat)xa_nnlib_pow_qNaNf.f; + XT_MOVF_S(spec, half, b_NaN1); + XT_MOVT_S(spec, _0, b_zero); + XT_MOVT_S(spec, xa_nnlib_pow_plusInff.f, b_Inf); + XT_MOVT_S(spec, xa_nnlib_pow_qNaNf.f, b_NaN2); + XT_MOVT_S(spec, _1, b_one); + + b_notspec = XT_OEQ_S(spec, half); + /* Replace result with special numbers if needed */ + XT_MOVF_S(z0, spec, b_notspec); + /* 
Restore sign and store result */ + zi0 = XT_RFR(z0); + zi0 = AE_XOR32(zi0, sgn); + z0 = XT_WFR(zi0); + XT_SSIP(z0, pZ, sz_f32); + } + } + } + +} /* vec_powf() */ +#endif From edc1b3d8e45ca91ef3f67343bac3402d45b5e6e0 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Wed, 13 Nov 2024 02:19:38 -0800 Subject: [PATCH 04/18] Fix for build issue faced in div_mod on old tools --- .../nnlib/xa_nn_elm_div_mode_f32_broadcast.c | 56 +++++++++---------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c index 95b449f43f..17c619d150 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c @@ -54,7 +54,7 @@ WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, XT_LASX2IP(x1, inp1_a, inp1); XT_LASX2IP(x2, inp2_a, inp2); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, out); } } @@ -66,7 +66,7 @@ WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, XT_LASX2IP(x1, inp1_a, inp1); XT_LASX2IP(x2, inp2_a, inp2); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, out); } } @@ -80,9 +80,9 @@ WORD32 xa_nn_elm_div_mode_f32xf32_f32(FLOAT32 * __restrict__ p_out, XT_LSIP(a2, (xtfloat *)inp2, 0); a = XT_DIV_S(a1, a2); if(mode == 0) - a = FITRUNC_S(a); + a = XT_FITRUNC_S(a); else - a = FIFLOOR_S(a); + a = XT_FIFLOOR_S(a); XT_SSI(a, (xtfloat *)out, 0); } @@ -138,7 +138,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -149,7 +149,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -166,7 +166,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LASX2IP(x1, vinp1, p_a); XT_LASX2IP(x2, vinp2, p_b); y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -177,7 +177,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LASX2IP(x1, vinp1, p_a); XT_LASX2IP(x2, vinp2, p_b); y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -189,9 +189,9 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); c0 = XT_DIV_S(b0, a0); if(mode == 0) - c0 = FITRUNC_S(c0); + c0 = XT_FITRUNC_S(c0); else - c0 = FIFLOOR_S(c0); + c0 = XT_FIFLOOR_S(c0); XT_SSI(c0, (xtfloat *)p_c, 0); } } @@ -213,7 +213,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -224,7 +224,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); y = 
XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -241,7 +241,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LASX2IP(x1, vinp1, p_a); XT_LASX2IP(x2, vinp2, p_b); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -252,7 +252,7 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LASX2IP(x1, vinp1, p_a); XT_LASX2IP(x2, vinp2, p_b); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -264,9 +264,9 @@ static void internal_elm_div_mode_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict_ XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); c0 = XT_DIV_S(a0, b0); if(mode == 0) - c0 = FITRUNC_S(c0); + c0 = XT_FITRUNC_S(c0); else - c0 = FIFLOOR_S(c0); + c0 = XT_FIFLOOR_S(c0); XT_SSI(c0, (xtfloat *)p_c, 0); } } @@ -302,7 +302,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -312,7 +312,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -328,7 +328,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LASX2IP(x1, inp1_a, p_a); y = XT_DIV_SX2(x2, x1); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -338,7 +338,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LASX2IP(x1, inp1_a, p_a); y = XT_DIV_SX2(x2, x1); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -349,9 +349,9 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); out = XT_DIV_S(x2, a0_7); if(mode == 0) - out = FITRUNC_S(out); + out = XT_FITRUNC_S(out); else - out = FIFLOOR_S(out); + out = XT_FIFLOOR_S(out); XT_SSI(out, (xtfloat *)p_c, 0); } } @@ -366,7 +366,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -376,7 +376,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); } } @@ -392,7 +392,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LASX2IP(x1, inp1_a, p_a); y = XT_DIV_SX2(x1, x2); - y = FITRUNC_SX2(y); + y = XT_FITRUNC_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -402,7 +402,7 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p { XT_LASX2IP(x1, inp1_a, p_a); y = XT_DIV_SX2(x1, x2); - y = FIFLOOR_SX2(y); + y = XT_FIFLOOR_SX2(y); XT_SASX2IP(y, out_a, p_c); } } @@ -413,9 +413,9 @@ static void internal_elm_div_mode_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p XT_LSIP(a0_7, (xtfloat *)p_a, sizeof(FLOAT32)); out = XT_DIV_S(a0_7, x2); if(mode == 0) - out = FITRUNC_S(out); + out = XT_FITRUNC_S(out); else - out = FIFLOOR_S(out); + out = XT_FIFLOOR_S(out); 
XT_SSI(out, (xtfloat *)p_c, 0); } } From afca3dbeecc003f45dcd7747c28542e09912c640 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Tue, 19 Nov 2024 03:45:03 -0800 Subject: [PATCH 05/18] Fix build failure due to merge issue --- .../hifi/operators/quantized_linear_out.cpp | 68 ++++++------------- 1 file changed, 19 insertions(+), 49 deletions(-) diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index 7e13df3641..accc610132 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -26,53 +26,28 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; -void quantized_linear_out( - __ET_UNUSED KernelRuntimeContext& ctx, - const Tensor& src, + + // The nnlib kernel to compute quantized linear via matmul. + +void _quantized_linear_asym8u( + const Tensor& in, const Tensor& weight, const Tensor& bias, - int64_t src_zero_point, - const Tensor& weight_zero_point_t, + int64_t in_zero_point, + const Tensor& weight_zero_point, const Tensor& out_multiplier, const Tensor& out_shift, int64_t out_zero_point, - __ET_UNUSED const executorch::aten::optional& offset, + __ET_UNUSED const optional& offset, Tensor& out) { - int64_t leading_dims = getLeadingDims(src, src.dim() - 1); - int64_t out_dim = weight.size(0); - int64_t in_dim = weight.size(1); - - if (out.scalar_type() == executorch::aten::ScalarType::Byte) { - const uint8_t* __restrict__ in_data = src.const_data_ptr(); - const uint8_t* __restrict__ weight_data = weight.const_data_ptr(); - const int32_t* __restrict__ bias_data = bias.const_data_ptr(); - uint8_t* __restrict__ out_data = out.mutable_data_ptr(); - - // The nnlib kernel to compute quantized linear via matmul. 
- xa_nn_matmul_asym8uxasym8u_asym8u( - out_data, - weight_data, - in_data, - bias_data, - out_dim, - in_dim, - in_dim, - leading_dims, - in_dim, - out_dim, - 1, - -weight_zero_point_t.const_data_ptr()[0], - -src_zero_point, - out_multiplier.const_data_ptr()[0], - out_shift.const_data_ptr()[0], - out_zero_point); - } else if (out.scalar_type() == executorch::aten::ScalarType::Char) { - const int8_t* __restrict__ in_data = src.const_data_ptr(); - const int8_t* __restrict__ weight_data = weight.const_data_ptr(); - const int32_t* __restrict__ bias_data = bias.const_data_ptr(); - int8_t* __restrict__ out_data = out.mutable_data_ptr(); - - xa_nn_matmul_asym8sxasym8s_asym8s( + const int64_t leading_dims = getLeadingDims(in, in.dim() - 1); + const int64_t out_dim = weight.size(0); // = out_dim + const int64_t in_dim = weight.size(1); // = in_dim + const uint8_t* __restrict__ in_data = in.const_data_ptr(); + const uint8_t* __restrict__ weight_data = weight.const_data_ptr(); + const int32_t* __restrict__ bias_data = bias.const_data_ptr(); + uint8_t* __restrict__ out_data = out.mutable_data_ptr(); + int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u( out_data, weight_data, in_data, @@ -84,17 +59,12 @@ void quantized_linear_out( in_dim, out_dim, 1, - -weight_zero_point_t.const_data_ptr()[0], - -src_zero_point, + -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias + -in_zero_point, // mat2_zero_bias out_multiplier.const_data_ptr()[0], out_shift.const_data_ptr()[0], out_zero_point); - } else { - ET_CHECK_MSG( - false, - "Unhandled input dtype %hhd", - static_cast(src.scalar_type())); - } + ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); } void inline _quantized_linear_asym8s( From f1f0bb3539f5fb829723e65088dfccd2aef88c23 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Fri, 22 Nov 2024 12:19:44 -0800 Subject: [PATCH 06/18] Fixing review comments on PR 6867 --- backends/cadence/aot/functions_hifi.yaml | 12 ++++++------ backends/cadence/hifi/operators/op_maximum.cpp | 3 +++ backends/cadence/hifi/operators/op_minimum.cpp | 2 ++ backends/cadence/hifi/operators/op_pow.cpp | 3 +++ backends/cadence/hifi/operators/op_rsqrt.cpp | 2 ++ 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 8ccd362cbf..b6a2c50001 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -80,7 +80,7 @@ - op: maximum.out kernels: - arg_meta: null - kernel_name: impl::HiFi::maximum_out + kernel_name: cadence::impl::HiFi::maximum_out - op: mean.out kernels: @@ -90,7 +90,7 @@ - op: minimum.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::mean_dim_out + kernel_name: cadence::impl::HiFi::minimum_out - op: mul.out kernels: @@ -105,22 +105,22 @@ - op: pow.Scalar_out kernels: - arg_meta: null - kernel_name: impl::HiFi::pow_Scalar_out + kernel_name: cadence::impl::HiFi::pow_Scalar_out - op: pow.Tensor_Scalar_out kernels: - arg_meta: null - kernel_name: impl::HiFi::pow_Tensor_Scalar_out + kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out - op: pow.Tensor_Tensor_out kernels: - arg_meta: null - kernel_name: impl::HiFi::pow_Tensor_Tensor_out + kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out - op: rsqrt.out kernels: - arg_meta: null - kernel_name: impl::HiFi::rsqrt_out + kernel_name: cadence::impl::HiFi::rsqrt_out - op: sigmoid.out kernels: diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp index 
97578765cf..f9a3658891 100644 --- a/backends/cadence/hifi/operators/op_maximum.cpp +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -23,6 +23,8 @@ using torch::executor::apply_binary_elementwise_fn; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -170,3 +172,4 @@ Tensor& maximum_out( } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_minimum.cpp b/backends/cadence/hifi/operators/op_minimum.cpp index fd9cfe4f95..6f81ad5c3e 100644 --- a/backends/cadence/hifi/operators/op_minimum.cpp +++ b/backends/cadence/hifi/operators/op_minimum.cpp @@ -23,6 +23,7 @@ using torch::executor::apply_binary_elementwise_fn; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -169,3 +170,4 @@ Tensor& minimum_out( } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index 04533b290b..9669e96123 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -26,6 +26,7 @@ using executorch::runtime::promoteTypes; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -349,3 +350,5 @@ Tensor& pow_Scalar_out( } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence + diff --git a/backends/cadence/hifi/operators/op_rsqrt.cpp b/backends/cadence/hifi/operators/op_rsqrt.cpp index c94800aef1..1cf717988a 100644 --- a/backends/cadence/hifi/operators/op_rsqrt.cpp +++ b/backends/cadence/hifi/operators/op_rsqrt.cpp @@ -15,6 +15,7 @@ using exec_aten::ScalarType; using exec_aten::Tensor; using executorch::aten::RuntimeContext; +namespace cadence { namespace impl { namespace HiFi { namespace native { @@ -51,3 +52,4 @@ Tensor& rsqrt_out(RuntimeContext& ctx, const Tensor& in, Tensor& out) { } // namespace native } // namespace HiFi } // namespace impl +} // namespace cadence From f8cf4087cca90f3c1007af62e3cf0cfe6c0b45ae Mon Sep 17 00:00:00 2001 From: dijopaul <87994875+dijopaul@users.noreply.github.com> Date: Fri, 29 Nov 2024 00:56:05 +0530 Subject: [PATCH 07/18] Malloc fix (#39) * Adding cat, full, permute_copy and relu ops (#34) * Adding cat, full, permute_copy * updating relu wrt new ref (#36) * Temporary memory allocation, replacing mallocs (#38) * Integrated temporary mem alloc functionality in place of malloc * Namespace related changes * Cleanup the main application * Adding atan2, softmax, clamp and remainder ops (#37) * Replaced malloc with temp_memory_allocator --------- Co-authored-by: nishpoonia <94543206+nishpoonia@users.noreply.github.com> Co-authored-by: Rushi-cad --- backends/cadence/aot/functions_hifi.yaml | 37 +- backends/cadence/hifi/kernels/CMakeLists.txt | 5 + backends/cadence/hifi/kernels/kernels.cpp | 5 + backends/cadence/hifi/kernels/kernels.h | 61 +- .../cadence/hifi/operators/CMakeLists.txt | 12 +- backends/cadence/hifi/operators/op_atan2.cpp | 201 ++++ backends/cadence/hifi/operators/op_cat.cpp | 158 ++++ backends/cadence/hifi/operators/op_clamp.cpp | 445 +++++++++ backends/cadence/hifi/operators/op_full.cpp | 100 ++ .../cadence/hifi/operators/op_maximum.cpp | 1 - 
backends/cadence/hifi/operators/op_mean.cpp | 4 +- .../hifi/operators/op_permute_copy.cpp | 198 ++++ backends/cadence/hifi/operators/op_pow.cpp | 13 +- .../cadence/hifi/operators/op_remainder.cpp | 258 +++++ .../cadence/hifi/operators/op_softmax.cpp | 195 ++++ backends/cadence/hifi/operators/op_where.cpp | 6 +- .../hifi/operators/quantized_relu_out.cpp | 103 ++ .../hifi/third-party/nnlib/xa_nn_concat_32.c | 172 ++++ .../third-party/nnlib/xa_nn_elm_atan2_f32.c | 882 ++++++++++++++++++ .../nnlib/xa_nn_elm_clamp_f32_broadcast.c | 798 ++++++++++++++++ .../nnlib/xa_nn_elm_remainder_broadcast_f32.c | 525 +++++++++++ .../third-party/nnlib/xa_nn_transpose_32.c | 260 ++++++ .../executor_runner/executor_runner.cpp | 10 +- 23 files changed, 4429 insertions(+), 20 deletions(-) create mode 100644 backends/cadence/hifi/operators/op_atan2.cpp create mode 100644 backends/cadence/hifi/operators/op_cat.cpp create mode 100644 backends/cadence/hifi/operators/op_clamp.cpp create mode 100644 backends/cadence/hifi/operators/op_full.cpp create mode 100644 backends/cadence/hifi/operators/op_permute_copy.cpp create mode 100644 backends/cadence/hifi/operators/op_remainder.cpp create mode 100644 backends/cadence/hifi/operators/op_softmax.cpp create mode 100644 backends/cadence/hifi/operators/quantized_relu_out.cpp create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c create mode 100644 backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index b6a2c50001..5c53f7e7ae 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -20,7 +20,12 @@ - op: _softmax.out kernels: - arg_meta: null - kernel_name: torch::executor::softmax_out + kernel_name: cadence::impl::HiFi::softmax_out + +- op: atan2.out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::atan2_out - op: add.out kernels: @@ -35,7 +40,12 @@ - op: cat.out kernels: - arg_meta: null - kernel_name: torch::executor::cat_out + kernel_name: cadence::impl::HiFi::cat_out + +- op: clamp.Tensor_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::clamp_tensor_out - op: clone.out kernels: @@ -60,7 +70,12 @@ - op: full.out kernels: - arg_meta: null - kernel_name: torch::executor::full_out + kernel_name: cadence::impl::HiFi::full_out + +- op: gt.Scalar_out + kernels: + - arg_meta: null + kernel_name: torch::executor::gt_scalar_out - op: gelu.out kernels: @@ -85,7 +100,7 @@ - op: mean.out kernels: - arg_meta: null - kernel_name: cadence::impl::HiFi::mean_dim_out + kernel_name: cadence::impl::HiFi::mean_dim_out - op: minimum.out kernels: @@ -100,7 +115,7 @@ - op: permute_copy.out kernels: - arg_meta: null - kernel_name: torch::executor::permute_copy_out + kernel_name: cadence::impl::HiFi::permute_copy_out - op: pow.Scalar_out kernels: @@ -117,6 +132,11 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out +- op: remainder.Tensor_out + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::remainder_Tensor_out + - op: rsqrt.out kernels: - arg_meta: null @@ -170,7 +190,6 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::dequantize_per_tensor_out - - func: 
cadence::quantized_layer_norm.out(Tensor input, Tensor in_scale, Tensor in_zero_point, int[] normalized_shape, Tensor weight, Tensor bias, float eps, float output_scale, int output_zero_point, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null @@ -184,6 +203,12 @@ kernels: - arg_meta: null kernel_name: cadence::impl::HiFi::quantized_linear_out + +- func: cadence::quantized_relu.out(Tensor X, Tensor X_zero_point, int out_zero_point, Tensor out_multiplier, Tensor out_shift, *, Tensor(a!) out) -> Tensor(a!) + kernels: + - arg_meta: null + kernel_name: cadence::impl::HiFi::quantized_relu_out + - func: cadence::quantized_linear.per_tensor_out(Tensor src, Tensor weight, Tensor bias, SymInt src_zero_point, SymInt weight_zero_point, SymInt out_multiplier, SymInt out_shift, SymInt out_zero_point, Tensor? offset, *, Tensor(a!) out) -> Tensor(a!) kernels: - arg_meta: null diff --git a/backends/cadence/hifi/kernels/CMakeLists.txt b/backends/cadence/hifi/kernels/CMakeLists.txt index 3d321443f8..9bbd386c75 100644 --- a/backends/cadence/hifi/kernels/CMakeLists.txt +++ b/backends/cadence/hifi/kernels/CMakeLists.txt @@ -10,14 +10,19 @@ add_library( kernels.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/matmul_asym8uxasym8u_asym8u.cpp ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_broadcast_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_add_f32_broadcast.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_clamp_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_div_mode_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_mul_f32_broadcast.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_pow_f32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_where_f32xf32_f32.c ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_reduce_32_32.c + ${EXECUTORCH_ROOT}/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c ) # Let files say "include ". set(_common_include_directories ${EXECUTORCH_ROOT}/..) diff --git a/backends/cadence/hifi/kernels/kernels.cpp b/backends/cadence/hifi/kernels/kernels.cpp index 1b335c846b..bf4a2d143f 100644 --- a/backends/cadence/hifi/kernels/kernels.cpp +++ b/backends/cadence/hifi/kernels/kernels.cpp @@ -20,6 +20,11 @@ memcpy(void* dst, const void* src, size_t num_bytes) { MEMCPY_8b(dst, src, num_bytes); } +void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size) { + Result temp_mem_res = ctx.allocate_temp(size); + return temp_mem_res.ok() ? 
temp_mem_res.get() : nullptr; +} + // Quantize a fp32 value to an int8_t/uint8_t value template __attribute__((always_inline)) T diff --git a/backends/cadence/hifi/kernels/kernels.h b/backends/cadence/hifi/kernels/kernels.h index 10927adc2a..c5795a617a 100644 --- a/backends/cadence/hifi/kernels/kernels.h +++ b/backends/cadence/hifi/kernels/kernels.h @@ -14,8 +14,12 @@ /* For NNLIB APIs */ #include "xa_nnlib_kernels_api.h" -/* Potential NNLIB function/APIs */ +#include + +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::Result; +/* Potential NNLIB function/APIs */ extern "C" WORD32 xa_nn_broadcast_32_32( WORD32* __restrict__ p_out, const int* const out_shape, @@ -23,6 +27,16 @@ extern "C" WORD32 xa_nn_broadcast_32_32( const int* const in_shape, int num_dims); +extern "C" WORD32 xa_nn_concat_32_32( + WORD32* __restrict__ p_out, + const WORD32* const p_out_shape, + const WORD32** pp_inps, + const WORD32* const* pp_inps_shape, + WORD32 num_out_dims, + WORD32 num_inp, + WORD32 num_inp_dims, + WORD32 axis); + extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -31,6 +45,26 @@ extern "C" WORD32 xa_nn_elm_add_broadcast_4D_f32xf32_f32( const FLOAT32* __restrict__ p_inp2, const WORD32* const p_inp2_shape); +extern "C" void +xa_nn_elm_atan2_f32(FLOAT32* z, const FLOAT32* y, const FLOAT32* x, WORD32 N); + +extern "C" WORD32 xa_nn_elm_clamp_f32xf32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp, + const FLOAT32* __restrict__ p_min, + const FLOAT32* __restrict__ p_max, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp, + const WORD32* const p_inp_shape, + const FLOAT32* __restrict__ p_min, + const WORD32* const p_min_shape, + const FLOAT32* __restrict__ p_max, + const WORD32* const p_max_shape); + extern "C" WORD32 xa_nn_elm_div_broadcast_4D_f32xf32_f32( FLOAT32* __restrict__ p_out, const WORD32* const p_out_shape, @@ -97,6 +131,20 @@ extern "C" void xa_nn_elm_pow_f32( const FLOAT32* restrict y, WORD32 N); +extern "C" WORD32 xa_nn_elm_remainder_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const FLOAT32* __restrict__ p_inp1, + const FLOAT32* __restrict__ p_inp2, + WORD32 num_elm); + +extern "C" WORD32 xa_nn_elm_remainder_broadcast_4D_f32xf32_f32( + FLOAT32* __restrict__ p_out, + const WORD32* const p_out_shape, + const FLOAT32* __restrict__ p_inp1, + const WORD32* const p_inp1_shape, + const FLOAT32* __restrict__ p_inp2, + const WORD32* const p_inp2_shape); + extern "C" WORD32 xa_nn_elm_where_f32xf32_f32( FLOAT32* __restrict__ p_out, const FLOAT32* __restrict__ p_inp1, @@ -125,11 +173,22 @@ extern "C" WORD32 xa_nn_reduce_mean_4D_f32_f32( WORD32 num_axis_dims, void* __restrict__ p_scratch_in); +extern "C" WORD32 xa_nn_transpose_32_32( + WORD32* __restrict__ p_out, + const WORD32* const p_out_shape, + const WORD32* __restrict__ p_inp, + const WORD32* const p_inp_shape, + const WORD32* __restrict__ p_permute_vec, + WORD32 num_out_dims, + WORD32 num_inp_dims); + namespace cadence { namespace impl { namespace HiFi { namespace kernels { +void* allocate_temp_memory(KernelRuntimeContext& ctx, size_t size); + void memcpy(void* dst, const void* src, size_t num_bytes); WORD32 matmul_asym8uxasym8u_asym8u( diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index 5e51f7fd3b..ab5a04897e 100644 --- 
a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -21,28 +21,34 @@ endif() # ATen compliant ops that are needed to run this model. set(_aten_ops__srcs "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_add.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_atan2.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_cat.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_clamp.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_div.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_full.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_maximum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mean.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_minimum.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_mul.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_pow.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_remainder.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_rsqrt.cpp" + "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_softmax.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sigmoid.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_sub.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_tanh.cpp" "${EXECUTORCH_ROOT}/backends/cadence/hifi/operators/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_bmm.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_cat.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp" + "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gt.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_softmax.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" @@ -74,7 +80,7 @@ target_include_directories( # Custom ops that are needed to run the test model. add_library( custom_ops "quantized_linear_out.cpp" "quantized_layer_norm.cpp" - "quantize_per_tensor.cpp" "dequantize_per_tensor.cpp" + "quantize_per_tensor.cpp" "quantized_relu_out.cpp" "dequantize_per_tensor.cpp" ) target_include_directories( custom_ops PUBLIC ${ROOT_DIR}/.. ${CMAKE_BINARY_DIR} diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp new file mode 100644 index 0000000000..db2fc23be1 --- /dev/null +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -0,0 +1,201 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
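The operator implementations added below share a common structure: gate on what the vectorized NNLIB kernels support (usually a float dtype and a bounded rank), run the xa_nn_* routine when the gate passes, and otherwise fall back to the portable ET_SWITCH_* element-wise path. A minimal sketch of that gate, assuming an operator with output tensor `out`; `run_nnlib_kernel` and `run_portable_fallback` are placeholders for the per-operator code shown in the diffs that follow (the dimension limit is 16 for atan2/cat/permute/softmax and 4 for the broadcast-based clamp/remainder paths):

    // Sketch only: the dtype/rank gate used by the HiFi operators below.
    bool optimized = true;
    if (out.scalar_type() != ScalarType::Float)
      optimized = false;            // NNLIB float kernels only (permute also handles 8-bit)
    if (max_dim > kNnlibMaxDim)
      optimized = false;            // per-operator dimension limit
    if (optimized) {
      run_nnlib_kernel();           // xa_nn_* call, result checked via ET_KERNEL_CHECK
      return out;
    }
    run_portable_fallback();        // ET_SWITCH_* element-wise loop
    return out;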
+ */ + +#include +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::tensors_have_same_dim_order; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +Tensor& atan2_out( + KernelRuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType out_type = out.scalar_type(); + + constexpr auto name = "atan2.out"; + constexpr int kNnlibMaxDim = 16; + int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); + bool optimized = true; + + const bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + const bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + const bool broadcast = (a_is_broadcasted && b_is_broadcasted); + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? out.dim() : max_dim; + + if (out_type != ScalarType::Float) + optimized = false; + + if (max_dim > kNnlibMaxDim) + optimized = false; + + WORD32 num_elm = out.numel(); + + if (optimized) { + if (broadcast) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(WORD32)); + WORD32* __restrict__ ptr2 = + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)a.const_data_ptr(); + WORD32* __restrict__ pin2 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + WORD32 p_inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + for (int i = 0; i < b_dim; i++) + p_inp2_shape[i] = b.size(i); + + WORD32 ret_val = + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + ret_val = + xa_nn_broadcast_32_32(ptr2, p_out_shape, pin2, p_inp2_shape, out_dim); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr2; + + xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + free(ptr2); + } else if (a_is_broadcasted && (!b_is_broadcasted)) { + FLOAT32* __restrict__ ptr1 = + (FLOAT32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(WORD32)); + + FLOAT32* __restrict__ pin1 = + (FLOAT32* __restrict__)a.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < a_dim; i++) + p_inp1_shape[i] = a.size(i); + + WORD32 ret_val = xa_nn_broadcast_32_32( + (WORD32*)ptr1, p_out_shape, (WORD32*)pin1, p_inp1_shape, out_dim); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const 
FLOAT32* __restrict__ p_inp1 = (const FLOAT32* __restrict__)ptr1; + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else if (b_is_broadcasted && (!a_is_broadcasted)) { + WORD32* __restrict__ ptr1 = + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(WORD32)); + + WORD32* __restrict__ pin1 = + (WORD32* __restrict__)b.const_data_ptr(); + + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + + for (int i = 0; i < out_dim; i++) + p_out_shape[i] = out.size(i); + for (int i = 0; i < b_dim; i++) + p_inp1_shape[i] = b.size(i); + + xa_nn_broadcast_32_32(ptr1, p_out_shape, pin1, p_inp1_shape, out_dim); + + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = (const FLOAT32* __restrict__)ptr1; + + xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm); + + free(ptr1); + } else { + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm); + } + return out; + } + + ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { + ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { + ET_SWITCH_FLOATH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + torch::executor:: + apply_binary_elementwise_fn( + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_OUT casted_a = static_cast(val_a); + CTYPE_OUT casted_b = static_cast(val_b); + return static_cast(std::atan2(casted_a, casted_b)); + }, + a, + b, + out); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp new file mode 100644 index 0000000000..1a62892445 --- /dev/null +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
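When both inputs need broadcasting, the atan2 path above first materializes each operand at the output shape in scratch memory obtained from the kernel context, then runs the flat element-wise kernel once over out.numel() elements. A condensed sketch of that sequence, assuming the shape arrays (sized to the operator's kNnlibMaxDim) have already been filled from the tensor sizes and with error handling omitted:

    // Illustrative sketch of the broadcast-then-elementwise pattern.
    WORD32* tmp_a = (WORD32*)kernels::allocate_temp_memory(
        ctx, out.numel() * sizeof(WORD32));
    WORD32* tmp_b = (WORD32*)kernels::allocate_temp_memory(
        ctx, out.numel() * sizeof(WORD32));
    xa_nn_broadcast_32_32(
        tmp_a, out_shape, (WORD32*)a.const_data_ptr<float>(), a_shape, out.dim());
    xa_nn_broadcast_32_32(
        tmp_b, out_shape, (WORD32*)b.const_data_ptr<float>(), b_shape, out.dim());
    xa_nn_elm_atan2_f32(
        out.mutable_data_ptr<float>(),
        (const FLOAT32*)tmp_a,
        (const FLOAT32*)tmp_b,
        out.numel());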
+ */ + +#include +#include +#include + +#include + +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::getLeadingDims; +using executorch::runtime::getTrailingDims; +using executorch::runtime::resize_tensor; +using executorch::runtime::tensors_have_same_dim_order; +using torch::executor::check_cat_args; +using torch::executor::Error; +using torch::executor::get_cat_out_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +Tensor& cat_out( + RuntimeContext& ctx, + exec_aten::ArrayRef tensors, + int64_t dim, + Tensor& out) { + constexpr auto name = "cat.out"; + constexpr int kNnlibMaxDim = 16; + + bool optimized = true; + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (optimized) { + WORD32 num_inp = tensors.size(); + WORD32 num_inp_dims = out.dim(); + WORD32 num_out_dims = num_inp_dims; + WORD32 axis = dim; + + WORD32 inp_shape[kNnlibMaxDim][kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + + WORD32* ptr_shape[kNnlibMaxDim]; + const WORD32* ptr[kNnlibMaxDim]; + + int k = 0; + for (int i = 0; i < num_inp; i++) { + if (tensors[i].numel() == 0) + continue; + ptr[k] = (const WORD32*)tensors[i].const_data_ptr(); + for (int j = 0; j < num_inp_dims; j++) { + inp_shape[k][j] = tensors[i].size(j); + } + ptr_shape[k] = inp_shape[k]; + k++; + } + + num_inp = k; + + for (int i = 0; i < num_out_dims; i++) { + p_out_shape[i] = out.size(i); + } + + const WORD32** pp_inps = &ptr[0]; + + WORD32* p_out = (WORD32*)out.mutable_data_ptr(); + + const WORD32* const* pp_inps_shape = (const WORD32* const*)&ptr_shape[0]; + + WORD32 ret_val = xa_nn_concat_32_32( + p_out, + p_out_shape, + pp_inps, + pp_inps_shape, + num_out_dims, + num_inp, + num_inp_dims, + axis); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + return out; + } + + if (dim < 0) { + dim += out.dim(); + } + + ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), Internal, out); + + Tensor::SizesType + expected_out_size[executorch::runtime::kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + // Special handling when all inputs are 1D-empty tensors for aten consistency + // In that case, just return an 1D-empty tensor without checking dim + bool all_1d_empty = true; + for (size_t i = 0; i < tensors.size(); ++i) { + if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { + all_1d_empty = false; + break; + } + } + if (all_1d_empty) { + return out; + } + + const size_t outer = getLeadingDims(out, dim); + const size_t dim_stride = getTrailingDims(out, dim); + const size_t ninputs = tensors.size(); + + const auto out_type = out.scalar_type(); + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&] { + CTYPE_OUT* out_ptr = out.mutable_data_ptr(); + for (size_t i = 0; i < outer; ++i) { + for (size_t j = 0; j < ninputs; ++j) { + const auto in_type = tensors[j].scalar_type(); + ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&] { + if (tensors[j].numel() == 0) { + return; + } + size_t inner = tensors[j].size(dim) * dim_stride; + const CTYPE_IN* const in_ptr = + tensors[j].const_data_ptr() + i * inner; + + for (size_t k = 0; k < inner; ++k) { + out_ptr[k] = static_cast(in_ptr[k]); + } + out_ptr += inner; + }); + } + } + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} 
// namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp new file mode 100644 index 0000000000..290c4d087d --- /dev/null +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -0,0 +1,445 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +using Scalar = exec_aten::Scalar; +using ScalarType = exec_aten::ScalarType; +using Tensor = exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::canCast; +using executorch::runtime::isFloatingType; +using executorch::runtime::isIntegralType; +using executorch::runtime::promoteTypes; +using torch::executor::apply_ternary_elementwise_fn; +using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; +using torch::executor::native::utils::extract_scalar; +using torch::executor::native::utils::get_scalar_dtype; +using torch::executor::native::utils::max_override; +using torch::executor::native::utils::min_override; +using torch::executor::native::utils::promote_type_with_scalar; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { + +template +/** Check if val, when cast to CTYPE_CAST, is not in the range of CTYPE_OUT */ +bool is_out_of_bounds(CTYPE_VAL val) { + const CTYPE_CAST val_cast = static_cast(val); + return val_cast < std::numeric_limits::lowest() || + val_cast > std::numeric_limits::max(); +} + +__ET_NODISCARD bool check_bounds( + const Scalar& val_scalar, + const ScalarType& val_type, + const ScalarType& out_type, + const char* val_name) { + auto is_valid = true; + + ET_SWITCH_SCALAR_OBJ_TYPES(val_type, ctx, "clamp.out", CTYPE_VAL, [&]() { + CTYPE_VAL val = 0; + extract_scalar(val_scalar, &val); + if (isIntegralType(out_type, /*includeBool=*/false)) { + ET_SWITCH_INT_TYPES(out_type, ctx, "clamp.out", CTYPE_OUT, [&]() { + if (is_out_of_bounds(val)) { + ET_LOG(Error, "%s value out of bounds", val_name); + is_valid = false; + } + }); + } else if (isFloatingType(out_type)) { + ET_SWITCH_FLOATH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { + if (std::isfinite(val) && + is_out_of_bounds(val)) { + ET_LOG(Error, "%s value out of bounds", val_name); + is_valid = false; + } + }); + } + }); + + return is_valid; +} + +} // namespace + +Tensor& clamp_out( + RuntimeContext& ctx, + const Tensor& in, + const exec_aten::optional& min_opt, + const exec_aten::optional& max_opt, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, in.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType in_type = in.scalar_type(); + ScalarType min_type = in_type; + ScalarType max_type = in_type; + ScalarType common_type = in_type; + ScalarType out_type = out.scalar_type(); + + bool has_min = min_opt.has_value(); + if (has_min) { + min_type = get_scalar_dtype(min_opt.value()); + common_type = promote_type_with_scalar(common_type, min_opt.value()); + ET_KERNEL_CHECK( + ctx, + check_bounds(min_opt.value(), min_type, out_type, "minimum"), + InvalidArgument, + out); + } + bool has_max = max_opt.has_value(); + if (has_max) { + max_type = get_scalar_dtype(max_opt.value()); + common_type = 
promote_type_with_scalar(common_type, max_opt.value()); + ET_KERNEL_CHECK( + ctx, + check_bounds(max_opt.value(), max_type, out_type, "maximum"), + InvalidArgument, + out); + } + + ET_KERNEL_CHECK_MSG( + ctx, + has_min || has_max, + InvalidArgument, + out, + "At least one of 'min' or 'max' must not be None"); + + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + ET_SWITCH_REALH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { + // Extract optional min value + CTYPE_OUT min = 0; + if (has_min) { + ET_SWITCH_SCALAR_OBJ_TYPES(min_type, ctx, "clamp", CTYPE_MIN, [&]() { + CTYPE_MIN min_val = 0; + extract_scalar(min_opt.value(), &min_val); + min = static_cast(min_val); + }); + } + + // Extract optional max value + CTYPE_OUT max = 0; + if (has_max) { + ET_SWITCH_SCALAR_OBJ_TYPES(max_type, ctx, "clamp", CTYPE_MAX, [&]() { + CTYPE_MAX max_val = 0; + extract_scalar(max_opt.value(), &max_val); + max = static_cast(max_val); + }); + } + + ET_SWITCH_REALHB_TYPES(in_type, ctx, "clamp", CTYPE_IN, [&]() { + torch::executor::apply_unary_map_fn( + [has_min, min, has_max, max](const CTYPE_IN val_in) { + CTYPE_OUT val_out = static_cast(val_in); + if (has_min) { + val_out = max_override(val_out, min); + } + if (has_max) { + val_out = min_override(val_out, max); + } + return val_out; + }, + in.const_data_ptr(), + out.mutable_data_ptr(), + in.numel()); + }); + }); + + return out; +} + +Tensor& clamp_tensor_out( + RuntimeContext& ctx, + const Tensor& in, + const exec_aten::optional& min_opt, + const exec_aten::optional& max_opt, + Tensor& out) { + (void)ctx; + + bool has_min = min_opt.has_value(); + bool has_max = max_opt.has_value(); + + ET_KERNEL_CHECK_MSG( + ctx, + has_min || has_max, + InvalidArgument, + out, + "At least one of 'min' or 'max' must not be None"); + + const Tensor& min = has_min ? min_opt.value() : in; + const Tensor& max = has_max ? max_opt.value() : in; + + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(in, min, max, out) == Error::Ok, + InvalidArgument, + out); + + constexpr int kNnlibMaxDim = + 4; /*fallback to not optimised if broadcast and dim > 4 */ + + ScalarType in_type = in.scalar_type(); + ScalarType min_type = min.scalar_type(); + ScalarType max_type = max.scalar_type(); + ScalarType common_type = in_type; + ScalarType out_type = out.scalar_type(); + + if (has_min) { + common_type = promoteTypes(common_type, min_type, /*half_to_float*/ true); + } + if (has_max) { + common_type = promoteTypes(common_type, max_type, /*half_to_float*/ true); + } + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + bool in_is_broadcasted = !out.sizes().equals(in.sizes()); + bool min_is_broadcasted = !out.sizes().equals(min.sizes()); + bool max_is_broadcasted = !out.sizes().equals(max.sizes()); + bool broadcast = + (in_is_broadcasted || min_is_broadcasted || max_is_broadcasted); + + int max_dim = in.dim() > min.dim() ? in.dim() : min.dim(); + max_dim = max.dim() > max_dim ? max.dim() : max_dim; + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + bool optimized = true; + bool fall_back = false; + if ((in_type != ScalarType::Float) || (min_type != ScalarType::Float) || + (max_type != ScalarType::Float)) + optimized = false; + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if (optimized) { + if (!has_min) { + const float* const max_data = max.const_data_ptr(); + const float* const inp_data = in.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + if (broadcast) { + int out_shape[kNnlibMaxDim]; + int inp_shape[kNnlibMaxDim]; + int max_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp_shape[i] = 1; + max_shape[i] = 1; + } + + int max_dim = max.dim(), inp_dim = in.dim(), out_dim = out.dim(); + int off_o = kNnlibMaxDim - out_dim; + int off_max = kNnlibMaxDim - max_dim; + int off_inp = kNnlibMaxDim - inp_dim; + for (int i = 0; i < out_dim; i++) { + out_shape[i + off_o] = out.size(i); + } + for (int i = 0; i < max_dim; i++) { + max_shape[i + off_max] = max.size(i); + } + for (int i = 0; i < inp_dim; i++) { + inp_shape[i + off_inp] = in.size(i); + } + + WORD32 ret_val = xa_nn_elm_minimum_broadcast_4D_f32xf32_f32( + out_data, out_shape, inp_data, inp_shape, max_data, max_shape); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + } else { + WORD32 ret_val = xa_nn_elm_minimum_f32xf32_f32( + out_data, inp_data, max_data, out.numel()); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } + } else if (!has_max) { + const float* const min_data = min.const_data_ptr(); + const float* const inp_data = in.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + if (broadcast == 1) { + int out_shape[kNnlibMaxDim]; + int inp_shape[kNnlibMaxDim]; + int min_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp_shape[i] = 1; + min_shape[i] = 1; + } + + int min_dim = min.dim(), max_dim = max.dim(), inp_dim = in.dim(), + out_dim = out.dim(); + int off_o = kNnlibMaxDim - out_dim; + int off_min = kNnlibMaxDim - min_dim; + int off_inp = kNnlibMaxDim - inp_dim; + for (int i = 0; i < out_dim; i++) + out_shape[i + off_o] = out.size(i); + for (int i = 0; i < min_dim; i++) + min_shape[i + off_min] = min.size(i); + for (int i = 0; i < inp_dim; i++) + inp_shape[i + off_inp] = in.size(i); + WORD32 ret_val = xa_nn_elm_maximum_broadcast_4D_f32xf32_f32( + out_data, out_shape, inp_data, inp_shape, min_data, min_shape); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + } else { + WORD32 ret_val = xa_nn_elm_maximum_f32xf32_f32( + out_data, inp_data, min_data, out.numel()); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } + } else { + const float* const min_data = min.const_data_ptr(); + const float* const max_data = max.const_data_ptr(); + const float* const inp_data = in.const_data_ptr(); + float* const out_data = out.mutable_data_ptr(); + if (broadcast == 1) { + int out_shape[kNnlibMaxDim]; + int inp_shape[kNnlibMaxDim]; + int min_shape[kNnlibMaxDim]; + int max_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + out_shape[i] = 1; + inp_shape[i] = 1; + min_shape[i] = 1; + max_shape[i] = 1; + } + + int min_dim = min.dim(), max_dim = max.dim(), inp_dim = in.dim(), + out_dim = out.dim(); + int off_o = kNnlibMaxDim - out_dim; + int off_min = kNnlibMaxDim - min_dim; + int off_max = kNnlibMaxDim - max_dim; + int off_inp = kNnlibMaxDim - inp_dim; + for (int i = 0; i < out_dim; i++) + out_shape[i + off_o] = out.size(i); + for (int i = 0; i < min_dim; i++) + min_shape[i + 
off_min] = min.size(i); + + for (int i = 0; i < max_dim; i++) + max_shape[i + off_max] = max.size(i); + + for (int i = 0; i < inp_dim; i++) + inp_shape[i + off_inp] = in.size(i); + + if (inp_shape[0] != out_shape[0] || inp_shape[1] != out_shape[1] || + inp_shape[2] != out_shape[2] || inp_shape[3] != out_shape[3]) { + void* p_scratch = (void*)kernels::allocate_temp_memory( + ctx, + (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) * + sizeof(int)); + const FLOAT32* p_brd_cond = (const FLOAT32*)p_scratch; + xa_nn_broadcast_32_32( + (WORD32*)p_brd_cond, out_shape, (WORD32*)inp_data, inp_shape, 4); + + for (int i = 0; i < 4; i++) { + inp_shape[i] = out_shape[i]; + } + + WORD32 ret_val = xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32( + out_data, + out_shape, + p_brd_cond, + inp_shape, + min_data, + min_shape, + max_data, + max_shape); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + free(p_scratch); + } else { + WORD32 ret_val = xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32( + out_data, + out_shape, + inp_data, + inp_shape, + min_data, + min_shape, + max_data, + max_shape); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } + } else { + WORD32 ret_val = xa_nn_elm_clamp_f32xf32xf32_f32( + out_data, inp_data, min_data, max_data, out.numel()); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } + } + return out; + } + + constexpr auto name = "clamp.Tensor_out"; + + ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { + ET_SWITCH_REALHB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() { + ET_SWITCH_REALHB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { + ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { + apply_ternary_elementwise_fn< + CTYPE_IN, + CTYPE_MIN, + CTYPE_MAX, + CTYPE_OUT>( + [has_min, has_max]( + const CTYPE_IN val_in, + const CTYPE_MIN val_min, + const CTYPE_MAX val_max) { + CTYPE_OUT val_out = static_cast(val_in); + if (has_min) { + val_out = + max_override(val_out, static_cast(val_min)); + } + if (has_max) { + val_out = + min_override(val_out, static_cast(val_max)); + } + return val_out; + }, + in, + min, + max, + out); + }); + }); + }); + }); + return out; +} +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_full.cpp b/backends/cadence/hifi/operators/op_full.cpp new file mode 100644 index 0000000000..47804a64f4 --- /dev/null +++ b/backends/cadence/hifi/operators/op_full.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
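Depending on which bounds are present, the tensor clamp above lowers to existing element-wise primitives: an upper bound alone is a minimum, a lower bound alone is a maximum, and both together use the dedicated clamp kernel. A compact restatement of that dispatch for the float, non-broadcast case:

    // clamp(x, min, max) expressed through the NNLIB element-wise kernels.
    if (!has_min) {
      // only an upper bound: clamp(x, -, max) == min(x, max)
      xa_nn_elm_minimum_f32xf32_f32(out_data, inp_data, max_data, out.numel());
    } else if (!has_max) {
      // only a lower bound: clamp(x, min, -) == max(x, min)
      xa_nn_elm_maximum_f32xf32_f32(out_data, inp_data, min_data, out.numel());
    } else {
      xa_nn_elm_clamp_f32xf32xf32_f32(
          out_data, inp_data, min_data, max_data, out.numel());
    }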
+ */ + +#include +#include +#include +#include + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +using exec_aten::IntArrayRef; +using exec_aten::RuntimeContext; +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using torch::executor::Error; +using torch::executor::native::utils::extract_scalar; +using torch::executor::native::utils::get_scalar_dtype; + +Tensor& full_out( + RuntimeContext& ctx, + const IntArrayRef sizes, + const Scalar& fill_value, + Tensor& out) { + (void)ctx; + + ScalarType val_type = get_scalar_dtype(fill_value); + ScalarType out_type = out.scalar_type(); + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, sizes) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + constexpr auto name = "full.out"; + + bool optimized = false; + if (out_type == ScalarType::Long || out_type == ScalarType::Float || + out_type == ScalarType::Byte || out_type == ScalarType::Char) + optimized = true; + + if (out_type != val_type) + optimized = false; + + if (optimized) { + if (out_type == ScalarType::Long) { + int* data_out = out.mutable_data_ptr(); + int val; + extract_scalar(fill_value, &val); + for (size_t i = 0; i < out.numel(); ++i) { + data_out[i] = val; + } + } else if (out_type == ScalarType::Float) { + float* data_out = out.mutable_data_ptr(); + float val; + extract_scalar(fill_value, &val); + + WORD32 ret_val = xa_nn_memset_f32_f32(data_out, val, out.numel()); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + } else if (out_type == ScalarType::Byte || out_type == ScalarType::Char) { + char* data_out = out.mutable_data_ptr(); + int val; + extract_scalar(fill_value, &val); + memset((void*)data_out, val, out.numel()); + } + return out; + } + + ET_SWITCH_SCALAR_OBJ_TYPES(val_type, ctx, name, CTYPE_VAL, [&] { + CTYPE_VAL val; + extract_scalar(fill_value, &val); + + ET_SWITCH_REAL_TYPES_AND(Bool, out_type, ctx, name, CTYPE_OUT, [&] { + CTYPE_OUT val_casted = static_cast(val); + auto data_out = out.mutable_data_ptr(); + for (size_t i = 0; i < out.numel(); ++i) { + data_out[i] = val_casted; + } + }); + }); + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_maximum.cpp b/backends/cadence/hifi/operators/op_maximum.cpp index f9a3658891..f85d3470e9 100644 --- a/backends/cadence/hifi/operators/op_maximum.cpp +++ b/backends/cadence/hifi/operators/op_maximum.cpp @@ -23,7 +23,6 @@ using torch::executor::apply_binary_elementwise_fn; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; - namespace cadence { namespace impl { namespace HiFi { diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp index 478e10da71..cdc844ec5c 100644 --- a/backends/cadence/hifi/operators/op_mean.cpp +++ b/backends/cadence/hifi/operators/op_mean.cpp @@ -125,7 +125,9 @@ Tensor& mean_dim_out( int scratch_size = xa_nn_reduce_getsize_nhwc( -3, inp_shape, num_inp_dims, p_axis, num_axis_dims, 1); - void* __restrict__ p_scratch_in = (void* __restrict__)malloc(scratch_size); + void* __restrict__ p_scratch_in = + (void* __restrict__)kernels::allocate_temp_memory( + ctx, scratch_size * sizeof(int)); xa_nn_reduce_mean_4D_f32_f32( p_out, diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp new file mode 100644 index 
0000000000..bb72eaf521 --- /dev/null +++ b/backends/cadence/hifi/operators/op_permute_copy.cpp @@ -0,0 +1,198 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include +#include +#include + +using exec_aten::ScalarType; +using exec_aten::SizesType; +using exec_aten::Tensor; +using executorch::runtime::IntArrayRef; +using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::kTensorDimensionLimit; +using executorch::runtime::resize_tensor; +using executorch::runtime::tensors_have_same_dim_order; +using torch::executor::check_permute_copy_args; +using torch::executor::Error; +using torch::executor::get_permute_copy_out_target_size; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +namespace { + +void increment_coordinate_permuted( + const Tensor& tensor, + size_t* const coordinate, + IntArrayRef dims) { + for (int i = dims.size() - 1; i >= 0; i--) { + size_t d = dims[i] >= 0 ? dims[i] : dims[i] + tensor.dim(); + coordinate[d]++; + if (coordinate[d] == tensor.size(d)) { + coordinate[d] = 0; + } else { + return; + } + } +} + +} // namespace + +Tensor& permute_copy_out( + KernelRuntimeContext& ctx, + const Tensor& in, + IntArrayRef dims, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, check_permute_copy_args(in, dims, out), InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); + + Tensor::SizesType expected_out_size[kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_permute_copy_out_target_size( + in, dims, expected_out_size, &expected_out_dim); + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + const auto in_type = out.scalar_type(); + + constexpr auto name = "permute_copy.out"; + constexpr int kNnlibMaxDim = 16; + + bool optimized = false; + + if (out.scalar_type() == ScalarType::Float || + out.scalar_type() == ScalarType::Char || + out.scalar_type() == ScalarType::Byte) + optimized = true; + + if (in.dim() > kNnlibMaxDim) + optimized = false; + + if (optimized) { + if (in_type == ScalarType::Float) { + WORD32* p_inp = (WORD32*)in.const_data_ptr(); + WORD32* p_out = (WORD32*)out.mutable_data_ptr(); + + WORD32 num_inp_dims = in.dim(); + WORD32 num_out_dims = num_inp_dims; + + WORD32 p_inp_shape[kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_permute_vec[kNnlibMaxDim]; + + for (int i = 0; i < num_inp_dims; i++) { + p_inp_shape[i] = in.size(i); + p_out_shape[i] = in.size(dims[i]); + p_permute_vec[i] = dims[i]; + } + + WORD32 ret_val = xa_nn_transpose_32_32( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + } else if (in_type == ScalarType::Char) { + WORD8* p_inp = (WORD8*)in.const_data_ptr(); + WORD8* p_out = (WORD8*)out.mutable_data_ptr(); + + WORD32 num_inp_dims = in.dim(); + WORD32 num_out_dims = num_inp_dims; + + WORD32 p_inp_shape[kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_permute_vec[kNnlibMaxDim]; + + for (int i = 0; i < num_inp_dims; i++) { + p_inp_shape[i] = in.size(i); + p_out_shape[i] = in.size(dims[i]); + p_permute_vec[i] = dims[i]; + } + + WORD32 val = xa_nn_transpose_8_8( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + 
num_inp_dims); + + ET_KERNEL_CHECK(ctx, val == 0, Internal, out); + + } else if (in_type == ScalarType::Byte) { + WORD8* p_inp = (WORD8*)in.const_data_ptr(); + WORD8* p_out = (WORD8*)out.mutable_data_ptr(); + + WORD32 num_inp_dims = in.dim(); + WORD32 num_out_dims = num_inp_dims; + + WORD32 p_inp_shape[kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_permute_vec[kNnlibMaxDim]; + + for (int i = 0; i < num_inp_dims; i++) { + p_inp_shape[i] = in.size(i); + p_out_shape[i] = in.size(dims[i]); + p_permute_vec[i] = dims[i]; + } + + WORD32 val = xa_nn_transpose_8_8( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, val == 0, Internal, out); + } + return out; + } + + size_t in_coord[kTensorDimensionLimit] = {0}; + size_t trailing_dims_memo[kTensorDimensionLimit]; + executorch::runtime::memoizeTrailingDims(in, trailing_dims_memo); + + // in and out must be the same dtype + ET_SWITCH_ALL_TYPES(in_type, ctx, name, CTYPE, [&] { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + for (size_t i = 0; i < out.numel(); ++i) { + out_data[i] = + in_data[executorch::runtime::coordinateToIndexWithTrailingDimsMemo( + in, in_coord, trailing_dims_memo)]; + increment_coordinate_permuted(in, in_coord, dims); + } + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index 9669e96123..74c24afbc0 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -120,9 +120,11 @@ Tensor& pow_Tensor_Tensor_out( if (optimized) { if (broadcast) { WORD32* __restrict__ ptr1 = - (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(int)); WORD32* __restrict__ ptr2 = - (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(int)); WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); @@ -155,7 +157,8 @@ Tensor& pow_Tensor_Tensor_out( free(ptr2); } else if (a_is_broadcasted && (!b_is_broadcasted)) { FLOAT32* __restrict__ ptr1 = - (FLOAT32* __restrict__)malloc((num_elm + 2) * sizeof(WORD32)); + (FLOAT32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(int)); FLOAT32* __restrict__ pin1 = (FLOAT32* __restrict__)a.const_data_ptr(); @@ -182,7 +185,8 @@ Tensor& pow_Tensor_Tensor_out( free(ptr1); } else if (b_is_broadcasted && (!a_is_broadcasted)) { WORD32* __restrict__ ptr1 = - (WORD32* __restrict__)malloc(num_elm * sizeof(WORD32)); + (WORD32* __restrict__)kernels::allocate_temp_memory( + ctx, num_elm * sizeof(int)); WORD32* __restrict__ pin1 = (WORD32* __restrict__)b.const_data_ptr(); @@ -351,4 +355,3 @@ Tensor& pow_Scalar_out( } // namespace HiFi } // namespace impl } // namespace cadence - diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp new file mode 100644 index 0000000000..7fba5a5385 --- /dev/null +++ b/backends/cadence/hifi/operators/op_remainder.cpp @@ -0,0 +1,258 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
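The optimized permute path above is a direct mapping onto the NNLIB transpose: the permutation vector is `dims` itself and the output shape is the input shape re-indexed through it. A small worked example with an assumed [2, 3, 4] float input and dims = {2, 0, 1}:

    // p_permute_vec[i] = dims[i]          -> {2, 0, 1}
    // p_out_shape[i]   = in.size(dims[i]) -> {4, 2, 3}
    WORD32 p_inp_shape[3] = {2, 3, 4};
    WORD32 p_out_shape[3] = {4, 2, 3};
    WORD32 p_permute_vec[3] = {2, 0, 1};
    WORD32 ret_val = xa_nn_transpose_32_32(
        (WORD32*)out.mutable_data_ptr<float>(),
        p_out_shape,
        (WORD32*)in.const_data_ptr<float>(),
        p_inp_shape,
        p_permute_vec,
        /*num_out_dims=*/3,
        /*num_inp_dims=*/3);
    // ret_val == 0 on success; failures surface through ET_KERNEL_CHECK.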
+ */ + +#include + +#include +#include +#include +#include +#include + +#include "kernels.h" + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +using exec_aten::Scalar; +using exec_aten::ScalarType; +using exec_aten::Tensor; +using executorch::aten::RuntimeContext; +using executorch::runtime::canCast; +using executorch::runtime::promoteTypes; +using torch::executor::apply_binary_elementwise_fn; +using torch::executor::apply_unary_map_fn; +using torch::executor::Error; +using torch::executor::native::utils::extract_scalar; +using torch::executor::native::utils::get_scalar_dtype; +using torch::executor::native::utils::promote_type_with_scalar; +using torch::executor::native::utils::remainder_override; +using torch::executor::resize_to_broadcast_target_size; +using executorch::runtime::can_cast; +using executorch::runtime::CppTypeToScalarType; + +namespace { +template < + bool can_cast, + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner { + static void run(const Tensor& a, const Tensor& b, Tensor& out) { + apply_binary_elementwise_fn( + // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) + [](const CTYPE_A val_a, const CTYPE_B val_b) { + CTYPE_IN a_casted = static_cast(val_a); + CTYPE_IN b_casted = static_cast(val_b); + CTYPE_IN value = remainder_override(a_casted, b_casted); + + return static_cast(value); + }, + a, + b, + out); + } +}; + +struct ReportCanCastBug { + static void run(const Tensor&, const Tensor&, Tensor&) { + ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); + } +}; + +template < + typename CTYPE_A, + typename CTYPE_B, + typename CTYPE_IN, + typename CTYPE_OUT> +struct RemainderInner + : public ReportCanCastBug {}; + +} // namespace +Tensor& remainder_Tensor_out( + RuntimeContext& ctx, + const Tensor& a, + const Tensor& b, + Tensor& out) { + (void)ctx; + + constexpr int kNnlibMaxDim = + 4; /*fallback to not optimised if broadcast and dim > 4 */ + + bool a_is_broadcasted = !out.sizes().equals(a.sizes()); + bool b_is_broadcasted = !out.sizes().equals(b.sizes()); + bool broadcast = (a_is_broadcasted || b_is_broadcasted); + + int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); + max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + + bool optimized = true; + + if((a.scalar_type() != ScalarType::Float)||(b.scalar_type() != ScalarType::Float)) + optimized = false; + + if ((broadcast == true) && (max_dim > kNnlibMaxDim)) + optimized = false; + + if(optimized) + { + FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); + const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); + const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); + + if(broadcast) + { + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + WORD32 p_inp2_shape[kNnlibMaxDim]; + + for(int i = 0; i < kNnlibMaxDim; i++) + { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; + } + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for(int i = 0; i < out.dim(); i++) + p_out_shape[i+off_o] = out.size(i); + for(int i = 0; i < a.dim(); i++) + p_inp1_shape[i+off_a] = a.size(i); + for(int i = 0; i < b.dim(); i++) + p_inp2_shape[i+off_b] = b.size(i); + + WORD32 ret_val = xa_nn_elm_remainder_broadcast_4D_f32xf32_f32(p_out, + p_out_shape, + p_inp1, + p_inp1_shape, + p_inp2, + p_inp2_shape); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } + else{ + WORD32 ret_val = xa_nn_elm_remainder_f32xf32_f32(p_out, + p_inp1, + p_inp2, + out.numel()); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } + return out; + } + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "remainder.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "remainder.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES( + out_type, ctx, "remainder.Tensor_out", CTYPE_OUT, [&]() { + RemainderInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); + + return out; +} + +Tensor& remainder_Scalar_out( + RuntimeContext& ctx, + const Tensor& a, + const Scalar& b, + Tensor& out) { + (void)ctx; + + // Resize for dynamic shape + ET_KERNEL_CHECK_MSG( + ctx, + resize_tensor(out, a.sizes()) == Error::Ok, + InvalidArgument, + out, + "Failed to resize output tensor."); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = get_scalar_dtype(b); + ScalarType common_type = promote_type_with_scalar(a_type, b); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "remainder.Scalar_out", CTYPE_A, [&]() { + ET_SWITCH_SCALAR_OBJ_TYPES( + b_type, ctx, "remainder.Scalar_out", CTYPE_B, [&]() { + CTYPE_B val_b = 0; + extract_scalar(b, &val_b); + ET_SWITCH_REAL_TYPES( + common_type, ctx, "remainder.Scalar_out", CTYPE_IN, [&]() { + ET_SWITCH_REAL_TYPES( + out_type, + ctx, + "remainder.Scalar_out", + CTYPE_OUT, + [&]() { + apply_unary_map_fn( + [val_b](const CTYPE_A val_a) { + CTYPE_IN a_casted = + static_cast(val_a); + CTYPE_IN b_casted = + 
static_cast(val_b); + CTYPE_IN value = remainder_override( + a_casted, b_casted); + + return static_cast(value); + }, + a.const_data_ptr(), + out.mutable_data_ptr(), + out.numel()); + }); + }); + }); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp new file mode 100644 index 0000000000..a2068fd15b --- /dev/null +++ b/backends/cadence/hifi/operators/op_softmax.cpp @@ -0,0 +1,195 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +#include + +#include +#include +#include +#include +#include "kernels.h" + +using Tensor = exec_aten::Tensor; +using exec_aten::ScalarType; +using executorch::runtime::KernelRuntimeContext; +using torch::executor::Error; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +Tensor& softmax_out( + KernelRuntimeContext& ctx, + const Tensor& in, + int64_t dim, + bool half_to_float, + Tensor& out) { + (void)ctx; + + ET_KERNEL_CHECK( + ctx, + torch::executor::check_softmax_args(in, dim, half_to_float, out), + InvalidArgument, + out); + + ET_KERNEL_CHECK( + ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); + + ET_KERNEL_CHECK( + ctx, + executorch::runtime::tensors_have_same_dim_order(in, out), + InvalidArgument, + out); + + // Adjust for negative dim + dim = dim < 0 ? dim + executorch::runtime::nonzero_dim(in) : dim; + + const exec_aten::optional& dim_t = dim; + const size_t d = ET_NORMALIZE_IX(dim_t.value(), in.dim()); + const size_t size = in.size(d); + + size_t stride = 1, outer_size = 1; + + size_t outer_stride = 1; + + constexpr auto name = "_softmax.out"; + constexpr int kNnlibMaxDim = 16; + + bool optimized = true; + + if (out.scalar_type() != ScalarType::Float) + optimized = false; + + if (in.dim() > kNnlibMaxDim) + optimized = false; + + if (optimized) { + int* p_inp = (int*)in.const_data_ptr(); + int* out_data = (int*)out.mutable_data_ptr(); + + int num_inp_dims = in.dim(); + int num_out_dims = num_inp_dims; + + int p_inp_shape[kNnlibMaxDim]; + int p_out_shape[kNnlibMaxDim]; + int p_permute_vec[kNnlibMaxDim]; + + for (int i = 0; i < num_inp_dims; i++) + p_inp_shape[i] = in.size(i); + + for (int i = 0; i < num_inp_dims; i++) { + if (i == d) + p_permute_vec[i] = num_inp_dims - 1; + else if (i == (num_inp_dims - 1)) + p_permute_vec[num_inp_dims - 1] = d; + else + p_permute_vec[i] = i; + + p_out_shape[i] = p_inp_shape[p_permute_vec[i]]; + + if (i != d) + outer_size = outer_size * p_inp_shape[i]; + } + + outer_stride = size; + + int* p_out = + (int*)kernels::allocate_temp_memory(ctx, out.numel() * sizeof(int)); + int* p_out1 = + (int*)kernels::allocate_temp_memory(ctx, out.numel() * sizeof(int)); + + WORD32 ret_val = xa_nn_transpose_32_32( + p_out, + p_out_shape, + p_inp, + p_inp_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + for (size_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + size_t outer = outer_idx * outer_stride; + for (size_t inner_idx = 0; inner_idx < stride; ++inner_idx) { + size_t base = outer + inner_idx; + + float* p_in_data = (float*)&p_out[base]; + float* p_out_data = (float*)&p_out1[base]; + + ret_val = xa_nn_vec_softmax_f32_f32(p_out_data, p_in_data, size); + + 
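/*
 * Context for the call above (explanatory note, not NNLIB code): the first
 * xa_nn_transpose_32_32 moved the softmax dimension to the innermost
 * position, so with stride == 1 the scratch buffer holds outer_size
 * contiguous rows of `size` floats. xa_nn_vec_softmax_f32_f32 normalizes
 * one such row per iteration, and the transpose after the loop restores
 * the original layout into the output tensor.
 */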
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } + } + + ret_val = xa_nn_transpose_32_32( + out_data, + p_inp_shape, + p_out1, + p_out_shape, + p_permute_vec, + num_out_dims, + num_inp_dims); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + + return out; + } + + ET_SWITCH_FLOATH_TYPES(in.scalar_type(), ctx, name, CTYPE, [&]() { + const CTYPE* const in_data = in.const_data_ptr(); + CTYPE* const out_data = out.mutable_data_ptr(); + + torch::executor::apply_over_dim( + [in_data, out_data]( + const size_t size, const size_t stride, const size_t base) { + // calculate max in softmax dim. During softmax computation each + // value is subtracted by the maximum in value before calling exp + // to preserve numerical stability. + const CTYPE max_in = torch::executor::apply_unary_reduce_fn( + [](const CTYPE val_in, CTYPE val_accum) { + return std::max(val_in, val_accum); + }, + in_data + base, + size, + stride); + + const CTYPE temp_sum = + torch::executor::apply_unary_map_reduce_fn( + [max_in](const CTYPE val_in) { + return std::exp(val_in - max_in); + }, + [](const CTYPE mapped_in, CTYPE val_accum) { + return val_accum + mapped_in; + }, + in_data + base, + size, + stride); + + torch::executor::apply_unary_map_fn( + [max_in, temp_sum](const CTYPE val_in) { + return std::exp(val_in - max_in) / temp_sum; + }, + in_data + base, + out_data + base, + size, + stride); + }, + in, + dim); + }); + + return out; +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index 06bd0bc3c9..c4ad8177cf 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -109,8 +109,10 @@ Tensor& where_out( if (con_shape[0] != out_shape[0] || con_shape[1] != out_shape[1] || con_shape[2] != out_shape[2] || con_shape[3] != out_shape[3]) { - void* p_scratch = - malloc(out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]); + void* p_scratch = (void*)kernels::allocate_temp_memory( + ctx, + (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) * + sizeof(int)); const unsigned char* p_brd_cond = (const unsigned char*)p_scratch; xa_nn_broadcast_8_8( (WORD8* __restrict__)p_brd_cond, diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp new file mode 100644 index 0000000000..6b7fae6e05 --- /dev/null +++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp @@ -0,0 +1,103 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
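The where_out change above applies the same temp-allocator pattern to the condition mask: when the boolean condition does not already match the output shape, it is broadcast into scratch memory first and the flat selection kernel then reads one mask byte per output element. A reduced sketch, assuming 4-D shape arrays already filled from the tensors and treating the xa_nn_broadcast_8_8 argument order as analogous to xa_nn_broadcast_32_32:

    // Sketch: broadcast a bool condition to the output shape, then select.
    unsigned char* brd_cond = (unsigned char*)kernels::allocate_temp_memory(
        ctx, out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]);
    xa_nn_broadcast_8_8(
        (WORD8*)brd_cond, out_shape,
        (WORD8*)cond.const_data_ptr<bool>(), con_shape, 4);
    xa_nn_elm_where_f32xf32_f32(
        out.mutable_data_ptr<float>(),
        a.const_data_ptr<float>(),
        b.const_data_ptr<float>(),
        brd_cond,
        out.numel());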
+ */ + +#include +#include + +using Tensor = exec_aten::Tensor; +using KernelRuntimeContext = torch::executor::KernelRuntimeContext; +using ScalarType = exec_aten::ScalarType; + +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + +template +void quantized_relu_( + const Tensor& input, + const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + T q_zero_point = in_zero_point.const_data_ptr()[0]; + const T* __restrict__ in = input.const_data_ptr(); + T* __restrict__ out = output.mutable_data_ptr(); + + const int32_t* __restrict__ out_multiplier_data = + out_multiplier.const_data_ptr(); + const int32_t* __restrict__ out_shift_data = + out_shift.const_data_ptr(); + + // Compute the out_scale from out_multiplier and out_shift + const float out_scale = + -out_multiplier_data[0] * 1.0 / (1 << 31) * pow(2, out_shift_data[0]); + + for (size_t i = 0, e = input.numel(); i < e; ++i) { + float temp = in[i] > q_zero_point ? (in[i] - q_zero_point) : 0; + out[i] = kernels::quantize(temp, out_scale, (int32_t)out_zero_point); + } +} + +void quantized_relu_out( + KernelRuntimeContext& ctx, + const Tensor& input, + const Tensor& in_zero_point, + const int64_t out_zero_point, + const Tensor& out_multiplier, + const Tensor& out_shift, + Tensor& output) { + if (input.scalar_type() == executorch::aten::ScalarType::Byte) { + const uint8_t* p_in = input.const_data_ptr(); + uint8_t* p_out = output.mutable_data_ptr(); + uint8_t q_zero_point = in_zero_point.const_data_ptr()[0]; + + WORD32 ret_val = xa_nn_vec_relu_asym8u_asym8u( + p_out, + p_in, + (int)q_zero_point, + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + (int)out_zero_point, + (int)out_zero_point, + 255, + input.numel()); + + ET_CHECK_MSG(ret_val == 0, "An internal error occured"); + + } else if (input.scalar_type() == executorch::aten::ScalarType::Char) { + const int8_t* p_in = input.const_data_ptr(); + int8_t* p_out = output.mutable_data_ptr(); + int8_t q_zero_point = in_zero_point.const_data_ptr()[0]; + + WORD32 ret_val = xa_nn_vec_relu_asym8s_asym8s( + p_out, + p_in, + (int)q_zero_point, + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + (int)out_zero_point, + (int)out_zero_point, + 127, + input.numel()); + + ET_CHECK_MSG(ret_val == 0, "An internal error occured"); + + } else { + ET_CHECK_MSG( + false, + "Unhandled input dtype %hhd", + static_cast(input.scalar_type())); + } +} + +} // namespace native +} // namespace HiFi +} // namespace impl +} // namespace cadence \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c new file mode 100644 index 0000000000..244f404d2e --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_concat_32.c @@ -0,0 +1,172 @@ +#include "xa_type_def.h" +#include "xa_nn_common.h" +#include "xa_nnlib_kernels_api.h" +#include "xa_nnlib_common_macros.h" +#include "xa_nnlib_err_chk.h" +#include "xa_nnlib_common.h" + +WORD32 xa_nn_concat_32_32(WORD32 * __restrict__ p_out + ,const WORD32 *const p_out_shape + ,const WORD32 **pp_inps + ,const WORD32 *const *pp_inps_shape + ,WORD32 num_out_dims + ,WORD32 num_inp + ,WORD32 num_inp_dims + ,WORD32 axis) +{ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(pp_inps, -1); + XA_NNLIB_ARG_CHK_PTR(pp_inps_shape, -1); + /* Pointer alignment checks */ + 
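/*
 * Summary of the checks that follow: pointers must be word-aligned, the
 * tensors may have between 1 and 6 dimensions, between 1 and 10 inputs are
 * accepted, input rank must equal output rank, every input must match the
 * output shape on all axes except `axis`, and the output extent along
 * `axis` must equal the sum of the inputs' extents there. A negative axis
 * is normalized by adding num_out_dims.
 */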
XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps, sizeof(WORD32 *), -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape, sizeof(WORD32 *), -1); + //Validate Arguments + XA_NNLIB_ARG_CHK_COND((num_out_dims <= 0 || num_out_dims > 6), -1); + XA_NNLIB_ARG_CHK_COND((num_inp <= 0 || num_inp > 10), -1); + XA_NNLIB_ARG_CHK_COND((num_inp_dims != num_out_dims), -1); + XA_NNLIB_ARG_CHK_COND((axis < -num_out_dims || axis >= num_out_dims), -1); + + int i = 0, j = 0; + for(i = 0; i < num_out_dims; i++) + { + XA_NNLIB_ARG_CHK_COND((p_out_shape[i] <= 0), -1); + } + + if(axis < 0) + axis = num_out_dims + axis; + + WORD32 concat_size = 0; + for (i = 0; i < num_inp; i++) + { + XA_NNLIB_ARG_CHK_PTR(pp_inps[i], -1); + XA_NNLIB_ARG_CHK_PTR(pp_inps_shape[i], -1); + XA_NNLIB_ARG_CHK_ALIGN(pp_inps_shape[i], sizeof(WORD32), -1); +#pragma loop_count min=1 + for(j = 0; j < num_out_dims; j++) + { + XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][j] != p_out_shape[j] && j != axis), -1); + } + + XA_NNLIB_ARG_CHK_COND((pp_inps_shape[i][axis] <= 0), -1); + concat_size += pp_inps_shape[i][axis]; + } + + XA_NNLIB_ARG_CHK_COND((p_out_shape[axis] != concat_size), -1); + + //Calculate outer and inner size for axis + WORD32 outer_size = 1; +#pragma no_simd + for(int i = 0; i < axis; i++) + { + outer_size *= p_out_shape[i]; + } + + WORD32 base_inner_size = 1; +#pragma no_simd + for(int i = axis + 1; i < num_out_dims; i++) + { + base_inner_size *= p_out_shape[i]; + } + + WORD32 *ptmp_out = p_out; + for(int i = 0; i < num_inp; i++) + { + const WORD32 copy_size = pp_inps_shape[i][axis] * base_inner_size; + WORD32 *output_ptr = ptmp_out; + const WORD32* input_ptr = pp_inps[i]; + + if(((copy_size & 1) == 0) && (((concat_size * base_inner_size) & 1) == 0) + && (((unsigned)input_ptr & 1) == 0) && (((unsigned)output_ptr & 1) == 0)) + { + if(copy_size <= 8) + { + const ae_f32 *pae_inp = (const ae_f32 *)input_ptr; + for(int k = 0; k < outer_size; k++) + { + ae_f32 *pae_out = (ae_f32 *)output_ptr; +#pragma concurrent +#pragma no_simd + for(int ic = 0; ic < copy_size; ic++) + { + *pae_out++ = *pae_inp++; + } + output_ptr += concat_size * base_inner_size; + } + } + else + { + for(int k = 0; k < outer_size; k++) + { + const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; + ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; + ae_valign inp_a, out_a; + inp_a = AE_LA64_PP(pae_inp); + out_a = AE_ZALIGN64(); + for(int ic = 0; ic < (copy_size >> 1); ic++) + { + ae_int32x2 d0; + AE_LA32X2_IP(d0, inp_a, pae_inp); + AE_SA32X2_IP(d0, out_a, pae_out); + } + AE_SA64POS_FP(out_a, pae_out); + const ae_f32 *puae_inp = (const ae_f32 *)pae_inp; + ae_f32 *puae_out = (ae_f32 *)pae_out; +#pragma concurrent + for(int ic = 0; ic < (copy_size & 1); ic++) + { + puae_out[copy_size - 1] = puae_inp[copy_size - 1]; + } + input_ptr += copy_size; + output_ptr += concat_size * base_inner_size; + } + } + } + else + { + if(copy_size <= 6) + { + for(int k = 0; k < outer_size; k++) + { +#pragma concurrent +#pragma no_unroll + for(int ic = 0; ic < copy_size; ic++) + { + output_ptr[ic] = *input_ptr++; + } + output_ptr += concat_size * base_inner_size; + } + } + else + { + for(int k = 0; k < outer_size; k++) + { + const ae_int32x2 *pae_inp = (const ae_int32x2 *)input_ptr; + ae_int32x2 *pae_out = (ae_int32x2 *)output_ptr; + ae_valign inp_a, out_a; + inp_a = AE_LA64_PP(pae_inp); + out_a = AE_ZALIGN64(); + +#pragma concurrent + for(int ic = 0; ic < copy_size >> 1; ic++) + { + ae_int32x2 d0; + AE_LA32X2_IP(d0, inp_a, pae_inp); + AE_SA32X2_IP(d0, out_a, 
pae_out); + } + AE_SA64POS_FP(out_a, pae_out); + + for(int ic = 0; ic < (copy_size & 1); ic++) + { + output_ptr[copy_size - 1] = input_ptr[copy_size - 1]; + } + input_ptr += copy_size; + output_ptr += concat_size * base_inner_size; + } + } + } + ptmp_out += copy_size; + } + return 0; +} \ No newline at end of file diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c new file mode 100644 index 0000000000..6f95360ed9 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_atan2_f32.c @@ -0,0 +1,882 @@ +/* ------------------------------------------------------------------------ */ +/* Copyright (c) 2018 by Cadence Design Systems, Inc. ALL RIGHTS RESERVED. */ +/* These coded instructions, statements, and computer programs ("Cadence */ +/* Libraries") are the copyrighted works of Cadence Design Systems Inc. */ +/* Cadence IP is licensed for use with Cadence processor cores only and */ +/* must not be used for any other processors and platforms. Your use of the */ +/* Cadence Libraries is subject to the terms of the license agreement you */ +/* have entered into with Cadence Design Systems, or a sublicense granted */ +/* to you by a direct Cadence licensee. */ +/* ------------------------------------------------------------------------ */ +/* IntegrIT, Ltd. www.integrIT.com, info@integrIT.com */ +/* */ +/* DSP Library */ +/* */ +/* This library contains copyrighted materials, trade secrets and other */ +/* proprietary information of IntegrIT, Ltd. This software is licensed for */ +/* use with Cadence processor cores only and must not be used for any other */ +/* processors and platforms. The license to use these sources was given to */ +/* Cadence, Inc. under Terms and Condition of a Software License Agreement */ +/* between Cadence, Inc. and IntegrIT, Ltd. */ +/* ------------------------------------------------------------------------ */ +/* Copyright (C) 2015-2018 IntegrIT, Limited. */ +/* All Rights Reserved. */ +/* ------------------------------------------------------------------------ */ +#include + +#include "../include/NatureDSP_Signal_math.h" +#include "NatureDSP_types.h" +#include "xa_nn_common.h" + +/* Common helper macros. 
*/ +#include "xa_nnlib_common_fpu.h" + +#include "xa_nnlib_common.h" + +const union ufloat32uint32 xa_nnlib_plusInff ={0x7f800000}; +const union ufloat32uint32 xa_nnlib_qNaNf = { 0x7fc00000 }; +const union ufloat32uint32 pif ={0x40490fdb}; /* pi */ +const union ufloat32uint32 pi2f={0x3fc90fdb}; /* pi/2 */ + +const union ufloat32uint32 ALIGN(8) xa_nnlib_atanftbl1[8] = +{ + {0x3dbc14c0},/* 9.183645248413086e-002 */ + {0xbe30c39c},/*-1.726211905479431e-001 */ + {0x3b2791e4},/* 2.556913532316685e-003 */ + {0x3e4dac9d},/* 2.008537799119949e-001 */ + {0xb97d9a57},/*-2.418545627733693e-004 */ + {0xbeaaa7b5},/*-3.333107531070709e-001 */ + {0xb54f34c8},/*-7.719031600572635e-007 */ + {0x31cf3fa2} /* 6.031727117772334e-009 */ +}; + +const union ufloat32uint32 ALIGN(8) xa_nnlib_atanftbl2[8]= +{ + {0xbcccc037},/*-2.499399892985821e-002 */ + {0x3e217c35},/* 1.577003747224808e-001 */ + {0xbecf4163},/*-4.047957360744476e-001 */ + {0x3ef7b762},/* 4.838209748268127e-001 */ + {0xbdf35059},/*-1.188055947422981e-001 */ + {0xbe9b8b75},/*-3.037983477115631e-001 */ + {0xbb80ed5c},/*-3.934545442461968e-003 */ + {0x3956fc52} /* 2.050262701231986e-004 */ +}; + +#if !HAVE_VFPU && !HAVE_FPU +DISCARD_FUN(void, xa_nn_elm_atan2_f32,( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, int N )) +#elif HAVE_VFPU +#define sz_f32 (int)sizeof(FLOAT32) + +/*=========================================================================== + Vector matematics: + vec_atan2 full quadrant Arctangent +===========================================================================*/ + +/*------------------------------------------------------------------------- + Full-Quadrant Arc Tangent + The functions compute the arc tangent of the ratios y[N]/x[N] and store the + result to output vector z[N]. + Floating point functions output is in radians. Fixed point functions + scale its output by pi. + + NOTE: + 1. Scalar floating point function is compatible with standard ANSI C routines and set + errno and exception flags accordingly + 2. Scalar floating point function assigns EDOM to errno whenever y==0 and x==0. + + Accuracy: + 24 bit version: 768 (3.57e-7) + floating point: 2 ULP + + Special cases: + y | x | result | extra conditions + --------|-------|-----------|--------------------- + +/-0 | -0 | +/-pi | + +/-0 | +0 | +/-0 | + +/-0 | x | +/-pi | x<0 + +/-0 | x | +/-0 | x>0 + y | +/-0 | -pi/2 | y<0 + y | +/-0 | pi/2 | y>0 + +/-y | -inf | +/-pi | finite y>0 + +/-y | +inf | +/-0 | finite y>0 + +/-inf | x | +/-pi/2 | finite x + +/-inf | -inf | +/-3*pi/4 | + +/-inf | +inf | +/-pi/4 | + + Input: + y[N] vector of numerator values, Q31 or floating point + x[N] vector of denominator values, Q31 or floating point + N length of vectors + Output: + z[N] results, Q31 or floating point + +---------------------------------------------------------------------------*/ + +void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WORD32 N ) +{ + /* + const union ufloat32uint32* p; + int sx,sy,big; + sx=takesignf(x); + sy=takesignf(y); + x=fabs(x); + y=fabs(y); + if(x==0.f && y==0.f) + { + // The actual result depends on input signs. + x = 1.f; + y = 0.f; + } + + big=x>y; + if(big) + { + x=y/x; + } + else + { + // compare x==y is necessary to support (+/-Inf, +/-Inf) cases + x = (x == y) ? 1.0f : x / y; + } + p = (x<0.5f) ? 
atanftbl1 : atanftbl2; + // approximate atan(x)/x-1 + y = p[0].f; + y = x*y + p[1].f; + y = x*y + p[2].f; + y = x*y + p[3].f; + y = x*y + p[4].f; + y = x*y + p[5].f; + y = x*y + p[6].f; + y = x*y + p[7].f; + // convert result to true atan(x) + y = x*y + x; + + if (!big) y = pi2f.f - y; + if (sx) y = pif.f - y; + if (sy) y = -y; + return y; + */ + + const xtfloatx2 * X; + const xtfloatx2 * Y; + xtfloatx2 * restrict Z; + const xtfloatx2 * S_rd; + xtfloatx2 * restrict S_wr; + + ae_valign X_va, Y_va, Z_va; + + /* Current block index; overall number of blocks; number of values in the current block */ + int blkIx, blkNum, blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ/sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + FLOAT32 ALIGN(8) scr[blkSize]; + + int n; + + if ( N<=0 ) return; + + NASSERT_ALIGN8( scr ); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + blkNum = ( N + blkSize-1 )/blkSize; + + for ( blkIx=0; blkIxy0 ) p0 = y0/x0; + * // Special case of x==y is necessary to support (+/-Inf, +/-Inf) cases. + * else p0 = ( x0==y0 ? 1.f : x0/y0 ); + * + * scr[n] = p0; + * } + * } + */ + + { + /* Input values */ + xtfloatx2 x0, y0; + /* Numerator; denominator; reciprocal; quotient */ + xtfloatx2 num, den, rcp, quo; + /* Scaling factor; error term */ + xtfloatx2 scl, eps; + /* Is NaN; Inf/Inf; x/Inf; 0/0; x and y are subnormal */ + xtbool2 b_nan, b_num_inf, b_den_inf, b_eqz, b_subn; + + X = (xtfloatx2*)( (uintptr_t)x + blkIx*blkSize*sz_f32 ); + Y = (xtfloatx2*)( (uintptr_t)y + blkIx*blkSize*sz_f32 ); + S_wr = (xtfloatx2*)scr; + + X_va = XT_LASX2PP( X ); + Y_va = XT_LASX2PP( Y ); + + __Pragma( "loop_count min=1" ); + for ( n=0; n<(blkLen+1)/2; n++ ) + { + XT_LASX2IP( x0, X_va, X ); + XT_LASX2IP( y0, Y_va, Y ); + + /* Replicate NaNs in both x and y to ensure NaN propagation. */ + b_nan = XT_UN_SX2( x0, y0 ); + XT_MOVT_SX2( x0, xa_nnlib_qNaNf.f, b_nan ); + XT_MOVT_SX2( y0, xa_nnlib_qNaNf.f, b_nan ); + + x0 = XT_ABS_SX2( x0 ); + y0 = XT_ABS_SX2( y0 ); + + /* num <= den */ + num = XT_MIN_SX2( x0, y0 ); + den = XT_MAX_SX2( y0, x0 ); + + /* Scale up numerator and denominator if BOTH are subnormal. */ + b_subn = XT_OLT_SX2( num, FLT_MIN ); + scl = (xtfloatx2)8388608.f; XT_MOVF_SX2( scl, (xtfloatx2)1.0f, b_subn ); + num = XT_MUL_SX2( num, scl ); + den = XT_MUL_SX2( den, scl ); + + /* Classify numerator and denominator. */ + b_num_inf = XT_OEQ_SX2( num, xa_nnlib_plusInff.f ); /* Inf/Inf */ + b_den_inf = XT_OEQ_SX2( den, xa_nnlib_plusInff.f ); /* x/Inf */ + b_eqz = XT_OEQ_SX2( den, (xtfloatx2)(xtfloatx2)(0.0f) ); /* 0/0 */ + + /* Initial appromimation for 1/den. */ + rcp = XT_RECIP0_SX2( den ); + /* Newton-Raphson iteration for 1/den. */ + eps = (xtfloatx2)1.0f; + XT_MSUB_SX2( eps, rcp, den ); + XT_MADD_SX2( rcp, rcp, eps ); + /* Approximation for the quotient num/den. */ + quo = XT_MUL_SX2( num, rcp ); + /* Refine the quotient by a modified Newton-Raphson iteration. */ + eps = num; + XT_MSUB_SX2( eps, quo, den ); + XT_MADD_SX2( quo, rcp, eps ); + + /* Force conventional results for special cases. 
*/ + XT_MOVT_SX2( quo, (xtfloatx2)(0.0f), b_den_inf ); /* x/Inf -> 0 */ + XT_MOVT_SX2( quo, (xtfloatx2)1.0f, b_num_inf ); /* Inf/Inf -> 1 */ + XT_MOVT_SX2( quo, (xtfloatx2)(0.0f), b_eqz ); /* 0/0 -> 0 */ + + XT_SSX2IP( quo, S_wr, +2*sz_f32 ); + } + } + + __Pragma( "no_reorder" ); + + /* + * Part II, polynomial approximation and full quadrant restoration. + * Reference C code: + * + * { + * const union ufloat32uint32 * ptbl; + * float32_t x0, y0, z0, p0; + * int sx, sy; + * + * for ( n=0; n0 + y | +/-0 | -pi/2 | y<0 + y | +/-0 | pi/2 | y>0 + +/-y | -inf | +/-pi | finite y>0 + +/-y | +inf | +/-0 | finite y>0 + +/-inf | x | +/-pi/2 | finite x + +/-inf | -inf | +/-3*pi/4 | + +/-inf | +inf | +/-pi/4 | + +Input: + y[N] input data, Q15 or floating point + x[N] input data, Q15 or floating point + N length of vectors +Output: + z[N] result, Q15 or floating point + +Restrictions: +x, y, z should not overlap +---------------------------------------------------------------------------*/ + +// Taken from Fusion +void xa_nn_elm_atan2_f32( FLOAT32 * z, const FLOAT32 * y, const FLOAT32 * x, WORD32 N ) +{ + /* + * const union ufloat32uint32* p; + * int sx,sy,big; + * sx=takesignf(x); + * sy=takesignf(y); + * x=fabs(x); + * y=fabs(y); + * if(x==0.f && y==0.f) + * { + * // The actual result depends on input signs. + * x = 1.f; + * y = 0.f; + * } + * + * big=x>y; + * if(big) + * { + * x=y/x; + * } + * else + * { + * // compare x==y is necessary to support (+/-Inf, +/-Inf) cases + * x = (x == y) ? 1.0f : x / y; + * } + * p = (x<0.5f) ? atanftbl1 : atanftbl2; + * // approximate atan(x)/x-1 + * y = p[0].f; + * y = x*y + p[1].f; + * y = x*y + p[2].f; + * y = x*y + p[3].f; + * y = x*y + p[4].f; + * y = x*y + p[5].f; + * y = x*y + p[6].f; + * y = x*y + p[7].f; + * // convert result to true atan(x) + * y = x*y + x; + * + * if (!big) y = pi2f.f - y; + * if (sx) y = pif.f - y; + * if (sy) y = -y; + * return y; + */ + const xtfloat * restrict X; + const xtfloat * restrict Y; + int32_t * restrict Z; + const xtfloat * restrict S_rd; + xtfloat * restrict S_wr; + const xtfloat * restrict POLY_TBL1; + const xtfloat * restrict POLY_TBL2; + + /* Current block index; overall number of blocks; number of values in the current block */ + int blkIx, blkNum, blkLen; + /* Block size, blkLen <= blkSize */ + const int blkSize = MAX_ALLOCA_SZ / sz_f32; + /* Allocate a fixed-size scratch area on the stack. */ + float32_t ALIGN(8) scr[blkSize]; + + int n; + + if (N <= 0) return; + + NASSERT_ALIGN8(scr); + + /* + * Data are processed in blocks of scratch area size. Further, the algorithm + * implementation is splitted in order to feed the optimizing compiler with a + * few loops of managable size. + */ + + blkNum = (N + blkSize - 1) / blkSize; + POLY_TBL1 = (xtfloat*)xa_nnlib_atanftbl1; + POLY_TBL2 = (xtfloat*)xa_nnlib_atanftbl2; + for (blkIx = 0; blkIxy0 ) p0 = y0/x0; + * // Special case of x==y is necessary to support (+/-Inf, +/-Inf) cases. + * else p0 = ( x0==y0 ? 
1.f : x0/y0 ); + * + * scr[n] = p0; + * } + * } + */ + + { + /* Input values */ + xtfloat x0, y0, i0; + /* Numerator; denominator; reciprocal; quotient */ + xtfloat num, den, rcp, quo; + /* Auxiliary vars */ + xtfloat s, eps; + /* Is NaN; Inf/Inf; x/Inf; 0/0; x and y are subnormal */ + xtbool b_nan, b_num_inf, b_den_inf, b_eqz, b_subn; + const xtfloat * pT; + + X = (xtfloat*)((uintptr_t)x + blkIx*blkSize*sz_f32); + Y = (xtfloat*)((uintptr_t)y + blkIx*blkSize*sz_f32); + S_wr = (xtfloat*)scr; + + static const uint32_t TAB[4] = { 0x7fc00000, 0x00800000, + 0x4b000000, 0x7f800000 + }; + pT = (xtfloat *)TAB; + __Pragma("loop_count min=1"); + for (n = 0; n 0 or x/Inf -> 0*/ + XT_MOVT_S(quo, XT_CONST_S(1), b_num_inf); /* Inf/Inf -> 1 */ + + XT_SSIP(quo, S_wr, sz_f32); + } + } + __Pragma("no_reorder"); + + /* + * Part II, polynomial approximation and full quadrant restoration. + * Reference C code: + * + * { + * const union ufloat32uint32 * ptbl; + * float32_t x0, y0, z0, p0; + * int sx, sy; + * + * for ( n=0; n>1;i++) + { + XT_LSX2IP(x1, inp, 2*sizeof(FLOAT32)); + XT_LSX2IP(d_min, min, 2*sizeof(FLOAT32)); + XT_LSX2IP(d_max, max, 2*sizeof(FLOAT32)); + + y = XT_MAX_SX2(x1, d_min); + y = XT_MIN_SX2(y, d_max); + + XT_SSX2IP( y, out, 2*sizeof(FLOAT32)); + } + } + else + { + ae_valign inp_a, min_a, max_a, out_a; + + inp_a = XT_LASX2PP(inp); + min_a = XT_LASX2PP(min); + max_a = XT_LASX2PP(max); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp_a, inp); + XT_LASX2IP(d_min, min_a, min); + XT_LASX2IP(d_max, max_a, max); + + y = XT_MAX_SX2(x1, d_min); + y = XT_MIN_SX2(y, d_max); + + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + } + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a3, a; + XT_LSIP(a1, (xtfloat *)inp, 0); + XT_LSIP(a2, (xtfloat *)min, 0); + XT_LSIP(a3, (xtfloat *)max, 0); + a = XT_MAX_S(a1, a2); + a = XT_MIN_S(a, a3); + XT_SSI(a, (xtfloat *)out, 0); + } + return 0; +} + +static void internal_elm_clamp_broadcast_f32xf32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_min, + const FLOAT32 * __restrict__ p_max, + const FLOAT32 * __restrict__ p_inp, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_min; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_max; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + xtfloatx2 *__restrict__ input = (xtfloatx2 *)p_inp; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out, in0; + xtfloatx2 d_inp, x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + +/* Min pointer is pointing to actual max and max to min */ + if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0) && ((((unsigned)input)&7) == 0)) + { + for(i=0; i> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out, in0; + xtfloatx2 d_inp, x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + x1 = XT_LSI((xtfloat *)p_a, 0); + + if(((((unsigned)p_c)&7) == 0) && ((((unsigned)input)&7) == 0)) + { + for(i=0; i> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 d_inp, x1, x2, y; + xtfloat in0, a0, b0, c0; + unsigned char con1, con2; + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_min[i * in_lc]; + p_b = (xtfloatx2 *)p_max; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + input = (xtfloatx2 *)&p_inp[i * in_lc]; + 
if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0) && ((((unsigned)input)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + XT_LSX2IP(d_inp, input, 2 * sizeof(FLOAT32)); + y = XT_MAX_SX2(d_inp, x2); + y = XT_MIN_SX2(y, x1); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp, vmin, vmax, out_a = AE_ZALIGN64(); + vmin = XT_LASX2PP(p_a); + vmax = XT_LASX2PP(p_b); + vinp = XT_LASX2PP(input); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vmin, p_a); + XT_LASX2IP(x2, vmax, p_b); + XT_LASX2IP(d_inp, vinp, input); + y = XT_MAX_SX2(d_inp, x2); + y = XT_MIN_SX2(y, x1); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, 0); + XT_LSIP(b0, (xtfloat *)p_b, 0); + XT_LSIP(in0, (xtfloat *)input, 0); + c0 = XT_MAX_S(in0, b0); + c0 = XT_MIN_S(a0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_min[i * in_lc]; + p_b = (xtfloatx2 *)p_max; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + input = (xtfloatx2 *)&p_inp[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0) && ((((unsigned)input)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + XT_LSX2IP(d_inp, input, 2 * sizeof(FLOAT32)); + y = XT_MAX_SX2(d_inp, x1); + y = XT_MIN_SX2(y, x2); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp, vmin, vmax, out_a = AE_ZALIGN64(); + vmin = XT_LASX2PP(p_a); + vmax = XT_LASX2PP(p_b); + vinp = XT_LASX2PP(input); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vmin, p_a); + XT_LASX2IP(x2, vmax, p_b); + XT_LASX2IP(d_inp, vinp, input); + y = XT_MAX_SX2(d_inp, x1); + y = XT_MIN_SX2(y, x2); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, 0); + XT_LSIP(b0, (xtfloat *)p_b, 0); + XT_LSIP(in0, (xtfloat *)input, 0); + c0 = XT_MAX_S(in0, a0); + c0 = XT_MIN_S(c0, b0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_clamp_broadcast_both_2D_f32xf32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_min, + const FLOAT32 * __restrict__ p_max, + const FLOAT32 * __restrict__ p_inp, + WORD32 out_lc, + WORD32 in_lc) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_min; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_max; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + xtfloatx2 *__restrict__ input = (xtfloatx2 *)p_inp; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 d_inp, x1, x2, y; + xtfloat in0, a0, b0, c0; + unsigned char con1, con2; + + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)p_min; + p_b = (xtfloatx2 *)p_max; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + input = (xtfloatx2 *)&p_inp[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0) && ((((unsigned)input)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + XT_LSX2IP(d_inp, input, 2 * sizeof(FLOAT32)); + y = 
XT_MAX_SX2(d_inp, x1); + y = XT_MIN_SX2(y, x2); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp, vmin, vmax, out_a = AE_ZALIGN64(); + vmin = XT_LASX2PP(p_a); + vmax = XT_LASX2PP(p_b); + vinp = XT_LASX2PP(input); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vmin, p_a); + XT_LASX2IP(x2, vmax, p_b); + XT_LASX2IP(d_inp, vinp, input); + y = XT_MAX_SX2(d_inp, x1); + y = XT_MIN_SX2(y, x2); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, 0); + XT_LSIP(b0, (xtfloat *)p_b, 0); + XT_LSIP(in0, (xtfloat *)input, 0); + c0 = XT_MAX_S(in0, a0); + c0 = XT_MIN_S(c0, b0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } +} + +WORD32 xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32(FLOAT32 * __restrict__ p_out, + const WORD32 *const p_out_shape, + const FLOAT32 * __restrict__ p_inp, + const WORD32 *const p_inp_shape, + const FLOAT32 * __restrict__ p_min, + const WORD32 *const p_min_shape, + const FLOAT32 * __restrict__ p_max, + const WORD32 *const p_max_shape + ) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp, -1); + XA_NNLIB_ARG_CHK_PTR(p_min, -1); + XA_NNLIB_ARG_CHK_PTR(p_max, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_min_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_max_shape, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_min, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_max, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_min_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_max_shape, sizeof(WORD32), -1); + /* Check shapes */ + int i; + xtbool sign_flag; + for(i = 0; i < 4; i++) + { + if((p_min_shape[i] != p_max_shape[i]) && ((p_min_shape[i] != 1) && (p_max_shape[i] != 1))) + { + return -1; + } + } + const float *p_min_new = p_min; + for(i = 0; i < 4; i++) + { + for(int j=0; j < p_min_shape[i]; j++) + { + p_min_new++; + } + } + const FLOAT32 *p_max_new = p_max; + for(i = 0; i < 4; i++) + { + for(int j=0; j < p_max_shape[i]; j++) + { + p_max_new++; + } + } + const FLOAT32 *p_inp_new = p_inp; + for(i = 0; i < 4; i++) + { + for(int j=0; j < p_inp_shape[i]; j++) + { + p_inp_new++; + } + } + WORD32 min_strides[4], max_strides[4]; + min_strides[3] = 1; + max_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(min_strides[i + 1], max_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_min_shape[i + 1], p_max_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + min_strides[i] = AE_MOVAD32_H(d_str); + max_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int min_const = 1, max_const = 1; + for(i = 0; i < 4; i++) + { + if(p_min_shape[i] == 1) + { + min_strides[i] = 0; + need_broadcast = 1; + } + else + { + min_const &= 0; + } + if(p_max_shape[i] == 1) + { + max_strides[i] = 0; + need_broadcast = 1; + } + else + { + max_const &= 0; + } + } + + int itr0, itr1, itr2; + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict p_inp_temp = p_inp; + const FLOAT32 *__restrict__ p_min_tmp = p_min; + const FLOAT32 *__restrict__ p_max_tmp = p_max; + + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_clamp_broadcast_2D_f32xf32xf32_f32( + p_out, + p_min, + p_max, + p_inp, + 1, + 
p_out_shape[0] * min_strides[0], + sign_flag); + } + else if((min_strides[3] == 1)&& (max_strides[3] == 1)) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if((min_strides[2] == 0) && (max_strides[2] == 0)) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_min_tmp0 = p_min_tmp; + const FLOAT32 *__restrict__ p_max_tmp0 = p_max_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_clamp_broadcast_both_2D_f32xf32xf32_f32( + p_out_tmp, + p_min_tmp0, + p_max_tmp0, + p_inp_temp, + out_lc, + in_lc); + p_out_tmp += in_lc * out_lc; + p_min_tmp0 += min_strides[1]; + p_max_tmp0 += max_strides[1]; + p_inp_temp += in_lc * out_lc; + } + p_min_tmp += min_strides[0]; + p_max_tmp += max_strides[0]; + } + } + else + { + if(min_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_min_tmp; p_min_tmp = p_max_tmp; p_max_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = min_strides[0]; + tmp_strides[1] = min_strides[1]; + + min_strides[0] = max_strides[0]; + min_strides[1] = max_strides[1]; + + max_strides[0] = tmp_strides[0]; + max_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(max_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_min_tmp0 = p_min_tmp; + const FLOAT32 *__restrict__ p_max_tmp0 = p_max_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_clamp_broadcast_2D_f32xf32xf32_f32( + p_out_tmp, + p_min_tmp0, + p_max_tmp0, + p_inp_temp, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_min_tmp0 += min_strides[1]; + p_max_tmp0 += max_strides[1]; + p_inp_temp += in_lc * out_lc; + } + + p_min_tmp += min_strides[0]; + p_max_tmp += max_strides[0]; + } + } + } + else if(min_const == 1 || max_const == 1) + { + if((min_const == 1)&&(max_const == 1)) + { + internal_elm_clamp_broadcast_both_f32xf32xf32_f32( + p_out_tmp, + p_min_tmp, + p_max_tmp, + p_inp_temp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3]); + } + else + { + sign_flag = 0; + if(min_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_min_tmp; p_min_tmp = p_max_tmp; p_max_tmp = tmp; + } + internal_elm_clamp_broadcast_f32xf32xf32_f32( + p_out_tmp, + p_min_tmp, + p_max_tmp, + p_inp_temp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + } + else + { + sign_flag = 0; + if((min_strides[3] == 0) && (max_strides[3] == 0)) + { + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_min_tmp0 = p_min_tmp; + const FLOAT32 *__restrict__ p_max_tmp0 = p_max_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_min_tmp1 = p_min_tmp0; + const FLOAT32 *__restrict__ p_max_tmp1 = p_max_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_clamp_broadcast_both_f32xf32xf32_f32( + p_out_tmp, + p_min_tmp1, + p_max_tmp1, + p_inp_temp, + p_out_shape[3]); + } + p_out_tmp += p_out_shape[3]; + p_min_tmp1 += min_strides[2]; + p_max_tmp1 += max_strides[2]; + p_inp_temp += p_out_shape[3]; + } + p_min_tmp0 += min_strides[1]; + p_max_tmp0 += max_strides[1]; + } + p_min_tmp += min_strides[0]; + p_max_tmp += max_strides[0]; + } + } + else + { + if(min_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_min_tmp; p_min_tmp = p_max_tmp; p_max_tmp = tmp; + 
sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = min_strides[0]; + tmp_strides[1] = min_strides[1]; + tmp_strides[2] = min_strides[2]; + + min_strides[0] = max_strides[0]; + min_strides[1] = max_strides[1]; + min_strides[2] = max_strides[2]; + + max_strides[0] = tmp_strides[0]; + max_strides[1] = tmp_strides[1]; + max_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_min_tmp0 = p_min_tmp; + const FLOAT32 *__restrict__ p_max_tmp0 = p_max_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_min_tmp1 = p_min_tmp0; + const FLOAT32 *__restrict__ p_max_tmp1 = p_max_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_clamp_broadcast_f32xf32xf32_f32( + p_out_tmp, + p_min_tmp1, + p_max_tmp1, + p_inp_temp, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_min_tmp1 += min_strides[2]; + p_max_tmp1 += max_strides[2]; + p_inp_temp += p_out_shape[3]; + } + p_min_tmp0 += min_strides[1]; + p_max_tmp0 += max_strides[1]; + } + p_min_tmp += min_strides[0]; + p_max_tmp += max_strides[0]; + } + } + } + return 0; +} +#endif diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c new file mode 100644 index 0000000000..3b40752211 --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_remainder_broadcast_f32.c @@ -0,0 +1,525 @@ +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nn_common.h" +#include "xa_nnlib_err_chk.h" +//#include "xa_nn_basic_state.h" +#include "xa_nnlib_kernels_api.h" + + +#if !HAVE_VFPU +DISCARD_FUN_FOR_NONVOID_RETURN( + WORD32, xa_nn_elm_remainder_f32xf32_f32, + ( + FLOAT32 *p_out, + const FLOAT32 *p_inp1, + const FLOAT32 *p_inp2, + WORD32 num_elm + ) + ) +#else +WORD32 xa_nn_elm_remainder_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp1, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp2, -1); + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp1, sizeof(FLOAT32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp2, sizeof(FLOAT32), -1); + /* Basic Parameter checks */ + XA_NNLIB_ARG_CHK_COND((num_elm <= 0), -1); + + int i; + xtfloatx2 *inp1 = (xtfloatx2 *)p_inp1; + xtfloatx2 *inp2 = (xtfloatx2 *)p_inp2; + xtfloatx2 *out = (xtfloatx2 *)p_out; + xtfloatx2 x1, x2, y; + ae_valign inp1_a, inp2_a, out_a; + + inp1_a = XT_LASX2PP(inp1); + inp2_a = XT_LASX2PP(inp2); + out_a = AE_ZALIGN64(); + /* Each iteration of loop is independent so safe to use concurrent pragma */ +#pragma concurrent + for(i=0;i < num_elm>>1;i++) + { + XT_LASX2IP(x1, inp1_a, inp1); + XT_LASX2IP(x2, inp2_a, inp2); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, out); + } + XT_SASX2POSFP(out_a, out); + + // Remainder Loop + if (num_elm & 1) + { + xtfloat a1, a2, a; + XT_LSIP(a1, (xtfloat *)inp1, 0); + XT_LSIP(a2, (xtfloat *)inp2, 0); + a = XT_DIV_S(a1, a2); + a = FIFLOOR_S(a); + a = XT_MUL_S(a, a2); + a = XT_SUB_S(a1, a); + XT_SSI(a, (xtfloat *)out, 0); + } + + return 0; +} +#endif + +#if HAVE_VFPU +static void internal_elm_remainder_broadcast_2D_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * 
__restrict__ p_inp2, + WORD32 out_lc, + WORD32 in_lc, + xtbool sign_flag) +{ + int i, j; + + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + int num_simd2_ops; + int num_scalar_ops; + + if(out_lc) + { + num_simd2_ops = in_lc >> 1; + num_scalar_ops = in_lc & 1; + } + else + { + num_simd2_ops = (in_lc >> 2) << 1; + num_scalar_ops = in_lc & 3; + } + + xtfloatx2 x1, x2, y; + xtfloat a0, b0, c0; + + /* For computing inp2 - inp1 */ + if(sign_flag){ + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x2, x1); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x2, x1); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x1); + y = XT_SUB_SX2(x2, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(b0, a0); + c0 = FIFLOOR_S(c0); + c0 = XT_MUL_S(c0, a0); + c0 = XT_SUB_S(b0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } + /* For computing inp1 - inp2 */ + else + { + for(i = 0; i < out_lc; i++) + { + p_a = (xtfloatx2 *)&p_inp1[i * in_lc]; + p_b = (xtfloatx2 *)p_inp2; + p_c = (xtfloatx2 *)&p_out[i * in_lc]; + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_b)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(j = 0; j < num_simd2_ops; j++) + { + XT_LSX2IP(x1, p_a, 2 * sizeof(FLOAT32)); + XT_LSX2IP(x2, p_b, 2 * sizeof(FLOAT32)); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SSX2IP(y, p_c, 2 * sizeof(FLOAT32)); + } + } + else + { + ae_valign vinp1, vinp2, out_a = AE_ZALIGN64(); + vinp1 = XT_LASX2PP(p_a); + vinp2 = XT_LASX2PP(p_b); + + for(j = 0; j < num_simd2_ops; j++) + { + XT_LASX2IP(x1, vinp1, p_a); + XT_LASX2IP(x2, vinp2, p_b); + y = XT_DIV_SX2(x1, x2); + y = FIFLOOR_SX2(y); + y = XT_MUL_SX2(y, x2); + y = XT_SUB_SX2(x1, y); + XT_SASX2IP(y, out_a, p_c); + } + XT_SASX2POSFP(out_a, (xtfloatx2 *)p_c); + } + if(num_scalar_ops !=0) + { + XT_LSIP(a0, (xtfloat *)p_a, sizeof(FLOAT32)); + XT_LSIP(b0, (xtfloat *)p_b, sizeof(FLOAT32)); + c0 = XT_DIV_S(a0, b0); + c0 = FIFLOOR_S(c0); + c0 = XT_MUL_S(c0, b0); + c0 = XT_SUB_S(a0, c0); + XT_SSI(c0, (xtfloat *)p_c, 0); + } + } + } +} + +static void internal_elm_remainder_broadcast_f32xf32_f32(FLOAT32 * __restrict__ p_out, + const FLOAT32 * __restrict__ p_inp1, + const FLOAT32 * __restrict__ p_inp2, + WORD32 num_elm, + xtbool sign_flag) +{ + int i; + xtfloatx2 * __restrict__ p_a = (xtfloatx2 *)p_inp1; + xtfloatx2 * __restrict__ p_b = (xtfloatx2 *)p_inp2; + xtfloatx2 *__restrict__ p_c = (xtfloatx2 *)p_out; + + const int num_simd2_ops = num_elm >> 1; + const int num_scalar_ops = num_elm & 1; + + xtfloat a0_7, out; + xtfloatx2 x1, x2, y; + x2 = XT_LSI((xtfloat *)p_b, 0); + + /* For computing inp2 - inp1 */ + 
if(sign_flag){ + if(((((unsigned)p_a)&7) == 0) && ((((unsigned)p_c)&7) == 0)) + { + for(i=0; i p_inp2_shape[i] ? p_inp1_shape[i] : p_inp2_shape[i]))) + { + return -1; + } + } + + WORD32 inp1_strides[4], inp2_strides[4]; + inp1_strides[3] = 1; + inp2_strides[3] = 1; + for(i = 2; i >= 0; i--) + { + ae_int32x2 d_str, d_shape; + d_str = AE_MOVDA32X2(inp1_strides[i + 1], inp2_strides[i + 1]); + d_shape = AE_MOVDA32X2(p_inp1_shape[i + 1], p_inp2_shape[i + 1]); + d_str = AE_MULP32X2(d_str, d_shape); + inp1_strides[i] = AE_MOVAD32_H(d_str); + inp2_strides[i] = AE_MOVAD32_L(d_str); + } + + int need_broadcast = 0; + int inp1_const = 1, inp2_const = 1; + for(i = 0; i < 4; i++) + { + if(p_inp1_shape[i] != p_inp2_shape[i]) + { + if(p_inp1_shape[i] == 1) + inp1_strides[i] = 0; + else + inp2_strides[i] = 0; + + need_broadcast = 1; + } + if(p_inp1_shape[i] != 1) + inp1_const &= 0; + if(p_inp2_shape[i] != 1) + inp2_const &= 0; + } + int itr0, itr1, itr2; + + FLOAT32 *p_out_tmp = p_out; + const FLOAT32 *__restrict__ p_inp1_tmp = p_inp1; + const FLOAT32 *__restrict__ p_inp2_tmp = p_inp2; + if(need_broadcast == 0) + { + sign_flag = 0; + internal_elm_remainder_broadcast_2D_f32xf32_f32( + p_out, + p_inp1, + p_inp2, + 1, + p_out_shape[0] * inp1_strides[0], + sign_flag); + } + else if(inp1_strides[3] == inp2_strides[3]) + { + WORD32 in_lc, out_lc; + sign_flag = 0; + in_lc = p_out_shape[2] * p_out_shape[3]; + out_lc = 1; + if(inp1_strides[2] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[2]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + else if(inp2_strides[2] == 0) + { + in_lc = p_out_shape[3]; + out_lc = p_out_shape[2]; + } + + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < p_out_shape[1]; itr1++) + { + internal_elm_remainder_broadcast_2D_f32xf32_f32( + p_out_tmp, + p_inp1_tmp0, + p_inp2_tmp0, + out_lc, + in_lc, + sign_flag); + p_out_tmp += in_lc * out_lc; + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + else if(inp1_const == 1 || inp2_const == 1) + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + sign_flag = 1; + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + } + internal_elm_remainder_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp, + p_inp2_tmp, + p_out_shape[0] * p_out_shape[1] * p_out_shape[2] * p_out_shape[3], + sign_flag); + } + else + { + sign_flag = 0; + if(inp1_strides[3] == 0) + { + const FLOAT32 *tmp; + tmp = p_inp1_tmp; p_inp1_tmp = p_inp2_tmp; p_inp2_tmp = tmp; + sign_flag = 1; + int tmp_strides[3]; + tmp_strides[0] = inp1_strides[0]; + tmp_strides[1] = inp1_strides[1]; + tmp_strides[2] = inp1_strides[2]; + + inp1_strides[0] = inp2_strides[0]; + inp1_strides[1] = inp2_strides[1]; + inp1_strides[2] = inp2_strides[2]; + + inp2_strides[0] = tmp_strides[0]; + inp2_strides[1] = tmp_strides[1]; + inp2_strides[2] = tmp_strides[2]; + } + for(itr0 = 0; itr0 < p_out_shape[0]; itr0++) + { + const FLOAT32 *__restrict__ p_inp1_tmp0 = p_inp1_tmp; + const FLOAT32 *__restrict__ p_inp2_tmp0 = p_inp2_tmp; + for(itr1 = 0; itr1 < 
p_out_shape[1]; itr1++) + { + const FLOAT32 *__restrict__ p_inp1_tmp1 = p_inp1_tmp0; + const FLOAT32 *__restrict__ p_inp2_tmp1 = p_inp2_tmp0; + for(itr2 = 0; itr2 < p_out_shape[2]; itr2++) + { + { + internal_elm_remainder_broadcast_f32xf32_f32( + p_out_tmp, + p_inp1_tmp1, + p_inp2_tmp1, + p_out_shape[3], + sign_flag); + } + p_out_tmp += p_out_shape[3]; + p_inp1_tmp1 += inp1_strides[2]; + p_inp2_tmp1 += inp2_strides[2]; + } + p_inp1_tmp0 += inp1_strides[1]; + p_inp2_tmp0 += inp2_strides[1]; + } + p_inp1_tmp += inp1_strides[0]; + p_inp2_tmp += inp2_strides[0]; + } + } + return 0; +} +#endif + diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c new file mode 100644 index 0000000000..e7b80e3a1d --- /dev/null +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_transpose_32.c @@ -0,0 +1,260 @@ +#include "xa_nnlib_common.h" +#include "stdio.h" +/* + * Currently only supports upto 5D input tensors. + * 1/2/3/4 D input tensors will be scaled up to 5D. + * For example, 2x3 -> 1x1x1x2x3. + */ + +WORD32 xa_nn_transpose_32_32(WORD32 * __restrict__ p_out + ,const WORD32 *const p_out_shape + ,const WORD32 * __restrict__ p_inp + ,const WORD32 *const p_inp_shape + ,const WORD32 * __restrict__ p_permute_vec + ,WORD32 num_out_dims + ,WORD32 num_inp_dims) +{ + /* NULL pointer checks */ + XA_NNLIB_ARG_CHK_PTR(p_out, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp, -1); + XA_NNLIB_ARG_CHK_PTR(p_permute_vec, -1); + XA_NNLIB_ARG_CHK_PTR(p_out_shape, -1); + XA_NNLIB_ARG_CHK_PTR(p_inp_shape, -1); + + /* Invalid input checks */ + XA_NNLIB_ARG_CHK_COND(((num_inp_dims <= 0) || (num_inp_dims > 5)), -1); + XA_NNLIB_ARG_CHK_COND((num_out_dims != num_inp_dims), -1); + + int itr = 0; + for(itr=0; itr < num_inp_dims; itr++) + { + XA_NNLIB_ARG_CHK_COND((p_inp_shape[itr] <= 0), -1); + } + for(itr=0; itr < num_out_dims; itr++) + { + XA_NNLIB_ARG_CHK_COND((p_out_shape[itr] <= 0), -1); + } + + + /* Output shape provided must be correct based on input + * shape and permute values */ + for(itr=0; itr < num_out_dims; itr++) + { + int output_dim = p_out_shape[itr]; + int expected_dim = p_inp_shape[p_permute_vec[itr]]; + XA_NNLIB_ARG_CHK_COND((output_dim != expected_dim), -1); + } + + /* Pointer alignment checks */ + XA_NNLIB_ARG_CHK_ALIGN(p_out, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_permute_vec, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_out_shape, sizeof(WORD32), -1); + XA_NNLIB_ARG_CHK_ALIGN(p_inp_shape, sizeof(WORD32), -1); + + /* Shift all dim with 1 in the outer part */ + int eff_output_shape[5]; + int eff_permute_vec[5]; + + for(int i = 0; i < num_out_dims; i++) + { + eff_output_shape[i] = p_out_shape[i]; + eff_permute_vec[i] = p_permute_vec[i]; + } + + int one_i=num_out_dims-1, non_one_i=num_out_dims-1; + while(one_i > 0 && non_one_i >=0){ + while(one_i > 0 && eff_output_shape[one_i]!=1){ + one_i--; + } + non_one_i = one_i; + while(non_one_i >= 0 && eff_output_shape[non_one_i]==1) + { + non_one_i--; + } + if(one_i > 0 && non_one_i >=0){ + int temp; + /*swap output_shape*/ + { + temp = eff_output_shape[one_i]; + eff_output_shape[one_i] = eff_output_shape[non_one_i]; + eff_output_shape[non_one_i] = temp; + } + /*swap permute_vec*/ + { + temp = eff_permute_vec[one_i]; + eff_permute_vec[one_i] = eff_permute_vec[non_one_i]; + eff_permute_vec[non_one_i] = temp; + } + + } + } + + /* Promoting lesser dim tensors to 5D tensors. 
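+   * (e.g. a 2x3 input with permute_vec {1, 0} is handled as a 1x1x1x2x3 input
+   * with permute_vec {0, 1, 2, 4, 3}.)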
+ * Also updating the permute_vec and shapes as needed for optimization */ + int p_5D_inp_shape[5] = {1, 1, 1, 1, 1}; + int p_5D_out_shape[5] = {1, 1, 1, 1, 1}; + int p_5D_permute_vec[5] = {0, 1, 2, 3, 4}; + + /* Check if any inner inp dimension is same in the output */ + int last_dim_same = 1, last_n_same_dim = 0; + itr = num_inp_dims - 1; + while(itr >= 0) + { + last_n_same_dim = (last_dim_same && (eff_permute_vec[itr] == itr)) ? (last_n_same_dim + 1) : last_n_same_dim; + last_dim_same = (eff_permute_vec[itr] == itr) ? last_dim_same & 1 : last_dim_same & 0; + itr--; + } + + int dims_added = 5 - num_inp_dims; + itr = num_inp_dims - 1; + int same_count = last_n_same_dim; + int count = 4; + while(itr >= 0) + { + p_5D_inp_shape[count] = (same_count > 0) ? p_5D_inp_shape[count]*p_inp_shape[itr] : p_inp_shape[itr]; + p_5D_out_shape[count] = (same_count > 0) ? p_5D_out_shape[count]*eff_output_shape[itr] : eff_output_shape[itr]; + same_count--; + itr--; + count = (same_count > 0) ? count : count - 1; + } + + itr = num_inp_dims - 1; + same_count = (last_n_same_dim) ? num_inp_dims - (last_n_same_dim - 1) : 0; + count = 4; + while(itr >= 0) + { + p_5D_permute_vec[count] = (same_count > 0) ? eff_permute_vec[itr-(last_n_same_dim - 1)] + dims_added + last_n_same_dim - 1 : eff_permute_vec[itr] + dims_added; + same_count--; + itr--; + count--; + } + + int out_dim0, out_dim1, out_dim2, out_dim3, out_dim4; + int inp_dim1, inp_dim2, inp_dim3, inp_dim4; + int inp_stride[5]; + + out_dim0 = p_5D_out_shape[0]; + out_dim1 = p_5D_out_shape[1]; + out_dim2 = p_5D_out_shape[2]; + out_dim3 = p_5D_out_shape[3]; + out_dim4 = p_5D_out_shape[4]; + + inp_dim1 = p_5D_inp_shape[1]; + inp_dim2 = p_5D_inp_shape[2]; + inp_dim3 = p_5D_inp_shape[3]; + inp_dim4 = p_5D_inp_shape[4]; + + inp_stride[0] = inp_dim1*inp_dim2*inp_dim3*inp_dim4; + inp_stride[1] = inp_dim2*inp_dim3*inp_dim4; + inp_stride[2] = inp_dim3*inp_dim4; + inp_stride[3] = inp_dim4; + inp_stride[4] = 1; + + if(last_n_same_dim) + { + int itr0, itr1, itr2, itr3, itr4; + WORD32 *p_inp0 = (WORD32 *)p_inp; + for(itr0 = 0; itr0 < out_dim0; itr0++) + { + WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); +#pragma loop_count min=1 + for(itr1 = 0; itr1 < out_dim1; itr1++) + { + WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); +#pragma loop_count min=1 + for(itr2 = 0; itr2 < out_dim2; itr2++) + { + WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); +#pragma loop_count min=1 + for(itr3 = 0; itr3 < out_dim3; itr3++, p_out+=out_dim4) + { + WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); + if((((unsigned)p_inp4 & 1) == 0) && (((unsigned)p_out & 1) == 0)) + { + ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); + ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); + ae_int32x2 d0; + for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) + { + AE_L32X2_IP(d0, pae_i, 2 * sizeof(WORD32)); + AE_S32X2_IP(d0, pae_o, 2 * sizeof(WORD32)); + } + ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); + ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); +#pragma loop_count max=3 + for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) + { + puae_o[itr4] = puae_i[itr4]; + } + } + else + { + ae_int32x2 *__restrict__ pae_i = (ae_int32x2 *)(p_inp4); + ae_int32x2 *__restrict__ pae_o = (ae_int32x2 *)(p_out); + ae_valign a_inp = AE_LA64_PP(pae_i); + ae_valign a_out = AE_ZALIGN64(); + ae_int32x2 d0; + for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) + { + AE_LA32X2_IP(d0, a_inp, pae_i); + AE_SA32X2_IP(d0, a_out, pae_o); + } + AE_SA64POS_FP(a_out, pae_o); 
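+            /* The pending unaligned stores were flushed above; the loop below
+               copies the single leftover element when out_dim4 is odd. */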
+ ae_int32 *__restrict__ puae_i = (ae_int32 *)(pae_i); + ae_int32 *__restrict__ puae_o = (ae_int32 *)(pae_o); +#pragma loop_count max=3 + for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) + { + puae_o[itr4] = puae_i[itr4]; + } + } + } + } + } + } + } + else + { + int itr0, itr1, itr2, itr3, itr4; + WORD32 *p_inp0 = (WORD32 *)p_inp; + for(itr0 = 0; itr0 < out_dim0; itr0++) + { + WORD32 *p_inp1 = p_inp0+(itr0*inp_stride[p_5D_permute_vec[0]]); + for(itr1 = 0; itr1 < out_dim1; itr1++) + { + WORD32 *p_inp2 = p_inp1+(itr1*inp_stride[p_5D_permute_vec[1]]); + for(itr2 = 0; itr2 < out_dim2; itr2++) + { + WORD32 *p_inp3 = p_inp2+(itr2*inp_stride[p_5D_permute_vec[2]]); + for(itr3 = 0; itr3 < out_dim3; itr3++) + { + WORD32 *p_inp4 = p_inp3+(itr3*inp_stride[p_5D_permute_vec[3]]); + + ae_valign a_out = AE_ZALIGN64(); + for(itr4 = 0; itr4 < (out_dim4 >> 1); itr4++) + { + ae_int32x2 d0, d1; + ae_int32x2 tmp0; + + AE_L32_XP(d0, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2); + AE_L32_XP(d1, (ae_int32 *)p_inp4, inp_stride[p_5D_permute_vec[4]] << 2); + + tmp0 = AE_SEL32_HH(d0, d1); + + AE_SA32X2_IP(tmp0, a_out, (ae_int32x2 *)p_out); + } + AE_SA64POS_FP(a_out, p_out); +#pragma loop_count max=3 + for(itr4 = 0; itr4 < (out_dim4 & 1); itr4++) + { + *p_out++ = *p_inp4; + } + } + } + } + } + } + + return 0; +} \ No newline at end of file diff --git a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 93c150c0b9..514a82c0ff 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -32,6 +32,8 @@ static uint8_t method_allocator_pool[4 * 1024U * 1024U]; // 4 MB +static uint8_t temp_allocator_pool[1024U * 1024U]; + DEFINE_string( model_path, "model.pte", @@ -120,6 +122,10 @@ int main(int argc, char** argv) { MemoryAllocator method_allocator{ MemoryAllocator(sizeof(method_allocator_pool), method_allocator_pool)}; + // Temporary memory required by kernels + MemoryAllocator temp_allocator{ + MemoryAllocator(sizeof(temp_allocator_pool), temp_allocator_pool)}; + // The memory-planned buffers will back the mutable tensors used by the // method. The sizes of these buffers were determined ahead of time during the // memory-planning pasees. @@ -144,7 +150,8 @@ int main(int argc, char** argv) { // Assemble all of the allocators into the MemoryManager that the Executor // will use. - MemoryManager memory_manager(&method_allocator, &planned_memory); + MemoryManager memory_manager( + &method_allocator, &planned_memory, &temp_allocator); // // Load the method from the program, using the provided allocators. Running @@ -172,6 +179,7 @@ int main(int argc, char** argv) { // Run the model. 
Error status = method->execute(); + ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, From 911021f057d8b203e4497091387b8f0eb9518be2 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Sun, 1 Dec 2024 23:08:39 -0800 Subject: [PATCH 08/18] Cleaning cmakelist to avoid duplications --- backends/cadence/hifi/operators/CMakeLists.txt | 3 --- 1 file changed, 3 deletions(-) diff --git a/backends/cadence/hifi/operators/CMakeLists.txt b/backends/cadence/hifi/operators/CMakeLists.txt index ab5a04897e..a5a8263bd7 100644 --- a/backends/cadence/hifi/operators/CMakeLists.txt +++ b/backends/cadence/hifi/operators/CMakeLists.txt @@ -43,16 +43,13 @@ set(_aten_ops__srcs "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_clone.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_embedding.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gt.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_full.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_gelu.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_hardtanh.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_max_pool2d_with_indices.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_permute_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_slice_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_split_with_sizes_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_to_copy.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_view_copy.cpp" - "${EXECUTORCH_ROOT}/kernels/portable/cpu/op_where.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/pattern/unary_ufunc_realhbbf16_to_floathbf16.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/activation_ops_util.cpp" "${EXECUTORCH_ROOT}/kernels/portable/cpu/util/broadcast_util.cpp" From 18cf518bd85ae5625143eb44755c2ff4f8b5d7fa Mon Sep 17 00:00:00 2001 From: dijopaul Date: Tue, 3 Dec 2024 01:16:26 -0800 Subject: [PATCH 09/18] Fixing lint issues and removing free statements --- backends/cadence/hifi/operators/op_atan2.cpp | 4 - backends/cadence/hifi/operators/op_clamp.cpp | 1 - backends/cadence/hifi/operators/op_pow.cpp | 4 - .../cadence/hifi/operators/op_remainder.cpp | 158 +++++++++--------- backends/cadence/hifi/operators/op_where.cpp | 2 +- .../hifi/operators/quantized_linear_out.cpp | 31 ++-- 6 files changed, 92 insertions(+), 108 deletions(-) diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp index db2fc23be1..234432ae5d 100644 --- a/backends/cadence/hifi/operators/op_atan2.cpp +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -103,8 +103,6 @@ Tensor& atan2_out( xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm); - free(ptr1); - free(ptr2); } else if (a_is_broadcasted && (!b_is_broadcasted)) { FLOAT32* __restrict__ ptr1 = (FLOAT32* __restrict__)kernels::allocate_temp_memory( @@ -134,7 +132,6 @@ Tensor& atan2_out( xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm); - free(ptr1); } else if (b_is_broadcasted && (!a_is_broadcasted)) { WORD32* __restrict__ ptr1 = (WORD32* __restrict__)kernels::allocate_temp_memory( @@ -161,7 +158,6 @@ Tensor& atan2_out( xa_nn_elm_atan2_f32(p_out, p_inp1, p_inp2, num_elm); - free(ptr1); } else { FLOAT32* __restrict__ p_out = (FLOAT32* __restrict__)out.mutable_data_ptr(); diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp index 290c4d087d..8ed57a8e16 100644 --- a/backends/cadence/hifi/operators/op_clamp.cpp +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -379,7 +379,6 @@ Tensor& clamp_tensor_out( ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); - 
free(p_scratch); } else { WORD32 ret_val = xa_nn_elm_clamp_broadcast_4D_f32Xf32xf32_f32( out_data, diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index 74c24afbc0..3c42698f21 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -153,8 +153,6 @@ Tensor& pow_Tensor_Tensor_out( xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); - free(ptr1); - free(ptr2); } else if (a_is_broadcasted && (!b_is_broadcasted)) { FLOAT32* __restrict__ ptr1 = (FLOAT32* __restrict__)kernels::allocate_temp_memory( @@ -182,7 +180,6 @@ Tensor& pow_Tensor_Tensor_out( xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); - free(ptr1); } else if (b_is_broadcasted && (!a_is_broadcasted)) { WORD32* __restrict__ ptr1 = (WORD32* __restrict__)kernels::allocate_temp_memory( @@ -209,7 +206,6 @@ Tensor& pow_Tensor_Tensor_out( xa_nn_elm_pow_f32(p_out, p_inp1, p_inp2, num_elm); - free(ptr1); } else { FLOAT32* __restrict__ p_out = (FLOAT32* __restrict__)out.mutable_data_ptr(); diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp index 7fba5a5385..69c855f590 100644 --- a/backends/cadence/hifi/operators/op_remainder.cpp +++ b/backends/cadence/hifi/operators/op_remainder.cpp @@ -25,18 +25,18 @@ using exec_aten::Scalar; using exec_aten::ScalarType; using exec_aten::Tensor; using executorch::aten::RuntimeContext; +using executorch::runtime::can_cast; using executorch::runtime::canCast; +using executorch::runtime::CppTypeToScalarType; using executorch::runtime::promoteTypes; using torch::executor::apply_binary_elementwise_fn; using torch::executor::apply_unary_map_fn; using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; using torch::executor::native::utils::extract_scalar; using torch::executor::native::utils::get_scalar_dtype; using torch::executor::native::utils::promote_type_with_scalar; using torch::executor::native::utils::remainder_override; -using torch::executor::resize_to_broadcast_target_size; -using executorch::runtime::can_cast; -using executorch::runtime::CppTypeToScalarType; namespace { template < @@ -103,93 +103,87 @@ Tensor& remainder_Tensor_out( bool optimized = true; - if((a.scalar_type() != ScalarType::Float)||(b.scalar_type() != ScalarType::Float)) + if ((a.scalar_type() != ScalarType::Float) || + (b.scalar_type() != ScalarType::Float)) optimized = false; if ((broadcast == true) && (max_dim > kNnlibMaxDim)) optimized = false; - if(optimized) - { - FLOAT32 * __restrict__ p_out = (FLOAT32 * __restrict__ )out.mutable_data_ptr(); - const FLOAT32 * __restrict__ p_inp1 = (const FLOAT32 * __restrict__)a.const_data_ptr(); - const FLOAT32 * __restrict__ p_inp2 = (const FLOAT32 * __restrict__)b.const_data_ptr(); - - if(broadcast) - { - WORD32 p_out_shape[kNnlibMaxDim]; - WORD32 p_inp1_shape[kNnlibMaxDim]; - WORD32 p_inp2_shape[kNnlibMaxDim]; - - for(int i = 0; i < kNnlibMaxDim; i++) - { - p_inp1_shape[i] = 1; - p_inp2_shape[i] = 1; - p_out_shape[i] = 1; - } - - int off_o = kNnlibMaxDim - out.dim(); - int off_a = kNnlibMaxDim - a.dim(); - int off_b = kNnlibMaxDim - b.dim(); - - for(int i = 0; i < out.dim(); i++) - p_out_shape[i+off_o] = out.size(i); - for(int i = 0; i < a.dim(); i++) - p_inp1_shape[i+off_a] = a.size(i); - for(int i = 0; i < b.dim(); i++) - p_inp2_shape[i+off_b] = b.size(i); - - WORD32 ret_val = xa_nn_elm_remainder_broadcast_4D_f32xf32_f32(p_out, - p_out_shape, - p_inp1, - p_inp1_shape, - p_inp2, - p_inp2_shape); - - 
ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + if (optimized) { + FLOAT32* __restrict__ p_out = + (FLOAT32* __restrict__)out.mutable_data_ptr(); + const FLOAT32* __restrict__ p_inp1 = + (const FLOAT32* __restrict__)a.const_data_ptr(); + const FLOAT32* __restrict__ p_inp2 = + (const FLOAT32* __restrict__)b.const_data_ptr(); + + if (broadcast) { + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_inp1_shape[kNnlibMaxDim]; + WORD32 p_inp2_shape[kNnlibMaxDim]; + + for (int i = 0; i < kNnlibMaxDim; i++) { + p_inp1_shape[i] = 1; + p_inp2_shape[i] = 1; + p_out_shape[i] = 1; } - else{ - WORD32 ret_val = xa_nn_elm_remainder_f32xf32_f32(p_out, - p_inp1, - p_inp2, - out.numel()); + + int off_o = kNnlibMaxDim - out.dim(); + int off_a = kNnlibMaxDim - a.dim(); + int off_b = kNnlibMaxDim - b.dim(); + + for (int i = 0; i < out.dim(); i++) + p_out_shape[i + off_o] = out.size(i); + for (int i = 0; i < a.dim(); i++) + p_inp1_shape[i + off_a] = a.size(i); + for (int i = 0; i < b.dim(); i++) + p_inp2_shape[i + off_b] = b.size(i); + + WORD32 ret_val = xa_nn_elm_remainder_broadcast_4D_f32xf32_f32( + p_out, p_out_shape, p_inp1, p_inp1_shape, p_inp2, p_inp2_shape); + + ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); + } else { + WORD32 ret_val = + xa_nn_elm_remainder_f32xf32_f32(p_out, p_inp1, p_inp2, out.numel()); ET_KERNEL_CHECK(ctx, ret_val == 0, Internal, out); } - return out; + return out; } - // Determine output size and resize for dynamic shapes - ET_KERNEL_CHECK( - ctx, - resize_to_broadcast_target_size(a, b, out) == Error::Ok, - InvalidArgument, - out); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType common_type = promoteTypes(a_type, b_type); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - - ET_SWITCH_REAL_TYPES_AND( - Bool, a_type, ctx, "remainder.Tensor_out", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND( - Bool, b_type, ctx, "remainder.Tensor_out", CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REAL_TYPES( - out_type, ctx, "remainder.Tensor_out", CTYPE_OUT, [&]() { - RemainderInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, out); - }); - }); - }); + // Determine output size and resize for dynamic shapes + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + ScalarType common_type = promoteTypes(a_type, b_type); + ScalarType out_type = out.scalar_type(); + + ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + + ET_SWITCH_REAL_TYPES_AND( + Bool, a_type, ctx, "remainder.Tensor_out", CTYPE_A, [&]() { + ET_SWITCH_REAL_TYPES_AND( + Bool, b_type, ctx, "remainder.Tensor_out", CTYPE_B, [&]() { + using CTYPE_IN = typename torch::executor:: + promote_types::type; + ET_DCHECK(CppTypeToScalarType::value == common_type); + ET_SWITCH_REAL_TYPES( + out_type, ctx, "remainder.Tensor_out", CTYPE_OUT, [&]() { + RemainderInner< + can_cast::value, + CTYPE_A, + CTYPE_B, + CTYPE_IN, + CTYPE_OUT>::run(a, b, out); + }); + }); + }); return out; } @@ -236,8 +230,8 @@ Tensor& remainder_Scalar_out( static_cast(val_a); CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = remainder_override( - a_casted, b_casted); + CTYPE_IN value = + remainder_override(a_casted, b_casted); return static_cast(value); }, 
diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index c4ad8177cf..7e9c5f9dfa 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -133,7 +133,7 @@ Tensor& where_out( inp2_shape, p_brd_cond, con_shape); - free(p_scratch); + } else { xa_nn_elm_where_broadcast_4D_f32xf32_f32( out_data, diff --git a/backends/cadence/hifi/operators/quantized_linear_out.cpp b/backends/cadence/hifi/operators/quantized_linear_out.cpp index accc610132..b8e1d117fb 100644 --- a/backends/cadence/hifi/operators/quantized_linear_out.cpp +++ b/backends/cadence/hifi/operators/quantized_linear_out.cpp @@ -26,8 +26,7 @@ using ::executorch::aten::Tensor; using ::executorch::runtime::getLeadingDims; using ::executorch::runtime::KernelRuntimeContext; - - // The nnlib kernel to compute quantized linear via matmul. +// The nnlib kernel to compute quantized linear via matmul. void _quantized_linear_asym8u( const Tensor& in, @@ -48,22 +47,22 @@ void _quantized_linear_asym8u( const int32_t* __restrict__ bias_data = bias.const_data_ptr(); uint8_t* __restrict__ out_data = out.mutable_data_ptr(); int32_t ret = xa_nn_matmul_asym8uxasym8u_asym8u( - out_data, - weight_data, - in_data, - bias_data, - out_dim, - in_dim, - in_dim, - leading_dims, - in_dim, - out_dim, - 1, + out_data, + weight_data, + in_data, + bias_data, + out_dim, + in_dim, + in_dim, + leading_dims, + in_dim, + out_dim, + 1, -weight_zero_point.const_data_ptr()[0], // mat1_zero_bias -in_zero_point, // mat2_zero_bias - out_multiplier.const_data_ptr()[0], - out_shift.const_data_ptr()[0], - out_zero_point); + out_multiplier.const_data_ptr()[0], + out_shift.const_data_ptr()[0], + out_zero_point); ET_DCHECK_MSG(ret == 0, "HiFi quantized::linear failed"); } From 5e471f25bcc6a0141078f8518b19dd23f0cd35de Mon Sep 17 00:00:00 2001 From: nishpoonia <94543206+nishpoonia@users.noreply.github.com> Date: Mon, 23 Dec 2024 13:37:02 +0530 Subject: [PATCH 10/18] adding ET_KERNEL_CHECK for allocate_temp_memory (#41) * adding ET_KERNEL_CHECK for allocate_temp_memory * solving lint error * Removing redundant check --- backends/cadence/hifi/operators/op_atan2.cpp | 9 +++++++++ backends/cadence/hifi/operators/op_clamp.cpp | 4 ++++ backends/cadence/hifi/operators/op_mean.cpp | 2 ++ backends/cadence/hifi/operators/op_pow.cpp | 9 +++++++++ backends/cadence/hifi/operators/op_softmax.cpp | 5 +++++ backends/cadence/hifi/operators/op_where.cpp | 3 +++ 6 files changed, 32 insertions(+) diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp index 234432ae5d..2b0e3131c9 100644 --- a/backends/cadence/hifi/operators/op_atan2.cpp +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -66,10 +66,15 @@ Tensor& atan2_out( WORD32* __restrict__ ptr1 = (WORD32* __restrict__)kernels::allocate_temp_memory( ctx, num_elm * sizeof(WORD32)); + + ET_KERNEL_CHECK(ctx, ptr1 != nullptr, MemoryAllocationFailed, out); + WORD32* __restrict__ ptr2 = (WORD32* __restrict__)kernels::allocate_temp_memory( ctx, num_elm * sizeof(WORD32)); + ET_KERNEL_CHECK(ctx, ptr2 != nullptr, MemoryAllocationFailed, out); + WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); WORD32* __restrict__ pin2 = @@ -108,6 +113,8 @@ Tensor& atan2_out( (FLOAT32* __restrict__)kernels::allocate_temp_memory( ctx, num_elm * sizeof(WORD32)); + ET_KERNEL_CHECK(ctx, ptr1 != nullptr, MemoryAllocationFailed, out); + FLOAT32* __restrict__ pin1 = (FLOAT32* __restrict__)a.const_data_ptr(); @@ 
-137,6 +144,8 @@ Tensor& atan2_out( (WORD32* __restrict__)kernels::allocate_temp_memory( ctx, num_elm * sizeof(WORD32)); + ET_KERNEL_CHECK(ctx, ptr1 != nullptr, MemoryAllocationFailed, out); + WORD32* __restrict__ pin1 = (WORD32* __restrict__)b.const_data_ptr(); diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp index 8ed57a8e16..e8d83e37da 100644 --- a/backends/cadence/hifi/operators/op_clamp.cpp +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -359,6 +359,10 @@ Tensor& clamp_tensor_out( ctx, (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) * sizeof(int)); + + ET_KERNEL_CHECK( + ctx, p_scratch != nullptr, MemoryAllocationFailed, out); + const FLOAT32* p_brd_cond = (const FLOAT32*)p_scratch; xa_nn_broadcast_32_32( (WORD32*)p_brd_cond, out_shape, (WORD32*)inp_data, inp_shape, 4); diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp index cdc844ec5c..2584f753f6 100644 --- a/backends/cadence/hifi/operators/op_mean.cpp +++ b/backends/cadence/hifi/operators/op_mean.cpp @@ -129,6 +129,8 @@ Tensor& mean_dim_out( (void* __restrict__)kernels::allocate_temp_memory( ctx, scratch_size * sizeof(int)); + ET_KERNEL_CHECK(ctx, p_scratch_in != nullptr, MemoryAllocationFailed, out); + xa_nn_reduce_mean_4D_f32_f32( p_out, out_shape, diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index 3c42698f21..a2775ca19f 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -122,10 +122,15 @@ Tensor& pow_Tensor_Tensor_out( WORD32* __restrict__ ptr1 = (WORD32* __restrict__)kernels::allocate_temp_memory( ctx, num_elm * sizeof(int)); + + ET_KERNEL_CHECK(ctx, ptr1 != nullptr, MemoryAllocationFailed, out); + WORD32* __restrict__ ptr2 = (WORD32* __restrict__)kernels::allocate_temp_memory( ctx, num_elm * sizeof(int)); + ET_KERNEL_CHECK(ctx, ptr2 != nullptr, MemoryAllocationFailed, out); + WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); WORD32* __restrict__ pin2 = @@ -158,6 +163,8 @@ Tensor& pow_Tensor_Tensor_out( (FLOAT32* __restrict__)kernels::allocate_temp_memory( ctx, num_elm * sizeof(int)); + ET_KERNEL_CHECK(ctx, ptr1 != nullptr, MemoryAllocationFailed, out); + FLOAT32* __restrict__ pin1 = (FLOAT32* __restrict__)a.const_data_ptr(); @@ -185,6 +192,8 @@ Tensor& pow_Tensor_Tensor_out( (WORD32* __restrict__)kernels::allocate_temp_memory( ctx, num_elm * sizeof(int)); + ET_KERNEL_CHECK(ctx, ptr1 != nullptr, MemoryAllocationFailed, out); + WORD32* __restrict__ pin1 = (WORD32* __restrict__)b.const_data_ptr(); diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp index a2068fd15b..0d687ff095 100644 --- a/backends/cadence/hifi/operators/op_softmax.cpp +++ b/backends/cadence/hifi/operators/op_softmax.cpp @@ -101,9 +101,14 @@ Tensor& softmax_out( int* p_out = (int*)kernels::allocate_temp_memory(ctx, out.numel() * sizeof(int)); + + ET_KERNEL_CHECK(ctx, p_out != nullptr, MemoryAllocationFailed, out); + int* p_out1 = (int*)kernels::allocate_temp_memory(ctx, out.numel() * sizeof(int)); + ET_KERNEL_CHECK(ctx, p_out1 != nullptr, MemoryAllocationFailed, out); + WORD32 ret_val = xa_nn_transpose_32_32( p_out, p_out_shape, diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index 7e9c5f9dfa..0bb3883df6 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ 
b/backends/cadence/hifi/operators/op_where.cpp @@ -113,6 +113,9 @@ Tensor& where_out( ctx, (out_shape[0] * out_shape[1] * out_shape[2] * out_shape[3]) * sizeof(int)); + + ET_KERNEL_CHECK(ctx, p_scratch != nullptr, MemoryAllocationFailed, out); + const unsigned char* p_brd_cond = (const unsigned char*)p_scratch; xa_nn_broadcast_8_8( (WORD8* __restrict__)p_brd_cond, From 991961b3527db12887f253b5c73eb9274efde595 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Wed, 8 Jan 2025 22:35:13 -0800 Subject: [PATCH 11/18] Fixing lint error due to merge --- backends/cadence/hifi/operators/op_pow.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index 2526b34db5..a2775ca19f 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -131,7 +131,6 @@ Tensor& pow_Tensor_Tensor_out( ET_KERNEL_CHECK(ctx, ptr2 != nullptr, MemoryAllocationFailed, out); - WORD32* __restrict__ pin1 = (WORD32* __restrict__)a.const_data_ptr(); WORD32* __restrict__ pin2 = From 540243a75af5de6ae46ab35b5e8da225119af309 Mon Sep 17 00:00:00 2001 From: dijopaul <87994875+dijopaul@users.noreply.github.com> Date: Thu, 9 Jan 2025 13:57:42 +0530 Subject: [PATCH 12/18] Update functions_hifi.yaml - fixing build issue on previous commit --- backends/cadence/aot/functions_hifi.yaml | 25 ------------------------ 1 file changed, 25 deletions(-) diff --git a/backends/cadence/aot/functions_hifi.yaml b/backends/cadence/aot/functions_hifi.yaml index 9a72b50255..61cf07c4d5 100644 --- a/backends/cadence/aot/functions_hifi.yaml +++ b/backends/cadence/aot/functions_hifi.yaml @@ -107,11 +107,6 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::minimum_out -- op: minimum.out - kernels: - - arg_meta: null - kernel_name: cadence::impl::HiFi::minimum_out - - op: mul.out kernels: - arg_meta: null @@ -147,26 +142,6 @@ - arg_meta: null kernel_name: cadence::impl::HiFi::rsqrt_out -- op: pow.Scalar_out - kernels: - - arg_meta: null - kernel_name: cadence::impl::HiFi::pow_Scalar_out - -- op: pow.Tensor_Scalar_out - kernels: - - arg_meta: null - kernel_name: cadence::impl::HiFi::pow_Tensor_Scalar_out - -- op: pow.Tensor_Tensor_out - kernels: - - arg_meta: null - kernel_name: cadence::impl::HiFi::pow_Tensor_Tensor_out - -- op: rsqrt.out - kernels: - - arg_meta: null - kernel_name: cadence::impl::HiFi::rsqrt_out - - op: sigmoid.out kernels: - arg_meta: null From 1f681c732c3ba4b2a1bdb18665b9237b74c1f0ac Mon Sep 17 00:00:00 2001 From: Nishak Date: Fri, 10 Jan 2025 12:01:57 -0800 Subject: [PATCH 13/18] Incorporating review comments: removing nesting to check data type and removing exec_ten uses --- backends/cadence/hifi/operators/op_atan2.cpp | 68 +++-- backends/cadence/hifi/operators/op_cat.cpp | 64 ++--- backends/cadence/hifi/operators/op_clamp.cpp | 214 ++++++++------- backends/cadence/hifi/operators/op_full.cpp | 10 +- backends/cadence/hifi/operators/op_mean.cpp | 10 +- .../hifi/operators/op_permute_copy.cpp | 6 +- backends/cadence/hifi/operators/op_pow.cpp | 255 ++++++++---------- .../cadence/hifi/operators/op_remainder.cpp | 188 +++++++------ .../cadence/hifi/operators/op_softmax.cpp | 4 +- backends/cadence/hifi/operators/op_where.cpp | 69 +++-- .../hifi/operators/quantized_relu_out.cpp | 6 +- 11 files changed, 455 insertions(+), 439 deletions(-) diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp index 2b0e3131c9..fd595a935c 100644 --- 
a/backends/cadence/hifi/operators/op_atan2.cpp +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -8,26 +8,54 @@ #include #include +#include #include #include -using exec_aten::ScalarType; -using exec_aten::Tensor; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::isFloatingType; using executorch::runtime::KernelRuntimeContext; +using executorch::runtime::promoteTypes; using executorch::runtime::tensors_have_same_dim_order; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; +using torch::executor::native::utils::apply_bitensor_elementwise_fn; +using torch::executor::native::utils::get_compute_type; +using torch::executor::native::utils::SupportedTensorDtypes; namespace cadence { namespace impl { namespace HiFi { namespace native { +namespace { + +ScalarType get_common_type(ScalarType a_type, ScalarType b_type) { + if (isFloatingType(a_type) && isFloatingType(b_type)) { + return promoteTypes(a_type, b_type); + } else if (isFloatingType(a_type)) { + return a_type; + } else if (isFloatingType(b_type)) { + return b_type; + } + return ScalarType::Float; +} + +} // namespace + Tensor& atan2_out( KernelRuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { + // Common Dtype + ScalarType common_type = get_common_type(a.scalar_type(), b.scalar_type()); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + // Determine output size and resize for dynamic shapes ET_KERNEL_CHECK( ctx, @@ -35,14 +63,11 @@ Tensor& atan2_out( InvalidArgument, out); - ET_KERNEL_CHECK( - ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); ScalarType out_type = out.scalar_type(); - constexpr auto name = "atan2.out"; + ScalarType compute_type = get_compute_type(common_type); + + static constexpr const char op_name[] = "atan2.out"; constexpr int kNnlibMaxDim = 16; int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); bool optimized = true; @@ -180,21 +205,18 @@ Tensor& atan2_out( return out; } - ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { - ET_SWITCH_FLOATH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - torch::executor:: - apply_binary_elementwise_fn( - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_OUT casted_a = static_cast(val_a); - CTYPE_OUT casted_b = static_cast(val_b); - return static_cast(std::atan2(casted_a, casted_b)); - }, - a, - b, - out); - }); - }); + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + apply_bitensor_elementwise_fn( + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return std::atan2(val_a, val_b); + }, + ctx, + a, + SupportedTensorDtypes::REALHBBF16, + b, + SupportedTensorDtypes::REALHBBF16, + out, + SupportedTensorDtypes::FLOATHBF16); }); return out; diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp index 1a62892445..14b7abe5bb 100644 --- a/backends/cadence/hifi/operators/op_cat.cpp +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -12,9 +12,9 @@ #include -using exec_aten::ScalarType; -using exec_aten::Tensor; using executorch::aten::RuntimeContext; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; using executorch::runtime::getLeadingDims; using executorch::runtime::getTrailingDims; using executorch::runtime::resize_tensor; @@ -33,6 +33,36 @@ 
Tensor& cat_out( exec_aten::ArrayRef tensors, int64_t dim, Tensor& out) { + if (dim < 0) { + dim += out.dim(); + } + + ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), Internal, out); + + Tensor::SizesType + expected_out_size[executorch::runtime::kTensorDimensionLimit]; + size_t expected_out_dim = 0; + get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); + + ET_KERNEL_CHECK( + ctx, + resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, + InvalidArgument, + out); + + // Special handling when all inputs are 1D-empty tensors for aten consistency + // In that case, just return an 1D-empty tensor without checking dim + bool all_1d_empty = true; + for (size_t i = 0; i < tensors.size(); ++i) { + if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { + all_1d_empty = false; + break; + } + } + if (all_1d_empty) { + return out; + } + constexpr auto name = "cat.out"; constexpr int kNnlibMaxDim = 16; @@ -92,36 +122,6 @@ Tensor& cat_out( return out; } - if (dim < 0) { - dim += out.dim(); - } - - ET_KERNEL_CHECK(ctx, check_cat_args(tensors, dim, out), Internal, out); - - Tensor::SizesType - expected_out_size[executorch::runtime::kTensorDimensionLimit]; - size_t expected_out_dim = 0; - get_cat_out_target_size(tensors, dim, expected_out_size, &expected_out_dim); - - ET_KERNEL_CHECK( - ctx, - resize_tensor(out, {expected_out_size, expected_out_dim}) == Error::Ok, - InvalidArgument, - out); - - // Special handling when all inputs are 1D-empty tensors for aten consistency - // In that case, just return an 1D-empty tensor without checking dim - bool all_1d_empty = true; - for (size_t i = 0; i < tensors.size(); ++i) { - if (tensors[i].numel() != 0 || tensors[i].dim() != 1) { - all_1d_empty = false; - break; - } - } - if (all_1d_empty) { - return out; - } - const size_t outer = getLeadingDims(out, dim); const size_t dim_stride = getTrailingDims(out, dim); const size_t ninputs = tensors.size(); diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp index e8d83e37da..397e73d6e2 100644 --- a/backends/cadence/hifi/operators/op_clamp.cpp +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -15,26 +15,33 @@ #include #include #include +#include #include #include #include -using Scalar = exec_aten::Scalar; -using ScalarType = exec_aten::ScalarType; -using Tensor = exec_aten::Tensor; using executorch::aten::RuntimeContext; +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; using executorch::runtime::canCast; using executorch::runtime::isFloatingType; using executorch::runtime::isIntegralType; using executorch::runtime::promoteTypes; +using executorch::runtime::tensors_have_same_dim_order; using torch::executor::apply_ternary_elementwise_fn; using torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; +using torch::executor::native::utils::apply_tritensor_elementwise_fn; +using torch::executor::native::utils::apply_unitensor_elementwise_fn; using torch::executor::native::utils::extract_scalar; +using torch::executor::native::utils::get_compute_type; using torch::executor::native::utils::get_scalar_dtype; using torch::executor::native::utils::max_override; using torch::executor::native::utils::min_override; using torch::executor::native::utils::promote_type_with_scalar; +using torch::executor::native::utils::scalar_to; +using torch::executor::native::utils::SupportedTensorDtypes; namespace cadence { namespace impl { @@ -51,7 +58,7 @@ bool 
is_out_of_bounds(CTYPE_VAL val) { val_cast > std::numeric_limits::max(); } -__ET_NODISCARD bool check_bounds( +ET_NODISCARD bool check_bounds( const Scalar& val_scalar, const ScalarType& val_type, const ScalarType& out_type, @@ -85,40 +92,48 @@ __ET_NODISCARD bool check_bounds( } // namespace Tensor& clamp_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& in, const exec_aten::optional& min_opt, const exec_aten::optional& max_opt, Tensor& out) { - (void)ctx; + bool has_min = min_opt.has_value(); + bool has_max = max_opt.has_value(); ET_KERNEL_CHECK_MSG( ctx, - resize_tensor(out, in.sizes()) == Error::Ok, + has_min || has_max, InvalidArgument, out, - "Failed to resize output tensor."); + "At least one of 'min' or 'max' must not be None"); + // Input Dtypes ScalarType in_type = in.scalar_type(); - ScalarType min_type = in_type; - ScalarType max_type = in_type; - ScalarType common_type = in_type; + ScalarType min_type = has_min ? get_scalar_dtype(min_opt.value()) : in_type; + ScalarType max_type = has_max ? get_scalar_dtype(max_opt.value()) : in_type; ScalarType out_type = out.scalar_type(); - bool has_min = min_opt.has_value(); + // Common Dtype + ScalarType common_type = in_type; if (has_min) { - min_type = get_scalar_dtype(min_opt.value()); common_type = promote_type_with_scalar(common_type, min_opt.value()); + } + if (has_max) { + common_type = promote_type_with_scalar(common_type, max_opt.value()); + } + + // Check Common Dtype + ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + + // Check Scalar Bounds + if (has_min) { ET_KERNEL_CHECK( ctx, check_bounds(min_opt.value(), min_type, out_type, "minimum"), InvalidArgument, out); } - bool has_max = max_opt.has_value(); if (has_max) { - max_type = get_scalar_dtype(max_opt.value()); - common_type = promote_type_with_scalar(common_type, max_opt.value()); ET_KERNEL_CHECK( ctx, check_bounds(max_opt.value(), max_type, out_type, "maximum"), @@ -126,52 +141,39 @@ Tensor& clamp_out( out); } - ET_KERNEL_CHECK_MSG( - ctx, - has_min || has_max, - InvalidArgument, - out, - "At least one of 'min' or 'max' must not be None"); + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); - ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + // Resize + ET_KERNEL_CHECK( + ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); - ET_SWITCH_REALH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { - // Extract optional min value - CTYPE_OUT min = 0; - if (has_min) { - ET_SWITCH_SCALAR_OBJ_TYPES(min_type, ctx, "clamp", CTYPE_MIN, [&]() { - CTYPE_MIN min_val = 0; - extract_scalar(min_opt.value(), &min_val); - min = static_cast(min_val); - }); - } + // Compute Dtype + ScalarType compute_type = get_compute_type(common_type); - // Extract optional max value - CTYPE_OUT max = 0; - if (has_max) { - ET_SWITCH_SCALAR_OBJ_TYPES(max_type, ctx, "clamp", CTYPE_MAX, [&]() { - CTYPE_MAX max_val = 0; - extract_scalar(max_opt.value(), &max_val); - max = static_cast(max_val); - }); - } + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "clamp.out"; - ET_SWITCH_REALHB_TYPES(in_type, ctx, "clamp", CTYPE_IN, [&]() { - torch::executor::apply_unary_map_fn( - [has_min, min, has_max, max](const CTYPE_IN val_in) { - CTYPE_OUT val_out = static_cast(val_in); - if (has_min) { - val_out = max_override(val_out, min); - } - if (has_max) { - val_out = min_override(val_out, max); - } - return val_out; - }, - in.const_data_ptr(), - 
out.mutable_data_ptr(), - in.numel()); - }); + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + apply_unitensor_elementwise_fn( + [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { + CTYPE_COMPUTE val_out = val_in; + if (has_min) { + val_out = max_override( + val_out, scalar_to(min_opt.value())); + } + if (has_max) { + val_out = min_override( + val_out, scalar_to(max_opt.value())); + } + return val_out; + }, + ctx, + in, + SupportedTensorDtypes::REALHBBF16, + out, + SupportedTensorDtypes::SAME_AS_COMMON); }); return out; @@ -198,29 +200,42 @@ Tensor& clamp_tensor_out( const Tensor& min = has_min ? min_opt.value() : in; const Tensor& max = has_max ? max_opt.value() : in; + // Common Dtype + ScalarType common_type = in.scalar_type(); + if (has_min) { + common_type = promoteTypes(common_type, min.scalar_type()); + } + if (has_max) { + common_type = promoteTypes(common_type, max.scalar_type()); + } + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, canCast(common_type, out.scalar_type()), InvalidArgument, out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, + tensors_have_same_dim_order(in, min, max, out), + InvalidArgument, + out); + + // Resize ET_KERNEL_CHECK( ctx, resize_to_broadcast_target_size(in, min, max, out) == Error::Ok, InvalidArgument, out); + // Compute Dtype + ScalarType compute_type = get_compute_type(common_type); + constexpr int kNnlibMaxDim = 4; /*fallback to not optimised if broadcast and dim > 4 */ ScalarType in_type = in.scalar_type(); ScalarType min_type = min.scalar_type(); ScalarType max_type = max.scalar_type(); - ScalarType common_type = in_type; - ScalarType out_type = out.scalar_type(); - - if (has_min) { - common_type = promoteTypes(common_type, min_type, /*half_to_float*/ true); - } - if (has_max) { - common_type = promoteTypes(common_type, max_type, /*half_to_float*/ true); - } - - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); bool in_is_broadcasted = !out.sizes().equals(in.sizes()); bool min_is_broadcasted = !out.sizes().equals(min.sizes()); @@ -406,40 +421,35 @@ Tensor& clamp_tensor_out( return out; } - constexpr auto name = "clamp.Tensor_out"; - - ET_SWITCH_REALHB_TYPES(in_type, ctx, name, CTYPE_IN, [&]() { - ET_SWITCH_REALHB_TYPES(min_type, ctx, name, CTYPE_MIN, [&]() { - ET_SWITCH_REALHB_TYPES(max_type, ctx, name, CTYPE_MAX, [&]() { - ET_SWITCH_REALHB_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - apply_ternary_elementwise_fn< - CTYPE_IN, - CTYPE_MIN, - CTYPE_MAX, - CTYPE_OUT>( - [has_min, has_max]( - const CTYPE_IN val_in, - const CTYPE_MIN val_min, - const CTYPE_MAX val_max) { - CTYPE_OUT val_out = static_cast(val_in); - if (has_min) { - val_out = - max_override(val_out, static_cast(val_min)); - } - if (has_max) { - val_out = - min_override(val_out, static_cast(val_max)); - } - return val_out; - }, - in, - min, - max, - out); - }); - }); - }); + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "clamp.Tensor_out"; + + ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + apply_tritensor_elementwise_fn( + [has_min, has_max]( + const CTYPE_COMPUTE val_in, + const CTYPE_COMPUTE val_min, + const CTYPE_COMPUTE val_max) { + CTYPE_COMPUTE val_out = val_in; + if (has_min) { + val_out = max_override(val_out, val_min); + } + if (has_max) { + val_out = min_override(val_out, val_max); + } + return val_out; + }, + ctx, + in, + SupportedTensorDtypes::REALHBBF16, + min, + SupportedTensorDtypes::REALHBBF16, + max, + 
SupportedTensorDtypes::REALHBBF16, + out, + SupportedTensorDtypes::REALHBBF16); }); + return out; } } // namespace native diff --git a/backends/cadence/hifi/operators/op_full.cpp b/backends/cadence/hifi/operators/op_full.cpp index 47804a64f4..38f8ff85d4 100644 --- a/backends/cadence/hifi/operators/op_full.cpp +++ b/backends/cadence/hifi/operators/op_full.cpp @@ -16,11 +16,11 @@ namespace impl { namespace HiFi { namespace native { -using exec_aten::IntArrayRef; -using exec_aten::RuntimeContext; -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; +using executorch::aten::IntArrayRef; +using executorch::aten::RuntimeContext; +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; using torch::executor::Error; using torch::executor::native::utils::extract_scalar; using torch::executor::native::utils::get_scalar_dtype; diff --git a/backends/cadence/hifi/operators/op_mean.cpp b/backends/cadence/hifi/operators/op_mean.cpp index c81a37271e..342c982a07 100644 --- a/backends/cadence/hifi/operators/op_mean.cpp +++ b/backends/cadence/hifi/operators/op_mean.cpp @@ -6,7 +6,6 @@ * LICENSE file in the root directory of this source tree. */ -#include #include #include #include @@ -14,9 +13,9 @@ #include -using exec_aten::ScalarType; -using exec_aten::Tensor; using executorch::aten::RuntimeContext; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; using executorch::runtime::ArrayRef; using torch::executor::Error; using torch::executor::optional; @@ -146,10 +145,11 @@ Tensor& mean_dim_out( return out; } - ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, "mean.out", CTYPE_IN, [&] { - ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, "mean.out", CTYPE_OUT, [&] { + ET_SWITCH_REALHB_TYPES(in.scalar_type(), ctx, name, CTYPE_IN, [&] { + ET_SWITCH_FLOATH_TYPES(out.scalar_type(), ctx, name, CTYPE_OUT, [&] { CTYPE_OUT* out_data = out.mutable_data_ptr(); const size_t num = torch::executor::get_reduced_dim_product(in, dim_list); + for (size_t out_ix = 0; out_ix < out.numel(); ++out_ix) { CTYPE_OUT sum = 0; if (in.numel() > 0) { diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp index bb72eaf521..65fb647b94 100644 --- a/backends/cadence/hifi/operators/op_permute_copy.cpp +++ b/backends/cadence/hifi/operators/op_permute_copy.cpp @@ -10,9 +10,9 @@ #include #include -using exec_aten::ScalarType; -using exec_aten::SizesType; -using exec_aten::Tensor; +using executorch::aten::ScalarType; +using executorch::aten::SizesType; +using executorch::aten::Tensor; using executorch::runtime::IntArrayRef; using executorch::runtime::KernelRuntimeContext; using executorch::runtime::kTensorDimensionLimit; diff --git a/backends/cadence/hifi/operators/op_pow.cpp b/backends/cadence/hifi/operators/op_pow.cpp index a2775ca19f..6ca7ccdebe 100644 --- a/backends/cadence/hifi/operators/op_pow.cpp +++ b/backends/cadence/hifi/operators/op_pow.cpp @@ -11,94 +11,67 @@ #include #include #include +#include #include #include #include -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; using executorch::runtime::can_cast; using executorch::runtime::canCast; using executorch::runtime::CppTypeToScalarType; using executorch::runtime::KernelRuntimeContext; using executorch::runtime::promoteTypes; +using executorch::runtime::tensors_have_same_dim_order; using 
torch::executor::Error; using torch::executor::resize_to_broadcast_target_size; +using torch::executor::native::utils::apply_bitensor_elementwise_fn; +using torch::executor::native::utils::apply_unitensor_elementwise_fn; +using torch::executor::native::utils::get_compute_type; +using torch::executor::native::utils::promote_type_with_scalar; +using torch::executor::native::utils::scalar_to; +using torch::executor::native::utils::SupportedTensorDtypes; namespace cadence { namespace impl { namespace HiFi { namespace native { -namespace { -template < - bool can_cast, - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct PowInner; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct PowInner { - static void run(const Tensor& a, const Tensor& b, Tensor& out) { - torch::executor::apply_binary_elementwise_fn( - // NOLINTNEXTLINE(facebook-hte-ConstantArgumentPassByValue) - [](const CTYPE_A val_a, const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = std::pow(a_casted, b_casted); - return static_cast(value); - }, - a, - b, - out); - } -}; - -struct ReportCanCastBug { - static void run(const Tensor&, const Tensor&, Tensor&) { - ET_DCHECK_MSG(false, "BUG: canCast should have been checked above"); - } -}; - -template < - typename CTYPE_A, - typename CTYPE_B, - typename CTYPE_IN, - typename CTYPE_OUT> -struct PowInner - : public ReportCanCastBug {}; - -} // namespace - Tensor& pow_Tensor_Tensor_out( KernelRuntimeContext& ctx, const Tensor& a, const Tensor& b, Tensor& out) { - // Determine output size and resize for dynamic shapes + // Common Dtype + ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type()); + + // Check Common Dtype ET_KERNEL_CHECK( ctx, - resize_to_broadcast_target_size(a, b, out) == Error::Ok, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), InvalidArgument, out); - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType common_type = promoteTypes(a_type, b_type, /*half_to_float*/ true); - ScalarType out_type = out.scalar_type(); + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + // Resize ET_KERNEL_CHECK( - ctx, common_type != exec_aten::ScalarType::Bool, InvalidArgument, out); - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + // Compute Dtype + ScalarType compute_type = get_compute_type(common_type); + if (compute_type != ScalarType::Float) { + compute_type = ScalarType::Double; + } - constexpr auto name = "pow.Tensor_Tensor_out"; constexpr int kNnlibMaxDim = 16; int a_dim = a.dim(), b_dim = b.dim(), out_dim = out.dim(); bool optimized = true; @@ -109,6 +82,8 @@ Tensor& pow_Tensor_Tensor_out( int max_dim = a.dim() > b.dim() ? a.dim() : b.dim(); max_dim = out.dim() > max_dim ? 
out.dim() : max_dim; + ScalarType out_type = out.scalar_type(); + if (out_type != ScalarType::Float) optimized = false; @@ -228,21 +203,21 @@ Tensor& pow_Tensor_Tensor_out( return out; } - ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - PowInner< - !std::is_same::value && - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, out); - }); - }); + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "pow.Tensor_Tensor_out"; + + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + apply_bitensor_elementwise_fn( + [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + return std::pow(val_a, val_b); + }, + ctx, + a, + SupportedTensorDtypes::REALHBBF16, + b, + SupportedTensorDtypes::REALHBBF16, + out, + SupportedTensorDtypes::REALHBF16); }); return out; @@ -253,50 +228,43 @@ Tensor& pow_Tensor_Scalar_out( const Tensor& a, const Scalar& b, Tensor& out) { - (void)ctx; + // Common Dtype + ScalarType common_type = promote_type_with_scalar(a.scalar_type(), b); - // Resize for dynamic shape - ET_KERNEL_CHECK_MSG( + // Check Common Dtype + ET_KERNEL_CHECK( ctx, - resize_tensor(out, a.sizes()) == Error::Ok, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), InvalidArgument, - out, - "Failed to resize output tensor."); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = torch::executor::native::utils::get_scalar_dtype(b); - ScalarType common_type = - torch::executor::native::utils::promote_type_with_scalar( - a_type, b, /*half_to_float*/ false); - ScalarType out_type = out.scalar_type(); + out); - ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out); - constexpr auto name = "pow.Tensor_Scalar_out"; - if (common_type == ScalarType::Half) { - common_type = ScalarType::Float; + // Compute Dtype + ScalarType compute_type = get_compute_type(common_type); + if (compute_type != ScalarType::Float) { + compute_type = ScalarType::Double; } - ET_SWITCH_REALHB_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_SCALAR_OBJ_TYPES(b_type, ctx, name, CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { - ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - CTYPE_B val_b = 0; - torch::executor::native::utils::extract_scalar(b, &val_b); - torch::executor::apply_unary_map_fn( - [val_b](const CTYPE_A val_a) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = std::pow(a_casted, b_casted); - - return static_cast(value); - }, - a.const_data_ptr(), - out.mutable_data_ptr(), - out.numel()); - }); - }); - }); + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "pow.Tensor_Scalar_out"; + + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = scalar_to(b); + apply_unitensor_elementwise_fn( + [val_b](const CTYPE_COMPUTE val_a) { return std::pow(val_a, val_b); }, + ctx, + a, + SupportedTensorDtypes::REALHBBF16, + out, + 
SupportedTensorDtypes::REALHBF16); }); return out; @@ -307,50 +275,43 @@ Tensor& pow_Scalar_out( const Scalar& a, const Tensor& b, Tensor& out) { - (void)ctx; + // Common Dtype + ScalarType common_type = promote_type_with_scalar(b.scalar_type(), a); - // Resize for dynamic shape - ET_KERNEL_CHECK_MSG( + // Check Common Dtype + ET_KERNEL_CHECK( ctx, - resize_tensor(out, b.sizes()) == Error::Ok, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), InvalidArgument, - out, - "Failed to resize output tensor."); - - ScalarType a_type = torch::executor::native::utils::get_scalar_dtype(a); - ScalarType b_type = b.scalar_type(); - ScalarType common_type = - torch::executor::native::utils::promote_type_with_scalar( - b_type, a, /*half_to_float*/ false); - ScalarType out_type = out.scalar_type(); + out); - ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(b, out), InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, resize_tensor(out, b.sizes()) == Error::Ok, InvalidArgument, out); - constexpr auto name = "pow.Scalar_out"; - if (common_type == ScalarType::Half) { - common_type = ScalarType::Float; + // Compute Dtype + ScalarType compute_type = get_compute_type(common_type); + if (compute_type != ScalarType::Float) { + compute_type = ScalarType::Double; } - ET_SWITCH_SCALAR_OBJ_TYPES(a_type, ctx, name, CTYPE_A, [&]() { - ET_SWITCH_REALHB_TYPES(b_type, ctx, name, CTYPE_B, [&]() { - ET_SWITCH_REAL_TYPES(common_type, ctx, name, CTYPE_IN, [&]() { - ET_SWITCH_REALH_TYPES(out_type, ctx, name, CTYPE_OUT, [&]() { - CTYPE_A val_a = 0; - torch::executor::native::utils::extract_scalar(a, &val_a); - - torch::executor::apply_unary_map_fn( - [val_a](const CTYPE_B val_b) { - CTYPE_IN a_casted = static_cast(val_a); - CTYPE_IN b_casted = static_cast(val_b); - CTYPE_IN value = std::pow(a_casted, b_casted); - return static_cast(value); - }, - b.const_data_ptr(), - out.mutable_data_ptr(), - out.numel()); - }); - }); - }); + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "pow.Scalar_out"; + + ET_SWITCH_FLOAT_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_a = scalar_to(a); + apply_unitensor_elementwise_fn( + [val_a](const CTYPE_COMPUTE val_b) { return std::pow(val_a, val_b); }, + ctx, + b, + SupportedTensorDtypes::REALHBBF16, + out, + SupportedTensorDtypes::REALHBF16); }); return out; diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp index 69c855f590..ef356457ee 100644 --- a/backends/cadence/hifi/operators/op_remainder.cpp +++ b/backends/cadence/hifi/operators/op_remainder.cpp @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -21,22 +22,29 @@ namespace impl { namespace HiFi { namespace native { -using exec_aten::Scalar; -using exec_aten::ScalarType; -using exec_aten::Tensor; using executorch::aten::RuntimeContext; +using executorch::aten::Scalar; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; using executorch::runtime::can_cast; using executorch::runtime::canCast; using executorch::runtime::CppTypeToScalarType; +using executorch::runtime::is_integral_type; using executorch::runtime::promoteTypes; +using executorch::runtime::tensors_have_same_dim_order; using torch::executor::apply_binary_elementwise_fn; using torch::executor::apply_unary_map_fn; using torch::executor::Error; using 
torch::executor::resize_to_broadcast_target_size; +using torch::executor::native::utils::apply_bitensor_elementwise_fn; +using torch::executor::native::utils::apply_unitensor_elementwise_fn; using torch::executor::native::utils::extract_scalar; +using torch::executor::native::utils::get_compute_type; using torch::executor::native::utils::get_scalar_dtype; using torch::executor::native::utils::promote_type_with_scalar; using torch::executor::native::utils::remainder_override; +using torch::executor::native::utils::scalar_to; +using torch::executor::native::utils::SupportedTensorDtypes; namespace { template < @@ -91,6 +99,30 @@ Tensor& remainder_Tensor_out( Tensor& out) { (void)ctx; + // Common Dtype + ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type()); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), + InvalidArgument, + out); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, b, out), InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, + resize_to_broadcast_target_size(a, b, out) == Error::Ok, + InvalidArgument, + out); + + // Compute Dtype + ScalarType compute_type = get_compute_type(common_type); constexpr int kNnlibMaxDim = 4; /*fallback to not optimised if broadcast and dim > 4 */ @@ -152,96 +184,96 @@ Tensor& remainder_Tensor_out( } return out; } - // Determine output size and resize for dynamic shapes - ET_KERNEL_CHECK( + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "remainder.Tensor_out"; + + bool div_by_zero_error = false; + + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + apply_bitensor_elementwise_fn( + [&div_by_zero_error]( + const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b) { + CTYPE_COMPUTE value = 0; + if (is_integral_type::value) { + if (val_b == 0) { + div_by_zero_error = true; + return value; + } + } + value = remainder_override(val_a, val_b); + return value; + }, + ctx, + a, + SupportedTensorDtypes::REALHBBF16, + b, + SupportedTensorDtypes::REALHBBF16, + out, + SupportedTensorDtypes::REALHBF16); + }); + + ET_KERNEL_CHECK_MSG( ctx, - resize_to_broadcast_target_size(a, b, out) == Error::Ok, + !div_by_zero_error, InvalidArgument, - out); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType common_type = promoteTypes(a_type, b_type); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - - ET_SWITCH_REAL_TYPES_AND( - Bool, a_type, ctx, "remainder.Tensor_out", CTYPE_A, [&]() { - ET_SWITCH_REAL_TYPES_AND( - Bool, b_type, ctx, "remainder.Tensor_out", CTYPE_B, [&]() { - using CTYPE_IN = typename torch::executor:: - promote_types::type; - ET_DCHECK(CppTypeToScalarType::value == common_type); - ET_SWITCH_REAL_TYPES( - out_type, ctx, "remainder.Tensor_out", CTYPE_OUT, [&]() { - RemainderInner< - can_cast::value, - CTYPE_A, - CTYPE_B, - CTYPE_IN, - CTYPE_OUT>::run(a, b, out); - }); - }); - }); + out, + "Remainder operation encountered integer division by zero"); return out; } Tensor& remainder_Scalar_out( - RuntimeContext& ctx, + KernelRuntimeContext& ctx, const Tensor& a, const Scalar& b, Tensor& out) { - (void)ctx; + // Common Dtype + ScalarType common_type = promote_type_with_scalar(a.scalar_type(), b); + + // Check Common Dtype + ET_KERNEL_CHECK( + ctx, + (canCast(common_type, out.scalar_type()) && + common_type != ScalarType::Bool), + InvalidArgument, + 
out); - // Resize for dynamic shape + // Check for integral division by zero ET_KERNEL_CHECK_MSG( ctx, - resize_tensor(out, a.sizes()) == Error::Ok, + !(executorch::runtime::isIntegralType(common_type, true) && + scalar_to(b) == 0), InvalidArgument, out, - "Failed to resize output tensor."); - - ScalarType a_type = a.scalar_type(); - ScalarType b_type = get_scalar_dtype(b); - ScalarType common_type = promote_type_with_scalar(a_type, b); - ScalarType out_type = out.scalar_type(); - - ET_KERNEL_CHECK(ctx, canCast(common_type, out_type), InvalidArgument, out); - - ET_SWITCH_REAL_TYPES_AND( - Bool, a_type, ctx, "remainder.Scalar_out", CTYPE_A, [&]() { - ET_SWITCH_SCALAR_OBJ_TYPES( - b_type, ctx, "remainder.Scalar_out", CTYPE_B, [&]() { - CTYPE_B val_b = 0; - extract_scalar(b, &val_b); - ET_SWITCH_REAL_TYPES( - common_type, ctx, "remainder.Scalar_out", CTYPE_IN, [&]() { - ET_SWITCH_REAL_TYPES( - out_type, - ctx, - "remainder.Scalar_out", - CTYPE_OUT, - [&]() { - apply_unary_map_fn( - [val_b](const CTYPE_A val_a) { - CTYPE_IN a_casted = - static_cast(val_a); - CTYPE_IN b_casted = - static_cast(val_b); - CTYPE_IN value = - remainder_override(a_casted, b_casted); - - return static_cast(value); - }, - a.const_data_ptr(), - out.mutable_data_ptr(), - out.numel()); - }); - }); - }); - }); + "Remainder operation encountered integer division by zero"); + + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(a, out), InvalidArgument, out); + + // Resize + ET_KERNEL_CHECK( + ctx, resize_tensor(out, a.sizes()) == Error::Ok, InvalidArgument, out); + + // Compute Dtype + ScalarType compute_type = get_compute_type(common_type); + + // @lint-ignore CLANGTIDY facebook-hte-CArray + static constexpr const char op_name[] = "remainder.Scalar_out"; + + ET_SWITCH_REAL_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { + const CTYPE_COMPUTE val_b = scalar_to(b); + apply_unitensor_elementwise_fn( + [val_b](const CTYPE_COMPUTE val_a) { + return remainder_override(val_a, val_b); + }, + ctx, + a, + SupportedTensorDtypes::REALHBBF16, + out, + SupportedTensorDtypes::REALHBF16); + }); return out; } diff --git a/backends/cadence/hifi/operators/op_softmax.cpp b/backends/cadence/hifi/operators/op_softmax.cpp index 0d687ff095..e026afd2c9 100644 --- a/backends/cadence/hifi/operators/op_softmax.cpp +++ b/backends/cadence/hifi/operators/op_softmax.cpp @@ -14,8 +14,8 @@ #include #include "kernels.h" -using Tensor = exec_aten::Tensor; -using exec_aten::ScalarType; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; using executorch::runtime::KernelRuntimeContext; using torch::executor::Error; diff --git a/backends/cadence/hifi/operators/op_where.cpp b/backends/cadence/hifi/operators/op_where.cpp index 3871761e7d..435c758f1b 100644 --- a/backends/cadence/hifi/operators/op_where.cpp +++ b/backends/cadence/hifi/operators/op_where.cpp @@ -8,15 +8,20 @@ #include #include -#include #include #include #include -using exec_aten::ScalarType; -using exec_aten::Tensor; using executorch::aten::RuntimeContext; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using executorch::runtime::promoteTypes; +using executorch::runtime::tensors_have_same_dim_order; using torch::executor::Error; +using torch::executor::resize_to_broadcast_target_size; +using torch::executor::native::utils::apply_tritensor_elementwise_fn; +using torch::executor::native::utils::get_compute_type; +using torch::executor::native::utils::SupportedTensorDtypes; namespace cadence { namespace impl { @@ -29,29
@@ Tensor& where_out( const Tensor& a, const Tensor& b, Tensor& out) { - ScalarType cond_type = cond.scalar_type(); - ScalarType a_type = a.scalar_type(); - ScalarType b_type = b.scalar_type(); - ScalarType common_type = executorch::runtime::promoteTypes(a_type, b_type); - ScalarType out_type = out.scalar_type(); + // Common Dtype + ScalarType common_type = promoteTypes(a.scalar_type(), b.scalar_type()); + + // Check Common Dtype + ET_KERNEL_CHECK(ctx, common_type == out.scalar_type(), InvalidArgument, out); - ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); + // Check Dim Order + ET_KERNEL_CHECK( + ctx, tensors_have_same_dim_order(cond, a, b, out), InvalidArgument, out); - // Determine output size and resize for dynamic shapes + // Resize ET_KERNEL_CHECK( ctx, - torch::executor::resize_to_broadcast_target_size(a, b, cond, out) == - Error::Ok, + resize_to_broadcast_target_size(a, b, cond, out) == Error::Ok, InvalidArgument, out); - constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ - constexpr auto name = "where.self_out"; + // Compute Dtype + ScalarType compute_type = get_compute_type(common_type); - ET_CHECK_MSG( - cond_type == ScalarType::Bool || cond_type == ScalarType::Byte, - "Unhandled dtype %s for where.self_out", - torch::executor::toString(cond_type)); + constexpr int kNnlibMaxDim = 4; /*fallback if broadcast and dim > 4 */ int a_dim = a.dim(), b_dim = b.dim(), con_dim = cond.dim(), out_dim = out.dim(); @@ -67,6 +70,9 @@ Tensor& where_out( max_dim = cond.dim() > max_dim ? cond.dim() : max_dim; max_dim = out.dim() > max_dim ? out.dim() : max_dim; + ScalarType a_type = a.scalar_type(); + ScalarType b_type = b.scalar_type(); + if ((a_type != ScalarType::Float) || (b_type != ScalarType::Float)) optimized = 0; @@ -155,41 +161,26 @@ Tensor& where_out( } return out; } - - // Compute Dtype - ScalarType compute_type = - torch::executor::native::utils::get_compute_type(common_type); - // @lint-ignore CLANGTIDY facebook-hte-CArray static constexpr const char op_name[] = "where.self_out"; ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - torch::executor::native::utils::apply_tritensor_elementwise_fn< - CTYPE_COMPUTE, - op_name>( + apply_tritensor_elementwise_fn( [](const CTYPE_COMPUTE val_a, const CTYPE_COMPUTE val_b, const CTYPE_COMPUTE val_c) { return val_c ? 
val_a : val_b; }, ctx, a, - torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + SupportedTensorDtypes::REALHBBF16, b, - torch::executor::native::utils::SupportedTensorDtypes::REALHBBF16, + SupportedTensorDtypes::REALHBBF16, cond, - torch::executor::native::utils::SupportedTensorDtypes::BOOL_OR_BYTE, + SupportedTensorDtypes::BOOL_OR_BYTE, out, - torch::executor::native::utils::SupportedTensorDtypes::SAME_AS_COMMON); + SupportedTensorDtypes::SAME_AS_COMMON); }); - return out; -} -Tensor& where_self_out( - RuntimeContext& ctx, - const Tensor& cond, - const Tensor& a, - const Tensor& b, - Tensor& out) { - return cadence::impl::HiFi::native::where_out(ctx, cond, a, b, out); + return out; } } // namespace native diff --git a/backends/cadence/hifi/operators/quantized_relu_out.cpp b/backends/cadence/hifi/operators/quantized_relu_out.cpp index 6b7fae6e05..d78e555ad1 100644 --- a/backends/cadence/hifi/operators/quantized_relu_out.cpp +++ b/backends/cadence/hifi/operators/quantized_relu_out.cpp @@ -9,9 +9,9 @@ #include #include -using Tensor = exec_aten::Tensor; -using KernelRuntimeContext = torch::executor::KernelRuntimeContext; -using ScalarType = exec_aten::ScalarType; +using executorch::aten::ScalarType; +using executorch::aten::Tensor; +using torch::executor::KernelRuntimeContext; namespace cadence { namespace impl { From 3539f52c76f37c3fa868b88e5d05ecc2e7cf89c7 Mon Sep 17 00:00:00 2001 From: Nishak Date: Mon, 13 Jan 2025 10:18:46 -0800 Subject: [PATCH 14/18] clean up --- backends/cadence/hifi/operators/op_full.cpp | 10 +++++----- backends/cadence/hifi/operators/op_remainder.cpp | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/backends/cadence/hifi/operators/op_full.cpp b/backends/cadence/hifi/operators/op_full.cpp index 38f8ff85d4..3d30433d37 100644 --- a/backends/cadence/hifi/operators/op_full.cpp +++ b/backends/cadence/hifi/operators/op_full.cpp @@ -11,11 +11,6 @@ #include #include -namespace cadence { -namespace impl { -namespace HiFi { -namespace native { - using executorch::aten::IntArrayRef; using executorch::aten::RuntimeContext; using executorch::aten::Scalar; @@ -25,6 +20,11 @@ using torch::executor::Error; using torch::executor::native::utils::extract_scalar; using torch::executor::native::utils::get_scalar_dtype; +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + Tensor& full_out( RuntimeContext& ctx, const IntArrayRef sizes, diff --git a/backends/cadence/hifi/operators/op_remainder.cpp b/backends/cadence/hifi/operators/op_remainder.cpp index ef356457ee..d8c4a6d2d8 100644 --- a/backends/cadence/hifi/operators/op_remainder.cpp +++ b/backends/cadence/hifi/operators/op_remainder.cpp @@ -17,11 +17,6 @@ #include "kernels.h" -namespace cadence { -namespace impl { -namespace HiFi { -namespace native { - using executorch::aten::RuntimeContext; using executorch::aten::Scalar; using executorch::aten::ScalarType; @@ -46,6 +41,11 @@ using torch::executor::native::utils::remainder_override; using torch::executor::native::utils::scalar_to; using torch::executor::native::utils::SupportedTensorDtypes; +namespace cadence { +namespace impl { +namespace HiFi { +namespace native { + namespace { template < bool can_cast, From 4923b83e0e369389f2699a2fd3c5d4a7c81d4672 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Tue, 21 Jan 2025 01:45:53 -0800 Subject: [PATCH 15/18] Fixing review comment on PR 7567 --- examples/portable/executor_runner/executor_runner.cpp | 1 - 1 file changed, 1 deletion(-) diff --git 
a/examples/portable/executor_runner/executor_runner.cpp b/examples/portable/executor_runner/executor_runner.cpp index 514a82c0ff..65ba762743 100644 --- a/examples/portable/executor_runner/executor_runner.cpp +++ b/examples/portable/executor_runner/executor_runner.cpp @@ -179,7 +179,6 @@ int main(int argc, char** argv) { // Run the model. Error status = method->execute(); - ET_CHECK_MSG( status == Error::Ok, "Execution of method %s failed with status 0x%" PRIx32, From 224aaf47ed50b55bd5fcdd9ba687bbc1106a01c1 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Thu, 23 Jan 2025 02:50:33 -0800 Subject: [PATCH 16/18] Fixing review comments in PR 7567 --- backends/cadence/hifi/operators/op_atan2.cpp | 3 +- backends/cadence/hifi/operators/op_clamp.cpp | 131 ------------------ .../hifi/operators/op_permute_copy.cpp | 52 ++----- .../nnlib/xa_nn_elm_minimum_maximum_f32.c | 9 +- 4 files changed, 18 insertions(+), 177 deletions(-) diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp index fd595a935c..24a562b9fe 100644 --- a/backends/cadence/hifi/operators/op_atan2.cpp +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -6,11 +6,12 @@ * LICENSE file in the root directory of this source tree. */ +#include #include #include #include #include -#include + using executorch::aten::ScalarType; using executorch::aten::Tensor; diff --git a/backends/cadence/hifi/operators/op_clamp.cpp b/backends/cadence/hifi/operators/op_clamp.cpp index 397e73d6e2..d31161a7d5 100644 --- a/backends/cadence/hifi/operators/op_clamp.cpp +++ b/backends/cadence/hifi/operators/op_clamp.cpp @@ -48,137 +48,6 @@ namespace impl { namespace HiFi { namespace native { -namespace { - -template -/** Check if val, when cast to CTYPE_CAST, is not in the range of CTYPE_OUT */ -bool is_out_of_bounds(CTYPE_VAL val) { - const CTYPE_CAST val_cast = static_cast(val); - return val_cast < std::numeric_limits::lowest() || - val_cast > std::numeric_limits::max(); -} - -ET_NODISCARD bool check_bounds( - const Scalar& val_scalar, - const ScalarType& val_type, - const ScalarType& out_type, - const char* val_name) { - auto is_valid = true; - - ET_SWITCH_SCALAR_OBJ_TYPES(val_type, ctx, "clamp.out", CTYPE_VAL, [&]() { - CTYPE_VAL val = 0; - extract_scalar(val_scalar, &val); - if (isIntegralType(out_type, /*includeBool=*/false)) { - ET_SWITCH_INT_TYPES(out_type, ctx, "clamp.out", CTYPE_OUT, [&]() { - if (is_out_of_bounds(val)) { - ET_LOG(Error, "%s value out of bounds", val_name); - is_valid = false; - } - }); - } else if (isFloatingType(out_type)) { - ET_SWITCH_FLOATH_TYPES(out_type, ctx, "clamp", CTYPE_OUT, [&]() { - if (std::isfinite(val) && - is_out_of_bounds(val)) { - ET_LOG(Error, "%s value out of bounds", val_name); - is_valid = false; - } - }); - } - }); - - return is_valid; -} - -} // namespace - -Tensor& clamp_out( - KernelRuntimeContext& ctx, - const Tensor& in, - const exec_aten::optional& min_opt, - const exec_aten::optional& max_opt, - Tensor& out) { - bool has_min = min_opt.has_value(); - bool has_max = max_opt.has_value(); - - ET_KERNEL_CHECK_MSG( - ctx, - has_min || has_max, - InvalidArgument, - out, - "At least one of 'min' or 'max' must not be None"); - - // Input Dtypes - ScalarType in_type = in.scalar_type(); - ScalarType min_type = has_min ? get_scalar_dtype(min_opt.value()) : in_type; - ScalarType max_type = has_max ? 
get_scalar_dtype(max_opt.value()) : in_type; - ScalarType out_type = out.scalar_type(); - - // Common Dtype - ScalarType common_type = in_type; - if (has_min) { - common_type = promote_type_with_scalar(common_type, min_opt.value()); - } - if (has_max) { - common_type = promote_type_with_scalar(common_type, max_opt.value()); - } - - // Check Common Dtype - ET_KERNEL_CHECK(ctx, common_type == out_type, InvalidArgument, out); - - // Check Scalar Bounds - if (has_min) { - ET_KERNEL_CHECK( - ctx, - check_bounds(min_opt.value(), min_type, out_type, "minimum"), - InvalidArgument, - out); - } - if (has_max) { - ET_KERNEL_CHECK( - ctx, - check_bounds(max_opt.value(), max_type, out_type, "maximum"), - InvalidArgument, - out); - } - - // Check Dim Order - ET_KERNEL_CHECK( - ctx, tensors_have_same_dim_order(in, out), InvalidArgument, out); - - // Resize - ET_KERNEL_CHECK( - ctx, resize_tensor(out, in.sizes()) == Error::Ok, InvalidArgument, out); - - // Compute Dtype - ScalarType compute_type = get_compute_type(common_type); - - // @lint-ignore CLANGTIDY facebook-hte-CArray - static constexpr const char op_name[] = "clamp.out"; - - ET_SWITCH_REALB_TYPES(compute_type, ctx, op_name, CTYPE_COMPUTE, [&]() { - apply_unitensor_elementwise_fn( - [has_min, min_opt, has_max, max_opt](const CTYPE_COMPUTE val_in) { - CTYPE_COMPUTE val_out = val_in; - if (has_min) { - val_out = max_override( - val_out, scalar_to(min_opt.value())); - } - if (has_max) { - val_out = min_override( - val_out, scalar_to(max_opt.value())); - } - return val_out; - }, - ctx, - in, - SupportedTensorDtypes::REALHBBF16, - out, - SupportedTensorDtypes::SAME_AS_COMMON); - }); - - return out; -} - Tensor& clamp_tensor_out( RuntimeContext& ctx, const Tensor& in, diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp index 65fb647b94..ba3ce7cefe 100644 --- a/backends/cadence/hifi/operators/op_permute_copy.cpp +++ b/backends/cadence/hifi/operators/op_permute_copy.cpp @@ -85,22 +85,22 @@ Tensor& permute_copy_out( optimized = false; if (optimized) { - if (in_type == ScalarType::Float) { - WORD32* p_inp = (WORD32*)in.const_data_ptr(); - WORD32* p_out = (WORD32*)out.mutable_data_ptr(); + WORD32 num_inp_dims = in.dim(); + WORD32 num_out_dims = num_inp_dims; - WORD32 num_inp_dims = in.dim(); - WORD32 num_out_dims = num_inp_dims; + WORD32 p_inp_shape[kNnlibMaxDim]; + WORD32 p_out_shape[kNnlibMaxDim]; + WORD32 p_permute_vec[kNnlibMaxDim]; - WORD32 p_inp_shape[kNnlibMaxDim]; - WORD32 p_out_shape[kNnlibMaxDim]; - WORD32 p_permute_vec[kNnlibMaxDim]; + for (int i = 0; i < num_inp_dims; i++) { + p_inp_shape[i] = in.size(i); + p_out_shape[i] = in.size(dims[i]); + p_permute_vec[i] = dims[i]; + } - for (int i = 0; i < num_inp_dims; i++) { - p_inp_shape[i] = in.size(i); - p_out_shape[i] = in.size(dims[i]); - p_permute_vec[i] = dims[i]; - } + if (in_type == ScalarType::Float) { + WORD32* p_inp = (WORD32*)in.const_data_ptr(); + WORD32* p_out = (WORD32*)out.mutable_data_ptr(); WORD32 ret_val = xa_nn_transpose_32_32( p_out, @@ -117,19 +117,6 @@ Tensor& permute_copy_out( WORD8* p_inp = (WORD8*)in.const_data_ptr(); WORD8* p_out = (WORD8*)out.mutable_data_ptr(); - WORD32 num_inp_dims = in.dim(); - WORD32 num_out_dims = num_inp_dims; - - WORD32 p_inp_shape[kNnlibMaxDim]; - WORD32 p_out_shape[kNnlibMaxDim]; - WORD32 p_permute_vec[kNnlibMaxDim]; - - for (int i = 0; i < num_inp_dims; i++) { - p_inp_shape[i] = in.size(i); - p_out_shape[i] = in.size(dims[i]); - p_permute_vec[i] = dims[i]; - } - WORD32 val = 
xa_nn_transpose_8_8( p_out, p_out_shape, @@ -145,19 +132,6 @@ Tensor& permute_copy_out( WORD8* p_inp = (WORD8*)in.const_data_ptr(); WORD8* p_out = (WORD8*)out.mutable_data_ptr(); - WORD32 num_inp_dims = in.dim(); - WORD32 num_out_dims = num_inp_dims; - - WORD32 p_inp_shape[kNnlibMaxDim]; - WORD32 p_out_shape[kNnlibMaxDim]; - WORD32 p_permute_vec[kNnlibMaxDim]; - - for (int i = 0; i < num_inp_dims; i++) { - p_inp_shape[i] = in.size(i); - p_out_shape[i] = in.size(dims[i]); - p_permute_vec[i] = dims[i]; - } - WORD32 val = xa_nn_transpose_8_8( p_out, p_out_shape, diff --git a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c index ea7ed151cb..50d24c8bae 100644 --- a/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c +++ b/backends/cadence/hifi/third-party/nnlib/xa_nn_elm_minimum_maximum_f32.c @@ -19,12 +19,9 @@ * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ******************************************************************************/ -#include "nnlib-hifi4/xa_nnlib/include/xa_type_def.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_common_fpu.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nn_common.h" -#include "nnlib-hifi4/xa_nnlib/algo/common/include/xa_nnlib_err_chk.h" -#include "nnlib-hifi4/xa_nnlib/algo/kernels/basic/hifi4/xa_nn_basic_state.h" -#include "nnlib-hifi4/xa_nnlib/include/nnlib/xa_nnlib_kernels_api.h" +#include "xa_type_def.h" +#include "xa_nnlib_common_fpu.h" +#include "xa_nnlib_err_chk.h" #if !HAVE_VFPU DISCARD_FUN_FOR_NONVOID_RETURN( From 6409958ca1f230f192e15afe28f362bf13001985 Mon Sep 17 00:00:00 2001 From: dijopaul Date: Fri, 24 Jan 2025 00:15:26 -0800 Subject: [PATCH 17/18] Fixing lint error in PR7567 --- backends/cadence/hifi/operators/op_atan2.cpp | 3 +-- backends/cadence/hifi/operators/op_permute_copy.cpp | 6 +++--- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/backends/cadence/hifi/operators/op_atan2.cpp b/backends/cadence/hifi/operators/op_atan2.cpp index 24a562b9fe..fd595a935c 100644 --- a/backends/cadence/hifi/operators/op_atan2.cpp +++ b/backends/cadence/hifi/operators/op_atan2.cpp @@ -6,12 +6,11 @@ * LICENSE file in the root directory of this source tree. 
*/ -#include #include #include #include #include - +#include using executorch::aten::ScalarType; using executorch::aten::Tensor; diff --git a/backends/cadence/hifi/operators/op_permute_copy.cpp b/backends/cadence/hifi/operators/op_permute_copy.cpp index ba3ce7cefe..1d56d79dfd 100644 --- a/backends/cadence/hifi/operators/op_permute_copy.cpp +++ b/backends/cadence/hifi/operators/op_permute_copy.cpp @@ -93,9 +93,9 @@ Tensor& permute_copy_out( WORD32 p_permute_vec[kNnlibMaxDim]; for (int i = 0; i < num_inp_dims; i++) { - p_inp_shape[i] = in.size(i); - p_out_shape[i] = in.size(dims[i]); - p_permute_vec[i] = dims[i]; + p_inp_shape[i] = in.size(i); + p_out_shape[i] = in.size(dims[i]); + p_permute_vec[i] = dims[i]; } if (in_type == ScalarType::Float) { From d62648a8f45996f336c860d4c2f3f2e3ee5bfb8a Mon Sep 17 00:00:00 2001 From: dijopaul Date: Fri, 24 Jan 2025 05:45:26 -0800 Subject: [PATCH 18/18] Updating cat to support Int variant --- backends/cadence/hifi/operators/op_cat.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/backends/cadence/hifi/operators/op_cat.cpp b/backends/cadence/hifi/operators/op_cat.cpp index 14b7abe5bb..e367d71b79 100644 --- a/backends/cadence/hifi/operators/op_cat.cpp +++ b/backends/cadence/hifi/operators/op_cat.cpp @@ -68,7 +68,8 @@ Tensor& cat_out( bool optimized = true; - if (out.scalar_type() != ScalarType::Float) + if ((out.scalar_type() != ScalarType::Float) && + (out.scalar_type() != ScalarType::Int)) optimized = false; if (optimized) {
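
Note for reviewers: the op_permute_copy.cpp refactor in PATCH 16/18 (with the indentation follow-up in PATCH 17/18) hoists the shape/permutation setup out of the three per-dtype branches so it is computed once before dispatching to xa_nn_transpose_32_32 / xa_nn_transpose_8_8. The standalone sketch below shows what that hoisted loop computes; build_permute_args, the plain int arrays, and the kNnlibMaxDim value of 4 are illustrative assumptions for this note, not code taken from the patch.

```cpp
#include <array>
#include <cstdio>

constexpr int kNnlibMaxDim = 4; // assumption: mirrors the backend's max supported rank

// Build the shape/permutation arrays the NNLib transpose kernels expect.
// `in_shape` holds the input sizes, `dims` the requested dimension order.
void build_permute_args(
    const std::array<int, kNnlibMaxDim>& in_shape,
    const std::array<int, kNnlibMaxDim>& dims,
    int num_dims,
    int* p_inp_shape,
    int* p_out_shape,
    int* p_permute_vec) {
  for (int i = 0; i < num_dims; i++) {
    p_inp_shape[i] = in_shape[i];        // input extent along dim i
    p_out_shape[i] = in_shape[dims[i]];  // output dim i takes its size from input dim dims[i]
    p_permute_vec[i] = dims[i];          // permutation vector handed to the transpose kernel
  }
}

int main() {
  std::array<int, kNnlibMaxDim> in_shape{2, 3, 4, 1};
  std::array<int, kNnlibMaxDim> dims{2, 0, 1, 3};
  int inp[kNnlibMaxDim], out[kNnlibMaxDim], perm[kNnlibMaxDim];
  build_permute_args(in_shape, dims, /*num_dims=*/3, inp, out, perm);
  std::printf("%d %d %d\n", out[0], out[1], out[2]); // prints: 4 2 3
  return 0;
}
```

Because this setup no longer depends on the element type, the Float, Char, and Short branches can share one copy of it and differ only in the kernel call, which is the duplication the review comments asked to remove.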
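PATCH 18/18 widens the optimized-path guard in cat_out so Int outputs also take the NNLib route; Float and Int are both 32-bit element types, which is presumably why the same kernel path can serve both. A minimal sketch of that dtype gate, with hypothetical helper names standing in for the two paths (they are not functions from this patch or from NNLib):

```cpp
#include <cstdio>

// Hypothetical stand-ins for the two execution paths; only the dispatch
// shape mirrors the patch, the names and bodies are illustrative.
enum class ScalarType { Float, Int, Double, Char };

void run_optimized_cat() { std::puts("NNLib-optimized concat path"); }
void run_portable_cat() { std::puts("portable fallback concat path"); }

// Mirrors the guard added in PATCH 18/18: only Float and Int outputs take
// the optimized path; any other dtype drops to the portable kernel.
void cat_dispatch(ScalarType out_dtype) {
  bool optimized = true;

  if ((out_dtype != ScalarType::Float) && (out_dtype != ScalarType::Int))
    optimized = false;

  if (optimized)
    run_optimized_cat();
  else
    run_portable_cat();
}

int main() {
  cat_dispatch(ScalarType::Int);    // now optimized after this patch
  cat_dispatch(ScalarType::Double); // still falls back
  return 0;
}
```

Dtypes outside {Float, Int} keep the pre-patch behaviour and fall back to the portable ExecuTorch kernel, so the change is additive for Int without affecting other types.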