From 647736803f965a21c76b155880a58b44b16cb8af Mon Sep 17 00:00:00 2001 From: Scott Wolchok Date: Tue, 7 Jan 2025 14:21:08 -0800 Subject: [PATCH] Use c10 version of half/bfloat16 in executorch Pull Request resolved: https://github.com/pytorch/executorch/pull/7040 Pull Request resolved: https://github.com/pytorch/pytorch/pull/144111 Accomplished by importing relevant files from c10 into executorch/runtime/core/portable_type/c10, and then using `using` in the top-level ExecuTorch headers. This approach should keep the ExecuTorch build hermetic for embedded use cases. In the future, we should add a CI job to ensure the c10 files stay identical to the PyTorch ones. ghstack-source-id: 260556840 @exported-using-ghexport Differential Revision: [D66106969](https://our.internmc.facebook.com/intern/diff/D66106969/) --- .lintrunner.toml | 4 + CMakeLists.txt | 13 +- backends/qualcomm/CMakeLists.txt | 2 + backends/xnnpack/CMakeLists.txt | 2 +- build/executorch-config.cmake | 13 +- runtime/core/portable_type/bfloat16.h | 328 +------- runtime/core/portable_type/bfloat16_math.h | 278 +------ runtime/core/portable_type/c10/TARGETS | 8 + .../core/portable_type/c10/macros/Export.h | 160 ++++ .../core/portable_type/c10/macros/Macros.h | 511 ++++++++++++ runtime/core/portable_type/c10/targets.bzl | 100 +++ .../portable_type/c10/util/BFloat16-inl.h | 343 ++++++++ .../portable_type/c10/util/BFloat16-math.h | 292 +++++++ .../core/portable_type/c10/util/BFloat16.h | 130 +++ .../core/portable_type/c10/util/Half-inl.h | 350 ++++++++ runtime/core/portable_type/c10/util/Half.h | 423 ++++++++++ .../portable_type/c10/util/TypeSafeSignMath.h | 140 ++++ .../core/portable_type/c10/util/bit_cast.h | 44 + .../c10/util/floating_point_utils.h | 33 + runtime/core/portable_type/half.h | 759 +----------------- runtime/core/portable_type/targets.bzl | 3 + runtime/kernel/test/CMakeLists.txt | 5 +- shim/xplat/executorch/build/env_interface.bzl | 5 +- 23 files changed, 2585 insertions(+), 1361 deletions(-) create mode 100644 runtime/core/portable_type/c10/TARGETS create mode 100644 runtime/core/portable_type/c10/macros/Export.h create mode 100644 runtime/core/portable_type/c10/macros/Macros.h create mode 100644 runtime/core/portable_type/c10/targets.bzl create mode 100644 runtime/core/portable_type/c10/util/BFloat16-inl.h create mode 100644 runtime/core/portable_type/c10/util/BFloat16-math.h create mode 100644 runtime/core/portable_type/c10/util/BFloat16.h create mode 100644 runtime/core/portable_type/c10/util/Half-inl.h create mode 100644 runtime/core/portable_type/c10/util/Half.h create mode 100644 runtime/core/portable_type/c10/util/TypeSafeSignMath.h create mode 100644 runtime/core/portable_type/c10/util/bit_cast.h create mode 100644 runtime/core/portable_type/c10/util/floating_point_utils.h diff --git a/.lintrunner.toml b/.lintrunner.toml index cd8a8d535e..3f8865f53b 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -78,6 +78,8 @@ exclude_patterns = [ # File contains @generated 'extension/llm/custom_ops/spinquant/fast_hadamard_transform_special.h', 'extension/llm/custom_ops/spinquant/test/fast_hadamard_transform_special_unstrided_cpu.h', + # Want to be able to keep c10 in sync with PyTorch core. + 'runtime/core/portable_type/c10/**', ] command = [ 'python', @@ -261,6 +263,8 @@ exclude_patterns = [ 'extension/**', 'kernels/optimized/**', 'runtime/core/exec_aten/**', + # Want to be able to keep c10 in sync with PyTorch core. + 'runtime/core/portable_type/c10/**', 'runtime/executor/tensor_parser_aten.cpp', 'scripts/**', 'test/**', diff --git a/CMakeLists.txt b/CMakeLists.txt index 8a9102848d..3363b108ce 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -361,7 +361,7 @@ if(NOT "${_repo_dir_name}" STREQUAL "executorch") "fix for this restriction." ) endif() -set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/..) +set(_common_include_directories ${CMAKE_CURRENT_SOURCE_DIR}/.. ${CMAKE_CURRENT_SOURCE_DIR}/runtime/core/portable_type) # # The `__srcs` lists are defined by including ${EXECUTORCH_SRCS_FILE}. @@ -544,6 +544,7 @@ endif() target_include_directories( executorch_core PUBLIC ${_common_include_directories} ) +target_compile_definitions(executorch_core PUBLIC C10_USING_CUSTOM_GENERATED_MACROS) target_compile_options(executorch_core PUBLIC ${_common_compile_options}) if(MAX_KERNEL_NUM) target_compile_definitions( @@ -564,6 +565,7 @@ if(EXECUTORCH_BUILD_PYBIND AND APPLE) target_include_directories( executorch_core_shared PUBLIC ${_common_include_directories} ) + target_compile_definitions(executorch_core_shared PUBLIC C10_USING_CUSTOM_GENERATED_MACROS) target_compile_options( executorch_core_shared PUBLIC ${_common_compile_options} ) @@ -584,6 +586,7 @@ endif() add_library(executorch ${_executorch__srcs}) target_link_libraries(executorch PRIVATE executorch_core) target_include_directories(executorch PUBLIC ${_common_include_directories}) +target_compile_definitions(executorch PUBLIC C10_USING_CUSTOM_GENERATED_MACROS) target_compile_options(executorch PUBLIC ${_common_compile_options}) target_link_options_shared_lib(executorch) @@ -617,6 +620,12 @@ endif() # Install `executorch` library as well as `executorch-config.cmake` under # ${CMAKE_INSTALL_PREFIX}/ +install(DIRECTORY runtime/core/ DESTINATION include/executorch/runtime/core FILES_MATCHING PATTERN "*.h") +install(DIRECTORY runtime/kernel/ DESTINATION include/executorch/runtime/kernel FILES_MATCHING PATTERN "*.h") +install(DIRECTORY runtime/platform/ DESTINATION include/executorch/runtime/platform FILES_MATCHING PATTERN "*.h") +install(DIRECTORY extension/kernel_util/ DESTINATION include/executorch/extension/kernel_util FILES_MATCHING PATTERN "*.h") +install(DIRECTORY extension/tensor/ DESTINATION include/executorch/extension/tensor FILES_MATCHING PATTERN "*.h") +install(DIRECTORY extension/threadpool/ DESTINATION include/executorch/extension/threadpool FILES_MATCHING PATTERN "*.h") install( TARGETS executorch executorch_core DESTINATION lib @@ -775,6 +784,8 @@ if(EXECUTORCH_BUILD_PYBIND) target_include_directories( util PUBLIC ${_common_include_directories} ${TORCH_INCLUDE_DIRS} ) + target_compile_definitions(util PUBLIC C10_USING_CUSTOM_GENERATED_MACROS) + target_compile_options(util PUBLIC ${_pybind_compile_options}) target_link_libraries(util PRIVATE torch c10 executorch extension_tensor) diff --git a/backends/qualcomm/CMakeLists.txt b/backends/qualcomm/CMakeLists.txt index 3c66796594..d58c19c3e1 100644 --- a/backends/qualcomm/CMakeLists.txt +++ b/backends/qualcomm/CMakeLists.txt @@ -53,6 +53,7 @@ add_custom_command( ) add_compile_options("-Wall" "-Werror" "-Wno-sign-compare") +add_compile_definitions(C10_USING_CUSTOM_GENERATED_MACROS) # GNU emit wanring for ignored attributes Unfortunately, we use [[maybe_unused]] # which can be ignored by GNU. So we make it a warning, not an error in GNU. @@ -72,6 +73,7 @@ endif() include_directories( BEFORE ${_common_include_directories} ${QNN_SDK_ROOT}/include/QNN ${EXECUTORCH_SOURCE_DIR}/third-party/flatbuffers/include + ${EXECUTORCH_SOURCE_DIR}/runtime/core/portable_type ) set(_qnn_schema__srcs diff --git a/backends/xnnpack/CMakeLists.txt b/backends/xnnpack/CMakeLists.txt index ed8cf8d8e1..72834d111b 100644 --- a/backends/xnnpack/CMakeLists.txt +++ b/backends/xnnpack/CMakeLists.txt @@ -129,7 +129,7 @@ if(NOT CMAKE_TOOLCHAIN_FILE MATCHES ".*(iOS|ios\.toolchain)\.cmake$") list(TRANSFORM _xnn_executor_runner__srcs PREPEND "${EXECUTORCH_ROOT}/") add_executable(xnn_executor_runner ${_xnn_executor_runner__srcs}) target_link_libraries( - xnn_executor_runner xnnpack_backend gflags portable_ops_lib + xnn_executor_runner xnnpack_backend gflags portable_ops_lib executorch ) target_compile_options(xnn_executor_runner PUBLIC ${_common_compile_options}) endif() diff --git a/build/executorch-config.cmake b/build/executorch-config.cmake index 96e6390b6d..40c28d0b96 100644 --- a/build/executorch-config.cmake +++ b/build/executorch-config.cmake @@ -26,20 +26,21 @@ cmake_minimum_required(VERSION 3.19) -set(_root "${CMAKE_CURRENT_LIST_DIR}/../..") +set(_root "${CMAKE_CURRENT_LIST_DIR}/../../..") set(required_lib_list executorch executorch_core portable_kernels) set(EXECUTORCH_LIBRARIES) -set(EXECUTORCH_INCLUDE_DIRS ${_root}) +set(EXECUTORCH_INCLUDE_DIRS ${_root}/include ${_root}/include/executorch/runtime/core/portable_type ${_root}/lib) foreach(lib ${required_lib_list}) set(lib_var "LIB_${lib}") add_library(${lib} STATIC IMPORTED) find_library( ${lib_var} ${lib} - HINTS "${_root}" + HINTS "${_root}/lib" CMAKE_FIND_ROOT_PATH_BOTH ) set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}) + target_compile_definitions(${lib} INTERFACE C10_USING_CUSTOM_GENERATED_MACROS) + target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type ${_root}/lib) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endforeach() @@ -93,7 +94,7 @@ foreach(lib ${lib_list}) set(lib_var "LIB_${lib}") find_library( ${lib_var} ${lib} - HINTS "${_root}" + HINTS "${_root}/lib" CMAKE_FIND_ROOT_PATH_BOTH ) if(NOT ${lib_var}) @@ -109,7 +110,7 @@ foreach(lib ${lib_list}) add_library(${lib} STATIC IMPORTED) endif() set_target_properties(${lib} PROPERTIES IMPORTED_LOCATION "${${lib_var}}") - target_include_directories(${lib} INTERFACE ${_root}) + target_include_directories(${lib} INTERFACE ${_root}/include ${_root}/include/executorch/runtime/core/portable_type ${_root}/lib) list(APPEND EXECUTORCH_LIBRARIES ${lib}) endif() endforeach() diff --git a/runtime/core/portable_type/bfloat16.h b/runtime/core/portable_type/bfloat16.h index c1ff250885..233d571478 100644 --- a/runtime/core/portable_type/bfloat16.h +++ b/runtime/core/portable_type/bfloat16.h @@ -8,260 +8,15 @@ #pragma once -#include -#include -#include -#include -#include - -namespace executorch { -namespace runtime { -namespace etensor { +#include +namespace executorch::runtime::etensor { +using c10::BFloat16; namespace internal { -inline float f32_from_bits(uint16_t src) { - float res = 0; - uint32_t tmp = src; - tmp <<= 16; - std::memcpy(&res, &tmp, sizeof(tmp)); - return res; -} - -inline uint16_t round_to_nearest_even(float src) { - if (std::isnan(src)) { - return UINT16_C(0x7FC0); - } - uint32_t U32 = 0; - std::memcpy(&U32, &src, sizeof(U32)); - uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); - return static_cast((U32 + rounding_bias) >> 16); -} +using c10::detail::f32_from_bits; +using c10::detail::round_to_nearest_even; } // namespace internal - -/** - * The "brain floating-point" type, compatible with c10/util/BFloat16.h from - * pytorch core. - * - * This representation uses 1 bit for the sign, 8 bits for the exponent and 7 - * bits for the mantissa. - */ -struct alignas(2) BFloat16 { - uint16_t x; - - BFloat16() = default; - struct from_bits_t {}; - static constexpr from_bits_t from_bits() { - return from_bits_t(); - } - - constexpr BFloat16(unsigned short bits, from_bits_t) : x(bits) {} - /* implicit */ BFloat16(float value) - : x(internal::round_to_nearest_even(value)) {} - operator float() const { - return internal::f32_from_bits(x); - } -}; - -inline std::ostream& operator<<(std::ostream& out, const BFloat16& value) { - out << (float)value; - return out; -} - -/// Arithmetic - -inline BFloat16 operator+(const BFloat16& a, const BFloat16& b) { - return static_cast(a) + static_cast(b); -} - -inline BFloat16 operator-(const BFloat16& a, const BFloat16& b) { - return static_cast(a) - static_cast(b); -} - -inline BFloat16 operator*(const BFloat16& a, const BFloat16& b) { - return static_cast(a) * static_cast(b); -} - -inline BFloat16 operator/(const BFloat16& a, const BFloat16& b) { - return static_cast(a) / static_cast(b); -} - -inline BFloat16 operator-(const BFloat16& a) { - return -static_cast(a); -} - -inline BFloat16& operator+=(BFloat16& a, const BFloat16& b) { - a = a + b; - return a; -} - -inline BFloat16& operator-=(BFloat16& a, const BFloat16& b) { - a = a - b; - return a; -} - -inline BFloat16& operator*=(BFloat16& a, const BFloat16& b) { - a = a * b; - return a; -} - -inline BFloat16& operator/=(BFloat16& a, const BFloat16& b) { - a = a / b; - return a; -} - -inline BFloat16& operator|(BFloat16& a, const BFloat16& b) { - a.x = a.x | b.x; - return a; -} - -inline BFloat16& operator^(BFloat16& a, const BFloat16& b) { - a.x = a.x ^ b.x; - return a; -} - -inline BFloat16& operator&(BFloat16& a, const BFloat16& b) { - a.x = a.x & b.x; - return a; -} - -/// Arithmetic with floats - -inline float operator+(BFloat16 a, float b) { - return static_cast(a) + b; -} -inline float operator-(BFloat16 a, float b) { - return static_cast(a) - b; -} -inline float operator*(BFloat16 a, float b) { - return static_cast(a) * b; -} -inline float operator/(BFloat16 a, float b) { - return static_cast(a) / b; -} - -inline float operator+(float a, BFloat16 b) { - return a + static_cast(b); -} -inline float operator-(float a, BFloat16 b) { - return a - static_cast(b); -} -inline float operator*(float a, BFloat16 b) { - return a * static_cast(b); -} -inline float operator/(float a, BFloat16 b) { - return a / static_cast(b); -} - -inline float& operator+=(float& a, const BFloat16& b) { - return a += static_cast(b); -} -inline float& operator-=(float& a, const BFloat16& b) { - return a -= static_cast(b); -} -inline float& operator*=(float& a, const BFloat16& b) { - return a *= static_cast(b); -} -inline float& operator/=(float& a, const BFloat16& b) { - return a /= static_cast(b); -} - -/// Arithmetic with doubles - -inline double operator+(BFloat16 a, double b) { - return static_cast(a) + b; -} -inline double operator-(BFloat16 a, double b) { - return static_cast(a) - b; -} -inline double operator*(BFloat16 a, double b) { - return static_cast(a) * b; -} -inline double operator/(BFloat16 a, double b) { - return static_cast(a) / b; -} - -inline double operator+(double a, BFloat16 b) { - return a + static_cast(b); -} -inline double operator-(double a, BFloat16 b) { - return a - static_cast(b); -} -inline double operator*(double a, BFloat16 b) { - return a * static_cast(b); -} -inline double operator/(double a, BFloat16 b) { - return a / static_cast(b); -} - -/// Arithmetic with ints - -inline BFloat16 operator+(BFloat16 a, int b) { - return a + static_cast(b); -} -inline BFloat16 operator-(BFloat16 a, int b) { - return a - static_cast(b); -} -inline BFloat16 operator*(BFloat16 a, int b) { - return a * static_cast(b); -} -inline BFloat16 operator/(BFloat16 a, int b) { - return a / static_cast(b); -} - -inline BFloat16 operator+(int a, BFloat16 b) { - return static_cast(a) + b; -} -inline BFloat16 operator-(int a, BFloat16 b) { - return static_cast(a) - b; -} -inline BFloat16 operator*(int a, BFloat16 b) { - return static_cast(a) * b; -} -inline BFloat16 operator/(int a, BFloat16 b) { - return static_cast(a) / b; -} - -//// Arithmetic with int64_t - -inline BFloat16 operator+(BFloat16 a, int64_t b) { - return a + static_cast(b); -} -inline BFloat16 operator-(BFloat16 a, int64_t b) { - return a - static_cast(b); -} -inline BFloat16 operator*(BFloat16 a, int64_t b) { - return a * static_cast(b); -} -inline BFloat16 operator/(BFloat16 a, int64_t b) { - return a / static_cast(b); -} - -inline BFloat16 operator+(int64_t a, BFloat16 b) { - return static_cast(a) + b; -} -inline BFloat16 operator-(int64_t a, BFloat16 b) { - return static_cast(a) - b; -} -inline BFloat16 operator*(int64_t a, BFloat16 b) { - return static_cast(a) * b; -} -inline BFloat16 operator/(int64_t a, BFloat16 b) { - return static_cast(a) / b; -} - -// Overloading < and > operators, because std::max and std::min use them. - -inline bool operator>(BFloat16& lhs, BFloat16& rhs) { - return float(lhs) > float(rhs); -} - -inline bool operator<(BFloat16& lhs, BFloat16& rhs) { - return float(lhs) < float(rhs); -} - -} // namespace etensor -} // namespace runtime -} // namespace executorch +} // namespace executorch::runtime::etensor namespace torch { namespace executor { @@ -270,74 +25,3 @@ namespace executor { using ::executorch::runtime::etensor::BFloat16; } // namespace executor } // namespace torch - -namespace std { - -template <> -class numeric_limits { - public: - static constexpr bool is_signed = true; - static constexpr bool is_specialized = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr auto has_denorm = numeric_limits::has_denorm; - static constexpr auto has_denorm_loss = - numeric_limits::has_denorm_loss; - static constexpr auto round_style = numeric_limits::round_style; - static constexpr bool is_iec559 = false; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - static constexpr int digits = 8; - static constexpr int digits10 = 2; - static constexpr int max_digits10 = 4; - static constexpr int radix = 2; - static constexpr int min_exponent = -125; - static constexpr int min_exponent10 = -37; - static constexpr int max_exponent = 128; - static constexpr int max_exponent10 = 38; - static constexpr auto traps = numeric_limits::traps; - static constexpr auto tinyness_before = - numeric_limits::tinyness_before; - - static constexpr torch::executor::BFloat16 min() { - return torch::executor::BFloat16( - 0x0080, torch::executor::BFloat16::from_bits()); - } - static constexpr torch::executor::BFloat16 lowest() { - return torch::executor::BFloat16( - 0xFF7F, torch::executor::BFloat16::from_bits()); - } - static constexpr torch::executor::BFloat16 max() { - return torch::executor::BFloat16( - 0x7F7F, torch::executor::BFloat16::from_bits()); - } - static constexpr torch::executor::BFloat16 epsilon() { - return torch::executor::BFloat16( - 0x3C00, torch::executor::BFloat16::from_bits()); - } - static constexpr torch::executor::BFloat16 round_error() { - return torch::executor::BFloat16( - 0x3F00, torch::executor::BFloat16::from_bits()); - } - static constexpr torch::executor::BFloat16 infinity() { - return torch::executor::BFloat16( - 0x7F80, torch::executor::BFloat16::from_bits()); - } - static constexpr torch::executor::BFloat16 quiet_NaN() { - return torch::executor::BFloat16( - 0x7FC0, torch::executor::BFloat16::from_bits()); - } - static constexpr torch::executor::BFloat16 signaling_NaN() { - return torch::executor::BFloat16( - 0x7F80, torch::executor::BFloat16::from_bits()); - } - static constexpr torch::executor::BFloat16 denorm_min() { - return torch::executor::BFloat16( - 0x0001, torch::executor::BFloat16::from_bits()); - } -}; - -} // namespace std diff --git a/runtime/core/portable_type/bfloat16_math.h b/runtime/core/portable_type/bfloat16_math.h index 68ee77cf34..3f6bf14a46 100644 --- a/runtime/core/portable_type/bfloat16_math.h +++ b/runtime/core/portable_type/bfloat16_math.h @@ -11,280 +11,4 @@ #include #include -namespace std { - -template -struct is_reduced_floating_point - : std::integral_constant< - bool, - std::is_same::value || - std::is_same::value> {}; - -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T acos(T a) { - return std::acos(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T asin(T a) { - return std::asin(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T atan(T a) { - return std::atan(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T atanh(T a) { - return std::atanh(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T erf(T a) { - return std::erf(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T erfc(T a) { - return std::erfc(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T exp(T a) { - return std::exp(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T expm1(T a) { - return std::expm1(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline bool isfinite(T a) { - return std::isfinite(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T log(T a) { - return std::log(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T log10(T a) { - return std::log10(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T log1p(T a) { - return std::log1p(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T log2(T a) { - return std::log2(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T ceil(T a) { - return std::ceil(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T cos(T a) { - return std::cos(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T floor(T a) { - return std::floor(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T nearbyint(T a) { - return std::nearbyint(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T sin(T a) { - return std::sin(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T tan(T a) { - return std::tan(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T sinh(T a) { - return std::sinh(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T cosh(T a) { - return std::cosh(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T tanh(T a) { - return std::tanh(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T trunc(T a) { - return std::trunc(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T lgamma(T a) { - return std::lgamma(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T sqrt(T a) { - return std::sqrt(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T rsqrt(T a) { - return 1.0 / std::sqrt(float(a)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T abs(T a) { - return std::abs(float(a)); -} -#if defined(_MSC_VER) && defined(__CUDACC__) -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T pow(T a, double b) { - return std::pow(float(a), float(b)); -} -#else -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T pow(T a, double b) { - return std::pow(float(a), b); -} -#endif -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T pow(T a, T b) { - return std::pow(float(a), float(b)); -} -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T fmod(T a, T b) { - return std::fmod(float(a), float(b)); -} - -/* - The following function is inspired from the implementation in `musl` - Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT - ---------------------------------------------------------------------- - Copyright © 2005-2020 Rich Felker, et al. - - Permission is hereby granted, free of charge, to any person obtaining - a copy of this software and associated documentation files (the - "Software"), to deal in the Software without restriction, including - without limitation the rights to use, copy, modify, merge, publish, - distribute, sublicense, and/or sell copies of the Software, and to - permit persons to whom the Software is furnished to do so, subject to - the following conditions: - - The above copyright notice and this permission notice shall be - included in all copies or substantial portions of the Software. - - THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, - EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF - MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. - IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY - CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, - TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE - SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - ---------------------------------------------------------------------- - */ -template < - typename T, - typename std::enable_if::value, int>::type = 0> -inline T nextafter(T from, T to) { - // Reference: - // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c - using int_repr_t = uint16_t; - constexpr uint8_t bits = 16; - union { - T f; - int_repr_t i; - } ufrom = {from}, uto = {to}; - - // get a mask to get the sign bit i.e. MSB - int_repr_t sign_mask = int_repr_t{1} << (bits - 1); - - // short-circuit: if either is NaN, return NaN - if (from != from || to != to) { - return from + to; - } - - // short-circuit: if they are exactly the same. - if (ufrom.i == uto.i) { - return from; - } - - // mask the sign-bit to zero i.e. positive - // equivalent to abs(x) - int_repr_t abs_from = ufrom.i & ~sign_mask; - int_repr_t abs_to = uto.i & ~sign_mask; - if (abs_from == 0) { - // if both are zero but with different sign, - // preserve the sign of `to`. - if (abs_to == 0) { - return to; - } - // smallest subnormal with sign of `to`. - ufrom.i = (uto.i & sign_mask) | int_repr_t{1}; - return ufrom.f; - } - - // if abs(from) > abs(to) or sign(from) != sign(to) - if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) { - ufrom.i--; - } else { - ufrom.i++; - } - - return ufrom.f; -} - -} // namespace std +#include diff --git a/runtime/core/portable_type/c10/TARGETS b/runtime/core/portable_type/c10/TARGETS new file mode 100644 index 0000000000..2341af9282 --- /dev/null +++ b/runtime/core/portable_type/c10/TARGETS @@ -0,0 +1,8 @@ +# Any targets that should be shared between fbcode and xplat must be defined in +# targets.bzl. This file can contain fbcode-only targets. + +load(":targets.bzl", "define_common_targets") + +oncall("executorch") + +define_common_targets() diff --git a/runtime/core/portable_type/c10/macros/Export.h b/runtime/core/portable_type/c10/macros/Export.h new file mode 100644 index 0000000000..cb68060ed8 --- /dev/null +++ b/runtime/core/portable_type/c10/macros/Export.h @@ -0,0 +1,160 @@ +#ifndef C10_MACROS_EXPORT_H_ +#define C10_MACROS_EXPORT_H_ + +/* Header file to define the common scaffolding for exported symbols. + * + * Export is by itself a quite tricky situation to deal with, and if you are + * hitting this file, make sure you start with the background here: + * - Linux: https://gcc.gnu.org/wiki/Visibility + * - Windows: + * https://docs.microsoft.com/en-us/cpp/cpp/dllexport-dllimport?view=vs-2017 + * + * Do NOT include this file directly. Instead, use c10/macros/Macros.h + */ + +// You do not need to edit this part of file unless you are changing the core +// pytorch export abstractions. +// +// This part defines the C10 core export and import macros. This is controlled +// by whether we are building shared libraries or not, which is determined +// during build time and codified in c10/core/cmake_macros.h. +// When the library is built as a shared lib, EXPORT and IMPORT will contain +// visibility attributes. If it is being built as a static lib, then EXPORT +// and IMPORT basically have no effect. + +// As a rule of thumb, you should almost NEVER mix static and shared builds for +// libraries that depend on c10. AKA, if c10 is built as a static library, we +// recommend everything dependent on c10 to be built statically. If c10 is built +// as a shared library, everything dependent on it should be built as shared. In +// the PyTorch project, all native libraries shall use the macro +// C10_BUILD_SHARED_LIB to check whether pytorch is building shared or static +// libraries. + +// For build systems that do not directly depend on CMake and directly build +// from the source directory (such as Buck), one may not have a cmake_macros.h +// file at all. In this case, the build system is responsible for providing +// correct macro definitions corresponding to the cmake_macros.h.in file. +// +// In such scenarios, one should define the macro +// C10_USING_CUSTOM_GENERATED_MACROS +// to inform this header that it does not need to include the cmake_macros.h +// file. + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif // C10_USING_CUSTOM_GENERATED_MACROS + +#ifdef _WIN32 +#define C10_HIDDEN +#if defined(C10_BUILD_SHARED_LIBS) +#define C10_EXPORT __declspec(dllexport) +#define C10_IMPORT __declspec(dllimport) +#else +#define C10_EXPORT +#define C10_IMPORT +#endif +#else // _WIN32 +#if defined(__GNUC__) +#define C10_EXPORT __attribute__((__visibility__("default"))) +#define C10_HIDDEN __attribute__((__visibility__("hidden"))) +#else // defined(__GNUC__) +#define C10_EXPORT +#define C10_HIDDEN +#endif // defined(__GNUC__) +#define C10_IMPORT C10_EXPORT +#endif // _WIN32 + +#ifdef NO_EXPORT +#undef C10_EXPORT +#define C10_EXPORT +#endif + +// Definition of an adaptive XX_API macro, that depends on whether you are +// building the library itself or not, routes to XX_EXPORT and XX_IMPORT. +// Basically, you will need to do this for each shared library that you are +// building, and the instruction is as follows: assuming that you are building +// a library called libawesome.so. You should: +// (1) for your cmake target (usually done by "add_library(awesome, ...)"), +// define a macro called AWESOME_BUILD_MAIN_LIB using +// target_compile_options. +// (2) define the AWESOME_API macro similar to the one below. +// And in the source file of your awesome library, use AWESOME_API to +// annotate public symbols. + +// Here, for the C10 library, we will define the macro C10_API for both import +// and export. + +// This one is being used by libc10.so +#ifdef C10_BUILD_MAIN_LIB +#define C10_API C10_EXPORT +#else +#define C10_API C10_IMPORT +#endif + +// This one is being used by libtorch.so +#ifdef CAFFE2_BUILD_MAIN_LIB +#define TORCH_API C10_EXPORT +#else +#define TORCH_API C10_IMPORT +#endif + +// You may be wondering: Whose brilliant idea was it to split torch_cuda into +// two pieces with confusing names? +// Once upon a time, there _was_ only TORCH_CUDA_API. All was happy until we +// tried to compile PyTorch for CUDA 11.1, which ran into relocation marker +// issues when linking big binaries. +// (https://github.com/pytorch/pytorch/issues/39968) We had two choices: +// (1) Stop supporting so many GPU architectures +// (2) Do something else +// We chose #2 and decided to split the behemoth that was torch_cuda into two +// smaller libraries, one with most of the core kernel functions (torch_cuda_cu) +// and the other that had..well..everything else (torch_cuda_cpp). The idea was +// this: instead of linking our static libraries (like the hefty +// libcudnn_static.a) with another huge library, torch_cuda, and run into pesky +// relocation marker issues, we could link our static libraries to a smaller +// part of torch_cuda (torch_cuda_cpp) and avoid the issues. + +// libtorch_cuda_cu.so +#ifdef TORCH_CUDA_CU_BUILD_MAIN_LIB +#define TORCH_CUDA_CU_API C10_EXPORT +#elif defined(BUILD_SPLIT_CUDA) +#define TORCH_CUDA_CU_API C10_IMPORT +#endif + +// libtorch_cuda_cpp.so +#ifdef TORCH_CUDA_CPP_BUILD_MAIN_LIB +#define TORCH_CUDA_CPP_API C10_EXPORT +#elif defined(BUILD_SPLIT_CUDA) +#define TORCH_CUDA_CPP_API C10_IMPORT +#endif + +// libtorch_cuda.so (where torch_cuda_cu and torch_cuda_cpp are a part of the +// same api) +#ifdef TORCH_CUDA_BUILD_MAIN_LIB +#define TORCH_CUDA_CPP_API C10_EXPORT +#define TORCH_CUDA_CU_API C10_EXPORT +#elif !defined(BUILD_SPLIT_CUDA) +#define TORCH_CUDA_CPP_API C10_IMPORT +#define TORCH_CUDA_CU_API C10_IMPORT +#endif + +#if defined(TORCH_HIP_BUILD_MAIN_LIB) +#define TORCH_HIP_API C10_EXPORT +#else +#define TORCH_HIP_API C10_IMPORT +#endif + +#if defined(TORCH_XPU_BUILD_MAIN_LIB) +#define TORCH_XPU_API C10_EXPORT +#else +#define TORCH_XPU_API C10_IMPORT +#endif + +// Enums only need to be exported on windows for non-CUDA files +#if defined(_WIN32) && defined(__CUDACC__) +#define C10_API_ENUM C10_API +#else +#define C10_API_ENUM +#endif + +#endif // C10_MACROS_MACROS_H_ diff --git a/runtime/core/portable_type/c10/macros/Macros.h b/runtime/core/portable_type/c10/macros/Macros.h new file mode 100644 index 0000000000..919eb6c856 --- /dev/null +++ b/runtime/core/portable_type/c10/macros/Macros.h @@ -0,0 +1,511 @@ +#ifndef C10_MACROS_MACROS_H_ +#define C10_MACROS_MACROS_H_ +#include + +/* Main entry for c10/macros. + * + * In your code, include c10/macros/Macros.h directly, instead of individual + * files in this folder. + */ + +// For build systems that do not directly depend on CMake and directly build +// from the source directory (such as Buck), one may not have a cmake_macros.h +// file at all. In this case, the build system is responsible for providing +// correct macro definitions corresponding to the cmake_macros.h.in file. +// +// In such scenarios, one should define the macro +// C10_USING_CUSTOM_GENERATED_MACROS +// to inform this header that it does not need to include the cmake_macros.h +// file. + +#ifndef C10_USING_CUSTOM_GENERATED_MACROS +#include +#endif // C10_USING_CUSTOM_GENERATED_MACROS + +#include + +#if defined(__clang__) +#define __ubsan_ignore_float_divide_by_zero__ \ + __attribute__((no_sanitize("float-divide-by-zero"))) +#define __ubsan_ignore_undefined__ __attribute__((no_sanitize("undefined"))) +#define __ubsan_ignore_signed_int_overflow__ \ + __attribute__((no_sanitize("signed-integer-overflow"))) +#define __ubsan_ignore_pointer_overflow__ \ + __attribute__((no_sanitize("pointer-overflow"))) +#define __ubsan_ignore_function__ __attribute__((no_sanitize("function"))) +#define __ubsan_ignore_float_cast_overflow__ \ + __attribute__((no_sanitize("float-cast-overflow"))) +#else +#define __ubsan_ignore_float_divide_by_zero__ +#define __ubsan_ignore_undefined__ +#define __ubsan_ignore_signed_int_overflow__ +#define __ubsan_ignore_pointer_overflow__ +#define __ubsan_ignore_function__ +#define __ubsan_ignore_float_cast_overflow__ +#endif + +// Detect address sanitizer as some stuff doesn't work with it +#undef C10_ASAN_ENABLED + +// for clang +#if defined(__has_feature) +#if ((__has_feature(address_sanitizer))) +#define C10_ASAN_ENABLED 1 +#endif +#endif + +// for gcc +#if defined(__SANITIZE_ADDRESS__) +#if __SANITIZE_ADDRESS__ +#if !defined(C10_ASAN_ENABLED) +#define C10_ASAN_ENABLED 1 +#endif +#endif +#endif + +#if !defined(C10_ASAN_ENABLED) +#define C10_ASAN_ENABLED 0 +#endif + +// Detect undefined-behavior sanitizer (UBSAN) +#undef C10_UBSAN_ENABLED + +// for clang or gcc >= 14 +// NB: gcc 14 adds support for Clang's __has_feature +// https://gcc.gnu.org/gcc-14/changes.html +// gcc < 14 doesn't have a macro for UBSAN +// (e.g. __SANITIZE_UNDEFINED__ does not exist in gcc) +// https://github.com/google/sanitizers/issues/765 +#if defined(__has_feature) +#if ((__has_feature(undefined_behavior_sanitizer))) +#define C10_UBSAN_ENABLED 1 +#endif +#endif + +#if !defined(C10_UBSAN_ENABLED) +#define C10_UBSAN_ENABLED 0 +#endif + +// Disable the copy and assignment operator for a class. Note that this will +// disable the usage of the class in std containers. +#define C10_DISABLE_COPY_AND_ASSIGN(classname) \ + classname(const classname&) = delete; \ + classname& operator=(const classname&) = delete + +#define C10_CONCATENATE_IMPL(s1, s2) s1##s2 +#define C10_CONCATENATE(s1, s2) C10_CONCATENATE_IMPL(s1, s2) + +#define C10_MACRO_EXPAND(args) args + +#define C10_STRINGIZE_IMPL(x) #x +#define C10_STRINGIZE(x) C10_STRINGIZE_IMPL(x) + +/** + * C10_ANONYMOUS_VARIABLE(str) introduces a new identifier which starts with + * str and ends with a unique number. + */ +#ifdef __COUNTER__ +#define C10_UID __COUNTER__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __COUNTER__) +#else +#define C10_UID __LINE__ +#define C10_ANONYMOUS_VARIABLE(str) C10_CONCATENATE(str, __LINE__) +#endif + +#ifdef __has_cpp_attribute +#define C10_HAS_CPP_ATTRIBUTE(x) __has_cpp_attribute(x) +#else +#define C10_HAS_CPP_ATTRIBUTE(x) (0) +#endif + +#ifndef FBCODE_CAFFE2 +/// DEPRECATED: Warn if a type or return value is discarded. +#define C10_NODISCARD [[nodiscard]] + +/// DEPRECATED: Suppress an unused variable. +#define C10_UNUSED [[maybe_unused]] +#endif + +#if !defined(__has_attribute) +#define __has_attribute(x) 0 +#endif + +// Direct port of LLVM_ATTRIBUTE_USED. +#if __has_attribute(used) +#define C10_USED __attribute__((__used__)) +#else +#define C10_USED +#endif + +#define C10_RESTRICT __restrict + +// Simply define the namespace, in case a dependent library want to refer to +// the c10 namespace but not any nontrivial files. +namespace c10 {} +namespace c10::cuda {} +namespace c10::hip {} +namespace c10::xpu {} + +// Since C10 is the core library for caffe2 (and aten), we will simply reroute +// all abstractions defined in c10 to be available in caffe2 as well. +// This is only for backwards compatibility. Please use the symbols from the +// c10 namespace where possible. +namespace caffe2 { +using namespace c10; +} +namespace at { +using namespace c10; +} +namespace at::cuda { +using namespace c10::cuda; +} // namespace at::cuda + +// WARNING!!! THIS IS A GIANT HACK!!! +// This line means you cannot simultaneously include c10/hip +// and c10/cuda and then use them from the at::cuda namespace. +// This is true in practice, because HIPIFY works inplace on +// files in ATen/cuda, so it assumes that c10::hip is available +// from at::cuda. This namespace makes that happen. When +// HIPIFY is no longer out-of-place, we can switch the cuda +// here to hip and everyone is happy. +namespace at::cuda { +using namespace c10::hip; +} // namespace at::cuda + +namespace at::xpu { +using namespace c10::xpu; +} // namespace at::xpu + +// C10_LIKELY/C10_UNLIKELY +// +// These macros provide parentheses, so you can use these macros as: +// +// if C10_LIKELY(some_expr) { +// ... +// } +// +// NB: static_cast to boolean is mandatory in C++, because __builtin_expect +// takes a long argument, which means you may trigger the wrong conversion +// without it. +// +#if defined(__GNUC__) || defined(__ICL) || defined(__clang__) +#define C10_LIKELY(expr) (__builtin_expect(static_cast(expr), 1)) +#define C10_UNLIKELY(expr) (__builtin_expect(static_cast(expr), 0)) +#else +#define C10_LIKELY(expr) (expr) +#define C10_UNLIKELY(expr) (expr) +#endif + +/// C10_NOINLINE - Functions whose declaration is annotated with this will not +/// be inlined. +#ifdef __GNUC__ +#define C10_NOINLINE __attribute__((noinline)) +#elif _MSC_VER +#define C10_NOINLINE __declspec(noinline) +#else +#define C10_NOINLINE +#endif + +#if defined(_MSC_VER) +#define C10_ALWAYS_INLINE __forceinline +#elif __has_attribute(always_inline) || defined(__GNUC__) +#define C10_ALWAYS_INLINE __attribute__((__always_inline__)) inline +#else +#define C10_ALWAYS_INLINE inline +#endif + +// Unlike C10_ALWAYS_INLINE, C10_ALWAYS_INLINE_ATTRIBUTE can be used +// on a lambda. +#if defined(_MSC_VER) +// MSVC 14.39 is reasonably recent and doesn't like +// [[msvc::forceinline]] on a lambda, so don't try to use it. +#define C10_ALWAYS_INLINE_ATTRIBUTE +#elif __has_attribute(always_inline) || defined(__GNUC__) +#define C10_ALWAYS_INLINE_ATTRIBUTE __attribute__((__always_inline__)) +#else +#define C10_ALWAYS_INLINE_ATTRIBUTE +#endif + +#if defined(_MSC_VER) +#define C10_ATTR_VISIBILITY_HIDDEN +#elif defined(__GNUC__) +#define C10_ATTR_VISIBILITY_HIDDEN __attribute__((__visibility__("hidden"))) +#else +#define C10_ATTR_VISIBILITY_HIDDEN +#endif + +#define C10_ERASE C10_ALWAYS_INLINE C10_ATTR_VISIBILITY_HIDDEN + +#include + +#ifdef __HIPCC__ +// Unlike CUDA, HIP requires a HIP header to be included for __host__ to work. +// We do this #include here so that C10_HOST_DEVICE and friends will Just Work. +// See https://github.com/ROCm-Developer-Tools/HIP/issues/441 +#include +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) +// Designates functions callable from the host (CPU) and the device (GPU) +#define C10_HOST_DEVICE __host__ __device__ +#define C10_DEVICE __device__ +#define C10_HOST __host__ +// constants from +// (https://docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#features-and-technical-specifications) +// The maximum number of threads per multiprocessor is 1024 for Turing +// architecture (7.5), 1536 for Geforce Ampere (8.6)/Jetson Orin (8.7), and +// 2048 for all other architectures. You'll get warnings if you exceed these +// constants. Hence, the following macros adjust the input values from the user +// to resolve potential warnings. +#if __CUDA_ARCH__ == 750 +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1024; +#elif __CUDA_ARCH__ == 860 || __CUDA_ARCH__ == 870 || __CUDA_ARCH__ == 890 +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 1536; +#else +constexpr uint32_t CUDA_MAX_THREADS_PER_SM = 2048; +#endif +// CUDA_MAX_THREADS_PER_BLOCK is same for all architectures currently +constexpr uint32_t CUDA_MAX_THREADS_PER_BLOCK = 1024; +// CUDA_THREADS_PER_BLOCK_FALLBACK is the "canonical fallback" choice of block +// size. 256 is a good number for this fallback and should give good occupancy +// and versatility across all architectures. +constexpr uint32_t CUDA_THREADS_PER_BLOCK_FALLBACK = 256; +// NOTE: if you are thinking of constexpr-ify the inputs to launch bounds, it +// turns out that although __launch_bounds__ can take constexpr, it +// can't take a constexpr that has anything to do with templates. +// Currently we use launch_bounds that depend on template arguments in +// Loops.cuh, Reduce.cuh and LossCTC.cuh. Hence, C10_MAX_THREADS_PER_BLOCK +// and C10_MIN_BLOCKS_PER_SM are kept as macros. +// Suppose you were planning to write __launch_bounds__(a, b), based on your +// performance tuning on a modern GPU. Instead, you should write +// __launch_bounds__(C10_MAX_THREADS_PER_BLOCK(a), C10_MIN_BLOCKS_PER_SM(a, b)), +// which will also properly respect limits on old architectures. +#define C10_MAX_THREADS_PER_BLOCK(val) \ + (((val) <= CUDA_MAX_THREADS_PER_BLOCK) ? (val) \ + : CUDA_THREADS_PER_BLOCK_FALLBACK) +#define C10_MIN_BLOCKS_PER_SM(threads_per_block, blocks_per_sm) \ + ((((threads_per_block) * (blocks_per_sm) <= CUDA_MAX_THREADS_PER_SM) \ + ? (blocks_per_sm) \ + : ((CUDA_MAX_THREADS_PER_SM + (threads_per_block)-1) / \ + (threads_per_block)))) +// C10_LAUNCH_BOUNDS is analogous to __launch_bounds__ +#define C10_LAUNCH_BOUNDS_0 \ + __launch_bounds__( \ + 256, 4) // default launch bounds that should give good occupancy and + // versatility across all architectures. +#define C10_LAUNCH_BOUNDS_1(max_threads_per_block) \ + __launch_bounds__((C10_MAX_THREADS_PER_BLOCK((max_threads_per_block)))) +#define C10_LAUNCH_BOUNDS_2(max_threads_per_block, min_blocks_per_sm) \ + __launch_bounds__( \ + (C10_MAX_THREADS_PER_BLOCK((max_threads_per_block))), \ + (C10_MIN_BLOCKS_PER_SM((max_threads_per_block), (min_blocks_per_sm)))) +#else +#define C10_HOST_DEVICE +#define C10_HOST +#define C10_DEVICE +#endif + +#if defined(USE_ROCM) +#define C10_HIP_HOST_DEVICE __host__ __device__ +#else +#define C10_HIP_HOST_DEVICE +#endif + +#if defined(USE_ROCM) +#define C10_WARP_SIZE warpSize // = 64 or 32 (Defined in hip_runtime.h) +#else +#define C10_WARP_SIZE 32 +#endif + +#if defined(_MSC_VER) && _MSC_VER <= 1900 +#define __func__ __FUNCTION__ +#endif + +// CUDA_KERNEL_ASSERT checks the assertion +// even when NDEBUG is defined. This is useful for important assertions in CUDA +// code that would otherwise be suppressed when building Release. +#if defined(__ANDROID__) || defined(__APPLE__) || defined(__FreeBSD__) +// Those platforms do not support assert() +#define CUDA_KERNEL_ASSERT(cond) +#define CUDA_KERNEL_ASSERT_MSG(cond, msg) +#define SYCL_KERNEL_ASSERT(cond) +#elif defined(_MSC_VER) +#if defined(NDEBUG) +extern "C" { +C10_IMPORT +#if defined(__SYCL_DEVICE_ONLY__) +extern SYCL_EXTERNAL void _wassert( + const wchar_t* wexpr, + const wchar_t* wfile, + unsigned line); +#else +#if defined(__CUDA_ARCH__) +__host__ __device__ +#endif // __CUDA_ARCH__ + void + _wassert(wchar_t const* _Message, wchar_t const* _File, unsigned _Line); +#endif // __SYCL_DEVICE_ONLY__ +} +#endif // NDEBUG +#define CUDA_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + (void)(_wassert( \ + _CRT_WIDE(#cond), \ + _CRT_WIDE(__FILE__), \ + static_cast(__LINE__)), \ + 0); \ + } +// TODO: This doesn't assert the message because I (chilli) couldn't figure out +// a nice way to convert a char* to a wchar_t* +#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ + if (C10_UNLIKELY(!(cond))) { \ + (void)(_wassert( \ + _CRT_WIDE(#cond), \ + _CRT_WIDE(__FILE__), \ + static_cast(__LINE__)), \ + 0); \ + } +#define SYCL_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + (void)(_wassert( \ + _CRT_WIDE(#cond), \ + _CRT_WIDE(__FILE__), \ + static_cast(__LINE__)), \ + 0); \ + } +#else // __APPLE__, _MSC_VER +#if defined(NDEBUG) +extern "C" { +#if defined(__SYCL_DEVICE_ONLY__) +extern SYCL_EXTERNAL void __assert_fail( + const char* expr, + const char* file, + unsigned int line, + const char* func); +#else // __SYCL_DEVICE_ONLY__ +#if (defined(__CUDA_ARCH__) && !(defined(__clang__) && defined(__CUDA__))) +// CUDA supports __assert_fail function which are common for both device +// and host side code. +__host__ __device__ +#endif + + // This forward declaration matching the declaration of __assert_fail + // exactly how it is in glibc in case parts of the program are compiled with + // different NDEBUG settings. Otherwise we might get 'ambiguous declaration' + // error. Note: On ROCm - this declaration serves for host side compilation. + void + __assert_fail( + const char* assertion, + const char* file, + unsigned int line, + const char* function) noexcept __attribute__((__noreturn__)); + +#endif // __SYCL_DEVICE_ONLY__ +} +#endif // NDEBUG +// ROCm disable kernel assert by default +#if !defined(C10_USE_ROCM_KERNEL_ASSERT) and defined(USE_ROCM) +#define CUDA_KERNEL_ASSERT(cond) +#define CUDA_KERNEL_ASSERT_MSG(cond, msg) +#define SYCL_KERNEL_ASSERT(cond) +#else +#define CUDA_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + #cond, __FILE__, static_cast(__LINE__), __func__); \ + } +#define CUDA_KERNEL_ASSERT_MSG(cond, msg) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + msg, __FILE__, static_cast(__LINE__), __func__); \ + } +#define SYCL_KERNEL_ASSERT(cond) \ + if (C10_UNLIKELY(!(cond))) { \ + __assert_fail( \ + #cond, __FILE__, static_cast(__LINE__), __func__); \ + } +#endif // C10_USE_ROCM_KERNEL_ASSERT and USE_ROCM +#endif // __APPLE__ + +#ifdef __APPLE__ +#include +#endif + +#if defined(__ANDROID__) +#define C10_ANDROID 1 +#define C10_MOBILE 1 +#elif ( \ + defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE)) +#define C10_IOS 1 +#define C10_MOBILE 1 +#endif // ANDROID / IOS + +#if defined(C10_MOBILE) && C10_MOBILE +#define C10_ALWAYS_INLINE_UNLESS_MOBILE inline +#else +#define C10_ALWAYS_INLINE_UNLESS_MOBILE C10_ALWAYS_INLINE +#endif + +#if !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) +#define CONSTEXPR_EXCEPT_WIN_CUDA constexpr +#define C10_HOST_CONSTEXPR_EXCEPT_WIN_CUDA constexpr + +#define STATIC_CONSTEXPR_STR_INL_EXCEPT_WIN_CUDA(field, val) \ + static constexpr const char field[] = val; +#define STATIC_CONST_STR_OUT_OF_LINE_FOR_WIN_CUDA(cls, field, val) +#endif // !defined(FBCODE_CAFFE2) && !defined(C10_NODEPRECATED) + +#ifndef HAS_DEMANGLE +#if defined(__ANDROID__) || defined(_WIN32) || defined(__EMSCRIPTEN__) +#define HAS_DEMANGLE 0 +#elif defined(__APPLE__) && \ + (TARGET_IPHONE_SIMULATOR || TARGET_OS_SIMULATOR || TARGET_OS_IPHONE) +#define HAS_DEMANGLE 0 +#else +#define HAS_DEMANGLE 1 +#endif +#endif // HAS_DEMANGLE + +#define _C10_PRAGMA__(string) _Pragma(#string) +#define _C10_PRAGMA_(string) _C10_PRAGMA__(string) + +#ifdef __clang__ +#define C10_CLANG_DIAGNOSTIC_PUSH() _Pragma("clang diagnostic push") +#define C10_CLANG_DIAGNOSTIC_POP() _Pragma("clang diagnostic pop") +#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) \ + _C10_PRAGMA_(clang diagnostic ignored flag) +#define C10_CLANG_HAS_WARNING(flag) __has_warning(flag) +#else +#define C10_CLANG_DIAGNOSTIC_PUSH() +#define C10_CLANG_DIAGNOSTIC_POP() +#define C10_CLANG_DIAGNOSTIC_IGNORE(flag) +#define C10_CLANG_HAS_WARNING(flag) 0 +#endif + +#ifdef __clang__ + +#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ + _C10_PRAGMA_(clang diagnostic push) \ + _C10_PRAGMA_(clang diagnostic ignored "-Wunknown-warning-option") \ + _C10_PRAGMA_(clang diagnostic ignored warning) + +#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(clang diagnostic pop) + +#elif __GNUC__ + +#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) \ + _C10_PRAGMA_(GCC diagnostic push) \ + _C10_PRAGMA_(GCC diagnostic ignored "-Wpragmas") \ + _C10_PRAGMA_(GCC diagnostic ignored warning) + +#define C10_DIAGNOSTIC_POP() _C10_PRAGMA_(GCC diagnostic pop) + +#else + +#define C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED(warning) +#define C10_DIAGNOSTIC_POP() + +#endif + +#endif // C10_MACROS_MACROS_H_ diff --git a/runtime/core/portable_type/c10/targets.bzl b/runtime/core/portable_type/c10/targets.bzl new file mode 100644 index 0000000000..1e60b70a4b --- /dev/null +++ b/runtime/core/portable_type/c10/targets.bzl @@ -0,0 +1,100 @@ +load("@fbsource//xplat/executorch/build:runtime_wrapper.bzl", "runtime") + +def get_sleef_preprocessor_flags(): + if runtime.is_oss: + return [] + return ["-DAT_BUILD_ARM_VEC256_WITH_SLEEF"] + + +def define_common_targets(): + """Defines targets that should be shared between fbcode and xplat. + + The directory containing this targets.bzl file should also contain both + TARGETS and BUCK files that call this function. + """ + runtime.cxx_library( + name = "c10", + header_namespace = "c10", + exported_headers = [ + "macros/Export.h", + "macros/Macros.h", + "util/BFloat16.h", + "util/BFloat16-inl.h", + "util/BFloat16-math.h", + "util/Half.h", + "util/Half-inl.h", + "util/TypeSafeSignMath.h", + "util/bit_cast.h", + "util/floating_point_utils.h", + ], + exported_preprocessor_flags = [ + # NOTE: If we define C10_EMBEDDED to prevent Half and + # BFloat16 from supporting streams, non-ExecuTorch-core + # uses of other ATen headers that try to print ATen + # primitive types fail to build because, apparently, there + # are implicit conversions from Half/BFloat16 to a variety + # of primitive types, not just float. Since merely + # including shouldn't result in any runtime + # artifacts if stream code is never actually called, it + # seems best to just not define C10_EMBEDDED, but if you + # need it, it's there. + # "-DC10_EMBEDDED", + "-DC10_USE_GLOG", + "-DC10_USE_MINIMAL_GLOG", + "-DC10_USING_CUSTOM_GENERATED_MACROS", + ], + visibility = [ + "//executorch/runtime/core/portable_type/...", + ], + deps = select({ + "DEFAULT": [], + # Half-inl.h depends on vec_half.h from ATen, but only when building for x86. + "ovr_config//cpu:x86_64": [ + ":aten_headers_for_executorch", + ], + }), + ) + + runtime.cxx_library( + name = "aten_headers_for_executorch", + srcs = [], + visibility = ["//executorch/kernels/optimized/..."], + exported_deps = select({ + "DEFAULT": [], + "ovr_config//cpu:arm64": [ + "fbsource//third-party/sleef:sleef_arm", + ] if not runtime.is_oss else [], + # fbsource//third-party/sleef:sleef currently fails to + # link with missing symbols, hence the fbcode-specific dep below. + }), + fbcode_exported_deps = ([ + "//caffe2:aten-headers-cpu", + "//caffe2:generated-config-header", + "//caffe2/c10:c10_headers", + ] + select({ + "DEFAULT": [], + "ovr_config//cpu:x86_64": [ + "third-party//sleef:sleef", + ] + })) if not runtime.is_oss else [], + fbcode_exported_preprocessor_flags = [ + # We don't -DCPU_CAPABILITY=AVX2 because that trips + # -Wmacro-redefined, and we only care about getting + # reasonable vectorization and Sleef support. + "-DCPU_CAPABILITY_AVX2", + "-DHAVE_AVX2_CPU_DEFINITION", + "-DSTANDALONE_TORCH_HEADER", + ] + get_sleef_preprocessor_flags(), + xplat_exported_deps = [ + "//xplat/caffe2:aten_header", + "//xplat/caffe2:generated_aten_config_header", + "//xplat/caffe2/c10:c10_headers", + ], + exported_preprocessor_flags = select({ + # Intentionally punting on non-fbcode x86 sleef support + # for now because of fbsource//third-party/sleef:sleef + # linker failure. + "ovr_config//cpu:arm64": get_sleef_preprocessor_flags(), + "DEFAULT": [], + }) + ["-DSTANDALONE_TORCH_HEADER"], + ) diff --git a/runtime/core/portable_type/c10/util/BFloat16-inl.h b/runtime/core/portable_type/c10/util/BFloat16-inl.h new file mode 100644 index 0000000000..10ab0c828d --- /dev/null +++ b/runtime/core/portable_type/c10/util/BFloat16-inl.h @@ -0,0 +1,343 @@ +#pragma once + +#include +#include + +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#else +#include // for SYCL 2020 +#endif +#include +#endif + +namespace c10 { + +/// Constructors +inline C10_HOST_DEVICE BFloat16::BFloat16(float value) + : +#if defined(__CUDACC__) && !defined(USE_ROCM) && defined(__CUDA_ARCH__) && \ + __CUDA_ARCH__ >= 800 + x(__bfloat16_as_ushort(__float2bfloat16(value))) +#elif defined(__SYCL_DEVICE_ONLY__) && \ + defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) + x(c10::bit_cast(sycl::ext::oneapi::bfloat16(value))) +#else + // RNE by default + x(detail::round_to_nearest_even(value)) +#endif +{ +} + +/// Implicit conversions +inline C10_HOST_DEVICE BFloat16::operator float() const { +#if defined(__CUDACC__) && !defined(USE_ROCM) + return __bfloat162float(*reinterpret_cast(&x)); +#elif defined(__SYCL_DEVICE_ONLY__) && \ + defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) + return float(*reinterpret_cast(&x)); +#else + return detail::f32_from_bits(x); +#endif +} + +#if defined(__CUDACC__) && !defined(USE_ROCM) +inline C10_HOST_DEVICE BFloat16::BFloat16(const __nv_bfloat16& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE BFloat16::operator __nv_bfloat16() const { + return *reinterpret_cast(&x); +} +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) +inline C10_HOST_DEVICE BFloat16::BFloat16( + const sycl::ext::oneapi::bfloat16& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE BFloat16::operator sycl::ext::oneapi::bfloat16() const { + return *reinterpret_cast(&x); +} +#endif + +// CUDA intrinsics + +#if defined(__CUDACC__) || defined(__HIPCC__) +inline C10_DEVICE BFloat16 __ldg(const BFloat16* ptr) { +#if !defined(USE_ROCM) && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 + return __ldg(reinterpret_cast(ptr)); +#else + return *ptr; +#endif +} +#endif + +/// Arithmetic + +inline C10_HOST_DEVICE BFloat16 +operator+(const BFloat16& a, const BFloat16& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 +operator-(const BFloat16& a, const BFloat16& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 +operator*(const BFloat16& a, const BFloat16& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator/(const BFloat16& a, const BFloat16& b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator-(const BFloat16& a) { + return -static_cast(a); +} + +inline C10_HOST_DEVICE BFloat16& operator+=(BFloat16& a, const BFloat16& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator-=(BFloat16& a, const BFloat16& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator*=(BFloat16& a, const BFloat16& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator/=(BFloat16& a, const BFloat16& b) { + a = a / b; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator|(BFloat16& a, const BFloat16& b) { + a.x = a.x | b.x; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator^(BFloat16& a, const BFloat16& b) { + a.x = a.x ^ b.x; + return a; +} + +inline C10_HOST_DEVICE BFloat16& operator&(BFloat16& a, const BFloat16& b) { + a.x = a.x & b.x; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(BFloat16 a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(BFloat16 a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(BFloat16 a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(BFloat16 a, float b) { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE float operator+(float a, BFloat16 b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, BFloat16 b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, BFloat16 b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, BFloat16 b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const BFloat16& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const BFloat16& b) { + return a -= static_cast(b); +} +inline C10_HOST_DEVICE float& operator*=(float& a, const BFloat16& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const BFloat16& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(BFloat16 a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(BFloat16 a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(BFloat16 a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(BFloat16 a, double b) { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, BFloat16 b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, BFloat16 b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, BFloat16 b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, BFloat16 b) { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE BFloat16 operator+(BFloat16 a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator-(BFloat16 a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator*(BFloat16 a, int b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator/(BFloat16 a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator+(int a, BFloat16 b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE BFloat16 operator-(int a, BFloat16 b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE BFloat16 operator*(int a, BFloat16 b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE BFloat16 operator/(int a, BFloat16 b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE BFloat16 operator+(BFloat16 a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator-(BFloat16 a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator*(BFloat16 a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE BFloat16 operator/(BFloat16 a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE BFloat16 operator+(int64_t a, BFloat16 b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE BFloat16 operator-(int64_t a, BFloat16 b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE BFloat16 operator*(int64_t a, BFloat16 b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE BFloat16 operator/(int64_t a, BFloat16 b) { + return static_cast(a) / b; +} + +// Overloading < and > operators, because std::max and std::min use them. + +inline C10_HOST_DEVICE bool operator>(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) > float(rhs); +} + +inline C10_HOST_DEVICE bool operator<(BFloat16& lhs, BFloat16& rhs) { + return float(lhs) < float(rhs); +} + +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_signed = true; + static constexpr bool is_specialized = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = false; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 8; + static constexpr int digits10 = 2; + static constexpr int max_digits10 = 4; + static constexpr int radix = 2; + static constexpr int min_exponent = -125; + static constexpr int min_exponent10 = -37; + static constexpr int max_exponent = 128; + static constexpr int max_exponent10 = 38; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + + static constexpr c10::BFloat16 min() { + return c10::BFloat16(0x0080, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 lowest() { + return c10::BFloat16(0xFF7F, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 max() { + return c10::BFloat16(0x7F7F, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 epsilon() { + return c10::BFloat16(0x3C00, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 round_error() { + return c10::BFloat16(0x3F00, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 infinity() { + return c10::BFloat16(0x7F80, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 quiet_NaN() { + return c10::BFloat16(0x7FC0, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 signaling_NaN() { + return c10::BFloat16(0x7F80, c10::BFloat16::from_bits()); + } + static constexpr c10::BFloat16 denorm_min() { + return c10::BFloat16(0x0001, c10::BFloat16::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/runtime/core/portable_type/c10/util/BFloat16-math.h b/runtime/core/portable_type/c10/util/BFloat16-math.h new file mode 100644 index 0000000000..bad374cbd4 --- /dev/null +++ b/runtime/core/portable_type/c10/util/BFloat16-math.h @@ -0,0 +1,292 @@ +#pragma once + +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-float-conversion") +#endif + +namespace std { + +template +struct is_reduced_floating_point + : std::integral_constant< + bool, + std::is_same_v || std::is_same_v> {}; + +template +constexpr bool is_reduced_floating_point_v = + is_reduced_floating_point::value; + +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T acos(T a) { + return std::acos(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T asin(T a) { + return std::asin(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T atan(T a) { + return std::atan(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T atanh(T a) { + return std::atanh(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T erf(T a) { + return std::erf(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T erfc(T a) { + return std::erfc(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T exp(T a) { + return std::exp(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T expm1(T a) { + return std::expm1(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline bool isfinite(T a) { + return std::isfinite(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T log(T a) { + return std::log(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T log10(T a) { + return std::log10(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T log1p(T a) { + return std::log1p(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T log2(T a) { + return std::log2(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T ceil(T a) { + return std::ceil(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T cos(T a) { + return std::cos(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T floor(T a) { + return std::floor(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T nearbyint(T a) { + return std::nearbyint(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T sin(T a) { + return std::sin(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T tan(T a) { + return std::tan(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T sinh(T a) { + return std::sinh(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T cosh(T a) { + return std::cosh(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T tanh(T a) { + return std::tanh(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T trunc(T a) { + return std::trunc(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T lgamma(T a) { + return std::lgamma(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T sqrt(T a) { + return std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T rsqrt(T a) { + return 1.0 / std::sqrt(float(a)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T abs(T a) { + return std::abs(float(a)); +} +#if defined(_MSC_VER) && defined(__CUDACC__) +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T pow(T a, double b) { + return std::pow(float(a), float(b)); +} +#else +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T pow(T a, double b) { + return std::pow(float(a), b); +} +#endif +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T pow(T a, T b) { + return std::pow(float(a), float(b)); +} +template < + typename T, + typename std::enable_if_t, int> = 0> +inline T fmod(T a, T b) { + return std::fmod(float(a), float(b)); +} + +/* + The following function is inspired from the implementation in `musl` + Link to License: https://git.musl-libc.org/cgit/musl/tree/COPYRIGHT + ---------------------------------------------------------------------- + Copyright © 2005-2020 Rich Felker, et al. + + Permission is hereby granted, free of charge, to any person obtaining + a copy of this software and associated documentation files (the + "Software"), to deal in the Software without restriction, including + without limitation the rights to use, copy, modify, merge, publish, + distribute, sublicense, and/or sell copies of the Software, and to + permit persons to whom the Software is furnished to do so, subject to + the following conditions: + + The above copyright notice and this permission notice shall be + included in all copies or substantial portions of the Software. + + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. + IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY + CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, + TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE + SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + ---------------------------------------------------------------------- + */ +template < + typename T, + typename std::enable_if_t, int> = 0> +C10_HOST_DEVICE inline T nextafter(T from, T to) { + // Reference: + // https://git.musl-libc.org/cgit/musl/tree/src/math/nextafter.c + using int_repr_t = uint16_t; + constexpr uint8_t bits = 16; + union { + T f; + int_repr_t i; + } ufrom = {from}, uto = {to}; + + // get a mask to get the sign bit i.e. MSB + int_repr_t sign_mask = int_repr_t{1} << (bits - 1); + + // short-circuit: if either is NaN, return NaN + if (from != from || to != to) { + return from + to; + } + + // short-circuit: if they are exactly the same. + if (ufrom.i == uto.i) { + return from; + } + + // mask the sign-bit to zero i.e. positive + // equivalent to abs(x) + int_repr_t abs_from = ufrom.i & ~sign_mask; + int_repr_t abs_to = uto.i & ~sign_mask; + if (abs_from == 0) { + // if both are zero but with different sign, + // preserve the sign of `to`. + if (abs_to == 0) { + return to; + } + // smallest subnormal with sign of `to`. + ufrom.i = (uto.i & sign_mask) | int_repr_t{1}; + return ufrom.f; + } + + // if abs(from) > abs(to) or sign(from) != sign(to) + if (abs_from > abs_to || ((ufrom.i ^ uto.i) & sign_mask)) { + ufrom.i--; + } else { + ufrom.i++; + } + + return ufrom.f; +} + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/runtime/core/portable_type/c10/util/BFloat16.h b/runtime/core/portable_type/c10/util/BFloat16.h new file mode 100644 index 0000000000..ad1271fc72 --- /dev/null +++ b/runtime/core/portable_type/c10/util/BFloat16.h @@ -0,0 +1,130 @@ +#pragma once + +// Defines the bloat16 type (brain floating-point). This representation uses +// 1 bit for the sign, 8 bits for the exponent and 7 bits for the mantissa. + +#include +#include +#include +#include +#include +#ifndef C10_EMBEDDED +#include +#endif // C10_EMBEDDED + +#if defined(__CUDACC__) && !defined(USE_ROCM) +#include +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#else +#include // for SYCL 2020 +#endif +#include +#endif + +namespace c10 { + +namespace detail { +inline C10_HOST_DEVICE float f32_from_bits(uint16_t src) { + float res = 0; + uint32_t tmp = src; + tmp <<= 16; + +#if defined(USE_ROCM) + float* tempRes; + + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. + tempRes = reinterpret_cast(&tmp); + res = *tempRes; +#else + std::memcpy(&res, &tmp, sizeof(tmp)); +#endif + + return res; +} + +inline C10_HOST_DEVICE uint16_t bits_from_f32(float src) { + uint32_t res = 0; + +#if defined(USE_ROCM) + // We should be using memcpy in order to respect the strict aliasing rule + // but it fails in the HIP environment. + uint32_t* tempRes = reinterpret_cast(&src); + res = *tempRes; +#else + std::memcpy(&res, &src, sizeof(res)); +#endif + + return res >> 16; +} + +inline C10_HOST_DEVICE uint16_t round_to_nearest_even(float src) { +#if defined(USE_ROCM) + if (src != src) { +#elif defined(_MSC_VER) + if (isnan(src)) { +#else + if (std::isnan(src)) { +#endif + return UINT16_C(0x7FC0); + } else { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) + union { + uint32_t U32; // NOLINT(facebook-hte-BadMemberName) + float F32; // NOLINT(facebook-hte-BadMemberName) + }; + + F32 = src; + uint32_t rounding_bias = ((U32 >> 16) & 1) + UINT32_C(0x7FFF); + return static_cast((U32 + rounding_bias) >> 16); + } +} +} // namespace detail + +struct alignas(2) BFloat16 { + uint16_t x; + + // HIP wants __host__ __device__ tag, CUDA does not +#if defined(USE_ROCM) + C10_HOST_DEVICE BFloat16() = default; +#else + BFloat16() = default; +#endif + + struct from_bits_t {}; + static constexpr C10_HOST_DEVICE from_bits_t from_bits() { + return from_bits_t(); + } + + constexpr C10_HOST_DEVICE BFloat16(unsigned short bits, from_bits_t) + : x(bits) {} + /* implicit */ inline C10_HOST_DEVICE BFloat16(float value); + inline C10_HOST_DEVICE operator float() const; + +#if defined(__CUDACC__) && !defined(USE_ROCM) + inline C10_HOST_DEVICE BFloat16(const __nv_bfloat16& value); + explicit inline C10_HOST_DEVICE operator __nv_bfloat16() const; +#endif + +#if defined(SYCL_EXT_ONEAPI_BFLOAT16_MATH_FUNCTIONS) + inline C10_HOST_DEVICE BFloat16(const sycl::ext::oneapi::bfloat16& value); + explicit inline C10_HOST_DEVICE operator sycl::ext::oneapi::bfloat16() const; +#endif +}; + +#ifndef C10_EMBEDDED +C10_API inline std::ostream& operator<<( + std::ostream& out, + const BFloat16& value) { + out << (float)value; + return out; +} +#endif // C10_EMBEDDED + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/runtime/core/portable_type/c10/util/Half-inl.h b/runtime/core/portable_type/c10/util/Half-inl.h new file mode 100644 index 0000000000..ae4469e563 --- /dev/null +++ b/runtime/core/portable_type/c10/util/Half-inl.h @@ -0,0 +1,350 @@ +#pragma once + +#include +#include + +#include +#include + +#ifdef __CUDACC__ +#include +#endif + +#ifdef __HIPCC__ +#include +#endif + +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#elif defined(SYCL_LANGUAGE_VERSION) +#include // for SYCL 2020 +#endif + +#if (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) +#include +#endif + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +#if defined(__aarch64__) && !defined(__CUDACC__) +/// Constructors +inline Half::Half(float16_t value) : x(detail::fp16_to_bits(value)) {} +inline Half::operator float16_t() const { + return detail::fp16_from_bits(x); +} +#else + +inline C10_HOST_DEVICE Half::Half(float value) + : +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + x(__half_as_short(__float2half(value))) +#elif defined(__SYCL_DEVICE_ONLY__) + x(c10::bit_cast(sycl::half(value))) +#elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) + x(at::vec::float2half_scalar(value)) +#else + x(detail::fp16_ieee_from_fp32_value(value)) +#endif +{ +} + +/// Implicit conversions + +inline C10_HOST_DEVICE Half::operator float() const { +#if defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __half2float(*reinterpret_cast(&x)); +#elif defined(__SYCL_DEVICE_ONLY__) + return float(c10::bit_cast(x)); +#elif (defined(CPU_CAPABILITY_AVX2) || defined(CPU_CAPABILITY_AVX512)) && \ + !defined(__APPLE__) + return at::vec::half2float_scalar(x); +#elif defined(__aarch64__) && !defined(__CUDACC__) + return detail::native_fp16_to_fp32_value(x); +#else + return detail::fp16_ieee_to_fp32_value(x); +#endif +} + +#endif /* !defined(__aarch64__) || defined(__CUDACC__) \ + */ + +#if defined(__CUDACC__) || defined(__HIPCC__) +inline C10_HOST_DEVICE Half::Half(const __half& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE Half::operator __half() const { + return *reinterpret_cast(&x); +} +#endif + +#ifdef SYCL_LANGUAGE_VERSION +inline C10_HOST_DEVICE Half::Half(const sycl::half& value) { + x = *reinterpret_cast(&value); +} +inline C10_HOST_DEVICE Half::operator sycl::half() const { + return *reinterpret_cast(&x); +} +#endif + +// CUDA intrinsics + +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 350)) || \ + (defined(__clang__) && defined(__CUDA__)) +inline __device__ Half __ldg(const Half* ptr) { + return __ldg(reinterpret_cast(ptr)); +} +#endif + +/// Arithmetic + +inline C10_HOST_DEVICE Half operator+(const Half& a, const Half& b) { + return static_cast(a) + static_cast(b); +} + +inline C10_HOST_DEVICE Half operator-(const Half& a, const Half& b) { + return static_cast(a) - static_cast(b); +} + +inline C10_HOST_DEVICE Half operator*(const Half& a, const Half& b) { + return static_cast(a) * static_cast(b); +} + +inline C10_HOST_DEVICE Half operator/(const Half& a, const Half& b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator-(const Half& a) { +#if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 530) || \ + defined(__HIP_DEVICE_COMPILE__) + return __hneg(a); +#elif defined(__SYCL_DEVICE_ONLY__) + return -c10::bit_cast(a); +#else + return -static_cast(a); +#endif +} + +inline C10_HOST_DEVICE Half& operator+=(Half& a, const Half& b) { + a = a + b; + return a; +} + +inline C10_HOST_DEVICE Half& operator-=(Half& a, const Half& b) { + a = a - b; + return a; +} + +inline C10_HOST_DEVICE Half& operator*=(Half& a, const Half& b) { + a = a * b; + return a; +} + +inline C10_HOST_DEVICE Half& operator/=(Half& a, const Half& b) { + a = a / b; + return a; +} + +/// Arithmetic with floats + +inline C10_HOST_DEVICE float operator+(Half a, float b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE float operator-(Half a, float b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE float operator*(Half a, float b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE float operator/(Half a, float b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE float operator+(float a, Half b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE float operator-(float a, Half b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE float operator*(float a, Half b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE float operator/(float a, Half b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE float& operator+=(float& a, const Half& b) { + return a += static_cast(b); +} +inline C10_HOST_DEVICE float& operator-=(float& a, const Half& b) { + return a -= static_cast(b); +} +inline C10_HOST_DEVICE float& operator*=(float& a, const Half& b) { + return a *= static_cast(b); +} +inline C10_HOST_DEVICE float& operator/=(float& a, const Half& b) { + return a /= static_cast(b); +} + +/// Arithmetic with doubles + +inline C10_HOST_DEVICE double operator+(Half a, double b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE double operator-(Half a, double b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE double operator*(Half a, double b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE double operator/(Half a, double b) + __ubsan_ignore_float_divide_by_zero__ { + return static_cast(a) / b; +} + +inline C10_HOST_DEVICE double operator+(double a, Half b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE double operator-(double a, Half b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE double operator*(double a, Half b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE double operator/(double a, Half b) + __ubsan_ignore_float_divide_by_zero__ { + return a / static_cast(b); +} + +/// Arithmetic with ints + +inline C10_HOST_DEVICE Half operator+(Half a, int b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Half operator-(Half a, int b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Half operator*(Half a, int b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Half operator/(Half a, int b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator+(int a, Half b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Half operator-(int a, Half b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Half operator*(int a, Half b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Half operator/(int a, Half b) { + return static_cast(a) / b; +} + +//// Arithmetic with int64_t + +inline C10_HOST_DEVICE Half operator+(Half a, int64_t b) { + return a + static_cast(b); +} +inline C10_HOST_DEVICE Half operator-(Half a, int64_t b) { + return a - static_cast(b); +} +inline C10_HOST_DEVICE Half operator*(Half a, int64_t b) { + return a * static_cast(b); +} +inline C10_HOST_DEVICE Half operator/(Half a, int64_t b) { + return a / static_cast(b); +} + +inline C10_HOST_DEVICE Half operator+(int64_t a, Half b) { + return static_cast(a) + b; +} +inline C10_HOST_DEVICE Half operator-(int64_t a, Half b) { + return static_cast(a) - b; +} +inline C10_HOST_DEVICE Half operator*(int64_t a, Half b) { + return static_cast(a) * b; +} +inline C10_HOST_DEVICE Half operator/(int64_t a, Half b) { + return static_cast(a) / b; +} + +/// NOTE: we do not define comparisons directly and instead rely on the implicit +/// conversion from c10::Half to float. + +} // namespace c10 + +namespace std { + +template <> +class numeric_limits { + public: + static constexpr bool is_specialized = true; + static constexpr bool is_signed = true; + static constexpr bool is_integer = false; + static constexpr bool is_exact = false; + static constexpr bool has_infinity = true; + static constexpr bool has_quiet_NaN = true; + static constexpr bool has_signaling_NaN = true; + static constexpr auto has_denorm = numeric_limits::has_denorm; + static constexpr auto has_denorm_loss = + numeric_limits::has_denorm_loss; + static constexpr auto round_style = numeric_limits::round_style; + static constexpr bool is_iec559 = true; + static constexpr bool is_bounded = true; + static constexpr bool is_modulo = false; + static constexpr int digits = 11; + static constexpr int digits10 = 3; + static constexpr int max_digits10 = 5; + static constexpr int radix = 2; + static constexpr int min_exponent = -13; + static constexpr int min_exponent10 = -4; + static constexpr int max_exponent = 16; + static constexpr int max_exponent10 = 4; + static constexpr auto traps = numeric_limits::traps; + static constexpr auto tinyness_before = + numeric_limits::tinyness_before; + static constexpr c10::Half min() { + return c10::Half(0x0400, c10::Half::from_bits()); + } + static constexpr c10::Half lowest() { + return c10::Half(0xFBFF, c10::Half::from_bits()); + } + static constexpr c10::Half max() { + return c10::Half(0x7BFF, c10::Half::from_bits()); + } + static constexpr c10::Half epsilon() { + return c10::Half(0x1400, c10::Half::from_bits()); + } + static constexpr c10::Half round_error() { + return c10::Half(0x3800, c10::Half::from_bits()); + } + static constexpr c10::Half infinity() { + return c10::Half(0x7C00, c10::Half::from_bits()); + } + static constexpr c10::Half quiet_NaN() { + return c10::Half(0x7E00, c10::Half::from_bits()); + } + static constexpr c10::Half signaling_NaN() { + return c10::Half(0x7D00, c10::Half::from_bits()); + } + static constexpr c10::Half denorm_min() { + return c10::Half(0x0001, c10::Half::from_bits()); + } +}; + +} // namespace std + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/runtime/core/portable_type/c10/util/Half.h b/runtime/core/portable_type/c10/util/Half.h new file mode 100644 index 0000000000..5625d4c340 --- /dev/null +++ b/runtime/core/portable_type/c10/util/Half.h @@ -0,0 +1,423 @@ +#pragma once + +/// Defines the Half type (half-precision floating-point) including conversions +/// to standard C types and basic arithmetic operations. Note that arithmetic +/// operations are implemented by converting to floating point and +/// performing the operation in float32, instead of using CUDA half intrinsics. +/// Most uses of this type within ATen are memory bound, including the +/// element-wise kernels, and the half intrinsics aren't efficient on all GPUs. +/// If you are writing a compute bound kernel, you can use the CUDA half +/// intrinsics directly on the Half type from device code. + +#include +#include +#include +#include +#include + +#if defined(__cplusplus) +#include +#elif !defined(__OPENCL_VERSION__) +#include +#endif + +#ifdef _MSC_VER +#include +#endif + +#include +#include +#include +#include +#ifndef C10_EMBEDDED +#include +#endif // C10_EMBEDDED + +#ifdef __CUDACC__ +#include +#endif + +#ifdef __HIPCC__ +#include +#endif + +#if defined(CL_SYCL_LANGUAGE_VERSION) +#include // for SYCL 1.2.1 +#elif defined(SYCL_LANGUAGE_VERSION) +#include // for SYCL 2020 +#endif + +#if defined(__aarch64__) && !defined(__CUDACC__) +#include +#endif + +#if defined(__GNUC__) || defined(__clang__) +#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \ + defined(_M_IX86) +#if defined(__F16C__) && \ + !(defined(__CUDA_ARCH__) || defined(__CUDACC__) || \ + defined(__HIP_DEVICE_COMPILE__)) +#define C10_X86_F16 1 +#include // import conversion ops from f16cintrin.h +#endif // defined(__F16C__) && !(defined(__CUDA_ARCH__) || defined(__CUDACC__) + // || defined(__HIP_DEVICE_COMPILE__)) +#endif // __x86_64__ || _M_X64 || __i386 || _M_IX86 +#endif // __GNUC__ || __clang__ + +namespace c10 { + +namespace detail { + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit + * representation, to a 32-bit floating-point number in IEEE single-precision + * format, in bit representation. + * + * @note The implementation doesn't use any floating-point operations. + */ +inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { + /* + * Extend the half-precision floating-point number to 32 bits and shift to the + * upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 + * - zero bits. + */ + const uint32_t w = (uint32_t)h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the bits 0-30 + * of the 32-bit word: + * + * +---+-----+------------+-------------------+ + * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 30 27-31 17-26 0-16 + */ + const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); + /* + * Renorm shift is the number of bits to shift mantissa left to make the + * half-precision number normalized. If the initial number is normalized, some + * of its high 6 bits (sign == 0 and 5-bit exponent) equals one. In this case + * renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note + * that if we shift denormalized nonsign by renorm_shift, the unit bit of + * mantissa will shift into exponent, turning the biased exponent into 1, and + * making mantissa normalized (i.e. without leading 1). + */ +#ifdef _MSC_VER + unsigned long nonsign_bsr; + _BitScanReverse(&nonsign_bsr, (unsigned long)nonsign); + uint32_t renorm_shift = (uint32_t)nonsign_bsr ^ 31; +#else + uint32_t renorm_shift = __builtin_clz(nonsign); +#endif + renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; + /* + * Iff half-precision number has exponent of 15, the addition overflows + * it into bit 31, and the subsequent shift turns the high 9 bits + * into 1. Thus inf_nan_mask == 0x7F800000 if the half-precision number + * had exponent of 15 (i.e. was NaN or infinity) 0x00000000 otherwise + */ + const int32_t inf_nan_mask = + ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); + /* + * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 + * into 1. Otherwise, bit 31 remains 0. The signed shift right by 31 + * broadcasts bit 31 into all bits of the zero_mask. Thus zero_mask == + * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) + * 0x00000000 otherwise + */ + const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; + /* + * 1. Shift nonsign left by renorm_shift to normalize it (if the input + * was denormal) + * 2. Shift nonsign right by 3 so the exponent (5 bits originally) + * becomes an 8-bit field and 10-bit mantissa shifts into the 10 high + * bits of the 23-bit mantissa of IEEE single-precision number. + * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the + * different in exponent bias (0x7F for single-precision number less 0xF + * for half-precision number). + * 4. Subtract renorm_shift from the exponent (starting at bit 23) to + * account for renormalization. As renorm_shift is less than 0x70, this + * can be combined with step 3. + * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the + * input was NaN or infinity. + * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent + * into zero if the input was zero. + * 7. Combine with the sign of the input number. + */ + return sign | + ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | + inf_nan_mask) & + ~zero_mask); +} + +/* + * Convert a 16-bit floating-point number in IEEE half-precision format, in bit + * representation, to a 32-bit floating-point number in IEEE single-precision + * format. + * + * @note The implementation relies on IEEE-like (no assumption about rounding + * mode and no operations on denormals) floating-point operations and bitcasts + * between integer and floating-point variables. + */ +C10_HOST_DEVICE inline float fp16_ieee_to_fp32_value(uint16_t h) { +#ifdef C10_X86_F16 + return _cvtsh_ss(h); +#else + /* + * Extend the half-precision floating-point number to 32 bits and shift to the + * upper part of the 32-bit word: + * +---+-----+------------+-------------------+ + * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| + * +---+-----+------------+-------------------+ + * Bits 31 26-30 16-25 0-15 + * + * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 + * - zero bits. + */ + const uint32_t w = (uint32_t)h << 16; + /* + * Extract the sign of the input number into the high bit of the 32-bit word: + * + * +---+----------------------------------+ + * | S |0000000 00000000 00000000 00000000| + * +---+----------------------------------+ + * Bits 31 0-31 + */ + const uint32_t sign = w & UINT32_C(0x80000000); + /* + * Extract mantissa and biased exponent of the input number into the high bits + * of the 32-bit word: + * + * +-----+------------+---------------------+ + * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| + * +-----+------------+---------------------+ + * Bits 27-31 17-26 0-16 + */ + const uint32_t two_w = w + w; + + /* + * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become + * mantissa and exponent of a single-precision floating-point number: + * + * S|Exponent | Mantissa + * +-+---+-----+------------+----------------+ + * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| + * +-+---+-----+------------+----------------+ + * Bits | 23-31 | 0-22 + * + * Next, there are some adjustments to the exponent: + * - The exponent needs to be corrected by the difference in exponent bias + * between single-precision and half-precision formats (0x7F - 0xF = 0x70) + * - Inf and NaN values in the inputs should become Inf and NaN values after + * conversion to the single-precision number. Therefore, if the biased + * exponent of the half-precision input was 0x1F (max possible value), the + * biased exponent of the single-precision output must be 0xFF (max possible + * value). We do this correction in two steps: + * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset + * below) rather than by 0x70 suggested by the difference in the exponent bias + * (see above). + * - Then we multiply the single-precision result of exponent adjustment by + * 2**(-112) to reverse the effect of exponent adjustment by 0xE0 less the + * necessary exponent adjustment by 0x70 due to difference in exponent bias. + * The floating-point multiplication hardware would ensure than Inf and + * NaN would retain their value on at least partially IEEE754-compliant + * implementations. + * + * Note that the above operations do not handle denormal inputs (where biased + * exponent == 0). However, they also do not operate on denormal inputs, and + * do not produce denormal results. + */ + constexpr uint32_t exp_offset = UINT32_C(0xE0) << 23; + // const float exp_scale = 0x1.0p-112f; + constexpr uint32_t scale_bits = (uint32_t)15 << 23; + float exp_scale_val = 0; + std::memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val)); + const float exp_scale = exp_scale_val; + const float normalized_value = + fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; + + /* + * Convert denormalized half-precision inputs into single-precision results + * (always normalized). Zero inputs are also handled here. + * + * In a denormalized number the biased exponent is zero, and mantissa has + * on-zero bits. First, we shift mantissa into bits 0-9 of the 32-bit word. + * + * zeros | mantissa + * +---------------------------+------------+ + * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| + * +---------------------------+------------+ + * Bits 10-31 0-9 + * + * Now, remember that denormalized half-precision numbers are represented as: + * FP16 = mantissa * 2**(-24). + * The trick is to construct a normalized single-precision number with the + * same mantissa and thehalf-precision input and with an exponent which would + * scale the corresponding mantissa bits to 2**(-24). A normalized + * single-precision floating-point number is represented as: FP32 = (1 + + * mantissa * 2**(-23)) * 2**(exponent - 127) Therefore, when the biased + * exponent is 126, a unit change in the mantissa of the input denormalized + * half-precision number causes a change of the constructed single-precision + * number by 2**(-24), i.e. the same amount. + * + * The last step is to adjust the bias of the constructed single-precision + * number. When the input half-precision number is zero, the constructed + * single-precision number has the value of FP32 = 1 * 2**(126 - 127) = + * 2**(-1) = 0.5 Therefore, we need to subtract 0.5 from the constructed + * single-precision number to get the numerical equivalent of the input + * half-precision number. + */ + constexpr uint32_t magic_mask = UINT32_C(126) << 23; + constexpr float magic_bias = 0.5f; + const float denormalized_value = + fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; + + /* + * - Choose either results of conversion of input as a normalized number, or + * as a denormalized number, depending on the input exponent. The variable + * two_w contains input exponent in bits 27-31, therefore if its smaller than + * 2**27, the input is either a denormal number, or zero. + * - Combine the result of conversion of exponent and mantissa with the sign + * of the input number. + */ + constexpr uint32_t denormalized_cutoff = UINT32_C(1) << 27; + const uint32_t result = sign | + (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) + : fp32_to_bits(normalized_value)); + return fp32_from_bits(result); +#endif // C10_X86_F16 +} + +/* + * Convert a 32-bit floating-point number in IEEE single-precision format to a + * 16-bit floating-point number in IEEE half-precision format, in bit + * representation. + * + * @note The implementation relies on IEEE-like (no assumption about rounding + * mode and no operations on denormals) floating-point operations and bitcasts + * between integer and floating-point variables. + */ +inline uint16_t fp16_ieee_from_fp32_value(float f) { +#ifdef C10_X86_F16 + return _cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT); +#else + // const float scale_to_inf = 0x1.0p+112f; + // const float scale_to_zero = 0x1.0p-110f; + constexpr uint32_t scale_to_inf_bits = (uint32_t)239 << 23; + constexpr uint32_t scale_to_zero_bits = (uint32_t)17 << 23; + float scale_to_inf_val = 0, scale_to_zero_val = 0; + std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val)); + std::memcpy( + &scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val)); + const float scale_to_inf = scale_to_inf_val; + const float scale_to_zero = scale_to_zero_val; + +#if defined(_MSC_VER) && _MSC_VER == 1916 + float base = ((signbit(f) != 0 ? -f : f) * scale_to_inf) * scale_to_zero; +#else + float base = (fabsf(f) * scale_to_inf) * scale_to_zero; +#endif + + const uint32_t w = fp32_to_bits(f); + const uint32_t shl1_w = w + w; + const uint32_t sign = w & UINT32_C(0x80000000); + uint32_t bias = shl1_w & UINT32_C(0xFF000000); + if (bias < UINT32_C(0x71000000)) { + bias = UINT32_C(0x71000000); + } + + base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; + const uint32_t bits = fp32_to_bits(base); + const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); + const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); + const uint32_t nonsign = exp_bits + mantissa_bits; + return static_cast( + (sign >> 16) | + (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign)); +#endif // C10_X86_F16 +} + +#ifdef C10_X86_F16 +#undef C10_X86_F16 +#endif // C10_X86_F16 + +#if defined(__aarch64__) && !defined(__CUDACC__) +inline float16_t fp16_from_bits(uint16_t h) { + return c10::bit_cast(h); +} + +inline uint16_t fp16_to_bits(float16_t f) { + return c10::bit_cast(f); +} + +// According to https://godbolt.org/z/frExdbsWG it would translate to single +// fcvt s0, h0 +inline float native_fp16_to_fp32_value(uint16_t h) { + return static_cast(fp16_from_bits(h)); +} + +inline uint16_t native_fp16_from_fp32_value(float f) { + return fp16_to_bits(static_cast(f)); +} +#endif + +} // namespace detail + +struct alignas(2) Half { + unsigned short x; + + struct from_bits_t {}; + C10_HOST_DEVICE static constexpr from_bits_t from_bits() { + return from_bits_t(); + } + + // HIP wants __host__ __device__ tag, CUDA does not +#if defined(USE_ROCM) + C10_HOST_DEVICE Half() = default; +#else + Half() = default; +#endif + + constexpr C10_HOST_DEVICE Half(unsigned short bits, from_bits_t) : x(bits) {} +#if defined(__aarch64__) && !defined(__CUDACC__) + inline Half(float16_t value); + inline operator float16_t() const; +#else + inline C10_HOST_DEVICE Half(float value); + inline C10_HOST_DEVICE operator float() const; +#endif + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_HOST_DEVICE Half(const __half& value); + inline C10_HOST_DEVICE operator __half() const; +#endif +#ifdef SYCL_LANGUAGE_VERSION + inline C10_HOST_DEVICE Half(const sycl::half& value); + inline C10_HOST_DEVICE operator sycl::half() const; +#endif +}; + +#ifndef C10_EMBEDDED +C10_API inline std::ostream& operator<<(std::ostream& out, const Half& value) { + out << (float)value; + return out; +} +#endif // C10_EMBEDDED + +} // namespace c10 + +#include // IWYU pragma: keep diff --git a/runtime/core/portable_type/c10/util/TypeSafeSignMath.h b/runtime/core/portable_type/c10/util/TypeSafeSignMath.h new file mode 100644 index 0000000000..2853ff48d1 --- /dev/null +++ b/runtime/core/portable_type/c10/util/TypeSafeSignMath.h @@ -0,0 +1,140 @@ +#pragma once + +#include +#include +#include + +C10_CLANG_DIAGNOSTIC_PUSH() +#if C10_CLANG_HAS_WARNING("-Wstring-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wstring-conversion") +#endif +#if C10_CLANG_HAS_WARNING("-Wimplicit-int-float-conversion") +C10_CLANG_DIAGNOSTIC_IGNORE("-Wimplicit-int-float-conversion") +#endif + +namespace c10 { + +/// Returns false since we cannot have x < 0 if x is unsigned. +template +inline constexpr bool is_negative( + const T& /*x*/, + std::true_type /*is_unsigned*/) { + return false; +} + +/// Returns true if a signed variable x < 0 +template +inline constexpr bool is_negative(const T& x, std::false_type /*is_unsigned*/) { + return x < T(0); +} + +/// Returns true if x < 0 +/// NOTE: Will fail on an unsigned custom type +/// For the most part it's possible to fix this if +/// the custom type has a constexpr constructor. +/// However, notably, c10::Half does not :-( +template +inline constexpr bool is_negative(const T& x) { + return is_negative(x, std::is_unsigned()); +} + +/// Returns the sign of an unsigned variable x as 0, 1 +template +inline constexpr int signum(const T& x, std::true_type /*is_unsigned*/) { + return T(0) < x; +} + +/// Returns the sign of a signed variable x as -1, 0, 1 +template +inline constexpr int signum(const T& x, std::false_type /*is_unsigned*/) { + return (T(0) < x) - (x < T(0)); +} + +/// Returns the sign of x as -1, 0, 1 +/// NOTE: Will fail on an unsigned custom type +/// For the most part it's possible to fix this if +/// the custom type has a constexpr constructor. +/// However, notably, c10::Half does not :-( +template +inline constexpr int signum(const T& x) { + return signum(x, std::is_unsigned()); +} + +/// Returns true if a and b are not both negative +template +inline constexpr bool signs_differ(const T& a, const U& b) { + return is_negative(a) != is_negative(b); +} + +// Suppress sign compare warning when compiling with GCC +// as later does not account for short-circuit rule before +// raising the warning, see https://godbolt.org/z/Tr3Msnz99 +#ifdef __GNUC__ +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +#endif + +/// Returns true if x is greater than the greatest value of the type Limit +template +inline constexpr bool greater_than_max(const T& x) { + constexpr bool can_overflow = + std::numeric_limits::digits > std::numeric_limits::digits; + return can_overflow && x > std::numeric_limits::max(); +} + +#ifdef __GNUC__ +#pragma GCC diagnostic pop +#endif + +/// Returns true if x < lowest(Limit). Standard comparison +template +inline constexpr bool less_than_lowest( + const T& x, + std::false_type /*limit_is_unsigned*/, + std::false_type /*x_is_unsigned*/) { + return x < std::numeric_limits::lowest(); +} + +/// Returns false since all the limit is signed and therefore includes +/// negative values but x cannot be negative because it is unsigned +template +inline constexpr bool less_than_lowest( + const T& /*x*/, + std::false_type /*limit_is_unsigned*/, + std::true_type /*x_is_unsigned*/) { + return false; +} + +/// Returns true if x < 0, where 0 is constructed from T. +/// Limit is not signed, so its lower value is zero +template +inline constexpr bool less_than_lowest( + const T& x, + std::true_type /*limit_is_unsigned*/, + std::false_type /*x_is_unsigned*/) { + return x < T(0); +} + +/// Returns false sign both types are unsigned +template +inline constexpr bool less_than_lowest( + const T& /*x*/, + std::true_type /*limit_is_unsigned*/, + std::true_type /*x_is_unsigned*/) { + return false; +} + +/// Returns true if x is less than the lowest value of type T +/// NOTE: Will fail on an unsigned custom type +/// For the most part it's possible to fix this if +/// the custom type has a constexpr constructor. +/// However, notably, c10::Half does not : +template +inline constexpr bool less_than_lowest(const T& x) { + return less_than_lowest( + x, std::is_unsigned(), std::is_unsigned()); +} + +} // namespace c10 + +C10_CLANG_DIAGNOSTIC_POP() diff --git a/runtime/core/portable_type/c10/util/bit_cast.h b/runtime/core/portable_type/c10/util/bit_cast.h new file mode 100644 index 0000000000..c1d2c10288 --- /dev/null +++ b/runtime/core/portable_type/c10/util/bit_cast.h @@ -0,0 +1,44 @@ +#pragma once + +#include +#include + +#if __has_include() && (__cplusplus >= 202002L || (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)) +#include +#define C10_HAVE_STD_BIT_CAST 1 +#else +#define C10_HAVE_STD_BIT_CAST 0 +#endif // __has_include() && (__cplusplus >= 202002L || + // (defined(__cpp_lib_bit_cast) && __cpp_lib_bit_cast >= 201806L)) + +namespace c10 { + +#if C10_HAVE_STD_BIT_CAST +using std::bit_cast; +#else +// Implementations of std::bit_cast() from C++ 20. +// +// This is a less sketchy version of reinterpret_cast. +// +// See https://en.cppreference.com/w/cpp/numeric/bit_cast for more +// information as well as the source of our implementations. +template +std::enable_if_t< + sizeof(To) == sizeof(From) && std::is_trivially_copyable_v && + std::is_trivially_copyable_v, + To> +// constexpr support needs compiler magic +bit_cast(const From& src) noexcept { + static_assert( + std::is_trivially_constructible_v, + "This implementation additionally requires " + "destination type to be trivially constructible"); + + To dst; + std::memcpy(&dst, &src, sizeof(To)); + return dst; +} +#endif // C10_HAVE_STD_BIT_CAST +#undef C10_HAVE_STD_BIT_CAST + +} // namespace c10 diff --git a/runtime/core/portable_type/c10/util/floating_point_utils.h b/runtime/core/portable_type/c10/util/floating_point_utils.h new file mode 100644 index 0000000000..b240c4ea23 --- /dev/null +++ b/runtime/core/portable_type/c10/util/floating_point_utils.h @@ -0,0 +1,33 @@ +#pragma once + +#include +#include +#include + +namespace c10::detail { + +C10_HOST_DEVICE inline float fp32_from_bits(uint32_t w) { +#if defined(__OPENCL_VERSION__) + return as_float(w); +#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return __uint_as_float((unsigned int)w); +#elif defined(__INTEL_COMPILER) + return _castu32_f32(w); +#else + return c10::bit_cast(w); +#endif +} + +C10_HOST_DEVICE inline uint32_t fp32_to_bits(float f) { +#if defined(__OPENCL_VERSION__) + return as_uint(f); +#elif defined(__CUDA_ARCH__) || defined(__HIP_DEVICE_COMPILE__) + return (uint32_t)__float_as_uint(f); +#elif defined(__INTEL_COMPILER) + return _castf32_u32(f); +#else + return c10::bit_cast(f); +#endif +} + +} // namespace c10::detail diff --git a/runtime/core/portable_type/half.h b/runtime/core/portable_type/half.h index fa40a80782..bf4c676ce8 100644 --- a/runtime/core/portable_type/half.h +++ b/runtime/core/portable_type/half.h @@ -8,757 +8,20 @@ #pragma once -#include -#include -#include -#include -#include - -#if defined(__GNUC__) || defined(__clang__) -#if defined(__aarch64__) -#ifndef __ARM_V8_ONLY__ -#define NATIVE_FP16 1 -#endif // __ARM_V8_ONLY__ -#endif // __aarch64__ -#endif // GNUC or clang - -#if defined(__GNUC__) || defined(__clang__) -#if defined(__x86_64__) || defined(_M_X64) || defined(__i386) || \ - defined(_M_IX86) -#if defined(__AVX2__) -#define X86_F16 1 -#include // import conversion ops from f16cintrin.h -#endif // __AVX2__ -#endif // __x86_64__ || _M_X64 || __i386 || _M_IX86 -#endif // __GNUC__ || __clang__ - -namespace executorch { -namespace runtime { -namespace etensor { - -/** - * A half-precision floating point type, compatible with c10/util/Half.h from - * pytorch core. - */ -struct alignas(2) Half { - union { -#ifdef NATIVE_FP16 - _Float16 y; -#endif - uint16_t x; - }; - - struct from_bits_t {}; - static constexpr from_bits_t from_bits() { - return from_bits_t(); - } - - Half() = default; - - constexpr Half(uint16_t bits, from_bits_t) : x(bits) {} - /* implicit */ inline Half(float value); - inline operator float() const; -}; +#include +namespace executorch::runtime::etensor { +using c10::Half; namespace internal { - -inline float fp32_from_bits(uint32_t w) { - static_assert(sizeof(float) == sizeof(uint32_t)); - union { - uint32_t as_bits; - float as_value; - } fp32 = {w}; - return fp32.as_value; -} - -inline uint32_t fp32_to_bits(float f) { - static_assert(sizeof(float) == sizeof(uint32_t)); - union { - float as_value; - uint32_t as_bits; - } fp32 = {f}; - return fp32.as_bits; -} - -/* - * Convert a 16-bit floating-point number in IEEE half-precision format, in bit - * representation, to a 32-bit floating-point number in IEEE single-precision - * format, in bit representation. - * - * @note The implementation doesn't use any floating-point operations. - */ -inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { - /* - * Extend the half-precision floating-point number to 32 bits and shift to the - * upper part of the 32-bit word: - * +---+-----+------------+-------------------+ - * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| - * +---+-----+------------+-------------------+ - * Bits 31 26-30 16-25 0-15 - * - * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - * - zero bits. - */ - const uint32_t w = (uint32_t)h << 16; - /* - * Extract the sign of the input number into the high bit of the 32-bit word: - * - * +---+----------------------------------+ - * | S |0000000 00000000 00000000 00000000| - * +---+----------------------------------+ - * Bits 31 0-31 - */ - const uint32_t sign = w & UINT32_C(0x80000000); - /* - * Extract mantissa and biased exponent of the input number into the bits 0-30 - * of the 32-bit word: - * - * +---+-----+------------+-------------------+ - * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| - * +---+-----+------------+-------------------+ - * Bits 30 27-31 17-26 0-16 - */ - const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); - /* - * Renorm shift is the number of bits to shift mantissa left to make the - * half-precision number normalized. If the initial number is normalized, some - * of its high 6 bits (sign == 0 and 5-bit exponent) equals one. In this case - * renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note - * that if we shift denormalized nonsign by renorm_shift, the unit bit of - * mantissa will shift into exponent, turning the biased exponent into 1, and - * making mantissa normalized (i.e. without leading 1). - */ -#ifdef _MSC_VER - unsigned long nonsign_bsr; - _BitScanReverse(&nonsign_bsr, (unsigned long)nonsign); - uint32_t renorm_shift = (uint32_t)nonsign_bsr ^ 31; -#else - uint32_t renorm_shift = __builtin_clz(nonsign); -#endif - renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; - /* - * Iff half-precision number has exponent of 15, the addition overflows - * it into bit 31, and the subsequent shift turns the high 9 bits - * into 1. Thus inf_nan_mask == 0x7F800000 if the half-precision number - * had exponent of 15 (i.e. was NaN or infinity) 0x00000000 otherwise - */ - const int32_t inf_nan_mask = - ((int32_t)(nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); - /* - * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 - * into 1. Otherwise, bit 31 remains 0. The signed shift right by 31 - * broadcasts bit 31 into all bits of the zero_mask. Thus zero_mask == - * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) - * 0x00000000 otherwise - */ - const int32_t zero_mask = (int32_t)(nonsign - 1) >> 31; - /* - * 1. Shift nonsign left by renorm_shift to normalize it (if the input - * was denormal) - * 2. Shift nonsign right by 3 so the exponent (5 bits originally) - * becomes an 8-bit field and 10-bit mantissa shifts into the 10 high - * bits of the 23-bit mantissa of IEEE single-precision number. - * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the - * different in exponent bias (0x7F for single-precision number less 0xF - * for half-precision number). - * 4. Subtract renorm_shift from the exponent (starting at bit 23) to - * account for renormalization. As renorm_shift is less than 0x70, this - * can be combined with step 3. - * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the - * input was NaN or infinity. - * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent - * into zero if the input was zero. - * 7. Combine with the sign of the input number. - */ - return sign | - ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | - inf_nan_mask) & - ~zero_mask); -} - -/* - * Convert a 16-bit floating-point number in IEEE half-precision format, in bit - * representation, to a 32-bit floating-point number in IEEE single-precision - * format. - * - * @note The implementation relies on IEEE-like (no assumption about rounding - * mode and no operations on denormals) floating-point operations and bitcasts - * between integer and floating-point variables. - */ -inline float fp16_ieee_to_fp32_value(uint16_t h) { -#ifdef X86_F16 - return _cvtsh_ss(h); -#else - - /* - * Extend the half-precision floating-point number to 32 bits and shift to the - * upper part of the 32-bit word: - * +---+-----+------------+-------------------+ - * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| - * +---+-----+------------+-------------------+ - * Bits 31 26-30 16-25 0-15 - * - * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - * - zero bits. - */ - const uint32_t w = (uint32_t)h << 16; - /* - * Extract the sign of the input number into the high bit of the 32-bit word: - * - * +---+----------------------------------+ - * | S |0000000 00000000 00000000 00000000| - * +---+----------------------------------+ - * Bits 31 0-31 - */ - const uint32_t sign = w & UINT32_C(0x80000000); - /* - * Extract mantissa and biased exponent of the input number into the high bits - * of the 32-bit word: - * - * +-----+------------+---------------------+ - * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| - * +-----+------------+---------------------+ - * Bits 27-31 17-26 0-16 - */ - const uint32_t two_w = w + w; - - /* - * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become - * mantissa and exponent of a single-precision floating-point number: - * - * S|Exponent | Mantissa - * +-+---+-----+------------+----------------+ - * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| - * +-+---+-----+------------+----------------+ - * Bits | 23-31 | 0-22 - * - * Next, there are some adjustments to the exponent: - * - The exponent needs to be corrected by the difference in exponent bias - * between single-precision and half-precision formats (0x7F - 0xF = 0x70) - * - Inf and NaN values in the inputs should become Inf and NaN values after - * conversion to the single-precision number. Therefore, if the biased - * exponent of the half-precision input was 0x1F (max possible value), the - * biased exponent of the single-precision output must be 0xFF (max possible - * value). We do this correction in two steps: - * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset - * below) rather than by 0x70 suggested by the difference in the exponent bias - * (see above). - * - Then we multiply the single-precision result of exponent adjustment by - * 2**(-112) to reverse the effect of exponent adjustment by 0xE0 less the - * necessary exponent adjustment by 0x70 due to difference in exponent bias. - * The floating-point multiplication hardware would ensure than Inf and - * NaN would retain their value on at least partially IEEE754-compliant - * implementations. - * - * Note that the above operations do not handle denormal inputs (where biased - * exponent == 0). However, they also do not operate on denormal inputs, and - * do not produce denormal results. - */ - constexpr uint32_t exp_offset = UINT32_C(0xE0) << 23; - // const float exp_scale = 0x1.0p-112f; - constexpr uint32_t scale_bits = (uint32_t)15 << 23; - float exp_scale_val = 0; - std::memcpy(&exp_scale_val, &scale_bits, sizeof(exp_scale_val)); - const float exp_scale = exp_scale_val; - const float normalized_value = - fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; - - /* - * Convert denormalized half-precision inputs into single-precision results - * (always normalized). Zero inputs are also handled here. - * - * In a denormalized number the biased exponent is zero, and mantissa has - * on-zero bits. First, we shift mantissa into bits 0-9 of the 32-bit word. - * - * zeros | mantissa - * +---------------------------+------------+ - * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| - * +---------------------------+------------+ - * Bits 10-31 0-9 - * - * Now, remember that denormalized half-precision numbers are represented as: - * FP16 = mantissa * 2**(-24). - * The trick is to construct a normalized single-precision number with the - * same mantissa and thehalf-precision input and with an exponent which would - * scale the corresponding mantissa bits to 2**(-24). A normalized - * single-precision floating-point number is represented as: FP32 = (1 + - * mantissa * 2**(-23)) * 2**(exponent - 127) Therefore, when the biased - * exponent is 126, a unit change in the mantissa of the input denormalized - * half-precision number causes a change of the constructed single-precision - * number by 2**(-24), i.e. the same amount. - * - * The last step is to adjust the bias of the constructed single-precision - * number. When the input half-precision number is zero, the constructed - * single-precision number has the value of FP32 = 1 * 2**(126 - 127) = - * 2**(-1) = 0.5 Therefore, we need to subtract 0.5 from the constructed - * single-precision number to get the numerical equivalent of the input - * half-precision number. - */ - constexpr uint32_t magic_mask = UINT32_C(126) << 23; - constexpr float magic_bias = 0.5f; - const float denormalized_value = - fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; - - /* - * - Choose either results of conversion of input as a normalized number, or - * as a denormalized number, depending on the input exponent. The variable - * two_w contains input exponent in bits 27-31, therefore if its smaller than - * 2**27, the input is either a denormal number, or zero. - * - Combine the result of conversion of exponent and mantissa with the sign - * of the input number. - */ - constexpr uint32_t denormalized_cutoff = UINT32_C(1) << 27; - const uint32_t result = sign | - (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) - : fp32_to_bits(normalized_value)); - return fp32_from_bits(result); - -#endif // not X86_F16 -} - -/* - * Convert a 32-bit floating-point number in IEEE single-precision format to a - * 16-bit floating-point number in IEEE half-precision format, in bit - * representation. - * - * @note The implementation relies on IEEE-like (no assumption about rounding - * mode and no operations on denormals) floating-point operations and bitcasts - * between integer and floating-point variables. - */ -inline uint16_t fp16_ieee_from_fp32_value(float f) { -#ifdef X86_F16 - return _cvtss_sh(f, _MM_FROUND_TO_NEAREST_INT); -#else - - // const float scale_to_inf = 0x1.0p+112f; - // const float scale_to_zero = 0x1.0p-110f; - constexpr uint32_t scale_to_inf_bits = (uint32_t)239 << 23; - constexpr uint32_t scale_to_zero_bits = (uint32_t)17 << 23; - float scale_to_inf_val = 0, scale_to_zero_val = 0; - std::memcpy(&scale_to_inf_val, &scale_to_inf_bits, sizeof(scale_to_inf_val)); - std::memcpy( - &scale_to_zero_val, &scale_to_zero_bits, sizeof(scale_to_zero_val)); - const float scale_to_inf = scale_to_inf_val; - const float scale_to_zero = scale_to_zero_val; - -#if defined(_MSC_VER) && _MSC_VER == 1916 - float base = ((signbit(f) != 0 ? -f : f) * scale_to_inf) * scale_to_zero; -#else - float base = (fabsf(f) * scale_to_inf) * scale_to_zero; -#endif - - const uint32_t w = fp32_to_bits(f); - const uint32_t shl1_w = w + w; - const uint32_t sign = w & UINT32_C(0x80000000); - uint32_t bias = shl1_w & UINT32_C(0xFF000000); - if (bias < UINT32_C(0x71000000)) { - bias = UINT32_C(0x71000000); - } - - base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; - const uint32_t bits = fp32_to_bits(base); - const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); - const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); - const uint32_t nonsign = exp_bits + mantissa_bits; - return static_cast( - (sign >> 16) | - (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign)); -#endif // not X86_F16 -} - +using c10::detail::fp16_ieee_from_fp32_value; +using c10::detail::fp16_ieee_to_fp32_bits; +using c10::detail::fp16_ieee_to_fp32_value; +using c10::detail::fp32_from_bits; +using c10::detail::fp32_to_bits; } // namespace internal - -/// Constructors -#ifdef NATIVE_FP16 -inline Half::Half(float value) : y(value) {} -#else -inline Half::Half(float value) - : x(internal::fp16_ieee_from_fp32_value(value)) {} -#endif - -/// Implicit conversions -#ifdef NATIVE_FP16 -inline Half::operator float() const { - return (float)y; -} -#else -inline Half::operator float() const { - return internal::fp16_ieee_to_fp32_value(x); -} -#endif - -/// Arithmetic - -#ifdef NATIVE_FP16 - -#define return_half(r) \ - do { \ - Half ret; \ - ret.y = r; \ - return ret; \ - } while (0) - -inline Half operator+(const Half& a, const Half& b) { - return_half(a.y + b.y); -} - -inline Half operator-(const Half& a, const Half& b) { - return_half(a.y - b.y); - return static_cast(a) - static_cast(b); -} - -inline Half operator*(const Half& a, const Half& b) { - return_half(a.y * b.y); -} - -inline Half operator/(const Half& a, const Half& b) { - return_half(a.y / b.y); -} - -inline Half operator-(const Half& a) { - return_half(-a.y); -} - -inline Half& operator+=(Half& a, const Half& b) { - a.y += b.y; - return a; -} - -inline Half& operator-=(Half& a, const Half& b) { - a.y -= b.y; - return a; -} - -inline Half& operator*=(Half& a, const Half& b) { - a.y *= b.y; - return a; -} - -inline Half& operator/=(Half& a, const Half& b) { - a.y /= b.y; - return a; -} - -#else - -inline Half operator+(const Half& a, const Half& b) { - return static_cast(a) + static_cast(b); -} - -inline Half operator-(const Half& a, const Half& b) { - return static_cast(a) - static_cast(b); -} - -inline Half operator*(const Half& a, const Half& b) { - return static_cast(a) * static_cast(b); -} - -inline Half operator/(const Half& a, const Half& b) { - return static_cast(a) / static_cast(b); -} - -inline Half operator-(const Half& a) { - return -static_cast(a); -} - -inline Half& operator+=(Half& a, const Half& b) { - a = a + b; - return a; -} - -inline Half& operator-=(Half& a, const Half& b) { - a = a - b; - return a; -} - -inline Half& operator*=(Half& a, const Half& b) { - a = a * b; - return a; -} - -inline Half& operator/=(Half& a, const Half& b) { - a = a / b; - return a; -} - -#endif - -/// Arithmetic with floats - -inline float operator+(Half a, float b) { - return static_cast(a) + b; -} -inline float operator-(Half a, float b) { - return static_cast(a) - b; -} -inline float operator*(Half a, float b) { - return static_cast(a) * b; -} -inline float operator/(Half a, float b) { - return static_cast(a) / b; -} - -inline float operator+(float a, Half b) { - return a + static_cast(b); -} -inline float operator-(float a, Half b) { - return a - static_cast(b); -} -inline float operator*(float a, Half b) { - return a * static_cast(b); -} -inline float operator/(float a, Half b) { - return a / static_cast(b); -} - -inline float& operator+=(float& a, const Half& b) { - return a += static_cast(b); -} -inline float& operator-=(float& a, const Half& b) { - return a -= static_cast(b); -} -inline float& operator*=(float& a, const Half& b) { - return a *= static_cast(b); -} -inline float& operator/=(float& a, const Half& b) { - return a /= static_cast(b); -} - -/// Arithmetic with doubles - -inline double operator+(Half a, double b) { - return static_cast(a) + b; -} -inline double operator-(Half a, double b) { - return static_cast(a) - b; -} -inline double operator*(Half a, double b) { - return static_cast(a) * b; -} -inline double operator/(Half a, double b) { - return static_cast(a) / b; -} - -inline double operator+(double a, Half b) { - return a + static_cast(b); -} -inline double operator-(double a, Half b) { - return a - static_cast(b); -} -inline double operator*(double a, Half b) { - return a * static_cast(b); -} -inline double operator/(double a, Half b) { - return a / static_cast(b); -} - -/// Arithmetic with ints - -#ifdef NATIVE_FP16 - -inline Half operator+(Half a, int32_t b) { - return_half(a.y + b); -} -inline Half operator-(Half a, int32_t b) { - return_half(a.y - b); -} -inline Half operator*(Half a, int32_t b) { - return_half(a.y * b); -} -inline Half operator/(Half a, int32_t b) { - return_half(a.y / b); -} - -inline Half operator+(int32_t a, Half b) { - return_half(a + b.y); -} -inline Half operator-(int32_t a, Half b) { - return_half(a - b.y); -} -inline Half operator*(int32_t a, Half b) { - return_half(a * b.y); -} -inline Half operator/(int32_t a, Half b) { - return_half(a / b.y); -} - -#else - -inline Half operator+(Half a, int32_t b) { - return a + static_cast(b); -} -inline Half operator-(Half a, int32_t b) { - return a - static_cast(b); -} -inline Half operator*(Half a, int32_t b) { - return a * static_cast(b); -} -inline Half operator/(Half a, int32_t b) { - return a / static_cast(b); -} - -inline Half operator+(int32_t a, Half b) { - return static_cast(a) + b; -} -inline Half operator-(int32_t a, Half b) { - return static_cast(a) - b; -} -inline Half operator*(int32_t a, Half b) { - return static_cast(a) * b; -} -inline Half operator/(int32_t a, Half b) { - return static_cast(a) / b; -} - -#endif - -//// Arithmetic with int64_t - -#ifdef NATIVE_FP16 - -inline Half operator+(Half a, int64_t b) { - return_half(a.y + b); -} -inline Half operator-(Half a, int64_t b) { - return_half(a.y - b); -} -inline Half operator*(Half a, int64_t b) { - return_half(a.y * b); -} -inline Half operator/(Half a, int64_t b) { - return_half(a.y / b); -} - -inline Half operator+(int64_t a, Half b) { - return_half(a + b.y); -} -inline Half operator-(int64_t a, Half b) { - return_half(a - b.y); -} -inline Half operator*(int64_t a, Half b) { - return_half(a * b.y); -} -inline Half operator/(int64_t a, Half b) { - return_half(a / b.y); -} - -#else - -inline Half operator+(Half a, int64_t b) { - return a + static_cast(b); -} -inline Half operator-(Half a, int64_t b) { - return a - static_cast(b); -} -inline Half operator*(Half a, int64_t b) { - return a * static_cast(b); -} -inline Half operator/(Half a, int64_t b) { - return a / static_cast(b); -} - -inline Half operator+(int64_t a, Half b) { - return static_cast(a) + b; -} -inline Half operator-(int64_t a, Half b) { - return static_cast(a) - b; -} -inline Half operator*(int64_t a, Half b) { - return static_cast(a) * b; -} -inline Half operator/(int64_t a, Half b) { - return static_cast(a) / b; -} - -#endif - -/// NOTE: we do not define comparisons directly and instead rely on the implicit -/// conversion Half to float. - -static inline std::ostream& operator<<( - std::ostream& out, - const executorch::runtime::etensor::Half& value) { - out << (float)value; - return out; -} - -} // namespace etensor -} // namespace runtime -} // namespace executorch -namespace torch { -namespace executor { +} // namespace executorch::runtime::etensor +namespace torch::executor { // TODO(T197294990): Remove these deprecated aliases once all users have moved // to the new `::executorch` namespaces. using ::executorch::runtime::etensor::Half; -} // namespace executor -} // namespace torch - -namespace std { - -template <> -class numeric_limits { - public: - static constexpr bool is_specialized = true; - static constexpr bool is_signed = true; - static constexpr bool is_integer = false; - static constexpr bool is_exact = false; - static constexpr bool has_infinity = true; - static constexpr bool has_quiet_NaN = true; - static constexpr bool has_signaling_NaN = true; - static constexpr auto has_denorm = numeric_limits::has_denorm; - static constexpr auto has_denorm_loss = - numeric_limits::has_denorm_loss; - static constexpr auto round_style = numeric_limits::round_style; - static constexpr bool is_iec559 = true; - static constexpr bool is_bounded = true; - static constexpr bool is_modulo = false; - static constexpr int digits = 11; - static constexpr int digits10 = 3; - static constexpr int max_digits10 = 5; - static constexpr int radix = 2; - static constexpr int min_exponent = -13; - static constexpr int min_exponent10 = -4; - static constexpr int max_exponent = 16; - static constexpr int max_exponent10 = 4; - static constexpr auto traps = numeric_limits::traps; - static constexpr auto tinyness_before = - numeric_limits::tinyness_before; - static constexpr executorch::runtime::etensor::Half min() { - return executorch::runtime::etensor::Half( - 0x0400, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half lowest() { - return executorch::runtime::etensor::Half( - 0xFBFF, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half max() { - return executorch::runtime::etensor::Half( - 0x7BFF, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half epsilon() { - return executorch::runtime::etensor::Half( - 0x1400, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half round_error() { - return executorch::runtime::etensor::Half( - 0x3800, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half infinity() { - return executorch::runtime::etensor::Half( - 0x7C00, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half quiet_NaN() { - return executorch::runtime::etensor::Half( - 0x7E00, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half signaling_NaN() { - return executorch::runtime::etensor::Half( - 0x7D00, executorch::runtime::etensor::Half::from_bits()); - } - static constexpr executorch::runtime::etensor::Half denorm_min() { - return executorch::runtime::etensor::Half( - 0x0001, executorch::runtime::etensor::Half::from_bits()); - } -}; - -} // namespace std +} // namespace torch::executor diff --git a/runtime/core/portable_type/targets.bzl b/runtime/core/portable_type/targets.bzl index b8ccbe602e..0532def930 100644 --- a/runtime/core/portable_type/targets.bzl +++ b/runtime/core/portable_type/targets.bzl @@ -50,6 +50,9 @@ def define_common_targets(): "qint_types.h", "bits_types.h", ], + exported_deps = [ + "//executorch/runtime/core/portable_type/c10:c10", + ], visibility = [ "//executorch/extension/...", "//executorch/runtime/core/exec_aten/util/...", diff --git a/runtime/kernel/test/CMakeLists.txt b/runtime/kernel/test/CMakeLists.txt index b1b6044791..4e8c24776f 100644 --- a/runtime/kernel/test/CMakeLists.txt +++ b/runtime/kernel/test/CMakeLists.txt @@ -47,14 +47,11 @@ add_executable( ) target_link_libraries( operator_registry_max_kernel_num_test GTest::gtest GTest::gtest_main - GTest::gmock + GTest::gmock executorch ) target_compile_definitions( operator_registry_max_kernel_num_test PRIVATE "-DMAX_KERNEL_NUM=1" ) -target_include_directories( - operator_registry_max_kernel_num_test PRIVATE ${EXECUTORCH_ROOT}/.. -) # TODO: This is currently not working! # add_test(operator_registry_max_kernel_num_test operator_registry_max_kernel_num_test) diff --git a/shim/xplat/executorch/build/env_interface.bzl b/shim/xplat/executorch/build/env_interface.bzl index c4111c744b..d32e62c610 100644 --- a/shim/xplat/executorch/build/env_interface.bzl +++ b/shim/xplat/executorch/build/env_interface.bzl @@ -119,7 +119,7 @@ def _remove_platform_specific_args(kwargs): keys = [] for key in kwargs: if (key.endswith("_platform_preprocessor_flags") or key.endswith("_platform_deps") or - key.startswith("fbobjc") or key.endswith("_platform_compiler_flags")): + key.startswith("fbobjc") or key.endswith("_platform_compiler_flags") or key == "fbcode_exported_preprocessor_flags"): keys.append(key) for key in keys: kwargs.pop(key) @@ -140,7 +140,8 @@ def _patch_headers(kwargs): # header_namespace is to workaround the fact that all C++ source files are having the pattern: # `include ` but BUCK2 root is at executorch/ so the `executorch/` prefix is redundant. - kwargs["header_namespace"] = "executorch/" + native.package_name() + if "header_namespace" not in kwargs: + kwargs["header_namespace"] = "executorch/" + native.package_name() return kwargs def _patch_pp_flags(kwargs):