diff --git a/.github/workflows/macos_debug_fetch_hwloc.yml b/.github/workflows/macos_debug_fetch_hwloc.yml index f778a6b117d..caec2af3ac2 100644 --- a/.github/workflows/macos_debug_fetch_hwloc.yml +++ b/.github/workflows/macos_debug_fetch_hwloc.yml @@ -19,7 +19,6 @@ jobs: run: | brew install --overwrite python-tk && \ brew install --overwrite boost gperftools ninja autoconf automake && \ - autoreconf -f -i \ brew upgrade cmake - name: Configure shell: bash diff --git a/libs/core/algorithms/CMakeLists.txt b/libs/core/algorithms/CMakeLists.txt index 6fcfed897e2..9090345722d 100644 --- a/libs/core/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/CMakeLists.txt @@ -37,7 +37,9 @@ set(algorithms_headers hpx/parallel/algorithms/detail/parallel_stable_sort.hpp hpx/parallel/algorithms/detail/pivot.hpp hpx/parallel/algorithms/detail/reduce.hpp + hpx/parallel/algorithms/detail/reduce_deterministic.hpp hpx/parallel/algorithms/detail/replace.hpp + hpx/parallel/algorithms/detail/rfa.hpp hpx/parallel/algorithms/detail/rotate.hpp hpx/parallel/algorithms/detail/sample_sort.hpp hpx/parallel/algorithms/detail/search.hpp @@ -72,6 +74,7 @@ set(algorithms_headers hpx/parallel/algorithms/partition.hpp hpx/parallel/algorithms/reduce_by_key.hpp hpx/parallel/algorithms/reduce.hpp + hpx/parallel/algorithms/reduce_deterministic.hpp hpx/parallel/algorithms/remove_copy.hpp hpx/parallel/algorithms/remove.hpp hpx/parallel/algorithms/replace.hpp diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp new file mode 100644 index 00000000000..b3773088917 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/reduce_deterministic.hpp @@ -0,0 +1,83 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include "rfa.hpp" + +namespace hpx::parallel::detail { + + template + struct sequential_reduce_deterministic_t final + : hpx::functional::detail::tag_fallback< + sequential_reduce_deterministic_t> + { + private: + template + friend constexpr T tag_fallback_invoke( + sequential_reduce_deterministic_t, ExPolicy&&, InIterB first, + InIterE last, T init, Reduce&& r) + { + /// TODO: Put constraint on Reduce to be a binary plus operator + (void) r; + hpx::parallel::detail::rfa::RFA_bins bins; + bins.initialize_bins(); + std::memcpy(rfa::__rfa_bin_host_buffer__, &bins, sizeof(bins)); + + hpx::parallel::detail::rfa::ReproducibleFloatingAccumulator rfa; + rfa.set_max_abs_val(init); + rfa.unsafe_add(init); + rfa.renorm(); + size_t count = 0; + T max_val = std::abs(*first); + for (auto e = first; e != last; ++e) + { + T temp_max_val = std::abs(static_cast(*e)); + if (max_val < temp_max_val) + { + rfa.set_max_abs_val(temp_max_val); + max_val = temp_max_val; + } + rfa.unsafe_add(*e); + count++; + if (count == rfa.endurance()) + { + rfa.renorm(); + count = 0; + } + } + return rfa.conv(); + } + }; + +#if !defined(HPX_COMPUTE_DEVICE_CODE) + template + inline constexpr sequential_reduce_deterministic_t + sequential_reduce_deterministic = + sequential_reduce_deterministic_t{}; +#else + template + HPX_HOST_DEVICE HPX_FORCEINLINE auto sequential_reduce_deterministic( + Args&&... args) + { + return sequential_reduce_deterministic_t{}( + std::forward(args)...); + } +#endif + +} // namespace hpx::parallel::detail diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp new file mode 100644 index 00000000000..2c72f10bfb9 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/detail/rfa.hpp @@ -0,0 +1,1213 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) +// +// --------------------------------------------------------------------------- +// This file has been taken from +// https://github.com/maddyscientist/reproducible_floating_sums commit +// b5a065741d4ea459437ca004b508de9dcb6a3e52. The boost copyright has been added +// to this file in accordance with the dual license terms for the Reproducible +// Floating-Point Summations and conformance with the HPX policy +// https://github.com/maddyscientist/reproducible_floating_sums/blob/feature/cuda/LICENSE.md +// --------------------------------------------------------------------------- +// +/// Copyright 2022 Richard Barnes, Peter Ahrens, James Demmel +/// Permission is hereby granted, free of charge, to any person obtaining a copy +/// of this software and associated documentation files (the "Software"), to deal +/// in the Software without restriction, including without limitation the rights +/// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +/// copies of the Software, and to permit persons to whom the Software is +/// furnished to do so, subject to the following conditions: +/// The above copyright notice and this permission notice shall be included in +/// all copies or substantial portions of the Software. +/// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +/// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +/// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +/// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +/// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +/// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +/// SOFTWARE. +//Reproducible Floating Point Accumulations via Binned Floating Point +//Adapted to C++ by Richard Barnes from ReproBLAS v2.1.0. +//ReproBLAS by Peter Ahrens, Hong Diep Nguyen, and James Demmel. +// +//The code accomplishes several objectives: +// +//1. Reproducible summation, independent of summation order, assuming only a +// subset of the IEEE 754 Floating Point Standard +// +//2. Has accuracy at least as good as conventional summation, and tunable +// +//3. Handles overflow, underflow, and other exceptions reproducibly. +// +//4. Makes only one read-only pass over the summands. +// +//5. Requires only one parallel reduction. +// +//6. Uses minimal memory (6 doubles per accumulator with fold=3). +// +//7. Relatively easy to use + +#pragma once + +#include +#include +#include +#include +#include +#include +#include + +#include + +namespace hpx::parallel::detail::rfa { + template + struct type4 + { + F x; + F y; + F z; + F w; + }; + + template + struct type2 + { + F x; + F y; + }; + using float4 = type4; + using double4 = type4; + using float2 = type2; + using double2 = type2; + + auto abs_max(float4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double4 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + auto z = std::abs(a.z); + auto w = std::abs(a.w); + const std::vector v = {x, y, z, w}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(float2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + + auto abs_max(double2 a) + { + auto x = std::abs(a.x); + auto y = std::abs(a.y); + const std::vector v = {x, y}; + return *std::max_element(v.begin(), v.end()); + } + +// disable zero checks +#define DISABLE_ZERO + +// disable nan / infinity checks +#define DISABLE_NANINF + +// jump table for indexing into data +#define MAX_JUMP 5 + static_assert(MAX_JUMP <= 5, "MAX_JUMP greater than max"); + + template + inline constexpr Real ldexp_impl(Real arg, int exp) noexcept + { + return std::ldexp(arg, exp); + // while (arg == 0) + // { + // return arg; + // } + // while (exp > 0) + // { + // arg *= static_cast(2); + // --exp; + // } + // while (exp < 0) + // { + // arg /= static_cast(2); + // ++exp; + // } + + // return arg; + } + + template + struct RFA_bins + { + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + + ///The binned floating-point reference bins + std::array bins = {}; + + constexpr ftype& operator[](int d) + { + return bins[d]; + } + + void initialize_bins() + { + if constexpr (std::is_same_v) + { + bins[0] = std::ldexp(0.75, MAX_EXP); + } + else + { + bins[0] = 2.0 * std::ldexp(0.75, MAX_EXP - 1); + } + + for (int index = 1; index <= MAXINDEX; index++) + { + bins[index] = std::ldexp(0.75, + MAX_EXP + MANT_DIG - BIN_WIDTH + 1 - index * BIN_WIDTH); + } + for (int index = MAXINDEX + 1; index < MAXINDEX + MAXFOLD; index++) + { + bins[index] = bins[index - 1]; + } + } + }; + + static char __rfa_bin_host_buffer__[sizeof(RFA_bins)]; + + ///Class to hold a reproducible summation of the numbers passed to it + /// + ///@param ftype Floating-point data type; either `float` or `double + ///@param FOLD The fold; use 3 as a default unless you understand it. + template ::value>* = + nullptr> + class alignas(2 * sizeof(ftype_)) ReproducibleFloatingAccumulator + { + public: + using ftype = ftype_; + static constexpr int FOLD = FOLD_; + + private: + std::array data = {{0}}; + + ///Floating-point precision bin width + static constexpr auto BIN_WIDTH = + std::is_same_v ? 40 : 13; + static constexpr auto MIN_EXP = + std::numeric_limits::min_exponent; + static constexpr auto MAX_EXP = + std::numeric_limits::max_exponent; + static constexpr auto MANT_DIG = std::numeric_limits::digits; + ///Binned floating-point maximum index + static constexpr auto MAXINDEX = + ((MAX_EXP - MIN_EXP + MANT_DIG - 1) / BIN_WIDTH) - 1; + //The maximum floating-point fold supported by the library + static constexpr auto MAXFOLD = MAXINDEX + 1; + ///Binned floating-point compression factor + ///This factor is used to scale down inputs before deposition into the bin of + ///highest index + static constexpr auto COMPRESSION = + 1.0 / (1 << (MANT_DIG - BIN_WIDTH + 1)); + ///Binned double precision expansion factor + ///This factor is used to scale up inputs after deposition into the bin of + ///highest index + static constexpr auto EXPANSION = + 1.0 * (1 << (MANT_DIG - BIN_WIDTH + 1)); + static constexpr auto EXP_BIAS = MAX_EXP - 2; + static constexpr auto EPSILON = std::numeric_limits::epsilon(); + ///Binned floating-point deposit endurance + ///The number of deposits that can be performed before a renorm is necessary. + ///Applies also to binned complex double precision. + static constexpr auto ENDURANCE = 1 << (MANT_DIG - BIN_WIDTH - 2); + + ///Return a binned floating-point reference bin + inline const ftype* binned_bins(const int x) const + { + return &reinterpret_cast&>( + __rfa_bin_host_buffer__)[x]; + } + + ///Get the bit representation of a float + static inline uint32_t& get_bits(float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a double + static inline uint64_t& get_bits(double& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const float + static inline uint32_t get_bits(const float& x) + { + return *reinterpret_cast(&x); + } + ///Get the bit representation of a const double + static inline uint64_t get_bits(const double& x) + { + return *reinterpret_cast(&x); + } + + ///Return primary vector value const ref + inline const ftype& primary(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[0]; + case 1: + if constexpr (FOLD >= 2) + return data[1]; + case 2: + if constexpr (FOLD >= 3) + return data[2]; + case 3: + if constexpr (FOLD >= 4) + return data[3]; + case 4: + if constexpr (FOLD >= 5) + return data[4]; + default: + return data[FOLD - 1]; + } + } + else + { + return data[i]; + } + } + + ///Return carry vector value const ref + inline const ftype& carry(int i) const + { + if constexpr (FOLD <= MAX_JUMP) + { + switch (i) + { + case 0: + if constexpr (FOLD >= 1) + return data[FOLD + 0]; + case 1: + if constexpr (FOLD >= 2) + return data[FOLD + 1]; + case 2: + if constexpr (FOLD >= 3) + return data[FOLD + 2]; + case 3: + if constexpr (FOLD >= 4) + return data[FOLD + 3]; + case 4: + if constexpr (FOLD >= 5) + return data[FOLD + 4]; + default: + return data[2 * FOLD - 1]; + } + } + else + { + return data[FOLD + i]; + } + } + + ///Return primary vector value ref + inline ftype& primary(int i) + { + const auto& c = *this; + return const_cast(c.primary(i)); + } + + ///Return carry vector value ref + inline ftype& carry(int i) + { + const auto& c = *this; + return const_cast(c.carry(i)); + } + +#ifdef DISABLE_ZERO + static inline constexpr bool ISZERO(const ftype) + { + return false; + } +#else + static inline constexpr bool ISZERO(const ftype x) + { + return x == 0.0; + } +#endif + +#ifdef DISABLE_NANINF + static inline constexpr int ISNANINF(const ftype) + { + return false; + } +#else + static inline constexpr int ISNANINF(const ftype x) + { + const auto bits = get_bits(x); + return (bits & ((2ull * MAX_EXP - 1) << (MANT_DIG - 1))) == + ((2ull * MAX_EXP - 1) << (MANT_DIG - 1)); + } +#endif + + static inline constexpr int EXP(const ftype x) + { + const auto bits = get_bits(x); + return (bits >> (MANT_DIG - 1)) & (2 * MAX_EXP - 1); + } + + ///Get index of float-point precision + ///The index of a non-binned type is the smallest index a binned type would + ///need to have to sum it reproducibly. Higher indices correspond to smaller + ///bins. + static inline constexpr int binned_dindex(const ftype x) + { + int exp = EXP(x); + if (exp == 0) + { + if (x == static_cast(0.0)) + { + return MAXINDEX; + } + else + { + std::frexp(x, &exp); + return (std::max)((MAX_EXP - exp) / BIN_WIDTH, MAXINDEX); + } + } + return ((MAX_EXP + EXP_BIAS) - exp) / BIN_WIDTH; + } + + ///Get index of manually specified binned double precision + ///The index of a binned type is the bin that it corresponds to. Higher + ///indices correspond to smaller bins. + inline int binned_index() const + { + return ((MAX_EXP + MANT_DIG - BIN_WIDTH + 1 + EXP_BIAS) - + EXP(primary(0))) / + BIN_WIDTH; + } + + ///Check if index of manually specified binned floating-point is 0 + ///A quick check to determine if the index is 0 + inline bool binned_index0() const + { + return EXP(primary(0)) == MAX_EXP + EXP_BIAS; + } + + ///Update manually specified binned fp with a scalar (X -> Y) + /// + ///This method updates the binned fp to an index suitable for adding numbers + ///with absolute value less than @p max_abs_val + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdupdate( + const ftype max_abs_val, const int incpriY, const int inccarY) + { + if (ISNANINF(primary(0))) + return; + + int X_index = binned_dindex(max_abs_val); + if (ISZERO(primary(0))) + { + const ftype* const bins = binned_bins(X_index); + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = bins[i]; + carry(i * inccarY) = 0.0; + } + } + else + { + int shift = binned_index() - X_index; + if (shift > 0) + { +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = primary((i - shift) * incpriY); + carry(i * inccarY) = carry((i - shift) * inccarY); + } + const ftype* const bins = binned_bins(X_index); +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int j = 0; j < FOLD; j++) + { + if (j >= shift) + break; + primary(j * incpriY) = bins[j]; + carry(j * inccarY) = 0.0; + } + } + } + } + + ///Add scalar @p X to suitably binned manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y where the index of Y is + ///larger than the index of @p X + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + void binned_dmddeposit(const ftype X, const int incpriY) + { + ftype M; + ftype x = X; + + if (ISNANINF(x) || ISNANINF(primary(0))) + { + primary(0) += x; + return; + } + + if (binned_index0()) + { + M = primary(0); + ftype qd = x * static_cast(COMPRESSION); + auto& ql = get_bits(qd); + ql |= 1; + qd += M; + primary(0) = qd; + M -= qd; + auto temp_m = (double) (((double) EXPANSION) * 0.5); + M *= static_cast(temp_m); + x += M; + x += M; +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 1; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + else + { + ftype qd = x; + auto& ql = get_bits(qd); +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 0; i < FOLD - 1; i++) + { + M = primary(i * incpriY); + qd = x; + ql |= 1; + qd += M; + primary(i * incpriY) = qd; + M -= qd; + x += M; + } + qd = x; + ql |= 1; + primary((FOLD - 1) * incpriY) += qd; + } + } + + ///Renormalize manually specified binned double precision + /// + ///Renormalization keeps the primary vector within the necessary bins by + ///shifting over to the carry vector + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + inline void binned_dmrenorm(const int incpriX, const int inccarX) + { + if (ISZERO(primary(0)) || ISNANINF(primary(0))) + return; + + for (int i = 0; i < FOLD; i++) + { + auto tmp_renormd = primary(i * incpriX); + auto& tmp_renorml = get_bits(tmp_renormd); + + carry(i * inccarX) += + (int) ((tmp_renorml >> (MANT_DIG - 3)) & 3) - 2; + + tmp_renorml &= ~(1ull << (MANT_DIG - 3)); + tmp_renorml |= 1ull << (MANT_DIG - 2); + primary(i * incpriX) = tmp_renormd; + } + } + + ///Add scalar to manually specified binned fp (Y += X) + /// + ///Performs the operation Y += X on an binned type Y + /// + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdadd(const ftype X, const int incpriY, const int inccarY) + { + binned_dmdupdate(X, incpriY, inccarY); + binned_dmddeposit(X, incpriY); + binned_dmrenorm(incpriY, inccarY); + } + + ///Convert manually specified binned fp to native double-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + double binned_conv_double(const int incpriX, const int inccarX) const + { + int i = 0; + + if (ISNANINF(primary(0))) + return (double) primary(0); + if (ISZERO(primary(0))) + return 0.0; + + double Y = 0.0; + double scale_down; + double scale_up; + int scaled; + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index <= (3 * MANT_DIG) / BIN_WIDTH) + { + scale_down = std::ldexp(0.5, 1 - (2 * MANT_DIG - BIN_WIDTH)); + scale_up = std::ldexp(0.5, 1 + (2 * MANT_DIG - BIN_WIDTH)); + scaled = (std::max)( + (std::min)(FOLD, (3 * MANT_DIG) / BIN_WIDTH - X_index), 0); + if (X_index == 0) + { + Y += ((double) carry(0)) * + ((((double) bins[0]) / 6.0) * scale_down * EXPANSION); + Y += ((double) carry(inccarX)) * + ((((double) bins[1]) / 6.0) * scale_down); + Y += ((double) primary(0) - (double) bins[0]) * scale_down * + EXPANSION; + i = 2; + } + else + { + Y += ((double) carry(0)) * + (((double) bins[0] / 6.0) * scale_down); + i = 1; + } + for (; i < scaled; i++) + { + Y += ((double) carry(i * inccarX)) * + (((double) bins[i] / 6.0) * scale_down); + Y += ((double) primary((i - 1) * incpriX) - + (double) (bins[i - 1])) * + scale_down; + } + if (i == FOLD) + { + Y += ((double) primary((FOLD - 1) * incpriX) - + (double) (bins[FOLD - 1])) * + scale_down; + return Y * scale_up; + } + if (std::isinf(Y * scale_up)) + { + return Y * scale_up; + } + Y *= scale_up; + for (; i < FOLD; i++) + { + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += ((double) primary((FOLD - 1) * incpriX) - + ((double) bins[FOLD - 1])); + } + else + { + Y += ((double) carry(0)) * ((double) bins[0] / 6.0); + for (i = 1; i < FOLD; i++) + { + Y += ((double) carry(i * inccarX)) * + ((double) bins[i] / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + } + return Y; + } + + ///Convert manually specified binned fp to native single-precision (X -> Y) + /// + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + float binned_conv_single(const int incpriX, const int inccarX) const + { + int i = 0; + double Y = 0.0; + + if (ISNANINF(primary(0))) + return primary(0); + if (ISZERO(primary(0))) + return 0.0f; + + //Note that the following order of summation is in order of decreasing + //exponent. The following code is specific to SBWIDTH=13, FLT_MANT_DIG=24, and + //the number of carries equal to 1. + const auto X_index = binned_index(); + const auto* const bins = binned_bins(X_index); + if (X_index == 0) + { + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0) * + (double) EXPANSION; + Y += (double) carry(inccarX) * + (double) (((double) bins[1]) / 6.0); + Y += (double) (primary(0) - bins[0]) * (double) EXPANSION; + i = 2; + } + else + { + Y += (double) carry(0) * (double) (((double) bins[0]) / 6.0); + i = 1; + } + for (; i < FOLD; i++) + { + Y += (double) carry(i * inccarX) * + (double) (((double) bins[i]) / 6.0); + Y += (double) (primary((i - 1) * incpriX) - bins[i - 1]); + } + Y += (double) (primary((FOLD - 1) * incpriX) - bins[FOLD - 1]); + + return (float) Y; + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + /// + ///@param other Another binned fp of the same type + ///@param incpriX stride within X's primary vector (use every incpriX'th element) + ///@param inccarX stride within X's carry vector (use every inccarX'th element) + ///@param incpriY stride within Y's primary vector (use every incpriY'th element) + ///@param inccarY stride within Y's carry vector (use every inccarY'th element) + void binned_dmdmadd(const ReproducibleFloatingAccumulator& x, + const int incpriX, const int inccarX, const int incpriY, + const int inccarY) + { + if (ISZERO(x.primary(0))) + return; + + if (ISZERO(primary(0))) + { + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + return; + } + + if (ISNANINF(x.primary(0)) || ISNANINF(primary(0))) + { + primary(0) += x.primary(0); + return; + } + + const auto X_index = x.binned_index(); + const auto Y_index = this->binned_index(); + const auto shift = Y_index - X_index; + if (shift > 0) + { + const auto* const bins = binned_bins(Y_index); +//shift Y upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = FOLD - 1; i >= 1; i--) + { + if (i < shift) + break; + primary(i * incpriY) = x.primary(i * incpriX) + + (primary((i - shift) * incpriY) - bins[i - shift]); + carry(i * inccarY) = + x.carry(i * inccarX) + carry((i - shift) * inccarY); + } +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 0; i < FOLD; i++) + { + if (i == shift) + break; + primary(i * incpriY) = x.primary(i * incpriX); + carry(i * inccarY) = x.carry(i * inccarX); + } + } + else if (shift < 0) + { + const auto* const bins = binned_bins(X_index); +//shift X upwards and add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 0; i < FOLD; i++) + { + if (i < -shift) + continue; + primary(i * incpriY) += + x.primary((i + shift) * incpriX) - bins[i + shift]; + carry(i * inccarY) += x.carry((i + shift) * inccarX); + } + } + else if (shift == 0) + { + const auto* const bins = binned_bins(X_index); +// add X to Y +#if !defined(HPX_CLANG_VERSION) + HPX_UNROLL +#endif + for (int i = 0; i < FOLD; i++) + { + primary(i * incpriY) += x.primary(i * incpriX) - bins[i]; + carry(i * inccarY) += x.carry(i * inccarX); + } + } + + binned_dmrenorm(incpriY, inccarY); + } + + ///Add two manually specified binned fp (Y += X) + ///Performs the operation Y += X + void binned_dbdbadd(const ReproducibleFloatingAccumulator& other) + { + binned_dmdmadd(other, 1, 1, 1, 1); + } + + public: + ReproducibleFloatingAccumulator() = default; + ReproducibleFloatingAccumulator( + const ReproducibleFloatingAccumulator&) = default; + ///Sets this binned fp equal to another binned fp + ReproducibleFloatingAccumulator& operator=( + const ReproducibleFloatingAccumulator&) = default; + + ///Set the binned fp to zero + void zero() + { + data = {0}; + } + + ///Return the fold of the binned fp + constexpr int fold() const + { + return FOLD; + } + + ///Return the endurance of the binned fp + constexpr size_t endurance() const + { + return ENDURANCE; + } + + ///Returns the number of reference bins. Used for judging memory usage. + constexpr size_t number_of_reference_bins() + { + return std::array::size(); + } + + ///Accumulate an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator+=(const U x) + { + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Accumulate-subtract an arithmetic @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator-=(const U x) + { + binned_dmdadd(-static_cast(x), 1, 1); + return *this; + } + + ///Accumulate a binned fp @p x into the binned fp. + ReproducibleFloatingAccumulator& operator+=( + const ReproducibleFloatingAccumulator& other) + { + binned_dbdbadd(other); + return *this; + } + + ///Accumulate-subtract a binned fp @p x into the binned fp. + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator& operator-=( + const ReproducibleFloatingAccumulator& other) + { + const auto temp = -other; + binned_dbdbadd(temp); + } + + ///Determines if two binned fp are equal + bool operator==(const ReproducibleFloatingAccumulator& other) const + { + return data == other.data; + } + + ///Determines if two binned fp are not equal + bool operator!=(const ReproducibleFloatingAccumulator& other) const + { + return !operator==(other); + } + + ///Sets this binned fp equal to the arithmetic value @p x + ///NOTE: Casts @p x to the type of the binned fp + template >* = nullptr> + ReproducibleFloatingAccumulator& operator=(const U x) + { + zero(); + binned_dmdadd(static_cast(x), 1, 1); + return *this; + } + + ///Returns the negative of this binned fp + ///NOTE: Makes a copy and performs arithmetic; slow. + ReproducibleFloatingAccumulator operator-() + { + constexpr int incpriX = 1; + constexpr int inccarX = 1; + ReproducibleFloatingAccumulator temp = *this; + if (primary(0) != 0.0) + { + const auto* const bins = binned_bins(binned_index()); + for (int i = 0; i < FOLD; i++) + { + temp.primary(i * incpriX) = + bins[i] - (primary(i * incpriX) - bins[i]); + temp.carry(i * inccarX) = -carry(i * inccarX); + } + } + return temp; + } + + ///Convert this binned fp into its native floating-point representation + ftype conv() const + { + if (std::is_same_v) + { + return static_cast(binned_conv_single(1, 1)); + } + else + { + return static_cast(binned_conv_double(1, 1)); + } + } + + ///@brief Get binned fp summation error bound + /// + ///This is a bound on the absolute error of a summation using binned types + /// + ///@param N The number of single precision floating point summands + ///@param max_abs_val The summand of maximum absolute value + ///@param binned_sum The value of the sum computed using binned types + ///@return The absolute error bound + static constexpr ftype error_bound( + const uint64_t N, const ftype max_abs_val, const ftype binned_sum) + { + const double X = std::abs(max_abs_val); + const double S = std::abs(binned_sum); + return static_cast( + (std::max)(X, std::ldexp(0.5, MIN_EXP - 1)) * + std::ldexp(0.5, (1 - FOLD) * BIN_WIDTH + 1) * N + + ((7.0 * EPSILON) / + (1.0 - 6.0 * std::sqrt(static_cast(EPSILON)) - + 7.0 * EPSILON)) * + S); + } + + ///Add @p x to the binned fp + void add(const ftype x) + { + binned_dmdadd(x, 1, 1); + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///@param first Start of range + ///@param last End of range + ///@param max_abs_val Maximum absolute value of any member of the range + template + void add(InputIt first, InputIt last, const ftype max_abs_val) + { + binned_dmdupdate(std::abs(max_abs_val), 1, 1); + size_t count = 0; + size_t N = last - first; + for (; first != last; first++, count++) + { + binned_dmddeposit(static_cast(*first), 1); + // first conditional allows compiler to remove the call here when possible + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + ///Add arithmetics in the range [first, last) to the binned fp + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param first Start of range + ///@param last End of range + template + void add(InputIt first, InputIt last) + { + const auto max_abs_val = *std::max_element( + first, last, [](const auto& a, const auto& b) { + return std::abs(a) < std::abs(b); + }); + add(first, last, static_cast(max_abs_val)); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///@param input Start of the range + ///@param N Number of elements to add + ///@param max_abs_val Maximum absolute value of any member of the range + template >* = nullptr> + void add(const T* input, const size_t N, const ftype max_abs_val) + { + if (N == 0) + return; + add(input, input + N, max_abs_val); + } + + ///Add @p N elements starting at @p input to the binned fp: [input, input+N) + /// + ///NOTE: A maximum absolute value is calculated, so two passes are made over + /// the data + /// + ///@param input Start of the range + ///@param N Number of elements to add + template >* = nullptr> + void add(const T* input, const size_t N) + { + if (N == 0) + return; + + T max_abs_val = input[0]; + for (size_t i = 0; i < N; i++) + { + max_abs_val = (std::max)(max_abs_val, std::abs(input[i])); + } + add(input, N, max_abs_val); + } + + ///Accumulate a float4 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float4& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + binned_dmddeposit(static_cast(x.z), 1); + binned_dmddeposit(static_cast(x.w), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const float2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + ///Accumulate a double2 @p x into the binned fp. + ///NOTE: Casts @p x to the type of the binned fp + ReproducibleFloatingAccumulator& operator+=(const double2& x) + { + binned_dmdupdate(abs_max(x), 1, 1); + binned_dmddeposit(static_cast(x.x), 1); + binned_dmddeposit(static_cast(x.y), 1); + return *this; + } + + void add(const float4* input, const size_t N, float max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + binned_dmddeposit(static_cast(input[i].z), 1); + binned_dmddeposit(static_cast(input[i].w), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const double2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float2* input, const size_t N, double max_abs_val) + { + if (N == 0) + return; + binned_dmdupdate(max_abs_val, 1, 1); + + size_t count = 0; + for (size_t i = 0; i < N; i++) + { + binned_dmddeposit(static_cast(input[i].x), 1); + binned_dmddeposit(static_cast(input[i].y), 1); + + if (N > ENDURANCE && count == ENDURANCE) + { + binned_dmrenorm(1, 1); + count = 0; + } + } + } + + void add(const float4* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = + fmaxf((float) max_abs_val, (float) abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const double2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = + fmax((double) max_abs_val, (double) abs_max(input[i])); + + add(input, N, max_abs_val); + } + + void add(const float2* input, const size_t N) + { + if (N == 0) + return; + + auto max_abs_val = abs_max(input[0]); + for (size_t i = 1; i < N; i++) + max_abs_val = + fmaxf((float) max_abs_val, (float) abs_max(input[i])); + + add(input, N, max_abs_val); + } + + ////////////////////////////////////// + //MANUAL OPERATIONS; USE WISELY + ////////////////////////////////////// + + ///Rebins for repeated accumulation of scalars with magnitude <= @p mav + /// + ///Once rebinned, `ENDURANCE` values <= @p mav can be added to the accumulator + ///with `unsafe_add` after which `renorm()` must be called. See the source of + ///`add()` for an example + template >* = nullptr> + void set_max_abs_val(const T mav) + { + binned_dmdupdate(std::abs(mav), 1, 1); + } + + ///Add @p x to the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` + void unsafe_add(const ftype x) + { + binned_dmddeposit(x, 1); + } + + ///Renormalizes the binned fp + /// + ///This is intended to be used after a call to `set_max_abs_val()` and one or + ///more calls to `unsafe_add()` + void renorm() + { + binned_dmrenorm(1, 1); + } + }; + +} // namespace hpx::parallel::detail::rfa diff --git a/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp new file mode 100644 index 00000000000..9920d3bd7a6 --- /dev/null +++ b/libs/core/algorithms/include/hpx/parallel/algorithms/reduce_deterministic.hpp @@ -0,0 +1,537 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +/// \file parallel/algorithms/reduce_deterministic.hpp +/// \page hpx::reduce_deterministic +/// \headerfile hpx/algorithm.hpp + +#pragma once + +#if defined(DOXYGEN) + +namespace hpx { + + // clang-format off + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a FwdIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + hpx::parallel::util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a copy_if algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns \a T otherwise. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + util::detail::algorithm_result_t + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam ExPolicy The type of the execution policy to use (deduced). + /// It describes the manner in which the execution + /// of the algorithm may be parallelized and the manner + /// in which it executes the assignments. + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of a + /// forward iterator. + /// + /// \param policy The execution policy to use for the scheduling of + /// the iterations. + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a sequenced_policy + /// execute in sequential order in the calling thread. + /// + /// The reduce operations in the parallel \a reduce algorithm invoked + /// with an execution policy object of type \a parallel_policy + /// or \a parallel_task_policy are permitted to execute in an unordered + /// fashion in unspecified threads, and indeterminately sequenced + /// within each thread. + /// + /// \returns The \a reduce algorithm returns a \a hpx::future if the + /// execution policy is of type + /// \a sequenced_task_policy or + /// \a parallel_task_policy and + /// returns T otherwise (where T is the value_type of + /// \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename hpx::parallel::util::detail::algorithm_result::value_type + >::type + reduce_deterministic(ExPolicy&& policy, FwdIter first, FwdIter last); + + /// Returns GENERALIZED_SUM(f, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// predicate \a f. + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam F The type of the function/function object to use + /// (deduced). Unlike its sequential form, the parallel + /// overload of \a reduce requires \a F to meet the + /// requirements of \a CopyConstructible. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// \param f Specifies the function (or function object) which + /// will be invoked for each of the elements in the + /// sequence specified by [first, last). This is a + /// binary predicate. The signature of this predicate + /// should be equivalent to: + /// \code + /// Ret fun(const Type1 &a, const Type1 &b); + /// \endcode \n + /// The signature does not need to have const&. + /// The types \a Type1 \a Ret must be + /// such that an object of type \a InIter can be + /// dereferenced and then implicitly converted to any + /// of those types. + /// + /// \returns The \a reduce algorithm returns \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum over the elements given by the input range + /// [first, last). + /// + /// \note GENERALIZED_SUM(op, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(op, b1, ..., bK), GENERALIZED_SUM(op, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init, F&& f); + + /// Returns GENERALIZED_SUM(+, init, *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// \tparam T The type of the value to be used as initial (and + /// intermediate) values (deduced). + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// \param init The initial value for the generalized sum. + /// + /// \returns The \a reduce algorithm returns a \a T. + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template ::value_type> + T reduce_deterministic(FwdIter first, FwdIter last, T init); + + /// Returns GENERALIZED_SUM(+, T(), *first, ..., *(first + (last - first) - 1)). + /// Executed according to the policy. + /// + /// \note Complexity: O(\a last - \a first) applications of the + /// operator+(). + /// + /// \tparam FwdIter The type of the source begin and end iterators used + /// (deduced). + /// This iterator type must meet the requirements of an + /// input iterator. + /// + /// \param first Refers to the beginning of the sequence of elements + /// the algorithm will be applied to. + /// \param last Refers to the end of the sequence of elements the + /// algorithm will be applied to. + /// + /// \returns The \a reduce algorithm returns \a T (where T is the + /// value_type of \a FwdIter). + /// The \a reduce algorithm returns the result of the + /// generalized sum (applying operator+()) over the elements given + /// by the input range [first, last). + /// + /// \note The type of the initial value (and the result type) \a T is + /// determined from the value_type of the used \a FwdIter. + /// + /// \note GENERALIZED_SUM(+, a1, ..., aN) is defined as follows: + /// * a1 when N is 1 + /// * op(GENERALIZED_SUM(+, b1, ..., bK), GENERALIZED_SUM(+, bM, ..., bN)), + /// where: + /// * b1, ..., bN may be any permutation of a1, ..., aN and + /// * 1 < K+1 = M <= N. + /// + /// The difference between \a reduce and \a accumulate is + /// that the behavior of reduce may be non-deterministic for + /// non-associative or non-commutative binary predicate. + /// + template + typename std::iterator_traits::value_type + reduce_deterministic(FwdIter first, FwdIter last); + // clang-format on +} // namespace hpx + +#else // DOXYGEN + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace hpx::parallel { + + /////////////////////////////////////////////////////////////////////////// + // reduce + namespace detail { + + /// \cond NOINTERNAL + template + struct reduce_deterministic + : public algorithm, T> + { + constexpr reduce_deterministic() noexcept + : algorithm("reduce_deterministic") + { + } + + template + static constexpr T sequential(ExPolicy&& policy, InIterB first, + InIterE last, T_&& init, Reduce&& r) + { + return hpx::parallel::detail::sequential_reduce_deterministic< + ExPolicy>(HPX_FORWARD(ExPolicy, policy), first, last, + HPX_FORWARD(T_, init), HPX_FORWARD(Reduce, r)); + } + }; + /// \endcond + } // namespace detail +} // namespace hpx::parallel + +namespace hpx { + + /////////////////////////////////////////////////////////////////////////// + // CPO for hpx::reduce + inline constexpr struct reduce_deterministic_t final + : hpx::detail::tag_parallel_algorithm + { + private: + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init, F f) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::is_execution_policy_v && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last, T init) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + HPX_FORWARD(ExPolicy, policy), first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template && + hpx::traits::is_iterator_v + )> + // clang-format on + friend hpx::parallel::util::detail::algorithm_result_t::value_type> + tag_fallback_invoke(hpx::reduce_deterministic_t, ExPolicy&& policy, + FwdIter first, FwdIter last) + { + static_assert(hpx::traits::is_forward_iterator_v, + "Requires at least forward iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(HPX_FORWARD(ExPolicy, policy), first, last, value_type{}, + std::plus<>{}); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init, F f) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), HPX_MOVE(f)); + } + + // clang-format off + template ::value_type, + HPX_CONCEPT_REQUIRES_( + hpx::traits::is_iterator_v + )> + // clang-format on + friend T tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last, T init) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + return hpx::parallel::detail::reduce_deterministic().call( + hpx::execution::seq, first, last, HPX_MOVE(init), + std::plus<>{}); + } + + // clang-format off + template + )> + // clang-format on + friend typename std::iterator_traits::value_type + tag_fallback_invoke( + hpx::reduce_deterministic_t, InIter first, InIter last) + { + static_assert(hpx::traits::is_input_iterator_v, + "Requires at least input iterator."); + + using value_type = + typename std::iterator_traits::value_type; + + return hpx::parallel::detail::reduce_deterministic() + .call(hpx::execution::seq, first, last, value_type{}, + std::plus<>()); + } + } reduce_deterministic{}; +} // namespace hpx + +#endif // DOXYGEN diff --git a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt index 7d3a9204530..559ee830030 100644 --- a/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt +++ b/libs/core/algorithms/tests/unit/algorithms/CMakeLists.txt @@ -94,6 +94,7 @@ set(tests partition_copy reduce_ reduce_by_key + reduce_deterministic remove remove1 remove2 diff --git a/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp new file mode 100644 index 00000000000..5a06c509efd --- /dev/null +++ b/libs/core/algorithms/tests/unit/algorithms/reduce_deterministic.cpp @@ -0,0 +1,276 @@ +// Copyright (c) 2024 Shreyas Atre +// +// SPDX-License-Identifier: BSL-1.0 +// Distributed under the Boost Software License, Version 1.0. (See accompanying +// file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "test_utils.hpp" + +int seed = std::random_device{}(); +std::mt19937 gen(seed); + +template +T get_rand(T LO = (std::numeric_limits::min)(), + T HI = (std::numeric_limits::max)()) +{ + return LO + + static_cast(std::rand()) / + (static_cast(static_cast((RAND_MAX)) / (HI - LO))); +} + +/////////////////////////////////////////////////////////////////////////////// + +template +void test_reduce1(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result i.e. correct computation + using base_iterator_det = + typename std::vector::iterator; + using iterator_det = test::test_iterator; + + using base_iterator_ndet = + typename std::vector::iterator; + using iterator_ndet = test::test_iterator; + + std::vector deterministic(LEN); + std::vector nondeterministic(LEN); + + std::iota( + deterministic.begin(), deterministic.end(), FloatTypeDeterministic(0)); + + std::iota(nondeterministic.begin(), nondeterministic.end(), + FloatTypeNonDeterministic(0)); + + FloatTypeDeterministic val_det(0); + FloatTypeNonDeterministic val_non_det(0); + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + // verify values + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + + HPX_TEST_EQ(static_cast(r1), r3); + HPX_TEST_EQ(static_cast(r2), r3); +} + +template +void test_reduce_determinism(IteratorTag) +{ + // check if different type for deterministic and nondeeterministic + // and same result + using base_iterator_det = + typename std::vector::iterator; + using iterator_det = test::test_iterator; + + constexpr FloatTypeDeterministic num_bounds_det = + std::is_same_v ? 1000.0 : 1000000.0; + + std::vector deterministic(LEN); + + for (size_t i = 0; i < LEN; ++i) + { + deterministic[i] = + get_rand(-num_bounds_det, num_bounds_det); + } + + std::vector deterministic_shuffled = deterministic; + + std::shuffle( + deterministic_shuffled.begin(), deterministic_shuffled.end(), gen); + + FloatTypeDeterministic val_det(41.999); + + auto op = [](FloatTypeDeterministic v1, FloatTypeDeterministic v2) { + return v1 + v2; + }; + + FloatTypeDeterministic r1 = + hpx::reduce_deterministic(iterator_det(std::begin(deterministic)), + iterator_det(std::end(deterministic)), val_det, op); + + FloatTypeDeterministic r1_shuffled = hpx::reduce_deterministic( + iterator_det(std::begin(deterministic_shuffled)), + iterator_det(std::end(deterministic_shuffled)), val_det, op); + + HPX_TEST_EQ(r1, + r1_shuffled); // Deterministically calculated, should always satisfy +} + +/// This test function is never called because it is not guaranteed to pass +/// It serves an important purpose to demonstrate that floating point summation +/// is not always associative i.e. a+b+c != a+c+b +template +void test_orig_reduce_determinism(IteratorTag) +{ + using base_iterator_ndet = + typename std::vector::iterator; + using iterator_ndet = test::test_iterator; + + constexpr auto num_bounds_ndet = + std::is_same_v ? 1000.0f : 1000000.0f; + + std::vector nondeterministic(LEN); + for (size_t i = 0; i < LEN; ++i) + { + nondeterministic[i] = get_rand( + -num_bounds_ndet, num_bounds_ndet); + } + std::vector nondeterministic_shuffled = + nondeterministic; + std::shuffle(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), gen); + + FloatTypeNonDeterministic val_non_det(41.999); + + auto op = [](FloatTypeNonDeterministic v1, FloatTypeNonDeterministic v2) { + return v1 + v2; + }; + + FloatTypeNonDeterministic r2 = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic)), + iterator_ndet(std::end(nondeterministic)), val_non_det, op); + FloatTypeNonDeterministic r2_shuffled = hpx::reduce(hpx::execution::seq, + iterator_ndet(std::begin(nondeterministic_shuffled)), + iterator_ndet(std::end(nondeterministic_shuffled)), val_non_det, op); + + FloatTypeNonDeterministic r3 = std::accumulate( + nondeterministic.begin(), nondeterministic.end(), val_non_det); + FloatTypeNonDeterministic r3_shuffled = + std::accumulate(nondeterministic_shuffled.begin(), + nondeterministic_shuffled.end(), val_non_det); + + /// failed around 131 times out of 1000 on macOS arm + /// Floating point addition is not necessarily associative, + /// might fail on an architecture not yet known with much higher precision + HPX_TEST_NEQ(r2, r2_shuffled); + HPX_TEST_NEQ(r3, r3_shuffled); +} + +template +void test_reduce1() +{ + using namespace hpx::execution; + + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); + test_reduce1(IteratorTag()); +} + +template +void test_reduce2() +{ + using namespace hpx::execution; + + test_reduce_determinism(IteratorTag()); + test_reduce_determinism(IteratorTag()); +} + +// template +// void test_reduce3() +// { +// using namespace hpx::execution; + +// test_orig_reduce_determinism(IteratorTag()); +// test_orig_reduce_determinism(IteratorTag()); +// } + +void reduce_test1() +{ + test_reduce1(); + test_reduce2(); + // test_reduce3(); + test_reduce1(); + test_reduce2(); +} + +/////////////////////////////////////////////////////////////////////////////// +int hpx_main(hpx::program_options::variables_map& vm) +{ + unsigned int seed = (unsigned int) std::time(nullptr); + bool seed_random = false; + + if (vm.count("seed")) + { + seed = vm["seed"].as(); + seed_random = true; + } + + if (vm.count("seed-random")) + seed_random = vm["seed-random"].as(); + + if (seed_random) + { + std::cout << "using seed: " << seed << std::endl; + std::cout << "** std::accumulate, hpx::reduce may fail due to " + "non-determinism of the floating summation" + << std::endl; + gen.seed(seed); + std::srand(seed); + } + else + { + gen.seed(223); + std::srand(223); + } + + reduce_test1(); + + return hpx::local::finalize(); +} + +int main(int argc, char* argv[]) +{ + // add command line option which controls the random number generator seed + using namespace hpx::program_options; + options_description desc_commandline( + "Usage: " HPX_APPLICATION_STRING " [options]"); + + desc_commandline.add_options()("seed,s", value(), + "the random number generator seed to use for this run"); + desc_commandline.add_options()("seed-random", value(), + "switch for the random number generator seed to use for this run"); + + // By default this test should run on all available cores + std::vector const cfg = {"hpx.os_threads=all"}; + + // Initialize and run HPX + hpx::local::init_params init_args; + init_args.desc_cmdline = desc_commandline; + init_args.cfg = cfg; + + HPX_TEST_EQ_MSG(hpx::local::init(hpx_main, argc, argv, init_args), 0, + "HPX main exited with non-zero status"); + + return hpx::util::report_errors(); +}