diff --git a/core/components/bit_packed_storage.hpp b/core/components/bit_packed_storage.hpp index 137eff5e417..7b3d5d5a8f2 100644 --- a/core/components/bit_packed_storage.hpp +++ b/core/components/bit_packed_storage.hpp @@ -31,6 +31,20 @@ constexpr int ceil_log2_constexpr(T value) } +/** + * Computes the rounded-down binary logarithm of a number in constexpr fashion. + * In performance-critical runtime contexts, use floor_log2 instead. + */ +template +constexpr int floor_log2_constexpr(T value) +{ + if (value == 1) { + return 0; + } + return 1 + floor_log2_constexpr(value / 2); +} + + /** * Computes the next larger (or equal) power of two for a number in constexpr * fashion. @@ -42,6 +56,17 @@ constexpr int round_up_pow2_constexpr(T value) } +/** + * Computes the next smaller (or equal) power of two for a number in constexpr + * fashion. + */ +template +constexpr int round_down_pow2_constexpr(T value) +{ + return T{1} << floor_log2_constexpr(value); +} + + /** * Computes the rounded-up binary logarithm of a number. */ @@ -56,6 +81,18 @@ constexpr int ceil_log2(T value) } +/** + * Computes the rounded-down binary logarithm of a number. + */ +template +constexpr int floor_log2(T value) +{ + assert(value >= 1); + return detail::find_highest_bit( + static_cast>(value)); +} + + /** * Computes the next larger (or equal) power of two for a number. */ @@ -66,12 +103,22 @@ constexpr int round_up_pow2(T value) } +/** + * Computes the next smaller (or equal) power of two for a number. + */ +template +constexpr int round_down_pow2(T value) +{ + return T{1} << floor_log2(value); +} + + /** * A compact representation of a span of unsigned integers with num_bits bits. * Each integer gets stored using round_up_pow2(num_bits) bits inside a WordType * word. */ -template +template class bit_packed_span { static_assert(std::is_unsigned_v); @@ -92,38 +139,40 @@ class bit_packed_span { return bits_per_word_log2 - bits_per_value_log2(num_bits); } - constexpr static size_type storage_size(size_type size, int num_bits) + constexpr static IndexType storage_size(IndexType size, int num_bits) { const auto shift = values_per_word_log2(num_bits); const auto div = WordType{1} << shift; return (size + div - 1) >> shift; } - constexpr void set_from_zero(size_type i, WordType value) + constexpr void set_from_zero(IndexType i, WordType value) { + assert(value >= 0); + assert(value <= mask_); const auto [block, shift] = get_block_and_shift(i); data_[block] |= value << shift; } - constexpr void clear(size_type i) + constexpr void clear(IndexType i) { const auto [block, shift] = get_block_and_shift(i); data_[block] &= ~(mask_ << shift); } - constexpr void set(size_type i, WordType value) + constexpr void set(IndexType i, WordType value) { clear(i); set_from_zero(i, value); } - constexpr WordType get(size_type i) const + constexpr WordType get(IndexType i) const { const auto [block, shift] = get_block_and_shift(i); return (data_[block] >> shift) & mask_; } - constexpr std::pair get_block_and_shift(size_type i) const + constexpr std::pair get_block_and_shift(IndexType i) const { assert(i >= 0); assert(i < size_); @@ -132,7 +181,7 @@ class bit_packed_span { } explicit constexpr bit_packed_span(WordType* data, int num_bits, - size_type size) + IndexType size) : data_{data}, size_{size}, mask_{(WordType{1} << num_bits) - 1}, @@ -145,7 +194,7 @@ class bit_packed_span { private: WordType* data_; - size_type size_; + IndexType size_; WordType mask_; int bits_per_value_; int values_per_word_log2_; @@ -177,7 +226,7 @@ class bit_packed_array { constexpr void set_from_zero(int i, int value) { - assert(value > 0); + assert(value >= 0); assert(value <= mask); data_[i / values_per_word] |= value << ((i % values_per_word) * bits_per_value); diff --git a/core/components/range_minimum_query.hpp b/core/components/range_minimum_query.hpp index bac7d36ee6d..21b53fa935f 100644 --- a/core/components/range_minimum_query.hpp +++ b/core/components/range_minimum_query.hpp @@ -125,127 +125,6 @@ struct cartesian_tree { }; -constexpr int ceil_log2_constexpr(int value) -{ - if (value == 1) { - return 0; - } - return 1 + ceil_log2_constexpr((value + 1) / 2); -} - - -constexpr int round_up_pow2_constexpr(int value) -{ - return 1 << ceil_log2_constexpr(value); -} - - -template -constexpr int ceil_log2(IndexType value) -{ - assert(value >= 1); - return value > 1 - ? detail::find_highest_bit( - static_cast>(value - 1)) + - 1 - : 0; -} - - -template -constexpr int round_up_pow2(IndexType value) -{ - return IndexType{1} << ceil_log2(value); -} - - -/** - * A compact representation of a span of unsigned integers with num_bits bits. - * Each integer gets stored using round_up_pow2_constexpr(num_bits) bits inside - * a WordType word. - */ -template -class bit_packed_span { - static_assert(std::is_unsigned_v); - -public: - constexpr static int bits_per_word = sizeof(WordType) * CHAR_BIT; - constexpr static int bits_per_word_log2 = - ceil_log2_constexpr(bits_per_word); - - // the constexpr here is only signalling for CUDA, - // since we don't have if consteval to switch between two implementations - constexpr static int bits_per_value_log2(int num_bits) - { - return ceil_log2(num_bits); - } - - constexpr static int values_per_word_log2(int num_bits) - { - return bits_per_word_log2 - bits_per_value_log2(num_bits); - } - - constexpr static size_type storage_size(size_type size, int num_bits) - { - const auto shift = values_per_word_log2(num_bits); - const auto div = WordType{1} << shift; - return (size + div - 1) >> shift; - } - - constexpr void set_from_zero(size_type i, WordType value) - { - const auto [block, shift] = get_block_and_shift(i); - data_[block] |= value << shift; - } - - constexpr void clear(size_type i) - { - const auto [block, shift] = get_block_and_shift(i); - data_[block] &= ~(mask_ << shift); - } - - constexpr void set(size_type i, WordType value) - { - clear(i); - set_from_zero(i, value); - } - - constexpr WordType get(size_type i) const - { - const auto [block, shift] = get_block_and_shift(i); - return (data_[block] >> shift) & mask_; - } - - constexpr std::pair get_block_and_shift(size_type i) const - { - assert(i >= 0); - assert(i < size_); - return std::make_pair(i >> values_per_word_log2_, - (i & local_index_mask_) * bits_per_value_); - } - - explicit constexpr bit_packed_span(WordType* data, int num_bits, - size_type size) - : data_{data}, - size_{size}, - mask_{(WordType{1} << num_bits) - 1}, - bits_per_value_{round_up_pow2(num_bits)}, - values_per_word_log2_{values_per_word_log2(num_bits)}, - local_index_mask_{(1 << values_per_word_log2_) - 1} - { - assert(bits_per_value_ <= bits_per_word); - } - -private: - WordType* data_; - size_type size_; - WordType mask_; - int bits_per_value_; - int values_per_word_log2_; - int local_index_mask_; -}; - - } // namespace detail @@ -256,7 +135,7 @@ class block_range_minimum_query_lookup_table { // how many trees does the lookup table (LUT) contain? constexpr static int num_trees = tree::num_trees; // how many bits do we need theoretically for this block? - constexpr static int num_bits = detail::ceil_log2_constexpr(block_size); + constexpr static int num_bits = ceil_log2_constexpr(block_size); constexpr block_range_minimum_query_lookup_table() : lookup_table{} { @@ -315,11 +194,31 @@ class range_minimum_query_superblocks { using storage_type = std::make_unsigned_t; constexpr static auto index_type_bits = 8 * sizeof(index_type); - range_minimum_query_superblocks(index_type* values, storage_type* storage, - IndexType size) - : values_{values}, storage_{storage}, size_{size} + range_minimum_query_superblocks(storage_type* storage, index_type size) + : storage_{storage}, size_{size} {} + constexpr int get(int block_size_log2_m1, size_type index) const + { + return get_level(block_size_log2_m1).get(index); + } + + constexpr void set(int block_size_log2_m1, size_type index, + index_type value) + { + get_level(block_size_log2_m1).set(index, value); + } + + constexpr static std::array + compute_block_offset_lookup() + { + std::array result{}; + for (int i = 1; i <= index_type_bits; i++) { + result[i] = result[i - 1] + compute_block_storage_size(i); + } + return result; + } + constexpr int get_offset(int block_size_log2_m1) const { constexpr auto offsets = compute_block_offset_lookup(); @@ -328,36 +227,60 @@ class range_minimum_query_superblocks { return offsets[block_size_log2_m1] * get_num_blocks(); } - constexpr int get(int block_size_log2_m1, size_type index) + constexpr index_type size() const { return size_; } + + constexpr index_type storage_size() const { - const auto values = storage_ + get_offset(block_size_log2_m1); - // TODO fix - return values[index]; + return compute_storage_size(size()); } - constexpr IndexType get_num_blocks() const + constexpr int num_levels() const { return compute_num_levels(size()); } + + constexpr static index_type compute_storage_size(index_type size) { - return (size_ + index_type_bits - 1) / index_type_bits; + return compute_block_offset_lookup()[compute_num_levels(size)] * + get_num_blocks(size); } - constexpr static std::array - compute_block_offset_lookup() + constexpr static int compute_num_levels(index_type size) { - std::array result{}; - for (int i = 1; i < index_type_bits; i++) { - result[i] = result[i - 1] + compute_block_storage_size(i); - } - return result; + return size > 1 ? (size > 2 ? ceil_log2(size) - 1 : 1) : 0; } private: + constexpr index_type get_num_blocks() const + { + return get_num_blocks(size_); + } + + constexpr static index_type get_num_blocks(index_type size) + { + return (size + index_type_bits - 1) / index_type_bits; + } + constexpr static int compute_block_storage_size(int block_size_log2) { - return detail::round_up_pow2_constexpr(block_size_log2); + return round_up_pow2_constexpr(block_size_log2); + } + + constexpr bit_packed_span get_level( + int block_size_log2_m1) const + { + const auto values = storage_ + get_offset(block_size_log2_m1); + const int num_bits = round_up_pow2(block_size_log2_m1 + 1); + return bit_packed_span{values, num_bits, + size_}; + } + + constexpr bit_packed_span get_level( + int block_size_log2_m1) + { + const auto values = storage_ + get_offset(block_size_log2_m1); + const int num_bits = round_up_pow2(block_size_log2_m1 + 1); + return bit_packed_span{values, num_bits, + size_}; } - // These are the values we query range minima for - IndexType* values_; // The storage stores the range minimum for every power-of-two block that is // smaller than size. There are n - 1 ranges of size 2, n - 3 ranges of size // 4, n - 7 ranges of size 8, ... so in total we have n log n ranges. @@ -365,9 +288,9 @@ class range_minimum_query_superblocks { // information for all n ranges, where we add infinity padding to the end. // Ranges of size 2 need 1 bit, ranges of size 4 need 2 bits, ranges of size // 8 need 3 bits, ... but for better memory access patterns, we always make - // sure every value from the range fits into a full IndexType word. + // sure every value from the range fits into a full index_type word. storage_type* storage_; - IndexType size_; + index_type size_; }; diff --git a/core/components/range_minimum_query_kernels.hpp b/core/components/range_minimum_query_kernels.hpp index 4ea9bd36125..5600f6f02ae 100644 --- a/core/components/range_minimum_query_kernels.hpp +++ b/core/components/range_minimum_query_kernels.hpp @@ -15,6 +15,7 @@ #include #include "core/base/kernel_declaration.hpp" +#include "core/components/bit_packed_storage.hpp" namespace gko { @@ -22,22 +23,23 @@ namespace kernels { #define GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL(IndexType) \ - void compute_lookup_small(std::shared_ptr exec, \ - const IndexType* values, IndexType size, \ - block_argmin_storage_type& block_argmin, \ - uint16* block_type) + void compute_lookup_small( \ + std::shared_ptr exec, const IndexType* values, \ + IndexType size, block_argmin_storage_type& block_argmin, \ + IndexType* block_min, uint16* block_type) #define GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_LARGE_KERNEL(IndexType) \ void compute_lookup_large( \ - std::shared_ptr exec, const IndexType* values, \ - const block_argmin_storage_type& block_argmin, IndexType size, \ + std::shared_ptr exec, \ + const IndexType* block_min, IndexType num_blocks, \ range_minimum_query_superblocks& superblocks) #define GKO_DECLARE_ALL_AS_TEMPLATES \ constexpr int small_block_size = 8; \ - using block_argmin_storage_type = detail::bit_packed_span; \ + template \ + using block_argmin_storage_type = bit_packed_span; \ template \ GKO_DECLARE_RANGE_MINIMUM_QUERY_COMPUTE_LOOKUP_SMALL_KERNEL(IndexType); \ template \ diff --git a/core/test/components/bit_packed_storage.cpp b/core/test/components/bit_packed_storage.cpp index bc29ee5b0a5..d74deb90961 100644 --- a/core/test/components/bit_packed_storage.cpp +++ b/core/test/components/bit_packed_storage.cpp @@ -12,18 +12,20 @@ #include "gtest/gtest.h" -template +template class BitPackedSpan : public ::testing::Test { public: - using word_type = WordType; - using bit_packed_span = gko::bit_packed_span; + using index_type = std::tuple_element_t<0, IndexWordType>; + using word_type = std::tuple_element_t<1, IndexWordType>; + using bit_packed_span = gko::bit_packed_span; std::default_random_engine rng{2457}; }; -using WordTypes = ::testing::Types; +using WordTypes = ::testing::Types, + std::pair>; -TYPED_TEST_SUITE(BitPackedSpan, WordTypes, TypenameNameGenerator); +TYPED_TEST_SUITE(BitPackedSpan, WordTypes, PairTypenameNameGenerator); TYPED_TEST(BitPackedSpan, Works) @@ -54,10 +56,8 @@ TYPED_TEST(BitPackedSpan, Works) val = dist2(this->rng); } - bit_packed_span span{packed_data.data(), num_bits, - static_cast(size)}; - bit_packed_span span2{packed_data2.data(), num_bits, - static_cast(size)}; + bit_packed_span span{packed_data.data(), num_bits, size}; + bit_packed_span span2{packed_data2.data(), num_bits, size}; for (const auto i : gko::irange{size}) { span.set_from_zero(i, data[i]); span2.set(i, data[i]); diff --git a/core/test/components/range_minimum_query.cpp b/core/test/components/range_minimum_query.cpp index 16a30182a10..620195a7b3c 100644 --- a/core/test/components/range_minimum_query.cpp +++ b/core/test/components/range_minimum_query.cpp @@ -23,7 +23,7 @@ TEST(RangeMinimumQuery, RepresentativesAreExhaustive) const auto rep_tree_number = tree::compute_tree_index(reps[tree_number]); - ASSERT_EQ(tree_number, rep_tree_number); + EXPECT_EQ(tree_number, rep_tree_number); } while (std::next_permutation(values, values + size)); } @@ -40,7 +40,7 @@ TEST(RangeMinimumQuery, RepresentativesLargeAreExhaustive) const auto rep_tree_number = tree::compute_tree_index(reps[tree_number]); - ASSERT_EQ(tree_number, rep_tree_number); + EXPECT_EQ(tree_number, rep_tree_number); } while (std::next_permutation(values, values + size)); } @@ -49,7 +49,7 @@ TEST(RangeMinimumQuery, LookupRepresentatives) { constexpr auto size = 8; using tree = gko::detail::cartesian_tree; - constexpr gko::block_range_minimum_query_lookup_table table; + /*constexpr*/ gko::block_range_minimum_query_lookup_table table; auto reps = tree::compute_tree_representatives(); for (const auto& rep : reps) { const auto tree = tree::compute_tree_index(rep); @@ -60,7 +60,7 @@ TEST(RangeMinimumQuery, LookupRepresentatives) const auto min_pos = first > last ? 0 : std::min_element(begin, end) - rep; - ASSERT_EQ(table.lookup(tree, first, last), min_pos); + EXPECT_EQ(table.lookup(tree, first, last), min_pos); } } } @@ -82,7 +82,7 @@ TEST(RangeMinimumQuery, LookupExhaustive) std::min_element(values + first, values + last + 1) - values; - ASSERT_EQ(lookup_val, actual_val); + EXPECT_EQ(lookup_val, actual_val); } } } while (std::next_permutation(values, values + size)); @@ -92,44 +92,75 @@ TEST(RangeMinimumQuery, LookupExhaustive) TEST(RangeMinimumQuery, OffsetsAreCorrect) { constexpr auto data = gko::range_minimum_query_superblocks< - int>::compute_block_offset_lookup(); + gko::int32>::compute_block_offset_lookup(); constexpr auto data_long = gko::range_minimum_query_superblocks< - long>::compute_block_offset_lookup(); - ASSERT_EQ(data[0], 0); - ASSERT_EQ(data_long[0], 0); + gko::int64>::compute_block_offset_lookup(); + EXPECT_EQ(data[0], 0); + EXPECT_EQ(data_long[0], 0); // blocks of size 2^1 need 1 bit each - ASSERT_EQ(data[1], 1); - ASSERT_EQ(data_long[1], 1); + EXPECT_EQ(data[1], 1); + EXPECT_EQ(data_long[1], 1); // blocks of size 2^2 need 2 bits each - ASSERT_EQ(data[2], 3); - ASSERT_EQ(data_long[2], 3); + EXPECT_EQ(data[2], 3); + EXPECT_EQ(data_long[2], 3); // blocks of size 2^3 need 4 bits each - ASSERT_EQ(data[3], 7); - ASSERT_EQ(data_long[3], 7); + EXPECT_EQ(data[3], 7); + EXPECT_EQ(data_long[3], 7); // blocks of size 2^4 need 4 bits each - ASSERT_EQ(data[4], 11); - ASSERT_EQ(data_long[4], 11); + EXPECT_EQ(data[4], 11); + EXPECT_EQ(data_long[4], 11); // blocks of size 2^5 need 8 bits each - ASSERT_EQ(data[5], 19); - ASSERT_EQ(data_long[5], 19); + EXPECT_EQ(data[5], 19); + EXPECT_EQ(data_long[5], 19); // blocks of size 2^6 need 8 bits each - ASSERT_EQ(data[6], 27); - ASSERT_EQ(data_long[6], 27); + EXPECT_EQ(data[6], 27); + EXPECT_EQ(data_long[6], 27); // blocks of size 2^7 need 8 bits each - ASSERT_EQ(data[7], 35); - ASSERT_EQ(data_long[7], 35); + EXPECT_EQ(data[7], 35); + EXPECT_EQ(data_long[7], 35); // blocks of size 2^8 need 8 bits each - ASSERT_EQ(data[8], 43); - ASSERT_EQ(data_long[8], 43); + EXPECT_EQ(data[8], 43); + EXPECT_EQ(data_long[8], 43); // blocks of size 2^9 - 2^16 need 16 bits each - ASSERT_EQ(data[9], 59); - ASSERT_EQ(data_long[9], 59); - ASSERT_EQ(data[16], 171); - ASSERT_EQ(data_long[16], 171); + EXPECT_EQ(data[9], 59); + EXPECT_EQ(data_long[9], 59); + EXPECT_EQ(data[16], 171); + EXPECT_EQ(data_long[16], 171); // blocks of size 2^17-2^32 need 32 bits each - ASSERT_EQ(data[31], 651); - ASSERT_EQ(data_long[31], 651); - ASSERT_EQ(data_long[32], 683); + EXPECT_EQ(data[31], 651); + EXPECT_EQ(data_long[31], 651); + EXPECT_EQ(data_long[32], 683); // blocks of size 2^33-2^64 need 64 bits each - ASSERT_EQ(data_long[63], 2667); + EXPECT_EQ(data_long[63], 2667); +} + + +TEST(RangeMinimumQuery, NumLevelsIsCorrect) +{ + using superblocks32 = gko::range_minimum_query_superblocks; + using superblocks64 = gko::range_minimum_query_superblocks; + EXPECT_EQ(superblocks32::compute_num_levels(0), 0); + EXPECT_EQ(superblocks64::compute_num_levels(0), 0); + EXPECT_EQ(superblocks32::compute_num_levels(1), 0); + EXPECT_EQ(superblocks64::compute_num_levels(1), 0); + EXPECT_EQ(superblocks32::compute_num_levels(2), 1); + EXPECT_EQ(superblocks64::compute_num_levels(2), 1); + EXPECT_EQ(superblocks32::compute_num_levels(3), 1); + EXPECT_EQ(superblocks64::compute_num_levels(3), 1); + EXPECT_EQ(superblocks32::compute_num_levels(4), 1); + EXPECT_EQ(superblocks64::compute_num_levels(4), 1); + EXPECT_EQ(superblocks32::compute_num_levels(5), 2); + EXPECT_EQ(superblocks64::compute_num_levels(5), 2); + EXPECT_EQ(superblocks32::compute_num_levels(8), 2); + EXPECT_EQ(superblocks64::compute_num_levels(8), 2); + EXPECT_EQ(superblocks32::compute_num_levels(9), 3); + EXPECT_EQ(superblocks64::compute_num_levels(9), 3); + EXPECT_EQ(superblocks32::compute_num_levels(16), 3); + EXPECT_EQ(superblocks64::compute_num_levels(16), 3); + EXPECT_EQ(superblocks32::compute_num_levels(17), 4); + EXPECT_EQ(superblocks64::compute_num_levels(17), 4); + EXPECT_EQ(superblocks32::compute_num_levels(32), 4); + EXPECT_EQ(superblocks64::compute_num_levels(32), 4); + EXPECT_EQ(superblocks32::compute_num_levels(33), 5); + EXPECT_EQ(superblocks64::compute_num_levels(33), 5); } diff --git a/omp/factorization/elimination_forest_kernels.cpp b/omp/factorization/elimination_forest_kernels.cpp index 20bf210bab2..12e59c04764 100644 --- a/omp/factorization/elimination_forest_kernels.cpp +++ b/omp/factorization/elimination_forest_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -121,6 +121,7 @@ void compute_subtree_sizes( const auto size = static_cast(forest.parents.get_size()); const auto child_ptrs = forest.child_ptrs.get_const_data(); const auto children = forest.children.get_const_data(); + gko::vector finished(size, exec); for (const auto node : irange{size}) { IndexType local_size{1}; const auto child_begin = child_ptrs[node]; @@ -131,6 +132,7 @@ void compute_subtree_sizes( local_size += subtree_sizes[child]; } subtree_sizes[node] = local_size; + finished[node] = true; } } @@ -153,7 +155,6 @@ void compute_subtree_euler_path_sizes( const auto child_end = child_ptrs[node + 1]; for (const auto child_idx : irange{child_begin, child_end}) { const auto child = children[child_idx]; - assert(finished[child]); // euler path: follow the edge into the subtree, traverse, follow // edge back local_size += subtree_euler_path_sizes[child] + 2; diff --git a/reference/components/range_minimum_query_kernels.cpp b/reference/components/range_minimum_query_kernels.cpp index ba626a8d81e..04c5433d50e 100644 --- a/reference/components/range_minimum_query_kernels.cpp +++ b/reference/components/range_minimum_query_kernels.cpp @@ -1,4 +1,4 @@ -// SPDX-FileCopyrightText: 2017 - 2024 The Ginkgo authors +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors // // SPDX-License-Identifier: BSD-3-Clause @@ -6,6 +6,9 @@ #include +#include + +#include "core/components/bit_packed_storage.hpp" #include "core/components/range_minimum_query.hpp" @@ -18,8 +21,8 @@ namespace range_minimum_query { template void compute_lookup_small(std::shared_ptr exec, const IndexType* values, IndexType size, - block_argmin_storage_type& block_argmin, - uint16* block_type) + block_argmin_storage_type& block_argmin, + IndexType* block_min, uint16* block_type) { using tree_index_type = std::decay_t; using lut_type = @@ -37,10 +40,13 @@ void compute_lookup_small(std::shared_ptr exec, : std::numeric_limits::max(); } const auto tree_number = table.compute_tree_index(local_values); - const auto min_it = std::min_element(values, values + small_block_size); - const auto min_idx = static_cast(std::distance(values, min_it)); + const auto min_it = + std::min_element(local_values, local_values + small_block_size); + const auto min_idx = + static_cast(std::distance(local_values, min_it)); const auto block_idx = i / small_block_size; block_argmin.set(block_idx, min_idx); + block_min[block_idx] = *min_it; block_type[block_idx] = static_cast(tree_number); } } @@ -51,23 +57,36 @@ GKO_INSTANTIATE_FOR_EACH_INDEX_TYPE( template void compute_lookup_large( - std::shared_ptr exec, const IndexType* values, - const block_argmin_storage_type& block_argmin, IndexType size, + std::shared_ptr exec, const IndexType* block_min, + IndexType num_blocks, range_minimum_query_superblocks& superblocks) { constexpr auto infinity = std::numeric_limits::max(); - const auto num_small_blocks = ceildiv(size, small_block_size); // initialize the first level of blocks - for (IndexType i = 0; i < num_small_blocks; i += 2) { - const auto min1 = values[i * small_block_size + block_argmin.get(i)]; - const auto min2 = - i + 1 < num_small_blocks - ? values[i * small_block_size + block_argmin.get(i)] - : infinity; - superblocks.set(0, i / 2, min1 < min2 ? 0 : 1); + for (IndexType i = 0; i < num_blocks; i += 2) { + const auto min1 = block_min[i]; + const auto min2 = i + 1 < num_blocks ? block_min[i + 1] : infinity; + // we need to use <= here to make sure ties always break to the left + superblocks.set(0, i, min1 <= min2 ? 0 : 1); } - for (IndexType block_size = 4; block_size < size; block_size *= 2) { - const auto min1 = superblocks.get() + // we computed argmins for blocks of size 2, now recursively combine them. + // the biggest block size needs to be able to span any subrange of [0, size) + // subtracting 1 prevents an unnecessary last level at power-of-two sizes + const auto num_levels = floor_log2(num_blocks - 1); + for (int block_level = 1; block_level < num_levels; block_level++) { + const auto block_size = IndexType{1} << (block_level + 1); + for (const auto i : irange{num_blocks}) { + const auto i2 = i + block_size / 2; + const auto argmin1 = i + superblocks.get(block_level - 1, i); + const auto argmin2 = i2 < num_blocks + ? i2 + superblocks.get(block_level - 1, i2) + : argmin1; + const auto min1 = block_min[argmin1]; + const auto min2 = block_min[argmin2]; + // we need to use <= here to make sure ties always break to the left + superblocks.set(block_level, i, + min1 <= min2 ? argmin1 - i : argmin2 - i); + } } } diff --git a/reference/test/components/CMakeLists.txt b/reference/test/components/CMakeLists.txt index e9737f5c106..b17880ab32d 100644 --- a/reference/test/components/CMakeLists.txt +++ b/reference/test/components/CMakeLists.txt @@ -3,4 +3,5 @@ ginkgo_create_test(fill_array_kernels) ginkgo_create_test(format_conversion_kernels) ginkgo_create_test(precision_conversion_kernels) ginkgo_create_test(prefix_sum_kernels) +ginkgo_create_test(range_minimum_query_kernels) ginkgo_create_test(reduce_array_kernels) diff --git a/reference/test/components/range_minimum_query_kernels.cpp b/reference/test/components/range_minimum_query_kernels.cpp new file mode 100644 index 00000000000..e13ddd7e23c --- /dev/null +++ b/reference/test/components/range_minimum_query_kernels.cpp @@ -0,0 +1,137 @@ +// SPDX-FileCopyrightText: 2017 - 2025 The Ginkgo authors +// +// SPDX-License-Identifier: BSD-3-Clause + +#include "core/components/range_minimum_query_kernels.hpp" + +#include +#include +#include +#include + +#include + +#include "core/base/index_range.hpp" +#include "core/components/range_minimum_query.hpp" +#include "core/test/utils.hpp" + + +template +class RangeMinimumQuery : public ::testing::Test { +protected: + using index_type = IndexType; + using word_type = std::make_unsigned_t; + using block_argmin_storage_type = + gko::kernels::reference::range_minimum_query::block_argmin_storage_type< + IndexType>; + using superblock_storage_type = + gko::range_minimum_query_superblocks; + RangeMinimumQuery() : ref{gko::ReferenceExecutor::create()}, rng{167349} {} + + std::vector create_random_values(index_type size) + { + std::vector values(size); + std::uniform_int_distribution dist(0, 10000); + for (auto& value : values) { + value = dist(this->rng); + } + return values; + } + + std::shared_ptr ref; + std::default_random_engine rng; +}; + +TYPED_TEST_SUITE(RangeMinimumQuery, gko::test::IndexTypes, + TypenameNameGenerator); + + +TYPED_TEST(RangeMinimumQuery, ComputeLookupSmall) +{ + using index_type = typename TestFixture::index_type; + using block_argmin_storage_type = + typename TestFixture::block_argmin_storage_type; + constexpr auto block_size = + gko::kernels::reference::range_minimum_query::small_block_size; + constexpr auto block_argmin_num_bits = gko::ceil_log2_constexpr(block_size); + // keep these in sync with small_block_size + for (index_type size : + {0, 1, 2, 3, 10, 15, 16, 17, 25, 127, 128, 129, 1023}) { + SCOPED_TRACE(size); + const auto values = this->create_random_values(size); + const auto num_blocks = static_cast(gko::ceildiv( + size, + gko::kernels::reference::range_minimum_query::small_block_size)); + std::vector block_argmin_storage( + block_argmin_storage_type::storage_size(num_blocks, + block_argmin_num_bits)); + block_argmin_storage_type block_argmin{ + block_argmin_storage.data(), block_argmin_num_bits, num_blocks}; + std::vector block_min(num_blocks); + std::vector block_type(num_blocks); + gko::block_range_minimum_query_lookup_table small_lut; + + gko::kernels::reference::range_minimum_query::compute_lookup_small( + this->ref, values.data(), size, block_argmin, block_min.data(), + block_type.data()); + + for (auto block : gko::irange{num_blocks}) { + SCOPED_TRACE(block); + const auto block_begin = values.begin() + block * block_size; + const auto block_end = + values.begin() + std::min(size, (block + 1) * block_size); + const auto block_local_size = block_end - block_begin; + const auto min_it = std::min_element(block_begin, block_end); + const auto min_value = *min_it; + const auto min_pos = min_it - block_begin; + ASSERT_EQ(min_pos, block_argmin.get(block)); + ASSERT_EQ(min_value, block_min[block]); + const auto tree = block_type[block]; + for (auto first : gko::irange{block_local_size}) { + for (auto last : gko::irange{first, block_local_size}) { + const auto argmin = std::distance( + block_begin, std::min_element(block_begin + first, + block_begin + last + 1)); + ASSERT_EQ(argmin, small_lut.lookup(static_cast(tree), + first, last)) + << "in range [" << first << "," << last << "]"; + } + } + } + } +} + +TYPED_TEST(RangeMinimumQuery, ComputeLookupLarge) +{ + using index_type = typename TestFixture::index_type; + using superblock_storage_type = + typename TestFixture::superblock_storage_type; + using word_type = typename TestFixture::word_type; + constexpr auto block_size = + gko::kernels::reference::range_minimum_query::small_block_size; + // keep these in sync with small_block_size + for (index_type num_blocks : + {2, 3, 10, 15, 16, 17, 25, 127, 128, 129, 1023}) { + SCOPED_TRACE(num_blocks); + const auto values = this->create_random_values(num_blocks); + std::vector block_min(num_blocks); + std::vector superblock_storage( + superblock_storage_type::compute_storage_size(num_blocks)); + superblock_storage_type superblocks(superblock_storage.data(), + num_blocks); + + gko::kernels::reference::range_minimum_query::compute_lookup_large( + this->ref, block_min.data(), num_blocks, superblocks); + + for (auto level : gko::irange(superblocks.num_levels())) { + const auto block_size = index_type{1} << (level + 1); + for (auto block : gko::irange(num_blocks)) { + const auto begin = block_min.begin() + block; + const auto end = block_min.begin() + + std::min(block + block_size, num_blocks); + const auto argmin = std::min_element(begin, end) - begin; + ASSERT_EQ(superblocks.get(level, block), argmin); + } + } + } +}