Performance improvement in libcudf case conversion for long strings (rapidsai#15441)

Improves the overall logic efficiency of strings case conversion and reworks the specialized kernels for long strings to improve parallelization within each string.
Closes rapidsai#15406
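
As a rough illustration of the approach (an editor's sketch, not code from this commit): each long string's bytes are split into 32-byte sub-blocks whose boundaries are nudged forward so they never land inside a multi-byte UTF-8 character, and each sub-block is then converted independently. The helper below is a hypothetical host-side analogue of the device logic added in cpp/src/strings/case.cu; it drops out-of-range points where the device code clamps them to the end offset.

    #include <cstdint>
    #include <string>
    #include <vector>

    std::vector<int64_t> split_points(std::string const& chars, int64_t block_size = 32)
    {
      std::vector<int64_t> points;
      auto const size = static_cast<int64_t>(chars.size());
      for (int64_t pos = block_size; pos < size; pos += block_size) {
        auto p = pos;
        // a UTF-8 continuation byte looks like 0b10xxxxxx; never split in front of one
        while (p < size && (static_cast<unsigned char>(chars[p]) & 0xC0) == 0x80) { ++p; }
        if (p < size) { points.push_back(p); }
      }
      return points;
    }

For example, if a 2-byte 'é' straddles bytes 31-32, the first split point slides from 32 to 33 so the character stays whole.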

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)
  - Bradley Dice (https://github.com/bdice)

URL: rapidsai#15441
davidwendt authored Apr 12, 2024
1 parent 2e00cb1 commit f5df665
Showing 2 changed files with 168 additions and 71 deletions.
232 changes: 164 additions & 68 deletions cpp/src/strings/case.cu
@@ -19,6 +19,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/case.hpp>
 #include <cudf/strings/detail/char_tables.hpp>
@@ -34,6 +35,9 @@

 #include <cuda/atomic>
 #include <cuda/functional>
+#include <thrust/for_each.h>
+#include <thrust/merge.h>
+#include <thrust/transform.h>

 namespace cudf {
 namespace strings {
@@ -110,23 +114,22 @@ struct convert_char_fn {
  *
  * This can be used in calls to make_strings_children.
  */
-struct upper_lower_fn {
+struct base_upper_lower_fn {
   convert_char_fn converter;
-  column_device_view d_strings;
   size_type* d_offsets{};
   char* d_chars{};

-  __device__ void operator()(size_type idx) const
+  base_upper_lower_fn(convert_char_fn converter) : converter(converter) {}
+
+  __device__ inline void process_string(string_view d_str, size_type idx) const
   {
-    if (d_strings.is_null(idx)) {
-      if (!d_chars) d_offsets[idx] = 0;
-      return;
-    }
-    auto const d_str = d_strings.element<string_view>(idx);
-    size_type bytes = 0;
-    char* d_buffer  = d_chars ? d_chars + d_offsets[idx] : nullptr;
-    for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) {
-      auto const size = converter.process_character(*itr, d_buffer);
+    size_type bytes = 0;
+    char* d_buffer  = d_chars ? d_chars + d_offsets[idx] : nullptr;
+    for (auto itr = d_str.data(); itr < (d_str.data() + d_str.size_bytes()); ++itr) {
+      if (is_utf8_continuation_char(static_cast<u_char>(*itr))) continue;
+      char_utf8 chr = 0;
+      to_char_utf8(itr, chr);
+      auto const size = converter.process_character(chr, d_buffer);
       if (d_buffer) {
         d_buffer += size;
       } else {
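
The rewritten loop walks raw bytes and skips UTF-8 continuation bytes so each character is converted exactly once. For reference (a sketch, not part of the commit), a continuation byte carries the bit pattern 10xxxxxx:

    // Equivalent spelling of the continuation-byte test used throughout this file.
    constexpr bool is_continuation_byte(unsigned char b) { return (b & 0xC0) == 0x80; }
    // e.g. 'É' encodes as 0xC3 0x89: conversion happens at 0xC3, and 0x89 is skipped.
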
@@ -137,45 +140,116 @@
   }
 };

+struct upper_lower_fn : public base_upper_lower_fn {
+  column_device_view d_strings;
+
+  upper_lower_fn(convert_char_fn converter, column_device_view const& d_strings)
+    : base_upper_lower_fn{converter}, d_strings{d_strings}
+  {
+  }
+
+  __device__ void operator()(size_type idx) const
+  {
+    if (d_strings.is_null(idx)) {
+      if (!d_chars) { d_offsets[idx] = 0; }
+      return;
+    }
+    auto const d_str = d_strings.element<string_view>(idx);
+    process_string(d_str, idx);
+  }
+};
+
+// Long strings are divided into smaller strings using this value as a guide.
+// Generally strings are split into sub-blocks of bytes of this size but
+// care is taken to not sub-block in the middle of a multi-byte character.
+constexpr size_type LS_SUB_BLOCK_SIZE = 32;
+
 /**
- * @brief Count output bytes in warp-parallel threads
+ * @brief Produces sub-offsets for the chars in the given strings column
  */
+struct sub_offset_fn {
+  char const* d_input_chars;
+  int64_t first_offset;
+  int64_t last_offset;
+
+  __device__ int64_t operator()(int64_t idx) const
+  {
+    auto const end = d_input_chars + last_offset;
+    auto position  = (idx + 1) * LS_SUB_BLOCK_SIZE;
+    auto begin     = d_input_chars + first_offset + position;
+    while ((begin < end) && is_utf8_continuation_char(static_cast<u_char>(*begin))) {
+      ++begin;
+      ++position;
+    }
+    return (begin < end) ? position + first_offset : last_offset;
+  }
+};
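
A hedged worked example of sub_offset_fn with assumed values:

    // Assume first_offset = 0, last_offset = 96, LS_SUB_BLOCK_SIZE = 32 (sub_count = 3):
    //   idx 0 -> position 32, nudged to 33 if byte 32 is a UTF-8 continuation byte
    //   idx 1 -> position 64
    //   idx 2 -> position 96, so begin == end and last_offset (96) is returned
    // Boundaries only ever move forward, so the resulting sub-offsets stay sorted.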
+
+/**
+ * @brief Specialized case conversion for long strings
  *
- * This executes as one warp per string and just computes the output sizes.
+ * This is needed since the offset count can exceed size_type.
+ * Also, nulls are ignored since this purely builds the output chars.
+ * The d_offsets are only temporary to help address the sub-blocks.
+ */
-struct count_bytes_fn {
-  convert_char_fn converter;
-  column_device_view d_strings;
-  size_type* d_offsets;
+struct upper_lower_ls_fn : public base_upper_lower_fn {
+  char const* d_input_chars;
+  int64_t* d_input_offsets;  // includes column offset

+  upper_lower_ls_fn(convert_char_fn converter, char const* d_input_chars, int64_t* d_input_offsets)
+    : base_upper_lower_fn{converter}, d_input_chars{d_input_chars}, d_input_offsets{d_input_offsets}
+  {
+  }
+
+  // idx is row index
   __device__ void operator()(size_type idx) const
   {
-    auto const str_idx  = idx / cudf::detail::warp_size;
-    auto const lane_idx = idx % cudf::detail::warp_size;
-
-    // initialize the output for the atomicAdd
-    if (lane_idx == 0) { d_offsets[str_idx] = 0; }
-    __syncwarp();
-
-    if (d_strings.is_null(str_idx)) { return; }
-    auto const d_str   = d_strings.element<string_view>(str_idx);
-    auto const str_ptr = d_str.data();
-
-    size_type size = 0;
-    for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
-      auto const chr = str_ptr[i];
-      if (is_utf8_continuation_char(chr)) { continue; }
-      char_utf8 u8 = 0;
-      to_char_utf8(str_ptr + i, u8);
-      size += converter.process_character(u8);
-    }
-    // this is every so slightly faster than using the cub::warp_reduce
-    if (size > 0) {
-      cuda::atomic_ref<size_type, cuda::thread_scope_block> ref{*(d_offsets + str_idx)};
-      ref.fetch_add(size, cuda::std::memory_order_relaxed);
-    }
+    auto const offset = d_input_offsets[idx];
+    auto const d_str  = string_view{d_input_chars + offset,
+                                    static_cast<size_type>(d_input_offsets[idx + 1] - offset)};
+    process_string(d_str, idx);
   }
 };
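
Each consecutive pair in d_input_offsets now delimits one work item, so a single long row becomes several short ones for the output pass. A hedged illustration with assumed offsets:

    // d_input_offsets = {0, 32, 64, 77} describes three sub-strings:
    //   idx 0 -> bytes [0, 32), idx 1 -> bytes [32, 64), idx 2 -> bytes [64, 77)
    // process_string() converts each one independently.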

+/**
+ * @brief Count output bytes in warp-parallel threads
+ *
+ * This executes as one warp per string and just computes the output sizes.
+ */
+CUDF_KERNEL void count_bytes_kernel(convert_char_fn converter,
+                                    column_device_view d_strings,
+                                    size_type* d_sizes)
+{
+  auto idx = cudf::detail::grid_1d::global_thread_id();
+  if (idx >= (d_strings.size() * cudf::detail::warp_size)) { return; }
+
+  auto const str_idx  = idx / cudf::detail::warp_size;
+  auto const lane_idx = idx % cudf::detail::warp_size;
+
+  // initialize the output for the atomicAdd
+  if (lane_idx == 0) { d_sizes[str_idx] = 0; }
+  __syncwarp();
+
+  if (d_strings.is_null(str_idx)) { return; }
+  auto const d_str   = d_strings.element<string_view>(str_idx);
+  auto const str_ptr = d_str.data();
+
+  size_type size = 0;
+  for (auto i = lane_idx; i < d_str.size_bytes(); i += cudf::detail::warp_size) {
+    auto const chr = str_ptr[i];
+    if (is_utf8_continuation_char(chr)) { continue; }
+    char_utf8 u8 = 0;
+    to_char_utf8(str_ptr + i, u8);
+    size += converter.process_character(u8);
+  }
+  // this is slightly faster than using the cub::warp_reduce
+  if (size > 0) {
+    cuda::atomic_ref<size_type, cuda::thread_scope_block> ref{*(d_sizes + str_idx)};
+    ref.fetch_add(size, cuda::std::memory_order_relaxed);
+  }
+}
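
One warp cooperates on one string: lane k visits bytes k, k + 32, k + 64, ... and the per-lane counts meet in a block-scoped atomic add. A hedged sketch of the index arithmetic (warp_size is 32 on current NVIDIA GPUs):

    // global thread id t:   t = blockIdx.x * blockDim.x + threadIdx.x
    // string (row) index:   t / 32
    // lane within the warp: t % 32
    // so a column of n rows needs n * 32 threads in total.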

 /**
  * @brief Special functor for processing ASCII-only data
  */
@@ -208,11 +282,18 @@ std::unique_ptr<column> convert_case(strings_column_view const& input,
   auto const d_cases   = get_character_cases_table();
   auto const d_special = get_special_case_mapping_table();

+  auto const first_offset = (input.offset() == 0) ? 0L
+                                                  : cudf::strings::detail::get_offset_value(
+                                                      input.offsets(), input.offset(), stream);
+  auto const last_offset =
+    cudf::strings::detail::get_offset_value(input.offsets(), input.size() + input.offset(), stream);
+  auto const chars_size = last_offset - first_offset;
+
   convert_char_fn ccfn{case_flag, d_flags, d_cases, d_special};
   upper_lower_fn converter{ccfn, *d_strings};

   // For smaller strings, use the regular string-parallel algorithm
-  if ((input.chars_size(stream) / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
+  if ((chars_size / (input.size() - input.null_count())) < AVG_CHAR_BYTES_THRESHOLD) {
     auto [offsets, chars] =
       cudf::strings::detail::make_strings_children(converter, input.size(), stream, mr);
     return make_strings_column(input.size(),
@@ -235,40 +316,55 @@
     [] __device__(auto chr) { return is_utf8_continuation_char(chr); })) > 0;
   if (!multi_byte_chars) {
     // optimization for ASCII-only case: copy the input column and inplace replace each character
-    auto result           = std::make_unique<column>(input.parent(), stream, mr);
-    auto d_chars          = result->mutable_view().head<char>();
-    auto const chars_size = strings_column_view(result->view()).chars_size(stream);
+    auto result  = std::make_unique<column>(input.parent(), stream, mr);
+    auto d_chars = result->mutable_view().head<char>();
     thrust::transform(
       rmm::exec_policy(stream), d_chars, d_chars + chars_size, d_chars, ascii_converter_fn{ccfn});
     result->set_null_count(input.null_count());
     return result;
   }

   // This will use a warp-parallel algorithm to compute the output sizes for each string
   // and then uses the normal string parallel functor to build the output.
-  auto offsets = make_numeric_column(
-    data_type{type_to_id<size_type>()}, input.size() + 1, mask_state::UNALLOCATED, stream, mr);
-  auto d_offsets = offsets->mutable_view().data<size_type>();
-
-  // first pass, compute output sizes
-  // note: tried to use segmented-reduce approach instead here and it was consistently slower
-  count_bytes_fn counter{ccfn, *d_strings, d_offsets};
-  auto const count_itr = thrust::make_counting_iterator<size_type>(0);
-  thrust::for_each_n(
-    rmm::exec_policy(stream), count_itr, input.size() * cudf::detail::warp_size, counter);
-
-  // convert sizes to offsets
-  auto const bytes =
-    cudf::detail::sizes_to_offsets(d_offsets, d_offsets + input.size() + 1, d_offsets, stream);
-  CUDF_EXPECTS(bytes <= std::numeric_limits<size_type>::max(),
-               "Size of output exceeds the column size limit",
-               std::overflow_error);
-
-  rmm::device_uvector<char> chars(bytes, stream, mr);
-  // second pass, write output
-  converter.d_offsets = d_offsets;
-  converter.d_chars   = chars.data();
-  thrust::for_each_n(rmm::exec_policy(stream), count_itr, input.size(), converter);
+  auto [offsets, bytes] = [&] {
+    rmm::device_uvector<size_type> sizes(input.size(), stream);
+    constexpr int block_size = 512;
+    cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
+    count_bytes_kernel<<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      ccfn, *d_strings, sizes.data());
+    // convert sizes to offsets
+    return cudf::strings::detail::make_offsets_child_column(sizes.begin(), sizes.end(), stream, mr);
+  }();
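
For reference, grid_1d sizes the launch with a ceiling division over the total thread count; a hedged example with assumed numbers:

    // n = 1000 rows, block_size = 512:
    //   total threads = 1000 * 32 = 32000
    //   num_blocks    = (32000 + 512 - 1) / 512 = 63
    // make_offsets_child_column then scans the per-row sizes into offsets.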
+
+  // build sub-offsets
+  auto const input_chars = input.chars_begin(stream);
+  auto const sub_count   = chars_size / LS_SUB_BLOCK_SIZE;
+  auto tmp_offsets       = rmm::device_uvector<int64_t>(sub_count + input.size() + 1, stream);
+  {
+    rmm::device_uvector<size_type> sub_offsets(sub_count, stream);
+    auto const count_itr = thrust::make_counting_iterator<size_type>(0);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      count_itr,
+                      count_itr + sub_count,
+                      sub_offsets.data(),
+                      sub_offset_fn{input_chars, first_offset, last_offset});
+
+    // merge them with input offsets
+    auto input_offsets =
+      cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
+    thrust::merge(rmm::exec_policy_nosync(stream),
+                  input_offsets,
+                  input_offsets + input.size() + 1,
+                  sub_offsets.begin(),
+                  sub_offsets.end(),
+                  tmp_offsets.begin());
+  }
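
Both merge inputs are already sorted, so the result interleaves real row boundaries with the fixed-stride sub-offsets; a hedged example with assumed values:

    // row offsets {0, 45, 80} merged with sub-offsets {32, 64} gives
    //   tmp_offsets = {0, 32, 45, 64, 80}
    // i.e. sub-strings of 32, 13, 19, and 16 bytes, none splitting a character.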
+
+  // run case conversion over the new sub-strings
+  auto const tmp_size = static_cast<size_type>(tmp_offsets.size()) - 1;
+  upper_lower_ls_fn sub_conv{ccfn, input_chars, tmp_offsets.data()};
+  auto chars =
+    std::get<1>(cudf::strings::detail::make_strings_children(sub_conv, tmp_size, stream, mr));

   return make_strings_column(input.size(),
                              std::move(offsets),
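
End to end, the new path stays behind the existing public API; a minimal hedged usage sketch (the wrapper function name is illustrative):

    #include <cudf/column/column.hpp>
    #include <cudf/strings/case.hpp>
    #include <cudf/strings/strings_column_view.hpp>

    std::unique_ptr<cudf::column> upper_of(cudf::column_view const& strings)
    {
      // rows averaging >= AVG_CHAR_BYTES_THRESHOLD bytes now take the sub-block path
      return cudf::strings::to_upper(cudf::strings_column_view{strings});
    }
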
7 changes: 4 additions & 3 deletions cpp/tests/strings/case_tests.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -235,7 +235,7 @@ TEST_F(StringsCaseTest, LongStrings)
 {
   // average string length >= AVG_CHAR_BYTES_THRESHOLD as defined in case.cu
   cudf::test::strings_column_wrapper input{
-    "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
+    "abcdéfghijklmnopqrstuvwxyzABCDÉFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()_+=- ",
     "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
     "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=- ",
     "ABCDÉFGHIJKLMNOPQRSTUVWXYZabcdéfghijklmnopqrstuvwxyz1234567890!@#$%^&*()_+=-"};
Expand All @@ -256,7 +256,8 @@ TEST_F(StringsCaseTest, LongStrings)
results = cudf::strings::to_upper(view);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);

results = cudf::strings::to_upper(cudf::strings_column_view(cudf::slice(input, {1, 3}).front()));
view = cudf::strings_column_view(cudf::slice(input, {1, 3}).front());
results = cudf::strings::to_upper(view);
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, cudf::slice(expected, {1, 3}).front());
}

