Added polynomials benchmark #17695

Open · wants to merge 11 commits into base: branch-25.02
11 changes: 9 additions & 2 deletions cpp/benchmarks/CMakeLists.txt
@@ -344,11 +344,18 @@ ConfigureNVBench(CSV_WRITER_NVBENCH io/csv/csv_writer.cpp)

# ##################################################################################################
# * ast benchmark ---------------------------------------------------------------------------------
-ConfigureNVBench(AST_NVBENCH ast/transform.cpp)
+ConfigureNVBench(AST_NVBENCH ast/polynomials.cpp ast/transform.cpp)

# ##################################################################################################
# * binaryop benchmark ----------------------------------------------------------------------------
-ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp)
+ConfigureNVBench(
+  BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binaryop.cpp binaryop/polynomials.cpp
+)

+# ##################################################################################################
+# * transform benchmark ---------------------------------------------------------------------------
+ConfigureNVBench(TRANSFORM_NVBENCH transform/polynomials.cpp)

# ##################################################################################################
# * nvtext benchmark -------------------------------------------------------------------
94 changes: 94 additions & 0 deletions cpp/benchmarks/ast/polynomials.cpp
@@ -0,0 +1,94 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/ast/expressions.hpp>
#include <cudf/column/column.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/table/table.hpp>
#include <cudf/transform.hpp>
#include <cudf/utilities/error.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/nvbench.cuh>
#include <nvbench/types.cuh>

#include <random>

template <typename key_type>
static void BM_ast_polynomials(nvbench::state& state)
{
auto const num_rows = static_cast<cudf::size_type>(state.get_int64("num_rows"));
auto const order = static_cast<cudf::size_type>(state.get_int64("order"));

CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0");

data_profile profile;
profile.set_distribution_params(cudf::type_to_id<key_type>(),
distribution_id::NORMAL,
static_cast<key_type>(0),
static_cast<key_type>(1));
auto table = create_random_table({cudf::type_to_id<key_type>()}, row_count{num_rows}, profile);
auto column_view = table->get_column(0);

std::vector<cudf::numeric_scalar<key_type>> constants;
{
std::random_device random_device;
std::mt19937 generator{random_device()};  // seed the engine from the random device
std::uniform_real_distribution<key_type> distribution{0, 1};

std::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(order + 1),
std::back_inserter(constants),
[&](int) { return distribution(generator); });
}

cudf::ast::tree tree{};

auto& column_ref = tree.push(cudf::ast::column_reference{0});

// Evaluates the polynomial with Horner's method:
// (((a*x + b)*x + c)*x + d)*x + e = a*x^4 + b*x^3 + c*x^2 + d*x + e (order-4 case shown)
tree.push(cudf::ast::literal{constants[0]});

for (cudf::size_type i = 0; i < order; i++) {
auto& product =
tree.push(cudf::ast::operation{cudf::ast::ast_operator::MUL, tree.back(), column_ref});
auto& constant = tree.push(cudf::ast::literal{constants[i + 1]});
tree.push(cudf::ast::operation{cudf::ast::ast_operator::ADD, product, constant});
}

// Account for the bytes read from and written to global memory (one input column, one output column)
state.add_global_memory_reads<key_type>(num_rows);
state.add_global_memory_writes<key_type>(num_rows);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
cudf::scoped_range range{"benchmark_iteration"};
cudf::compute_column(*table, tree.back(), launch.get_stream().get_stream());
});
}

#define AST_POLYNOMIAL_BENCHMARK_DEFINE(name, key_type) \
static void name(::nvbench::state& st) { ::BM_ast_polynomials<key_type>(st); } \
NVBENCH_BENCH(name) \
.set_name(#name) \
.add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \
.add_int64_axis("order", {1, 2, 4, 8, 16, 32})

AST_POLYNOMIAL_BENCHMARK_DEFINE(ast_polynomials_float32, float);

AST_POLYNOMIAL_BENCHMARK_DEFINE(ast_polynomials_float64, double);
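For orientation, this is the tree the loop above builds, hand-unrolled for order = 2. It is a sketch reusing the benchmark's own constants vector and the same cudf::ast calls, not part of the diff:

cudf::ast::tree tree{};
auto& x  = tree.push(cudf::ast::column_reference{0});
auto& c0 = tree.push(cudf::ast::literal{constants[0]});
auto& m0 = tree.push(cudf::ast::operation{cudf::ast::ast_operator::MUL, c0, x});
auto& c1 = tree.push(cudf::ast::literal{constants[1]});
auto& s0 = tree.push(cudf::ast::operation{cudf::ast::ast_operator::ADD, m0, c1});
auto& m1 = tree.push(cudf::ast::operation{cudf::ast::ast_operator::MUL, s0, x});
auto& c2 = tree.push(cudf::ast::literal{constants[2]});
tree.push(cudf::ast::operation{cudf::ast::ast_operator::ADD, m1, c2});
// tree.back() is now the root node: ((c0 * x + c1) * x + c2), evaluated once per row

The tree owns every pushed node and push returns a reference to it, which is why the loop can keep chaining tree.back() without dangling references.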
101 changes: 101 additions & 0 deletions cpp/benchmarks/binaryop/polynomials.cpp
@@ -0,0 +1,101 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/binaryop.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/types.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <random>

template <typename key_type>
static void BM_binaryop_polynomials(nvbench::state& state)
{
auto const num_rows{static_cast<cudf::size_type>(state.get_int64("num_rows"))};
auto const order{static_cast<cudf::size_type>(state.get_int64("order"))};

CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0");

data_profile profile;
profile.set_distribution_params(cudf::type_to_id<key_type>(),
distribution_id::NORMAL,
static_cast<key_type>(0),
static_cast<key_type>(1));
auto table = create_random_table({cudf::type_to_id<key_type>()}, row_count{num_rows}, profile);
auto column_view = table->get_column(0);

std::vector<cudf::numeric_scalar<key_type>> constants;
{
std::random_device random_device;
std::mt19937 generator{random_device()};  // seed the engine from the random device
std::uniform_real_distribution<key_type> distribution{0, 1};

std::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(order + 1),
std::back_inserter(constants),
[&](int) { return cudf::numeric_scalar<key_type>(distribution(generator)); });
}

// Account for the bytes read from and written to global memory (one input column, one output column)
state.add_global_memory_reads<key_type>(num_rows);
state.add_global_memory_writes<key_type>(num_rows);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
// Evaluates the polynomial with Horner's method:
// (((a*x + b)*x + c)*x + d)*x + e = a*x^4 + b*x^3 + c*x^2 + d*x + e (order-4 case shown)
cudf::scoped_range range{"benchmark_iteration"};
rmm::cuda_stream_view stream{launch.get_stream().get_stream()};
std::vector<std::unique_ptr<cudf::column>> intermediates;

auto result = cudf::make_column_from_scalar(constants[0], num_rows, stream);

for (cudf::size_type i = 0; i < order; i++) {
auto product = cudf::binary_operation(result->view(),
column_view,
cudf::binary_operator::MUL,
cudf::data_type{cudf::type_to_id<key_type>()},
stream);
auto sum = cudf::binary_operation(product->view(),
constants[i + 1],
cudf::binary_operator::ADD,
cudf::data_type{cudf::type_to_id<key_type>()},
stream);
intermediates.push_back(std::move(product));
intermediates.push_back(std::move(result));
result = std::move(sum);
}
});
}

#define BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(name, key_type) \
\
static void name(::nvbench::state& st) { ::BM_binaryop_polynomials<key_type>(st); } \
NVBENCH_BENCH(name) \
.set_name(#name) \
.add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \
.add_int64_axis("order", {1, 2, 4, 8, 16, 32})

BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(binaryop_polynomials_float32, float);

BINARYOP_POLYNOMIALS_BENCHMARK_DEFINE(binaryop_polynomials_float64, double);
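As a reference for what all three benchmarks compute, here is the same Horner recurrence on the host; a minimal sketch, not part of the diff. Each iteration of the recurrence corresponds to one MUL and one ADD kernel in the binaryop version above, so an order-n polynomial costs 2n kernel launches plus the intermediate columns kept alive in intermediates:

#include <cstddef>
#include <vector>

// Evaluates c[0]*x^n + c[1]*x^(n-1) + ... + c[n], where n = coeffs.size() - 1.
template <typename T>
T horner(std::vector<T> const& coeffs, T x)
{
  T acc = coeffs[0];
  for (std::size_t i = 1; i < coeffs.size(); ++i) {
    acc = acc * x + coeffs[i];  // one multiply and one add per order
  }
  return acc;
}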
108 changes: 108 additions & 0 deletions cpp/benchmarks/transform/polynomials.cpp
@@ -0,0 +1,108 @@
/*
* Copyright (c) 2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <benchmarks/common/generate_input.hpp>

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/transform.hpp>
#include <cudf/types.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <nvbench/nvbench.cuh>

#include <algorithm>
#include <random>

template <typename key_type>
static void BM_transform_polynomials(nvbench::state& state)
{
auto const num_rows{static_cast<cudf::size_type>(state.get_int64("num_rows"))};
auto const order{static_cast<cudf::size_type>(state.get_int64("order"))};

CUDF_EXPECTS(order > 0, "Polynomial order must be greater than 0");

data_profile profile;
profile.set_distribution_params(cudf::type_to_id<key_type>(),
distribution_id::NORMAL,
static_cast<key_type>(0),
static_cast<key_type>(1));
auto table = create_random_table({cudf::type_to_id<key_type>()}, row_count{num_rows}, profile);
auto column_view = table->get_column(0);

std::vector<key_type> constants;

{
std::random_device random_device;
std::mt19937 generator{random_device()};  // seed the engine from the random device
std::uniform_real_distribution<key_type> distribution{0, 1};

std::transform(thrust::make_counting_iterator(0),
thrust::make_counting_iterator(order + 1),
std::back_inserter(constants),
[&](int) { return distribution(generator); });
}

// Account for the bytes read from and written to global memory (one input column, one output column)
state.add_global_memory_reads<key_type>(num_rows);
state.add_global_memory_writes<key_type>(num_rows);

state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
// Evaluates the polynomial with Horner's method:
// (((a*x + b)*x + c)*x + d)*x + e = a*x^4 + b*x^3 + c*x^2 + d*x + e (order-4 case shown)

cudf::scoped_range range{"benchmark_iteration"};

std::string expr = std::to_string(constants[0]);

for (cudf::size_type i = 0; i < order; i++) {
expr = "( " + expr + " ) * x + " + std::to_string(constants[i + 1]);
[Review comment · Contributor]

Do we think it's a legitimate benchmark if we hardcode these constants? I think it would be fairer to the other AST/binaryop benchmarks if we provided an array of device scalar pointers that must be dereferenced for each multiplication.

A different way to think about this: do we want to JIT a new kernel for every possible set of constants and/or every possible order of polynomial? If we compute multiple polynomials, does that JIT overhead pay for itself, or do we need to assume that we will amortize the JIT overhead across multiple polynomials?

[Reply · Contributor Author]

The transform API is pretty limited presently; we don't support multiple input columns/scalars in it. As we discussed with Spark-Rapids, we would need to support that to meet their needs, but I don't think it would make much of a performance/throughput difference for this benchmark.

No, it wouldn't be okay to JIT a new kernel for each constant, but it would be reasonable to JIT a new kernel for each polynomial order.

> If we compute multiple polynomials, does that JIT overhead pay for itself, or do we need to assume that we will amortize the JIT overhead across multiple polynomials?

I don't think it would pay for itself across multiple polynomials. Although there's a program cache, I don't have enough insight into its internals; I'll investigate and get back to you.

}

static_assert(std::is_same_v<key_type, float> || std::is_same_v<key_type, double>);
std::string type = std::is_same_v<key_type, float> ? "float" : "double";

std::string udf = R"***(
__device__ inline void compute_polynomial (
)***" + type + R"***(* out,
)***" + type + R"***( x
)
{
*out = )***" + expr +
R"***(;
}
)***";

cudf::transform(column_view,
udf,
cudf::data_type{cudf::type_to_id<key_type>()},
false,
launch.get_stream().get_stream());
});
}

#define TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(name, key_type) \
\
static void name(::nvbench::state& st) { ::BM_transform_polynomials<key_type>(st); } \
NVBENCH_BENCH(name) \
.set_name(#name) \
.add_int64_axis("num_rows", {100'000, 1'000'000, 10'000'000, 100'000'000}) \
.add_int64_axis("order", {1, 2, 4, 8, 16, 32})

TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(transform_polynomials_float32, float);

TRANSFORM_POLYNOMIALS_BENCHMARK_DEFINE(transform_polynomials_float64, double);
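To make the JIT-caching question from the review thread concrete: for order = 2 and key_type = double, the string assembly above generates a UDF like the one below (the coefficient values shown are hypothetical placeholders for whatever std::to_string produced). Because the coefficients are embedded literally in the source text, every distinct coefficient set yields a different source string, so a program cache keyed on the source would recompile; only repeating the same coefficients and order would hit the cache.

// Hypothetical output of the benchmark's string assembly for order = 2
__device__ inline void compute_polynomial (
double* out,
double x
)
{
  *out = ( ( 0.814724 ) * x + 0.905792 ) * x + 0.126987;
}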