From 0311216a3fe8f53bc7e89cbfe147f6fde6715aad Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani
Date: Fri, 3 Jan 2025 11:17:08 -0800
Subject: [PATCH 01/19] Use rapids-cmake for the logger (#17674)

This PR switches cudf to use rapids-cmake to fetch rapids-logger so that it
uses a version consistent with the rest of RAPIDS, avoiding cases where
transitive CPM loads build multiple packages from source that each require a
different version of rapids-logger. This PR also cherry-picks the Python docs
changes from https://github.com/rapidsai/cudf/pull/17669 so that our Sphinx
docs can build again without warnings.

Depends on https://github.com/rapidsai/rapids-cmake/pull/737 and
https://github.com/rapidsai/rmm/pull/1776. Contributes to
rapidsai/build-planning#104.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17674
---
 cpp/CMakeLists.txt                            |  9 ++---
 .../user_guide/api_docs/general_functions.rst | 34 +++++++++----------
 docs/cudf/source/user_guide/api_docs/io.rst   |  8 ++---
 3 files changed, 24 insertions(+), 27 deletions(-)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index 8c6cd922747..cb814aa8c0f 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -274,11 +274,8 @@ endif()
 # add third party dependencies using CPM
 rapids_cpm_init()
 
-# Not using rapids-cmake since we never want to find, always download.
-CPMAddPackage(
-  NAME rapids_logger GITHUB_REPOSITORY rapidsai/rapids-logger GIT_SHALLOW FALSE GIT_TAG
-  c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55 VERSION c510947ae9d3a67530cfe3e5eaccb5a3b8ea0e55
-)
+include(${rapids-cmake-dir}/cpm/rapids_logger.cmake)
+rapids_cpm_rapids_logger()
 rapids_make_logger(cudf EXPORT_SET cudf-exports)
 
 # find jitify
diff --git a/docs/cudf/source/user_guide/api_docs/general_functions.rst b/docs/cudf/source/user_guide/api_docs/general_functions.rst
index 38e070b0d53..5c5b5cb3b04 100644
--- a/docs/cudf/source/user_guide/api_docs/general_functions.rst
+++ b/docs/cudf/source/user_guide/api_docs/general_functions.rst
@@ -9,26 +9,26 @@ Data manipulations
 .. autosummary::
    :toctree: api/
 
-   cudf.concat
-   cudf.crosstab
-   cudf.cut
-   cudf.factorize
-   cudf.get_dummies
-   cudf.melt
-   cudf.merge
-   cudf.pivot
-   cudf.pivot_table
-   cudf.unstack
+   concat
+   crosstab
+   cut
+   factorize
+   get_dummies
+   melt
+   merge
+   pivot
+   pivot_table
+   unstack
 
 Top-level conversions
 ---------------------
 
 .. autosummary::
    :toctree: api/
 
-   cudf.to_numeric
-   cudf.from_dataframe
-   cudf.from_dlpack
-   cudf.from_pandas
+   to_numeric
+   from_dataframe
+   from_dlpack
+   from_pandas
 
 Top-level dealing with datetimelike data
 ----------------------------------------
@@ -36,8 +36,8 @@ Top-level dealing with datetimelike data
 .. autosummary::
    :toctree: api/
 
-   cudf.to_datetime
-   cudf.date_range
+   to_datetime
+   date_range
 
 Top-level dealing with Interval data
 ------------------------------------
@@ -45,4 +45,4 @@ Top-level dealing with Interval data
 .. autosummary::
    :toctree: api/
 
-   cudf.interval_range
+   interval_range
diff --git a/docs/cudf/source/user_guide/api_docs/io.rst b/docs/cudf/source/user_guide/api_docs/io.rst
index 417970715f8..ad8ba8a9bdf 100644
--- a/docs/cudf/source/user_guide/api_docs/io.rst
+++ b/docs/cudf/source/user_guide/api_docs/io.rst
@@ -35,10 +35,10 @@ Parquet
    read_parquet
    DataFrame.to_parquet
-   cudf.io.parquet.read_parquet_metadata
-   cudf.io.parquet.ParquetDatasetWriter
-   cudf.io.parquet.ParquetDatasetWriter.close
-   cudf.io.parquet.ParquetDatasetWriter.write_table
+   io.parquet.read_parquet_metadata
+   io.parquet.ParquetDatasetWriter
+   io.parquet.ParquetDatasetWriter.close
+   io.parquet.ParquetDatasetWriter.write_table
 
 
 ORC

From 1dece5e2f5cde6f60f70475ac345820673185f1d Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 3 Jan 2025 16:41:11 -0500
Subject: [PATCH 02/19] Fix possible overflow in
 WriteCoalescingCallbackWrapper::TearDown (#17642)

Fixes a possible overflow in the `WriteCoalescingCallbackWrapper::TearDown`
function when `tile_out_count` is sufficiently large. The `out_char +=
blockDim.x` increment can overflow once `out_char` comes within a block-size
of the maximum value of its 32-bit type.

Authors:
  - David Wendt (https://github.com/davidwendt)
  - Yunsong Wang (https://github.com/PointKernel)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/17642
---
 cpp/src/io/fst/agent_dfa.cuh | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh
index 2a75c034dc8..5685b50c322 100644
--- a/cpp/src/io/fst/agent_dfa.cuh
+++ b/cpp/src/io/fst/agent_dfa.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
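A standalone sketch of the overflow mode fixed by the hunk below; the
function and variable names here are illustrative, not part of the patch:

    // With a 32-bit induction variable, the final `i += blockDim.x` can wrap
    // around UINT32_MAX back to a small value, so `i < count` holds again and
    // the loop never terminates (well-defined unsigned wraparound, but still
    // a bug). Widening the index, as the patch does with
    // cudf::thread_index_type, removes the wraparound for any 32-bit count.
    __device__ void copy_strided(char const* in, char* out, uint32_t count)
    {
      // Buggy form: can wrap once count comes within blockDim.x of UINT32_MAX
      // for (uint32_t i = threadIdx.x; i < count; i += blockDim.x) { out[i] = in[i]; }

      // Fixed form: a 64-bit index cannot wrap here
      for (int64_t i = threadIdx.x; i < count; i += blockDim.x) { out[i] = in[i]; }
    }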
@@ -308,12 +308,14 @@ class WriteCoalescingCallbackWrapper { { __syncthreads(); if constexpr (!DiscardTranslatedOutput) { - for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + for (thread_index_type out_char = threadIdx.x; out_char < tile_out_count; + out_char += blockDim.x) { out_it[tile_out_offset + out_char] = temp_storage.compacted_symbols[out_char]; } } if constexpr (!DiscardIndexOutput) { - for (uint32_t out_char = threadIdx.x; out_char < tile_out_count; out_char += blockDim.x) { + for (thread_index_type out_char = threadIdx.x; out_char < tile_out_count; + out_char += blockDim.x) { out_idx_it[tile_out_offset + out_char] = temp_storage.compacted_offset[out_char] + tile_in_offset; } From 07406b353f71cd089e398960f491419dbce3b164 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 3 Jan 2025 15:03:15 -0800 Subject: [PATCH 03/19] Fix groupby.len with null values in cudf.polars (#17671) closes https://github.com/rapidsai/cudf/issues/17667 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17671 --- .../cudf_polars/dsl/expressions/aggregation.py | 8 ++++++-- python/cudf_polars/tests/test_groupby.py | 8 +++++++- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index 2ba483c7b2d..b88b109a975 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 # TODO: remove need for this # ruff: noqa: D101 @@ -69,7 +69,11 @@ def __init__( # TODO: handle nans req = plc.aggregation.variance(ddof=options) elif name == "count": - req = plc.aggregation.count(null_handling=plc.types.NullPolicy.EXCLUDE) + req = plc.aggregation.count( + null_handling=plc.types.NullPolicy.EXCLUDE + if not options + else plc.types.NullPolicy.INCLUDE + ) elif name == "quantile": _, quantile = self.children if not isinstance(quantile, Literal): diff --git a/python/cudf_polars/tests/test_groupby.py b/python/cudf_polars/tests/test_groupby.py index 1e8246496cd..53b96ba574b 100644 --- a/python/cudf_polars/tests/test_groupby.py +++ b/python/cudf_polars/tests/test_groupby.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. 
 # SPDX-License-Identifier: Apache-2.0
 
 from __future__ import annotations
@@ -213,3 +213,9 @@ def test_groupby_maintain_order_random(nrows, nkeys, with_nulls):
     )
     q = df.lazy().group_by(key_names, maintain_order=True).agg(pl.col("value").sum())
     assert_gpu_result_equal(q)
+
+
+def test_groupby_len_with_nulls():
+    df = pl.DataFrame({"a": [1, 1, 1, 2], "b": [1, None, 2, 3]})
+    q = df.lazy().group_by("a").agg(pl.col("b").len())
+    assert_gpu_result_equal(q, check_row_order=False)

From 756d66bd25da6ded550932611386fc2ca2063486 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 3 Jan 2025 17:19:06 -0800
Subject: [PATCH 04/19] Avoid shallow copies in groupby methods (#17646)

Noticed while working on https://github.com/rapidsai/cudf/pull/17644 that
`diff` and `fillna` were making some unnecessary shallow copies of the
`grouping.values` object. Also noticed that `_cov_or_corr` just pulled the
column names out of the `grouping.values` object, so this adds a separate
API, `_values_column_names`, that creates just the column names without
pulling out the actual columns.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17646
---
 python/cudf/cudf/core/groupby/groupby.py | 32 +++++++++++++-----------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index 4137109cc96..6ae524d6346 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
 from __future__ import annotations
 
 import copy
@@ -49,7 +49,7 @@
 from cudf.utils.utils import GetAttrGetItemMixin
 
 if TYPE_CHECKING:
-    from collections.abc import Generator, Iterable
+    from collections.abc import Generator, Hashable, Iterable
 
     from cudf._typing import (
         AggType,
@@ -2448,7 +2448,7 @@ def _cov_or_corr(self, func, method_name):
         # create expanded dataframe consisting all combinations of the
         # struct columns-pairs to be used in the correlation or covariance
         # i.e.
(('col1', 'col1'), ('col1', 'col2'), ('col2', 'col2')) - column_names = self.grouping.values._column_names + column_names = self.grouping._values_column_names num_cols = len(column_names) column_pair_structs = {} @@ -2682,10 +2682,8 @@ def diff(self, periods=1, axis=0): if not axis == 0: raise NotImplementedError("Only axis=0 is supported.") - - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) + values = self.grouping.values + values.index = self.obj.index return values - self.shift(periods=periods) def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries: @@ -2789,9 +2787,8 @@ def fillna( raise ValueError("Method can only be of 'ffill', 'bfill'.") return getattr(self, method, limit)() - values = self.obj.__class__._from_data( - self.grouping.values._data, self.obj.index - ) + values = self.grouping.values + values.index = self.obj.index return values.fillna( value=value, inplace=inplace, axis=axis, limit=limit ) @@ -3543,6 +3540,13 @@ def keys(self): self._key_columns[0], name=self.names[0] ) + @property + def _values_column_names(self) -> list[Hashable]: + # If the key columns are in `obj`, filter them out + return [ + x for x in self._obj._column_names if x not in self._named_columns + ] + @property def values(self) -> cudf.core.frame.Frame: """Return value columns as a frame. @@ -3553,11 +3557,9 @@ def values(self) -> cudf.core.frame.Frame: This is mainly used in transform-like operations. """ - # If the key columns are in `obj`, filter them out - value_column_names = [ - x for x in self._obj._column_names if x not in self._named_columns - ] - value_columns = self._obj._data.select_by_label(value_column_names) + value_columns = self._obj._data.select_by_label( + self._values_column_names + ) return self._obj.__class__._from_data(value_columns) def _handle_callable(self, by): From 62d72dff9363bf6a58154def9f99fdd4e8a9acc8 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 3 Jan 2025 21:05:50 -0800 Subject: [PATCH 05/19] Refactor distinct hash join to handle multiple probes with the same build table (#17609) This PR updates the distinct join implementation to allow the same build table to be reused for multiple probe operations. It also introduces several breaking changes, including removing the need for users to specify whether the input data contains nested columns. Additionally, the output order has been updated to align with the hash join behavior, with probe indices now appearing on the left and build indices on the right. The PR leverages the new conditional query API in the cuco hash set, enabling more efficient handling of nullable data. While this optimization improves performance, it is not currently reflected in benchmarks due to the absence of a dedicated test case for this scenario. 
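As a minimal sketch of the reuse pattern this enables (variable names are
illustrative; `build_keys` and the `probe_keys_*` are assumed to be
`cudf::table_view` objects with matching key schemas, and the stream and
memory-resource arguments are left at their defaults):

    #include <cudf/join.hpp>

    // Build the hash table once over the distinct build-side keys ...
    cudf::distinct_hash_join join{build_keys, cudf::null_equality::EQUAL};

    // ... then probe it as many times as needed. Note the new output order
    // {probe_indices, build_indices}, matching cudf::hash_join.
    auto [probe_map_1, build_map_1] = join.inner_join(probe_keys_1);
    auto build_map_2                = join.left_join(probe_keys_2);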
Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Jason Lowe (https://github.com/jlowe) - Bradley Dice (https://github.com/bdice) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/17609 --- cpp/benchmarks/join/distinct_join.cu | 20 +- .../cudf/detail/distinct_hash_join.cuh | 112 ++++----- cpp/include/cudf/join.hpp | 35 +-- cpp/src/join/distinct_hash_join.cu | 238 ++++++++++-------- cpp/tests/join/distinct_join_tests.cpp | 59 +++-- java/src/main/native/src/TableJni.cpp | 32 +-- 6 files changed, 236 insertions(+), 260 deletions(-) diff --git a/cpp/benchmarks/join/distinct_join.cu b/cpp/benchmarks/join/distinct_join.cu index 3502cbcea2a..1085b03ac7b 100644 --- a/cpp/benchmarks/join/distinct_join.cu +++ b/cpp/benchmarks/join/distinct_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,13 +23,8 @@ void distinct_inner_join(nvbench::state& state, auto join = [](cudf::table_view const& probe_input, cudf::table_view const& build_input, cudf::null_equality compare_nulls) { - auto const has_nulls = - cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls}; - return hj_obj.inner_join(); + auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls}; + return hj_obj.inner_join(probe_input); }; BM_join(state, join); @@ -42,13 +37,8 @@ void distinct_left_join(nvbench::state& state, auto join = [](cudf::table_view const& probe_input, cudf::table_view const& build_input, cudf::null_equality compare_nulls) { - auto const has_nulls = - cudf::has_nested_nulls(build_input) || cudf::has_nested_nulls(probe_input) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - auto hj_obj = cudf::distinct_hash_join{ - build_input, probe_input, has_nulls, compare_nulls}; - return hj_obj.left_join(); + auto hj_obj = cudf::distinct_hash_join{build_input, compare_nulls}; + return hj_obj.left_join(probe_input); }; BM_join(state, join); diff --git a/cpp/include/cudf/detail/distinct_hash_join.cuh b/cpp/include/cudf/detail/distinct_hash_join.cuh index 2acc10105cf..9a10163eb15 100644 --- a/cpp/include/cudf/detail/distinct_hash_join.cuh +++ b/cpp/include/cudf/detail/distinct_hash_join.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -36,19 +36,24 @@ using cudf::experimental::row::lhs_index_type; using cudf::experimental::row::rhs_index_type; /** - * @brief An comparator adapter wrapping both self comparator and two table comparator + * @brief A custom comparator used for the build table insertion */ -template -struct comparator_adapter { - comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} - - __device__ constexpr auto operator()( +struct always_not_equal { + __device__ constexpr bool operator()( cuco::pair const&, cuco::pair const&) const noexcept { // All build table keys are distinct thus `false` no matter what return false; } +}; + +/** + * @brief An comparator adapter wrapping the two table comparator + */ +template +struct comparator_adapter { + comparator_adapter(Equal const& d_equal) : _d_equal{d_equal} {} __device__ constexpr auto operator()( cuco::pair const& lhs, @@ -62,56 +67,14 @@ struct comparator_adapter { Equal _d_equal; }; -template -struct hasher_adapter { - hasher_adapter(Hasher const& d_hasher = {}) : _d_hasher{d_hasher} {} - - template - __device__ constexpr auto operator()(cuco::pair const& key) const noexcept - { - return _d_hasher(key.first); - } - - private: - Hasher _d_hasher; -}; - /** * @brief Distinct hash join that builds hash table in creation and probes results in subsequent * `*_join` member functions. * - * @tparam HasNested Flag indicating whether there are nested columns in build/probe table + * This class enables the distinct hash join scheme that builds hash table once, and probes as many + * times as needed (possibly in parallel). */ -template -struct distinct_hash_join { - private: - /// Device row equal type - using d_equal_type = cudf::experimental::row::equality::strong_index_comparator_adapter< - cudf::experimental::row::equality::device_row_comparator>; - using hasher = hasher_adapter>; - using probing_scheme_type = cuco::linear_probing<1, hasher>; - using cuco_storage_type = cuco::storage<1>; - - /// Hash table type - using hash_table_type = cuco::static_set, - cuco::extent, - cuda::thread_scope_device, - comparator_adapter, - probing_scheme_type, - cudf::detail::cuco_allocator, - cuco_storage_type>; - - bool _has_nulls; ///< true if nulls are present in either build table or probe table - cudf::null_equality _nulls_equal; ///< whether to consider nulls as equal - cudf::table_view _build; ///< input table to build the hash map - cudf::table_view _probe; ///< input table to probe the hash map - std::shared_ptr - _preprocessed_build; ///< input table preprocssed for row operators - std::shared_ptr - _preprocessed_probe; ///< input table preprocssed for row operators - hash_table_type _hash_table; ///< hash table built on `_build` - +class distinct_hash_join { public: distinct_hash_join() = delete; ~distinct_hash_join() = default; @@ -120,21 +83,28 @@ struct distinct_hash_join { distinct_hash_join& operator=(distinct_hash_join const&) = delete; distinct_hash_join& operator=(distinct_hash_join&&) = delete; + /** + * @brief Hasher adapter used by distinct hash join + */ + struct hasher { + template + __device__ constexpr hash_value_type operator()( + cuco::pair const& key) const noexcept + { + return key.first; + } + }; + /** * @brief Constructor that internally builds the hash table based on the given `build` table. * * @throw cudf::logic_error if the number of columns in `build` table is 0. 
* * @param build The build table, from which the hash table is built - * @param probe The probe table - * @param has_nulls Flag to indicate if any nulls exist in the `build` table or - * any `probe` table that will be used later for join. * @param compare_nulls Controls whether null join-key values should match or not. * @param stream CUDA stream used for device memory operations and kernel launches. */ distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - bool has_nulls, cudf::null_equality compare_nulls, rmm::cuda_stream_view stream); @@ -143,12 +113,36 @@ struct distinct_hash_join { */ std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; /** * @copydoc cudf::distinct_hash_join::left_join */ std::unique_ptr> left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const; + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const; + + private: + using probing_scheme_type = cuco::linear_probing<1, hasher>; + using cuco_storage_type = cuco::storage<1>; + + /// Hash table type + using hash_table_type = cuco::static_set, + cuco::extent, + cuda::thread_scope_device, + always_not_equal, + probing_scheme_type, + cudf::detail::cuco_allocator, + cuco_storage_type>; + + bool _has_nested_columns; ///< True if nested columns are present in build and probe tables + cudf::null_equality _nulls_equal; ///< Whether to consider nulls as equal + cudf::table_view _build; ///< Input table to build the hash map + std::shared_ptr + _preprocessed_build; ///< Input table preprocssed for row operators + hash_table_type _hash_table; ///< Hash table built on `_build` }; } // namespace cudf::detail diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index afefd04d4fa..cc63565eee1 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -34,13 +34,6 @@ namespace CUDF_EXPORT cudf { -/** - * @brief Enum to indicate whether the distinct join table has nested columns or not - * - * @ingroup column_join - */ -enum class has_nested : bool { YES, NO }; - // forward declaration namespace hashing::detail { @@ -61,7 +54,6 @@ class hash_join; /** * @brief Forward declaration for our distinct hash join */ -template class distinct_hash_join; } // namespace detail @@ -469,20 +461,19 @@ class hash_join { rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; private: - const std::unique_ptr _impl; + std::unique_ptr _impl; }; /** * @brief Distinct hash join that builds hash table in creation and probes results in subsequent * `*_join` member functions * + * This class enables the distinct hash join scheme that builds hash table once, and probes as many + * times as needed (possibly in parallel). + * * @note Behavior is undefined if the build table contains duplicates. 
* @note All NaNs are considered as equal - * - * @tparam HasNested Flag indicating whether there are nested columns in build/probe table */ -// TODO: `HasNested` to be removed via dispatching -template class distinct_hash_join { public: distinct_hash_join() = delete; @@ -496,15 +487,10 @@ class distinct_hash_join { * @brief Constructs a distinct hash join object for subsequent probe calls * * @param build The build table that contains distinct elements - * @param probe The probe table, from which the keys are probed - * @param has_nulls Flag to indicate if there exists any nulls in the `build` table or - * any `probe` table that will be used later for join * @param compare_nulls Controls whether null join-key values should match or not * @param stream CUDA stream used for device memory operations and kernel launches */ distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls = nullable_join::YES, null_equality compare_nulls = null_equality::EQUAL, rmm::cuda_stream_view stream = cudf::get_default_stream()); @@ -512,16 +498,18 @@ class distinct_hash_join { * @brief Returns the row indices that can be used to construct the result of performing * an inner join between two tables. @see cudf::inner_join(). * + * @param probe The probe table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned indices' device memory. * - * @return A pair of columns [`build_indices`, `probe_indices`] that can be used to + * @return A pair of columns [`probe_indices`, `build_indices`] that can be used to * construct the result of performing an inner join between two tables * with `build` and `probe` as the join keys. */ [[nodiscard]] std::pair>, std::unique_ptr>> - inner_join(rmm::cuda_stream_view stream = cudf::get_default_stream(), + inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; /** @@ -532,19 +520,22 @@ class distinct_hash_join { * the row index of the matched row from the build table if there is a match. Otherwise, contains * `JoinNoneValue`. * + * @param probe The probe table, from which the keys are probed * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table and columns' device * memory. + * * @return A `build_indices` column that can be used to construct the result of * performing a left join between two tables with `build` and `probe` as the join * keys. */ [[nodiscard]] std::unique_ptr> left_join( + cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()) const; private: - using impl_type = typename cudf::detail::distinct_hash_join; ///< Implementation type + using impl_type = cudf::detail::distinct_hash_join; ///< Implementation type std::unique_ptr _impl; ///< Distinct hash join implementation }; diff --git a/cpp/src/join/distinct_hash_join.cu b/cpp/src/join/distinct_hash_join.cu index ce4d2067b82..d1a01ee76e4 100644 --- a/cpp/src/join/distinct_hash_join.cu +++ b/cpp/src/join/distinct_hash_join.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,28 +47,19 @@ namespace cudf { namespace detail { namespace { -template -auto prepare_device_equal( - std::shared_ptr build, - std::shared_ptr probe, - bool has_nulls, - cudf::null_equality compare_nulls) -{ - auto const two_table_equal = - cudf::experimental::row::equality::two_table_comparator(probe, build); - return comparator_adapter{two_table_equal.equal_to( - nullate::DYNAMIC{has_nulls}, compare_nulls)}; -} +bool constexpr has_nulls = true; ///< Always has nulls /** * @brief Device functor to create a pair of {hash_value, row_index} for a given row. - * - * @tparam Hasher The type of internal hasher to compute row hash. */ -template +template class build_keys_fn { + using hasher = + cudf::experimental::row::hash::device_row_hasher; + public: - CUDF_HOST_DEVICE build_keys_fn(Hasher const& hash) : _hash{hash} {} + CUDF_HOST_DEVICE constexpr build_keys_fn(hasher const& hash) : _hash{hash} {} __device__ __forceinline__ auto operator()(size_type i) const noexcept { @@ -76,7 +67,7 @@ class build_keys_fn { } private: - Hasher _hash; + hasher _hash; }; /** @@ -92,26 +83,19 @@ struct output_fn { }; } // namespace -template -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - bool has_nulls, - cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _has_nulls{has_nulls}, +distinct_hash_join::distinct_hash_join(cudf::table_view const& build, + cudf::null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _has_nested_columns{cudf::has_nested_columns(build)}, _nulls_equal{compare_nulls}, _build{build}, - _probe{probe}, _preprocessed_build{ cudf::experimental::row::equality::preprocessed_table::create(_build, stream)}, - _preprocessed_probe{ - cudf::experimental::row::equality::preprocessed_table::create(_probe, stream)}, _hash_table{build.num_rows(), CUCO_DESIRED_LOAD_FACTOR, cuco::empty_key{cuco::pair{std::numeric_limits::max(), rhs_index_type{JoinNoneValue}}}, - prepare_device_equal( - _preprocessed_build, _preprocessed_probe, has_nulls, compare_nulls), + always_not_equal{}, {}, cuco::thread_scope_device, cuco_storage_type{}, @@ -124,10 +108,10 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, if (this->_build.num_rows() == 0) { return; } auto const row_hasher = experimental::row::hash::row_hasher{this->_preprocessed_build}; - auto const d_hasher = row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); + auto const d_hasher = row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_hasher}); + auto const iter = + cudf::detail::make_counting_transform_iterator(0, build_keys_fn{d_hasher}); size_type const build_table_num_rows{build.num_rows()}; if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(this->_build))) { @@ -146,15 +130,15 @@ distinct_hash_join::distinct_hash_join(cudf::table_view const& build, } } -template std::pair>, std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const +distinct_hash_join::inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"distinct_hash_join::inner_join"}; - size_type const probe_table_num_rows{this->_probe.num_rows()}; + size_type const 
probe_table_num_rows{probe.num_rows()}; // If output size is zero, return immediately if (probe_table_num_rows == 0) { @@ -162,25 +146,62 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, std::make_unique>(0, stream, mr)); } + auto preprocessed_probe = + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_probe, _preprocessed_build); + auto build_indices = std::make_unique>(probe_table_num_rows, stream, mr); auto probe_indices = std::make_unique>(probe_table_num_rows, stream, mr); - auto const probe_row_hasher = - cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe}; - auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + auto const probe_row_hasher = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; + auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); + auto const iter = cudf::detail::make_counting_transform_iterator( + 0, build_keys_fn{d_probe_hasher}); auto found_indices = rmm::device_uvector(probe_table_num_rows, stream); auto const found_begin = thrust::make_transform_output_iterator(found_indices.begin(), output_fn{}); - // TODO conditional find for nulls once `cuco::static_set::find_if` is added - // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not equal - // to `JoinNoneValue`, then `idx` has a match in the hash set. - this->_hash_table.find_async(iter, iter + probe_table_num_rows, found_begin, stream.value()); + auto const comparator_helper = [&](auto device_comparator) { + // If `idx` is within the range `[0, probe_table_num_rows)` and `found_indices[idx]` is not + // equal to `JoinNoneValue`, then `idx` has a match in the hash set. 
+ if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(probe))) { + this->_hash_table.find_async(iter, + iter + probe_table_num_rows, + comparator_adapter{device_comparator}, + hasher{}, + found_begin, + stream.value()); + } else { + auto stencil = thrust::counting_iterator{0}; + auto const row_bitmask = + cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first; + auto const pred = + cudf::detail::row_is_valid{reinterpret_cast(row_bitmask.data())}; + + this->_hash_table.find_if_async(iter, + iter + probe_table_num_rows, + stencil, + pred, + comparator_adapter{device_comparator}, + hasher{}, + found_begin, + stream.value()); + } + }; + + if (_has_nested_columns) { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } else { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } auto const tuple_iter = cudf::detail::make_counting_transform_iterator( 0, @@ -203,16 +224,17 @@ distinct_hash_join::inner_join(rmm::cuda_stream_view stream, build_indices->resize(actual_size, stream); probe_indices->resize(actual_size, stream); - return {std::move(build_indices), std::move(probe_indices)}; + return {std::move(probe_indices), std::move(build_indices)}; } -template -std::unique_ptr> distinct_hash_join::left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const +std::unique_ptr> distinct_hash_join::left_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { cudf::scoped_range range{"distinct_hash_join::left_join"}; - size_type const probe_table_num_rows{this->_probe.num_rows()}; + size_type const probe_table_num_rows{probe.num_rows()}; // If output size is zero, return empty if (probe_table_num_rows == 0) { @@ -227,80 +249,82 @@ std::unique_ptr> distinct_hash_join::l thrust::fill( rmm::exec_policy_nosync(stream), build_indices->begin(), build_indices->end(), JoinNoneValue); } else { - auto const probe_row_hasher = - cudf::experimental::row::hash::row_hasher{this->_preprocessed_probe}; - auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{this->_has_nulls}); - auto const iter = cudf::detail::make_counting_transform_iterator( - 0, build_keys_fn{d_probe_hasher}); + auto preprocessed_probe = + cudf::experimental::row::equality::preprocessed_table::create(probe, stream); + auto const two_table_equal = cudf::experimental::row::equality::two_table_comparator( + preprocessed_probe, _preprocessed_build); + + auto const probe_row_hasher = cudf::experimental::row::hash::row_hasher{preprocessed_probe}; + auto const d_probe_hasher = probe_row_hasher.device_hasher(nullate::DYNAMIC{has_nulls}); + auto const iter = cudf::detail::make_counting_transform_iterator( + 0, build_keys_fn{d_probe_hasher}); auto const output_begin = thrust::make_transform_output_iterator(build_indices->begin(), output_fn{}); - // TODO conditional find for nulls once `cuco::static_set::find_if` is added - this->_hash_table.find_async(iter, iter + probe_table_num_rows, output_begin, stream.value()); + auto const comparator_helper = [&](auto device_comparator) { + if (this->_nulls_equal == cudf::null_equality::EQUAL or (not cudf::nullable(probe))) { + this->_hash_table.find_async(iter, + iter + probe_table_num_rows, + comparator_adapter{device_comparator}, + hasher{}, + output_begin, + 
stream.value()); + } else { + auto stencil = thrust::counting_iterator{0}; + auto const row_bitmask = + cudf::detail::bitmask_and(probe, stream, cudf::get_current_device_resource_ref()).first; + auto const pred = + cudf::detail::row_is_valid{reinterpret_cast(row_bitmask.data())}; + + this->_hash_table.find_if_async(iter, + iter + probe_table_num_rows, + stencil, + pred, + comparator_adapter{device_comparator}, + hasher{}, + output_begin, + stream.value()); + } + }; + + if (_has_nested_columns) { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } else { + auto const device_comparator = + two_table_equal.equal_to(nullate::DYNAMIC{has_nulls}, _nulls_equal); + comparator_helper(device_comparator); + } } return build_indices; } } // namespace detail -template <> -distinct_hash_join::~distinct_hash_join() = default; - -template <> -distinct_hash_join::~distinct_hash_join() = default; - -template <> -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _impl{std::make_unique( - build, probe, has_nulls == nullable_join::YES, compare_nulls, stream)} -{ -} - -template <> -distinct_hash_join::distinct_hash_join(cudf::table_view const& build, - cudf::table_view const& probe, - nullable_join has_nulls, - null_equality compare_nulls, - rmm::cuda_stream_view stream) - : _impl{std::make_unique( - build, probe, has_nulls == nullable_join::YES, compare_nulls, stream)} -{ -} +distinct_hash_join::~distinct_hash_join() = default; -template <> -std::pair>, - std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const +distinct_hash_join::distinct_hash_join(cudf::table_view const& build, + null_equality compare_nulls, + rmm::cuda_stream_view stream) + : _impl{std::make_unique(build, compare_nulls, stream)} { - return _impl->inner_join(stream, mr); } -template <> std::pair>, std::unique_ptr>> -distinct_hash_join::inner_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const -{ - return _impl->inner_join(stream, mr); -} - -template <> -std::unique_ptr> -distinct_hash_join::left_join(rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) const +distinct_hash_join::inner_join(cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - return _impl->left_join(stream, mr); + return _impl->inner_join(probe, stream, mr); } -template <> -std::unique_ptr> distinct_hash_join::left_join( - rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) const +std::unique_ptr> distinct_hash_join::left_join( + cudf::table_view const& probe, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) const { - return _impl->left_join(stream, mr); + return _impl->left_join(probe, stream, mr); } } // namespace cudf diff --git a/cpp/tests/join/distinct_join_tests.cpp b/cpp/tests/join/distinct_join_tests.cpp index 9070efa38fe..e1ec8cda3ac 100644 --- a/cpp/tests/join/distinct_join_tests.cpp +++ b/cpp/tests/join/distinct_join_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -53,7 +53,7 @@ struct DistinctJoinTest : public cudf::test::BaseFixture { cudf::table_view const& expected_table, cudf::out_of_bounds_policy oob_policy = cudf::out_of_bounds_policy::DONT_CHECK) { - auto const& [build_join_indices, probe_join_indices] = result; + auto const& [probe_join_indices, build_join_indices] = result; auto build_indices_span = cudf::device_span{*build_join_indices}; auto probe_indices_span = cudf::device_span{*probe_join_indices}; @@ -89,10 +89,9 @@ TEST_F(DistinctJoinTest, IntegerInnerJoin) auto build_table = cudf::table_view{{build->view()}}; auto probe_table = cudf::table_view{{probe->view()}}; - auto distinct_join = cudf::distinct_hash_join{ - build_table, probe_table, cudf::nullable_join::NO}; + auto distinct_join = cudf::distinct_hash_join{build_table}; - auto result = distinct_join.inner_join(); + auto result = distinct_join.inner_join(probe_table); auto constexpr gold_size = size / 2; auto gold = cudf::sequence(gold_size, init, cudf::numeric_scalar{2}); @@ -120,8 +119,8 @@ TEST_F(DistinctJoinTest, InnerJoinNoNulls) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{1, 2}}; strcol_wrapper col_gold_1({"s0", "s0"}); @@ -162,8 +161,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithNulls) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{3, 2}}; strcol_wrapper col_gold_1({"s1", "s0"}, {true, true}); @@ -229,8 +228,8 @@ TEST_F(DistinctJoinTest, InnerJoinWithStructsAndNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); column_wrapper col_gold_0{{3, 2}}; strcol_wrapper col_gold_1({"s1", "s0"}, {true, true}); @@ -284,8 +283,8 @@ TEST_F(DistinctJoinTest, EmptyBuildTableInnerJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); this->compare_to_reference(build.view(), probe.view(), result, build.view()); } @@ -307,9 +306,9 @@ TEST_F(DistinctJoinTest, EmptyBuildTableLeftJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -332,8 +331,8 @@ TEST_F(DistinctJoinTest, EmptyProbeTableInnerJoin) Table 
build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.inner_join(); + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.inner_join(probe.view()); this->compare_to_reference(build.view(), probe.view(), result, probe.view()); } @@ -355,9 +354,9 @@ TEST_F(DistinctJoinTest, EmptyProbeTableLeftJoin) Table build(std::move(cols0)); Table probe(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, probe.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -391,9 +390,9 @@ TEST_F(DistinctJoinTest, LeftJoinNoNulls) cols_gold.push_back(col_gold_3.release()); Table gold(std::move(cols_gold)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; this->compare_to_reference( build.view(), probe.view(), gather_map, gold.view(), cudf::out_of_bounds_policy::NULLIFY); @@ -416,9 +415,9 @@ TEST_F(DistinctJoinTest, LeftJoinWithNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; column_wrapper col_gold_0{{3, 1, 2, 0, 2}, {true, true, true, true, true}}; strcol_wrapper col_gold_1({"s1", "s1", "", "s4", "s0"}, {true, true, false, true, true}); @@ -461,9 +460,9 @@ TEST_F(DistinctJoinTest, LeftJoinWithStructsAndNulls) Table probe(std::move(cols0)); Table build(std::move(cols1)); - auto distinct_join = cudf::distinct_hash_join{build.view(), probe.view()}; - auto result = distinct_join.left_join(); - auto gather_map = std::pair{std::move(result), get_left_indices(result->size())}; + auto distinct_join = cudf::distinct_hash_join{build.view()}; + auto result = distinct_join.left_join(probe.view()); + auto gather_map = std::pair{get_left_indices(result->size()), std::move(result)}; auto col0_gold_names_col = strcol_wrapper{ "Samuel Vimes", "Detritus", "Carrot Ironfoundersson", "Samuel Vimes", "Angua von Überwald"}; diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index 1f8b1ea207d..ed35f35794d 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -2901,16 +2901,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_leftDistinctJoinGatherMap j_right_keys, compare_nulls_equal, [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { - auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - if (cudf::has_nested_columns(right)) { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - return hash.left_join(); - } else { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - return hash.left_join(); - } + cudf::distinct_hash_join hash(right, nulleq); + return hash.left_join(left); }); } @@ -3119,22 +3111,8 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_innerDistinctJoinGatherMa j_right_keys, compare_nulls_equal, [](cudf::table_view const& left, cudf::table_view const& right, cudf::null_equality nulleq) { - auto has_nulls = cudf::has_nested_nulls(left) || cudf::has_nested_nulls(right) - ? cudf::nullable_join::YES - : cudf::nullable_join::NO; - std::pair>, - std::unique_ptr>> - maps; - if (cudf::has_nested_columns(right)) { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - maps = hash.inner_join(); - } else { - cudf::distinct_hash_join hash(right, left, has_nulls, nulleq); - maps = hash.inner_join(); - } - // Unique join returns {right map, left map} but all the other joins - // return {left map, right map}. Swap here to make it consistent. - return std::make_pair(std::move(maps.second), std::move(maps.first)); + cudf::distinct_hash_join hash(right, nulleq); + return hash.inner_join(left); }); } From 07ee82bb48e8c77f268ed9ce705d9a4bd5a8f32b Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Sat, 4 Jan 2025 11:15:07 -0600 Subject: [PATCH 06/19] Implement `.dt.total_seconds` (#17659) Fixes: #16802 This PR implements `.dt.total_seconds` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17659 --- python/cudf/cudf/core/column/timedelta.py | 13 ++++- python/cudf/cudf/core/index.py | 14 ++--- python/cudf/cudf/core/series.py | 62 ++++++++++++++++++++++- python/cudf/cudf/tests/test_timedelta.py | 24 ++++++++- 4 files changed, 102 insertions(+), 11 deletions(-) diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 417fa99dac0..749ab8e837a 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -1,9 +1,10 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations import datetime import functools +import math from typing import TYPE_CHECKING, cast import numpy as np @@ -263,7 +264,15 @@ def time_unit(self) -> str: return np.datetime_data(self.dtype)[0] def total_seconds(self) -> ColumnBase: - raise NotImplementedError("total_seconds is currently not implemented") + conversion = _unit_to_nanoseconds_conversion[self.time_unit] / 1e9 + # Typecast to decimal128 to avoid floating point precision issues + # https://github.com/rapidsai/cudf/issues/17664 + return ( + (self.astype("int64") * conversion) + .astype(cudf.Decimal128Dtype(38, 9)) + .round(decimals=abs(int(math.log10(conversion)))) + .astype("float64") + ) def ceil(self, freq: str) -> ColumnBase: raise NotImplementedError("ceil is currently not implemented") diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index eac5b9d71ae..85be8d21d27 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -842,14 +842,14 @@ def sort_values( @_performance_tracking def _gather(self, gather_map, nullify=False, check_bounds=True): gather_map = cudf.core.column.as_column(gather_map) - return cudf.Index._from_column( + return Index._from_column( self._column.take(gather_map, nullify, check_bounds), name=self.name, ) @_performance_tracking def _apply_boolean_mask(self, boolean_mask): - return cudf.Index._from_column( + return Index._from_column( self._column.apply_boolean_mask(boolean_mask), name=self.name ) @@ -857,7 +857,7 @@ def repeat(self, repeats, axis=None): return self._as_int_index().repeat(repeats, axis) def _split(self, splits): - return cudf.Index._from_column( + return Index._from_column( self._as_int_index()._split(splits), name=self.name ) @@ -1657,7 +1657,7 @@ def _clean_nulls_from_index(self) -> Index: if isinstance(self, (DatetimeIndex, TimedeltaIndex)) else str(cudf.NA) ) - return cudf.Index._from_column( + return Index._from_column( self._column.astype("str").fillna(fill_value), name=self.name, ) @@ -2964,13 +2964,13 @@ def median(self, *, skipna: bool = True, axis: int | None = 0): def std(self, *, skipna: bool = True, axis: int | None = 0, ddof: int = 1): return self._column.std(skipna=skipna, ddof=ddof) - def total_seconds(self) -> cupy.ndarray: + def total_seconds(self) -> Index: """ Return total duration of each element expressed in seconds. This method is currently not implemented. """ - return self._column.total_seconds().values + return Index._from_column(self._column.total_seconds(), name=self.name) def ceil(self, freq: str) -> Self: """ diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 961e5e11bc0..49c2c8cf387 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations @@ -5183,6 +5183,66 @@ def components(self) -> cudf.DataFrame: ca, index=self.series.index ) + def total_seconds(self) -> Series: + """ + Return total duration of each element expressed in seconds. + + This method is available directly on TimedeltaIndex + and on Series containing timedelta values under the ``.dt`` namespace. + + Returns + ------- + Index or Series + When the calling object is a TimedeltaIndex, + the return type is an Index with a float64 dtype. 
When the calling object + is a Series, the return type is Series of type `float64` whose + index is the same as the original. + + See Also + -------- + datetime.timedelta.total_seconds : Standard library version + of this method. + TimedeltaIndex.components : Return a DataFrame with components of + each Timedelta. + + Examples + -------- + **Series** + + >>> import cudf + >>> import pandas as pd + >>> import numpy as np + >>> s = cudf.Series(pd.to_timedelta(np.arange(5), unit="D")) + >>> s + 0 0 days 00:00:00 + 1 1 days 00:00:00 + 2 2 days 00:00:00 + 3 3 days 00:00:00 + 4 4 days 00:00:00 + dtype: timedelta64[ns] + + >>> s.dt.total_seconds() + 0 0.0 + 1 86400.0 + 2 172800.0 + 3 259200.0 + 4 345600.0 + dtype: float64 + + **TimedeltaIndex** + + >>> idx = cudf.from_pandas(pd.to_timedelta(np.arange(5), unit="D")) + >>> idx + TimedeltaIndex(['0 days', '1 days', '2 days', '3 days', '4 days'], + dtype='timedelta64[ns]', freq=None) + + >>> idx.total_seconds() + Index([0.0, 86400.0, 172800.0, 259200.0, 345600.0], dtype='float64') + """ + return self._return_result_like_self( + self.series._column.total_seconds() + ) + @_performance_tracking def _align_indices(series_list, how="outer", allow_non_unique=False): diff --git a/python/cudf/cudf/tests/test_timedelta.py b/python/cudf/cudf/tests/test_timedelta.py index d622ff6b94e..f1da2a060ec 100644 --- a/python/cudf/cudf/tests/test_timedelta.py +++ b/python/cudf/cudf/tests/test_timedelta.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import datetime import operator @@ -1506,3 +1506,25 @@ def test_tdi_unit(): result = pd_tdi.unit expected = cudf_tdi.unit assert result == expected + + +@pytest.mark.parametrize("data", _TIMEDELTA_DATA) +@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) +def test_timedelta_series_total_seconds(data, dtype): + gsr = cudf.Series(data, dtype=dtype) + psr = gsr.to_pandas() + + expected = psr.dt.total_seconds() + actual = gsr.dt.total_seconds() + assert_eq(expected, actual) + + +@pytest.mark.parametrize("data", _TIMEDELTA_DATA) +@pytest.mark.parametrize("dtype", utils.TIMEDELTA_TYPES) +def test_timedelta_index_total_seconds(request, data, dtype): + gi = cudf.Index(data, dtype=dtype) + pi = gi.to_pandas() + + expected = pi.total_seconds() + actual = gi.total_seconds() + assert_eq(expected, actual) From 955b1f4566abccf920a022dc78a1e654acf0de16 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Sun, 5 Jan 2025 08:49:37 -0600 Subject: [PATCH 07/19] Cross-link cudf.pandas profiler documentation. (#17668) Adds a cross-link to the cudf.pandas profiler docs. This cross-linking would have helped answer a user question about how to profile. Authors: - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17668 --- docs/cudf/source/cudf_pandas/faq.md | 10 +++++----- docs/cudf/source/cudf_pandas/usage.md | 1 + 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/docs/cudf/source/cudf_pandas/faq.md b/docs/cudf/source/cudf_pandas/faq.md index 5024747227e..222b698a78d 100644 --- a/docs/cudf/source/cudf_pandas/faq.md +++ b/docs/cudf/source/cudf_pandas/faq.md @@ -63,11 +63,11 @@ keyword arguments, cuDF is not able to provide GPU acceleration and `cudf.pandas` will fall back to the CPU. The most accurate way to assess which functions run on the GPU is to try -running the code while using the `cudf.pandas` profiling features. 
The -profiler will indicate which functions ran on GPU / CPU. To improve -performance, try to use only functionality that can run entirely on GPU. -This helps reduce the number of memory transfers needed to fallback to -CPU. +running the code while using the `cudf.pandas` [profiling +features](cudf-pandas-profiling). The profiler will indicate which functions +ran on GPU / CPU. To improve performance, try to use only functionality that +can run entirely on GPU. This helps reduce the number of memory transfers +needed to fallback to CPU. ## How can I improve performance of my workflow with `cudf.pandas`? diff --git a/docs/cudf/source/cudf_pandas/usage.md b/docs/cudf/source/cudf_pandas/usage.md index 089f283e25d..fed63c2dd0f 100644 --- a/docs/cudf/source/cudf_pandas/usage.md +++ b/docs/cudf/source/cudf_pandas/usage.md @@ -75,6 +75,7 @@ with Pool(4) as pool: ... ``` +(cudf-pandas-profiling)= ## Profiling `cudf.pandas` `cudf.pandas` will attempt to use the GPU whenever possible and fall From c4f2e8e0be05603939849898dee93985608d57ca Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Jan 2025 09:54:04 -0800 Subject: [PATCH 08/19] Enable text build without relying on relaxed constexpr (#17647) Contributes to #7795 This PR updates `text` to build without depending on the relaxed constexpr build option. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Basit Ayantunde (https://github.com/lamarrr) - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17647 --- .../cudf/utilities/type_dispatcher.hpp | 4 +-- cpp/src/text/edit_distance.cu | 17 +++++----- cpp/src/text/jaccard.cu | 5 +-- cpp/src/text/minhash.cu | 11 +++---- cpp/src/text/replace.cu | 5 +-- cpp/src/text/subword/data_normalizer.cu | 7 ++-- cpp/src/text/subword/wordpiece_tokenizer.cu | 32 +++++++++++-------- 7 files changed, 45 insertions(+), 36 deletions(-) diff --git a/cpp/include/cudf/utilities/type_dispatcher.hpp b/cpp/include/cudf/utilities/type_dispatcher.hpp index 6351a84e38f..c1dd79ef14f 100644 --- a/cpp/include/cudf/utilities/type_dispatcher.hpp +++ b/cpp/include/cudf/utilities/type_dispatcher.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -53,7 +53,7 @@ namespace CUDF_EXPORT cudf { * @return The `cudf::type_id` corresponding to the specified type */ template -inline constexpr type_id type_to_id() +CUDF_HOST_DEVICE inline constexpr type_id type_to_id() { return type_id::EMPTY; }; diff --git a/cpp/src/text/edit_distance.cu b/cpp/src/text/edit_distance.cu index b04e9961e01..b5063931485 100644 --- a/cpp/src/text/edit_distance.cu +++ b/cpp/src/text/edit_distance.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -64,10 +65,10 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str, if (str_length == 0) return tgt_length; if (tgt_length == 0) return str_length; - auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin(); - auto itr = str_length < tgt_length ? 
d_tgt.begin() : d_str.begin(); - // .first is min and .second is max - auto const [n, m] = std::minmax(str_length, tgt_length); + auto begin = str_length < tgt_length ? d_str.begin() : d_tgt.begin(); + auto itr = str_length < tgt_length ? d_tgt.begin() : d_str.begin(); + auto const n = cuda::std::min(str_length, tgt_length); + auto const m = cuda::std::max(str_length, tgt_length); // setup compute buffer pointers auto v0 = buffer; auto v1 = v0 + n + 1; @@ -81,7 +82,7 @@ __device__ cudf::size_type compute_distance(cudf::string_view const& d_str, auto sub_cost = v0[j] + (*itr != *itr_tgt); auto del_cost = v0[j + 1] + 1; auto ins_cost = v1[j] + 1; - v1[j + 1] = std::min(std::min(sub_cost, del_cost), ins_cost); + v1[j + 1] = cuda::std::min(cuda::std::min(sub_cost, del_cost), ins_cost); } thrust::swap(v0, v1); } @@ -170,7 +171,7 @@ std::unique_ptr edit_distance(cudf::strings_column_view const& str ? d_targets.element(0) : d_targets.element(idx); // just need 2 integers for each character of the shorter string - return (std::min(d_str.length(), d_tgt.length()) + 1) * 2; + return (cuda::std::min(d_str.length(), d_tgt.length()) + 1) * 2; }); // get the total size of the temporary compute buffer @@ -241,7 +242,7 @@ std::unique_ptr edit_distance_matrix(cudf::strings_column_view con if (d_str1.empty() || d_str2.empty()) { return; } // the temp size needed is 2 integers per character of the shorter string d_offsets[idx - ((row + 1) * (row + 2)) / 2] = - (std::min(d_str1.length(), d_str2.length()) + 1) * 2; + (cuda::std::min(d_str1.length(), d_str2.length()) + 1) * 2; }); // get the total size for the compute buffer diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu index 2de94a4eb59..247440212d0 100644 --- a/cpp/src/text/jaccard.cu +++ b/cpp/src/text/jaccard.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -36,6 +36,7 @@ #include #include +#include #include #include #include @@ -243,7 +244,7 @@ CUDF_KERNEL void count_substrings_kernel(cudf::column_device_view const d_string } } auto const char_count = warp_reduce(temp_storage).Sum(count); - if (lane_idx == 0) { d_counts[str_idx] = std::max(1, char_count - width + 1); } + if (lane_idx == 0) { d_counts[str_idx] = cuda::std::max(1, char_count - width + 1); } } /** diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu index 9a44d9477ab..9ce17c36b1f 100644 --- a/cpp/src/text/minhash.cu +++ b/cpp/src/text/minhash.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -40,14 +40,13 @@ #include #include +#include #include #include #include #include #include -#include - namespace nvtext { namespace detail { namespace { @@ -156,7 +155,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings, // initialize the output -- only needed for wider strings auto d_output = d_results + (str_idx * param_count); for (auto i = lane_idx; i < param_count; i += tile_size) { - d_output[i] = std::numeric_limits::max(); + d_output[i] = cuda::std::numeric_limits::max(); } } } @@ -226,7 +225,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, ? 
section_size : cuda::std::max(static_cast(size_bytes > 0), section_size - width + 1); - auto const init = size_bytes == 0 ? 0 : std::numeric_limits::max(); + auto const init = size_bytes == 0 ? 0 : cuda::std::numeric_limits::max(); auto const lane_idx = block.thread_rank(); auto const d_output = d_results + (str_idx * parameter_a.size()); @@ -235,7 +234,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings, // constants used in the permutation calculations constexpr uint64_t mersenne_prime = (1UL << 61) - 1; - constexpr hash_value_type hash_max = std::numeric_limits::max(); + constexpr hash_value_type hash_max = cuda::std::numeric_limits::max(); // found to be an efficient shared memory size for both hash types __shared__ hash_value_type block_values[block_size * params_per_thread]; diff --git a/cpp/src/text/replace.cu b/cpp/src/text/replace.cu index 943bcbe9b3a..b041ce3ce0a 100644 --- a/cpp/src/text/replace.cu +++ b/cpp/src/text/replace.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -35,6 +35,7 @@ #include #include +#include #include #include #include @@ -196,7 +197,7 @@ struct sub_offset_fn { { // keep delimiter search within this sub-block auto const end = - d_input_chars + std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset); + d_input_chars + cuda::std::min(last_offset, ((idx + 2) * LS_SUB_BLOCK_SIZE) + first_offset); // starting point of this sub-block auto itr = d_input_chars + first_offset + ((idx + 1) * LS_SUB_BLOCK_SIZE); while ((itr < end) && diff --git a/cpp/src/text/subword/data_normalizer.cu b/cpp/src/text/subword/data_normalizer.cu index a3bed45e4bd..7a39199011e 100644 --- a/cpp/src/text/subword/data_normalizer.cu +++ b/cpp/src/text/subword/data_normalizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -134,8 +135,8 @@ extract_code_points_from_utf8(unsigned char const* strings, constexpr uint8_t max_utf8_blocks_for_char = 4; uint8_t utf8_blocks[max_utf8_blocks_for_char] = {0}; - for (int i = 0; i < std::min(static_cast(max_utf8_blocks_for_char), - total_bytes - start_byte_for_thread); + for (int i = 0; i < cuda::std::min(static_cast(max_utf8_blocks_for_char), + total_bytes - start_byte_for_thread); ++i) { utf8_blocks[i] = strings[start_byte_for_thread + i]; } diff --git a/cpp/src/text/subword/wordpiece_tokenizer.cu b/cpp/src/text/subword/wordpiece_tokenizer.cu index dd1e8ddb027..19f144dd158 100644 --- a/cpp/src/text/subword/wordpiece_tokenizer.cu +++ b/cpp/src/text/subword/wordpiece_tokenizer.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -27,6 +27,8 @@ #include #include +#include +#include #include #include #include @@ -87,7 +89,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi // Deal with the start_word_indices array if (char_for_thread < num_code_points) { - uint32_t val_to_write = std::numeric_limits::max(); + uint32_t val_to_write = cuda::std::numeric_limits::max(); if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread > 0) && (code_points[char_for_thread - 1] == SPACE_CODE_POINT)) { val_to_write = char_for_thread; @@ -95,7 +97,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi start_word_indices[char_for_thread] = val_to_write; // Deal with the end_word_indices_array - val_to_write = std::numeric_limits::max(); + val_to_write = cuda::std::numeric_limits::max(); if ((code_points[char_for_thread] != SPACE_CODE_POINT) && (char_for_thread + 1 < num_code_points) && (code_points[char_for_thread + 1] == SPACE_CODE_POINT)) { @@ -103,7 +105,7 @@ CUDF_KERNEL void init_data_and_mark_word_start_and_ends(uint32_t const* code_poi } end_word_indices[char_for_thread] = val_to_write; - token_ids[char_for_thread] = std::numeric_limits::max(); + token_ids[char_for_thread] = cuda::std::numeric_limits::max(); tokens_per_word[char_for_thread] = 0; } } @@ -214,7 +216,7 @@ struct mark_special_tokens { __device__ void operator()(size_t idx) const { uint32_t const start_index = start_word_indices[idx]; - if ((start_index == std::numeric_limits::max()) || + if ((start_index == cuda::std::numeric_limits::max()) || ((start_index + MIN_ST_WIDTH + 2) > num_code_points)) return; if (code_points[start_index] != '[') return; @@ -225,12 +227,12 @@ struct mark_special_tokens { uint32_t const end_index = [&] { auto const begin = start_word_indices + start_pos; auto const width = - std::min(static_cast(MAX_ST_WIDTH + 1), (num_code_points - start_pos)); + cuda::std::min(static_cast(MAX_ST_WIDTH + 1), (num_code_points - start_pos)); auto const end = begin + width; // checking the next start-word is more reliable than arbitrarily searching for ']' // in case the text is split across string rows auto const iter = thrust::find_if(thrust::seq, begin + 1, end, [](auto swi) { - return swi != std::numeric_limits::max(); + return swi != cuda::std::numeric_limits::max(); }); return iter == end ? start_index : static_cast(iter - start_word_indices); }(); @@ -254,11 +256,11 @@ struct mark_special_tokens { thrust::fill(thrust::seq, start_word_indices + start_index + 1, // keep the first one start_word_indices + end_index + 1, - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); thrust::fill(thrust::seq, end_word_indices + start_index, end_word_indices + end_index + 1, - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); // reset the new end-word index end_word_indices[end_pos] = end_pos + 1; @@ -382,7 +384,7 @@ CUDF_KERNEL void kernel_wordpiece_tokenizer(uint32_t const* code_points, // We need to clean up the global array. This case is very uncommon. // Only 0.016% of words cannot be resolved to a token from the squad dev set. 
for (uint32_t i = 1; i < num_values_tokenized; ++i) { - token_ids[token_start + i] = std::numeric_limits::max(); + token_ids[token_start + i] = cuda::std::numeric_limits::max(); } num_values_tokenized = 0; } @@ -423,7 +425,10 @@ uvector_pair wordpiece_tokenizer::tokenize(cudf::strings_column_view const& inpu } struct copy_if_fn { // inline lambda not allowed in private or protected member function - __device__ bool operator()(uint32_t cp) { return cp != std::numeric_limits::max(); } + __device__ bool operator()(uint32_t cp) + { + return cp != cuda::std::numeric_limits::max(); + } }; struct tranform_fn { // just converting uint8 value to uint32 @@ -487,7 +492,7 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre auto itr_end = thrust::remove(rmm::exec_policy(stream), device_word_indices.begin(), device_word_indices.end(), - std::numeric_limits::max()); + cuda::std::numeric_limits::max()); // The number of tokens selected will be double the number of words since we // select from both the start and end index arrays. @@ -523,7 +528,8 @@ void wordpiece_tokenizer::tokenize(uvector_pair& cps_and_offsets, rmm::cuda_stre // token so this will always have enough memory to store the contiguous tokens. uint32_t* contiguous_token_ids = device_code_points; auto const copy_size = // thrust::copy_if limited to copying int-max values - std::min(device_token_ids.size(), static_cast(std::numeric_limits::max())); + cuda::std::min(device_token_ids.size(), + static_cast(cuda::std::numeric_limits::max())); auto ids_itr = device_token_ids.begin(); auto const ids_end = device_token_ids.end(); while (ids_itr != ids_end) { From 5d7686f71348a3cbed4400ba94da510af4572eb3 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Mon, 6 Jan 2025 10:21:42 -0800 Subject: [PATCH 09/19] Fix formatting in logging (#17680) Replace fmt-style `{}` syntax with `printf`-style `%xyz` format strings, because the fmt-style does not work with recent logger changes. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Nghia Truong (https://github.com/ttnghia) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17680 --- cpp/src/io/comp/nvcomp_adapter.cpp | 10 +++++----- cpp/src/io/csv/reader_impl.cu | 4 ++-- cpp/src/io/parquet/writer_impl.cu | 4 ++-- cpp/src/io/utilities/data_sink.cpp | 4 ++-- cpp/src/io/utilities/datasource.cpp | 6 +++--- cpp/src/io/utilities/getenv_or.hpp | 15 +++++++++++---- cpp/src/utilities/host_memory.cpp | 4 ++-- cpp/src/utilities/stream_pool.cpp | 5 +++-- 8 files changed, 30 insertions(+), 22 deletions(-) diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index 3a4e315348c..ac81dd421fa 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -416,11 +416,11 @@ std::optional is_compression_disabled(compression_type compression, memo_map_lock.unlock(); if (reason.has_value()) { - CUDF_LOG_INFO("nvCOMP is disabled for {} compression; reason: {}", + CUDF_LOG_INFO("nvCOMP is disabled for %s compression; reason: %s", compression_type_name(compression), reason.value()); } else { - CUDF_LOG_INFO("nvCOMP is enabled for {} compression", compression_type_name(compression)); + CUDF_LOG_INFO("nvCOMP is enabled for %s compression", compression_type_name(compression)); } return reason; @@ -445,11 +445,11 @@ std::optional is_decompression_disabled(compression_type compressio memo_map_lock.unlock(); if (reason.has_value()) { - CUDF_LOG_INFO("nvCOMP is disabled for {} decompression; reason: {}", + CUDF_LOG_INFO("nvCOMP is disabled for %s decompression; reason: %s", compression_type_name(compression), reason.value()); } else { - CUDF_LOG_INFO("nvCOMP is enabled for {} decompression", compression_type_name(compression)); + CUDF_LOG_INFO("nvCOMP is enabled for %s decompression", compression_type_name(compression)); } return reason; diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index e05353ee822..0d51526d925 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -771,7 +771,7 @@ table_with_metadata read_csv(cudf::io::datasource* source, if (!reader_opts.is_enabled_mangle_dupe_cols()) { for (auto& col_name : column_names) { if (++col_names_counts[col_name] > 1) { - CUDF_LOG_WARN("Multiple columns with name {}; only the first appearance is parsed", + CUDF_LOG_WARN("Multiple columns with name %s; only the first appearance is parsed", col_name); auto const idx = &col_name - column_names.data(); diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 6b1a20701f9..77924ac0f35 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -719,7 +719,7 @@ std::vector construct_parquet_schema_tree( // all others default: CUDF_LOG_WARN( - "Unsupported page encoding requested: {}; the requested encoding will be ignored", + "Unsupported page encoding requested: %d; the requested encoding will be ignored", static_cast(col_meta.get_encoding())); return; } diff --git a/cpp/src/io/utilities/data_sink.cpp b/cpp/src/io/utilities/data_sink.cpp index dfa5d46cf48..975206646c6 100644 --- a/cpp/src/io/utilities/data_sink.cpp +++ b/cpp/src/io/utilities/data_sink.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -44,7 +44,7 @@ class file_sink : public data_sink { if (cufile_integration::is_kvikio_enabled()) { cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath, "w"); - CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode {}.", + CUDF_LOG_INFO("Writing a file using kvikIO, with compatibility mode %s.", _kvikio_file.is_compat_mode_preferred() ? "on" : "off"); } else { _cufile_out = detail::make_cufile_output(filepath); diff --git a/cpp/src/io/utilities/datasource.cpp b/cpp/src/io/utilities/datasource.cpp index 38dedcc2627..87b3c6facdf 100644 --- a/cpp/src/io/utilities/datasource.cpp +++ b/cpp/src/io/utilities/datasource.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -55,7 +55,7 @@ class file_source : public datasource { if (cufile_integration::is_kvikio_enabled()) { cufile_integration::set_up_kvikio(); _kvikio_file = kvikio::FileHandle(filepath); - CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode {}.", + CUDF_LOG_INFO("Reading a file using kvikIO, with compatibility mode %s.", _kvikio_file.is_compat_mode_preferred() ? "on" : "off"); } else { _cufile_in = detail::make_cufile_input(filepath); @@ -230,7 +230,7 @@ class memory_mapped_source : public file_source { { if (_map_addr != nullptr) { auto const result = munmap(_map_addr, _map_size); - if (result != 0) { CUDF_LOG_WARN("munmap failed with {}", result); } + if (result != 0) { CUDF_LOG_WARN("munmap failed with %d", result); } _map_addr = nullptr; } } diff --git a/cpp/src/io/utilities/getenv_or.hpp b/cpp/src/io/utilities/getenv_or.hpp index b9613428418..acfd2221797 100644 --- a/cpp/src/io/utilities/getenv_or.hpp +++ b/cpp/src/io/utilities/getenv_or.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -32,10 +32,17 @@ T getenv_or(std::string_view env_var_name, T default_val) { auto const env_val = std::getenv(env_var_name.data()); if (env_val != nullptr) { - CUDF_LOG_INFO("Environment variable {} read as {}", env_var_name, env_val); + CUDF_LOG_INFO("Environment variable %.*s read as %s", + static_cast(env_var_name.length()), + env_var_name.data(), + env_val); } else { - CUDF_LOG_INFO( - "Environment variable {} is not set, using default value {}", env_var_name, default_val); + std::stringstream ss; + ss << default_val; + CUDF_LOG_INFO("Environment variable %.*s is not set, using default value %s", + static_cast(env_var_name.length()), + env_var_name.data(), + ss.str()); } if (env_val == nullptr) { return default_val; } diff --git a/cpp/src/utilities/host_memory.cpp b/cpp/src/utilities/host_memory.cpp index 4196523d211..73c4567d3a4 100644 --- a/cpp/src/utilities/host_memory.cpp +++ b/cpp/src/utilities/host_memory.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -46,7 +46,7 @@ class fixed_pinned_pool_memory_resource { pool_size_{rmm::align_up(size, rmm::CUDA_ALLOCATION_ALIGNMENT)}, pool_{new host_pooled_mr(upstream_mr_, pool_size_, pool_size_)} { - CUDF_LOG_INFO("Pinned pool size = {}", pool_size_); + CUDF_LOG_INFO("Pinned pool size = %zu", pool_size_); // Allocate full size from the pinned pool to figure out the beginning and end address pool_begin_ = pool_->allocate_async(pool_size_, stream_); diff --git a/cpp/src/utilities/stream_pool.cpp b/cpp/src/utilities/stream_pool.cpp index b0f2d8c0637..80364885980 100644 --- a/cpp/src/utilities/stream_pool.cpp +++ b/cpp/src/utilities/stream_pool.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -129,7 +129,8 @@ class rmm_cuda_stream_pool : public cuda_stream_pool { std::vector get_streams(std::size_t count) override { if (count > STREAM_POOL_SIZE) { - CUDF_LOG_WARN("get_streams called with count ({}) > pool size ({})", count, STREAM_POOL_SIZE); + CUDF_LOG_WARN( + "get_streams called with count (%zu) > pool size (%zu)", count, STREAM_POOL_SIZE); } auto streams = std::vector(); for (uint32_t i = 0; i < count; i++) { From 782e2a7939afb989169510bd072ddb7ed59d58e3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Mon, 6 Jan 2025 11:03:31 -0800 Subject: [PATCH 10/19] Enable binaryop build without relying on relaxed constexpr (#17598) Contributes to #7795 This PR updates `binaryop` to build without depending on the relaxed constexpr build option. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17598 --- cpp/include/cudf/fixed_point/fixed_point.hpp | 44 ++++++++++++++------ cpp/src/binaryop/compiled/binary_ops.cuh | 40 +++++++++--------- 2 files changed, 52 insertions(+), 32 deletions(-) diff --git a/cpp/include/cudf/fixed_point/fixed_point.hpp b/cpp/include/cudf/fixed_point/fixed_point.hpp index ea2f5d4b6ca..5edbb322231 100644 --- a/cpp/include/cudf/fixed_point/fixed_point.hpp +++ b/cpp/include/cudf/fixed_point/fixed_point.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -60,7 +60,7 @@ enum class Radix : int32_t { BASE_2 = 2, BASE_10 = 10 }; * @return `true` if the type is supported by `fixed_point` implementation */ template -constexpr inline auto is_supported_representation_type() +CUDF_HOST_DEVICE constexpr inline auto is_supported_representation_type() { return cuda::std::is_same_v || // cuda::std::is_same_v || // @@ -72,6 +72,24 @@ constexpr inline auto is_supported_representation_type() // Helper functions for `fixed_point` type namespace detail { +/** + * @brief Returns the smaller of the given scales + * + * @param a The left-hand side value to compare + * @param b The right-hand side value to compare + * @return The smaller of the given scales + */ +CUDF_HOST_DEVICE constexpr inline scale_type min(scale_type const& a, scale_type const& b) +{ + // TODO This is a temporary workaround because is not self-contained when + // built with NVRTC 11.8. Replace this with cuda::std::min once the underlying issue is resolved. 
+#ifdef __CUDA_ARCH__ + return scale_type{min(static_cast(a), static_cast(b))}; +#else + return std::min(a, b); +#endif +} + /** * @brief A function for integer exponentiation by squaring. * @@ -267,12 +285,12 @@ class fixed_point { * @return The `fixed_point` number in base 10 (aka human readable format) */ template >* = nullptr> - explicit constexpr operator U() const + CUDF_HOST_DEVICE explicit constexpr operator U() const { // Cast to the larger of the two types (of U and Rep) before converting to Rep because in // certain cases casting to U before shifting will result in integer overflow (i.e. if U = // int32_t, Rep = int64_t and _value > 2 billion) - auto const value = std::common_type_t(_value); + auto const value = cuda::std::common_type_t(_value); return static_cast(detail::shift(value, scale_type{-_scale})); } @@ -669,7 +687,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator+(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const sum = lhs.rescaled(scale)._value + rhs.rescaled(scale)._value; #if defined(__CUDACC_DEBUG__) @@ -687,7 +705,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator-(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto const diff = lhs.rescaled(scale)._value - rhs.rescaled(scale)._value; #if defined(__CUDACC_DEBUG__) @@ -735,7 +753,7 @@ template CUDF_HOST_DEVICE inline bool operator==(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value == rhs.rescaled(scale)._value; } @@ -744,7 +762,7 @@ template CUDF_HOST_DEVICE inline bool operator!=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value != rhs.rescaled(scale)._value; } @@ -753,7 +771,7 @@ template CUDF_HOST_DEVICE inline bool operator<=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value <= rhs.rescaled(scale)._value; } @@ -762,7 +780,7 @@ template CUDF_HOST_DEVICE inline bool operator>=(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value >= rhs.rescaled(scale)._value; } @@ -771,7 +789,7 @@ template CUDF_HOST_DEVICE inline bool operator<(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value < rhs.rescaled(scale)._value; } @@ -780,7 +798,7 @@ template CUDF_HOST_DEVICE inline bool operator>(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); return lhs.rescaled(scale)._value > rhs.rescaled(scale)._value; } @@ -789,7 +807,7 @@ template CUDF_HOST_DEVICE inline fixed_point operator%(fixed_point const& lhs, fixed_point const& rhs) { - auto const scale = std::min(lhs._scale, rhs._scale); + auto const scale = detail::min(lhs._scale, rhs._scale); auto 
const remainder = lhs.rescaled(scale)._value % rhs.rescaled(scale)._value; return fixed_point{scaled_integer{remainder, scale}}; } diff --git a/cpp/src/binaryop/compiled/binary_ops.cuh b/cpp/src/binaryop/compiled/binary_ops.cuh index 0e31a0b6cf5..2f255e7a07c 100644 --- a/cpp/src/binaryop/compiled/binary_ops.cuh +++ b/cpp/src/binaryop/compiled/binary_ops.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * Copyright (c) 2021-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -27,6 +27,8 @@ #include #include +#include + namespace cudf { namespace binops { namespace compiled { @@ -51,7 +53,7 @@ struct type_casted_accessor { { if constexpr (column_device_view::has_element_accessor()) { auto const element = col.element(is_scalar ? 0 : i); - if constexpr (std::is_convertible_v) { + if constexpr (cuda::std::is_convertible_v) { return static_cast(element); } else if constexpr (is_fixed_point() && cuda::std::is_floating_point_v) { return convert_fixed_to_floating(element); @@ -75,7 +77,7 @@ struct typed_casted_writer { FromType val) const { if constexpr (mutable_column_device_view::has_element_accessor() and - std::is_constructible_v) { + cuda::std::is_constructible_v) { col.element(i) = static_cast(val); } else if constexpr (is_fixed_point()) { auto const scale = numeric::scale_type{col.type().scale()}; @@ -109,18 +111,18 @@ struct ops_wrapper { template __device__ void operator()(size_type i) { - if constexpr (std::is_invocable_v) { + if constexpr (cuda::std::is_invocable_v) { TypeCommon x = type_dispatcher(lhs.type(), type_casted_accessor{}, i, lhs, is_lhs_scalar); TypeCommon y = type_dispatcher(rhs.type(), type_casted_accessor{}, i, rhs, is_rhs_scalar); auto result = [&]() { - if constexpr (std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v) { + if constexpr (cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v) { bool output_valid = false; auto result = BinaryOperator{}.template operator()( x, @@ -134,7 +136,7 @@ struct ops_wrapper { return BinaryOperator{}.template operator()(x, y); } // To suppress nvcc warning - return std::invoke_result_t{}; + return cuda::std::invoke_result_t{}; }(); if constexpr (is_bool_result()) out.element(i) = result; @@ -161,16 +163,16 @@ struct ops2_wrapper { __device__ void operator()(size_type i) { if constexpr (!has_common_type_v and - std::is_invocable_v) { + cuda::std::is_invocable_v) { TypeLhs x = lhs.element(is_lhs_scalar ? 0 : i); TypeRhs y = rhs.element(is_rhs_scalar ? 
0 : i); auto result = [&]() { - if constexpr (std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v or - std::is_same_v) { + if constexpr (cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v or + cuda::std::is_same_v) { bool output_valid = false; auto result = BinaryOperator{}.template operator()( x, @@ -184,7 +186,7 @@ struct ops2_wrapper { return BinaryOperator{}.template operator()(x, y); } // To suppress nvcc warning - return std::invoke_result_t{}; + return cuda::std::invoke_result_t{}; }(); if constexpr (is_bool_result()) out.element(i) = result; From 45a73291d4b9aa9f668405549ecce6e5df29eb7d Mon Sep 17 00:00:00 2001 From: James Lamb Date: Mon, 6 Jan 2025 14:29:59 -0600 Subject: [PATCH 11/19] remove find_package(Python) in libcudf build (#17683) Nothing in `libcudf`'s CMake should need a Python interpreter or linking to Python components. This proposes removing the `find(Python)` there, to simplify that build: https://github.com/rapidsai/cudf/blob/955b1f4566abccf920a022dc78a1e654acf0de16/python/libcudf/CMakeLists.txt#L37-L38 Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17683 --- .github/CODEOWNERS | 7 +++---- python/libcudf/CMakeLists.txt | 5 +---- 2 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 5e2f46714d9..e0b315f34fc 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -8,10 +8,9 @@ notebooks/ @rapidsai/cudf-python-codeowners python/dask_cudf/ @rapidsai/cudf-dask-codeowners #cmake code owners -cpp/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -cpp/libcudf_kafka/CMakeLists.txt @rapidsai/cudf-cmake-codeowners -**/cmake/ @rapidsai/cudf-cmake-codeowners -*.cmake @rapidsai/cudf-cmake-codeowners +CMakeLists.txt @rapidsai/cudf-cmake-codeowners +**/cmake/ @rapidsai/cudf-cmake-codeowners +*.cmake @rapidsai/cudf-cmake-codeowners #java code owners java/ @rapidsai/cudf-java-codeowners diff --git a/python/libcudf/CMakeLists.txt b/python/libcudf/CMakeLists.txt index 5f9a04d3cee..259492b98d1 100644 --- a/python/libcudf/CMakeLists.txt +++ b/python/libcudf/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -34,9 +34,6 @@ endif() unset(cudf_FOUND) -# Find Python early so that later commands can use it -find_package(Python 3.10 REQUIRED COMPONENTS Interpreter) - set(BUILD_TESTS OFF) set(BUILD_BENCHMARKS OFF) set(CUDF_BUILD_TESTUTIL OFF) From b81d9e17fbffbb912e0128148f556bf7af41b6ab Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 6 Jan 2025 14:25:05 -0800 Subject: [PATCH 12/19] Fix cudf.polars sum of empty not equalling zero (#17685) closes #17681 (We have a similar carve-out in cudf classic due to `sum([]) == 0` in Python) Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) URL: https://github.com/rapidsai/cudf/pull/17685 --- .../cudf_polars/dsl/expressions/aggregation.py | 14 +++++++++++++- python/cudf_polars/tests/expressions/test_agg.py | 8 +++++++- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py index b88b109a975..92f39abe71e 100644 --- a/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py +++ b/python/cudf_polars/cudf_polars/dsl/expressions/aggregation.py @@ -91,7 +91,7 @@ def __init__( op = partial(self._reduce, request=req) elif name in {"min", "max"}: op = partial(op, propagate_nans=options) - elif name in {"count", "first", "last"}: + elif name in {"count", "sum", "first", "last"}: pass else: raise NotImplementedError( @@ -180,6 +180,18 @@ def _count(self, column: Column) -> Column: ) ) + def _sum(self, column: Column) -> Column: + if column.obj.size() == 0: + return Column( + plc.Column.from_scalar( + plc.interop.from_arrow( + pa.scalar(0, type=plc.interop.to_arrow(self.dtype)) + ), + 1, + ) + ) + return self._reduce(column, request=plc.aggregation.sum()) + def _min(self, column: Column, *, propagate_nans: bool) -> Column: if propagate_nans and column.nan_count > 0: return Column( diff --git a/python/cudf_polars/tests/expressions/test_agg.py b/python/cudf_polars/tests/expressions/test_agg.py index 86cb2352dcc..15ad845ea78 100644 --- a/python/cudf_polars/tests/expressions/test_agg.py +++ b/python/cudf_polars/tests/expressions/test_agg.py @@ -1,4 +1,4 @@ -# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. +# SPDX-FileCopyrightText: Copyright (c) 2024-2025, NVIDIA CORPORATION & AFFILIATES. # SPDX-License-Identifier: Apache-2.0 from __future__ import annotations @@ -148,3 +148,9 @@ def test_agg_singleton(op): q = df.select(op(pl.col("a"))) assert_gpu_result_equal(q) + + +def test_sum_empty_zero(): + df = pl.LazyFrame({"a": pl.Series(values=[], dtype=pl.Int32())}) + q = df.select(pl.col("a").sum()) + assert_gpu_result_equal(q) From 71827451fa459460894a1e6a34217e815938a562 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 6 Jan 2025 15:30:52 -0800 Subject: [PATCH 13/19] Set default logger level to warn (#17684) This PR leverages rapidsai/rapids-logger#8 to set the default logging level to warn and updates an associated test. This PR also makes a CI script change to facilitate testing by ensuring that `RAPIDS_PY_CUDA_SUFFIX` is always defined before we would insert any download commands of files from linked CI. 
Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) URL: https://github.com/rapidsai/cudf/pull/17684 --- ci/build_wheel_libcudf.sh | 6 +++--- cpp/CMakeLists.txt | 2 +- cpp/tests/utilities_tests/logger_tests.cpp | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index af49942c8cd..d80e4fef0d0 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -1,11 +1,13 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -euo pipefail package_name="libcudf" package_dir="python/libcudf" +RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" + rapids-logger "Generating build requirements" rapids-dependency-file-generator \ @@ -28,8 +30,6 @@ export PIP_NO_BUILD_ISOLATION=0 export SKBUILD_CMAKE_ARGS="-DUSE_NVCOMP_RUNTIME_WHEEL=ON" ./ci/build_wheel.sh "${package_name}" "${package_dir}" -RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" - mkdir -p ${package_dir}/final_dist python -m auditwheel repair \ --exclude libnvcomp.so.4 \ diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index cb814aa8c0f..af92b7ceaf5 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -276,7 +276,7 @@ rapids_cpm_init() include(${rapids-cmake-dir}/cpm/rapids_logger.cmake) rapids_cpm_rapids_logger() -rapids_make_logger(cudf EXPORT_SET cudf-exports) +rapids_make_logger(cudf EXPORT_SET cudf-exports LOGGER_DEFAULT_LEVEL WARN) # find jitify include(cmake/thirdparty/get_jitify.cmake) diff --git a/cpp/tests/utilities_tests/logger_tests.cpp b/cpp/tests/utilities_tests/logger_tests.cpp index 58396115a54..b5d20325b75 100644 --- a/cpp/tests/utilities_tests/logger_tests.cpp +++ b/cpp/tests/utilities_tests/logger_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -55,7 +55,7 @@ TEST_F(LoggerTest, DefaultLevel)
   cudf::default_logger().warn("warn");
   cudf::default_logger().error("error");
   cudf::default_logger().critical("critical");
-  ASSERT_EQ(this->sink_content(), "info\nwarn\nerror\ncritical\n");
+  ASSERT_EQ(this->sink_content(), "warn\nerror\ncritical\n");
 }
 
 TEST_F(LoggerTest, CustomLevel)

From a0487be669326175982c8bfcdab4d61184c88e27 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Mon, 6 Jan 2025 17:45:33 -0800
Subject: [PATCH 14/19] Move unnecessary utilities from cudf._lib.scalar (#17636)

In preparation for transitioning `DeviceScalar` to pylibcudf's `Scalar`,
this PR moves `_is_null_host_scalar` (a pure Python function) to
`cudf.utils.utils` and removes `as_device_scalar` in favor of going through
`cudf.Scalar`.

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/17636
---
 python/cudf/cudf/_lib/scalar.pyx            | 23 ---------------------
 python/cudf/cudf/core/column/categorical.py |  2 +-
 python/cudf/cudf/core/column/column.py      | 15 ++++++--------
 python/cudf/cudf/core/column/lists.py       |  2 +-
 python/cudf/cudf/core/column/numerical.py   |  4 ++--
 python/cudf/cudf/core/dataframe.py          |  8 +++++--
 python/cudf/cudf/core/scalar.py             |  4 ++--
 python/cudf/cudf/utils/dtypes.py            |  2 +-
 python/cudf/cudf/utils/utils.py             |  9 ++++++++
 9 files changed, 28 insertions(+), 41 deletions(-)

diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx
index 40bd50acf16..fd6d0257940 100644
--- a/python/cudf/cudf/_lib/scalar.pyx
+++ b/python/cudf/cudf/_lib/scalar.pyx
@@ -260,26 +260,3 @@ cdef class DeviceScalar:
         self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[
             (cdtype_id)
         ]
-
-
-def as_device_scalar(val, dtype=None):
-    if isinstance(val, (cudf.Scalar, DeviceScalar)):
-        if dtype == val.dtype or dtype is None:
-            if isinstance(val, DeviceScalar):
-                return val
-            else:
-                return val.device_value
-        else:
-            raise TypeError("Can't update dtype of existing GPU scalar")
-    else:
-        return cudf.Scalar(val, dtype=dtype).device_value
-
-
-def _is_null_host_scalar(slr):
-    if cudf.utils.utils.is_na_like(slr):
-        return True
-    elif (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) or \
-        slr is pd.NaT:
-        return True
-    else:
-        return False
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index b10b8dfe207..d705b4d4c21 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -621,7 +621,7 @@ def ordered(self) -> bool:
     def __setitem__(self, key, value):
         if cudf.api.types.is_scalar(
             value
-        ) and cudf._lib.scalar._is_null_host_scalar(value):
+        ) and cudf.utils.utils._is_null_host_scalar(value):
             to_add_categories = 0
         else:
             if cudf.api.types.is_scalar(value):
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 31efe267c96..24b657f1c32 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -25,7 +25,6 @@
 import cudf
 from cudf import _lib as libcudf
 from cudf._lib.column import Column
-from cudf._lib.scalar import as_device_scalar
 from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype
 from cudf.api.types import (
     _is_non_decimal_numeric_dtype,
@@ -71,7 +70,7 @@
     min_signed_type,
     min_unsigned_type,
 )
-from cudf.utils.utils import _array_ufunc, mask_dtype
+from cudf.utils.utils import _array_ufunc, _is_null_host_scalar,
mask_dtype if TYPE_CHECKING: import builtins @@ -777,9 +776,7 @@ def fillna( if not self.has_nulls(include_nan=True): return self.copy() elif method is None: - if is_scalar(fill_value) and libcudf.scalar._is_null_host_scalar( - fill_value - ): + if is_scalar(fill_value) and _is_null_host_scalar(fill_value): return self.copy() else: fill_value = self._validate_fillna_value(fill_value) @@ -1984,12 +1981,12 @@ def as_column( column = Column.from_pylibcudf( plc.filling.sequence( len(arbitrary), - as_device_scalar( + cudf.Scalar( arbitrary.start, dtype=np.dtype(np.int64) - ).c_value, - as_device_scalar( + ).device_value.c_value, + cudf.Scalar( arbitrary.step, dtype=np.dtype(np.int64) - ).c_value, + ).device_value.c_value, ) ) if cudf.get_option("default_integer_bitwidth") and dtype is None: diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 3d9440cdf21..6283e498842 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -236,7 +236,7 @@ def from_sequences( # Build Data, Mask & Offsets for data in arbitrary: - if cudf._lib.scalar._is_null_host_scalar(data): + if cudf.utils.utils._is_null_host_scalar(data): mask_col.append(False) offset_vals.append(offset) else: diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index 4405e153b0c..8fe5299fcdd 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -151,7 +151,7 @@ def __setitem__(self, key: Any, value: Any): cudf.Scalar( value, dtype=self.dtype - if cudf._lib.scalar._is_null_host_scalar(value) + if cudf.utils.utils._is_null_host_scalar(value) else None, ) if is_scalar(value) @@ -789,7 +789,7 @@ def _normalize_find_and_replace_input( ) # Scalar case if len(col_to_normalize) == 1: - if cudf._lib.scalar._is_null_host_scalar(col_to_normalize[0]): + if cudf.utils.utils._is_null_host_scalar(col_to_normalize[0]): return normalized_column.astype(input_column_dtype) if np.isinf(col_to_normalize[0]): return normalized_column diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 3334b57ce1b..b2121511a14 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -92,7 +92,11 @@ min_signed_type, ) from cudf.utils.performance_tracking import _performance_tracking -from cudf.utils.utils import GetAttrGetItemMixin, _external_only_api +from cudf.utils.utils import ( + GetAttrGetItemMixin, + _external_only_api, + _is_null_host_scalar, +) if TYPE_CHECKING: from cudf._typing import ColumnLike, Dtype, NotImplementedType @@ -3371,7 +3375,7 @@ def _insert(self, loc, name, value, nan_as_null=None, ignore_index=True): if isinstance(value, (np.ndarray, cupy.ndarray)): dtype = value.dtype value = value.item() - if libcudf.scalar._is_null_host_scalar(value): + if _is_null_host_scalar(value): dtype = "str" value = as_column( value, diff --git a/python/cudf/cudf/core/scalar.py b/python/cudf/cudf/core/scalar.py index 80dd0921f9c..7d246960cc9 100644 --- a/python/cudf/cudf/core/scalar.py +++ b/python/cudf/cudf/core/scalar.py @@ -178,13 +178,13 @@ def dtype(self): def is_valid(self): if not self._is_host_value_current: self._device_value_to_host() - return not cudf._lib.scalar._is_null_host_scalar(self._host_value) + return not cudf.utils.utils._is_null_host_scalar(self._host_value) def _device_value_to_host(self): self._host_value = self._device_value._to_host_scalar() def _preprocess_host_value(self, value, dtype): - valid = 
not cudf._lib.scalar._is_null_host_scalar(value) + valid = not cudf.utils.utils._is_null_host_scalar(value) if isinstance(value, list): if dtype is not None: diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index ca8f9cac2d0..31a8f4de3b3 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -198,7 +198,7 @@ def to_cudf_compatible_scalar(val, dtype=None): If `val` is None, returns None. """ - if cudf._lib.scalar._is_null_host_scalar(val) or isinstance( + if cudf.utils.utils._is_null_host_scalar(val) or isinstance( val, cudf.Scalar ): return val diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index c83c1cbe895..0adaaa60654 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -341,6 +341,15 @@ def is_na_like(obj): return obj is None or obj is cudf.NA or obj is cudf.NaT +def _is_null_host_scalar(slr) -> bool: + # slr is NA like or NaT like + return ( + is_na_like(slr) + or (isinstance(slr, (np.datetime64, np.timedelta64)) and np.isnat(slr)) + or slr is pd.NaT + ) + + def _warn_no_dask_cudf(fn): @functools.wraps(fn) def wrapper(self): From f3081229379a7d92d7193a37a71bc43ad7a3d0fa Mon Sep 17 00:00:00 2001 From: Jason Lowe Date: Tue, 7 Jan 2025 10:20:27 -0600 Subject: [PATCH 15/19] Java Parquet reads via multiple host buffers (#17673) Adds a custom cuio datasource that can provide file data via multiple host memory buffers. This allows data that arrives from multiple threads in multiple buffers to be read directly rather than requiring the buffers to be concatenated into a single host memory buffer before reading. Authors: - Jason Lowe (https://github.com/jlowe) Approvers: - Alessandro Bellina (https://github.com/abellina) - Robert (Bobby) Evans (https://github.com/revans2) URL: https://github.com/rapidsai/cudf/pull/17673 --- .../ai/rapids/cudf/ParquetChunkedReader.java | 59 +++++-- java/src/main/java/ai/rapids/cudf/Table.java | 44 +++++- java/src/main/native/CMakeLists.txt | 5 +- .../include/multi_host_buffer_source.hpp | 57 +++++++ java/src/main/native/src/ChunkedReaderJni.cpp | 58 ++++--- java/src/main/native/src/TableJni.cpp | 26 +-- .../native/src/multi_host_buffer_source.cpp | 148 ++++++++++++++++++ .../test/java/ai/rapids/cudf/TableTest.java | 41 ++++- 8 files changed, 390 insertions(+), 48 deletions(-) create mode 100644 java/src/main/native/include/multi_host_buffer_source.hpp create mode 100644 java/src/main/native/src/multi_host_buffer_source.cpp diff --git a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java index 53af52eff07..5e544e92a77 100644 --- a/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java +++ b/java/src/main/java/ai/rapids/cudf/ParquetChunkedReader.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -62,12 +62,13 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, File f * @param filePath Full path of the input Parquet file to read. 
*/ public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, File filePath) { - handle = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - filePath.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId()); - + long[] handles = create(chunkSizeByteLimit, passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + filePath.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -100,12 +101,41 @@ public ParquetChunkedReader(long chunkSizeByteLimit, ParquetOptions opts, HostMe public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, ParquetOptions opts, HostMemoryBuffer buffer, long offset, long len) { - handle = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, - buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId()); + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; + if (handle == 0) { + throw new IllegalStateException("Cannot create native chunked Parquet reader object."); + } + multiHostBufferSourceHandle = handles[1]; + } + /** + * Construct the reader instance from a read limit and data in host memory buffers. + * + * @param chunkSizeByteLimit Limit on total number of bytes to be returned per read, + * or 0 if there is no limit. + * @param passReadLimit Limit on the amount of memory used for reading and decompressing data or + * 0 if there is no limit + * @param opts The options for Parquet reading. + * @param buffers Array of buffers containing the file data. The buffers are logically + * concatenated to construct the file being read. + */ + public ParquetChunkedReader(long chunkSizeByteLimit, long passReadLimit, + ParquetOptions opts, HostMemoryBuffer... buffers) { + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } + long[] handles = create(chunkSizeByteLimit,passReadLimit, opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), null, + addrsSizes, opts.timeUnit().typeId.getNativeId()); + handle = handles[0]; if (handle == 0) { throw new IllegalStateException("Cannot create native chunked Parquet reader object."); } + multiHostBufferSourceHandle = handles[1]; } /** @@ -181,6 +211,10 @@ public void close() { DataSourceHelper.destroyWrapperDataSource(dataSourceHandle); dataSourceHandle = 0; } + if (multiHostBufferSourceHandle != 0) { + destroyMultiHostBufferSource(multiHostBufferSourceHandle); + multiHostBufferSourceHandle = 0; + } } @@ -196,6 +230,8 @@ public void close() { private long dataSourceHandle = 0; + private long multiHostBufferSourceHandle = 0; + /** * Create a native chunked Parquet reader object on heap and return its memory address. * @@ -206,13 +242,12 @@ public void close() { * @param filterColumnNames Name of the columns to read, or an empty array if we want to read all. * @param binaryToString Whether to convert the corresponding column to String if it is binary. 
* @param filePath Full path of the file to read, or given as null if reading from a buffer. - * @param bufferAddrs The address of a buffer to read from, or 0 if we are not using that buffer. - * @param length The length of the buffer to read from. + * @param bufferAddrsSizes The address and size pairs of buffers to read from, or null if we are not using buffers. * @param timeUnit Return type of time unit for timestamps. */ - private static native long create(long chunkSizeByteLimit, long passReadLimit, - String[] filterColumnNames, boolean[] binaryToString, - String filePath, long bufferAddrs, long length, int timeUnit); + private static native long[] create(long chunkSizeByteLimit, long passReadLimit, + String[] filterColumnNames, boolean[] binaryToString, + String filePath, long[] bufferAddrsSizes, int timeUnit); private static native long createWithDataSource(long chunkedSizeByteLimit, String[] filterColumnNames, boolean[] binaryToString, int timeUnit, long dataSourceHandle); @@ -222,4 +257,6 @@ private static native long createWithDataSource(long chunkedSizeByteLimit, private static native long[] readChunk(long handle); private static native void close(long handle); + + private static native void destroyMultiHostBufferSource(long handle); } diff --git a/java/src/main/java/ai/rapids/cudf/Table.java b/java/src/main/java/ai/rapids/cudf/Table.java index b01ce31b1f3..298f2cff6f3 100644 --- a/java/src/main/java/ai/rapids/cudf/Table.java +++ b/java/src/main/java/ai/rapids/cudf/Table.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -313,12 +313,11 @@ private static native long readAndInferJSON(long address, long length, * all of them * @param binaryToString whether to convert this column to String if binary * @param filePath the path of the file to read, or null if no path should be read. - * @param address the address of the buffer to read from or 0 if we should not. - * @param length the length of the buffer to read from. + * @param addrsAndSizes the address and size pairs for every buffer or null for no buffers. * @param timeUnit return type of TimeStamp in units */ private static native long[] readParquet(String[] filterColumnNames, boolean[] binaryToString, String filePath, - long address, long length, int timeUnit) throws CudfException; + long[] addrsAndSizes, int timeUnit) throws CudfException; private static native long[] readParquetFromDataSource(String[] filterColumnNames, boolean[] binaryToString, int timeUnit, @@ -1357,7 +1356,7 @@ public static Table readParquet(File path) { */ public static Table readParquet(ParquetOptions opts, File path) { return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - path.getAbsolutePath(), 0, 0, opts.timeUnit().typeId.getNativeId())); + path.getAbsolutePath(), null, opts.timeUnit().typeId.getNativeId())); } /** @@ -1402,6 +1401,14 @@ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, } } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffer raw parquet formatted bytes. + * @param offset the starting offset into buffer. + * @param len the number of bytes to parse. + * @return the data parsed as a table on the GPU. 
+ */ public static Table readParquet(ParquetOptions opts, byte[] buffer, long offset, long len) { return readParquet(opts, buffer, offset, len, DefaultHostMemoryAllocator.get()); } @@ -1422,10 +1429,35 @@ public static Table readParquet(ParquetOptions opts, HostMemoryBuffer buffer, assert len > 0; assert len <= buffer.getLength() - offset; assert offset >= 0 && offset < buffer.length; + long[] addrsSizes = new long[]{ buffer.getAddress() + offset, len }; + return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); + } + + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param buffers Buffers containing the Parquet data. The buffers are logically concatenated + * in order to construct the file being read. + * @return the data parsed as a table on the GPU. + */ + public static Table readParquet(ParquetOptions opts, HostMemoryBuffer... buffers) { + assert buffers.length > 0; + long[] addrsSizes = new long[buffers.length * 2]; + for (int i = 0; i < buffers.length; i++) { + addrsSizes[i * 2] = buffers[i].getAddress(); + addrsSizes[(i * 2) + 1] = buffers[i].getLength(); + } return new Table(readParquet(opts.getIncludeColumnNames(), opts.getReadBinaryAsString(), - null, buffer.getAddress() + offset, len, opts.timeUnit().typeId.getNativeId())); + null, addrsSizes, opts.timeUnit().typeId.getNativeId())); } + /** + * Read parquet formatted data. + * @param opts various parquet parsing options. + * @param ds custom datasource to provide the Parquet file data + * @return the data parsed as a table on the GPU. + */ public static Table readParquet(ParquetOptions opts, DataSource ds) { long dataSourceHandle = DataSourceHelper.createWrapperDataSource(ds); try { diff --git a/java/src/main/native/CMakeLists.txt b/java/src/main/native/CMakeLists.txt index 9ff43feeac6..bd1714aa476 100644 --- a/java/src/main/native/CMakeLists.txt +++ b/java/src/main/native/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -156,8 +156,9 @@ add_library( src/ScalarJni.cpp src/TableJni.cpp src/aggregation128_utils.cu - src/maps_column_view.cu src/check_nvcomp_output_sizes.cu + src/maps_column_view.cu + src/multi_host_buffer_source.cpp ) # Disable NVTX if necessary diff --git a/java/src/main/native/include/multi_host_buffer_source.hpp b/java/src/main/native/include/multi_host_buffer_source.hpp new file mode 100644 index 00000000000..2aedb2321e4 --- /dev/null +++ b/java/src/main/native/include/multi_host_buffer_source.hpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#pragma once
+
+#include "jni_utils.hpp"
+
+#include <cudf/io/datasource.hpp>
+
+#include <vector>
+
+namespace cudf {
+namespace jni {
+
+/**
+ * @brief A custom datasource providing data from an array of host memory buffers.
+ */
+class multi_host_buffer_source : public cudf::io::datasource {
+  std::vector<uint8_t const*> addrs_;
+  std::vector<size_t> offsets_;
+
+  size_t locate_offset_index(size_t offset);
+
+ public:
+  explicit multi_host_buffer_source(native_jlongArray const& addrs_sizes);
+  std::unique_ptr<buffer> host_read(size_t offset, size_t size) override;
+  size_t host_read(size_t offset, size_t size, uint8_t* dst) override;
+  bool supports_device_read() const override { return true; }
+  bool is_device_read_preferred(size_t size) const override { return true; }
+  std::unique_ptr<buffer> device_read(size_t offset,
+                                      size_t size,
+                                      rmm::cuda_stream_view stream) override;
+  size_t device_read(size_t offset,
+                     size_t size,
+                     uint8_t* dst,
+                     rmm::cuda_stream_view stream) override;
+  std::future<size_t> device_read_async(size_t offset,
+                                        size_t size,
+                                        uint8_t* dst,
+                                        rmm::cuda_stream_view stream) override;
+  size_t size() const override { return offsets_.back(); }
+};
+
+}  // namespace jni
+}  // namespace cudf
diff --git a/java/src/main/native/src/ChunkedReaderJni.cpp b/java/src/main/native/src/ChunkedReaderJni.cpp
index cf04a87262f..4967e0b2b04 100644
--- a/java/src/main/native/src/ChunkedReaderJni.cpp
+++ b/java/src/main/native/src/ChunkedReaderJni.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -16,6 +16,7 @@
 
 #include "cudf_jni_apis.hpp"
 #include "jni_utils.hpp"
+#include "multi_host_buffer_source.hpp"
 
 #include
 #include
 
@@ -36,7 +37,7 @@ extern "C" {
 // This function should take all the parameters that `Table.readParquet` takes,
 // plus one more parameter `long chunkSizeByteLimit`.
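// Editorial note: the Java side consumes the two returned handles roughly as
// follows (names from this patch): handles[0] is the chunked_parquet_reader
// pointer later passed to readChunk()/close(), and handles[1], when non-zero,
// is the multi_host_buffer_source pointer that must be released via
// destroyMultiHostBufferSource() once the reader is closed.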
-JNIEXPORT jlong JNICALL +JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jclass, jlong chunk_read_limit, @@ -44,27 +45,26 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inp_file_path, - jlong buffer, - jlong buffer_length, + jlongArray addrs_sizes, jint unit) { - JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", 0); + JNI_NULL_CHECK(env, j_col_binary_read, "Null col_binary_read", nullptr); bool read_buffer = true; - if (buffer == 0) { - JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", 0); + if (addrs_sizes == nullptr) { + JNI_NULL_CHECK(env, inp_file_path, "Input file or buffer must be supplied", nullptr); read_buffer = false; } else if (inp_file_path != nullptr) { - JNI_THROW_NEW( - env, cudf::jni::ILLEGAL_ARG_CLASS, "Cannot pass in both a buffer and an inp_file_path", 0); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", 0); + JNI_THROW_NEW(env, + cudf::jni::ILLEGAL_ARG_CLASS, + "Cannot pass in both buffers and an inp_file_path", + nullptr); } try { cudf::jni::auto_set_device(env); cudf::jni::native_jstring filename(env, inp_file_path); if (!read_buffer && filename.is_empty()) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", 0); + JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "inp_file_path cannot be empty", nullptr); } cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); @@ -75,9 +75,15 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); (void)n_col_binary_read; - auto const source = read_buffer ? 
cudf::io::source_info(reinterpret_cast(buffer), - static_cast(buffer_length)) - : cudf::io::source_info(filename.get()); + cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_sizes); + std::unique_ptr multi_buffer_source; + cudf::io::source_info source; + if (read_buffer) { + multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes)); + source = cudf::io::source_info(multi_buffer_source.get()); + } else { + source = cudf::io::source_info(filename.get()); + } auto opts_builder = cudf::io::parquet_reader_options::builder(source); if (n_filter_col_names.size() > 0) { @@ -86,13 +92,18 @@ Java_ai_rapids_cudf_ParquetChunkedReader_create(JNIEnv* env, auto const read_opts = opts_builder.convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - - return reinterpret_cast( + n_addrs_sizes.cancel(); + n_col_binary_read.cancel(); + auto reader_handle = reinterpret_cast( new cudf::io::chunked_parquet_reader(static_cast(chunk_read_limit), static_cast(pass_read_limit), read_opts)); + cudf::jni::native_jlongArray result(env, 2); + result[0] = reader_handle; + result[1] = cudf::jni::release_as_jlong(multi_buffer_source); + return result.get_jArray(); } - CATCH_STD(env, 0); + CATCH_STD(env, nullptr); } JNIEXPORT jlong JNICALL @@ -177,6 +188,17 @@ JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_close(JNIEnv* en CATCH_STD(env, ); } +JNIEXPORT void JNICALL Java_ai_rapids_cudf_ParquetChunkedReader_destroyMultiHostBufferSource( + JNIEnv* env, jclass, jlong handle) +{ + JNI_NULL_CHECK(env, handle, "handle is null", ); + + try { + delete reinterpret_cast(handle); + } + CATCH_STD(env, ); +} + // // Chunked ORC reader JNI // diff --git a/java/src/main/native/src/TableJni.cpp b/java/src/main/native/src/TableJni.cpp index ed35f35794d..a6c7ae9ba18 100644 --- a/java/src/main/native/src/TableJni.cpp +++ b/java/src/main/native/src/TableJni.cpp @@ -19,6 +19,7 @@ #include "jni_compiled_expr.hpp" #include "jni_utils.hpp" #include "jni_writer_data_sink.hpp" +#include "multi_host_buffer_source.hpp" #include #include @@ -2071,20 +2072,17 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, jobjectArray filter_col_names, jbooleanArray j_col_binary_read, jstring inputfilepath, - jlong buffer, - jlong buffer_length, + jlongArray addrs_and_sizes, jint unit) { JNI_NULL_CHECK(env, j_col_binary_read, "null col_binary_read", 0); bool read_buffer = true; - if (buffer == 0) { + if (addrs_and_sizes == nullptr) { JNI_NULL_CHECK(env, inputfilepath, "input file or buffer must be supplied", NULL); read_buffer = false; } else if (inputfilepath != NULL) { JNI_THROW_NEW( env, cudf::jni::ILLEGAL_ARG_CLASS, "cannot pass in both a buffer and an inputfilepath", NULL); - } else if (buffer_length <= 0) { - JNI_THROW_NEW(env, cudf::jni::ILLEGAL_ARG_CLASS, "An empty buffer is not supported", NULL); } try { @@ -2096,10 +2094,15 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, cudf::jni::native_jstringArray n_filter_col_names(env, filter_col_names); cudf::jni::native_jbooleanArray n_col_binary_read(env, j_col_binary_read); - - auto source = read_buffer ? 
cudf::io::source_info(reinterpret_cast(buffer), - static_cast(buffer_length)) - : cudf::io::source_info(filename.get()); + cudf::jni::native_jlongArray n_addrs_sizes(env, addrs_and_sizes); + std::unique_ptr multi_buffer_source; + cudf::io::source_info source; + if (read_buffer) { + multi_buffer_source.reset(new cudf::jni::multi_host_buffer_source(n_addrs_sizes)); + source = cudf::io::source_info(multi_buffer_source.get()); + } else { + source = cudf::io::source_info(filename.get()); + } auto builder = cudf::io::parquet_reader_options::builder(source); if (n_filter_col_names.size() > 0) { @@ -2110,7 +2113,10 @@ JNIEXPORT jlongArray JNICALL Java_ai_rapids_cudf_Table_readParquet(JNIEnv* env, builder.convert_strings_to_categories(false) .timestamp_type(cudf::data_type(static_cast(unit))) .build(); - return convert_table_for_return(env, cudf::io::read_parquet(opts).tbl); + auto tbl = cudf::io::read_parquet(opts).tbl; + n_col_binary_read.cancel(); + n_addrs_sizes.cancel(); + return convert_table_for_return(env, tbl); } CATCH_STD(env, NULL); } diff --git a/java/src/main/native/src/multi_host_buffer_source.cpp b/java/src/main/native/src/multi_host_buffer_source.cpp new file mode 100644 index 00000000000..c577fc680ba --- /dev/null +++ b/java/src/main/native/src/multi_host_buffer_source.cpp @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "multi_host_buffer_source.hpp"
+
+#include <algorithm>
+#include <cstring>
+#include <sstream>
+#include <vector>
+
+namespace cudf {
+namespace jni {
+
+multi_host_buffer_source::multi_host_buffer_source(native_jlongArray const& addrs_sizes)
+{
+  if (addrs_sizes.size() % 2 != 0) {
+    throw std::logic_error("addrs_sizes length not a multiple of 2");
+  }
+  auto count = addrs_sizes.size() / 2;
+  addrs_.reserve(count);
+  offsets_.reserve(count + 1);
+  size_t total_size = 0;
+  for (int i = 0; i < addrs_sizes.size(); i += 2) {
+    addrs_.push_back(reinterpret_cast<uint8_t const*>(addrs_sizes[i]));
+    offsets_.push_back(total_size);
+    total_size += addrs_sizes[i + 1];
+  }
+  offsets_.push_back(total_size);
+}
+
+size_t multi_host_buffer_source::locate_offset_index(size_t offset)
+{
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  auto start = offsets_.begin();
+  auto it    = std::upper_bound(start, offsets_.end(), offset);
+  return (it - start) - 1;
+}
+
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::host_read(size_t offset,
+                                                                                  size_t size)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  auto const end_offset = offset + size;
+  if (end_offset > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto next_offset  = offsets_[buffer_index + 1];
+  if (end_offset <= next_offset) {
+    // read range hits only a single buffer, so return a zero-copy view of the data
+    auto src = addrs_[buffer_index] + offset - offsets_[buffer_index];
+    return std::make_unique<non_owning_buffer>(src, size);
+  }
+  auto buf        = std::vector<uint8_t>(size);
+  auto bytes_read = host_read(offset, size, buf.data());
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected host read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<std::vector<uint8_t>>>(std::move(buf));
+}
+
+size_t multi_host_buffer_source::host_read(size_t offset, size_t size, uint8_t* dst)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); }
+  auto buffer_index = locate_offset_index(offset);
+  auto bytes_left   = size;
+  while (bytes_left > 0) {
+    auto next_offset   = offsets_[buffer_index + 1];
+    auto buffer_left   = next_offset - offset;
+    auto buffer_offset = offset - offsets_[buffer_index];
+    auto src           = addrs_[buffer_index] + buffer_offset;
+    auto copy_size     = std::min(buffer_left, bytes_left);
+    std::memcpy(dst, src, copy_size);
+    offset += copy_size;
+    dst += copy_size;
+    bytes_left -= copy_size;
+    ++buffer_index;
+  }
+  return size;
+}
+
+std::unique_ptr<cudf::io::datasource::buffer> multi_host_buffer_source::device_read(
+  size_t offset, size_t size, rmm::cuda_stream_view stream)
+{
+  rmm::device_buffer buf(size, stream);
+  auto dst        = static_cast<uint8_t*>(buf.data());
+  auto bytes_read = device_read(offset, size, dst, stream);
+  if (bytes_read != size) {
+    std::stringstream ss;
+    ss << "Expected device read of " << size << " found " << bytes_read;
+    throw std::logic_error(ss.str());
+  }
+  return std::make_unique<owning_buffer<rmm::device_buffer>>(std::move(buf));
+}
+
+size_t multi_host_buffer_source::device_read(size_t offset,
+                                             size_t size,
+                                             uint8_t* dst,
+                                             rmm::cuda_stream_view stream)
+{
+  if (size == 0) { return 0; }
+  if (offset < 0 || offset >= offsets_.back()) { throw std::runtime_error("bad offset"); }
+  if (offset + size > offsets_.back()) { throw std::runtime_error("read past end of file"); }
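+  // Editorial note: the loop below walks consecutive backing buffers starting
+  // at locate_offset_index(offset), copying min(bytes left in the current
+  // buffer, bytes still requested) per step with cudaMemcpyAsync on the
+  // caller's stream until `size` bytes have been transferred.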
auto buffer_index = locate_offset_index(offset); + auto bytes_left = size; + while (bytes_left > 0) { + auto next_offset = offsets_[buffer_index + 1]; + auto buffer_left = next_offset - offset; + auto buffer_offset = offset - offsets_[buffer_index]; + auto src = addrs_[buffer_index] + buffer_offset; + auto copy_size = std::min(buffer_left, bytes_left); + CUDF_CUDA_TRY(cudaMemcpyAsync(dst, src, copy_size, cudaMemcpyHostToDevice, stream.value())); + offset += copy_size; + dst += copy_size; + bytes_left -= copy_size; + ++buffer_index; + } + return size; +} + +std::future multi_host_buffer_source::device_read_async(size_t offset, + size_t size, + uint8_t* dst, + rmm::cuda_stream_view stream) +{ + std::promise p; + p.set_value(device_read(offset, size, dst, stream)); + return p.get_future(); +} + +} // namespace jni +} // namespace cudf diff --git a/java/src/test/java/ai/rapids/cudf/TableTest.java b/java/src/test/java/ai/rapids/cudf/TableTest.java index c7fcb1756b6..7eb32892bad 100644 --- a/java/src/test/java/ai/rapids/cudf/TableTest.java +++ b/java/src/test/java/ai/rapids/cudf/TableTest.java @@ -1,6 +1,6 @@ /* * - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -47,8 +47,11 @@ import java.math.BigInteger; import java.math.RoundingMode; import java.nio.ByteBuffer; +import java.nio.channels.FileChannel; +import java.nio.channels.SeekableByteChannel; import java.nio.charset.StandardCharsets; import java.nio.file.Files; +import java.nio.file.StandardOpenOption; import java.util.*; import java.util.function.Function; import java.util.stream.Collectors; @@ -1714,6 +1717,42 @@ void testChunkedReadParquet() { } } + @Test + void testChunkedReadParquetHostBuffers() throws Exception { + long size = TEST_PARQUET_FILE_CHUNKED_READ.length(); + java.nio.file.Path path = TEST_PARQUET_FILE_CHUNKED_READ.toPath(); + try (HostMemoryBuffer buf1 = HostMemoryBuffer.allocate(size / 2); + HostMemoryBuffer buf2 = HostMemoryBuffer.allocate(size - buf1.getLength())) { + try (SeekableByteChannel channel = Files.newByteChannel(path, StandardOpenOption.READ)) { + ByteBuffer bb1 = buf1.asByteBuffer(); + while (bb1.hasRemaining()) { + if (channel.read(bb1) == -1) { + throw new EOFException("error reading first buffer"); + } + } + ByteBuffer bb2 = buf2.asByteBuffer(); + while (bb2.hasRemaining()) { + if (channel.read(bb2) == -1) { + throw new EOFException("error reading second buffer"); + } + } + } + ParquetOptions opts = ParquetOptions.DEFAULT; + try (ParquetChunkedReader reader = new ParquetChunkedReader(240000, 0, opts, buf1, buf2)) { + int numChunks = 0; + long totalRows = 0; + while(reader.hasNext()) { + ++numChunks; + try(Table chunk = reader.readChunk()) { + totalRows += chunk.getRowCount(); + } + } + assertEquals(2, numChunks); + assertEquals(40000, totalRows); + } + } + } + @Test void testChunkedReadParquetFromDataSource() throws IOException { try (MultiBufferDataSource source = sourceFrom(TEST_PARQUET_FILE_CHUNKED_READ); From caf97ef24ae814054e6b35c46e0633c7ce7b12ed Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Tue, 7 Jan 2025 09:45:58 -0800 Subject: [PATCH 16/19] Add XXHash_32 hasher (#17533) Contributes to #17531 This PR introduces the xxhash_32 hasher to libcudf as a preparatory step for evaluating the impact of replacing murmurhash3_x86_32 with xxhash_32 as the default hash. 
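As a rough illustration (editorial sketch, not part of the patch; the input
table is assumed to already exist), the new entry point is called like the
other table hashers in cudf::hashing:

    #include <cudf/hashing.hpp>
    #include <cudf/table/table_view.hpp>

    // Produce one uint32 hash per row of `input` using the default seed.
    std::unique_ptr<cudf::column> row_hashes(cudf::table_view const& input)
    {
      return cudf::hashing::xxhash_32(input, cudf::DEFAULT_HASH_SEED);
    }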
Authors: - Yunsong Wang (https://github.com/PointKernel) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - David Wendt (https://github.com/davidwendt) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17533 --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/hashing.hpp | 22 ++- cpp/include/cudf/hashing/detail/hashing.hpp | 7 +- cpp/include/cudf/hashing/detail/xxhash_32.cuh | 118 +++++++++++++++ cpp/src/hash/xxhash_32.cu | 136 ++++++++++++++++++ cpp/src/io/orc/dict_enc.cu | 3 +- cpp/src/io/parquet/chunk_dict.cu | 3 +- cpp/src/join/join_common_utils.cuh | 3 +- cpp/tests/CMakeLists.txt | 3 +- cpp/tests/hashing/xxhash_32_test.cpp | 67 +++++++++ python/cudf/cudf/core/indexed_frame.py | 18 ++- python/cudf/cudf/tests/test_dataframe.py | 35 ++++- python/pylibcudf/pylibcudf/hashing.pxd | 6 +- python/pylibcudf/pylibcudf/hashing.pyi | 1 + python/pylibcudf/pylibcudf/hashing.pyx | 35 ++++- python/pylibcudf/pylibcudf/libcudf/hash.pxd | 7 +- .../pylibcudf/pylibcudf/tests/test_hashing.py | 43 +++--- 17 files changed, 473 insertions(+), 35 deletions(-) create mode 100644 cpp/include/cudf/hashing/detail/xxhash_32.cuh create mode 100644 cpp/src/hash/xxhash_32.cu create mode 100644 cpp/tests/hashing/xxhash_32_test.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index af92b7ceaf5..9dabe4e8800 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -461,6 +461,7 @@ add_library( src/hash/sha256_hash.cu src/hash/sha384_hash.cu src/hash/sha512_hash.cu + src/hash/xxhash_32.cu src/hash/xxhash_64.cu src/interop/dlpack.cpp src/interop/arrow_utilities.cpp diff --git a/cpp/include/cudf/hashing.hpp b/cpp/include/cudf/hashing.hpp index 307a52cd242..88034b4f804 100644 --- a/cpp/include/cudf/hashing.hpp +++ b/cpp/include/cudf/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -166,6 +166,26 @@ std::unique_ptr sha512( rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); +/** + * @brief Computes the XXHash_32 hash value of each row in the given table + * + * This function computes the hash of each column using the `seed` for the first column + * and the resulting hash as a seed for the next column and so on. + * The result is a uint32 value for each row. + * + * @param input The table of columns to hash + * @param seed Optional seed value to use for the hash function + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned column's device memory + * + * @returns A column where each row is the hash of a row from the input + */ +std::unique_ptr xxhash_32( + table_view const& input, + uint32_t seed = DEFAULT_HASH_SEED, + rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); + /** * @brief Computes the XXHash_64 hash value of each row in the given table * diff --git a/cpp/include/cudf/hashing/detail/hashing.hpp b/cpp/include/cudf/hashing/detail/hashing.hpp index 7cb80081a95..f796ff4526e 100644 --- a/cpp/include/cudf/hashing/detail/hashing.hpp +++ b/cpp/include/cudf/hashing/detail/hashing.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -61,6 +61,11 @@ std::unique_ptr<column> sha512(table_view const& input,
                                rmm::cuda_stream_view stream,
                                rmm::device_async_resource_ref mr);
 
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint64_t seed,
+                                  rmm::cuda_stream_view,
+                                  rmm::device_async_resource_ref mr);
+
 std::unique_ptr<column> xxhash_64(table_view const& input,
                                   uint64_t seed,
                                   rmm::cuda_stream_view,
                                   rmm::device_async_resource_ref mr);
diff --git a/cpp/include/cudf/hashing/detail/xxhash_32.cuh b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
new file mode 100644
index 00000000000..bb6e7f18fbc
--- /dev/null
+++ b/cpp/include/cudf/hashing/detail/xxhash_32.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/fixed_point/fixed_point.hpp>
+#include <cudf/hashing.hpp>
+#include <cudf/hashing/detail/hash_functions.cuh>
+#include <cudf/lists/list_view.hpp>
+#include <cudf/strings/string_view.cuh>
+#include <cudf/structs/struct_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cuco/hash_functions.cuh>
+#include <cuda/std/cstddef>
+
+namespace cudf::hashing::detail {
+
+template <typename Key>
+struct XXHash_32 {
+  using result_type = std::uint32_t;
+
+  CUDF_HOST_DEVICE constexpr XXHash_32(uint32_t seed = cudf::DEFAULT_HASH_SEED) : _impl{seed} {}
+
+  __device__ constexpr result_type operator()(Key const& key) const { return this->_impl(key); }
+
+  __device__ constexpr result_type compute_bytes(cuda::std::byte const* bytes,
+                                                 std::uint64_t size) const
+  {
+    return this->_impl.compute_hash(bytes, size);
+  }
+
+ private:
+  template <typename T>
+  __device__ constexpr result_type compute(T const& key) const
+  {
+    return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(T));
+  }
+
+  cuco::xxhash_32<Key> _impl;
+};
+
+template <>
+XXHash_32<bool>::result_type __device__ inline XXHash_32<bool>::operator()(bool const& key) const
+{
+  return this->compute(static_cast<uint8_t>(key));
+}
+
+template <>
+XXHash_32<float>::result_type __device__ inline XXHash_32<float>::operator()(float const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<double>::result_type __device__ inline XXHash_32<double>::operator()(
+  double const& key) const
+{
+  return this->compute(normalize_nans_and_zeros(key));
+}
+
+template <>
+XXHash_32<cudf::string_view>::result_type
+  __device__ inline XXHash_32<cudf::string_view>::operator()(cudf::string_view const& key) const
+{
+  return this->compute_bytes(reinterpret_cast<cuda::std::byte const*>(key.data()),
+                             key.size_bytes());
+}
+
+template <>
+XXHash_32<numeric::decimal32>::result_type
+  __device__ inline XXHash_32<numeric::decimal32>::operator()(numeric::decimal32 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal64>::result_type
+  __device__ inline XXHash_32<numeric::decimal64>::operator()(numeric::decimal64 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<numeric::decimal128>::result_type
+  __device__ inline XXHash_32<numeric::decimal128>::operator()(numeric::decimal128 const& key) const
+{
+  return this->compute(key.value());
+}
+
+template <>
+XXHash_32<cudf::list_view>::result_type __device__ inline XXHash_32<cudf::list_view>::operator()(
+  cudf::list_view const& key) const
+{
+  CUDF_UNREACHABLE("List column hashing is not supported");
+}
+
+template <>
+XXHash_32<cudf::struct_view>::result_type
+  __device__ inline XXHash_32<cudf::struct_view>::operator()(cudf::struct_view const& key) const
+{
+  CUDF_UNREACHABLE("Direct hashing of struct_view is not supported");
+}
+
+}  // namespace cudf::hashing::detail
diff --git a/cpp/src/hash/xxhash_32.cu b/cpp/src/hash/xxhash_32.cu
new file mode 100644
index 00000000000..40503f7f911
--- /dev/null
+++ b/cpp/src/hash/xxhash_32.cu
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/algorithm.cuh>
+#include <cudf/hashing/detail/hashing.hpp>
+#include <cudf/hashing/detail/xxhash_32.cuh>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/utilities/memory_resource.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/exec_policy.hpp>
+
+#include <cuda/std/limits>
+#include <thrust/tabulate.h>
+
+namespace cudf {
+namespace hashing {
+namespace detail {
+
+namespace {
+
+/**
+ * @brief Computes the hash value of a row in the given table.
+ *
+ * @tparam Nullate A cudf::nullate type describing whether to check for nulls.
+ */
+template <typename Nullate>
+class device_row_hasher {
+ public:
+  device_row_hasher(Nullate nulls, table_device_view const& t, hash_value_type seed)
+    : _check_nulls(nulls), _table(t), _seed(seed)
+  {
+  }
+
+  __device__ auto operator()(size_type row_index) const noexcept
+  {
+    return cudf::detail::accumulate(
+      _table.begin(),
+      _table.end(),
+      _seed,
+      [row_index, nulls = _check_nulls] __device__(auto hash, auto column) {
+        return cudf::type_dispatcher(
+          column.type(), element_hasher_adapter{}, column, row_index, nulls, hash);
+      });
+  }
+
+  /**
+   * @brief Computes the hash value of an element in the given column.
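+   *
+   * Editorial note (not in the original patch): a null element short-circuits
+   * to cuda::std::numeric_limits<hash_value_type>::max() instead of feeding
+   * the running seed through XXHash_32, as the overloads below show.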
+   */
+  class element_hasher_adapter {
+   public:
+    template <typename T, CUDF_ENABLE_IF(column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const& col,
+                                          size_type const row_index,
+                                          Nullate const _check_nulls,
+                                          hash_value_type const _seed) const noexcept
+    {
+      if (_check_nulls && col.is_null(row_index)) {
+        return cuda::std::numeric_limits<hash_value_type>::max();
+      }
+      auto const hasher = XXHash_32<T>{_seed};
+      return hasher(col.element<T>(row_index));
+    }
+
+    template <typename T, CUDF_ENABLE_IF(not column_device_view::has_element_accessor<T>())>
+    __device__ hash_value_type operator()(column_device_view const&,
+                                          size_type const,
+                                          Nullate const,
+                                          hash_value_type const) const noexcept
+    {
+      CUDF_UNREACHABLE("Unsupported type for XXHash_32");
+    }
+  };
+
+  Nullate const _check_nulls;
+  table_device_view const _table;
+  hash_value_type const _seed;
+};
+
+}  // namespace
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  auto output = make_numeric_column(data_type(type_to_id<hash_value_type>()),
+                                    input.num_rows(),
+                                    mask_state::UNALLOCATED,
+                                    stream,
+                                    mr);
+
+  // Return early if there's nothing to hash
+  if (input.num_columns() == 0 || input.num_rows() == 0) { return output; }
+
+  bool const nullable   = has_nulls(input);
+  auto const input_view = table_device_view::create(input, stream);
+  auto output_view      = output->mutable_view();
+
+  // Compute the hash value for each row
+  thrust::tabulate(rmm::exec_policy(stream),
+                   output_view.begin<hash_value_type>(),
+                   output_view.end<hash_value_type>(),
+                   device_row_hasher(nullable, *input_view, seed));
+
+  return output;
+}
+
+}  // namespace detail
+
+std::unique_ptr<column> xxhash_32(table_view const& input,
+                                  uint32_t seed,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::xxhash_32(input, seed, stream, mr);
+}
+
+}  // namespace hashing
+}  // namespace cudf
diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu
index 7facc6497ed..469f933f918 100644
--- a/cpp/src/io/orc/dict_enc.cu
+++ b/cpp/src/io/orc/dict_enc.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include
 #include
+#include
 #include
 #include
 
diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
index b5f9b894c46..0d40a1f7b1b 100644
--- a/cpp/src/io/parquet/chunk_dict.cu
+++ b/cpp/src/io/parquet/chunk_dict.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include
 #include
+#include
 #include
 #include
 
diff --git a/cpp/src/join/join_common_utils.cuh b/cpp/src/join/join_common_utils.cuh
index 4f75908fe72..37c5698f654 100644
--- a/cpp/src/join/join_common_utils.cuh
+++ b/cpp/src/join/join_common_utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -20,6 +20,7 @@ #include #include #include +#include #include #include diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index e5c29314203..344979e1288 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -192,6 +192,7 @@ ConfigureTest( hashing/sha256_test.cpp hashing/sha384_test.cpp hashing/sha512_test.cpp + hashing/xxhash_32_test.cpp hashing/xxhash_64_test.cpp ) diff --git a/cpp/tests/hashing/xxhash_32_test.cpp b/cpp/tests/hashing/xxhash_32_test.cpp new file mode 100644 index 00000000000..9e3c66b0d0b --- /dev/null +++ b/cpp/tests/hashing/xxhash_32_test.cpp @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include + +class XXHash_32_Test : public cudf::test::BaseFixture {}; + +TEST_F(XXHash_32_Test, TestInteger) +{ + auto col1 = cudf::test::fixed_width_column_wrapper{{0, 42, 825}}; + auto constexpr seed = 0u; + auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({148298089u, 1161967057u, 1066694813u}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} + +TEST_F(XXHash_32_Test, TestDouble) +{ + auto col1 = cudf::test::fixed_width_column_wrapper{{-8., 25., 90.}}; + auto constexpr seed = 42u; + + auto const output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({2276435783u, 3120212431u, 3454197470u}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} + +TEST_F(XXHash_32_Test, StringType) +{ + auto col1 = cudf::test::strings_column_wrapper({"I", "am", "AI"}); + auto constexpr seed = 825u; + + auto output = cudf::hashing::xxhash_32(cudf::table_view({col1}), seed); + + // Expected results were generated with the reference implementation: + // https://github.com/Cyan4973/xxHash/blob/dev/xxhash.h + auto expected = + cudf::test::fixed_width_column_wrapper({320624298u, 1612654309u, 1409499009u}); + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(output->view(), expected); +} diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 6854cb02aa5..e9ed74f804b 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA 
CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. """Base class for Frame types that have an index.""" from __future__ import annotations @@ -2836,16 +2836,22 @@ def hash_values( Parameters ---------- - method : {'murmur3', 'md5', 'xxhash64'}, default 'murmur3' + method : {'murmur3', 'xxhash32', 'xxhash64', 'md5', 'sha1', 'sha224', 'sha256', 'sha384', 'sha512'}, default 'murmur3' Hash function to use: * murmur3: MurmurHash3 hash function - * md5: MD5 hash function + * xxhash32: xxHash32 hash function * xxhash64: xxHash64 hash function + * md5: MD5 hash function + * sha1: SHA-1 hash function + * sha224: SHA-224 hash function + * sha256: SHA-256 hash function + * sha384: SHA-384 hash function + * sha512: SHA-512 hash function seed : int, optional Seed value to use for the hash function. This parameter is only - supported for 'murmur3' and 'xxhash64'. + supported for 'murmur3', 'xxhash32', and 'xxhash64'. Returns @@ -2900,7 +2906,7 @@ def hash_values( 2 fe061786ea286a515b772d91b0dfcd70 dtype: object """ - seed_hash_methods = {"murmur3", "xxhash64"} + seed_hash_methods = {"murmur3", "xxhash32", "xxhash64"} if seed is None: seed = 0 elif method not in seed_hash_methods: @@ -2914,6 +2920,8 @@ def hash_values( ) if method == "murmur3": plc_column = plc.hashing.murmurhash3_x86_32(plc_table, seed) + elif method == "xxhash32": + plc_column = plc.hashing.xxhash_32(plc_table, seed) elif method == "xxhash64": plc_column = plc.hashing.xxhash_64(plc_table, seed) elif method == "md5": diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 11a9b398b50..f3cf8e36a5b 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import array as arr import contextlib @@ -1440,6 +1440,7 @@ def test_assign_callable(mapping): "sha256", "sha384", "sha512", + "xxhash32", "xxhash64", ], ) @@ -1447,6 +1448,7 @@ def test_assign_callable(mapping): def test_dataframe_hash_values(nrows, method, seed): warning_expected = seed is not None and method not in { "murmur3", + "xxhash32", "xxhash64", } potential_warning = ( @@ -1472,6 +1474,7 @@ def test_dataframe_hash_values(nrows, method, seed): "sha256": object, "sha384": object, "sha512": object, + "xxhash32": np.uint32, "xxhash64": np.uint64, } assert out.dtype == expected_dtypes[method] @@ -1486,7 +1489,7 @@ def test_dataframe_hash_values(nrows, method, seed): assert_eq(gdf["a"].hash_values(method=method, seed=seed), out_one) -@pytest.mark.parametrize("method", ["murmur3", "xxhash64"]) +@pytest.mark.parametrize("method", ["murmur3", "xxhash32", "xxhash64"]) def test_dataframe_hash_values_seed(method): gdf = cudf.DataFrame() data = np.arange(10) @@ -1500,6 +1503,34 @@ def test_dataframe_hash_values_seed(method): assert_neq(out_one, out_two) +def test_dataframe_hash_values_xxhash32(): + # xxhash32 has no built-in implementation in Python and we don't want to + # add a testing dependency, so we use regression tests against known good + # values. 
+ gdf = cudf.DataFrame({"a": [0.0, 1.0, 2.0, np.inf, np.nan]}) + gdf["b"] = -gdf["a"] + out_a = gdf["a"].hash_values(method="xxhash32", seed=0) + expected_a = cudf.Series( + [3736311059, 2307980487, 2906647130, 746578903, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_a, expected_a) + + out_b = gdf["b"].hash_values(method="xxhash32", seed=42) + expected_b = cudf.Series( + [1076387279, 2261349915, 531498073, 650869264, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_b, expected_b) + + out_df = gdf.hash_values(method="xxhash32", seed=0) + expected_df = cudf.Series( + [1223721700, 2885793241, 1920811472, 1146715602, 4294967295], + dtype=np.uint32, + ) + assert_eq(out_df, expected_df) + + def test_dataframe_hash_values_xxhash64(): # xxhash64 has no built-in implementation in Python and we don't want to # add a testing dependency, so we use regression tests against known good diff --git a/python/pylibcudf/pylibcudf/hashing.pxd b/python/pylibcudf/pylibcudf/hashing.pxd index 2d070ddda69..fbd478f963f 100644 --- a/python/pylibcudf/pylibcudf/hashing.pxd +++ b/python/pylibcudf/pylibcudf/hashing.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t @@ -16,6 +16,10 @@ cpdef Table murmurhash3_x64_128( uint64_t seed=* ) +cpdef Column xxhash_32( + Table input, + uint32_t seed=* +) cpdef Column xxhash_64( Table input, diff --git a/python/pylibcudf/pylibcudf/hashing.pyi b/python/pylibcudf/pylibcudf/hashing.pyi index a849f5d0729..d535d842a18 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyi +++ b/python/pylibcudf/pylibcudf/hashing.pyi @@ -9,6 +9,7 @@ LIBCUDF_DEFAULT_HASH_SEED: Final[int] def murmurhash3_x86_32(input: Table, seed: int = ...) -> Column: ... def murmurhash3_x64_128(input: Table, seed: int = ...) -> Table: ... +def xxhash_32(input: Table, seed: int = ...) -> Column: ... def xxhash_64(input: Table, seed: int = ...) -> Column: ... def md5(input: Table) -> Column: ... def sha1(input: Table) -> Column: ... diff --git a/python/pylibcudf/pylibcudf/hashing.pyx b/python/pylibcudf/pylibcudf/hashing.pyx index 548cffc0ce8..1f093b20c6b 100644 --- a/python/pylibcudf/pylibcudf/hashing.pyx +++ b/python/pylibcudf/pylibcudf/hashing.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.utility cimport move @@ -13,6 +13,7 @@ from pylibcudf.libcudf.hash cimport ( sha256 as cpp_sha256, sha384 as cpp_sha384, sha512 as cpp_sha512, + xxhash_32 as cpp_xxhash_32, xxhash_64 as cpp_xxhash_64, ) from pylibcudf.libcudf.table.table cimport table @@ -30,6 +31,7 @@ __all__ = [ "sha256", "sha384", "sha512", + "xxhash_32", "xxhash_64", ] @@ -95,6 +97,37 @@ cpdef Table murmurhash3_x64_128( return Table.from_libcudf(move(c_result)) +cpdef Column xxhash_32( + Table input, + uint32_t seed=DEFAULT_HASH_SEED +): + """Computes the xxHash 32-bit hash value of each row in the given table. + + For details, see :cpp:func:`xxhash_32`. 
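+
+    An illustrative call (editorial sketch; ``plc`` is the imported
+    ``pylibcudf`` module and ``tbl`` an existing ``Table``)::
+
+        hashes = plc.hashing.xxhash_32(tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED)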
+ + Parameters + ---------- + input : Table + The table of columns to hash + seed : uint32_t + Optional seed value to use for the hash function + + Returns + ------- + pylibcudf.Column + A column where each row is the hash of a row from the input + """ + + cdef unique_ptr[column] c_result + with nogil: + c_result = cpp_xxhash_32( + input.view(), + seed + ) + + return Column.from_libcudf(move(c_result)) + + cpdef Column xxhash_64( Table input, uint64_t seed=DEFAULT_HASH_SEED diff --git a/python/pylibcudf/pylibcudf/libcudf/hash.pxd b/python/pylibcudf/pylibcudf/libcudf/hash.pxd index 4e8a01b41a5..46fdf62cd6b 100644 --- a/python/pylibcudf/pylibcudf/libcudf/hash.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/hash.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from libc.stdint cimport uint32_t, uint64_t from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -44,6 +44,11 @@ cdef extern from "cudf/hashing.hpp" namespace "cudf::hashing" nogil: const table_view& input ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_32( + const table_view& input, + const uint32_t seed + ) except +libcudf_exception_handler + cdef unique_ptr[column] xxhash_64( const table_view& input, const uint64_t seed diff --git a/python/pylibcudf/pylibcudf/tests/test_hashing.py b/python/pylibcudf/pylibcudf/tests/test_hashing.py index 83fb50fa4ef..7096dbe14ff 100644 --- a/python/pylibcudf/pylibcudf/tests/test_hashing.py +++ b/python/pylibcudf/pylibcudf/tests/test_hashing.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import hashlib import struct @@ -34,7 +34,9 @@ def hash_single_uint32(val, seed=0): def hash_combine_32(lhs, rhs): - return np.uint32(lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2))) + return np.uint32( + int((lhs ^ (rhs + 0x9E3779B9 + (lhs << 6) + (lhs >> 2)))) % 2**32 + ) def uint_hash_combine_32(lhs, rhs): @@ -80,22 +82,6 @@ def list_struct_table(): return data -def python_hash_value(x, method): - if method == "murmurhash3_x86_32": - return libcudf_mmh3_x86_32(x) - elif method == "murmurhash3_x64_128": - hasher = mmh3.mmh3_x64_128(seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED) - hasher.update(x) - # libcudf returns a tuple of two 64-bit integers - return hasher.utupledigest() - elif method == "xxhash_64": - return xxhash.xxh64( - x, seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED - ).intdigest() - else: - return getattr(hashlib, method)(x).hexdigest() - - @pytest.mark.parametrize( "method", ["sha1", "sha224", "sha256", "sha384", "sha512", "md5"] ) @@ -115,6 +101,23 @@ def py_hasher(val): assert_column_eq(got, expect) +def test_hash_column_xxhash32(pa_scalar_input_column, plc_scalar_input_tbl): + def py_hasher(val): + return xxhash.xxh32( + scalar_to_binary(val), seed=plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ).intdigest() + + expect = pa.array( + [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], + type=pa.uint32(), + ) + got = plc.hashing.xxhash_32( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) + + assert_column_eq(got, expect) + + def test_hash_column_xxhash64(pa_scalar_input_column, plc_scalar_input_tbl): def py_hasher(val): return xxhash.xxh64( @@ -125,7 +128,9 @@ def py_hasher(val): [py_hasher(val) for val in pa_scalar_input_column.to_pylist()], type=pa.uint64(), ) - got = plc.hashing.xxhash_64(plc_scalar_input_tbl, 0) + got = plc.hashing.xxhash_64( + plc_scalar_input_tbl, plc.hashing.LIBCUDF_DEFAULT_HASH_SEED + ) 
assert_column_eq(got, expect) From 4e97cd44ef4838a20a641aee3eb1a0e59ec21491 Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Tue, 7 Jan 2025 14:29:11 -0500 Subject: [PATCH 17/19] Fix the ORC decoding bug for the timestamp data (#17570) This PR introduces a band-aid class `run_cache_manager` to handle an exceptional case in TIMESTAMP data type, where the DATA stream (seconds) is processed ahead of SECONDARY stream (nanoseconds) and the excess rows are lost. The fix uses `run_cache_manager` (and also `cache_helper`, which is an implementation detail) to cache the potentially missed data from the DATA stream and let them be used in the next decoding iteration, thus preventing data loss. Closes #17155 Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Matthew Murray (https://github.com/Matt711) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17570 --- cpp/src/io/orc/stripe_data.cu | 205 +++++++++++++++++- ...rcFile.timestamp.desynced.snappy.RLEv2.orc | Bin 0 -> 5832 bytes ....timestamp.desynced.uncompressed.RLEv2.orc | Bin 0 -> 5814 bytes python/cudf/cudf/tests/test_orc.py | 24 +- 4 files changed, 225 insertions(+), 4 deletions(-) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index 1572b7246c0..1f84d1f81dc 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -132,6 +132,177 @@ struct orcdec_state_s { } vals; }; +/** + * @brief Manage caching of the first run of TIMESTAMP's DATA stream for a row group. + * + * This class is used to address a special case, where the first run of the DATA stream spans two + * adjacent row groups and its length is greater than the maximum length allowed to be consumed. + * This limit is imposed by the decoder when processing the SECONDARY stream. This class shall be + * instantiated in the shared memory, and be used to cache the DATA stream with a decoded data type + * of `int64_t`. As an optimization, the actual cache is implemented in the cache_helper class as a + * local variable and does not reside in the shared memory. + */ +class run_cache_manager { + private: + enum class status : uint8_t { + DISABLED, ///< Run cache manager is disabled. No caching will be performed. If the special case + ///< happens, the run cache manager will be set to this status after the cache read + ///< is completed. This status also applies when the special case does not happen. + CAN_WRITE_TO_CACHE, ///< Run cache manager is ready for write. If the special case happens, the + ///< run cache manager will be set to this status. + CAN_READ_FROM_CACHE, ///< Run cache manager is ready for read. If the special case happens, the + ///< run cache manager will be set to this status after the cache write is + ///< completed. + }; + + public: + /** + * @brief Initialize the run cache manager. + * + * @param[in] s ORC decoder state. + */ + __device__ void initialize(orcdec_state_s* s) + { + _status = (s->top.data.index.run_pos[CI_DATA2] > 0 and s->chunk.type_kind == TIMESTAMP) + ? 
status::CAN_WRITE_TO_CACHE + : status::DISABLED; + _reusable_length = 0; + _run_length = 0; + } + + private: + status _status; ///< The status of the run cache manager. + uint32_t + _reusable_length; ///< The number of data to be cached and reused later. For example, if a run + ///< has a length of 512 but the maximum length allowed to be consumed is + ///< capped at 162, then 350 (512-162) data will be cached. + uint32_t _run_length; ///< The length of the run, 512 in the above example. + friend class cache_helper; +}; + +/** + * @brief Helper class to help run_cache_manager cache the first run of TIMESTAMP's DATA stream for + * a row group. + * + * The run_cache_manager is intended to be stored in the shared memory, whereas the actual cache is + * in the local storage (as an optimization). If a function is to use run_cache_manager, both the + * manager and the cache objects need to be passed. This class is introduced to simplify the + * function call, so that only a single cache_helper object needs to be passed. To that end, public + * methods originally belonging to run_cache_manager have been moved to this class. + */ +class cache_helper { + public: + /** + * @brief Constructor. + * + * @param[in] run_cache_manager_inst An instance of run_cache_manager. + */ + __device__ explicit cache_helper(run_cache_manager& run_cache_manager_inst) + : _manager(run_cache_manager_inst) + { + } + + /** + * @brief Set the reusable length object. + * + * @param[in] run_length The length of the first run (spanning two adjacent row groups) of the + * DATA stream. + * @param[in] max_length The maximum length allowed to be consumed. This limit is imposed + * by the decoder when processing the SECONDARY stream. + */ + __device__ void set_reusable_length(uint32_t run_length, uint32_t max_length) + { + if (_manager._status == run_cache_manager::status::CAN_WRITE_TO_CACHE) { + _manager._run_length = run_length; + _manager._reusable_length = + (_manager._run_length > max_length) ? (_manager._run_length - max_length) : 0; + } + } + + /** + * @brief Adjust the maximum length allowed to be consumed when the length of the first run is + * greater than it. + * + * @param[in] max_length The maximum length allowed to be consumed for the DATA stream. + * @return A new maximum length. + */ + [[nodiscard]] __device__ uint32_t adjust_max_length(uint32_t max_length) + { + auto new_max_length{max_length}; + if (_manager._status == run_cache_manager::status::CAN_READ_FROM_CACHE) { + new_max_length -= _manager._reusable_length; + } + return new_max_length; + } + + /** + * @brief Copy the excess data from the intermediate buffer for the DATA stream to the cache. + * + * @param[in] src Intermediate buffer for the DATA stream. + */ + __device__ void write_to_cache(int64_t* src) + { + if (_manager._status != run_cache_manager::status::CAN_WRITE_TO_CACHE) { return; } + + auto const tid = threadIdx.x; + + __syncthreads(); + + // All threads in the block always take a uniform code path for the following branches. + // _reusable_length ranges between [0, 512]. 
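+    // Editorial example (numbers from the commit message): with _run_length =
+    // 512 and a consumable maximum of 162, _reusable_length = 350, so threads
+    // 0..349 stash src[162..511] into their per-thread _storage for the next
+    // decode iteration.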
+ if (_manager._reusable_length > 0) { + auto const length_to_skip = _manager._run_length - _manager._reusable_length; + if (tid < _manager._reusable_length) { + auto const src_idx = tid + length_to_skip; + _storage = src[src_idx]; + } + if (tid == 0) { _manager._status = run_cache_manager::status::CAN_READ_FROM_CACHE; } + } else { + if (tid == 0) { _manager._status = run_cache_manager::status::DISABLED; } + } + + __syncthreads(); + } + + /** + * @brief Copy the cached data to the intermediate buffer for the DATA stream. + * + * @param[in,out] dst Intermediate buffer for the DATA stream. + * @param[in,out] rle Run length decoder state object. + */ + __device__ void read_from_cache(int64_t* dst, orc_rlev2_state_s* rle) + { + if (_manager._status != run_cache_manager::status::CAN_READ_FROM_CACHE) { return; } + + auto const tid = threadIdx.x; + + // First, shift the data up + auto const dst_idx = tid + _manager._reusable_length; + auto const v = (dst_idx < rle->num_vals + _manager._reusable_length) ? dst[tid] : 0; + __syncthreads(); + + if (dst_idx < rle->num_vals + _manager._reusable_length) { dst[dst_idx] = v; } + __syncthreads(); + + // Second, insert the cached data + if (tid < _manager._reusable_length) { dst[tid] = _storage; } + __syncthreads(); + + if (tid == 0) { + // Disable the run cache manager, since cache write-and-read happens at most once per row + // group. + _manager._status = run_cache_manager::status::DISABLED; + rle->num_vals += _manager._reusable_length; + } + + __syncthreads(); + } + + private: + run_cache_manager& _manager; ///< An instance of run_cache_manager. + int64_t _storage; ///< Per-thread cache storage. +}; + /** * @brief Initializes byte stream, modifying length and start position to keep the read pointer * 8-byte aligned. @@ -631,6 +802,8 @@ static const __device__ __constant__ uint8_t ClosestFixedBitsMap[65] = { * @param[in] maxvals maximum number of values to decode * @param[in] t thread id * @param[in] has_buffered_values If true, means there are already buffered values + * @param[in] cache_helper_inst If non-null, the run cache manager will be used to manage + * caching of the first run of the DATA stream. * * @return number of values decoded */ @@ -640,9 +813,11 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, T* vals, uint32_t maxvals, int t, - bool has_buffered_values = false) + bool has_buffered_values = false, + cache_helper* cache_helper_inst = nullptr) { if (t == 0) { + if (cache_helper_inst != nullptr) { maxvals = cache_helper_inst->adjust_max_length(maxvals); } uint32_t maxpos = min(bs->len, bs->pos + (bytestream_buffer_size - 8u)); uint32_t lastpos = bs->pos; auto numvals = 0; @@ -685,6 +860,9 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, l += deltapos; } } + + if (cache_helper_inst != nullptr) { cache_helper_inst->set_reusable_length(n, maxvals); } + if ((numvals != 0) and (numvals + n > maxvals)) break; // case where there are buffered values and can't consume a whole chunk // from decoded values, so skip adding any more to buffer, work on buffered values and then @@ -866,6 +1044,17 @@ static __device__ uint32_t Integer_RLEv2(orc_bytestream_s* bs, __syncwarp(); } __syncthreads(); + // Currently run_cache_manager is only designed to fix the TIMESTAMP's DATA stream bug where the + // data type is int64_t. + if constexpr (cuda::std::is_same_v) { + if (cache_helper_inst != nullptr) { + // Run cache is read from during the 2nd iteration of the top-level while loop in + // gpuDecodeOrcColumnData(). 
+ cache_helper_inst->read_from_cache(vals, rle); + // Run cache is written to during the 1st iteration of the loop. + cache_helper_inst->write_to_cache(vals); + } + } return rle->num_vals; } @@ -1401,6 +1590,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) // Struct doesn't have any data in itself, so skip bool const is_valid = s->chunk.type_kind != STRUCT; size_t const max_num_rows = s->chunk.column_num_rows; + __shared__ run_cache_manager run_cache_manager_inst; + cache_helper cache_helper_inst(run_cache_manager_inst); if (t == 0 and is_valid) { // If we have an index, seek to the initial run and update row positions if (num_rowgroups > 0) { @@ -1443,6 +1634,8 @@ CUDF_KERNEL void __launch_bounds__(block_size) bytestream_init(&s->bs, s->chunk.streams[CI_DATA], s->chunk.strm_len[CI_DATA]); bytestream_init(&s->bs2, s->chunk.streams[CI_DATA2], s->chunk.strm_len[CI_DATA2]); + + run_cache_manager_inst.initialize(s); } __syncthreads(); @@ -1602,7 +1795,13 @@ CUDF_KERNEL void __launch_bounds__(block_size) if (is_rlev1(s->chunk.encoding_kind)) { numvals = Integer_RLEv1(bs, &s->u.rlev1, s->vals.i64, numvals, t); } else { - numvals = Integer_RLEv2(bs, &s->u.rlev2, s->vals.i64, numvals, t); + numvals = Integer_RLEv2(bs, + &s->u.rlev2, + s->vals.i64, + numvals, + t, + false /**has_buffered_values */, + &cache_helper_inst); } if (s->chunk.type_kind == DECIMAL) { // If we're using an index, we may have to drop values from the initial run diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.orc new file mode 100644 index 0000000000000000000000000000000000000000..a0ea4fbbfc2fdc0fc9f7b1adbdf4ec6fef161de8 GIT binary patch literal 5832 zcmZ{oYjhjsoyPx>CCiKg1aV0iC^))nboEHG<&m!=$(BcwB~NI#6@h?&(hj(QFm@O& z2?7d?+omhnLz!?DblXnsBq#|9lN*Rz8CXIT3d!VJ5f*pA4JZM^gphTY5Bq7q{D04R z-{<+C_ni0JlcA$l07P%pwXB?<{XraU$->hB*#(FhX$IZASBw|WT~fwHD^H#LGvCo# zS+h~#JN*0hw9W;bNDC~w9^O3X$tL5ET8_3*7cDAoo!8pedg6`kYxoYwtNUC1FF)zX z!W961_p4`C&3^J~lV*;e=vrDx*N%fnNHZuyZVF5D{G*cmSuHDS&Hn3qc*Jk*~Eccj|UbEWg3xE0mN?Nd}g@{?GxFx3#GJ_U2Vc~`>JY^B|8^x4GN?YV% zi;}UZEg@vYiqckWiIup*O0{gt{lv=ro0a{kmHU~MKPx0$X%&}RrK_y+)mG)KP3ko^ zlMU_4EjIO5J93*HE!Z)` zW@5}v-C@s-+nEVFJHLaw)6TPY;V!#)w>3O}!(t&*XFuLnu?0yIFfP;F_ zk^8lSIeQEHu!DQV!E+8_(jlG`mVWDyf9FtEIn>`fk#k-~OHS-jC-In*`VVJr-rLOM zPWBH@?g=N)JB4#!7N2xVPdVkMoys##bpe4qhoNN*dmbZRz^Dac?msc+PZ;}WjQb15 zpGOF5FmVc#rZIUfrkp3L>s`o;F0|spHn<35gxciFz2ssxyVxx*&bXBqUBXtExXmSP zcgg2Rl$|d16*uy#8?Cyr^S2SNyQw$axi{U+TW(g9;@)=il3RGkExzlPF4!i&=T>&P z)%QKfZV!54H@3$^eBhz}=E;5NVJ_Ux?)7l{JiP1?_It#Ox}}32`H)9B>`{++k&AYq zbuad@mpJOBKJn%j_AsA%*%>eQ4==BHg@rrB&%M&DSN^A0Ip$Rt^&(&T(1s5??juet z4lml7``R1+#>bxYaerFHYjz0}tHkeo(rNRS?|sU}>$mh@{`+&+pa0dt|JJbg9lq6m z?D4JIFYdehl}vWwA?>#-pEz^Yv9njKIdhEx4&Cv}xOTl~*V1#ZnbhvZULBvme)RsS z%cI>bd#UR91?#6Dobp{eIsg4V(eX>y&ptHOn>Q@j%e_7p+U_R$V`7*@wM7 z)>#{Ubn2NK432%2G=4>8dS2JM8z$ZRn0LmPR%Rcc+8N#M-^aZ>e*Ff|6H~i>J{f%f zczXQi4Wp~44&P`9?~~pe>)tc{)YP#XC%g73yT2H+^pO zbWFx}kN;-l==0N;|I(1y-9(Q+wsHE8)4pF$retQ%SpNsJf12)P3|X1`VElzmo&}pS z%;Xia@VD`)O`~h3m&Lk&DoY=ZZ`?FJJ$=hflS|)U_|f>bO|$E!SKMs4W5{=rMro2J*r?zwqCvw!@+OS7A&cmB$7>u&SX z@uQnPFHi6K)#TWI;o#WNN26P(58q-~zF#^teq!^qanrF|Chy*_93KB}^X$&)GqLR} z4}^uAP{_W%RYRm)~Z1_yAQOOYfb2W3BJD$=@7cKAyOA%j~=z zy#>SX4{%2(EH8WBUYm)Xe(Zqo$%Omm(RbD^yWQ}I-O-;<1Ye&1ueG<_KKbMU<4yHURpIa`NBc$9}_8YbkEvn;@y8fNGTInh|_;tyY7z3sohI% znpi5%ezbPyxMBT4?(?x9@AvFmyK8)M<3V9|;^wWR7ru0O!m#C_^v{X0t&{(1`T8O3%dwvw82xzN<#!tXdWdRFJhpB6 
zqRqZLC*M899G`e{+w7<7dgI&QKg6AwcwxKeAL}ygVr~g8o;fNO>T`}faF{{1j)ysE&(d65zJNC+Vt;;T5_^$3m-|4<( z*DpNObh2-UzxNOPbuICw_dcP1I{Vbp6&HVZWb@%e|9#u{pPqhv=}Hu4bkAP)UO_W+ z_Om_rEjs-Oryh)BWuC515AepB^m9EAq8H5k__?d+Uv|;VRnOh~Yjn}f4bMIOYaTuF+TV9PgkCyx$8(1s z;xC)I=eh4e(>n8D`Lc(3)68$nJ&&N4BX8BOeuTHpJXOAzLme}JEI&Q}X>4Y?yko_c z->g`DYUhz*DO;)UFtCg^lR?13SbJo0-u@TyN#9B`a?&TLo*yDq5>n zsp%c7Y^_@rYs0GAG&aO$w4qIVY?uwV5jN6B*=SqNmbWoY`)#bPXya@p8*eMy1Y5-> zHhpZ9Y&Dx~tJ@S?!=~CbcBJVuJ8C!CF*|N2?4+Ht)An4`aXVu#*jang&e=l;xT$qkI931a1Y@jJ(PCRlk?;~jHlpX zJw*@aDS3G9qaMLi@ra(PNAlD>vZwA*w5vU;SK~#zMlb3$c`+~UCA80bDKG8KdGlVz zTkx{pqL3cY76Y!>jr6yp3oh z+o(3WEvG%+#=5-!3ebUEARk}?g#a5U1~{E7zz50!Ay5g3foebs)B>_D5Ksb*fEv^Uk)Sb% z22DXs7YPzUGDrpKU@n*sGQmQS)kT9`uoUEj<)9F(1jS%ADCv?xIam)W!A4LGX+lWI z7((GUAuNQ4h!7c~LUbq>%7+-;l_54%3~`}Sh!2%RLZ}iFb=QTYP%R{f>LDf62&wIw zc0_kmJKAn)$J+6B;+8YePC`2c?KHIKKzBQ|Gtgdub{5);(9S`73EFvRFGITky1SrV zg!U@5OVD0}b{X31(5^sx1KL&4-47iIbQqxng$@&RV92LT-fE%tN>gVFAJw2#XM|LRf-u4Z)FGlkqyZ5Xnwy{tfi5F-q0nW5E)2SG=pvwtgf0r24bYW?t~_)x&{cpg7P^Yi z#X(mIx_D?lAG!qSsz8?rT~+9kpsNO5GIZ6UOM&J~pi6~r4Rj;WZG>+0pRb>UZVbBd zW2P<8Y=&+Mx@qXnL3bXynPaDyLpKZE#V@W5L$eFI`7gH2`32~%K(`3pRp^$WyY}Uy zbJ+lND_?#&=U1Ue)5wIO#|S+r^q3m&&SfLeLmYRWgdPfd=;IG?Ux%w;RkBYyRxIllzWN$8QGrw%;|^fbOYxdnPP(2IQiljYEhLi3WZ zH-@1Xhh74DN$91ZmxkWlH@`axy$m#8`OW8Z{vz~pC#g9<54~mR6;8f2r$uPK?ptdZ zdTY=te_K2Wy$bX;pjU-H4fG+l%Nm!O{qn1X%*`YX^cLVp$d zCFrj~zYP6#=%4%F)zGg3sR0rJ(g-99qzOn2NE}E4nx6-f0+I$Y2V@>d2FL=CERaPY zIcQ!3BoAa6NCC(SkRp&(ASEDcK+4d(0Z0YN29PR5H4sH0YJ?~XQ4>TlXx<7@0-_{D zDTvY#%|SE|Q3j#~h_cZ98bmpWmLSSQv6^Mxd z8zCk^tOhX|Vs(fq5Nkk8g}4Ud2sD2NaTMYvh+`1PAx=P?gg6Cp8sa%bLwGJ}OdcCa|e>0N{TV0lm&tPF~S)j?^nHYmfkL1nNp zs3tTCBw7QYH$caEhQvilS&Lr@x(Ir~<`OMT(KOmX@(DLz$B38_j- zOjT1-s+N-V8&XQDky6u|G?F%^(X=Uz>9?kdG?}K-bUK&Lr2!n?!RFg{ESlf%?7J)9fP4>S7x!|ZTzm>Vt) z^TXv~VYo6Z>OUTqhHJy}aD7-AZVam#O$O0_mO(S743@z&M25^z89I~GAI~tELWa#0 zGhC*W;WOoopg)xnGu4cgsb%C$J)>kA8MUP;i)4*iG;7LYSv*T*$t=}k$mX*7ER!u{ z*=#Y(WlLGU<@~IWtz^Y)H7jLnSvgzJDlL~})e+4IGGZJ-M@%Ew2tGo<2P4!7J(3&A zk1!*J5q6|F!nL?Y_>uC6Fj5&2N2(*zNNq%J35+NsjS-dB&lW zX*uf$@xy-N|KH!Ya(-f7UWa@A-ayOE3p9&XuGE?|FyCSKHy2MdHD&1N!v6ue ClTzIP literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc new file mode 100644 index 0000000000000000000000000000000000000000..8a7969cdbbb380dc92c2f5fc72642a990fb204f0 GIT binary patch literal 5814 zcmZ{oYj_*gnaAJAl4V8#f(RrGlsLL;boEHG<&m!=$(BcwWlw0g6@h?&whXv{Fm@O& z2_h62w@p{D4`sqt&}}=hlb|FdOl}};-*cYle0wu=6ww=XO)KXP!N0`urdhEV(ZcL7UN*Z=85gcRbNY{bTXT8sCV_AB zAK2GCYq2AZ3vYbuf~OjcKWIAML|wYDuX}bo zYtYQ{lYeo|>cii!Kl<>0`QAVI>f0Z^^-n%=?!1}VtC2a)sHPcfXePAH6l};gHZ%HW zwyBvrubH3Y7Yxnf+-B+gX8D3juXf<-Mv%tFO2S$&Y{x3CEdH(=o@i=f{mrYut0A`e=Wj74n;~RR_5QV?2oP7PptfTA>nGPxYR0LW0kM9D(7uhud^Z7+t8d1yTL}x>7#D4Wq)d8 zer98TZsU-bdB!H(ViRw*Nh3Dd(5l>KQ*XB;ci7Rq9W!hpM(xyH_UxFQ8Mm`@+qk>! 
zJZl&3v5WUw!*e%oz2B~^u&XN_$fu8>dmh0abP%f?)I*N!FCEPJTiHh(+@lVja|ja- z@q)1QYlr+BhqBtC{?>_H@CsUVVvjkA$DP!FIJ0x!VV-cZzjJa=I(gnHT=;>o^`782;_MTEn(OT81W)T%@ec#i7|h`*gs<2pD_L+LRgE5lbAGx$?GuXB2nGo zLSAyAWf!*5MHnO0W>@xQ7qi91Zgp|SZM^6bwzX~X8_dyu^z^pZ|&pNIIsL;b~* z{m{c)vV-04;SP9s*&`hEh?jOshduHUk8;$b9`hoX?nG-|>|-x++)I7p%`WI-KJ~KG zUhZ#RUhxVGc8Z^Sr5UgMcdv57t1j$DzVxAWA9m75oLU@SxGVd$H~fu{J?-QEu$tHG z5yn@G-}lqLJlUNy%-hesF&f(Id2}+9o0z{}cyrAC(#ZLnmfc`jxL`ey80ZJOXsysEP zWBpAN?gPy3v8CmiCnk4AclZx*?~dKL(evcwo}W$x-#?iiyLIEpn#rR-GlUOF?~Qiu zn|gZk#Lp%=4k&xZ?%g3TEyJceO`wKoA+rD{b{nUzE4c8q+_K(It^t?2+`qqgX4`K(#-hFxGqRr2a z7=C_`lE*%Hd1~|2`q+K99%K%V9eR0Y%hax47;fKdUOIMsi|3W8J-?V3Jt!O=9r$Qu z+tkt949gEnN5)QVnKEuZaofbb2bH5^-)@=NHFYkwW91>_*tmA9=e2bg-afHvZ{Ygz zxm!nGU$^KE!y|{N+Gu+J)SK&kcTD{15cBc)6JK5lu%^Uk_V?Cj%* zgiprZuZ-+ox9m>C@AgK2Iv#vw>c7_AcIU)Xhm=o8hY!r`S+`=;@Z4c!dORk2=D)mp zbmGOs*x$xe;>f;r&&E6dc$iYguM(&JvTprd6O((F+%mpYocU0?umDgFek^K+CKB?`tJCS z_m6O=#$Vjw`P=#oJMqEZJMS8w+%fXm`epYR_8*bH9=+!9)Xe(Z?wL4rMEPcX`;M6f zTUOj_ICc~{J^sc{&zI|0$Il)=iv45!-JK&R*FPKkXnOC1?Dz*e*M7Br{qm1KKgygL zy?+1NZ`SX+-|&Te*SF)xcX>cdeK&FHqr1+If4*zvj3B+$dHSgI-RR7Usk48g&T+&` zkF6N>teDZ>_u8UeUo`l(=}x@*ZS%4#7QCxF)pNFI*^LX1G@S0)>F@p>e?wDz>HSZt zpUynJbj4-g9@}#C$baAQ-KS@tSh^C0Y29;IzL(cbpZ{Ff0}Ido`I6_ZeSmK~w(Z#c z1+;0}`rI=G9+~z$x6^QB?sVw6BM@I@o2GwV z>UtEl9DBQV?W4SH`svdB9O{_8EQ}>@VJ!s<*Ra;YTS^weQnrYeibb+iEpo#~i(;u;RIA2{SdCWHYO-Pt z+pL6@v{F{unziPvj5Tj%;YKTGEn0bN$tqaOR?%9qN)5ZMvbAPataYnu)7TK3(S|nc zvtc&eM%YLjWut9bTh7Kb9JH~vf{n8kZM>~y6KrLh*zmDUvQ=%et!7hfb(?C}*pY_M z?5N#j$LzSBu#@TWyQ*z) zAP%Dgb(kEO19uP((m`nrj;tf+U>tb|>nJ!lN72EQO@x^gZ?d$o&o6P5RuBuCR)m)0J z?o!cbD9P_8zzBuDB(4)h)Yg zZpB@9tJ()Wh{xzbJthz4!99eB^ibLfPu7$3FrK`J^%Ojur|99ek9h=7*&})?9?4Vn z$ex-<(XR2RUX2&=8oj93& z@>abv-0M}mb+78v_z<7bhx$xDOuNlT_(&h+qkUOl&d2!jK34m>kMk9MyszXFd}W{L ztN0}CZlCO{`4nH>r}{O1#BcPY+I@b^kNXKf>8Jd(KkLu=8SOzo>o52@f6>qTOMby$ z_KVt&{gS`xm;E)r;;;MF7EKGH{j3FTF|}YVcni@&wool}OICZbg=xvRuq}lauBF(* zx0G50?U@#_rP3m`R9obhT8q+BZ&6_tj^IWd#Z5Sd<2ZqnIHfb-Sv-d`cphi*0?y$@ zoY!593wRk9@d_^CRb0kvxT3opR|A>=5-!3ebUUAQxZ)`2ZUz z1UQ{5zz0eJAy5v8fl5FMR0Fau5Ksd3fEv^Uk)Sb%22DXs7YPzUGDrpKU^bWwGQoV1 z)kT9`uo&cnrJxWj2gP6|DCv?xIamuS!Fo^)X+lWI7((G!AuNQ4h!7c~LUbq_%7qx+ z)gd-i2yvleh!2%QLZ}=PbvJ~hP&FinY9S?552>x1Rz!D8E81#m#ai)J;e; zJqT?Gv>Bldg*FqkVbF#{8v$)3v{9g&fVM2O<)Dp$wmh`4&{lvp4%&*)#)IxLXcM5V z3~eH`RiI6RwkovA&{l&s1$1kmO@*)q!U%+o5Jn+vf-nYQ9Kr;^;}E7GOhY&e;T(h+ z2fF%tN>YVFAKr2#XM|Kv;rs6~Z#;HbPi|a2>)bv}>Rpfp#ObqtI@G zb_{gepq+qr651(fr=dLy?Kx;?pgj-mEa+Z`b`IK$(9T193EBl{FGITs?GrWKtzH_6(TZ3Y7kK%Qiq5NjSbL&K!*`J zQ0Op02L>HDbP&)%LI(wn2I$B_M-Dm|=*U9{3mpaM;Gm-j9XvE%3>^Y=l%Yd}jtX>0 z&{2gB89Hjvp+Mv1(4j)7209VwG(so(_cu;MCkCDP3DZ_+G(#r^oiuc2p)&`a%!#wh zp_7Hq!WUPEq0t4M{1;ni{Q`8Bp;Ls;3Uo@)S^e^{*=zthl`p@X^{ddOsb|8_WrQvi zx=i(VXR{ILB2GF_Ll*^I^vQ>}LRSvD80gAF7YlF+baAJ0%b|;huF|PjXR~GK62JPv ztY3o0By`EpRf8@Cy6Rt@-U{6s=tjQ&(Q@cUp>fIAo5IkILpK54By>~IO+$C~o8O#< zZU!2!{^s*pe*wC=)6}e=hwc(|3#Z?n)gmyc~M7 z(3?AVe;9i6(91$^0eU&;EkZ93FbTZ^^p>Glgx(7DO3+({UKx68&^!CTYoJ#JQUfFc zq!CCINE474kT{S8G`;{N1tbk*7RVfs43K#sSs)8Qa?rRINFK-%kOGipAVnZ6KuSPX zfs~TxF&tt9#7Kxy z5ThZMg;)+^3^X2u7z?oi#5jl*A;v?j1Tg_(Wr&FYn;<4ZtO_w1Vl{{<5UWE>g}4Ud z2sC~MaTMYvh+`1PAx=P?gg6Cp8sb@KJPC0I;(3U(5HCQSgLo0*Jj6>77ohPB#6^f# zATB|?3UL|YHHa$^uR~k~eFO9%&}W1`6#7iihe00>eZ;agePkb{H}qxua(zr+zK`uI z^l^Q~K3;!upU_wC6Z0SMNf2m*SFZYZ6m42ze+AqWQex<+OuO>7JBw_BdS(O*5l4ipBsf#Lu^P#O>h$^)YQh5>1yIv@|!29$yNfJ$j7M1Ko~QYH$c zaEhQvilS&LtG|JmvXJDG#U!tPEGZ<*NikVTO37+cPS%o&eoazMX;Mham_k#g z6qdqMg#LvTm7-JGR4&D&@+me|NOAhLDLz$738`{QOjS}+s+yAZ8&gWEo>J4AG?F%^ z(X=Uz>9?hcG?}K-bUK^PrI~a-&FWuIbLnE5PnXg{x||l%m9(VaotD$Jw34o;)j`c5 
zGH4t`;oU)O5FaE4$w6w69?TBr1{wXqL3Xe($PE?;`N7hlFjyWG^&byPgVjNKur{a+ z)(6#$CWGid%b*!k2Fu_XB12}V44uj9PiB}*KEq}T87@=I@R?FZ(4WbOnMy{=R5Nm> zmQgbGjM~&NgbW#n&>_JFwv%`g9uE{mb50{38 z;qtIJTp5;ztHW|rU|1Qh5396>Mrb3A(k2>fiqHg2(iBb8Svp5Ebe?9LqBKVrX`U|8 z0$rv>x8~;dB1U^b7yhw!tbzV+;M!q z>9VG|#yNA)xg&Qk8@+ktraMQkAGy659N-62Q`33hEC2EN#P`LC|G#+8%DIU-IUVlx ddjm~3&pv+TO07u)a~&3cW8qXoLxzqn_#X}@SVRB- literal 0 HcmV?d00001 diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c4b4ef60184..fe143e66407 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import datetime import decimal @@ -1970,3 +1970,25 @@ def test_row_group_alignment(datadir): got = cudf.read_orc(buffer) assert_eq(expected, got) + + +@pytest.mark.parametrize( + "inputfile", + [ + "TestOrcFile.timestamp.desynced.uncompressed.RLEv2.orc", + "TestOrcFile.timestamp.desynced.snappy.RLEv2.orc", + ], +) +def test_orc_reader_desynced_timestamp(datadir, inputfile): + # Test a special case where the DATA stream (second) in a TIMESTAMP column + # is progressed faster than the SECONDARY stream (nanosecond) at the start of a row + # group. In this case, the "run cache manager" in the decoder kernel is used to + # orchestrate the dual-stream processing. + # For more information, see https://github.com/rapidsai/cudf/issues/17155. + + path = datadir / inputfile + + expect = pd.read_orc(path) + got = cudf.read_orc(path) + + assert_frame_equal(cudf.from_pandas(expect), got) From 30c6caa7a5dc5bb18dcba04c87a03a5343f78fd2 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 7 Jan 2025 14:15:02 -0600 Subject: [PATCH 18/19] Remove "legacy" Dask DataFrame support from Dask cuDF (#17558) The legacy Dask DataFrame API is deprecated. We should remove it for 25.02 to reduce maintenance burden. 
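With the legacy code paths gone, importing `dask_cudf` while the
`dataframe.query-planning` config is disabled now raises a `ValueError`
instead of falling back to the legacy collections. A minimal sketch of the
(only) supported usage after this change, with hypothetical example data and
assuming a GPU environment with cudf installed:

```python
import cudf
import dask_cudf

# Build a dask_cudf collection from an in-memory cudf DataFrame.
gdf = cudf.DataFrame({"key": [0, 0, 1, 1], "x": [1.0, 2.0, 3.0, 4.0]})
ddf = dask_cudf.from_cudf(gdf, npartitions=2)

# All operations run through the expression-based (query-planning)
# code path; there is no legacy fallback to opt into.
print(ddf.groupby("key")["x"].mean().compute())
```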
**Blockers**: - [x] https://github.com/rapidsai/dask-cuda/pull/1417 Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - James Lamb (https://github.com/jameslamb) - GALI PREM SAGAR (https://github.com/galipremsagar) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/17558 --- ci/test_python_other.sh | 13 +- ci/test_wheel_dask_cudf.sh | 16 +- python/dask_cudf/dask_cudf/__init__.py | 56 +- .../dask_cudf/dask_cudf/_expr/collection.py | 36 +- python/dask_cudf/dask_cudf/_expr/expr.py | 8 +- python/dask_cudf/dask_cudf/_expr/groupby.py | 266 ++++- python/dask_cudf/dask_cudf/_legacy/core.py | 711 -------------- python/dask_cudf/dask_cudf/_legacy/groupby.py | 909 ------------------ .../dask_cudf/_legacy/io/__init__.py | 12 +- python/dask_cudf/dask_cudf/_legacy/io/csv.py | 222 ----- python/dask_cudf/dask_cudf/_legacy/io/json.py | 209 ---- python/dask_cudf/dask_cudf/_legacy/io/orc.py | 195 ---- .../dask_cudf/dask_cudf/_legacy/io/parquet.py | 64 +- python/dask_cudf/dask_cudf/_legacy/io/text.py | 56 -- python/dask_cudf/dask_cudf/_legacy/sorting.py | 361 ------- python/dask_cudf/dask_cudf/backends.py | 149 +-- python/dask_cudf/dask_cudf/core.py | 61 +- python/dask_cudf/dask_cudf/io/__init__.py | 15 +- python/dask_cudf/dask_cudf/io/csv.py | 12 +- python/dask_cudf/dask_cudf/io/json.py | 213 +++- python/dask_cudf/dask_cudf/io/orc.py | 208 +++- python/dask_cudf/dask_cudf/io/parquet.py | 16 +- .../dask_cudf/dask_cudf/io/tests/test_json.py | 10 +- .../dask_cudf/dask_cudf/io/tests/test_orc.py | 10 +- .../dask_cudf/io/tests/test_parquet.py | 39 +- .../dask_cudf/dask_cudf/io/tests/test_s3.py | 5 +- .../dask_cudf/dask_cudf/io/tests/test_text.py | 10 +- python/dask_cudf/dask_cudf/io/text.py | 60 +- .../dask_cudf/tests/test_accessor.py | 4 +- python/dask_cudf/dask_cudf/tests/test_core.py | 104 +- .../dask_cudf/dask_cudf/tests/test_groupby.py | 69 +- .../dask_cudf/dask_cudf/tests/test_onehot.py | 8 +- python/dask_cudf/dask_cudf/tests/test_sort.py | 4 +- python/dask_cudf/dask_cudf/tests/utils.py | 35 +- python/dask_cudf/pyproject.toml | 9 +- 35 files changed, 864 insertions(+), 3311 deletions(-) delete mode 100644 python/dask_cudf/dask_cudf/_legacy/core.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/groupby.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/io/csv.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/io/json.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/io/orc.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/io/text.py delete mode 100644 python/dask_cudf/dask_cudf/_legacy/sorting.py diff --git a/ci/test_python_other.sh b/ci/test_python_other.sh index db86721755d..3c6dba72164 100755 --- a/ci/test_python_other.sh +++ b/ci/test_python_other.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
# Support invoking test_python_cudf.sh outside the script directory cd "$(dirname "$(realpath "${BASH_SOURCE[0]}")")"/../ @@ -24,8 +24,8 @@ EXITCODE=0 trap "EXITCODE=1" ERR set +e -rapids-logger "pytest dask_cudf (dask-expr)" -DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ +rapids-logger "pytest dask_cudf" +./ci/run_dask_cudf_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ @@ -34,13 +34,6 @@ DASK_DATAFRAME__QUERY_PLANNING=True ./ci/run_dask_cudf_pytests.sh \ --cov-report=xml:"${RAPIDS_COVERAGE_DIR}/dask-cudf-coverage.xml" \ --cov-report=term -rapids-logger "pytest dask_cudf (legacy)" -DASK_DATAFRAME__QUERY_PLANNING=False ./ci/run_dask_cudf_pytests.sh \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . - rapids-logger "pytest cudf_kafka" ./ci/run_cudf_kafka_pytests.sh \ --junitxml="${RAPIDS_TESTS_DIR}/junit-cudf-kafka.xml" diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index e15949f4bdb..44f430ce98d 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -1,5 +1,5 @@ #!/bin/bash -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. set -eou pipefail @@ -30,21 +30,11 @@ RAPIDS_TESTS_DIR=${RAPIDS_TESTS_DIR:-"${RESULTS_DIR}/test-results"}/ mkdir -p "${RAPIDS_TESTS_DIR}" # Run tests in dask_cudf/tests and dask_cudf/io/tests -rapids-logger "pytest dask_cudf (dask-expr)" +rapids-logger "pytest dask_cudf" pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=True python -m pytest \ +python -m pytest \ --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf.xml" \ --numprocesses=8 \ --dist=worksteal \ . popd - -# Run tests in dask_cudf/tests and dask_cudf/io/tests (legacy) -rapids-logger "pytest dask_cudf (legacy)" -pushd python/dask_cudf/dask_cudf -DASK_DATAFRAME__QUERY_PLANNING=False python -m pytest \ - --junitxml="${RAPIDS_TESTS_DIR}/junit-dask-cudf-legacy.xml" \ - --numprocesses=8 \ - --dist=worksteal \ - . -popd diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index 20eb2404b77..863102103ed 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -1,7 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import warnings -from importlib import import_module +# Copyright (c) 2018-2025, NVIDIA CORPORATION. import dask.dataframe as dd from dask import config @@ -9,11 +6,16 @@ import cudf -from . import backends # noqa: F401 +from . import backends, io # noqa: F401 +from ._expr.expr import _patch_dask_expr from ._version import __git_commit__, __version__ # noqa: F401 -from .core import DataFrame, Index, Series, concat, from_cudf +from .core import DataFrame, Index, Series, _deprecated_api, concat, from_cudf -QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED +if not (QUERY_PLANNING_ON := dd._dask_expr_enabled()): + raise ValueError( + "The legacy DataFrame API is not supported in dask_cudf>24.12. " + "Please enable query-planning, or downgrade to dask_cudf<=24.12" + ) def read_csv(*args, **kwargs): @@ -36,46 +38,18 @@ def read_parquet(*args, **kwargs): return dd.read_parquet(*args, **kwargs) -def _deprecated_api(old_api, new_api=None, rec=None): - def inner_func(*args, **kwargs): - if new_api: - # Use alternative - msg = f"{old_api} is now deprecated. " - msg += rec or f"Please use {new_api} instead." 
- warnings.warn(msg, FutureWarning) - new_attr = new_api.split(".") - module = import_module(".".join(new_attr[:-1])) - return getattr(module, new_attr[-1])(*args, **kwargs) - - # No alternative - raise an error - raise NotImplementedError( - f"{old_api} is no longer supported. " + (rec or "") - ) - - return inner_func - - -if QUERY_PLANNING_ON: - from . import io - from ._expr.expr import _patch_dask_expr - - groupby_agg = _deprecated_api("dask_cudf.groupby_agg") - read_text = DataFrame.read_text - _patch_dask_expr() - -else: - from . import io # noqa: F401 - from ._legacy.groupby import groupby_agg # noqa: F401 - from ._legacy.io import read_text # noqa: F401 - - +groupby_agg = _deprecated_api("dask_cudf.groupby_agg") +read_text = DataFrame.read_text to_orc = _deprecated_api( "dask_cudf.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.to_orc", rec="Please use DataFrame.to_orc instead.", ) +_patch_dask_expr() + + __all__ = [ "DataFrame", "Index", diff --git a/python/dask_cudf/dask_cudf/_expr/collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py index 5192e6b8171..e8c9a970b7b 100644 --- a/python/dask_cudf/dask_cudf/_expr/collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import warnings from functools import cached_property @@ -15,19 +15,11 @@ from dask import config from dask.dataframe.core import is_dataframe_like +from dask.dataframe.dispatch import get_parallel_type from dask.typing import no_default import cudf -_LEGACY_WORKAROUND = ( - "To enable the 'legacy' dask-cudf API, set the " - "global 'dataframe.query-planning' config to " - "`False` before dask is imported. This can also " - "be done by setting an environment variable: " - "`DASK_DATAFRAME__QUERY_PLANNING=False` " -) - - ## ## Custom collection classes ## @@ -103,9 +95,8 @@ def set_index( divisions = None warnings.warn( "Ignoring divisions='quantile'. This option is now " - "deprecated. Please use the legacy API and raise an " - "issue on github if this feature is necessary." - f"\n{_LEGACY_WORKAROUND}", + "deprecated. Please raise an issue on github if this " + "feature is necessary.", FutureWarning, ) @@ -135,9 +126,7 @@ def groupby( if kwargs.pop("as_index") is not True: raise NotImplementedError( - f"{msg} Please reset the index after aggregating, or " - "use the legacy API if `as_index=False` is required.\n" - f"{_LEGACY_WORKAROUND}" + f"{msg} Please reset the index after aggregating." ) else: warnings.warn(msg, FutureWarning) @@ -153,15 +142,15 @@ def groupby( ) def to_orc(self, *args, **kwargs): - from dask_cudf._legacy.io import to_orc + from dask_cudf.io.orc import to_orc as to_orc_impl - return to_orc(self, *args, **kwargs) + return to_orc_impl(self, *args, **kwargs) @staticmethod def read_text(*args, **kwargs): - from dask_cudf._legacy.io.text import read_text as legacy_read_text + from dask_cudf.io.text import read_text as read_text_impl - return legacy_read_text(*args, **kwargs) + return read_text_impl(*args, **kwargs) def clip(self, lower=None, upper=None, axis=1): if axis not in (None, 1): @@ -197,6 +186,13 @@ class Index(DXIndex, CudfFrameBase): pass # Same as pandas (for now) +# dask.dataframe dispatch +get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) +get_parallel_type.register(cudf.Series, lambda _: Series) +get_parallel_type.register(cudf.BaseIndex, lambda _: Index) + + +# dask_expr dispatch (might go away?) 
get_collection_type.register(cudf.DataFrame, lambda _: DataFrame) get_collection_type.register(cudf.Series, lambda _: Series) get_collection_type.register(cudf.BaseIndex, lambda _: Index) diff --git a/python/dask_cudf/dask_cudf/_expr/expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py index 8b91e53604c..03d1da0d258 100644 --- a/python/dask_cudf/dask_cudf/_expr/expr.py +++ b/python/dask_cudf/dask_cudf/_expr/expr.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import functools import dask_expr._shuffle as _shuffle_module @@ -7,13 +7,13 @@ from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var -from dask.dataframe.core import ( - is_dataframe_like, +from dask.dataframe.dispatch import ( + is_categorical_dtype, make_meta, meta_nonempty, ) -from dask.dataframe.dispatch import is_categorical_dtype from dask.typing import no_default +from dask.utils import is_dataframe_like import cudf diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py index 0242fac6e72..a5cdd43169b 100644 --- a/python/dask_cudf/dask_cudf/_expr/groupby.py +++ b/python/dask_cudf/dask_cudf/_expr/groupby.py @@ -1,6 +1,7 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2025, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd from dask_expr._collection import new_collection from dask_expr._groupby import ( @@ -16,11 +17,262 @@ from dask.dataframe.groupby import Aggregation from cudf.core.groupby.groupby import _deprecate_collect +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking ## ## Fused groupby aggregations ## +OPTIMIZED_AGGS = ( + "count", + "mean", + "std", + "var", + "sum", + "min", + "max", + list, + "first", + "last", +) + + +def _make_name(col_name, sep="_"): + """Combine elements of `col_name` into a single string, or no-op if + `col_name` is already a string + """ + if isinstance(col_name, str): + return col_name + return sep.join(name for name in col_name if name != "") + + +@_dask_cudf_performance_tracking +def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): + """Initial partition-level aggregation task. + + This is the first operation to be executed on each input + partition in `groupby_agg`. Depending on `aggs`, four possible + groupby aggregations ("count", "sum", "min", and "max") are + performed. The result is then partitioned (by hashing `gb_cols`) + into a number of distinct dictionary elements. The number of + elements in the output dictionary (`split_out`) corresponds to + the number of partitions in the final output of `groupby_agg`. 
+    """
+
+    # Modify dict for initial (partition-wise) aggregations
+    _agg_dict = {}
+    for col, agg_list in aggs.items():
+        _agg_dict[col] = set()
+        for agg in agg_list:
+            if agg in ("mean", "std", "var"):
+                _agg_dict[col].add("count")
+                _agg_dict[col].add("sum")
+            else:
+                _agg_dict[col].add(agg)
+        _agg_dict[col] = list(_agg_dict[col])
+        if set(agg_list).intersection({"std", "var"}):
+            pow2_name = _make_name((col, "pow2"), sep=sep)
+            df[pow2_name] = df[col].astype("float64").pow(2)
+            _agg_dict[pow2_name] = ["sum"]
+
+    gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(
+        _agg_dict
+    )
+    output_columns = [_make_name(name, sep=sep) for name in gb.columns]
+    gb.columns = output_columns
+    # Return with deterministic column ordering
+    return gb[sorted(output_columns)]
+
+
+@_dask_cudf_performance_tracking
+def _tree_node_agg(df, gb_cols, dropna, sort, sep):
+    """Node in groupby-aggregation reduction tree.
+
+    The input DataFrame (`df`) corresponds to the
+    concatenated output of one or more `_groupby_partition_agg`
+    tasks. In this function, "sum", "min" and/or "max" groupby
+    aggregations will be used to combine the statistics for
+    duplicate keys.
+    """
+
+    agg_dict = {}
+    for col in df.columns:
+        if col in gb_cols:
+            continue
+        agg = col.split(sep)[-1]
+        if agg in ("count", "sum"):
+            agg_dict[col] = ["sum"]
+        elif agg == "list":
+            agg_dict[col] = [list]
+        elif agg in OPTIMIZED_AGGS:
+            agg_dict[col] = [agg]
+        else:
+            raise ValueError(f"Unexpected aggregation: {agg}")
+
+    gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg(
+        agg_dict
+    )
+
+    # Don't include the last aggregation in the column names
+    output_columns = [
+        _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep)
+        for name in gb.columns
+    ]
+    gb.columns = output_columns
+    # Return with deterministic column ordering
+    return gb[sorted(output_columns)]
+
+
+@_dask_cudf_performance_tracking
+def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1):
+    """Calculate variance (given count, sum, and sum-squared columns)."""
+
+    # Select count, sum, and sum-squared
+    n = df[count_name]
+    x = df[sum_name]
+    x2 = df[pow2_sum_name]
+
+    # Use sum-squared approach to get variance
+    var = x2 - x**2 / n
+    div = n - ddof
+    div[div < 1] = 1  # Avoid division by 0
+    var /= div
+
+    # Set appropriate NaN elements
+    # (since we avoided 0-division)
+    var[(n - ddof) == 0] = np.nan
+
+    return var
+
+
+@_dask_cudf_performance_tracking
+def _finalize_gb_agg(
+    gb_in,
+    gb_cols,
+    aggs,
+    columns,
+    final_columns,
+    as_index,
+    dropna,
+    sort,
+    sep,
+    str_cols_out,
+    aggs_renames,
+):
+    """Final aggregation task.
+
+    This is the final operation on each output partition
+    of the `groupby_agg` algorithm. This function must
+    take care of higher-order aggregations, like "mean",
+    "std" and "var". We also need to deal with the column
+    index, the row index, and final sorting behavior.
+ """ + + gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) + + # Deal with higher-order aggregations + for col in columns: + agg_list = aggs.get(col, []) + agg_set = set(agg_list) + if agg_set.intersection({"mean", "std", "var"}): + count_name = _make_name((col, "count"), sep=sep) + sum_name = _make_name((col, "sum"), sep=sep) + if agg_set.intersection({"std", "var"}): + pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) + var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) + if "var" in agg_list: + name_var = _make_name((col, "var"), sep=sep) + gb[name_var] = var + if "std" in agg_list: + name_std = _make_name((col, "std"), sep=sep) + gb[name_std] = np.sqrt(var) + gb.drop(columns=[pow2_sum_name], inplace=True) + if "mean" in agg_list: + mean_name = _make_name((col, "mean"), sep=sep) + gb[mean_name] = gb[sum_name] / gb[count_name] + if "sum" not in agg_list: + gb.drop(columns=[sum_name], inplace=True) + if "count" not in agg_list: + gb.drop(columns=[count_name], inplace=True) + if list in agg_list: + collect_name = _make_name((col, "list"), sep=sep) + gb[collect_name] = gb[collect_name].list.concat() + + # Ensure sorted keys if `sort=True` + if sort: + gb = gb.sort_values(gb_cols) + + # Set index if necessary + if as_index: + gb.set_index(gb_cols, inplace=True) + + # Unflatten column names + col_array = [] + agg_array = [] + for col in gb.columns: + if col in gb_cols: + col_array.append(col) + agg_array.append("") + else: + name, agg = col.split(sep) + col_array.append(name) + agg_array.append(aggs_renames.get((name, agg), agg)) + if str_cols_out: + gb.columns = col_array + else: + gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + + return gb[final_columns] + + +@_dask_cudf_performance_tracking +def _redirect_aggs(arg): + """Redirect aggregations to their corresponding name in cuDF""" + redirects = { + sum: "sum", + max: "max", + min: "min", + "collect": list, + "list": list, + } + if isinstance(arg, dict): + new_arg = dict() + for col in arg: + if isinstance(arg[col], list): + new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] + elif isinstance(arg[col], dict): + new_arg[col] = { + k: redirects.get(v, v) for k, v in arg[col].items() + } + else: + new_arg[col] = redirects.get(arg[col], arg[col]) + return new_arg + if isinstance(arg, list): + return [redirects.get(agg, agg) for agg in arg] + return redirects.get(arg, arg) + + +@_dask_cudf_performance_tracking +def _aggs_optimized(arg, supported: set): + """Check that aggregations in `arg` are a subset of `supported`""" + if isinstance(arg, (list, dict)): + if isinstance(arg, dict): + _global_set: set[str] = set() + for col in arg: + if isinstance(arg[col], list): + _global_set = _global_set.union(set(arg[col])) + elif isinstance(arg[col], dict): + _global_set = _global_set.union(set(arg[col].values())) + else: + _global_set.add(arg[col]) + else: + _global_set = set(arg) + + return bool(_global_set.issubset(supported)) + elif isinstance(arg, (str, type)): + return arg in supported + return False + def _get_spec_info(gb): if isinstance(gb.arg, (dict, list)): @@ -105,20 +357,14 @@ def shuffle_by_index(self): @classmethod def chunk(cls, df, *by, **kwargs): - from dask_cudf._legacy.groupby import _groupby_partition_agg - return _groupby_partition_agg(df, **kwargs) @classmethod def combine(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby import _tree_node_agg - return _tree_node_agg(_concat(inputs), **kwargs) @classmethod def aggregate(cls, inputs, **kwargs): - from dask_cudf._legacy.groupby 
import _finalize_gb_agg - return _finalize_gb_agg(_concat(inputs), **kwargs) @property @@ -193,12 +439,6 @@ def _maybe_get_custom_expr( shuffle_method=None, **kwargs, ): - from dask_cudf._legacy.groupby import ( - OPTIMIZED_AGGS, - _aggs_optimized, - _redirect_aggs, - ) - if kwargs: # Unsupported key-word arguments return None diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py deleted file mode 100644 index d6beb775a5e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/core.py +++ /dev/null @@ -1,711 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -import math -import warnings - -import numpy as np -import pandas as pd -from tlz import partition_all - -from dask import dataframe as dd -from dask.base import normalize_token, tokenize -from dask.dataframe.core import ( - Scalar, - handle_out, - make_meta as dask_make_meta, - map_partitions, -) -from dask.dataframe.utils import raise_on_meta_error -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname - -import cudf -from cudf import _lib as libcudf -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._expr.accessors import ListMethods, StructMethods -from dask_cudf._legacy import sorting -from dask_cudf._legacy.sorting import ( - _deprecate_shuffle_kwarg, - _get_shuffle_method, -) - - -class _Frame(dd.core._Frame, OperatorMethodMixin): - """Superclass for DataFrame and Series - - Parameters - ---------- - dsk : dict - The dask graph to compute this DataFrame - name : str - The key prefix that specifies which keys in the dask comprise this - particular DataFrame / Series - meta : cudf.DataFrame, cudf.Series, or cudf.Index - An empty cudf object with names, dtypes, and indices matching the - expected output. - divisions : tuple of index values - Values along which we partition our blocks on the index - """ - - def _is_partition_type(self, meta): - return isinstance(meta, self._partition_type) - - def __repr__(self): - s = "" - return s % (type(self).__name__, len(self.dask), self.npartitions) - - -normalize_token.register(_Frame, lambda a: a._name) - - -class DataFrame(_Frame, dd.core.DataFrame): - """ - A distributed Dask DataFrame where the backing dataframe is a - :class:`cuDF DataFrame `. - - Typically you would not construct this object directly, but rather - use one of Dask-cuDF's IO routines. - - Most operations on :doc:`Dask DataFrames ` are - supported, with many of the same caveats. 
- - """ - - _partition_type = cudf.DataFrame - - @_dask_cudf_performance_tracking - def _assign_column(self, k, v): - def assigner(df, k, v): - out = df.copy() - out[k] = v - return out - - meta = assigner(self._meta, k, dask_make_meta(v)) - return self.map_partitions(assigner, k, v, meta=meta) - - @_dask_cudf_performance_tracking - def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): - import uuid - - if kwargs is None: - kwargs = {} - - if cache_key is None: - cache_key = uuid.uuid4() - - def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows( - func, incols, outcols, kwargs, cache_key=cache_key - ) - - meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) - return self.map_partitions( - do_apply_rows, func, incols, outcols, kwargs, meta=meta - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def merge(self, other, shuffle_method=None, **kwargs): - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().merge( - other, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def join(self, other, shuffle_method=None, **kwargs): - # CuDF doesn't support "right" join yet - how = kwargs.pop("how", "left") - if how == "right": - return other.join(other=self, how="left", **kwargs) - - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().join( - other, - how=how, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def set_index( - self, - other, - sorted=False, - divisions=None, - shuffle_method=None, - **kwargs, - ): - pre_sorted = sorted - del sorted - - if divisions == "quantile": - warnings.warn( - "Using divisions='quantile' is now deprecated. 
" - "Please raise an issue on github if you believe " - "this feature is necessary.", - FutureWarning, - ) - - if ( - divisions == "quantile" - or isinstance(divisions, (cudf.DataFrame, cudf.Series)) - or ( - isinstance(other, str) - and cudf.api.types.is_string_dtype(self[other].dtype) - ) - ): - # Let upstream-dask handle "pre-sorted" case - if pre_sorted: - return dd.shuffle.set_sorted_index( - self, other, divisions=divisions, **kwargs - ) - - by = other - if not isinstance(other, list): - by = [by] - if len(by) > 1: - raise ValueError("Dask does not support MultiIndex (yet).") - if divisions == "quantile": - divisions = None - - # Use dask_cudf's sort_values - df = self.sort_values( - by, - max_branch=kwargs.get("max_branch", None), - divisions=divisions, - set_divisions=True, - ignore_index=True, - shuffle_method=shuffle_method, - ) - - # Ignore divisions if its a dataframe - if isinstance(divisions, cudf.DataFrame): - divisions = None - - # Set index and repartition - df2 = df.map_partitions( - sorting.set_index_post, - index_name=other, - drop=kwargs.get("drop", True), - column_dtype=df.columns.dtype, - ) - npartitions = kwargs.get("npartitions", self.npartitions) - partition_size = kwargs.get("partition_size", None) - if partition_size: - return df2.repartition(partition_size=partition_size) - if not divisions and df2.npartitions != npartitions: - return df2.repartition(npartitions=npartitions) - if divisions and df2.npartitions != len(divisions) - 1: - return df2.repartition(divisions=divisions) - return df2 - - return super().set_index( - other, - sorted=pre_sorted, - shuffle_method=_get_shuffle_method(shuffle_method), - divisions=divisions, - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def sort_values( - self, - by, - ignore_index=False, - max_branch=None, - divisions=None, - set_divisions=False, - ascending=True, - na_position="last", - sort_function=None, - sort_function_kwargs=None, - shuffle_method=None, - **kwargs, - ): - if kwargs: - raise ValueError( - f"Unsupported input arguments passed : {list(kwargs.keys())}" - ) - - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - shuffle_method=shuffle_method, - sort_function=sort_function, - sort_function_kwargs=sort_function_kwargs, - ) - - if ignore_index: - return df.reset_index(drop=True) - return df - - @_dask_cudf_performance_tracking - def to_parquet(self, path, *args, **kwargs): - """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" - from dask_cudf._legacy.io import to_parquet - - return to_parquet(self, path, *args, **kwargs) - - @_dask_cudf_performance_tracking - def to_orc(self, path, **kwargs): - """Calls dask_cudf._legacy.io.to_orc""" - from dask_cudf._legacy.io import to_orc - - return to_orc(self, path, **kwargs) - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - numeric_only=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, 
split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def shuffle(self, *args, shuffle_method=None, **kwargs): - """Wraps dask.dataframe DataFrame.shuffle method""" - return super().shuffle( - *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs - ) - - @_dask_cudf_performance_tracking - def groupby(self, by=None, **kwargs): - from .groupby import CudfDataFrameGroupBy - - return CudfDataFrameGroupBy(self, by=by, **kwargs) - - -@_dask_cudf_performance_tracking -def sum_of_squares(x): - x = x.astype("f8")._column - outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series._from_column(outcol) - - -@_dask_cudf_performance_tracking -def var_aggregate(x2, x, n, ddof): - try: - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - result = (x2 / n) - (x / n) ** 2 - if ddof != 0: - result = result * n / (n - ddof) - return result - except ZeroDivisionError: - return np.float64(np.nan) - - -@_dask_cudf_performance_tracking -def nlargest_agg(x, **kwargs): - return cudf.concat(x).nlargest(**kwargs) - - -@_dask_cudf_performance_tracking -def nsmallest_agg(x, **kwargs): - return cudf.concat(x).nsmallest(**kwargs) - - -class Series(_Frame, dd.core.Series): - _partition_type = cudf.Series - - @_dask_cudf_performance_tracking - def count(self, split_every=False): - return reduction( - [self], - chunk=M.count, - aggregate=np.sum, - split_every=split_every, - meta="i8", - ) - - @_dask_cudf_performance_tracking - def mean(self, split_every=False): - sum = self.sum(split_every=split_every) - n = self.count(split_every=split_every) - return sum / n - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_dask_cudf_performance_tracking - def groupby(self, *args, **kwargs): - from .groupby import CudfSeriesGroupBy - - return CudfSeriesGroupBy(self, *args, **kwargs) - - @property # type: ignore - @_dask_cudf_performance_tracking - def list(self): - return ListMethods(self) - - @property # type: ignore - @_dask_cudf_performance_tracking - def struct(self): - return StructMethods(self) - - -class Index(Series, dd.core.Index): - _partition_type = cudf.Index # type: ignore - - -@_dask_cudf_performance_tracking -def _naive_var(ddf, meta, skipna, ddof, split_every, out): - num = ddf._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _parallel_var(ddf, meta, skipna, split_every, out): - def _local_var(x, skipna): - if skipna: - n = x.count() - avg = x.mean(skipna=skipna) - else: - # Not 
skipping nulls, so might as well - # avoid the full `count` operation - n = len(x) - avg = x.sum(skipna=skipna) / n - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - def _finalize_var(vals): - n, _, m2 = vals - return m2 / (n - 1) - - # Build graph - nparts = ddf.npartitions - if not split_every: - split_every = nparts - name = "var-" + tokenize(skipna, split_every, out) - local_name = "local-" + name - num = ddf._get_numeric_data() - dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) - } - - # Use reduction tree - widths = [nparts] - while nparts > 1: - nparts = math.ceil(nparts / split_every) - widths.append(nparts) - height = len(widths) - for depth in range(1, height): - for group in range(widths[depth]): - p_max = widths[depth - 1] - lstart = split_every * group - lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] - dsk[(local_name, group, depth)] = (_aggregate_var, node_list) - if height == 1: - group = depth = 0 - dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) - result = dd.core.new_dd_object(graph, name, meta, (None, None)) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _extract_meta(x): - """ - Extract internal cache data (``_meta``) from dask_cudf objects - """ - if isinstance(x, (Scalar, _Frame)): - return x._meta - elif isinstance(x, list): - return [_extract_meta(_x) for _x in x] - elif isinstance(x, tuple): - return tuple(_extract_meta(_x) for _x in x) - elif isinstance(x, dict): - return {k: _extract_meta(v) for k, v in x.items()} - return x - - -@_dask_cudf_performance_tracking -def _emulate(func, *args, **kwargs): - """ - Apply a function using args / kwargs. If arguments contain dd.DataFrame / - dd.Series, using internal cache (``_meta``) for calculation - """ - with raise_on_meta_error(funcname(func)): - return func(*_extract_meta(args), **_extract_meta(kwargs)) - - -@_dask_cudf_performance_tracking -def align_partitions(args): - """Align partitions between dask_cudf objects. - - Note that if all divisions are unknown, but have equal npartitions, then - they will be passed through unchanged. - """ - dfs = [df for df in args if isinstance(df, _Frame)] - if not dfs: - return args - - divisions = dfs[0].divisions - if not all(df.divisions == divisions for df in dfs): - raise NotImplementedError("Aligning mismatched partitions") - return args - - -@_dask_cudf_performance_tracking -def reduction( - args, - chunk=None, - aggregate=None, - combine=None, - meta=None, - token=None, - chunk_kwargs=None, - aggregate_kwargs=None, - combine_kwargs=None, - split_every=None, - **kwargs, -): - """Generic tree reduction operation. - - Parameters - ---------- - args : - Positional arguments for the `chunk` function. All `dask.dataframe` - objects should be partitioned and indexed equivalently. 
- chunk : function [block-per-arg] -> block - Function to operate on each block of data - aggregate : function list-of-blocks -> block - Function to operate on the list of results of chunk - combine : function list-of-blocks -> block, optional - Function to operate on intermediate lists of results of chunk - in a tree-reduction. If not provided, defaults to aggregate. - $META - token : str, optional - The name to use for the output keys. - chunk_kwargs : dict, optional - Keywords for the chunk function only. - aggregate_kwargs : dict, optional - Keywords for the aggregate function only. - combine_kwargs : dict, optional - Keywords for the combine function only. - split_every : int, optional - Group partitions into groups of this size while performing a - tree-reduction. If set to False, no tree-reduction will be used, - and all intermediates will be concatenated and passed to ``aggregate``. - Default is 8. - kwargs : - All remaining keywords will be passed to ``chunk``, ``aggregate``, and - ``combine``. - """ - if chunk_kwargs is None: - chunk_kwargs = dict() - if aggregate_kwargs is None: - aggregate_kwargs = dict() - chunk_kwargs.update(kwargs) - aggregate_kwargs.update(kwargs) - - if combine is None: - if combine_kwargs: - raise ValueError("`combine_kwargs` provided with no `combine`") - combine = aggregate - combine_kwargs = aggregate_kwargs - else: - if combine_kwargs is None: - combine_kwargs = dict() - combine_kwargs.update(kwargs) - - if not isinstance(args, (tuple, list)): - args = [args] - - npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} - if len(npartitions) > 1: - raise ValueError("All arguments must have same number of partitions") - npartitions = npartitions.pop() - - if split_every is None: - split_every = 8 - elif split_every is False: - split_every = npartitions - elif split_every < 2 or not isinstance(split_every, int): - raise ValueError("split_every must be an integer >= 2") - - token_key = tokenize( - token or (chunk, aggregate), - meta, - args, - chunk_kwargs, - aggregate_kwargs, - combine_kwargs, - split_every, - ) - - # Chunk - a = f"{token or funcname(chunk)}-chunk-{token_key}" - if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } - else: - dsk = { - (a, 0, i): ( - apply, - chunk, - [(x._name, i) if isinstance(x, _Frame) else x for x in args], - chunk_kwargs, - ) - for i in range(args[0].npartitions) - } - - # Combine - b = f"{token or funcname(combine)}-combine-{token_key}" - k = npartitions - depth = 0 - while k > split_every: - for part_i, inds in enumerate(partition_all(split_every, range(k))): - conc = (list, [(a, depth, i) for i in inds]) - dsk[(b, depth + 1, part_i)] = ( - (apply, combine, [conc], combine_kwargs) - if combine_kwargs - else (combine, conc) - ) - k = part_i + 1 - a = b - depth += 1 - - # Aggregate - b = f"{token or funcname(aggregate)}-agg-{token_key}" - conc = (list, [(a, depth, i) for i in range(k)]) - if aggregate_kwargs: - dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) - else: - dsk[(b, 0)] = (aggregate, conc) - - if meta is None: - meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) - meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) - meta = dask_make_meta(meta) - - graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) - return dd.core.new_dd_object(graph, b, meta, (None, None)) - - -for name in ( - "add", - "sub", - "mul", - "truediv", - "floordiv", - 
"mod", - "pow", - "radd", - "rsub", - "rmul", - "rtruediv", - "rfloordiv", - "rmod", - "rpow", -): - meth = getattr(cudf.DataFrame, name) - DataFrame._bind_operator_method(name, meth, original=cudf.Series) - - meth = getattr(cudf.Series, name) - Series._bind_operator_method(name, meth, original=cudf.Series) - -for name in ("lt", "gt", "le", "ge", "ne", "eq"): - meth = getattr(cudf.Series, name) - Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/_legacy/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py deleted file mode 100644 index 7e01e91476d..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/groupby.py +++ /dev/null @@ -1,909 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. -from __future__ import annotations - -from functools import wraps - -import numpy as np -import pandas as pd - -from dask.dataframe.core import ( - DataFrame as DaskDataFrame, - aca, - split_out_on_cols, -) -from dask.dataframe.groupby import DataFrameGroupBy, SeriesGroupBy -from dask.utils import funcname - -import cudf -from cudf.core.groupby.groupby import _deprecate_collect -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg - -# aggregations that are dask-cudf optimized -OPTIMIZED_AGGS = ( - "count", - "mean", - "std", - "var", - "sum", - "min", - "max", - list, - "first", - "last", -) - - -def _check_groupby_optimized(func): - """ - Decorator for dask-cudf's groupby methods that returns the dask-cudf - optimized method if the groupby object is supported, otherwise - reverting to the upstream Dask method - """ - - @wraps(func) - def wrapper(*args, **kwargs): - gb = args[0] - if _groupby_optimized(gb): - return func(*args, **kwargs) - # note that we use upstream Dask's default kwargs for this call if - # none are specified; this shouldn't be an issue as those defaults are - # consistent with dask-cudf - return getattr(super(type(gb), gb), func.__name__)(*args[1:], **kwargs) - - return wrapper - - -class CudfDataFrameGroupBy(DataFrameGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - def __getitem__(self, key): - if isinstance(key, list): - g = CudfDataFrameGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - else: - g = CudfSeriesGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - **self.dropna, - ) - - g._meta = g._meta[key] - return g - - @_dask_cudf_performance_tracking - def _make_groupby_method_aggs(self, agg_name): - """Create aggs dictionary for aggregation methods""" - - if isinstance(self.by, list): - return {c: agg_name for c in self.obj.columns if c not in self.by} - return {c: agg_name for c in self.obj.columns if c != self.by} - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("count"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("mean"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, 
split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("std"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("var"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("sum"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("min"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("max"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs(list), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("first"), - split_every, - split_out, - ) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - self._make_groupby_method_aggs("last"), - split_every, - split_out, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - if isinstance(self._meta.grouping.keys, cudf.MultiIndex): - keys = self._meta.grouping.keys.names - else: - keys = self._meta.grouping.keys.name - - return groupby_agg( - self.obj, - keys, - arg, - split_every=split_every, - split_out=split_out, - sep=self.sep, - sort=self.sort, - as_index=self.as_index, - shuffle_method=shuffle_method, - **self.dropna, - ) - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -class CudfSeriesGroupBy(SeriesGroupBy): - @_dask_cudf_performance_tracking - def __init__(self, *args, sort=None, **kwargs): - self.sep = kwargs.pop("sep", "___") - self.as_index = kwargs.pop("as_index", True) - super().__init__(*args, sort=sort, **kwargs) - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def count(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "count"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def mean(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "mean"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def std(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "std"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def var(self, 
split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "var"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def sum(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "sum"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def min(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "min"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def max(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "max"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def collect(self, split_every=None, split_out=1): - _deprecate_collect() - return _make_groupby_agg_call( - self, - {self._slice: list}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def first(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "first"}, - split_every, - split_out, - )[self._slice] - - @_dask_cudf_performance_tracking - @_check_groupby_optimized - def last(self, split_every=None, split_out=1): - return _make_groupby_agg_call( - self, - {self._slice: "last"}, - split_every, - split_out, - )[self._slice] - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def aggregate( - self, arg, split_every=None, split_out=1, shuffle_method=None - ): - if arg == "size": - return self.size() - - arg = _redirect_aggs(arg) - - if not isinstance(arg, dict): - arg = {self._slice: arg} - - if _groupby_optimized(self) and _aggs_optimized(arg, OPTIMIZED_AGGS): - return _make_groupby_agg_call( - self, arg, split_every, split_out, shuffle_method - )[self._slice] - - return super().aggregate( - arg, - split_every=split_every, - split_out=split_out, - shuffle_method=shuffle_method, - ) - - -def _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token=None, - sort=None, - shuffle_method=None, -): - # Shuffle-based groupby aggregation - # NOTE: This function is the dask_cudf version of - # dask.dataframe.groupby._shuffle_aggregate - - # Step 1 - Chunkwise groupby operation - chunk_name = f"{token or funcname(chunk)}-chunk" - chunked = ddf.map_partitions( - chunk, - meta=chunk(ddf._meta, **chunk_kwargs), - token=chunk_name, - **chunk_kwargs, - ) - - # Step 2 - Perform global sort or shuffle - shuffle_npartitions = max( - chunked.npartitions // split_every, - split_out, - ) - if sort and split_out > 1: - # Sort-based code path - result = ( - chunked.repartition(npartitions=shuffle_npartitions) - .sort_values( - gb_cols, - ignore_index=True, - shuffle_method=shuffle_method, - ) - .map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - ) - else: - # Hash-based code path - result = chunked.shuffle( - gb_cols, - npartitions=shuffle_npartitions, - ignore_index=True, - shuffle_method=shuffle_method, - ).map_partitions( - aggregate, - meta=aggregate(chunked._meta, **aggregate_kwargs), - **aggregate_kwargs, - ) - - # Step 3 - Repartition and return - if split_out < result.npartitions: - return result.repartition(npartitions=split_out) - return result - - -@_dask_cudf_performance_tracking -def 
groupby_agg( - ddf, - gb_cols, - aggs_in, - split_every=None, - split_out=None, - dropna=True, - sep="___", - sort=False, - as_index=True, - shuffle_method=None, -): - """Optimized groupby aggregation for Dask-CuDF. - - Parameters - ---------- - ddf : DataFrame - DataFrame object to perform grouping on. - gb_cols : str or list[str] - Column names to group by. - aggs_in : str, list, or dict - Aggregations to perform. - split_every : int (optional) - How to group intermediate aggregates. - dropna : bool - Drop grouping key values corresponding to NA values. - as_index : bool - Currently ignored. - sort : bool - Sort the group keys, better performance is obtained when - not sorting. - shuffle_method : str (optional) - Control how shuffling of the DataFrame is performed. - sep : str - Internal usage. - - - Notes - ----- - This "optimized" approach is more performant than the algorithm in - implemented in :meth:`DataFrame.apply` because it allows the cuDF - backend to perform multiple aggregations at once. - - This aggregation algorithm only supports the following options - - * "list" - * "count" - * "first" - * "last" - * "max" - * "mean" - * "min" - * "std" - * "sum" - * "var" - - - See Also - -------- - DataFrame.groupby : generic groupby of a DataFrame - dask.dataframe.apply_concat_apply : for more description of the - split_every argument. - - """ - # Assert that aggregations are supported - aggs = _redirect_aggs(aggs_in) - if not _aggs_optimized(aggs, OPTIMIZED_AGGS): - raise ValueError( - f"Supported aggs include {OPTIMIZED_AGGS} for groupby_agg API. " - f"Aggregations must be specified with dict or list syntax." - ) - - # If split_every is False, we use an all-to-one reduction - if split_every is False: - split_every = max(ddf.npartitions, 2) - - # Deal with default split_out and split_every params - split_every = split_every or 8 - split_out = split_out or 1 - - # Standardize `gb_cols`, `columns`, and `aggs` - if isinstance(gb_cols, str): - gb_cols = [gb_cols] - columns = [c for c in ddf.columns if c not in gb_cols] - if not isinstance(aggs, dict): - aggs = {col: aggs for col in columns} - - # Assert if our output will have a MultiIndex; this will be the case if - # any value in the `aggs` dict is not a string (i.e. multiple/named - # aggregations per column) - str_cols_out = True - aggs_renames = {} - for col in aggs: - if isinstance(aggs[col], str) or callable(aggs[col]): - aggs[col] = [aggs[col]] - elif isinstance(aggs[col], dict): - str_cols_out = False - col_aggs = [] - for k, v in aggs[col].items(): - aggs_renames[col, v] = k - col_aggs.append(v) - aggs[col] = col_aggs - else: - str_cols_out = False - if col in gb_cols: - columns.append(col) - - # Construct meta - _aggs = aggs.copy() - if str_cols_out: - # Metadata should use `str` for dict values if that is - # what the user originally specified (column names will - # be str, rather than tuples). 
- for col in aggs: - _aggs[col] = _aggs[col][0] - _meta = ddf._meta.groupby(gb_cols, as_index=as_index).agg(_aggs) - if aggs_renames: - col_array = [] - agg_array = [] - for col, agg in _meta.columns: - col_array.append(col) - agg_array.append(aggs_renames.get((col, agg), agg)) - _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - chunk = _groupby_partition_agg - chunk_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - combine = _tree_node_agg - combine_kwargs = { - "gb_cols": gb_cols, - "dropna": dropna, - "sort": sort, - "sep": sep, - } - - aggregate = _finalize_gb_agg - aggregate_kwargs = { - "gb_cols": gb_cols, - "aggs": aggs, - "columns": columns, - "final_columns": _meta.columns, - "as_index": as_index, - "dropna": dropna, - "sort": sort, - "sep": sep, - "str_cols_out": str_cols_out, - "aggs_renames": aggs_renames, - } - - # Use shuffle_method=True for split_out>1 - if sort and split_out > 1 and shuffle_method is None: - shuffle_method = "tasks" - - # Check if we are using the shuffle-based algorithm - if shuffle_method: - # Shuffle-based aggregation - return _shuffle_aggregate( - ddf, - gb_cols, - chunk, - chunk_kwargs, - aggregate, - aggregate_kwargs, - split_every, - split_out, - token="cudf-aggregate", - sort=sort, - shuffle_method=shuffle_method - if isinstance(shuffle_method, str) - else None, - ) - - # Deal with sort/shuffle defaults - if split_out > 1 and sort: - raise ValueError( - "dask-cudf's groupby algorithm does not yet support " - "`sort=True` when `split_out>1`, unless a shuffle-based " - "algorithm is used. Please use `split_out=1`, group " - "with `sort=False`, or set `shuffle_method=True`." - ) - - # Determine required columns to enable column projection - required_columns = list( - set(gb_cols).union(aggs.keys()).intersection(ddf.columns) - ) - - return aca( - [ddf[required_columns]], - chunk=chunk, - chunk_kwargs=chunk_kwargs, - combine=combine, - combine_kwargs=combine_kwargs, - aggregate=aggregate, - aggregate_kwargs=aggregate_kwargs, - token="cudf-aggregate", - split_every=split_every, - split_out=split_out, - split_out_setup=split_out_on_cols, - split_out_setup_kwargs={"cols": gb_cols}, - sort=sort, - ignore_index=True, - ) - - -@_dask_cudf_performance_tracking -def _make_groupby_agg_call( - gb, aggs, split_every, split_out, shuffle_method=None -): - """Helper method to consolidate the common `groupby_agg` call for all - aggregations in one place - """ - - return groupby_agg( - gb.obj, - gb.by, - aggs, - split_every=split_every, - split_out=split_out, - sep=gb.sep, - sort=gb.sort, - as_index=gb.as_index, - shuffle_method=shuffle_method, - **gb.dropna, - ) - - -@_dask_cudf_performance_tracking -def _redirect_aggs(arg): - """Redirect aggregations to their corresponding name in cuDF""" - redirects = { - sum: "sum", - max: "max", - min: "min", - "collect": list, - "list": list, - } - if isinstance(arg, dict): - new_arg = dict() - for col in arg: - if isinstance(arg[col], list): - new_arg[col] = [redirects.get(agg, agg) for agg in arg[col]] - elif isinstance(arg[col], dict): - new_arg[col] = { - k: redirects.get(v, v) for k, v in arg[col].items() - } - else: - new_arg[col] = redirects.get(arg[col], arg[col]) - return new_arg - if isinstance(arg, list): - return [redirects.get(agg, agg) for agg in arg] - return redirects.get(arg, arg) - - -@_dask_cudf_performance_tracking -def _aggs_optimized(arg, supported: set): - """Check that aggregations in `arg` are a subset of 
`supported`""" - if isinstance(arg, (list, dict)): - if isinstance(arg, dict): - _global_set: set[str] = set() - for col in arg: - if isinstance(arg[col], list): - _global_set = _global_set.union(set(arg[col])) - elif isinstance(arg[col], dict): - _global_set = _global_set.union(set(arg[col].values())) - else: - _global_set.add(arg[col]) - else: - _global_set = set(arg) - - return bool(_global_set.issubset(supported)) - elif isinstance(arg, (str, type)): - return arg in supported - return False - - -@_dask_cudf_performance_tracking -def _groupby_optimized(gb): - """Check that groupby input can use dask-cudf optimized codepath""" - return isinstance(gb.obj, DaskDataFrame) and ( - isinstance(gb.by, str) - or (isinstance(gb.by, list) and all(isinstance(x, str) for x in gb.by)) - ) - - -def _make_name(col_name, sep="_"): - """Combine elements of `col_name` into a single string, or no-op if - `col_name` is already a string - """ - if isinstance(col_name, str): - return col_name - return sep.join(name for name in col_name if name != "") - - -@_dask_cudf_performance_tracking -def _groupby_partition_agg(df, gb_cols, aggs, columns, dropna, sort, sep): - """Initial partition-level aggregation task. - - This is the first operation to be executed on each input - partition in `groupby_agg`. Depending on `aggs`, four possible - groupby aggregations ("count", "sum", "min", and "max") are - performed. The result is then partitioned (by hashing `gb_cols`) - into a number of distinct dictionary elements. The number of - elements in the output dictionary (`split_out`) corresponds to - the number of partitions in the final output of `groupby_agg`. - """ - - # Modify dict for initial (partition-wise) aggregations - _agg_dict = {} - for col, agg_list in aggs.items(): - _agg_dict[col] = set() - for agg in agg_list: - if agg in ("mean", "std", "var"): - _agg_dict[col].add("count") - _agg_dict[col].add("sum") - else: - _agg_dict[col].add(agg) - _agg_dict[col] = list(_agg_dict[col]) - if set(agg_list).intersection({"std", "var"}): - pow2_name = _make_name((col, "pow2"), sep=sep) - df[pow2_name] = df[col].astype("float64").pow(2) - _agg_dict[pow2_name] = ["sum"] - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - _agg_dict - ) - output_columns = [_make_name(name, sep=sep) for name in gb.columns] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _tree_node_agg(df, gb_cols, dropna, sort, sep): - """Node in groupby-aggregation reduction tree. - - The input DataFrame (`df`) corresponds to the - concatenated output of one or more `_groupby_partition_agg` - tasks. In this function, "sum", "min" and/or "max" groupby - aggregations will be used to combine the statistics for - duplicate keys. 
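# [editor's note] Why _groupby_partition_agg above emits "count"/"sum" (plus
# a squared helper column) instead of "mean"/"std"/"var" directly: those
# statistics cannot be merged across partitions, but their sufficient
# statistics can, and _tree_node_agg only ever has to sum them up the
# reduction tree. A NumPy sketch of the recombination (illustrative only;
# mirrors _var_agg below):
import numpy as np

parts = [np.array([1.0, 2.0]), np.array([3.0, 4.0, 5.0])]  # two "partitions"
n = sum(len(p) for p in parts)           # merged count
s = sum(p.sum() for p in parts)          # merged sum
s2 = sum((p**2).sum() for p in parts)    # merged sum of squares
mean = s / n
var = (s2 - s**2 / n) / (n - 1)          # ddof=1, as in _var_agg
assert np.isclose(var, np.var(np.concatenate(parts), ddof=1))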
- """ - - agg_dict = {} - for col in df.columns: - if col in gb_cols: - continue - agg = col.split(sep)[-1] - if agg in ("count", "sum"): - agg_dict[col] = ["sum"] - elif agg == "list": - agg_dict[col] = [list] - elif agg in OPTIMIZED_AGGS: - agg_dict[col] = [agg] - else: - raise ValueError(f"Unexpected aggregation: {agg}") - - gb = df.groupby(gb_cols, dropna=dropna, as_index=False, sort=sort).agg( - agg_dict - ) - - # Don't include the last aggregation in the column names - output_columns = [ - _make_name(name[:-1] if isinstance(name, tuple) else name, sep=sep) - for name in gb.columns - ] - gb.columns = output_columns - # Return with deterministic column ordering - return gb[sorted(output_columns)] - - -@_dask_cudf_performance_tracking -def _var_agg(df, col, count_name, sum_name, pow2_sum_name, ddof=1): - """Calculate variance (given count, sum, and sum-squared columns).""" - - # Select count, sum, and sum-squared - n = df[count_name] - x = df[sum_name] - x2 = df[pow2_sum_name] - - # Use sum-squared approach to get variance - var = x2 - x**2 / n - div = n - ddof - div[div < 1] = 1 # Avoid division by 0 - var /= div - - # Set appropriate NaN elements - # (since we avoided 0-division) - var[(n - ddof) == 0] = np.nan - - return var - - -@_dask_cudf_performance_tracking -def _finalize_gb_agg( - gb_in, - gb_cols, - aggs, - columns, - final_columns, - as_index, - dropna, - sort, - sep, - str_cols_out, - aggs_renames, -): - """Final aggregation task. - - This is the final operation on each output partitions - of the `groupby_agg` algorithm. This function must - take care of higher-order aggregations, like "mean", - "std" and "var". We also need to deal with the column - index, the row index, and final sorting behavior. - """ - - gb = _tree_node_agg(gb_in, gb_cols, dropna, sort, sep) - - # Deal with higher-order aggregations - for col in columns: - agg_list = aggs.get(col, []) - agg_set = set(agg_list) - if agg_set.intersection({"mean", "std", "var"}): - count_name = _make_name((col, "count"), sep=sep) - sum_name = _make_name((col, "sum"), sep=sep) - if agg_set.intersection({"std", "var"}): - pow2_sum_name = _make_name((col, "pow2", "sum"), sep=sep) - var = _var_agg(gb, col, count_name, sum_name, pow2_sum_name) - if "var" in agg_list: - name_var = _make_name((col, "var"), sep=sep) - gb[name_var] = var - if "std" in agg_list: - name_std = _make_name((col, "std"), sep=sep) - gb[name_std] = np.sqrt(var) - gb.drop(columns=[pow2_sum_name], inplace=True) - if "mean" in agg_list: - mean_name = _make_name((col, "mean"), sep=sep) - gb[mean_name] = gb[sum_name] / gb[count_name] - if "sum" not in agg_list: - gb.drop(columns=[sum_name], inplace=True) - if "count" not in agg_list: - gb.drop(columns=[count_name], inplace=True) - if list in agg_list: - collect_name = _make_name((col, "list"), sep=sep) - gb[collect_name] = gb[collect_name].list.concat() - - # Ensure sorted keys if `sort=True` - if sort: - gb = gb.sort_values(gb_cols) - - # Set index if necessary - if as_index: - gb.set_index(gb_cols, inplace=True) - - # Unflatten column names - col_array = [] - agg_array = [] - for col in gb.columns: - if col in gb_cols: - col_array.append(col) - agg_array.append("") - else: - name, agg = col.split(sep) - col_array.append(name) - agg_array.append(aggs_renames.get((name, agg), agg)) - if str_cols_out: - gb.columns = col_array - else: - gb.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - - return gb[final_columns] diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py 
b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py index 0421bd755f4..c544c32523f 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py @@ -1,11 +1 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from .csv import read_csv # noqa: F401 -from .json import read_json # noqa: F401 -from .orc import read_orc, to_orc # noqa: F401 -from .text import read_text # noqa: F401 - -try: - from .parquet import read_parquet, to_parquet # noqa: F401 -except ImportError: - pass +# Copyright (c) 2018-2025, NVIDIA CORPORATION. diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py deleted file mode 100644 index fa5400344f9..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/csv.py +++ /dev/null @@ -1,222 +0,0 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -import os -from glob import glob -from warnings import warn - -from fsspec.utils import infer_compression - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.csv import make_reader -from dask.utils import apply, parse_bytes - -import cudf - - -def read_csv(path, blocksize="default", **kwargs): - """ - Read CSV files into a :class:`.DataFrame`. - - This API parallelizes the :func:`cudf:cudf.read_csv` function in - the following ways: - - It supports loading many files at once using globstrings: - - >>> import dask_cudf - >>> df = dask_cudf.read_csv("myfiles.*.csv") - - In some cases it can break up large files: - - >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - - It can read CSV files from external resources (e.g. S3, HTTP, FTP) - - >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") - >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - - Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and - supports many of the same keyword arguments with the same - performance guarantees. See the docstring for - :func:`cudf:cudf.read_csv` for more information on available - keyword arguments. - - Parameters - ---------- - path : str, path object, or file-like object - Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as - builtin :py:func:`open` file handler function or - :py:class:`~io.StringIO`). - blocksize : int or str, default "256 MiB" - The target task partition size. If ``None``, a single block - is used for each file. - **kwargs : dict - Passthrough key-word arguments that are sent to - :func:`cudf:cudf.read_csv`. - - Notes - ----- - If any of `skipfooter`/`skiprows`/`nrows` are passed, - `blocksize` will default to None. - - Examples - -------- - >>> import dask_cudf - >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) - >>> ddf.compute() - a b - 0 1 hi - 1 2 hello - 2 3 ai - - """ - - # Handle `chunksize` deprecation - if "chunksize" in kwargs: - chunksize = kwargs.pop("chunksize", "default") - warn( - "`chunksize` is deprecated and will be removed in the future. " - "Please use `blocksize` instead.", - FutureWarning, - ) - if blocksize == "default": - blocksize = chunksize - - # Set default `blocksize` - if blocksize == "default": - if ( - kwargs.get("skipfooter", 0) != 0 - or kwargs.get("skiprows", 0) != 0 - or kwargs.get("nrows", None) is not None - ): - # Cannot read in blocks if skipfooter, - # skiprows or nrows is passed. 
- blocksize = None - else: - blocksize = "256 MiB" - - if "://" in str(path): - func = make_reader(cudf.read_csv, "read_csv", "CSV") - return func(path, blocksize=blocksize, **kwargs) - else: - return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) - - -def _internal_read_csv(path, blocksize="256 MiB", **kwargs): - if isinstance(blocksize, str): - blocksize = parse_bytes(blocksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - name = "read-csv-" + tokenize( - path, tokenize, **kwargs - ) # TODO: get last modified time - - compression = kwargs.get("compression", "infer") - - if compression == "infer": - # Infer compression from first path by default - compression = infer_compression(filenames[0]) - - if compression and blocksize: - # compressed CSVs reading must read the entire file - kwargs.pop("byte_range", None) - warn( - "Warning %s compression does not support breaking apart files\n" - "Please ensure that each individual file can fit in memory and\n" - "use the keyword ``blocksize=None to remove this message``\n" - "Setting ``blocksize=(size of file)``" % compression - ) - blocksize = None - - if blocksize is None: - return read_csv_without_blocksize(path, **kwargs) - - # Let dask.dataframe generate meta - dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - kwargs1 = kwargs.copy() - usecols = kwargs1.pop("usecols", None) - dtype = kwargs1.pop("dtype", None) - meta = dask_reader(filenames[0], **kwargs1)._meta - names = meta.columns - if usecols or dtype: - # Regenerate meta with original kwargs if - # `usecols` or `dtype` was specified - meta = dask_reader(filenames[0], **kwargs)._meta - - dsk = {} - i = 0 - dtypes = meta.dtypes.values - - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, blocksize): - kwargs2 = kwargs.copy() - kwargs2["byte_range"] = ( - start, - blocksize, - ) # specify which chunk of the file we care about - if start != 0: - kwargs2["names"] = names # no header in the middle of the file - kwargs2["header"] = None - dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) - - i += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def _read_csv(fn, dtypes=None, **kwargs): - return cudf.read_csv(fn, **kwargs) - - -def read_csv_without_blocksize(path, **kwargs): - """Read entire CSV with optional compression (gzip/zip) - - Parameters - ---------- - path : str - path to files (support for glob) - """ - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - name = "read-csv-" + tokenize(path, **kwargs) - - meta_kwargs = kwargs.copy() - if "skipfooter" in meta_kwargs: - meta_kwargs.pop("skipfooter") - if "nrows" in meta_kwargs: - meta_kwargs.pop("nrows") - # Read "head" of first file (first 5 rows). - # Convert to empty df for metadata. 
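# [editor's note] A sketch of the byte-range planning used by
# _internal_read_csv above (hypothetical sizes; not part of the patch). Each
# range becomes one task, and every non-initial chunk re-injects the column
# names because only the first byte range contains the header row.
def plan_byte_ranges(file_sizes, blocksize):
    tasks = []
    for fn, size in file_sizes.items():
        for start in range(0, size, blocksize):
            # cudf.read_csv(fn, byte_range=(start, blocksize)) parses the
            # rows that begin inside [start, start + blocksize)
            tasks.append((fn, (start, blocksize)))
    return tasks

# plan_byte_ranges({"a.csv": 600}, 256)
# -> [("a.csv", (0, 256)), ("a.csv", (256, 256)), ("a.csv", (512, 256))]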
- meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] - - graph = { - (name, i): (apply, cudf.read_csv, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - divisions = [None] * (len(filenames) + 1) - - return dd.core.new_dd_object(graph, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py deleted file mode 100644 index 98c5ceedb76..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/json.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. - -from functools import partial - -import numpy as np -from fsspec.core import get_compression, get_fs_token_paths - -import dask -from dask.utils import parse_bytes - -import cudf -from cudf.core.column import as_column -from cudf.utils.ioutils import _is_local_filesystem - -from dask_cudf.backends import _default_backend - - -def _read_json_partition( - paths, - fs=None, - include_path_column=False, - path_converter=None, - **kwargs, -): - # Transfer all data up front for remote storage - sources = ( - paths - if fs is None - else fs.cat_ranges( - paths, - [0] * len(paths), - fs.sizes(paths), - ) - ) - - if include_path_column: - # Add "path" column. - # Must iterate over sources sequentially - if not isinstance(include_path_column, str): - include_path_column = "path" - converted_paths = ( - paths - if path_converter is None - else [path_converter(path) for path in paths] - ) - dfs = [] - for i, source in enumerate(sources): - df = cudf.read_json(source, **kwargs) - df[include_path_column] = as_column( - converted_paths[i], length=len(df) - ) - dfs.append(df) - return cudf.concat(dfs) - else: - # Pass sources directly to cudf - return cudf.read_json(sources, **kwargs) - - -def read_json( - url_path, - engine="auto", - blocksize=None, - orient="records", - lines=None, - compression="infer", - aggregate_files=True, - **kwargs, -): - """Read JSON data into a :class:`.DataFrame`. - - This function wraps :func:`dask.dataframe.read_json`, and passes - ``engine=partial(cudf.read_json, engine="auto")`` by default. - - Parameters - ---------- - url_path : str, list of str - Location to read from. If a string, can include a glob character to - find a set of file names. - Supports protocol specifications such as ``"s3://"``. - engine : str or Callable, default "auto" - - If str, this value will be used as the ``engine`` argument - when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~collections.abc.Callable`, this value will be used as the - underlying function used to create each partition from JSON - data. The default value is "auto", so that - ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to :func:`dask.dataframe.read_json` by default. - aggregate_files : bool or int - Whether to map multiple files to each output partition. If True, - the `blocksize` argument will be used to determine the number of - files in each partition. If any one file is larger than `blocksize`, - the `aggregate_files` argument will be ignored. If an integer value - is specified, the `blocksize` argument will be ignored, and that - number of files will be mapped to each partition. Default is True. - **kwargs : - Key-word arguments to pass through to :func:`dask.dataframe.read_json`. 
- - Returns - ------- - :class:`.DataFrame` - - Examples - -------- - Load single file - - >>> from dask_cudf import read_json - >>> read_json('myfile.json') # doctest: +SKIP - - Load large line-delimited JSON files using partitions of approx - 256MB size - - >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP - - Load nested JSON data - - >>> read_json('myfile.json') # doctest: +SKIP - - See Also - -------- - dask.dataframe.read_json - - """ - - if lines is None: - lines = orient == "records" - if orient != "records" and lines: - raise ValueError( - 'Line-delimited JSON is only available with orient="records".' - ) - if blocksize and (orient != "records" or not lines): - raise ValueError( - "JSON file chunking only allowed for JSON-lines" - "input (orient='records', lines=True)." - ) - - inputs = [] - if aggregate_files and blocksize or int(aggregate_files) > 1: - # Attempt custom read if we are mapping multiple files - # to each output partition. Otherwise, upstream logic - # is sufficient. - - storage_options = kwargs.get("storage_options", {}) - fs, _, paths = get_fs_token_paths( - url_path, mode="rb", storage_options=storage_options - ) - if isinstance(aggregate_files, int) and aggregate_files > 1: - # Map a static file count to each partition - inputs = [ - paths[offset : offset + aggregate_files] - for offset in range(0, len(paths), aggregate_files) - ] - elif aggregate_files is True and blocksize: - # Map files dynamically (using blocksize) - file_sizes = fs.sizes(paths) # NOTE: This can be slow - blocksize = parse_bytes(blocksize) - if all([file_size <= blocksize for file_size in file_sizes]): - counts = np.unique( - np.floor(np.cumsum(file_sizes) / blocksize), - return_counts=True, - )[1] - offsets = np.concatenate([[0], counts.cumsum()]) - inputs = [ - paths[offsets[i] : offsets[i + 1]] - for i in range(len(offsets) - 1) - ] - - if inputs: - # Inputs were successfully populated. - # Use custom _read_json_partition function - # to generate each partition. - - compression = get_compression( - url_path[0] if isinstance(url_path, list) else url_path, - compression, - ) - _kwargs = dict( - orient=orient, - lines=lines, - compression=compression, - include_path_column=kwargs.get("include_path_column", False), - path_converter=kwargs.get("path_converter"), - ) - if not _is_local_filesystem(fs): - _kwargs["fs"] = fs - # TODO: Generate meta more efficiently - meta = _read_json_partition(inputs[0][:1], **_kwargs) - return dask.dataframe.from_map( - _read_json_partition, - inputs, - meta=meta, - **_kwargs, - ) - - # Fall back to dask.dataframe.read_json - return _default_backend( - dask.dataframe.read_json, - url_path, - engine=( - partial(cudf.read_json, engine=engine) - if isinstance(engine, str) - else engine - ), - blocksize=blocksize, - orient=orient, - lines=lines, - compression=compression, - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py deleted file mode 100644 index fcf684fd6c8..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/orc.py +++ /dev/null @@ -1,195 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from io import BufferedWriter, IOBase - -from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path -from pyarrow import orc as orc - -from dask import dataframe as dd -from dask.dataframe.io.utils import _get_pyarrow_dtypes - -import cudf - - -def _read_orc_stripe(source, fs, columns=None, kwargs=None): - """Pull out specific columns from specific stripe""" - path, stripe = source - if kwargs is None: - kwargs = {} - with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc( - f, stripes=[stripe], columns=columns, **kwargs - ) - return df_stripe - - -def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read ORC files into a :class:`.DataFrame`. - - Note that this function is mostly borrowed from upstream Dask. - - Parameters - ---------- - path : str or list[str] - Location of file(s), which can be a full URL with protocol specifier, - and may include glob character if a single string. - columns : None or list[str] - Columns to load. If None, loads all. - filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out - row groups using statistics stored for each row group as - Parquet metadata. Row groups that do not match the given - filter predicate are not read. The predicate is expressed in - `disjunctive normal form (DNF) - `__ - like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary - boolean logical combinations of single column predicates. The - innermost tuples each describe a single column predicate. The - list of inner predicates is interpreted as a conjunction - (AND), forming a more selective and multiple column predicate. - Finally, the outermost list combines these filters as a - disjunction (OR). Predicates may also be passed as a list of - tuples. This form is interpreted as a single conjunction. To - express OR in predicates, one must use the (preferred) - notation of list of lists of tuples. - storage_options : None or dict - Further parameters to pass to the bytes backend. 
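# [editor's note] The DNF predicate layout described above, as a tiny
# stand-alone evaluator (illustrative only, not the library's
# implementation): the outer list is an OR over conjunctions, and each inner
# list is an AND over (column, op, value) tuples.
import operator

_OPS = {"=": operator.eq, "==": operator.eq, "<": operator.lt, ">": operator.gt}

def dnf_matches(stats, filters):
    # filters=[[("x", "=", 0), ("y", ">", 1)], [("z", "<", 5)]] means
    # (x == 0 AND y > 1) OR (z < 5)
    return any(
        all(_OPS[op](stats[col], val) for col, op, val in conjunction)
        for conjunction in filters
    )

# dnf_matches({"x": 0, "y": 2, "z": 9},
#             [[("x", "=", 0), ("y", ">", 1)], [("z", "<", 5)]]) -> True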
- - See Also - -------- - dask.dataframe.read_orc - - Returns - ------- - dask_cudf.DataFrame - - """ - - storage_options = storage_options or {} - fs, _, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - schema = None - nstripes_per_file = [] - for path in paths: - with fs.open(path, "rb") as f: - o = orc.ORCFile(f) - if schema is None: - schema = o.schema - elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) - nstripes_per_file.append(o.nstripes) - schema = _get_pyarrow_dtypes(schema, categories=None) - if columns is not None: - ex = set(columns) - set(schema) - if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) - else: - columns = list(schema) - - with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc( - f, - stripes=[0] if nstripes_per_file[0] else None, - columns=columns, - **kwargs, - ) - - sources = [] - for path, n in zip(paths, nstripes_per_file): - for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) - ): - sources.append((path, stripe)) - - return dd.from_map( - _read_orc_stripe, - sources, - args=[fs], - columns=columns, - kwargs=kwargs, - meta=meta, - ) - - -def write_orc_partition(df, path, fs, filename, compression="snappy"): - full_path = fs.sep.join([path, filename]) - with fs.open(full_path, mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - cudf.io.to_orc(df, out_file, compression=compression) - return full_path - - -def to_orc( - df, - path, - write_index=True, - storage_options=None, - compression="snappy", - compute=True, - **kwargs, -): - """ - Write a :class:`.DataFrame` to ORC file(s) (one file per partition). - - Parameters - ---------- - df : DataFrame - path : str or pathlib.Path - Destination directory for data. Prepend with protocol like ``s3://`` - or ``hdfs://`` for remote data. - write_index : boolean, optional - Whether or not to write the index. Defaults to True. - storage_options : None or dict - Further parameters to pass to the bytes backend. - compression : string or dict, optional - compute : bool, optional - If True (default) then the result is computed immediately. If - False then a :class:`~dask.delayed.Delayed` object is returned - for future computation. 
- - """ - - from dask import compute as dask_compute, delayed - - # TODO: Use upstream dask implementation once available - # (see: Dask Issue#5596) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) - # Trim any protocol information from the path before forwarding - path = fs._strip_protocol(path) - - if write_index: - df = df.reset_index() - else: - # Not writing index - might as well drop it - df = df.reset_index(drop=True) - - fs.mkdirs(path, exist_ok=True) - - # Use i_offset and df.npartitions to define file-name list - filenames = ["part.%i.orc" % i for i in range(df.npartitions)] - - # write parts - dwrite = delayed(write_orc_partition) - parts = [ - dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) - ] - - if compute: - return dask_compute(*parts) - - return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py index c0638e4a1c3..c0792663c7e 100644 --- a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import itertools import warnings from functools import partial @@ -8,7 +8,7 @@ import pandas as pd from pyarrow import dataset as pa_ds, parquet as pq -from dask import dataframe as dd +import dask.dataframe as dd from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine try: @@ -448,65 +448,7 @@ def set_object_dtypes_from_pa_schema(df, schema): df._data[col_name] = col.astype(typ) -def read_parquet(path, columns=None, **kwargs): - """ - Read parquet files into a :class:`.DataFrame`. - - Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` - to coordinate the execution of :func:`cudf.read_parquet`, and to - ultimately create a :class:`.DataFrame` collection. - - See the :func:`dask.dataframe.read_parquet` documentation for - all available options. - - Examples - -------- - >>> from dask_cudf import read_parquet - >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP - - When dealing with one or more large parquet files having an - in-memory footprint >15% device memory, the ``split_row_groups`` - argument should be used to map Parquet **row-groups** to DataFrame - partitions (instead of **files** to partitions). For example, the - following code will map each row-group to a distinct partition: - - >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP - - To map **multiple** row-groups to each partition, an integer can be - passed to ``split_row_groups`` to specify the **maximum** number of - row-groups allowed in each output partition: - - >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP - - See Also - -------- - cudf.read_parquet - dask.dataframe.read_parquet - """ - if isinstance(columns, str): - columns = [columns] - - # Set "check_file_size" option to determine whether we - # should check the parquet-file size. This check is meant - # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 500_000_000) - if ( - check_file_size - and ("split_row_groups" not in kwargs) - and ("chunksize" not in kwargs) - ): - # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>0.5GB on disk. 
- # They can set `split_row_groups` explicitly to silence/skip - # this check - if "read" not in kwargs: - kwargs["read"] = {} - kwargs["read"]["check_file_size"] = check_file_size - - return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) - - -to_parquet = partial(dd.to_parquet, engine=CudfEngine) +to_parquet = dd.to_parquet if create_metadata_file_dd is None: create_metadata_file = create_metadata_file_dd diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py deleted file mode 100644 index 3757c85c80c..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/io/text.py +++ /dev/null @@ -1,56 +0,0 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. - -import os -from glob import glob - -import dask.dataframe as dd -from dask.utils import parse_bytes - -import cudf - - -def _read_text(source, **kwargs): - # Wrapper for cudf.read_text operation - fn, byte_range = source - return cudf.read_text(fn, byte_range=byte_range, **kwargs) - - -def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs): - if isinstance(chunksize, str): - chunksize = parse_bytes(chunksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." - raise FileNotFoundError(msg) - - if chunksize and byte_range: - raise ValueError("Cannot specify both chunksize and byte_range.") - - if chunksize: - sources = [] - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, chunksize): - byte_range = ( - start, - chunksize, - ) # specify which chunk of the file we care about - sources.append((fn, byte_range)) - else: - sources = [(fn, byte_range) for fn in filenames] - - return dd.from_map( - _read_text, - sources, - meta=cudf.Series([], dtype="O"), - **kwargs, - ) diff --git a/python/dask_cudf/dask_cudf/_legacy/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py deleted file mode 100644 index a2ba4d1878e..00000000000 --- a/python/dask_cudf/dask_cudf/_legacy/sorting.py +++ /dev/null @@ -1,361 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -import warnings -from collections.abc import Iterator -from functools import wraps - -import cupy -import numpy as np -import tlz as toolz - -from dask import config -from dask.base import tokenize -from dask.dataframe import methods -from dask.dataframe.core import DataFrame, Index, Series -from dask.dataframe.shuffle import rearrange_by_column -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M - -import cudf -from cudf.api.types import _is_categorical_dtype -from cudf.utils.performance_tracking import _dask_cudf_performance_tracking - -_SHUFFLE_SUPPORT = ("tasks", "p2p") # "disk" not supported - - -def _deprecate_shuffle_kwarg(func): - @wraps(func) - def wrapper(*args, **kwargs): - old_arg_value = kwargs.pop("shuffle", None) - - if old_arg_value is not None: - new_arg_value = old_arg_value - msg = ( - "the 'shuffle' keyword is deprecated, " - "use 'shuffle_method' instead." - ) - - warnings.warn(msg, FutureWarning) - if kwargs.get("shuffle_method") is not None: - msg = ( - "Can only specify 'shuffle' " - "or 'shuffle_method', not both." 
- ) - raise TypeError(msg) - kwargs["shuffle_method"] = new_arg_value - return func(*args, **kwargs) - - return wrapper - - -@_dask_cudf_performance_tracking -def set_index_post(df, index_name, drop, column_dtype): - df2 = df.set_index(index_name, drop=drop) - df2.columns = df2.columns.astype(column_dtype) - return df2 - - -@_dask_cudf_performance_tracking -def _set_partitions_pre(s, divisions, ascending=True, na_position="last"): - if ascending: - partitions = divisions.searchsorted(s, side="right") - 1 - else: - partitions = ( - len(divisions) - divisions.searchsorted(s, side="right") - 1 - ) - partitions[(partitions < 0) | (partitions >= len(divisions) - 1)] = ( - 0 if ascending else (len(divisions) - 2) - ) - partitions[s._columns[0].isnull().values] = ( - len(divisions) - 2 if na_position == "last" else 0 - ) - return partitions - - -@_dask_cudf_performance_tracking -def _quantile(a, q): - n = len(a) - if not len(a): - return None, n - return ( - a.quantile(q=q.tolist(), interpolation="nearest", method="table"), - n, - ) - - -@_dask_cudf_performance_tracking -def merge_quantiles(finalq, qs, vals): - """Combine several quantile calculations of different data. - [NOTE: Same logic as dask.array merge_percentiles] - """ - if isinstance(finalq, Iterator): - finalq = list(finalq) - finalq = np.array(finalq) - qs = list(map(list, qs)) - vals = list(vals) - vals, Ns = zip(*vals) - Ns = list(Ns) - - L = list(zip(*[(q, val, N) for q, val, N in zip(qs, vals, Ns) if N])) - if not L: - raise ValueError("No non-trivial arrays found") - qs, vals, Ns = L - - if len(vals) != len(qs) or len(Ns) != len(qs): - raise ValueError("qs, vals, and Ns parameters must be the same length") - - # transform qs and Ns into number of observations between quantiles - counts = [] - for q, N in zip(qs, Ns): - count = np.empty(len(q)) - count[1:] = np.diff(q) - count[0] = q[0] - count *= N - counts.append(count) - - def _append_counts(val, count): - val["_counts"] = count - return val - - # Sort by calculated quantile values, then number of observations. - combined_vals_counts = cudf.core.reshape._merge_sorted( - [*map(_append_counts, vals, counts)] - ) - combined_counts = cupy.asnumpy(combined_vals_counts["_counts"].values) - combined_vals = combined_vals_counts.drop(columns=["_counts"]) - - # quantile-like, but scaled by total number of observations - combined_q = np.cumsum(combined_counts) - - # rescale finalq quantiles to match combined_q - desired_q = finalq * sum(Ns) - - # TODO: Support other interpolation methods - # For now - Always use "nearest" for interpolation - left = np.searchsorted(combined_q, desired_q, side="left") - right = np.searchsorted(combined_q, desired_q, side="right") - 1 - np.minimum(left, len(combined_vals) - 1, left) # don't exceed max index - lower = np.minimum(left, right) - upper = np.maximum(left, right) - lower_residual = np.abs(combined_q[lower] - desired_q) - upper_residual = np.abs(combined_q[upper] - desired_q) - mask = lower_residual > upper_residual - index = lower # alias; we no longer need lower - index[mask] = upper[mask] - rv = combined_vals.iloc[index] - return rv.reset_index(drop=True) - - -@_dask_cudf_performance_tracking -def _approximate_quantile(df, q): - """Approximate quantiles of DataFrame or Series. 
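# [editor's note] The count-weighting idea behind merge_quantiles above, in
# miniature (pure NumPy, nearest-style interpolation; an approximation for
# illustration only): each per-partition quantile estimate is weighted by
# the number of observations it summarizes, and the requested quantiles are
# then read back off the combined, rescaled CDF.
import numpy as np

def merge_two(q, vals_a, n_a, vals_b, n_b, finalq):
    counts = np.concatenate(
        [np.diff(q, prepend=0.0) * n_a, np.diff(q, prepend=0.0) * n_b]
    )
    vals = np.concatenate([vals_a, vals_b])
    order = np.argsort(vals, kind="stable")
    vals, cdf = vals[order], np.cumsum(counts[order])
    idx = np.searchsorted(cdf, np.asarray(finalq) * (n_a + n_b))
    return vals[np.minimum(idx, len(vals) - 1)]

# merge_two(np.array([0.25, 0.5, 0.75, 1.0]),
#           np.array([1.0, 2.0, 3.0, 4.0]), 4,
#           np.array([5.0, 6.0, 7.0, 8.0]), 4, [0.5])
# -> array([4.]), near the true median of the combined data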
- [NOTE: Same logic as dask.dataframe Series quantile] - """ - # current implementation needs q to be sorted so - # sort if array-like, otherwise leave it alone - q_ndarray = np.array(q) - if q_ndarray.ndim > 0: - q_ndarray.sort(kind="mergesort") - q = q_ndarray - - # Lets assume we are dealing with a DataFrame throughout - if isinstance(df, (Series, Index)): - df = df.to_frame() - assert isinstance(df, DataFrame) - final_type = df._meta._constructor - - # Create metadata - meta = df._meta_nonempty.quantile(q=q, method="table") - - # Define final action (create df with quantiles as index) - def finalize_tsk(tsk): - return (final_type, tsk) - - return_type = df.__class__ - - # pandas/cudf uses quantile in [0, 1] - # numpy / cupy uses [0, 100] - qs = np.asarray(q) - token = tokenize(df, qs) - - if len(qs) == 0: - name = "quantiles-" + token - empty_index = cudf.Index([], dtype=float) - return Series( - { - (name, 0): final_type( - {col: [] for col in df.columns}, - name=df.name, - index=empty_index, - ) - }, - name, - df._meta, - [None, None], - ) - else: - new_divisions = [np.min(q), np.max(q)] - - name = "quantiles-1-" + token - val_dsk = { - (name, i): (_quantile, key, qs) - for i, key in enumerate(df.__dask_keys__()) - } - - name2 = "quantiles-2-" + token - merge_dsk = { - (name2, 0): finalize_tsk( - (merge_quantiles, qs, [qs] * df.npartitions, sorted(val_dsk)) - ) - } - dsk = toolz.merge(val_dsk, merge_dsk) - graph = HighLevelGraph.from_collections(name2, dsk, dependencies=[df]) - df = return_type(graph, name2, meta, new_divisions) - - def set_quantile_index(df): - df.index = q - return df - - df = df.map_partitions(set_quantile_index, meta=meta) - return df - - -@_dask_cudf_performance_tracking -def quantile_divisions(df, by, npartitions): - qn = np.linspace(0.0, 1.0, npartitions + 1).tolist() - divisions = _approximate_quantile(df[by], qn).compute() - columns = divisions.columns - - # TODO: Make sure divisions are correct for all dtypes.. 
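# [editor's note] How the approximate quantiles computed above turn into
# output partitions: they become division boundaries, and each key is routed
# to the partition whose half-open bucket contains it (a NumPy schematic of
# _set_partitions_pre, defined earlier in this file).
import numpy as np

divisions = np.array([0, 10, 20, 30])          # boundaries of 3 partitions
keys = np.array([3, 10, 29, 15])
parts = divisions.searchsorted(keys, side="right") - 1
parts = np.clip(parts, 0, len(divisions) - 2)  # clamp out-of-range keys
# parts -> array([0, 1, 2, 1])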
- if ( - len(columns) == 1 - and df[columns[0]].dtype != "object" - and not _is_categorical_dtype(df[columns[0]].dtype) - ): - dtype = df[columns[0]].dtype - divisions = divisions[columns[0]].astype("int64") - divisions.iloc[-1] += 1 - divisions = sorted( - divisions.drop_duplicates().astype(dtype).to_arrow().tolist(), - key=lambda x: (x is None, x), - ) - else: - for col in columns: - dtype = df[col].dtype - if dtype != "object": - divisions[col] = divisions[col].astype("int64") - divisions[col].iloc[-1] += 1 - divisions[col] = divisions[col].astype(dtype) - else: - if last := divisions[col].iloc[-1]: - val = chr(ord(last[0]) + 1) - else: - val = "this string intentionally left empty" # any but "" - divisions[col].iloc[-1] = val - divisions = divisions.drop_duplicates().sort_index() - return divisions - - -@_deprecate_shuffle_kwarg -@_dask_cudf_performance_tracking -def sort_values( - df, - by, - max_branch=None, - divisions=None, - set_divisions=False, - ignore_index=False, - ascending=True, - na_position="last", - shuffle_method=None, - sort_function=None, - sort_function_kwargs=None, -): - """Sort by the given list/tuple of column names.""" - - if not isinstance(ascending, bool): - raise ValueError("ascending must be either True or False") - if na_position not in ("first", "last"): - raise ValueError("na_position must be either 'first' or 'last'") - - npartitions = df.npartitions - if isinstance(by, tuple): - by = list(by) - elif not isinstance(by, list): - by = [by] - - # parse custom sort function / kwargs if provided - sort_kwargs = { - "by": by, - "ascending": ascending, - "na_position": na_position, - } - if sort_function is None: - sort_function = M.sort_values - if sort_function_kwargs is not None: - sort_kwargs.update(sort_function_kwargs) - - # handle single partition case - if npartitions == 1: - return df.map_partitions(sort_function, **sort_kwargs) - - # Step 1 - Calculate new divisions (if necessary) - if divisions is None: - divisions = quantile_divisions(df, by, npartitions) - - # Step 2 - Perform repartitioning shuffle - meta = df._meta._constructor_sliced([0]) - if not isinstance(divisions, (cudf.Series, cudf.DataFrame)): - dtype = df[by[0]].dtype - divisions = df._meta._constructor_sliced(divisions, dtype=dtype) - - partitions = df[by].map_partitions( - _set_partitions_pre, - divisions=divisions, - ascending=ascending, - na_position=na_position, - meta=meta, - ) - - df2 = df.assign(_partitions=partitions) - df3 = rearrange_by_column( - df2, - "_partitions", - max_branch=max_branch, - npartitions=len(divisions) - 1, - shuffle_method=_get_shuffle_method(shuffle_method), - ignore_index=ignore_index, - ).drop(columns=["_partitions"]) - df3.divisions = (None,) * (df3.npartitions + 1) - - # Step 3 - Return final sorted df - df4 = df3.map_partitions(sort_function, **sort_kwargs) - if not isinstance(divisions, cudf.DataFrame) and set_divisions: - # Can't have multi-column divisions elsewhere in dask (yet) - df4.divisions = tuple(methods.tolist(divisions)) - - return df4 - - -def get_default_shuffle_method(): - # Note that `dask.utils.get_default_shuffle_method` - # will return "p2p" by default when a distributed - # client is present. 
Dask-cudf supports "p2p", but - # will not use it by default (yet) - default = config.get("dataframe.shuffle.method", "tasks") - if default not in _SHUFFLE_SUPPORT: - default = "tasks" - return default - - -def _get_shuffle_method(shuffle_method): - # Utility to set the shuffle_method-kwarg default - # and to validate user-specified options - shuffle_method = shuffle_method or get_default_shuffle_method() - if shuffle_method not in _SHUFFLE_SUPPORT: - raise ValueError( - "Dask-cudf only supports the following shuffle " - f"methods: {_SHUFFLE_SUPPORT}. Got shuffle_method={shuffle_method}" - ) - - return shuffle_method diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index fceaaf185e8..f33733d9583 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import warnings from collections.abc import Iterator @@ -11,14 +11,12 @@ from packaging.version import Version from pandas.api.types import is_scalar -import dask.dataframe as dd from dask import config from dask.array.dispatch import percentile_lookup from dask.dataframe.backends import ( DataFrameBackendEntrypoint, PandasBackendEntrypoint, ) -from dask.dataframe.core import get_parallel_type, meta_nonempty from dask.dataframe.dispatch import ( categorical_dtype_dispatch, concat_dispatch, @@ -28,6 +26,8 @@ hash_object_dispatch, is_categorical_dtype_dispatch, make_meta_dispatch, + meta_nonempty, + partd_encode_dispatch, pyarrow_schema_dispatch, to_pyarrow_table_dispatch, tolist_dispatch, @@ -46,13 +46,6 @@ from cudf.api.types import is_string_dtype from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from ._legacy.core import DataFrame, Index, Series - -get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) -get_parallel_type.register(cudf.Series, lambda _: Series) -get_parallel_type.register(cudf.BaseIndex, lambda _: Index) - - # Required for Arrow filesystem support in read_parquet PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0") @@ -318,7 +311,7 @@ def tolist_cudf(obj): @is_categorical_dtype_dispatch.register( - (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype, Series) + (cudf.Series, cudf.BaseIndex, cudf.CategoricalDtype) # , Series) ) @_dask_cudf_performance_tracking def is_categorical_dtype_cudf(obj): @@ -464,28 +457,21 @@ def sizeof_cudf_series_index(obj): return obj.memory_usage() -# TODO: Remove try/except when cudf is pinned to dask>=2023.10.0 -try: - from dask.dataframe.dispatch import partd_encode_dispatch - - @partd_encode_dispatch.register(cudf.DataFrame) - def _simple_cudf_encode(_): - # Basic pickle-based encoding for a partd k-v store - import pickle +@partd_encode_dispatch.register(cudf.DataFrame) +def _simple_cudf_encode(_): + # Basic pickle-based encoding for a partd k-v store + import pickle - import partd + import partd - def join(dfs): - if not dfs: - return cudf.DataFrame() - else: - return cudf.concat(dfs) - - dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) - return partial(partd.Encode, dumps, pickle.loads, join) + def join(dfs): + if not dfs: + return cudf.DataFrame() + else: + return cudf.concat(dfs) -except ImportError: - pass + dumps = partial(pickle.dumps, protocol=pickle.HIGHEST_PROTOCOL) + return partial(partd.Encode, dumps, pickle.loads, join) def _default_backend(func, *args, **kwargs): @@ -557,105 +543,22 @@ def to_cudf_dispatch_from_cudf(data, 
**kwargs): return data -# Define "cudf" backend engine to be registered with Dask -class CudfBackendEntrypoint(DataFrameBackendEntrypoint): - """Backend-entrypoint class for Dask-DataFrame +# Define the "cudf" backend for "legacy" Dask DataFrame +class LegacyCudfBackendEntrypoint(DataFrameBackendEntrypoint): + """Backend-entrypoint class for legacy Dask-DataFrame This class is registered under the name "cudf" for the - ``dask.dataframe.backends`` entrypoint in ``setup.cfg``. - Dask-DataFrame will use the methods defined in this class - in place of ``dask.dataframe.`` when the - "dataframe.backend" configuration is set to "cudf": - - Examples - -------- - >>> import dask - >>> import dask.dataframe as dd - >>> with dask.config.set({"dataframe.backend": "cudf"}): - ... ddf = dd.from_dict({"a": range(10)}) - >>> type(ddf) - + ``dask.dataframe.backends`` entrypoint in ``pyproject.toml``. + This "legacy" backend is only used for CSV support. """ - @classmethod - def to_backend_dispatch(cls): - return to_cudf_dispatch - - @classmethod - def to_backend(cls, data: dd.core._Frame, **kwargs): - if isinstance(data._meta, (cudf.DataFrame, cudf.Series, cudf.Index)): - # Already a cudf-backed collection - _unsupported_kwargs("cudf", "cudf", kwargs) - return data - return data.map_partitions(cls.to_backend_dispatch(), **kwargs) - - @staticmethod - def from_dict( - data, - npartitions, - orient="columns", - dtype=None, - columns=None, - constructor=cudf.DataFrame, - ): - return _default_backend( - dd.from_dict, - data, - npartitions=npartitions, - orient=orient, - dtype=dtype, - columns=columns, - constructor=constructor, - ) - - @staticmethod - def read_parquet(*args, engine=None, **kwargs): - from dask_cudf._legacy.io.parquet import CudfEngine - - _raise_unsupported_parquet_kwargs(**kwargs) - return _default_backend( - dd.read_parquet, - *args, - engine=CudfEngine, - **kwargs, - ) - - @staticmethod - def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json - - return read_json(*args, **kwargs) - @staticmethod - def read_orc(*args, **kwargs): - from dask_cudf._legacy.io import read_orc - - return read_orc(*args, **kwargs) - - @staticmethod - def read_csv(*args, **kwargs): - from dask_cudf._legacy.io import read_csv - - return read_csv(*args, **kwargs) - - @staticmethod - def read_hdf(*args, **kwargs): - # HDF5 reader not yet implemented in cudf - warnings.warn( - "read_hdf is not yet implemented in cudf/dask_cudf. " - "Moving to cudf from pandas. Expect poor performance!" - ) - return _default_backend(dd.read_hdf, *args, **kwargs).to_backend( - "cudf" - ) - - -# Define "cudf" backend entrypoint for dask-expr -class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint): +# Define the "cudf" backend for expr-based Dask DataFrame +class CudfBackendEntrypoint(DataFrameBackendEntrypoint): """Backend-entrypoint class for Dask-Expressions This class is registered under the name "cudf" for the - ``dask-expr.dataframe.backends`` entrypoint in ``setup.cfg``. + ``dask_expr.dataframe.backends`` entrypoint in ``pyproject.toml``. 
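# [editor's note] A schematic of how Dask discovers this class at runtime,
# assuming the standard importlib entry-point mechanism and the group name
# quoted in the docstring above:
from importlib.metadata import entry_points

for ep in entry_points(group="dask_expr.dataframe.backends"):
    if ep.name == "cudf":
        backend_cls = ep.load()  # -> the "cudf" backend entrypoint class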
Dask-DataFrame will use the methods defined in this class in place of ``dask_expr.`` when the "dataframe.backend" configuration is set to "cudf": @@ -746,12 +649,12 @@ def read_csv( @staticmethod def read_json(*args, **kwargs): - from dask_cudf._legacy.io.json import read_json as read_json_impl + from dask_cudf.io.json import read_json as read_json_impl return read_json_impl(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): - from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc + from dask_cudf.io.orc import read_orc as legacy_read_orc return legacy_read_orc(*args, **kwargs) diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 5fd217209ec..32461104ef9 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,56 +1,41 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import textwrap +import warnings +from importlib import import_module import dask.dataframe as dd -from dask.tokenize import tokenize import cudf from cudf.utils.performance_tracking import _dask_cudf_performance_tracking # This module provides backward compatibility for legacy import patterns. -if dd.DASK_EXPR_ENABLED: - from dask_cudf._expr.collection import ( - DataFrame, - Index, - Series, - ) -else: - from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 - +from dask_cudf._expr.collection import ( + DataFrame, # noqa: F401 + Index, # noqa: F401 + Series, # noqa: F401 +) concat = dd.concat @_dask_cudf_performance_tracking def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): - from dask_cudf import QUERY_PLANNING_ON - if isinstance(getattr(data, "index", None), cudf.MultiIndex): raise NotImplementedError( "dask_cudf does not support MultiIndex Dataframes." ) - # Dask-expr doesn't support the `name` argument - name = {} - if not QUERY_PLANNING_ON: - name = { - "name": name - or ("from_cudf-" + tokenize(data, npartitions or chunksize)) - } - return dd.from_pandas( data, npartitions=npartitions, chunksize=chunksize, sort=sort, - **name, ) -from_cudf.__doc__ = ( - textwrap.dedent( - """ +from_cudf.__doc__ = textwrap.dedent( + """ Create a :class:`.DataFrame` from a :class:`cudf.DataFrame`. This function is a thin wrapper around @@ -58,9 +43,23 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): arguments (described below) excepting that it operates on cuDF rather than pandas objects.\n """ - ) - # TODO: `dd.from_pandas.__doc__` is empty when - # `DASK_DATAFRAME__QUERY_PLANNING=True` - # since dask-expr does not provide a docstring for from_pandas. - + textwrap.dedent(dd.from_pandas.__doc__ or "") -) +) + textwrap.dedent(dd.from_pandas.__doc__) + + +def _deprecated_api(old_api, new_api=None, rec=None): + def inner_func(*args, **kwargs): + if new_api: + # Use alternative + msg = f"{old_api} is now deprecated. " + msg += rec or f"Please use {new_api} instead." + warnings.warn(msg, FutureWarning) + new_attr = new_api.split(".") + module = import_module(".".join(new_attr[:-1])) + return getattr(module, new_attr[-1])(*args, **kwargs) + + # No alternative - raise an error + raise NotImplementedError( + f"{old_api} is no longer supported. 
" + (rec or "") + ) + + return inner_func diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 9bca33e414a..a5175c9bbe7 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,6 +1,6 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. -from dask_cudf import QUERY_PLANNING_ON, _deprecated_api +from dask_cudf.core import _deprecated_api from . import csv, json, orc, parquet, text # noqa: F401 @@ -15,20 +15,13 @@ ) to_orc = _deprecated_api( "dask_cudf.io.to_orc", - new_api="dask_cudf._legacy.io.to_orc", + new_api="dask_cudf.io.orc.to_orc", rec="Please use the DataFrame.to_orc method instead.", ) read_text = _deprecated_api( "dask_cudf.io.read_text", new_api="dask_cudf.read_text" ) -if QUERY_PLANNING_ON: - read_parquet = parquet.read_parquet -else: - read_parquet = _deprecated_api( - "The legacy dask_cudf.io.read_parquet API", - new_api="dask_cudf.read_parquet", - rec="", - ) +read_parquet = parquet.read_parquet to_parquet = _deprecated_api( "dask_cudf.io.to_parquet", new_api="dask_cudf._legacy.io.parquet.to_parquet", diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index 29f98b14511..e36ee04d827 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2024-2025, NVIDIA CORPORATION. import os from glob import glob @@ -25,11 +25,11 @@ def read_csv(path, blocksize="default", **kwargs): >>> import dask_cudf >>> df = dask_cudf.read_csv("myfiles.*.csv") - In some cases it can break up large files: + It can break up large files if blocksize is specified: >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - It can read CSV files from external resources (e.g. S3, HTTP, FTP) + It can read CSV files from external resources (e.g. S3, HTTP, FTP): >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") @@ -44,15 +44,15 @@ def read_csv(path, blocksize="default", **kwargs): ---------- path : str, path object, or file-like object Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as + ``py._path.local.LocalPath``), URL (including HTTP, FTP, and S3 + locations), or any object with a ``read()`` method (such as builtin :py:func:`open` file handler function or :py:class:`~io.StringIO`). blocksize : int or str, default "256 MiB" The target task partition size. If ``None``, a single block is used for each file. **kwargs : dict - Passthrough key-word arguments that are sent to + Passthrough keyword arguments that are sent to :func:`cudf:cudf.read_csv`. Notes diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 8f85ea54c0a..3022ebb2a5b 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,8 +1,209 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
-from dask_cudf import _deprecated_api
+from functools import partial
-read_json = _deprecated_api(
-    "dask_cudf.io.json.read_json",
-    new_api="dask_cudf.read_json",
-)
+import numpy as np
+from fsspec.core import get_compression, get_fs_token_paths
+
+import dask
+from dask.utils import parse_bytes
+
+import cudf
+from cudf.core.column import as_column
+from cudf.utils.ioutils import _is_local_filesystem
+
+from dask_cudf.backends import _default_backend
+
+
+def _read_json_partition(
+    paths,
+    fs=None,
+    include_path_column=False,
+    path_converter=None,
+    **kwargs,
+):
+    # Transfer all data up front for remote storage
+    sources = (
+        paths
+        if fs is None
+        else fs.cat_ranges(
+            paths,
+            [0] * len(paths),
+            fs.sizes(paths),
+        )
+    )
+
+    if include_path_column:
+        # Add "path" column.
+        # Must iterate over sources sequentially
+        if not isinstance(include_path_column, str):
+            include_path_column = "path"
+        converted_paths = (
+            paths
+            if path_converter is None
+            else [path_converter(path) for path in paths]
+        )
+        dfs = []
+        for i, source in enumerate(sources):
+            df = cudf.read_json(source, **kwargs)
+            df[include_path_column] = as_column(
+                converted_paths[i], length=len(df)
+            )
+            dfs.append(df)
+        return cudf.concat(dfs)
+    else:
+        # Pass sources directly to cudf
+        return cudf.read_json(sources, **kwargs)
+
+
+def read_json(
+    url_path,
+    engine="auto",
+    blocksize=None,
+    orient="records",
+    lines=None,
+    compression="infer",
+    aggregate_files=True,
+    **kwargs,
+):
+    """Read JSON data into a :class:`.DataFrame`.
+
+    This function wraps :func:`dask.dataframe.read_json`, and passes
+    ``engine=partial(cudf.read_json, engine="auto")`` by default.
+
+    Parameters
+    ----------
+    url_path : str, list of str
+        Location to read from. If a string, can include a glob character to
+        find a set of file names.
+        Supports protocol specifications such as ``"s3://"``.
+    engine : str or Callable, default "auto"
+
+        If str, this value will be used as the ``engine`` argument
+        when :func:`cudf.read_json` is used to create each partition.
+        If a :obj:`~collections.abc.Callable`, this value will be used as the
+        underlying function used to create each partition from JSON
+        data. The default value is "auto", so that
+        ``engine=partial(cudf.read_json, engine="auto")`` will be
+        passed to :func:`dask.dataframe.read_json` by default.
+    aggregate_files : bool or int
+        Whether to map multiple files to each output partition. If True,
+        the `blocksize` argument will be used to determine the number of
+        files in each partition. If any one file is larger than `blocksize`,
+        the `aggregate_files` argument will be ignored. If an integer value
+        is specified, the `blocksize` argument will be ignored, and that
+        number of files will be mapped to each partition. Default is True.
+    **kwargs :
+        Keyword arguments to pass through to :func:`dask.dataframe.read_json`.
+
+    Returns
+    -------
+    :class:`.DataFrame`
+
+    Examples
+    --------
+    Load single file
+
+    >>> from dask_cudf import read_json
+    >>> read_json('myfile.json')  # doctest: +SKIP
+
+    Load large line-delimited JSON files using partitions of approx
+    256MB size
+
+    >>> read_json('data/file*.json', blocksize=2**28)  # doctest: +SKIP
+
+    Load nested JSON data
+
+    >>> read_json('myfile.json')  # doctest: +SKIP
+
+    See Also
+    --------
+    dask.dataframe.read_json
+
+    """
+
+    if lines is None:
+        lines = orient == "records"
+    if orient != "records" and lines:
+        raise ValueError(
+            'Line-delimited JSON is only available with orient="records".'
+        )
+    if blocksize and (orient != "records" or not lines):
+        raise ValueError(
+            "JSON file chunking only allowed for JSON-lines "
+            "input (orient='records', lines=True)."
+        )
+
+    inputs = []
+    if aggregate_files and blocksize or int(aggregate_files) > 1:
+        # Attempt custom read if we are mapping multiple files
+        # to each output partition. Otherwise, upstream logic
+        # is sufficient.
+
+        storage_options = kwargs.get("storage_options", {})
+        fs, _, paths = get_fs_token_paths(
+            url_path, mode="rb", storage_options=storage_options
+        )
+        if isinstance(aggregate_files, int) and aggregate_files > 1:
+            # Map a static file count to each partition
+            inputs = [
+                paths[offset : offset + aggregate_files]
+                for offset in range(0, len(paths), aggregate_files)
+            ]
+        elif aggregate_files is True and blocksize:
+            # Map files dynamically (using blocksize)
+            file_sizes = fs.sizes(paths)  # NOTE: This can be slow
+            blocksize = parse_bytes(blocksize)
+            if all([file_size <= blocksize for file_size in file_sizes]):
+                counts = np.unique(
+                    np.floor(np.cumsum(file_sizes) / blocksize),
+                    return_counts=True,
+                )[1]
+                offsets = np.concatenate([[0], counts.cumsum()])
+                inputs = [
+                    paths[offsets[i] : offsets[i + 1]]
+                    for i in range(len(offsets) - 1)
+                ]
+
+    if inputs:
+        # Inputs were successfully populated.
+        # Use custom _read_json_partition function
+        # to generate each partition.
+
+        compression = get_compression(
+            url_path[0] if isinstance(url_path, list) else url_path,
+            compression,
+        )
+        _kwargs = dict(
+            orient=orient,
+            lines=lines,
+            compression=compression,
+            include_path_column=kwargs.get("include_path_column", False),
+            path_converter=kwargs.get("path_converter"),
+        )
+        if not _is_local_filesystem(fs):
+            _kwargs["fs"] = fs
+        # TODO: Generate meta more efficiently
+        meta = _read_json_partition(inputs[0][:1], **_kwargs)
+        return dask.dataframe.from_map(
+            _read_json_partition,
+            inputs,
+            meta=meta,
+            **_kwargs,
+        )
+
+    # Fall back to dask.dataframe.read_json
+    return _default_backend(
+        dask.dataframe.read_json,
+        url_path,
+        engine=(
+            partial(cudf.read_json, engine=engine)
+            if isinstance(engine, str)
+            else engine
+        ),
+        blocksize=blocksize,
+        orient=orient,
+        lines=lines,
+        compression=compression,
+        **kwargs,
+    )
diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py
index 5219cdacc31..5de28751912 100644
--- a/python/dask_cudf/dask_cudf/io/orc.py
+++ b/python/dask_cudf/dask_cudf/io/orc.py
@@ -1,13 +1,195 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
-
-from dask_cudf import _deprecated_api
-
-read_orc = _deprecated_api(
-    "dask_cudf.io.orc.read_orc",
-    new_api="dask_cudf.read_orc",
-)
-to_orc = _deprecated_api(
-    "dask_cudf.io.orc.to_orc",
-    new_api="dask_cudf._legacy.io.orc.to_orc",
-    rec="Please use the DataFrame.to_orc method instead.",
-)
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.
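# [editor's note] The dynamic file-grouping rule in read_json above
# (aggregate_files=True with a blocksize), reduced to plain NumPy with
# hypothetical sizes: files are binned by which blocksize multiple their
# cumulative size reaches, and each bin becomes one output partition.
import numpy as np

file_sizes = [100, 100, 200, 50]   # bytes; all <= blocksize, as required
blocksize = 256
bins = np.floor(np.cumsum(file_sizes) / blocksize)   # [0., 0., 1., 1.]
counts = np.unique(bins, return_counts=True)[1]      # files per partition
offsets = np.concatenate([[0], counts.cumsum()])     # [0, 2, 4]
paths = ["f0.json", "f1.json", "f2.json", "f3.json"]
groups = [paths[offsets[i] : offsets[i + 1]] for i in range(len(offsets) - 1)]
# groups -> [['f0.json', 'f1.json'], ['f2.json', 'f3.json']]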
+
+from io import BufferedWriter, IOBase
+
+from fsspec.core import get_fs_token_paths
+from fsspec.utils import stringify_path
+from pyarrow import orc as orc
+
+from dask import dataframe as dd
+from dask.dataframe.io.utils import _get_pyarrow_dtypes
+
+import cudf
+
+
+def _read_orc_stripe(source, fs, columns=None, kwargs=None):
+    """Pull out specific columns from a specific stripe"""
+    path, stripe = source
+    if kwargs is None:
+        kwargs = {}
+    with fs.open(path, "rb") as f:
+        df_stripe = cudf.read_orc(
+            f, stripes=[stripe], columns=columns, **kwargs
+        )
+    return df_stripe
+
+
+def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs):
+    """Read ORC files into a :class:`.DataFrame`.
+
+    Note that this function is mostly borrowed from upstream Dask.
+
+    Parameters
+    ----------
+    path : str or list[str]
+        Location of file(s), which can be a full URL with protocol specifier,
+        and may include glob character if a single string.
+    columns : None or list[str]
+        Columns to load. If None, loads all.
+    filters : None or list of tuple or list of lists of tuples
+        If not None, specifies a filter predicate used to filter out
+        stripes using statistics stored for each stripe as ORC
+        metadata. Stripes that do not match the given filter predicate
+        are not read. The predicate is expressed in
+        `disjunctive normal form (DNF)
+        <https://en.wikipedia.org/wiki/Disjunctive_normal_form>`__
+        like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary
+        boolean logical combinations of single column predicates. The
+        innermost tuples each describe a single column predicate. The
+        list of inner predicates is interpreted as a conjunction
+        (AND), forming a more selective, multiple-column predicate.
+        Finally, the outermost list combines these filters as a
+        disjunction (OR). Predicates may also be passed as a list of
+        tuples. This form is interpreted as a single conjunction. To
+        express OR in predicates, one must use the (preferred)
+        notation of list of lists of tuples.
+    storage_options : None or dict
+        Further parameters to pass to the bytes backend.
+
+    Returns
+    -------
+    dask_cudf.DataFrame
+
+    See Also
+    --------
+    dask.dataframe.read_orc
+
+    """
+
+    storage_options = storage_options or {}
+    fs, _, paths = get_fs_token_paths(
+        path, mode="rb", storage_options=storage_options
+    )
+    schema = None
+    nstripes_per_file = []
+    for path in paths:
+        with fs.open(path, "rb") as f:
+            o = orc.ORCFile(f)
+            if schema is None:
+                schema = o.schema
+            elif schema != o.schema:
+                raise ValueError(
+                    "Incompatible schemas while parsing ORC files"
+                )
+            nstripes_per_file.append(o.nstripes)
+    schema = _get_pyarrow_dtypes(schema, categories=None)
+    if columns is not None:
+        ex = set(columns) - set(schema)
+        if ex:
+            raise ValueError(
+                f"Requested columns ({ex}) not in schema ({set(schema)})"
+            )
+    else:
+        columns = list(schema)
+
+    with fs.open(paths[0], "rb") as f:
+        meta = cudf.read_orc(
+            f,
+            stripes=[0] if nstripes_per_file[0] else None,
+            columns=columns,
+            **kwargs,
+        )
+
+    sources = []
+    for path, n in zip(paths, nstripes_per_file):
+        for stripe in (
+            range(n)
+            if filters is None
+            else cudf.io.orc._filter_stripes(filters, path)
+        ):
+            sources.append((path, stripe))
+
+    return dd.from_map(
+        _read_orc_stripe,
+        sources,
+        args=[fs],
+        columns=columns,
+        kwargs=kwargs,
+        meta=meta,
+    )
+
+
+def write_orc_partition(df, path, fs, filename, compression="snappy"):
+    full_path = fs.sep.join([path, filename])
+    with fs.open(full_path, mode="wb") as out_file:
+        if not isinstance(out_file, IOBase):
+            out_file = BufferedWriter(out_file)
+        cudf.io.to_orc(df, out_file, compression=compression)
+    return full_path
+
+
+def to_orc(
+    df,
+    path,
+    write_index=True,
+    storage_options=None,
+    compression="snappy",
+    compute=True,
+    **kwargs,
+):
+    """
+    Write a :class:`.DataFrame` to ORC file(s) (one file per partition).
+
+    Parameters
+    ----------
+    df : DataFrame
+    path : str or pathlib.Path
+        Destination directory for data. Prepend with protocol like ``s3://``
+        or ``hdfs://`` for remote data.
+    write_index : boolean, optional
+        Whether or not to write the index. Defaults to True.
+    storage_options : None or dict
+        Further parameters to pass to the bytes backend.
+    compression : string or dict, optional
+        Compression algorithm to apply to each output file.
+        Defaults to ``"snappy"``.
+    compute : bool, optional
+        If True (default) then the result is computed immediately. If
+        False then a :class:`~dask.delayed.Delayed` object is returned
+        for future computation.
+
+    """
+
+    from dask import compute as dask_compute, delayed
+
+    # TODO: Use upstream dask implementation once available
+    # (see: Dask Issue #5596)
+
+    if hasattr(path, "name"):
+        path = stringify_path(path)
+    fs, _, _ = get_fs_token_paths(
+        path, mode="wb", storage_options=storage_options
+    )
+    # Trim any protocol information from the path before forwarding
+    path = fs._strip_protocol(path)

+    if write_index:
+        df = df.reset_index()
+    else:
+        # Not writing index - might as well drop it
+        df = df.reset_index(drop=True)
+
+    fs.mkdirs(path, exist_ok=True)
+
+    # Use df.npartitions to define the file-name list
+    filenames = ["part.%i.orc" % i for i in range(df.npartitions)]
+
+    # write parts
+    dwrite = delayed(write_orc_partition)
+    parts = [
+        dwrite(d, path, fs, filename, compression=compression)
+        for d, filename in zip(df.to_delayed(), filenames)
+    ]
+
+    if compute:
+        return dask_compute(*parts)
+
+    return delayed(list)(parts)
diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py
index ba6209c4820..a953dce787d 100644
--- a/python/dask_cudf/dask_cudf/io/parquet.py
+++ b/python/dask_cudf/dask_cudf/io/parquet.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2024-2025, NVIDIA CORPORATION.

 from __future__ import annotations

@@ -37,10 +37,9 @@ def TaskList(*x):

 import cudf

-from dask_cudf import QUERY_PLANNING_ON, _deprecated_api
-
 # Dask-expr imports CudfEngine from this module
 from dask_cudf._legacy.io.parquet import CudfEngine
+from dask_cudf.core import _deprecated_api

 if TYPE_CHECKING:
     from collections.abc import MutableMapping

@@ -832,15 +831,8 @@ def read_parquet_expr(
     )


-if QUERY_PLANNING_ON:
-    read_parquet = read_parquet_expr
-    read_parquet.__doc__ = read_parquet_expr.__doc__
-else:
-    read_parquet = _deprecated_api(
-        "The legacy dask_cudf.io.parquet.read_parquet API",
-        new_api="dask_cudf.read_parquet",
-        rec="",
-    )
+read_parquet = read_parquet_expr
+read_parquet.__doc__ = read_parquet_expr.__doc__
 to_parquet = _deprecated_api(
     "dask_cudf.io.parquet.to_parquet",
     new_api="dask_cudf._legacy.io.parquet.to_parquet",
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py
index f5509cf91c3..48eca13e16f 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_json.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2024, NVIDIA CORPORATION.
+# Copyright (c) 2019-2025, NVIDIA CORPORATION.

 import math
 import os
@@ -11,10 +11,6 @@
 from dask.utils import tmpfile

 import dask_cudf
-from dask_cudf.tests.utils import skip_dask_expr
-
-# No dask-expr support for dask<2024.4.0
-pytestmark = skip_dask_expr(lt_version="2024.4.0")


 def test_read_json_backend_dispatch(tmp_path):
@@ -137,7 +133,3 @@ def test_deprecated_api_paths(tmp_path):
     with pytest.warns(match="dask_cudf.io.read_json is now deprecated"):
         df2 = dask_cudf.io.read_json(path)
         dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"):
-        df2 = dask_cudf.io.json.read_json(path)
-        dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
index b6064d851ca..4aac463420b 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2018-2024, NVIDIA CORPORATION.
+# Copyright (c) 2018-2025, NVIDIA CORPORATION.
import glob import os @@ -12,10 +12,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import skip_dask_expr - -# No dask-expr support for dask<2024.4.0 -pytestmark = skip_dask_expr(lt_version="2024.4.0") cur_dir = os.path.dirname(__file__) sample_orc = os.path.join(cur_dir, "data/orc/sample.orc") @@ -159,7 +155,3 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"): df2 = dask_cudf.io.read_orc(paths) dd.assert_eq(df, df2, check_divisions=False) - - with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"): - df2 = dask_cudf.io.orc.read_orc(paths) - dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index 6efe6c4f388..9f7031f4d2a 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import glob import math @@ -16,11 +16,6 @@ import dask_cudf from dask_cudf._legacy.io.parquet import create_metadata_file -from dask_cudf.tests.utils import ( - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) # Check if create_metadata_file is supported by # the current dask.dataframe version @@ -450,7 +445,6 @@ def test_create_metadata_file(tmpdir, partition_on): dd.assert_eq(ddf1, ddf2) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @need_create_meta def test_create_metadata_file_inconsistent_schema(tmpdir): # NOTE: This test demonstrates that the CudfEngine @@ -531,19 +525,6 @@ def test_cudf_list_struct_write(tmpdir): dd.assert_eq(df, new_ddf) -@skip_dask_expr("Not necessary in dask-expr") -def test_check_file_size(tmpdir): - # Test simple file-size check to help warn users - # of upstream change to `split_row_groups` default - fn = str(tmpdir.join("test.parquet")) - cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) - with pytest.warns(match="large parquet file"): - # Need to use `dask_cudf._legacy.io` path - # TODO: Remove outdated `check_file_size` functionality - dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute() - - -@xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") def test_null_partition(tmpdir): import pyarrow as pa from pyarrow.dataset import HivePartitioning @@ -626,7 +607,6 @@ def test_timezone_column(tmpdir): dd.assert_eq(got, expect) -@require_dask_expr() @pytest.mark.skipif( not dask_cudf.backends.PYARROW_GE_15, reason="Requires pyarrow 15", @@ -677,17 +657,8 @@ def test_deprecated_api_paths(tmpdir): with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"): dask_cudf.io.to_parquet(df, tmpdir) - if dask_cudf.QUERY_PLANNING_ON: - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) - else: - with pytest.warns(match="legacy dask_cudf.io.read_parquet"): - df2 = dask_cudf.io.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) - with pytest.warns(match="legacy dask_cudf.io.parquet.read_parquet"): - df2 = dask_cudf.io.parquet.read_parquet(tmpdir) - dd.assert_eq(df, df2, check_divisions=False) + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) diff --git 
a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
index 90907f6fb99..7c53b89a883 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
+# Copyright (c) 2020-2025, NVIDIA CORPORATION.

 import os
 import socket
@@ -14,7 +14,6 @@
 import cudf

 import dask_cudf
-from dask_cudf.tests.utils import QUERY_PLANNING_ON

 moto = pytest.importorskip("moto", minversion="3.1.6")
 boto3 = pytest.importorskip("boto3")
@@ -136,7 +135,7 @@ def test_read_parquet_open_file_options_raises():
     pytest.param(
         "arrow",
         marks=pytest.mark.skipif(
-            not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15,
+            not dask_cudf.backends.PYARROW_GE_15,
             reason="Not supported",
         ),
     ),
diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py
index e35b6411a9d..f4d59334e03 100644
--- a/python/dask_cudf/dask_cudf/io/tests/test_text.py
+++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.

 import os

@@ -9,10 +9,6 @@
 import cudf

 import dask_cudf
-from dask_cudf.tests.utils import skip_dask_expr
-
-# No dask-expr support for dask<2024.4.0
-pytestmark = skip_dask_expr(lt_version="2024.4.0")

 cur_dir = os.path.dirname(__file__)
 text_file = os.path.join(cur_dir, "data/text/sample.pgn")
@@ -42,7 +38,3 @@ def test_deprecated_api_paths():
     with pytest.warns(match="dask_cudf.io.read_text is now deprecated"):
         df2 = dask_cudf.io.read_text(text_file, delimiter=".")
         dd.assert_eq(df, df2, check_divisions=False)
-
-    with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"):
-        df2 = dask_cudf.io.text.read_text(text_file, delimiter=".")
-        dd.assert_eq(df, df2, check_divisions=False)
diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py
index 1caf4e81d8e..eb1d007cc16 100644
--- a/python/dask_cudf/dask_cudf/io/text.py
+++ b/python/dask_cudf/dask_cudf/io/text.py
@@ -1,8 +1,56 @@
-# Copyright (c) 2024, NVIDIA CORPORATION.
+# Copyright (c) 2022-2025, NVIDIA CORPORATION.

-from dask_cudf import _deprecated_api
+import os
+from glob import glob

-read_text = _deprecated_api(
-    "dask_cudf.io.text.read_text",
-    new_api="dask_cudf.read_text",
-)
+import dask.dataframe as dd
+from dask.utils import parse_bytes
+
+import cudf
+
+
+def _read_text(source, **kwargs):
+    # Wrapper for cudf.read_text operation
+    fn, byte_range = source
+    return cudf.read_text(fn, byte_range=byte_range, **kwargs)
+
+
+def read_text(path, chunksize="256 MiB", byte_range=None, **kwargs):
+    if isinstance(chunksize, str):
+        chunksize = parse_bytes(chunksize)
+
+    if isinstance(path, list):
+        filenames = path
+    elif isinstance(path, str):
+        filenames = sorted(glob(path))
+    elif hasattr(path, "__fspath__"):
+        filenames = sorted(glob(path.__fspath__()))
+    else:
+        raise TypeError(f"Path type not understood: {type(path)}")
+
+    if not filenames:
+        msg = f"No files found matching: {path}"
+ raise FileNotFoundError(msg) + + if chunksize and byte_range: + raise ValueError("Cannot specify both chunksize and byte_range.") + + if chunksize: + sources = [] + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, chunksize): + byte_range = ( + start, + chunksize, + ) # specify which chunk of the file we care about + sources.append((fn, byte_range)) + else: + sources = [(fn, byte_range) for fn in filenames] + + return dd.from_map( + _read_text, + sources, + meta=cudf.Series([], dtype="O"), + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/tests/test_accessor.py b/python/dask_cudf/dask_cudf/tests/test_accessor.py index 3fbb2aacd2c..c6b01a648eb 100644 --- a/python/dask_cudf/dask_cudf/tests/test_accessor.py +++ b/python/dask_cudf/dask_cudf/tests/test_accessor.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import numpy as np import pandas as pd @@ -13,7 +13,6 @@ from cudf.testing._utils import does_not_raise import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr ############################################################################# # Datetime Accessor # @@ -112,7 +111,6 @@ def test_categorical_accessor_initialization2(data): dsr.cat -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize("data", [data_cat_1()]) def test_categorical_basic(data): cat = data.copy() diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 7101fb7e00a..31957a106ff 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import random @@ -9,18 +9,12 @@ import dask from dask import dataframe as dd -from dask.dataframe.core import make_meta as dask_make_meta, meta_nonempty +from dask.dataframe.dispatch import make_meta as dask_make_meta, meta_nonempty from dask.utils import M import cudf import dask_cudf -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - skip_dask_expr, - xfail_dask_expr, -) rng = np.random.default_rng(seed=0) @@ -299,37 +293,6 @@ def test_set_index_sorted(): gddf1.set_index("val", sorted=True) -@pytest.mark.parametrize("nelem", [10, 200, 1333]) -@pytest.mark.parametrize("index", [None, "myindex"]) -def test_rearrange_by_divisions(nelem, index): - with dask.config.set(scheduler="single-threaded"): - rng = np.random.default_rng(seed=0) - df = pd.DataFrame( - { - "x": rng.integers(0, 20, size=nelem), - "y": rng.normal(size=nelem), - "z": rng.choice(["dog", "cat", "bird"], nelem), - } - ) - df["z"] = df["z"].astype("category") - - ddf1 = dd.from_pandas(df, npartitions=4) - gdf1 = dask_cudf.from_cudf( - cudf.DataFrame.from_pandas(df), npartitions=4 - ) - ddf1.index.name = index - gdf1.index.name = index - divisions = (0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20) - - expect = dd.shuffle.rearrange_by_divisions( - ddf1, "x", divisions=divisions, shuffle_method="tasks" - ) - result = dd.shuffle.rearrange_by_divisions( - gdf1, "x", divisions=divisions, shuffle_method="tasks" - ) - dd.assert_eq(expect, result) - - def test_assign(): rng = np.random.default_rng(seed=0) df = pd.DataFrame( @@ -393,44 +356,6 @@ def test_setitem_scalar_datetime(): np.testing.assert_array_equal(got["z"], df["z"]) -@skip_dask_expr("Not relevant for dask-expr") -@pytest.mark.parametrize( - "func", - [ - lambda: pd.DataFrame( - {"A": rng.random(10), "B": rng.random(10)}, - index=list("abcdefghij"), - ), - lambda: pd.DataFrame( - { - "A": rng.random(10), - "B": list("a" * 10), - "C": pd.Series( - [str(20090101 + i) for i in range(10)], - dtype="datetime64[ns]", - ), - }, - index=list("abcdefghij"), - ), - lambda: pd.Series(list("abcdefghijklmnop")), - lambda: pd.Series( - rng.random(10), - index=pd.Index( - [str(20090101 + i) for i in range(10)], dtype="datetime64[ns]" - ), - ), - ], -) -def test_repr(func): - pdf = func() - gdf = cudf.from_pandas(pdf) - gddf = dd.from_pandas(gdf, npartitions=3, sort=False) - - assert repr(gddf) - if hasattr(pdf, "_repr_html_"): - assert gddf._repr_html_() - - @pytest.mark.skip(reason="datetime indexes not fully supported in cudf") @pytest.mark.parametrize("start", ["1d", "5d", "1w", "12h"]) @pytest.mark.parametrize("stop", ["1d", "3d", "8h"]) @@ -657,20 +582,20 @@ def test_hash_object_dispatch(index): ) # DataFrame - result = dd.core.hash_object_dispatch(obj, index=index) + result = dd.dispatch.hash_object_dispatch(obj, index=index) expected = dask_cudf.backends.hash_object_cudf(obj, index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # Series - result = dd.core.hash_object_dispatch(obj["x"], index=index) + result = dd.dispatch.hash_object_dispatch(obj["x"], index=index) expected = dask_cudf.backends.hash_object_cudf(obj["x"], index=index) assert isinstance(result, cudf.Series) dd.assert_eq(result, expected) # DataFrame with MultiIndex obj_multi = obj.set_index(["x", "z"], drop=True) - result = dd.core.hash_object_dispatch(obj_multi, index=index) + result = dd.dispatch.hash_object_dispatch(obj_multi, index=index) expected = dask_cudf.backends.hash_object_cudf(obj_multi, index=index) assert isinstance(result, cudf.Series) 
dd.assert_eq(result, expected) @@ -784,7 +709,6 @@ def test_dataframe_set_index(): assert_eq(ddf.compute(), pddf.compute()) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_series_describe(): random.seed(0) sr = cudf.datasets.randomdata(20)["x"] @@ -800,7 +724,6 @@ def test_series_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_dataframe_describe(): random.seed(0) df = cudf.datasets.randomdata(20) @@ -814,7 +737,6 @@ def test_dataframe_describe(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_zero_std_describe(): num = 84886781 df = cudf.DataFrame( @@ -864,7 +786,7 @@ def test_merging_categorical_columns(): ddf_1 = dask_cudf.from_cudf(df_1, npartitions=2) - ddf_1 = dd.categorical.categorize(ddf_1, columns=["cat_col"]) + ddf_1 = ddf_1.categorize(columns=["cat_col"]) df_2 = cudf.DataFrame( {"id_2": [111, 112, 113], "cat_col": ["g", "h", "f"]} @@ -872,7 +794,7 @@ def test_merging_categorical_columns(): ddf_2 = dask_cudf.from_cudf(df_2, npartitions=2) - ddf_2 = dd.categorical.categorize(ddf_2, columns=["cat_col"]) + ddf_2 = ddf_2.categorize(columns=["cat_col"]) expected = cudf.DataFrame( { @@ -932,14 +854,9 @@ def func(x): result = ds.map_partitions(func, meta=s.values) - if QUERY_PLANNING_ON: - # Check Array and round-tripped DataFrame - dask.array.assert_eq(result, func(s)) - dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) - else: - # Legacy version still carries numpy metadata - # See: https://github.com/dask/dask/issues/11017 - dask.array.assert_eq(result.compute(), func(s)) + # Check Array and round-tripped DataFrame + dask.array.assert_eq(result, func(s)) + dd.assert_eq(result.to_dask_dataframe(), s, check_index=False) def test_implicit_array_conversion_cupy_sparse(): @@ -981,7 +898,6 @@ def test_series_isin_error(): ddf.isin([1, 5, "a"]).compute() -@require_dask_expr() def test_to_backend_simplify(): # Check that column projection is not blocked by to_backend with dask.config.set({"dataframe.backend": "pandas"}): diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 9bd3b506db0..11ca0c6a783 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd @@ -13,12 +13,7 @@ from cudf.testing._utils import expect_warning_if import dask_cudf -from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized -from dask_cudf.tests.utils import ( - QUERY_PLANNING_ON, - require_dask_expr, - xfail_dask_expr, -) +from dask_cudf._expr.groupby import OPTIMIZED_AGGS, _aggs_optimized def assert_cudf_groupby_layers(ddf): @@ -78,18 +73,12 @@ def test_groupby_basic(series, aggregation, pdf): expect = getattr(gdf_grouped, aggregation)() actual = getattr(ddf_grouped, aggregation)() - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) if not series: expect = gdf_grouped.agg({"x": aggregation}) actual = ddf_grouped.agg({"x": aggregation}) - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - dd.assert_eq(expect, actual, check_dtype=check_dtype) @@ -134,13 +123,6 @@ def test_groupby_agg(func, aggregation, pdf): check_dtype = aggregation != "count" - if not QUERY_PLANNING_ON: - assert_cudf_groupby_layers(actual) - - # groupby.agg should add an explicit getitem layer - # to improve/enable column projection - assert hlg_layer(actual.dask, "getitem") - dd.assert_eq(expect, actual, check_names=False, check_dtype=check_dtype) @@ -556,20 +538,13 @@ def test_groupby_categorical_key(): True, pytest.param( False, - marks=xfail_dask_expr("as_index not supported in dask-expr"), - ), - ], -) -@pytest.mark.parametrize( - "fused", - [ - True, - pytest.param( - False, - marks=require_dask_expr("Not supported by legacy API"), + marks=pytest.mark.xfail( + reason="as_index not supported in dask-expr" + ), ), ], ) +@pytest.mark.parametrize("fused", [True, False]) @pytest.mark.parametrize("split_out", ["use_dask_default", 1, 2]) @pytest.mark.parametrize("split_every", [False, 4]) @pytest.mark.parametrize("npartitions", [1, 10]) @@ -590,19 +565,16 @@ def test_groupby_agg_params( "c": ["mean", "std", "var"], } - fused_kwarg = {"fused": fused} if QUERY_PLANNING_ON else {} + fused_kwarg = {"fused": fused} split_kwargs = {"split_every": split_every, "split_out": split_out} if split_out == "use_dask_default": split_kwargs.pop("split_out") # Avoid using as_index when query-planning is enabled - if QUERY_PLANNING_ON: - with pytest.warns(FutureWarning, match="argument is now deprecated"): - # Should warn when `as_index` is used - ddf.groupby(["name", "a"], sort=False, as_index=as_index) - maybe_as_index = {"as_index": as_index} if as_index is False else {} - else: - maybe_as_index = {"as_index": as_index} + with pytest.warns(FutureWarning, match="argument is now deprecated"): + # Should warn when `as_index` is used + ddf.groupby(["name", "a"], sort=False, as_index=as_index) + maybe_as_index = {"as_index": as_index} if as_index is False else {} # Check `sort=True` behavior if split_out == 1: @@ -671,7 +643,6 @@ def test_groupby_agg_params( dd.assert_eq(gf, pf) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "aggregations", [(sum, "sum"), (max, "max"), (min, "min")] ) @@ -711,7 +682,6 @@ def test_is_supported(arg, supported): assert _aggs_optimized(arg, OPTIMIZED_AGGS) is supported -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") def test_groupby_unique_lists(): df = pd.DataFrame({"a": [0, 0, 0, 1, 1, 1], "b": [10, 10, 10, 7, 8, 9]}) gdf = cudf.from_pandas(df) @@ -758,7 +728,7 @@ def test_groupby_first_last(data, agg): ) -@xfail_dask_expr("Co-alignment check fails in dask-expr") 
+@pytest.mark.xfail(reason="Co-alignment check fails in dask-expr") def test_groupby_with_list_of_series(): df = cudf.DataFrame({"a": [1, 2, 3, 4, 5]}) gdf = dask_cudf.from_cudf(df, npartitions=2) @@ -773,7 +743,6 @@ def test_groupby_with_list_of_series(): ) -@xfail_dask_expr("Newer dask version needed", lt_version="2024.5.0") @pytest.mark.parametrize( "func", [ @@ -833,7 +802,7 @@ def test_groupby_all_columns(func): expect = func(ddf) actual = func(gddf) - dd.assert_eq(expect, actual, check_names=not QUERY_PLANNING_ON) + dd.assert_eq(expect, actual, check_names=False) def test_groupby_shuffle(): @@ -870,15 +839,3 @@ def test_groupby_shuffle(): # NOTE: `shuffle_method=True` should be default got = gddf.groupby("a", sort=False).agg(spec, split_out=2) dd.assert_eq(expect, got.compute().sort_index()) - - if not QUERY_PLANNING_ON: - # Sorted aggregation fails with split_out>1 when shuffle is False - # (sort=True, split_out=2, shuffle_method=False) - with pytest.raises(ValueError): - gddf.groupby("a", sort=True).agg( - spec, shuffle_method=False, split_out=2 - ) - - # Check shuffle kwarg deprecation - with pytest.warns(match="'shuffle' keyword is deprecated"): - gddf.groupby("a", sort=True).agg(spec, shuffle=False) diff --git a/python/dask_cudf/dask_cudf/tests/test_onehot.py b/python/dask_cudf/dask_cudf/tests/test_onehot.py index 0b7c7855e07..2d05345bc4a 100644 --- a/python/dask_cudf/dask_cudf/tests/test_onehot.py +++ b/python/dask_cudf/dask_cudf/tests/test_onehot.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import pandas as pd import pytest @@ -8,12 +8,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr - -# No dask-expr support -pytestmark = xfail_dask_expr( - "Newer dask version needed", lt_version="2024.5.0" -) def test_get_dummies_cat(): diff --git a/python/dask_cudf/dask_cudf/tests/test_sort.py b/python/dask_cudf/dask_cudf/tests/test_sort.py index 02c815427f3..68d6e72660e 100644 --- a/python/dask_cudf/dask_cudf/tests/test_sort.py +++ b/python/dask_cudf/dask_cudf/tests/test_sort.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. import cupy as cp import numpy as np @@ -10,7 +10,6 @@ import cudf import dask_cudf -from dask_cudf.tests.utils import xfail_dask_expr @pytest.mark.parametrize("ascending", [True, False]) @@ -67,7 +66,6 @@ def test_sort_repartition(): dd.assert_eq(len(new_ddf), len(ddf)) -@xfail_dask_expr("missing null support", lt_version="2024.5.1") @pytest.mark.parametrize("na_position", ["first", "last"]) @pytest.mark.parametrize("ascending", [True, False]) @pytest.mark.parametrize("by", ["a", "b", ["a", "b"]]) diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index b44b3f939e7..ef6765f39d1 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -1,22 +1,12 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. 
import numpy as np import pandas as pd -import pytest -from packaging.version import Version -import dask import dask.dataframe as dd import cudf -from dask_cudf import QUERY_PLANNING_ON - -if QUERY_PLANNING_ON: - DASK_VERSION = Version(dask.__version__) -else: - DASK_VERSION = None - def _make_random_frame(nelem, npartitions=2, include_na=False): rng = np.random.default_rng(seed=0) @@ -30,26 +20,3 @@ def _make_random_frame(nelem, npartitions=2, include_na=False): gdf = cudf.DataFrame.from_pandas(df) dgf = dd.from_pandas(gdf, npartitions=npartitions) return df, dgf - - -_default_reason = "Not compatible with dask-expr" - - -def skip_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - skip = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - skip = QUERY_PLANNING_ON - return pytest.mark.skipif(skip, reason=reason) - - -def xfail_dask_expr(reason=_default_reason, lt_version=None): - if lt_version is not None: - xfail = QUERY_PLANNING_ON and DASK_VERSION < Version(lt_version) - else: - xfail = QUERY_PLANNING_ON - return pytest.mark.xfail(xfail, reason=reason) - - -def require_dask_expr(reason="requires dask-expr"): - return pytest.mark.skipif(not QUERY_PLANNING_ON, reason=reason) diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index a8cb696d7f6..b88816a3d47 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -1,4 +1,4 @@ -# Copyright (c) 2021-2024, NVIDIA CORPORATION. +# Copyright (c) 2021-2025, NVIDIA CORPORATION. [build-system] build-backend = "rapids_build_backend.build" @@ -39,10 +39,10 @@ classifiers = [ ] [project.entry-points."dask.dataframe.backends"] -cudf = "dask_cudf.backends:CudfBackendEntrypoint" +cudf = "dask_cudf.backends:LegacyCudfBackendEntrypoint" [project.entry-points."dask_expr.dataframe.backends"] -cudf = "dask_cudf.backends:CudfDXBackendEntrypoint" +cudf = "dask_cudf.backends:CudfBackendEntrypoint" [project.optional-dependencies] test = [ @@ -102,8 +102,5 @@ filterwarnings = [ # https://github.com/dask/partd/blob/main/partd/pandas.py#L198 "ignore:Passing a BlockManager to DataFrame is deprecated and will raise in a future version. Use public APIs instead.:DeprecationWarning", "ignore:String support for `aggregate_files` is experimental. Behavior may change in the future.:FutureWarning:dask", - # Dask now loudly throws warnings: https://github.com/dask/dask/pull/11437 - # When the legacy implementation is removed we can remove this warning and stop running pytests with `DASK_DATAFRAME__QUERY_PLANNING=False` - "ignore:The legacy Dask DataFrame implementation is deprecated and will be removed in a future version.*:FutureWarning", ] xfail_strict = true From dc99d2f9bc602e40c7bae894b6759e30a8efdddd Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Wed, 8 Jan 2025 13:00:30 +0000 Subject: [PATCH 19/19] Introduce some simple benchmarks for rolling window aggregations (#17613) Previously we did not have any benchmarks for rolling aggregations. Introduce some, so we can measure the effects of any performance improvements we might make. 
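
For orientation, the central call being benchmarked is cudf::grouped_rolling_window,
where windows never cross group boundaries (unlike plain cudf::rolling_window). The
sketch below is illustrative only and is not part of this patch; it assumes a libcudf
build plus the cudf_test column-wrapper utilities:

    #include <cudf/aggregation.hpp>
    #include <cudf/rolling.hpp>
    #include <cudf/table/table_view.hpp>
    #include <cudf_test/column_wrapper.hpp>

    int main()
    {
      // Group keys are pre-sorted, as in the benchmark.
      cudf::test::fixed_width_column_wrapper<int32_t> keys{0, 0, 0, 1, 1};
      cudf::test::fixed_width_column_wrapper<int32_t> vals{1, 2, 3, 4, 5};
      auto agg = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
      // preceding=2 counts the current row plus one row before it;
      // following=0; min_periods=1.
      auto result = cudf::grouped_rolling_window(
        cudf::table_view{{keys}}, vals, 2, 0, 1, *agg);
      // Expected sums: {1, 3, 5, 4, 9}; the window restarts at each group.
      return 0;
    }
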
Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - MithunR (https://github.com/mythrocks) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17613 --- cpp/benchmarks/CMakeLists.txt | 7 +- .../rolling/grouped_rolling_sum.cpp | 70 +++++++++ cpp/benchmarks/rolling/rolling_sum.cpp | 134 ++++++++++++++++++ 3 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 cpp/benchmarks/rolling/grouped_rolling_sum.cpp create mode 100644 cpp/benchmarks/rolling/rolling_sum.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 749e1b628ee..0ff712c1c77 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -425,6 +425,11 @@ ConfigureNVBench(DECIMAL_NVBENCH decimal/convert_floating.cpp) # --------------------------------------------------------------------------------- ConfigureNVBench(RESHAPE_NVBENCH reshape/interleave.cpp) +# ################################################################################################## +# * rolling benchmark +# --------------------------------------------------------------------------------- +ConfigureNVBench(ROLLING_NVBENCH rolling/grouped_rolling_sum.cpp rolling/rolling_sum.cpp) + add_custom_target( run_benchmarks DEPENDS CUDF_BENCHMARKS diff --git a/cpp/benchmarks/rolling/grouped_rolling_sum.cpp b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp new file mode 100644 index 00000000000..04afe5ac661 --- /dev/null +++ b/cpp/benchmarks/rolling/grouped_rolling_sum.cpp @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2024-2025, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <nvbench/nvbench.cuh>
+
+template <typename Type>
+void bench_row_grouped_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const cardinality    = static_cast<cudf::size_type>(state.get_int64("cardinality"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));
+
+  auto const keys = [&] {
+    data_profile const profile =
+      data_profile_builder()
+        .cardinality(cardinality)
+        .no_validity()
+        .distribution(cudf::type_to_id<int32_t>(), distribution_id::UNIFORM, 0, num_rows);
+    auto keys = create_random_column(cudf::type_to_id<int32_t>(), row_count{num_rows}, profile);
+    return cudf::sort(cudf::table_view{{keys->view()}});
+  }();
+  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result = cudf::grouped_rolling_window(
+      keys->view(), vals->view(), preceding_size, following_size, min_periods, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_grouped_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_grouped_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 28})
+  .add_int64_axis("preceding_size", {1, 10})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1})
+  .add_int64_axis("cardinality", {10, 100, 1'000'000, 100'000'000});
diff --git a/cpp/benchmarks/rolling/rolling_sum.cpp b/cpp/benchmarks/rolling/rolling_sum.cpp
new file mode 100644
index 00000000000..af9ecd6a26f
--- /dev/null
+++ b/cpp/benchmarks/rolling/rolling_sum.cpp
@@ -0,0 +1,134 @@
+/*
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <benchmarks/common/generate_input.hpp>
+#include <benchmarks/fixture/benchmark_fixture.hpp>
+
+#include <cudf/aggregation.hpp>
+#include <cudf/column/column.hpp>
+#include <cudf/rolling.hpp>
+#include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
+
+#include <rmm/device_buffer.hpp>
+
+#include <thrust/iterator/counting_iterator.h>
+
+#include <nvbench/nvbench.cuh>
+
+#include <algorithm>
+#include <vector>
+
+template <typename Type>
+void bench_row_fixed_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+  auto const min_periods    = static_cast<cudf::size_type>(state.get_int64("min_periods"));
+
+  data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+    cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+  auto vals = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding_size, following_size, min_periods, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+template <typename Type>
+void bench_row_variable_rolling_sum(nvbench::state& state, nvbench::type_list<Type>)
+{
+  auto const num_rows       = static_cast<cudf::size_type>(state.get_int64("num_rows"));
+  auto const preceding_size = static_cast<cudf::size_type>(state.get_int64("preceding_size"));
+  auto const following_size = static_cast<cudf::size_type>(state.get_int64("following_size"));
+
+  auto vals = [&]() {
+    data_profile const profile = data_profile_builder().cardinality(0).no_validity().distribution(
+      cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
+    return create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
+  }();
+
+  auto preceding = [&]() {
+    auto data = std::vector<cudf::size_type>(num_rows);
+    auto it   = thrust::make_counting_iterator(0);
+    std::transform(it, it + num_rows, data.begin(), [num_rows, preceding_size](auto i) {
+      return std::min(i + 1, std::max(preceding_size, i + 1 - num_rows));
+    });
+    auto buf = rmm::device_buffer(
+      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    cudf::get_default_stream().synchronize();
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(buf),
+                                          rmm::device_buffer{},
+                                          0);
+  }();
+
+  auto following = [&]() {
+    auto data = std::vector<cudf::size_type>(num_rows);
+    auto it   = thrust::make_counting_iterator(0);
+    std::transform(it, it + num_rows, data.begin(), [num_rows, following_size](auto i) {
+      return std::max(-i - 1, std::min(following_size, num_rows - i - 1));
+    });
+    auto buf = rmm::device_buffer(
+      data.data(), num_rows * sizeof(cudf::size_type), cudf::get_default_stream());
+    cudf::get_default_stream().synchronize();
+    return std::make_unique<cudf::column>(cudf::data_type(cudf::type_to_id<cudf::size_type>()),
+                                          num_rows,
+                                          std::move(buf),
+                                          rmm::device_buffer{},
+                                          0);
+  }();
+
+  auto req = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
+
+  auto const mem_stats_logger = cudf::memory_stats_logger();
+  state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
+  state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+    auto const result =
+      cudf::rolling_window(vals->view(), preceding->view(), following->view(), 1, *req);
+  });
+  auto const elapsed_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value");
+  state.add_element_count(static_cast<double>(num_rows) / elapsed_time / 1'000'000., "Mrows/s");
+  state.add_buffer_size(
+    mem_stats_logger.peak_memory_usage(), "peak_memory_usage", "peak_memory_usage");
+}
+
+NVBENCH_BENCH_TYPES(bench_row_fixed_rolling_sum,
+                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_fixed_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_size", {1, 10, 100})
+  .add_int64_axis("following_size", {2})
+  .add_int64_axis("min_periods", {1, 20});
+
+NVBENCH_BENCH_TYPES(bench_row_variable_rolling_sum,
                    NVBENCH_TYPE_AXES(nvbench::type_list<std::int32_t, double>))
+  .set_name("row_variable_rolling_sum")
+  .add_int64_power_of_two_axis("num_rows", {14, 22, 28})
+  .add_int64_axis("preceding_size", {10, 100})
+  .add_int64_axis("following_size", {2});
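
For reference, the variable-window benchmark above exercises the overload of
cudf::rolling_window that takes the per-row preceding/following window sizes as
columns. A minimal sketch of that overload, again illustrative only (not part of
the patch) and assuming libcudf plus the cudf_test utilities:

    #include <cudf/aggregation.hpp>
    #include <cudf/rolling.hpp>
    #include <cudf/types.hpp>
    #include <cudf_test/column_wrapper.hpp>

    int main()
    {
      cudf::test::fixed_width_column_wrapper<int32_t> vals{1, 2, 3, 4, 5};
      // Row i aggregates preceding[i] rows (inclusive of row i) plus
      // following[i] rows after it.
      cudf::test::fixed_width_column_wrapper<cudf::size_type> preceding{1, 2, 2, 2, 2};
      cudf::test::fixed_width_column_wrapper<cudf::size_type> following{1, 1, 1, 1, 0};
      auto agg = cudf::make_sum_aggregation<cudf::rolling_aggregation>();
      auto result = cudf::rolling_window(vals, preceding, following, 1, *agg);
      // Expected sums: {3, 6, 9, 12, 9}
      return 0;
    }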