From 231015910bae375077e07c01d2bf70697182ccad Mon Sep 17 00:00:00 2001
From: Shruti Shivakumar
Date: Thu, 9 Jan 2025 16:53:25 -0500
Subject: [PATCH 1/5] Support multithreaded reading of compressed buffers in JSON reader (#17670)

Addresses #17638

This PR introduces multithreaded host-side decompression of compressed input
buffers passed to the JSON reader, and uses a stream pool to transfer the
uncompressed buffers to device.

Authors:
  - Shruti Shivakumar (https://github.com/shrshi)

Approvers:
  - Paul Mattione (https://github.com/pmattione-nvidia)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/17670
---
 cpp/src/io/json/read_json.cu | 71 +++++++++++++++++++++++++++++++-----
 1 file changed, 61 insertions(+), 10 deletions(-)

diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu
index 82d8152ca1c..113342e9cbf 100644
--- a/cpp/src/io/json/read_json.cu
+++ b/cpp/src/io/json/read_json.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2025, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -30,6 +30,7 @@
 #include
 #include
+#include <cudf/detail/utilities/stream_pool.hpp>
 #include
 #include

@@ -37,12 +38,25 @@
 #include
 #include

+#include <BS_thread_pool.hpp>
+#include <BS_thread_pool_utils.hpp>
+
 #include

 namespace cudf::io::json::detail {

 namespace {

+namespace pools {
+
+BS::thread_pool& tpool()
+{
+  static BS::thread_pool _tpool(std::thread::hardware_concurrency());
+  return _tpool;
+}
+
+}  // namespace pools
+
 class compressed_host_buffer_source final : public datasource {
  public:
  explicit compressed_host_buffer_source(std::unique_ptr<datasource> const& src,
@@ -51,8 +65,8 @@ class compressed_host_buffer_source final : public datasource {
  {
    auto ch_buffer = host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(_dbuf_ptr->data()),
                                              _dbuf_ptr->size());
-    if (comptype == compression_type::GZIP || comptype == compression_type::ZIP ||
-        comptype == compression_type::SNAPPY) {
+    if (_comptype == compression_type::GZIP || _comptype == compression_type::ZIP ||
+        _comptype == compression_type::SNAPPY) {
      _decompressed_ch_buffer_size = cudf::io::detail::get_uncompressed_size(_comptype, ch_buffer);
    } else {
      _decompressed_buffer = cudf::io::detail::decompress(_comptype, ch_buffer);
@@ -96,7 +110,22 @@ class compressed_host_buffer_source final : public datasource {
    return std::make_unique<non_owning_buffer>(_decompressed_buffer.data() + offset, count);
  }

-  [[nodiscard]] bool supports_device_read() const override { return false; }
+  std::future<size_t> device_read_async(size_t offset,
+                                        size_t size,
+                                        uint8_t* dst,
+                                        rmm::cuda_stream_view stream) override
+  {
+    auto& thread_pool = pools::tpool();
+    return thread_pool.submit_task([this, offset, size, dst, stream] {
+      auto hbuf = host_read(offset, size);
+      CUDF_CUDA_TRY(
+        cudaMemcpyAsync(dst, hbuf->data(), hbuf->size(), cudaMemcpyHostToDevice, stream.value()));
+      stream.synchronize();
+      return hbuf->size();
+    });
+  }
+
+  [[nodiscard]] bool supports_device_read() const override { return true; }

  [[nodiscard]] size_t size() const override { return _decompressed_ch_buffer_size; }

@@ -431,6 +460,8 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
 // line of file i+1 don't end up on the same JSON line, if file i does not already end with a line
 // delimiter.
  auto constexpr num_delimiter_chars = 1;
+  std::vector<std::future<size_t>> thread_tasks;
+  auto stream_pool = cudf::detail::fork_streams(stream, pools::tpool().get_thread_count());

  auto delimiter_map = cudf::detail::make_empty_host_vector<std::size_t>(sources.size(), stream);
  std::vector<std::size_t> prefsum_source_sizes(sources.size());
@@ -447,13 +478,17 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
  auto const total_bytes_to_read = std::min(range_size, prefsum_source_sizes.back() - range_offset);
  range_offset -= start_source ? prefsum_source_sizes[start_source - 1] : 0;

-  for (std::size_t i = start_source; i < sources.size() && bytes_read < total_bytes_to_read; i++) {
+  for (std::size_t i = start_source, cur_stream = 0;
+       i < sources.size() && bytes_read < total_bytes_to_read;
+       i++) {
    if (sources[i]->is_empty()) continue;
    auto data_size = std::min(sources[i]->size() - range_offset, total_bytes_to_read - bytes_read);
    auto destination = reinterpret_cast<uint8_t*>(buffer.data()) + bytes_read +
                       (num_delimiter_chars * delimiter_map.size());
-    if (sources[i]->is_device_read_preferred(data_size)) {
-      bytes_read += sources[i]->device_read(range_offset, data_size, destination, stream);
+    if (sources[i]->supports_device_read()) {
+      thread_tasks.emplace_back(sources[i]->device_read_async(
+        range_offset, data_size, destination, stream_pool[cur_stream++ % stream_pool.size()]));
+      bytes_read += data_size;
    } else {
      h_buffers.emplace_back(sources[i]->host_read(range_offset, data_size));
      auto const& h_buffer = h_buffers.back();
@@ -481,6 +516,15 @@ device_span<char> ingest_raw_input(device_span<char> buffer,
      buffer.data());
  }
  stream.synchronize();
+
+  if (thread_tasks.size()) {
+    auto const bytes_read = std::accumulate(
+      thread_tasks.begin(), thread_tasks.end(), std::size_t{0}, [](std::size_t sum, auto& task) {
+        return sum + task.get();
+      });
+    CUDF_EXPECTS(bytes_read == total_bytes_to_read, "something's fishy");
+  }
+
  return buffer.first(bytes_read + (delimiter_map.size() * num_delimiter_chars));
}

@@ -505,10 +549,17 @@ table_with_metadata read_json(host_span<std::unique_ptr<datasource>> sources,
    return read_json_impl(sources, reader_opts, stream, mr);

  std::vector<std::unique_ptr<datasource>> compressed_sources;
-  for (size_t i = 0; i < sources.size(); i++) {
-    compressed_sources.emplace_back(
-      std::make_unique<compressed_host_buffer_source>(sources[i], reader_opts.get_compression()));
+  std::vector<std::future<std::unique_ptr<compressed_host_buffer_source>>> thread_tasks;
+  auto& thread_pool = pools::tpool();
+  for (auto& src : sources) {
+    thread_tasks.emplace_back(thread_pool.submit_task([&reader_opts, &src] {
+      return std::make_unique<compressed_host_buffer_source>(src, reader_opts.get_compression());
+    }));
  }
+  std::transform(thread_tasks.begin(),
+                 thread_tasks.end(),
+                 std::back_inserter(compressed_sources),
+                 [](auto& task) { return task.get(); });
  // in read_json_impl, we need the compressed source size to actually be the
  // uncompressed source size for correct batching
  return read_json_impl(compressed_sources, reader_opts, stream, mr);

From a8a41975b0c1cfaedb7d4461ee027f6f9ff75b0e Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Thu, 9 Jan 2025 14:16:04 -0800
Subject: [PATCH 2/5] Remove cudf._libs.types.pyx (#17665)

Contributes to https://github.com/rapidsai/cudf/issues/17317

1. Moves some Python routines/objects to `cudf/utils/dtypes.py`
2.
Moves specific column only routines directly to `cudf/_libs/column.pyx` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17665 --- python/cudf/cudf/_lib/CMakeLists.txt | 4 +- python/cudf/cudf/_lib/column.pxd | 4 +- python/cudf/cudf/_lib/column.pyx | 100 ++++++++-- python/cudf/cudf/_lib/scalar.pyx | 49 ++--- python/cudf/cudf/_lib/types.pxd | 11 -- python/cudf/cudf/_lib/types.pyx | 172 ------------------ python/cudf/cudf/core/_base_index.py | 9 +- .../cudf/cudf/core/_internals/aggregation.py | 4 +- python/cudf/cudf/core/_internals/binaryop.py | 4 +- python/cudf/cudf/core/_internals/unary.py | 4 +- python/cudf/cudf/core/column/categorical.py | 10 +- python/cudf/cudf/core/column/column.py | 27 +-- python/cudf/cudf/core/column/lists.py | 4 +- python/cudf/cudf/core/column/string.py | 12 +- python/cudf/cudf/core/copy_types.py | 6 +- python/cudf/cudf/core/dtypes.py | 5 +- python/cudf/cudf/core/groupby/groupby.py | 21 +-- python/cudf/cudf/core/index.py | 6 +- python/cudf/cudf/core/indexed_frame.py | 3 +- python/cudf/cudf/core/join/join.py | 6 +- python/cudf/cudf/core/multiindex.py | 11 +- python/cudf/cudf/core/reshape.py | 9 +- python/cudf/cudf/io/csv.py | 8 +- python/cudf/cudf/io/json.py | 8 +- python/cudf/cudf/io/orc.py | 4 +- python/cudf/cudf/utils/dtypes.py | 66 ++++++- 26 files changed, 251 insertions(+), 316 deletions(-) delete mode 100644 python/cudf/cudf/_lib/types.pxd delete mode 100644 python/cudf/cudf/_lib/types.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index ff6fba1c3e8..ec44a6aa8c5 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2022-2025, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -12,7 +12,7 @@ # the License. # ============================================================================= -set(cython_sources column.pyx scalar.pyx strings_udf.pyx types.pyx) +set(cython_sources column.pyx scalar.pyx strings_udf.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/column.pxd b/python/cudf/cudf/_lib/column.pxd index 8b1d16f0d85..026c12895e8 100644 --- a/python/cudf/cudf/_lib/column.pxd +++ b/python/cudf/cudf/_lib/column.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from typing import Literal @@ -13,6 +13,8 @@ from pylibcudf.libcudf.column.column_view cimport ( from pylibcudf.libcudf.types cimport size_type from rmm.librmm.device_buffer cimport device_buffer +cdef dtype_from_lists_column_view(column_view cv) +cdef dtype_from_column_view(column_view cv) cdef class Column: cdef public: diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f7dcd89ea48..c59bbc0f40c 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from typing import Literal @@ -19,24 +19,21 @@ from cudf.core.buffer import ( as_buffer, cuda_array_interface_wrapper, ) -from cudf.utils.dtypes import _get_base_dtype +from cudf.utils.dtypes import ( + _get_base_dtype, + dtype_to_pylibcudf_type, + PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES, +) from cpython.buffer cimport PyObject_CheckBuffer -from libc.stdint cimport uintptr_t -from libcpp.memory cimport make_unique, unique_ptr +from libc.stdint cimport uintptr_t, int32_t +from libcpp.memory cimport make_shared, make_unique, shared_ptr, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector from rmm.pylibrmm.device_buffer cimport DeviceBuffer -from cudf._lib.types cimport ( - dtype_from_column_view, - dtype_to_pylibcudf_type, -) - -from cudf._lib.types import dtype_from_pylibcudf_column - -from pylibcudf cimport DataType as plc_DataType +from pylibcudf cimport DataType as plc_DataType, Column as plc_Column cimport pylibcudf.libcudf.copying as cpp_copying cimport pylibcudf.libcudf.types as libcudf_types cimport pylibcudf.libcudf.unary as libcudf_unary @@ -45,6 +42,7 @@ from pylibcudf.libcudf.column.column_factories cimport ( make_numeric_column ) from pylibcudf.libcudf.column.column_view cimport column_view +from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view from pylibcudf.libcudf.null_mask cimport null_count as cpp_null_count from pylibcudf.libcudf.scalar.scalar cimport scalar @@ -64,6 +62,80 @@ cdef get_element(column_view col_view, size_type index): ) +def dtype_from_pylibcudf_column(plc_Column col not None): + type_ = col.type() + tid = type_.id() + + if tid == pylibcudf.TypeId.LIST: + child = col.list_view().child() + return cudf.ListDtype(dtype_from_pylibcudf_column(child)) + elif tid == pylibcudf.TypeId.STRUCT: + fields = { + str(i): dtype_from_pylibcudf_column(col.child(i)) + for i in range(col.num_children()) + } + return cudf.StructDtype(fields) + elif tid == pylibcudf.TypeId.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL32: + return cudf.Decimal32Dtype( + precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + elif tid == pylibcudf.TypeId.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-type_.scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[tid] + + +cdef dtype_from_lists_column_view(column_view cv): + # lists_column_view have no default constructor, so we heap + # allocate it to get around Cython's limitation of requiring + # default constructors for stack allocated objects + cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) + cdef column_view child = lv.get()[0].child() + + if child.type().id() == libcudf_types.type_id.LIST: + return cudf.ListDtype(dtype_from_lists_column_view(child)) + else: + return cudf.ListDtype(dtype_from_column_view(child)) + + +cdef dtype_from_column_view(column_view cv): + cdef libcudf_types.type_id tid = cv.type().id() + if tid == libcudf_types.type_id.LIST: + return dtype_from_lists_column_view(cv) + elif tid == libcudf_types.type_id.STRUCT: + fields = { + str(i): dtype_from_column_view(cv.child(i)) + for i in range(cv.num_children()) + } + return cudf.StructDtype(fields) + elif tid == libcudf_types.type_id.DECIMAL64: + return cudf.Decimal64Dtype( + precision=cudf.Decimal64Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL32: + return cudf.Decimal32Dtype( + 
precision=cudf.Decimal32Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + elif tid == libcudf_types.type_id.DECIMAL128: + return cudf.Decimal128Dtype( + precision=cudf.Decimal128Dtype.MAX_PRECISION, + scale=-cv.type().scale() + ) + else: + return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[(tid)] + + cdef class Column: """ A Column stores columnar data in device memory. @@ -361,7 +433,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[mutable_column_view] children cdef void* data @@ -424,7 +496,7 @@ cdef class Column: col = self data_dtype = col.dtype - cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) + cdef plc_DataType dtype = dtype_to_pylibcudf_type(data_dtype) cdef libcudf_types.size_type offset = self.offset cdef vector[column_view] children cdef void* data diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index fd6d0257940..65607c91302 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. import copy @@ -14,17 +14,16 @@ import pylibcudf as plc import cudf from cudf.core.dtypes import ListDtype, StructDtype -from cudf._lib.types import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES -from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf.core.missing import NA, NaT +from cudf.utils.dtypes import PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES # We currently need this cimport because some of the implementations here # access the c_obj of the scalar, and because we need to be able to call # pylibcudf.Scalar.from_libcudf. Both of those are temporarily acceptable until # DeviceScalar is phased out entirely from cuDF Cython (at which point # cudf.Scalar will be directly backed by pylibcudf.Scalar). 
-from pylibcudf cimport Scalar as plc_Scalar, type_id as plc_TypeID -from pylibcudf.libcudf.scalar.scalar cimport list_scalar, scalar, struct_scalar +from pylibcudf cimport Scalar as plc_Scalar +from pylibcudf.libcudf.scalar.scalar cimport scalar def _replace_nested(obj, check, replacement): @@ -223,40 +222,22 @@ cdef class DeviceScalar: return s cdef void _set_dtype(self, dtype=None): - cdef plc_TypeID cdtype_id = self.c_value.type().id() + cdtype_id = self.c_value.type().id() if dtype is not None: self._dtype = dtype elif cdtype_id in { - plc_TypeID.DECIMAL32, - plc_TypeID.DECIMAL64, - plc_TypeID.DECIMAL128, + plc.TypeID.DECIMAL32, + plc.TypeID.DECIMAL64, + plc.TypeID.DECIMAL128, }: raise TypeError( "Must pass a dtype when constructing from a fixed-point scalar" ) - elif cdtype_id == plc_TypeID.STRUCT: - struct_table_view = (self.get_raw_ptr())[0].view() - self._dtype = StructDtype({ - str(i): dtype_from_column_view(struct_table_view.column(i)) - for i in range(struct_table_view.num_columns()) - }) - elif cdtype_id == plc_TypeID.LIST: - if ( - self.get_raw_ptr() - )[0].view().type().id() == plc_TypeID.LIST: - self._dtype = dtype_from_column_view( - (self.get_raw_ptr())[0].view() - ) - else: - self._dtype = ListDtype( - PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - ( - (self.get_raw_ptr())[0] - .view().type().id() - ) - ] - ) + elif cdtype_id == plc.TypeID.STRUCT: + self._dtype = StructDtype.from_arrow( + plc.interop.to_arrow(self.c_value).type + ) + elif cdtype_id == plc.TypeID.LIST: + self._dtype = ListDtype.from_arrow(plc.interop.to_arrow(self.c_value).type) else: - self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (cdtype_id) - ] + self._dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[cdtype_id] diff --git a/python/cudf/cudf/_lib/types.pxd b/python/cudf/cudf/_lib/types.pxd deleted file mode 100644 index 18b1d26e4db..00000000000 --- a/python/cudf/cudf/_lib/types.pxd +++ /dev/null @@ -1,11 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libc.stdint cimport int32_t - -from pylibcudf.libcudf.column.column_view cimport column_view - -ctypedef int32_t underlying_type_t_type_id - -cdef dtype_from_column_view(column_view cv) - -cpdef dtype_to_pylibcudf_type(dtype) diff --git a/python/cudf/cudf/_lib/types.pyx b/python/cudf/cudf/_lib/types.pyx deleted file mode 100644 index 777bd070b32..00000000000 --- a/python/cudf/cudf/_lib/types.pyx +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -import numpy as np -import pandas as pd - -from libcpp.memory cimport make_shared, shared_ptr - -cimport pylibcudf.libcudf.types as libcudf_types -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.lists.lists_column_view cimport lists_column_view - -import pylibcudf as plc - -import cudf - - -SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { - np.dtype("int8"): plc.types.TypeId.INT8, - np.dtype("int16"): plc.types.TypeId.INT16, - np.dtype("int32"): plc.types.TypeId.INT32, - np.dtype("int64"): plc.types.TypeId.INT64, - np.dtype("uint8"): plc.types.TypeId.UINT8, - np.dtype("uint16"): plc.types.TypeId.UINT16, - np.dtype("uint32"): plc.types.TypeId.UINT32, - np.dtype("uint64"): plc.types.TypeId.UINT64, - np.dtype("float32"): plc.types.TypeId.FLOAT32, - np.dtype("float64"): plc.types.TypeId.FLOAT64, - np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, - np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, - np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, - np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, - np.dtype("object"): plc.types.TypeId.STRING, - np.dtype("bool"): plc.types.TypeId.BOOL8, - np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, - np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, - np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, - np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, -} -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { - plc_type: np_type - for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() -} -# There's no equivalent to EMPTY in cudf. We translate EMPTY -# columns from libcudf to ``int8`` columns of all nulls in Python. -# ``int8`` is chosen because it uses the least amount of memory. 
-PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype("object") -PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") - - -size_type_dtype = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] - - -cdef dtype_from_lists_column_view(column_view cv): - # lists_column_view have no default constructor, so we heap - # allocate it to get around Cython's limitation of requiring - # default constructors for stack allocated objects - cdef shared_ptr[lists_column_view] lv = make_shared[lists_column_view](cv) - cdef column_view child = lv.get()[0].child() - - if child.type().id() == libcudf_types.type_id.LIST: - return cudf.ListDtype(dtype_from_lists_column_view(child)) - elif child.type().id() == libcudf_types.type_id.EMPTY: - return cudf.ListDtype("int8") - else: - return cudf.ListDtype( - dtype_from_column_view(child) - ) - -cdef dtype_from_structs_column_view(column_view cv): - fields = { - str(i): dtype_from_column_view(cv.child(i)) - for i in range(cv.num_children()) - } - return cudf.StructDtype(fields) - -cdef dtype_from_column_view(column_view cv): - cdef libcudf_types.type_id tid = cv.type().id() - if tid == libcudf_types.type_id.LIST: - return dtype_from_lists_column_view(cv) - elif tid == libcudf_types.type_id.STRUCT: - return dtype_from_structs_column_view(cv) - elif tid == libcudf_types.type_id.DECIMAL64: - return cudf.Decimal64Dtype( - precision=cudf.Decimal64Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL32: - return cudf.Decimal32Dtype( - precision=cudf.Decimal32Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - elif tid == libcudf_types.type_id.DECIMAL128: - return cudf.Decimal128Dtype( - precision=cudf.Decimal128Dtype.MAX_PRECISION, - scale=-cv.type().scale() - ) - else: - return PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[ - (tid) - ] - - -cpdef dtype_to_pylibcudf_type(dtype): - if isinstance(dtype, cudf.ListDtype): - return plc.DataType(plc.TypeId.LIST) - elif isinstance(dtype, cudf.StructDtype): - return plc.DataType(plc.TypeId.STRUCT) - elif isinstance(dtype, cudf.Decimal128Dtype): - tid = plc.TypeId.DECIMAL128 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal64Dtype): - tid = plc.TypeId.DECIMAL64 - return plc.DataType(tid, -dtype.scale) - elif isinstance(dtype, cudf.Decimal32Dtype): - tid = plc.TypeId.DECIMAL32 - return plc.DataType(tid, -dtype.scale) - # libcudf types don't support timezones so convert to the base type - elif isinstance(dtype, pd.DatetimeTZDtype): - dtype = np.dtype(f" ColumnBase: if self.null_count == len(self): # self.categories is empty; just return codes return self.codes - gather_map = self.codes.astype(libcudf.types.size_type_dtype).fillna(0) + gather_map = self.codes.astype(SIZE_TYPE_DTYPE).fillna(0) out = self.categories.take(gather_map) out = out.set_mask(self.mask) return out @@ -1192,10 +1192,10 @@ def _concat( codes = [o.codes for o in objs] newsize = sum(map(len, codes)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: codes_col = column.column_empty(0, head.codes.dtype) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index e23ca810065..30da8727366 100644 --- 
a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -25,7 +25,6 @@ import cudf from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import ( _is_non_decimal_numeric_dtype, _is_pandas_nullable_extension_dtype, @@ -60,9 +59,11 @@ from cudf.core.mixins import BinaryOperand, Reducible from cudf.errors import MixedTypeError from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, cudf_dtype_from_pa_type, cudf_dtype_to_pa_type, + dtype_to_pylibcudf_type, find_common_type, get_time_unit, is_column_like, @@ -874,7 +875,7 @@ def indices_of( value = as_column(value, dtype=self.dtype, length=1) mask = value.contains(self) return apply_boolean_mask( # type: ignore[return-value] - [as_column(range(0, len(self)), dtype=size_type_dtype)], mask + [as_column(range(0, len(self)), dtype=SIZE_TYPE_DTYPE)], mask )[0] def _find_first_and_last(self, value: ScalarLike) -> tuple[int, int]: @@ -954,7 +955,7 @@ def take( # TODO: For performance, the check and conversion of gather map should # be done by the caller. This check will be removed in future release. if indices.dtype.kind not in {"u", "i"}: - indices = indices.astype(libcudf.types.size_type_dtype) + indices = indices.astype(SIZE_TYPE_DTYPE) GatherMap(indices, len(self), nullify=not check_bounds or nullify) gathered = copying.gather([self], indices, nullify=nullify) # type: ignore[arg-type] return gathered[0]._with_type_metadata(self.dtype) # type: ignore[return-value] @@ -1743,9 +1744,7 @@ def column_empty( elif isinstance(dtype, ListDtype): data = None children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), + as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE), column_empty(row_count, dtype=dtype.element_type), ) elif isinstance(dtype, CategoricalDtype): @@ -1754,21 +1753,16 @@ def column_empty( cudf.core.column.NumericalColumn( data=as_buffer( rmm.DeviceBuffer( - size=row_count - * cudf.dtype(libcudf.types.size_type_dtype).itemsize + size=row_count * cudf.dtype(SIZE_TYPE_DTYPE).itemsize ) ), size=None, - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ) elif dtype.kind in "OU" and not isinstance(dtype, DecimalDtype): data = as_buffer(rmm.DeviceBuffer(size=0)) - children = ( - as_column( - 0, length=row_count + 1, dtype=libcudf.types.size_type_dtype - ), - ) + children = (as_column(0, length=row_count + 1, dtype=SIZE_TYPE_DTYPE),) else: data = as_buffer(rmm.DeviceBuffer(size=row_count * dtype.itemsize)) @@ -2552,10 +2546,9 @@ def concat_columns(objs: "MutableSequence[ColumnBase]") -> ColumnBase: ) newsize = sum(map(len, objs)) - if newsize > np.iinfo(libcudf.types.size_type_dtype).max: + if newsize > np.iinfo(SIZE_TYPE_DTYPE).max: raise MemoryError( - f"Result of concat cannot have " - f"size > {libcudf.types.size_type_dtype}_MAX" + f"Result of concat cannot have " f"size > {SIZE_TYPE_DTYPE}_MAX" ) elif newsize == 0: return column_empty(0, head.dtype) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 6fc2b5d4ca2..04b4003c510 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -14,7 +14,6 @@ import cudf import cudf.core.column.column as column -from cudf._lib.types import size_type_dtype from cudf.api.types import _is_non_decimal_numeric_dtype, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import 
ColumnBase, as_column @@ -22,6 +21,7 @@ from cudf.core.column.numerical import NumericalColumn from cudf.core.dtypes import ListDtype from cudf.core.missing import NA +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from collections.abc import Sequence @@ -258,7 +258,7 @@ def from_sequences( offset_col = cast( NumericalColumn, - column.as_column(offset_vals, dtype=size_type_dtype), + column.as_column(offset_vals, dtype=SIZE_TYPE_DTYPE), ) # Build ListColumn diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 20eded9a27f..2bee85cb387 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -19,16 +19,18 @@ import cudf.api.types import cudf.core.column.column as column import cudf.core.column.datetime as datetime -from cudf import _lib as libcudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type, size_type_dtype from cudf.api.types import is_integer, is_scalar, is_string_dtype from cudf.core._internals import binaryop from cudf.core.buffer import acquire_spill_lock from cudf.core.column.column import ColumnBase from cudf.core.column.methods import ColumnMethods from cudf.utils.docutils import copy_docstring -from cudf.utils.dtypes import can_convert_to_column +from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, + can_convert_to_column, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from collections.abc import Callable, Sequence @@ -5611,7 +5613,7 @@ def __init__( if len(children) == 0 and size != 0: # all nulls-column: offsets = column.as_column( - 0, length=size + 1, dtype=size_type_dtype + 0, length=size + 1, dtype=SIZE_TYPE_DTYPE ) children = (offsets,) @@ -5888,7 +5890,7 @@ def as_decimal_column( ) -> cudf.core.column.DecimalBaseColumn: plc_column = plc.strings.convert.convert_fixed_point.to_fixed_point( self.to_pylibcudf(mode="read"), - libcudf.types.dtype_to_pylibcudf_type(dtype), + dtype_to_pylibcudf_type(dtype), ) result = Column.from_pylibcudf(plc_column) result.dtype.precision = dtype.precision # type: ignore[union-attr] diff --git a/python/cudf/cudf/core/copy_types.py b/python/cudf/cudf/core/copy_types.py index 4b6ad59c8e1..aaaf6c7ee4f 100644 --- a/python/cudf/cudf/core/copy_types.py +++ b/python/cudf/cudf/core/copy_types.py @@ -1,11 +1,11 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. +# Copyright (c) 2023-2025, NVIDIA CORPORATION. from dataclasses import dataclass from typing import TYPE_CHECKING, Any, cast from typing_extensions import Self import cudf -from cudf._lib.types import size_type_dtype +from cudf.utils.dtypes import SIZE_TYPE_DTYPE if TYPE_CHECKING: from cudf.core.column import NumericalColumn @@ -63,7 +63,7 @@ def __init__(self, column: Any, nrows: int, *, nullify: bool): # Alternately we can have an Optional[Column] and handle None # specially in _gather. self.column = cast( - "NumericalColumn", self.column.astype(size_type_dtype) + "NumericalColumn", self.column.astype(SIZE_TYPE_DTYPE) ) else: if self.column.dtype.kind not in {"i", "u"}: diff --git a/python/cudf/cudf/core/dtypes.py b/python/cudf/cudf/core/dtypes.py index 8ed233ba737..ce7fb968069 100644 --- a/python/cudf/cudf/core/dtypes.py +++ b/python/cudf/cudf/core/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. 
from __future__ import annotations import decimal @@ -57,7 +57,8 @@ def dtype(arbitrary): if np_dtype.kind in set("OU"): return np.dtype("object") elif ( - np_dtype not in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES + np_dtype + not in cudf.utils.dtypes.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES ): raise TypeError(f"Unsupported type {np_dtype}") return np_dtype diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py index 17302311a7e..7bc4b08fc49 100644 --- a/python/cudf/cudf/core/groupby/groupby.py +++ b/python/cudf/cudf/core/groupby/groupby.py @@ -21,7 +21,6 @@ import cudf import cudf.core._internals from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( is_list_like, @@ -46,7 +45,7 @@ from cudf.core.mixins import Reducible, Scannable from cudf.core.multiindex import MultiIndex from cudf.core.udf.groupby_utils import _can_be_jitted, jit_groupby_apply -from cudf.utils.dtypes import cudf_dtype_to_pa_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, cudf_dtype_to_pa_type from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import GetAttrGetItemMixin @@ -588,7 +587,7 @@ def indices(self) -> dict[ScalarLike, cp.ndarray]: offsets, group_keys, (indices,) = self._groups( [ cudf.core.column.as_column( - range(len(self.obj)), dtype=size_type_dtype + range(len(self.obj)), dtype=SIZE_TYPE_DTYPE ) ] ) @@ -1185,7 +1184,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): # aggregation scheme in libcudf. This is probably "fast # enough" for most reasonable input sizes. _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) # "Out of bounds" n for the group size either means no entries # (negative) or all the entries (positive) @@ -1199,7 +1198,7 @@ def _head_tail(self, n, *, take_head: bool, preserve_order: bool): group_offsets = group_offsets[:-1] else: group_offsets = group_offsets[1:] - size_per_group - to_take = np.arange(size_per_group.sum(), dtype=size_type_dtype) + to_take = np.arange(size_per_group.sum(), dtype=SIZE_TYPE_DTYPE) fixup = np.empty_like(size_per_group) fixup[0] = 0 np.cumsum(size_per_group[:-1], out=fixup[1:]) @@ -1500,11 +1499,11 @@ def sample( # into a numpy array directly, rather than a list. # TODO: this uses the sort-based groupby, could one use hash-based? _, offsets, _, group_values = self._grouped() - group_offsets = np.asarray(offsets, dtype=size_type_dtype) + group_offsets = np.asarray(offsets, dtype=SIZE_TYPE_DTYPE) size_per_group = np.diff(group_offsets) if n is not None: samples_per_group = np.broadcast_to( - size_type_dtype.type(n), size_per_group.shape + SIZE_TYPE_DTYPE.type(n), size_per_group.shape ) if not replace and (minsize := size_per_group.min()) < n: raise ValueError( @@ -1517,7 +1516,7 @@ def sample( # which is round-to-nearest, ties to sgn(x) * inf). 
samples_per_group = np.round( size_per_group * frac, decimals=0 - ).astype(size_type_dtype) + ).astype(SIZE_TYPE_DTYPE) if replace: # We would prefer to use cupy here, but their rng.integers # interface doesn't take array-based low and high @@ -1525,7 +1524,7 @@ def sample( low = 0 high = np.repeat(size_per_group, samples_per_group) rng = np.random.default_rng(seed=random_state) - indices = rng.integers(low, high, dtype=size_type_dtype) + indices = rng.integers(low, high, dtype=SIZE_TYPE_DTYPE) indices += np.repeat(group_offsets[:-1], samples_per_group) else: # Approach: do a segmented argsort of the index array and take @@ -1533,7 +1532,7 @@ def sample( # We will shuffle the group indices and then pick them out # from the grouped dataframe index. nrows = len(group_values) - indices = cp.arange(nrows, dtype=size_type_dtype) + indices = cp.arange(nrows, dtype=SIZE_TYPE_DTYPE) if len(size_per_group) < 500: # Empirically shuffling with cupy is faster at this scale rs = cp.random.get_random_state() @@ -1557,7 +1556,7 @@ def sample( indices = ColumnBase.from_pylibcudf(plc_table.columns()[0]) indices = cp.asarray(indices.data_array_view(mode="read")) # Which indices are we going to want? - want = np.arange(samples_per_group.sum(), dtype=size_type_dtype) + want = np.arange(samples_per_group.sum(), dtype=SIZE_TYPE_DTYPE) scan = np.empty_like(samples_per_group) scan[0] = 0 np.cumsum(samples_per_group[:-1], out=scan[1:]) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index b535e8aabd2..0d1bf552982 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -19,7 +19,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import ( _is_non_decimal_numeric_dtype, @@ -53,6 +52,7 @@ from cudf.core.single_column_frame import SingleColumnFrame from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( + SIZE_TYPE_DTYPE, _maybe_convert_to_default_type, find_common_type, is_mixed_with_object_dtype, @@ -1002,7 +1002,7 @@ def _indices_of(self, value) -> cudf.core.column.NumericalColumn: i = [self._range.index(value)] except ValueError: i = [] - return as_column(i, dtype=size_type_dtype) + return as_column(i, dtype=SIZE_TYPE_DTYPE) def isin(self, values, level=None): if level is not None and level > 0: @@ -1348,7 +1348,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = as_column( -1, length=len(needle), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index eded681baf0..4c6f8a9c152 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -60,6 +60,7 @@ from cudf.utils import docutils, ioutils from cudf.utils._numba import _CUDFNumbaConfig from cudf.utils.docutils import copy_docstring +from cudf.utils.dtypes import SIZE_TYPE_DTYPE from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import _warn_no_dask_cudf @@ -3034,7 +3035,7 @@ def _slice(self, arg: slice, keep_index: bool = True) -> Self: NumericalColumn, as_column( range(start, stop, stride), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ), ), len(self), diff --git a/python/cudf/cudf/core/join/join.py b/python/cudf/cudf/core/join/join.py index 6e965ceca66..ce7edc8fdbe 100644 --- a/python/cudf/cudf/core/join/join.py +++ 
b/python/cudf/cudf/core/join/join.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations from typing import Any @@ -7,7 +7,6 @@ import cudf from cudf import _lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.core._internals import sorting from cudf.core.buffer import acquire_spill_lock from cudf.core.copy_types import GatherMap @@ -17,6 +16,7 @@ _IndexIndexer, _match_join_keys, ) +from cudf.utils.dtypes import SIZE_TYPE_DTYPE class Merge: @@ -243,7 +243,7 @@ def _gather_maps(self, left_cols, right_cols): # tables, we gather from iota on both right and left, and then # sort the gather maps with those two columns as key. key_order = [ - cudf.core.column.as_column(range(n), dtype=size_type_dtype).take( + cudf.core.column.as_column(range(n), dtype=SIZE_TYPE_DTYPE).take( map_, nullify=null, check_bounds=False ) for map_, n, null in zip(maps, lengths, nullify) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e7efd01ca85..64ec099cb39 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -17,7 +17,6 @@ import cudf import cudf._lib as libcudf -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_integer, is_list_like, is_object_dtype, is_scalar from cudf.core import column @@ -34,7 +33,7 @@ ensure_index, ) from cudf.core.join._join_helpers import _match_join_keys -from cudf.utils.dtypes import is_column_like +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, is_column_like from cudf.utils.performance_tracking import _performance_tracking from cudf.utils.utils import NotIterable, _external_only_api, _is_same_name @@ -199,7 +198,7 @@ def __init__( ) if lo == -1: # Now we can gather and insert null automatically - code[code == -1] = np.iinfo(size_type_dtype).min + code[code == -1] = np.iinfo(SIZE_TYPE_DTYPE).min result_col = level._column.take(code, nullify=True) source_data[i] = result_col._with_type_metadata(level.dtype) @@ -1578,11 +1577,11 @@ def droplevel(self, level=-1) -> Self | cudf.Index: def to_pandas( self, *, nullable: bool = False, arrow_type: bool = False ) -> pd.MultiIndex: - # cudf uses np.iinfo(size_type_dtype).min as missing code + # cudf uses np.iinfo(SIZE_TYPE_DTYPE).min as missing code # pandas uses -1 as missing code pd_codes = ( code.find_and_replace( - column.as_column(np.iinfo(size_type_dtype).min, length=1), + column.as_column(np.iinfo(SIZE_TYPE_DTYPE).min, length=1), column.as_column(-1, length=1), ) for code in self._codes @@ -1903,7 +1902,7 @@ def get_indexer(self, target, method=None, limit=None, tolerance=None): result = column.as_column( -1, length=len(target), - dtype=libcudf.types.size_type_dtype, + dtype=SIZE_TYPE_DTYPE, ) if not len(self): return _return_get_indexer_result(result.values) diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index 0abd42d4d4e..eedd777aafe 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. 
from __future__ import annotations import itertools @@ -12,13 +12,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import size_type_dtype from cudf.api.extensions import no_default from cudf.api.types import is_scalar from cudf.core._compat import PANDAS_LT_300 from cudf.core.column import ColumnBase, as_column, column_empty from cudf.core.column_accessor import ColumnAccessor -from cudf.utils.dtypes import min_unsigned_type +from cudf.utils.dtypes import SIZE_TYPE_DTYPE, min_unsigned_type if TYPE_CHECKING: from cudf._typing import Dtype @@ -1333,10 +1332,10 @@ def _one_hot_encode_column( else: column = column._get_decategorized_column() # type: ignore[attr-defined] - if column.size * categories.size >= np.iinfo(size_type_dtype).max: + if column.size * categories.size >= np.iinfo(SIZE_TYPE_DTYPE).max: raise ValueError( "Size limitation exceeded: column.size * category.size < " - f"np.iinfo({size_type_dtype}).max. Consider reducing " + f"np.iinfo({SIZE_TYPE_DTYPE}).max. Consider reducing " "size of category" ) result_labels = ( diff --git a/python/cudf/cudf/io/csv.py b/python/cudf/cudf/io/csv.py index 6d617cbf38e..7e8468c8e8a 100644 --- a/python/cudf/cudf/io/csv.py +++ b/python/cudf/cudf/io/csv.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2018-2025, NVIDIA CORPORATION. from __future__ import annotations import errno @@ -16,11 +16,13 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_hashable, is_scalar from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) from cudf.utils.performance_tracking import _performance_tracking _CSV_HEX_TYPE_MAP = { diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index ff326e09315..16c7d189dfd 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. from __future__ import annotations import os @@ -14,10 +14,12 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.core.buffer import acquire_spill_lock from cudf.utils import ioutils -from cudf.utils.dtypes import _maybe_convert_to_default_type +from cudf.utils.dtypes import ( + _maybe_convert_to_default_type, + dtype_to_pylibcudf_type, +) if TYPE_CHECKING: from cudf.core.column import ColumnBase diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index f3124552fd1..0ac2950a22b 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2019-2025, NVIDIA CORPORATION. 
from __future__ import annotations import itertools @@ -11,11 +11,11 @@ import cudf from cudf._lib.column import Column -from cudf._lib.types import dtype_to_pylibcudf_type from cudf.api.types import is_list_like from cudf.core.buffer import acquire_spill_lock from cudf.core.index import _index_from_data from cudf.utils import ioutils +from cudf.utils.dtypes import dtype_to_pylibcudf_type try: import ujson as json # type: ignore[import-untyped] diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 31a8f4de3b3..9e932acb5fa 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2025, NVIDIA CORPORATION. from __future__ import annotations import datetime @@ -11,6 +11,8 @@ import pyarrow as pa from pandas.core.dtypes.common import infer_dtype_from_object +import pylibcudf as plc + import cudf if TYPE_CHECKING: @@ -151,7 +153,7 @@ def cudf_dtype_from_pydata_dtype(dtype): return cudf.core.dtypes.Decimal64Dtype elif cudf.api.types.is_decimal128_dtype(dtype): return cudf.core.dtypes.Decimal128Dtype - elif dtype in cudf._lib.types.SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: + elif dtype in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES: return dtype.type return infer_dtype_from_object(dtype) @@ -604,6 +606,66 @@ def _get_base_dtype(dtype: pd.DatetimeTZDtype) -> np.dtype: return dtype.base +def dtype_to_pylibcudf_type(dtype) -> plc.DataType: + if isinstance(dtype, cudf.ListDtype): + return plc.DataType(plc.TypeId.LIST) + elif isinstance(dtype, cudf.StructDtype): + return plc.DataType(plc.TypeId.STRUCT) + elif isinstance(dtype, cudf.Decimal128Dtype): + tid = plc.TypeId.DECIMAL128 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal64Dtype): + tid = plc.TypeId.DECIMAL64 + return plc.DataType(tid, -dtype.scale) + elif isinstance(dtype, cudf.Decimal32Dtype): + tid = plc.TypeId.DECIMAL32 + return plc.DataType(tid, -dtype.scale) + # libcudf types don't support timezones so convert to the base type + elif isinstance(dtype, pd.DatetimeTZDtype): + dtype = _get_base_dtype(dtype) + else: + dtype = np.dtype(dtype) + return plc.DataType(SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[dtype]) + + +SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES = { + np.dtype("int8"): plc.types.TypeId.INT8, + np.dtype("int16"): plc.types.TypeId.INT16, + np.dtype("int32"): plc.types.TypeId.INT32, + np.dtype("int64"): plc.types.TypeId.INT64, + np.dtype("uint8"): plc.types.TypeId.UINT8, + np.dtype("uint16"): plc.types.TypeId.UINT16, + np.dtype("uint32"): plc.types.TypeId.UINT32, + np.dtype("uint64"): plc.types.TypeId.UINT64, + np.dtype("float32"): plc.types.TypeId.FLOAT32, + np.dtype("float64"): plc.types.TypeId.FLOAT64, + np.dtype("datetime64[s]"): plc.types.TypeId.TIMESTAMP_SECONDS, + np.dtype("datetime64[ms]"): plc.types.TypeId.TIMESTAMP_MILLISECONDS, + np.dtype("datetime64[us]"): plc.types.TypeId.TIMESTAMP_MICROSECONDS, + np.dtype("datetime64[ns]"): plc.types.TypeId.TIMESTAMP_NANOSECONDS, + np.dtype("object"): plc.types.TypeId.STRING, + np.dtype("bool"): plc.types.TypeId.BOOL8, + np.dtype("timedelta64[s]"): plc.types.TypeId.DURATION_SECONDS, + np.dtype("timedelta64[ms]"): plc.types.TypeId.DURATION_MILLISECONDS, + np.dtype("timedelta64[us]"): plc.types.TypeId.DURATION_MICROSECONDS, + np.dtype("timedelta64[ns]"): plc.types.TypeId.DURATION_NANOSECONDS, +} +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES = { + plc_type: np_type + for np_type, plc_type in SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES.items() +} +# There's no 
equivalent to EMPTY in cudf. We translate EMPTY +# columns from libcudf to ``int8`` columns of all nulls in Python. +# ``int8`` is chosen because it uses the least amount of memory. +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.EMPTY] = np.dtype("int8") +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.STRUCT] = np.dtype( + "object" +) +PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.TypeId.LIST] = np.dtype("object") + + +SIZE_TYPE_DTYPE = PYLIBCUDF_TO_SUPPORTED_NUMPY_TYPES[plc.types.SIZE_TYPE_ID] + # Type dispatch loops similar to what are found in `np.add.types` # In NumPy, whether or not an op can be performed between two # operands is determined by checking to see if NumPy has a c/c++ From 559cda24e4258da1aa35b7de60f46e8a86b1effa Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 9 Jan 2025 19:18:27 -0800 Subject: [PATCH 3/5] Use 64-bit offsets only if the current strings column output chunk size exceeds threshold (#17693) This PR improves on #17207 and only uses 64-bit offsets if the current output chunk of a strings column exceeds the large-strings threshold instead of using cumulative strings column sizes per `pass` or `row group` level. Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) Approvers: - Karthikeyan (https://github.com/karthikeyann) - David Wendt (https://github.com/davidwendt) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17693 --- cpp/src/io/parquet/reader_impl.cpp | 48 +++++++-------------- cpp/src/io/parquet/reader_impl_chunking.hpp | 5 +-- 2 files changed, 17 insertions(+), 36 deletions(-) diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp index c48ff896e33..f9fcca6bb4f 100644 --- a/cpp/src/io/parquet/reader_impl.cpp +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -97,38 +97,24 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num _stream); } - // Compute column string sizes (using page string offsets) for this subpass + // Compute column string sizes (using page string offsets) for this output table chunk col_string_sizes = calculate_page_string_offsets(); - // ensure cumulative column string sizes have been initialized - if (pass.cumulative_col_string_sizes.empty()) { - pass.cumulative_col_string_sizes.resize(_input_columns.size(), 0); - } - - // Add to the cumulative column string sizes of this pass - std::transform(pass.cumulative_col_string_sizes.begin(), - pass.cumulative_col_string_sizes.end(), - col_string_sizes.begin(), - pass.cumulative_col_string_sizes.begin(), - std::plus<>{}); - // Check for overflow in cumulative column string sizes of this pass so that the page string // offsets of overflowing (large) string columns are treated as 64-bit. 
auto const threshold = static_cast(strings::detail::get_offset64_threshold()); - auto const has_large_strings = std::any_of(pass.cumulative_col_string_sizes.cbegin(), - pass.cumulative_col_string_sizes.cend(), + auto const has_large_strings = std::any_of(col_string_sizes.cbegin(), + col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; }); if (has_large_strings and not strings::detail::is_large_strings_enabled()) { CUDF_FAIL("String column exceeds the column size limit", std::overflow_error); } - // Mark any chunks for which the cumulative column string size has exceeded the - // large strings threshold - if (has_large_strings) { - for (auto& chunk : pass.chunks) { - auto const idx = chunk.src_col_index; - if (pass.cumulative_col_string_sizes[idx] > threshold) { chunk.is_large_string_col = true; } - } + // Mark/unmark column-chunk descriptors depending on the string sizes of corresponding output + // column chunks and the large strings threshold. + for (auto& chunk : pass.chunks) { + auto const idx = chunk.src_col_index; + chunk.is_large_string_col = (col_string_sizes[idx] > threshold); } } @@ -210,11 +196,9 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num // only do string buffer for leaf if (idx == max_depth - 1 and out_buf.string_size() == 0 and col_string_sizes[pass.chunks[c].src_col_index] > 0) { - out_buf.create_string_data( - col_string_sizes[pass.chunks[c].src_col_index], - pass.cumulative_col_string_sizes[pass.chunks[c].src_col_index] > - static_cast(strings::detail::get_offset64_threshold()), - _stream); + out_buf.create_string_data(col_string_sizes[pass.chunks[c].src_col_index], + pass.chunks[c].is_large_string_col, + _stream); } if (has_strings) { str_data[idx] = out_buf.string_data(); } out_buf.user_data |= @@ -416,11 +400,11 @@ void reader::impl::decode_page_data(read_mode mode, size_t skip_rows, size_t num final_offsets.emplace_back(offset); out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; } else if (out_buf.type.id() == type_id::STRING) { - // need to cap off the string offsets column - auto const sz = static_cast(col_string_sizes[idx]); - if (sz <= strings::detail::get_offset64_threshold()) { + // only if it is not a large strings column + if (col_string_sizes[idx] <= + static_cast(strings::detail::get_offset64_threshold())) { out_buffers.emplace_back(static_cast(out_buf.data()) + out_buf.size); - final_offsets.emplace_back(sz); + final_offsets.emplace_back(static_cast(col_string_sizes[idx])); } } } diff --git a/cpp/src/io/parquet/reader_impl_chunking.hpp b/cpp/src/io/parquet/reader_impl_chunking.hpp index ca46f198bb8..4a773fbced1 100644 --- a/cpp/src/io/parquet/reader_impl_chunking.hpp +++ b/cpp/src/io/parquet/reader_impl_chunking.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. + * Copyright (c) 2023-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -130,9 +130,6 @@ struct pass_intermediate_data { rmm::device_buffer decomp_dict_data{0, cudf::get_default_stream()}; rmm::device_uvector str_dict_index{0, cudf::get_default_stream()}; - // cumulative strings column sizes. - std::vector cumulative_col_string_sizes{}; - int level_type_size{0}; // skip_rows / num_rows for this pass. 
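In short, PATCH 3/5 drops the pass-level running total and decides on 64-bit string offsets from the sizes of the current output chunk alone. The following is a minimal illustrative sketch of that per-chunk policy, not code from the patch: `chunk_desc`, `mark_large_string_chunks`, and the plain `std::overflow_error` are stand-ins for the real cuDF chunk descriptors and `CUDF_FAIL`.

// Illustrative sketch only: flag column chunks that need 64-bit string offsets
// based on the string sizes of the current output chunk.
#include <algorithm>
#include <cstddef>
#include <stdexcept>
#include <vector>

struct chunk_desc {
  int src_col_index;
  bool is_large_string_col;
};

void mark_large_string_chunks(std::vector<chunk_desc>& chunks,
                              std::vector<std::size_t> const& col_string_sizes,
                              std::size_t threshold,
                              bool large_strings_enabled)
{
  // Fail early if any column in this chunk exceeds the threshold while
  // large-strings support is disabled.
  bool const has_large_strings = std::any_of(
    col_string_sizes.cbegin(), col_string_sizes.cend(), [=](std::size_t sz) { return sz > threshold; });
  if (has_large_strings && !large_strings_enabled) {
    throw std::overflow_error("String column exceeds the column size limit");
  }
  // Mark or unmark each chunk from the current output chunk's sizes only,
  // instead of accumulating sizes across the whole pass.
  for (auto& chunk : chunks) {
    chunk.is_large_string_col = col_string_sizes[chunk.src_col_index] > threshold;
  }
}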
From fb2413e1505297e737095d97e0732eec52519802 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 10 Jan 2025 10:06:35 -0800 Subject: [PATCH 4/5] Make tests build without relaxed constexpr (#17691) Contributes to https://github.com/rapidsai/cudf/issues/7795 This PR updates tests to build without depending on the relaxed constexpr build option. Authors: - Yunsong Wang (https://github.com/PointKernel) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17691 --- .../cudf/detail/utilities/integer_utils.hpp | 12 +++-- cpp/include/cudf/utilities/span.hpp | 40 +++++++++------ cpp/src/io/utilities/parsing_utils.cuh | 49 ++++++++++--------- cpp/src/io/utilities/trie.cuh | 8 ++- .../transform/segmented_row_bit_count_test.cu | 4 +- cpp/tests/utilities/column_utilities.cu | 18 ++++--- 6 files changed, 75 insertions(+), 56 deletions(-) diff --git a/cpp/include/cudf/detail/utilities/integer_utils.hpp b/cpp/include/cudf/detail/utilities/integer_utils.hpp index 2e3d71815c0..44a86f1c84f 100644 --- a/cpp/include/cudf/detail/utilities/integer_utils.hpp +++ b/cpp/include/cudf/detail/utilities/integer_utils.hpp @@ -1,7 +1,7 @@ /* * Copyright 2019 BlazingDB, Inc. * Copyright 2019 Eyal Rozenberg - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -23,6 +23,8 @@ */ #include +#include +#include #include #include @@ -44,13 +46,17 @@ namespace util { * `modulus` is positive. The safety is in regard to rollover. */ template -constexpr S round_up_safe(S number_to_round, S modulus) +CUDF_HOST_DEVICE constexpr S round_up_safe(S number_to_round, S modulus) { auto remainder = number_to_round % modulus; if (remainder == 0) { return number_to_round; } auto rounded_up = number_to_round - remainder + modulus; if (rounded_up < number_to_round) { - throw std::invalid_argument("Attempt to round up beyond the type's maximum value"); +#ifndef __CUDA_ARCH__ + CUDF_FAIL("Attempt to round up beyond the type's maximum value", cudf::data_type_error); +#else + CUDF_UNREACHABLE("Attempt to round up beyond the type's maximum value"); +#endif } return rounded_up; } diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp index e7b76946248..b5044a58934 100644 --- a/cpp/include/cudf/utilities/span.hpp +++ b/cpp/include/cudf/utilities/span.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -197,11 +197,16 @@ struct host_span : public cudf::detail::span_basedata() + offset, count, _is_device_accessible}; } @@ -434,8 +439,8 @@ struct device_span : public cudf::detail::span_basedata() + offset, count}; } @@ -475,28 +480,28 @@ class base_2dspan { * * @return A pointer to the first element of the span */ - [[nodiscard]] constexpr auto data() const noexcept { return _flat.data(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto data() const noexcept { return _flat.data(); } /** * @brief Returns the size in the span as pair. 
* * @return pair representing rows and columns size of the span */ - [[nodiscard]] constexpr auto size() const noexcept { return _size; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto size() const noexcept { return _size; } /** * @brief Returns the number of elements in the span. * * @return Number of elements in the span */ - [[nodiscard]] constexpr auto count() const noexcept { return _flat.size(); } + [[nodiscard]] CUDF_HOST_DEVICE constexpr auto count() const noexcept { return _flat.size(); } /** * @brief Checks if the span is empty. * * @return True if the span is empty, false otherwise */ - [[nodiscard]] constexpr bool is_empty() const noexcept { return count() == 0; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr bool is_empty() const noexcept { return count() == 0; } /** * @brief Returns a reference to the row-th element of the sequence. @@ -507,7 +512,7 @@ class base_2dspan { * @param row the index of the element to access * @return A reference to the row-th element of the sequence, i.e., `data()[row]` */ - constexpr RowType operator[](size_t row) const + CUDF_HOST_DEVICE constexpr RowType operator[](size_t row) const { return _flat.subspan(row * _size.second, _size.second); } @@ -517,7 +522,10 @@ class base_2dspan { * * @return A flattened span of the 2D span */ - [[nodiscard]] constexpr RowType flat_view() const { return _flat; } + [[nodiscard]] CUDF_HOST_DEVICE constexpr RowType flat_view() const + { + return _flat; + } /** * @brief Construct a 2D span from another 2D span of convertible type diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh index 75e45a68842..9833dab282e 100644 --- a/cpp/src/io/utilities/parsing_utils.cuh +++ b/cpp/src/io/utilities/parsing_utils.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2020-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -171,7 +171,10 @@ constexpr uint8_t decode_digit(char c, bool* valid_flag) } // Converts character to lowercase. -constexpr char to_lower(char const c) { return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; } +CUDF_HOST_DEVICE constexpr char to_lower(char const c) +{ + return c >= 'A' && c <= 'Z' ? c + ('a' - 'A') : c; +} /** * @brief Checks if string is infinity, case insensitive with/without sign @@ -515,13 +518,13 @@ struct ConvertFunctor { template and !std::is_same_v and !cudf::is_fixed_point())> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex = false) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex = false) { auto const value = [as_hex, &opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values @@ -564,13 +567,13 @@ struct ConvertFunctor { * @brief Dispatch for boolean type types. 
*/ template )> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex) { auto const value = [&opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values @@ -593,13 +596,13 @@ struct ConvertFunctor { * is not valid. In such case, the validity mask is set to zero too. */ template )> - __host__ __device__ __forceinline__ bool operator()(char const* begin, - char const* end, - void* out_buffer, - size_t row, - data_type const output_type, - parse_options_view const& opts, - bool as_hex) + __device__ __forceinline__ bool operator()(char const* begin, + char const* end, + void* out_buffer, + size_t row, + data_type const output_type, + parse_options_view const& opts, + bool as_hex) { auto const value = [&opts, begin, end]() -> cuda::std::optional { // Check for user-specified true/false values diff --git a/cpp/src/io/utilities/trie.cuh b/cpp/src/io/utilities/trie.cuh index c0efc5b6f20..dbdc4a34277 100644 --- a/cpp/src/io/utilities/trie.cuh +++ b/cpp/src/io/utilities/trie.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. + * Copyright (c) 2018-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,16 +74,14 @@ CUDF_EXPORT trie create_serialized_trie(std::vector const& keys, /* * @brief Searches for a string in a serialized trie. * - * Can be executed on host or device, as long as the data is available - * * @param trie Pointer to the array of nodes that make up the trie * @param key Pointer to the start of the string to find * @param key_len Length of the string to find * * @return Boolean value; true if string is found, false otherwise */ -CUDF_HOST_DEVICE inline bool serialized_trie_contains(device_span trie, - device_span key) +__device__ inline bool serialized_trie_contains(device_span trie, + device_span key) { if (trie.empty()) { return false; } if (key.empty()) { return trie.front().is_leaf; } diff --git a/cpp/tests/transform/segmented_row_bit_count_test.cu b/cpp/tests/transform/segmented_row_bit_count_test.cu index 652b9053582..0e4f623f0a2 100644 --- a/cpp/tests/transform/segmented_row_bit_count_test.cu +++ b/cpp/tests/transform/segmented_row_bit_count_test.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2024, NVIDIA CORPORATION. + * Copyright (c) 2024-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -74,7 +74,7 @@ compute_segmented_row_bit_count(cudf::table_view const& input, cudf::size_type s // Since the number of rows may not divisible by segment_length, // the last segment may be shorter than the others. 
auto const size_begin = d_sizes + segment_idx * segment_length; - auto const size_end = std::min(size_begin + segment_length, d_sizes + num_rows); + auto const size_end = cuda::std::min(size_begin + segment_length, d_sizes + num_rows); return thrust::reduce(thrust::seq, size_begin, size_end); })); diff --git a/cpp/tests/utilities/column_utilities.cu b/cpp/tests/utilities/column_utilities.cu index fb9bdeb0b22..6888f26fd16 100644 --- a/cpp/tests/utilities/column_utilities.cu +++ b/cpp/tests/utilities/column_utilities.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2019-2025, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -37,6 +37,8 @@ #include #include +#include +#include #include #include #include @@ -412,14 +414,16 @@ class corresponding_rows_not_equivalent { T const y = rhs.element(rhs_index); // Must handle inf and nan separately - if (std::isinf(x) || std::isinf(y)) { + if (cuda::std::isinf(x) || cuda::std::isinf(y)) { return x != y; // comparison of (inf==inf) returns true - } else if (std::isnan(x) || std::isnan(y)) { - return std::isnan(x) != std::isnan(y); // comparison of (nan==nan) returns false + } else if (cuda::std::isnan(x) || cuda::std::isnan(y)) { + return cuda::std::isnan(x) != + cuda::std::isnan(y); // comparison of (nan==nan) returns false } else { - T const abs_x_minus_y = std::abs(x - y); - return abs_x_minus_y >= std::numeric_limits::min() && - abs_x_minus_y > std::numeric_limits::epsilon() * std::abs(x + y) * fp_ulps; + T const abs_x_minus_y = cuda::std::abs(x - y); + return abs_x_minus_y >= cuda::std::numeric_limits::min() && + abs_x_minus_y > + cuda::std::numeric_limits::epsilon() * cuda::std::abs(x + y) * fp_ulps; } } else { // if either is null, then the inequality was checked already From dc2a75cba40d38f4a6ba66e652764e96fa6b593d Mon Sep 17 00:00:00 2001 From: Tianyu Liu Date: Fri, 10 Jan 2025 13:22:39 -0500 Subject: [PATCH 5/5] Add special orc test data: timestamp interspersed with null values (#17713) Authors: - Tianyu Liu (https://github.com/kingcrimsontianyu) Approvers: - Bradley Dice (https://github.com/bdice) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/17713 --- ...e.timestamp.desynced.snappy.RLEv2.hasNull.orc | Bin 0 -> 5951 bytes ...stamp.desynced.uncompressed.RLEv2.hasNull.orc | Bin 0 -> 6565 bytes python/cudf/cudf/tests/test_orc.py | 6 ++++++ 3 files changed, 6 insertions(+) create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc create mode 100644 python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.uncompressed.RLEv2.hasNull.orc diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.timestamp.desynced.snappy.RLEv2.hasNull.orc new file mode 100644 index 0000000000000000000000000000000000000000..8772f84c3ba2a7942323f49ac28c3e5b172a91fc GIT binary patch literal 5951 zcmaKwdvH_dnZ_T?xR5lJv24X%8VbY$ylTk?@?*+;e|9od$AXd0?qoGRN57280S zG&aHsjtl{0T3Dr7sAwCiY&WdvCaltKsEEN4O+!UD5U~v)ifu$LL?qbAfQ=0HI_b{r z{ZBGzYY!Sz_4)7SG|3=|4MJiM<0FN zSjS?0h5h~`a{X7BdmWT4P*38k=9?dUW4rG+tVN&lqp#cRa1V?5YL@uTum4!Qy>{QtYjU2fOX;~l)?6%~6{T_DgDlxv67x}lc3 zAy@s-6WnDR^h1M&q1}x``^lk0+?Pj9L#LaDCYpz)%)6AIeq7qRrJUNLwrpu>+v4K= z^ofow8?0LfZCiHRx9sQNd&se6v~$bpt}PSYTc-H0z_qosXKOjLRlRy^i{L)jLtCHt 