From c791f8044d0d11f55042afd7a66698d8ce2e1973 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 6 Dec 2024 10:20:06 -0800 Subject: [PATCH] Remove cudf._lib.text in favor of inlining pylibcudf (#17408) Contributes to https://github.com/rapidsai/cudf/issues/17317 Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17408 --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/text.pyx | 53 ---------------------------- python/cudf/cudf/io/text.py | 45 +++++++++++++++++------ 4 files changed, 34 insertions(+), 66 deletions(-) delete mode 100644 python/cudf/cudf/_lib/text.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 2f05101e8e3..4e1bf860872 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -27,7 +27,6 @@ set(cython_sources stream_compaction.pyx string_casting.pyx strings_udf.pyx - text.pyx transform.pyx types.pyx utils.pyx diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index cb2d0501fea..c79d5100622 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -16,7 +16,6 @@ string_casting, strings, strings_udf, - text, ) MAX_COLUMN_SIZE = np.iinfo(np.int32).max diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx deleted file mode 100644 index 7942d067c2b..00000000000 --- a/python/cudf/cudf/_lib/text.pyx +++ /dev/null @@ -1,53 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from libcpp cimport bool - -from io import TextIOBase - -import pylibcudf as plc - -from cudf._lib.column cimport Column - - -def read_text(object filepaths_or_buffers, - str delimiter, - object byte_range, - bool strip_delimiters, - object compression, - object compression_offsets): - """ - Cython function to call into libcudf API, see `multibyte_split`. - - See Also - -------- - cudf.io.text.read_text - """ - if compression is None: - if isinstance(filepaths_or_buffers, TextIOBase): - datasource = plc.io.text.make_source(filepaths_or_buffers.read()) - else: - datasource = plc.io.text.make_source_from_file(filepaths_or_buffers) - elif compression == "bgzip": - if isinstance(filepaths_or_buffers, TextIOBase): - raise ValueError("bgzip compression requires a file path") - if compression_offsets is not None: - if len(compression_offsets) != 2: - raise ValueError( - "compression offsets need to consist of two elements") - datasource = plc.io.text.make_source_from_bgzip_file( - filepaths_or_buffers, - compression_offsets[0], - compression_offsets[1] - ) - else: - datasource = plc.io.text.make_source_from_bgzip_file( - filepaths_or_buffers, - ) - else: - raise ValueError("Only bgzip compression is supported at the moment") - - options = plc.io.text.ParseOptions( - byte_range=byte_range, strip_delimiters=strip_delimiters - ) - plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) - return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py index 5ce738cae0e..5e266c5ff55 100644 --- a/python/cudf/cudf/io/text.py +++ b/python/cudf/cudf/io/text.py @@ -1,9 +1,10 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. -from io import BytesIO, StringIO +from io import BytesIO, StringIO, TextIOBase + +import pylibcudf as plc import cudf -from cudf._lib import text as libtext from cudf.utils import ioutils from cudf.utils.performance_tracking import _performance_tracking @@ -33,13 +34,35 @@ def read_text( filepath_or_buffer, "read_text" ) - return cudf.Series._from_column( - libtext.read_text( - filepath_or_buffer, - delimiter=delimiter, - byte_range=byte_range, - strip_delimiters=strip_delimiters, - compression=compression, - compression_offsets=compression_offsets, - ) + if compression is None: + if isinstance(filepath_or_buffer, TextIOBase): + datasource = plc.io.text.make_source(filepath_or_buffer.read()) + else: + datasource = plc.io.text.make_source_from_file(filepath_or_buffer) + elif compression == "bgzip": + if isinstance(filepath_or_buffer, TextIOBase): + raise ValueError("bgzip compression requires a file path") + if compression_offsets is not None: + if len(compression_offsets) != 2: + raise ValueError( + "Compression offsets need to consist of two elements" + ) + datasource = plc.io.text.make_source_from_bgzip_file( + filepath_or_buffer, + compression_offsets[0], + compression_offsets[1], + ) + else: + datasource = plc.io.text.make_source_from_bgzip_file( + filepath_or_buffer, + ) + else: + raise ValueError("Only bgzip compression is supported at the moment") + + options = plc.io.text.ParseOptions( + byte_range=byte_range, strip_delimiters=strip_delimiters ) + plc_column = plc.io.text.multibyte_split(datasource, delimiter, options) + result = cudf._lib.column.Column.from_pylibcudf(plc_column) + + return cudf.Series._from_column(result)