From c791f8044d0d11f55042afd7a66698d8ce2e1973 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Fri, 6 Dec 2024 10:20:06 -0800
Subject: [PATCH] Remove cudf._lib.text in favor of inlining pylibcudf (#17408)

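Contributes to https://github.com/rapidsai/cudf/issues/17317

Deletes the Cython module `cudf/_lib/text.pyx` (along with its CMake and
`_lib/__init__.py` entries) and moves its `read_text` logic into
`cudf/io/text.py`, which now calls `pylibcudf.io.text` directly.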

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: https://github.com/rapidsai/cudf/pull/17408
---
 python/cudf/cudf/_lib/CMakeLists.txt |  1 -
 python/cudf/cudf/_lib/__init__.py    |  1 -
 python/cudf/cudf/_lib/text.pyx       | 53 ----------------------------
 python/cudf/cudf/io/text.py          | 45 +++++++++++++++++------
 4 files changed, 34 insertions(+), 66 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/text.pyx

diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 2f05101e8e3..4e1bf860872 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -27,7 +27,6 @@ set(cython_sources
     stream_compaction.pyx
     string_casting.pyx
     strings_udf.pyx
-    text.pyx
     transform.pyx
     types.pyx
     utils.pyx
diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
index cb2d0501fea..c79d5100622 100644
--- a/python/cudf/cudf/_lib/__init__.py
+++ b/python/cudf/cudf/_lib/__init__.py
@@ -16,7 +16,6 @@
     string_casting,
     strings,
     strings_udf,
-    text,
 )
 
 MAX_COLUMN_SIZE = np.iinfo(np.int32).max
diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
deleted file mode 100644
index 7942d067c2b..00000000000
--- a/python/cudf/cudf/_lib/text.pyx
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) 2020-2024, NVIDIA CORPORATION.
-
-from libcpp cimport bool
-
-from io import TextIOBase
-
-import pylibcudf as plc
-
-from cudf._lib.column cimport Column
-
-
-def read_text(object filepaths_or_buffers,
-              str delimiter,
-              object byte_range,
-              bool strip_delimiters,
-              object compression,
-              object compression_offsets):
-    """
-    Cython function to call into libcudf API, see `multibyte_split`.
-
-    See Also
-    --------
-    cudf.io.text.read_text
-    """
-    if compression is None:
-        if isinstance(filepaths_or_buffers, TextIOBase):
-            datasource = plc.io.text.make_source(filepaths_or_buffers.read())
-        else:
-            datasource = plc.io.text.make_source_from_file(filepaths_or_buffers)
-    elif compression == "bgzip":
-        if isinstance(filepaths_or_buffers, TextIOBase):
-            raise ValueError("bgzip compression requires a file path")
-        if compression_offsets is not None:
-            if len(compression_offsets) != 2:
-                raise ValueError(
-                    "compression offsets need to consist of two elements")
-            datasource = plc.io.text.make_source_from_bgzip_file(
-                filepaths_or_buffers,
-                compression_offsets[0],
-                compression_offsets[1]
-            )
-        else:
-            datasource = plc.io.text.make_source_from_bgzip_file(
-                filepaths_or_buffers,
-            )
-    else:
-        raise ValueError("Only bgzip compression is supported at the moment")
-
-    options = plc.io.text.ParseOptions(
-        byte_range=byte_range, strip_delimiters=strip_delimiters
-    )
-    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
-    return Column.from_pylibcudf(plc_column)
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
index 5ce738cae0e..5e266c5ff55 100644
--- a/python/cudf/cudf/io/text.py
+++ b/python/cudf/cudf/io/text.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
-from io import BytesIO, StringIO
+from io import BytesIO, StringIO, TextIOBase
+
+import pylibcudf as plc
 
 import cudf
-from cudf._lib import text as libtext
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
 
@@ -33,13 +34,35 @@ def read_text(
         filepath_or_buffer, "read_text"
     )
 
-    return cudf.Series._from_column(
-        libtext.read_text(
-            filepath_or_buffer,
-            delimiter=delimiter,
-            byte_range=byte_range,
-            strip_delimiters=strip_delimiters,
-            compression=compression,
-            compression_offsets=compression_offsets,
-        )
+    if compression is None:
+        if isinstance(filepath_or_buffer, TextIOBase):
+            datasource = plc.io.text.make_source(filepath_or_buffer.read())
+        else:
+            datasource = plc.io.text.make_source_from_file(filepath_or_buffer)
+    elif compression == "bgzip":
+        if isinstance(filepath_or_buffer, TextIOBase):
+            raise ValueError("bgzip compression requires a file path")
+        if compression_offsets is not None:
+            if len(compression_offsets) != 2:
+                raise ValueError(
+                    "Compression offsets need to consist of two elements"
+                )
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepath_or_buffer,
+                compression_offsets[0],
+                compression_offsets[1],
+            )
+        else:
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepath_or_buffer,
+            )
+    else:
+        raise ValueError("Only bgzip compression is supported at the moment")
+
+    options = plc.io.text.ParseOptions(
+        byte_range=byte_range, strip_delimiters=strip_delimiters
     )
+    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
+    result = cudf._lib.column.Column.from_pylibcudf(plc_column)
+
+    return cudf.Series._from_column(result)