Remove cudf._lib.text in favor of inlining pylibcudf (#17408)

Contributes to #17317

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Matthew Murray (https://github.com/Matt711)

URL: #17408
rapidsai · Dec 6, 2024 · c791f80 · c791f80
1 parent 38261f8
commit c791f80
Show file tree

Hide file tree

Showing 4 changed files with 34 additions and 66 deletions.
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -27,7 +27,6 @@ set(cython_sources
     stream_compaction.pyx
     string_casting.pyx
     strings_udf.pyx
-    text.pyx
     transform.pyx
     types.pyx
     utils.pyx

diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py
@@ -16,7 +16,6 @@
     string_casting,
     strings,
     strings_udf,
-    text,
 )
 
 MAX_COLUMN_SIZE = np.iinfo(np.int32).max

diff --git a/python/cudf/cudf/_lib/text.pyx b/python/cudf/cudf/_lib/text.pyx
diff --git a/python/cudf/cudf/io/text.py b/python/cudf/cudf/io/text.py
@@ -1,9 +1,10 @@
 # Copyright (c) 2018-2024, NVIDIA CORPORATION.
 
-from io import BytesIO, StringIO
+from io import BytesIO, StringIO, TextIOBase
+
+import pylibcudf as plc
 
 import cudf
-from cudf._lib import text as libtext
 from cudf.utils import ioutils
 from cudf.utils.performance_tracking import _performance_tracking
 
@@ -33,13 +34,35 @@ def read_text(
         filepath_or_buffer, "read_text"
     )
 
-    return cudf.Series._from_column(
-        libtext.read_text(
-            filepath_or_buffer,
-            delimiter=delimiter,
-            byte_range=byte_range,
-            strip_delimiters=strip_delimiters,
-            compression=compression,
-            compression_offsets=compression_offsets,
-        )
+    if compression is None:
+        if isinstance(filepath_or_buffer, TextIOBase):
+            datasource = plc.io.text.make_source(filepath_or_buffer.read())
+        else:
+            datasource = plc.io.text.make_source_from_file(filepath_or_buffer)
+    elif compression == "bgzip":
+        if isinstance(filepath_or_buffer, TextIOBase):
+            raise ValueError("bgzip compression requires a file path")
+        if compression_offsets is not None:
+            if len(compression_offsets) != 2:
+                raise ValueError(
+                    "Compression offsets need to consist of two elements"
+                )
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepath_or_buffer,
+                compression_offsets[0],
+                compression_offsets[1],
+            )
+        else:
+            datasource = plc.io.text.make_source_from_bgzip_file(
+                filepath_or_buffer,
+            )
+    else:
+        raise ValueError("Only bgzip compression is supported at the moment")
+
+    options = plc.io.text.ParseOptions(
+        byte_range=byte_range, strip_delimiters=strip_delimiters
     )
+    plc_column = plc.io.text.multibyte_split(datasource, delimiter, options)
+    result = cudf._lib.column.Column.from_pylibcudf(plc_column)
+
+    return cudf.Series._from_column(result)