From 2762e59881ee1a68c8cc49012a5f3af213898407 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:22:44 -0800 Subject: [PATCH 1/5] Remove cudf._lib.nvtext in favor of inlining pylibcudf --- python/cudf/cudf/_lib/CMakeLists.txt | 1 - python/cudf/cudf/_lib/__init__.py | 1 - python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 24 - python/cudf/cudf/_lib/nvtext/__init__.pxd | 0 python/cudf/cudf/_lib/nvtext/__init__.py | 0 .../cudf/_lib/nvtext/byte_pair_encode.pyx | 24 - .../cudf/cudf/_lib/nvtext/edit_distance.pyx | 24 - .../cudf/cudf/_lib/nvtext/generate_ngrams.pyx | 35 -- python/cudf/cudf/_lib/nvtext/jaccard.pyx | 17 - python/cudf/cudf/_lib/nvtext/minhash.pyx | 73 --- .../cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx | 24 - python/cudf/cudf/_lib/nvtext/normalize.pyx | 28 -- python/cudf/cudf/_lib/nvtext/replace.pyx | 52 --- python/cudf/cudf/_lib/nvtext/stemmer.pyx | 55 --- .../cudf/_lib/nvtext/subword_tokenize.pyx | 38 -- python/cudf/cudf/_lib/nvtext/tokenize.pyx | 86 ---- python/cudf/cudf/_lib/strings/__init__.py | 33 -- python/cudf/cudf/core/byte_pair_encoding.py | 13 +- python/cudf/cudf/core/column/string.py | 435 ++++++++++++++---- python/cudf/cudf/core/subword_tokenizer.py | 7 +- python/cudf/cudf/core/tokenize_vocabulary.py | 9 +- 21 files changed, 366 insertions(+), 613 deletions(-) delete mode 100644 python/cudf/cudf/_lib/nvtext/CMakeLists.txt delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/nvtext/__init__.py delete mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/edit_distance.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/jaccard.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/minhash.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/normalize.pyx 
delete mode 100644 python/cudf/cudf/_lib/nvtext/replace.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/stemmer.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx delete mode 100644 python/cudf/cudf/_lib/nvtext/tokenize.pyx diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index dd27aae7133..e623399a09c 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -50,5 +50,4 @@ include(../../../../cpp/cmake/thirdparty/get_nanoarrow.cmake) target_link_libraries(interop PUBLIC nanoarrow) add_subdirectory(io) -add_subdirectory(nvtext) add_subdirectory(strings) diff --git a/python/cudf/cudf/_lib/__init__.py b/python/cudf/cudf/_lib/__init__.py index cdf7cbe13c4..9653c7a06f1 100644 --- a/python/cudf/cudf/_lib/__init__.py +++ b/python/cudf/cudf/_lib/__init__.py @@ -7,7 +7,6 @@ groupby, interop, merge, - nvtext, orc, parquet, reduce, diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt deleted file mode 100644 index 22ec5d472f2..00000000000 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -# ============================================================================= -# Copyright (c) 2022-2024, NVIDIA CORPORATION. -# -# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except -# in compliance with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software distributed under the License -# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express -# or implied. See the License for the specific language governing permissions and limitations under -# the License. 
-# ============================================================================= - -set(cython_sources - byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx - ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx -) -set(linked_libraries cudf::cudf) -rapids_cython_create_modules( - CXX - SOURCE_FILES "${cython_sources}" - LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf -) diff --git a/python/cudf/cudf/_lib/nvtext/__init__.pxd b/python/cudf/cudf/_lib/nvtext/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/nvtext/__init__.py b/python/cudf/cudf/_lib/nvtext/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx deleted file mode 100644 index 2b2762eead2..00000000000 --- a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext -from pylibcudf.nvtext.byte_pair_encode import BPEMergePairs # no-cython-lint - - -@acquire_spill_lock() -def byte_pair_encoding( - Column strings, - object merge_pairs, - object separator -): - return Column.from_pylibcudf( - nvtext.byte_pair_encode.byte_pair_encoding( - strings.to_pylibcudf(mode="read"), - merge_pairs, - separator.device_value.c_value - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx b/python/cudf/cudf/_lib/nvtext/edit_distance.pyx deleted file mode 100644 index 3dd99c42d76..00000000000 --- a/python/cudf/cudf/_lib/nvtext/edit_distance.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf cimport nvtext - -from cudf._lib.column cimport Column - - -@acquire_spill_lock() -def edit_distance(Column strings, Column targets): - result = nvtext.edit_distance.edit_distance( - strings.to_pylibcudf(mode="read"), - targets.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def edit_distance_matrix(Column strings): - result = nvtext.edit_distance.edit_distance_matrix( - strings.to_pylibcudf(mode="read") - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx b/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx deleted file mode 100644 index 7fdf9258b7f..00000000000 --- a/python/cudf/cudf/_lib/nvtext/generate_ngrams.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def generate_ngrams(Column strings, int ngrams, object py_separator): - result = nvtext.generate_ngrams.generate_ngrams( - strings.to_pylibcudf(mode="read"), - ngrams, - py_separator.device_value.c_value - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def generate_character_ngrams(Column strings, int ngrams): - result = nvtext.generate_ngrams.generate_character_ngrams( - strings.to_pylibcudf(mode="read"), - ngrams - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def hash_character_ngrams(Column strings, int ngrams): - result = nvtext.generate_ngrams.hash_character_ngrams( - strings.to_pylibcudf(mode="read"), - ngrams - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/jaccard.pyx b/python/cudf/cudf/_lib/nvtext/jaccard.pyx deleted file mode 100644 index c964d0206b7..00000000000 --- a/python/cudf/cudf/_lib/nvtext/jaccard.pyx +++ /dev/null @@ -1,17 +0,0 @@ -# Copyright (c) 2023-2024, 
NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def jaccard_index(Column input1, Column input2, int width): - result = nvtext.jaccard.jaccard_index( - input1.to_pylibcudf(mode="read"), - input2.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx deleted file mode 100644 index 25cfcf99ca6..00000000000 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ /dev/null @@ -1,73 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. - -from libc.stdint cimport uint32_t, uint64_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def minhash(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width): - return Column.from_pylibcudf( - nvtext.minhash.minhash_permuted( - input.to_pylibcudf(mode="read"), - seed, - a.to_pylibcudf(mode="read"), - b.to_pylibcudf(mode="read"), - width, - ) - ) - - -@acquire_spill_lock() -def minhash64(Column input, Column seeds, int width=4): - result = nvtext.minhash.minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - width, - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width): - return Column.from_pylibcudf( - nvtext.minhash.minhash64_permuted( - input.to_pylibcudf(mode="read"), - seed, - a.to_pylibcudf(mode="read"), - b.to_pylibcudf(mode="read"), - width, - ) - ) - - -@acquire_spill_lock() -def word_minhash(Column input, Column 
seeds): - result = nvtext.minhash.word_minhash( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) - - -@acquire_spill_lock() -def word_minhash64(Column input, Column seeds): - result = nvtext.minhash.word_minhash64( - input.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return Column.from_pylibcudf(result) diff --git a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx deleted file mode 100644 index c125d92a24e..00000000000 --- a/python/cudf/cudf/_lib/nvtext/ngrams_tokenize.pyx +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def ngrams_tokenize( - Column input, - int ngrams, - object py_delimiter, - object py_separator -): - return Column.from_pylibcudf( - nvtext.ngrams_tokenize.ngrams_tokenize( - input.to_pylibcudf(mode="read"), - ngrams, - py_delimiter.device_value.c_value, - py_separator.device_value.c_value - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/normalize.pyx b/python/cudf/cudf/_lib/nvtext/normalize.pyx deleted file mode 100644 index cc45123dd0a..00000000000 --- a/python/cudf/cudf/_lib/nvtext/normalize.pyx +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def normalize_spaces(Column input): - return Column.from_pylibcudf( - nvtext.normalize.normalize_spaces( - input.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def normalize_characters(Column input, bool do_lower=True): - return Column.from_pylibcudf( - nvtext.normalize.normalize_characters( - input.to_pylibcudf(mode="read"), - do_lower, - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/replace.pyx b/python/cudf/cudf/_lib/nvtext/replace.pyx deleted file mode 100644 index bec56ade83c..00000000000 --- a/python/cudf/cudf/_lib/nvtext/replace.pyx +++ /dev/null @@ -1,52 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column -from pylibcudf import nvtext - - -@acquire_spill_lock() -def replace_tokens(Column strings, - Column targets, - Column replacements, - object py_delimiter): - """ - The `targets` tokens are searched for within each `strings` - in the Column and replaced with the corresponding `replacements` - if found. Tokens are identified by the `py_delimiter` character - provided. - """ - - return Column.from_pylibcudf( - nvtext.replace.replace_tokens( - strings.to_pylibcudf(mode="read"), - targets.to_pylibcudf(mode="read"), - replacements.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value, - ) - ) - - -@acquire_spill_lock() -def filter_tokens(Column strings, - size_type min_token_length, - object py_replacement, - object py_delimiter): - """ - Tokens smaller than `min_token_length` are removed from `strings` - in the Column and optionally replaced with the corresponding - `py_replacement` string. Tokens are identified by the `py_delimiter` - character provided. 
- """ - - return Column.from_pylibcudf( - nvtext.replace.filter_tokens( - strings.to_pylibcudf(mode="read"), - min_token_length, - py_replacement.device_value.c_value, - py_delimiter.device_value.c_value, - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/stemmer.pyx b/python/cudf/cudf/_lib/nvtext/stemmer.pyx deleted file mode 100644 index 63a389b64d5..00000000000 --- a/python/cudf/cudf/_lib/nvtext/stemmer.pyx +++ /dev/null @@ -1,55 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from enum import IntEnum - -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.nvtext.stemmer cimport ( - letter_type, - underlying_type_t_letter_type, -) -from pylibcudf.libcudf.types cimport size_type - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -class LetterType(IntEnum): - CONSONANT = letter_type.CONSONANT - VOWEL = letter_type.VOWEL - - -@acquire_spill_lock() -def porter_stemmer_measure(Column strings): - return Column.from_pylibcudf( - nvtext.stemmer.porter_stemmer_measure( - strings.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def is_letter(Column strings, - object ltype, - size_type index): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - index, - ) - ) - - -@acquire_spill_lock() -def is_letter_multi(Column strings, - object ltype, - Column indices): - return Column.from_pylibcudf( - nvtext.stemmer.is_letter( - strings.to_pylibcudf(mode="read"), - ltype==LetterType.VOWEL, - indices.to_pylibcudf(mode="read"), - ) - ) diff --git a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx b/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx deleted file mode 100644 index 5e0bfb74705..00000000000 --- a/python/cudf/cudf/_lib/nvtext/subword_tokenize.pyx +++ /dev/null @@ -1,38 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
- -from libc.stdint cimport uint32_t - -from cudf.core.buffer import acquire_spill_lock - -from libcpp cimport bool - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def subword_tokenize_inmem_hash( - Column strings, - object hashed_vocabulary, - uint32_t max_sequence_length=64, - uint32_t stride=48, - bool do_lower=True, - bool do_truncate=False, -): - """ - Subword tokenizes text series by using the pre-loaded hashed vocabulary - """ - result = nvtext.subword_tokenize.subword_tokenize( - strings.to_pylibcudf(mode="read"), - hashed_vocabulary, - max_sequence_length, - stride, - do_lower, - do_truncate, - ) - # return the 3 tensor components - tokens = Column.from_pylibcudf(result[0]) - masks = Column.from_pylibcudf(result[1]) - metadata = Column.from_pylibcudf(result[2]) - return tokens, masks, metadata diff --git a/python/cudf/cudf/_lib/nvtext/tokenize.pyx b/python/cudf/cudf/_lib/nvtext/tokenize.pyx deleted file mode 100644 index f473c48e2f7..00000000000 --- a/python/cudf/cudf/_lib/nvtext/tokenize.pyx +++ /dev/null @@ -1,86 +0,0 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
- -from cudf.core.buffer import acquire_spill_lock - -from pylibcudf.libcudf.types cimport size_type - -from pylibcudf.nvtext.tokenize import TokenizeVocabulary # no-cython-lint - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def _tokenize_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _tokenize_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read"), - ) - ) - - -@acquire_spill_lock() -def _count_tokens_scalar(Column strings, object py_delimiter): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_scalar( - strings.to_pylibcudf(mode="read"), - py_delimiter.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def _count_tokens_column(Column strings, Column delimiters): - return Column.from_pylibcudf( - nvtext.tokenize.count_tokens_column( - strings.to_pylibcudf(mode="read"), - delimiters.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def character_tokenize(Column strings): - return Column.from_pylibcudf( - nvtext.tokenize.character_tokenize( - strings.to_pylibcudf(mode="read") - ) - ) - - -@acquire_spill_lock() -def detokenize(Column strings, Column indices, object py_separator): - return Column.from_pylibcudf( - nvtext.tokenize.detokenize( - strings.to_pylibcudf(mode="read"), - indices.to_pylibcudf(mode="read"), - py_separator.device_value.c_value - ) - ) - - -@acquire_spill_lock() -def tokenize_with_vocabulary(Column strings, - object vocabulary, - object py_delimiter, - size_type default_id): - return Column.from_pylibcudf( - nvtext.tokenize.tokenize_with_vocabulary( - strings.to_pylibcudf(mode="read"), - vocabulary, - py_delimiter.device_value.c_value, - default_id - ) - ) diff 
--git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py index b795c54c112..48a5c966bc7 100644 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ b/python/cudf/cudf/_lib/strings/__init__.py @@ -1,37 +1,4 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams, - generate_ngrams, - hash_character_ngrams, -) -from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import ( - minhash, - minhash64, - minhash64_permuted, - minhash_permuted, - word_minhash, - word_minhash64, -) -from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize -from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces -from cudf._lib.nvtext.replace import filter_tokens, replace_tokens -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter, - is_letter_multi, - porter_stemmer_measure, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column, - _count_tokens_scalar, - _tokenize_column, - _tokenize_scalar, - character_tokenize, - detokenize, - tokenize_with_vocabulary, -) from cudf._lib.strings.convert.convert_fixed_point import to_decimal from cudf._lib.strings.convert.convert_floats import is_float from cudf._lib.strings.convert.convert_integers import is_integer diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py index 8d38a5f2272..b49f5154697 100644 --- a/python/cudf/cudf/core/byte_pair_encoding.py +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.byte_pair_encode import ( - byte_pair_encoding as cpp_byte_pair_encoding, -) class BytePairEncoder: @@ -25,12 +22,12 @@ class BytePairEncoder: BytePairEncoder """ - def __init__(self, merges_pair: "cudf.Series"): + def __init__(self, merges_pair: cudf.Series) -> None: 
self.merge_pairs = plc.nvtext.byte_pair_encode.BPEMergePairs( merges_pair._column.to_pylibcudf(mode="read") ) - def __call__(self, text, separator: str = " ") -> cudf.Series: + def __call__(self, text: cudf.Series, separator: str = " ") -> cudf.Series: """ Parameters @@ -57,6 +54,6 @@ def __call__(self, text, separator: str = " ") -> cudf.Series: dtype: object """ sep = cudf.Scalar(separator, dtype="str") - result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) - - return cudf.Series._from_column(result) + return cudf.Series._from_column( + text._column.byte_pair_encoding(self.merge_pairs, sep) + ) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 6b45828568c..e506ef0b9d7 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -44,6 +44,8 @@ SeriesOrIndex, ) from cudf.core.buffer import Buffer + from cudf.core.column.lists import ListColumn + from cudf.core.column.numerical import NumericalColumn def str_to_boolean(column: StringColumn): @@ -622,7 +624,7 @@ def join( def _split_by_character(self): col = self._column.fillna("") # sanitize nulls - result_col = libstrings.character_tokenize(col) + result_col = col.character_tokenize() offset_col = col.children[0] @@ -4696,9 +4698,7 @@ def normalize_spaces(self) -> SeriesOrIndex: 1 test string dtype: object """ - return self._return_or_inplace( - libstrings.normalize_spaces(self._column) - ) + return self._return_or_inplace(self._column.normalize_spaces()) def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: r""" @@ -4746,7 +4746,7 @@ def normalize_characters(self, do_lower: bool = True) -> SeriesOrIndex: dtype: object """ return self._return_or_inplace( - libstrings.normalize_characters(self._column, do_lower) + self._column.normalize_characters(do_lower) ) def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: @@ -4778,16 +4778,16 @@ def tokenize(self, delimiter: str = " ") -> 
SeriesOrIndex: 2 goodbye dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + if isinstance(delim, Column): result = self._return_or_inplace( - libstrings._tokenize_column(self._column, delimiter), + self._column.tokenize_column(delim), retain_index=False, ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): result = self._return_or_inplace( - libstrings._tokenize_scalar(self._column, delimiter), + self._column.tokenize_scalar(delim), retain_index=False, ) else: @@ -4802,7 +4802,7 @@ def tokenize(self, delimiter: str = " ") -> SeriesOrIndex: return result def detokenize( - self, indices: "cudf.Series", separator: str = " " + self, indices: cudf.Series, separator: str = " " ) -> SeriesOrIndex: """ Combines tokens into strings by concatenating them in the order @@ -4832,9 +4832,9 @@ def detokenize( 2 three dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.detokenize(self._column, indices._column, separator), + self._column.detokenize(indices._column, sep), # type: ignore[arg-type] retain_index=False, ) @@ -4885,17 +4885,15 @@ def character_tokenize(self) -> SeriesOrIndex: 2 . 
dtype: object """ - result_col = libstrings.character_tokenize(self._column) + result_col = self._column.character_tokenize() if isinstance(self._parent, cudf.Series): lengths = self.len().fillna(0) index = self._parent.index.repeat(lengths) - return cudf.Series._from_column( + return type(self._parent)._from_column( result_col, name=self._parent.name, index=index ) - elif isinstance(self._parent, cudf.BaseIndex): - return cudf.Index._from_column(result_col, name=self._parent.name) else: - return result_col + return self._return_or_inplace(result_col) def token_count(self, delimiter: str = " ") -> SeriesOrIndex: """ @@ -4922,15 +4920,15 @@ def token_count(self, delimiter: str = " ") -> SeriesOrIndex: 2 0 dtype: int32 """ - delimiter = _massage_string_arg(delimiter, "delimiter", allow_col=True) - if isinstance(delimiter, Column): + delim = _massage_string_arg(delimiter, "delimiter", allow_col=True) + if isinstance(delim, Column): return self._return_or_inplace( - libstrings._count_tokens_column(self._column, delimiter) + self._column.count_tokens_column(delim) ) - elif isinstance(delimiter, cudf.Scalar): + elif isinstance(delim, cudf.Scalar): return self._return_or_inplace( - libstrings._count_tokens_scalar(self._column, delimiter) + self._column.count_tokens_scalar(delim) # type: ignore[arg-type] ) else: raise TypeError( @@ -4969,9 +4967,9 @@ def ngrams(self, n: int = 2, separator: str = "_") -> SeriesOrIndex: 2 xyz_hhh dtype: object """ - separator = _massage_string_arg(separator, "separator") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.generate_ngrams(self._column, n, separator), + self._column.generate_ngrams(n, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5018,7 +5016,7 @@ def character_ngrams( dtype: list """ result = self._return_or_inplace( - libstrings.generate_character_ngrams(self._column, n), + self._column.generate_character_ngrams(n), retain_index=True, ) if isinstance(result, 
cudf.Series) and not as_list: @@ -5063,7 +5061,7 @@ def hash_character_ngrams( """ result = self._return_or_inplace( - libstrings.hash_character_ngrams(self._column, n), + self._column.hash_character_ngrams(n), retain_index=True, ) if isinstance(result, cudf.Series) and not as_list: @@ -5101,10 +5099,10 @@ def ngrams_tokenize( 2 best_book dtype: object """ - delimiter = _massage_string_arg(delimiter, "delimiter") - separator = _massage_string_arg(separator, "separator") + delim = _massage_string_arg(delimiter, "delimiter") + sep = _massage_string_arg(separator, "separator") return self._return_or_inplace( - libstrings.ngrams_tokenize(self._column, n, delimiter, separator), + self._column.ngrams_tokenize(n, delim, sep), # type: ignore[arg-type] retain_index=False, ) @@ -5183,10 +5181,9 @@ def replace_tokens( ) return self._return_or_inplace( - libstrings.replace_tokens( - self._column, - targets_column, - replacements_column, + self._column.replace_tokens( + targets_column, # type: ignore[arg-type] + replacements_column, # type: ignore[arg-type] cudf.Scalar(delimiter, dtype="str"), ), ) @@ -5254,8 +5251,7 @@ def filter_tokens( ) return self._return_or_inplace( - libstrings.filter_tokens( - self._column, + self._column.filter_tokens( min_token_length, cudf.Scalar(replacement, dtype="str"), cudf.Scalar(delimiter, dtype="str"), @@ -5281,9 +5277,7 @@ def porter_stemmer_measure(self) -> SeriesOrIndex: 1 2 dtype: int32 """ - return self._return_or_inplace( - libstrings.porter_stemmer_measure(self._column) - ) + return self._return_or_inplace(self._column.porter_stemmer_measure()) def is_consonant(self, position) -> SeriesOrIndex: """ @@ -5316,17 +5310,10 @@ def is_consonant(self, position) -> SeriesOrIndex: 1 False dtype: bool """ - ltype = libstrings.LetterType.CONSONANT - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) 
return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(False, position) # type: ignore[arg-type] ) def is_vowel(self, position) -> SeriesOrIndex: @@ -5360,17 +5347,10 @@ def is_vowel(self, position) -> SeriesOrIndex: 1 True dtype: bool """ - ltype = libstrings.LetterType.VOWEL - if can_convert_to_column(position): - return self._return_or_inplace( - libstrings.is_letter_multi( - self._column, ltype, column.as_column(position) - ), - ) - + position = column.as_column(position) return self._return_or_inplace( - libstrings.is_letter(self._column, ltype, position) + self._column.is_letter(True, position) # type: ignore[arg-type] ) def edit_distance(self, targets) -> SeriesOrIndex: @@ -5419,7 +5399,7 @@ def edit_distance(self, targets) -> SeriesOrIndex: ) return self._return_or_inplace( - libstrings.edit_distance(self._column, targets_column) + self._column.edit_distance(targets_column) # type: ignore[arg-type] ) def edit_distance_matrix(self) -> SeriesOrIndex: @@ -5459,9 +5439,7 @@ def edit_distance_matrix(self) -> SeriesOrIndex: "Cannot compute edit distance between null strings. " "Consider removing them using `dropna` or fill with `fillna`." 
) - return self._return_or_inplace( - libstrings.edit_distance_matrix(self._column) - ) + return self._return_or_inplace(self._column.edit_distance_matrix()) def minhash( self, seeds: ColumnLike | None = None, width: int = 4 @@ -5503,7 +5481,7 @@ def minhash( f"Expecting a Series with dtype uint32, got {type(seeds)}" ) return self._return_or_inplace( - libstrings.minhash(self._column, seeds_column, width) + self._column.minhash(seeds_column, width) # type: ignore[arg-type] ) def minhash_permuted( @@ -5554,9 +5532,7 @@ def minhash_permuted( f"Expecting a Series with dtype uint32, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash_permuted( - self._column, seed, a_column, b_column, width - ) + self._column.minhash_permuted(seed, a_column, b_column, width) # type: ignore[arg-type] ) def minhash64( @@ -5597,7 +5573,7 @@ def minhash64( f"Expecting a Series with dtype uint64, got {type(seeds)}" ) return self._return_or_inplace( - libstrings.minhash64(self._column, seeds_column, width) + self._column.minhash64(seeds_column, width) # type: ignore[arg-type] ) def minhash64_permuted( @@ -5648,9 +5624,7 @@ def minhash64_permuted( f"Expecting a Series with dtype uint64, got {type(b)}" ) return self._return_or_inplace( - libstrings.minhash64_permuted( - self._column, seed, a_column, b_column, width - ) + self._column.minhash64_permuted(seed, a_column, b_column, width) # type: ignore[arg-type] ) def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: @@ -5683,9 +5657,7 @@ def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: raise ValueError( f"Expecting a Series with dtype uint32, got {type(seeds)}" ) - return self._return_or_inplace( - libstrings.word_minhash(self._column, seeds_column) - ) + return self._return_or_inplace(self._column.word_minhash(seeds_column)) # type: ignore[arg-type] def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ @@ -5720,7 +5692,7 @@ def word_minhash64(self, seeds: 
ColumnLike | None = None) -> SeriesOrIndex: f"Expecting a Series with dtype uint64, got {type(seeds)}" ) return self._return_or_inplace( - libstrings.word_minhash64(self._column, seeds_column) + self._column.word_minhash64(seeds_column) # type: ignore[arg-type] ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: @@ -5746,13 +5718,14 @@ def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: 1 0.307692 dtype: float32 """ - return self._return_or_inplace( - libstrings.jaccard_index(self._column, input._column, width), + self._column.jaccard_index(input._column, width) ) -def _massage_string_arg(value, name, allow_col=False): +def _massage_string_arg( + value, name, allow_col: bool = False +) -> StringColumn | cudf.Scalar: if isinstance(value, cudf.Scalar): return value @@ -5763,9 +5736,9 @@ def _massage_string_arg(value, name, allow_col=False): if allow_col: if isinstance(value, list): - return column.as_column(value, dtype="str") + return column.as_column(value, dtype="str") # type: ignore[return-value] - if isinstance(value, Column) and is_string_dtype(value.dtype): + if isinstance(value, StringColumn): return value allowed_types.append("Column") @@ -6305,6 +6278,312 @@ def view(self, dtype) -> "cudf.core.column.ColumnBase": return to_view.view(dtype) + @acquire_spill_lock() + def minhash(self, seeds: NumericalColumn, width: int = 4) -> ListColumn: + result = plc.nvtext.minhash.minhash( + self.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def minhash_permuted( + self, + seed: np.uint32, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash_permuted( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + 
@acquire_spill_lock() + def minhash64(self, seeds: NumericalColumn, width: int = 4) -> ListColumn: + result = plc.nvtext.minhash.minhash64( + self.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + width, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def minhash64_permuted( + self, + seed: np.uint64, + a: NumericalColumn, + b: NumericalColumn, + width: int, + ) -> ListColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.minhash.minhash64_permuted( + self.to_pylibcudf(mode="read"), + seed, + a.to_pylibcudf(mode="read"), + b.to_pylibcudf(mode="read"), + width, + ) + ) + + @acquire_spill_lock() + def word_minhash(self, seeds: NumericalColumn) -> ListColumn: + result = plc.nvtext.minhash.word_minhash( + self.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def word_minhash64(self, seeds: NumericalColumn) -> ListColumn: + result = plc.nvtext.minhash.word_minhash64( + self.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def jaccard_index(self, other: Self, width: int) -> NumericalColumn: + result = plc.nvtext.jaccard.jaccard_index( + self.to_pylibcudf(mode="read"), + other.to_pylibcudf(mode="read"), + width, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_ngrams(self, ngrams: int, separator: cudf.Scalar) -> Self: + result = plc.nvtext.generate_ngrams.generate_ngrams( + self.to_pylibcudf(mode="read"), + ngrams, + separator.device_value.c_value, + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def generate_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.generate_character_ngrams( 
+ self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def hash_character_ngrams(self, ngrams: int) -> ListColumn: + result = plc.nvtext.generate_ngrams.hash_character_ngrams( + self.to_pylibcudf(mode="read"), ngrams + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance(self, targets: Self) -> NumericalColumn: + result = plc.nvtext.edit_distance.edit_distance( + self.to_pylibcudf(mode="read"), targets.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def edit_distance_matrix(self) -> ListColumn: + result = plc.nvtext.edit_distance.edit_distance_matrix( + self.to_pylibcudf(mode="read") + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def byte_pair_encoding( + self, + merge_pairs: plc.nvtext.byte_pair_encode.BPEMergePairs, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.byte_pair_encode.byte_pair_encoding( + self.to_pylibcudf(mode="read"), + merge_pairs, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def ngrams_tokenize( + self, + ngrams: int, + delimiter: cudf.Scalar, + separator: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.ngrams_tokenize.ngrams_tokenize( + self.to_pylibcudf(mode="read"), + ngrams, + delimiter.device_value.c_value, + separator.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def normalize_spaces(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.normalize.normalize_spaces( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def normalize_characters(self, do_lower: bool = True) -> Self: + return Column.from_pylibcudf( # type: ignore[return-value] 
+ plc.nvtext.normalize.normalize_characters( + self.to_pylibcudf(mode="read"), + do_lower, + ) + ) + + @acquire_spill_lock() + def replace_tokens( + self, targets: Self, replacements: Self, delimiter: cudf.Scalar + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.replace_tokens( + self.to_pylibcudf(mode="read"), + targets.to_pylibcudf(mode="read"), + replacements.to_pylibcudf(mode="read"), + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def filter_tokens( + self, + min_token_length: int, + replacement: cudf.Scalar, + delimiter: cudf.Scalar, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.replace.filter_tokens( + self.to_pylibcudf(mode="read"), + min_token_length, + replacement.device_value.c_value, + delimiter.device_value.c_value, + ) + ) + + @acquire_spill_lock() + def porter_stemmer_measure(self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.porter_stemmer_measure( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def is_letter(self, is_vowel: bool, index: int | NumericalColumn) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.stemmer.is_letter( + self.to_pylibcudf(mode="read"), + is_vowel, + index + if isinstance(index, int) + else index.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def subword_tokenize( + self, + hashed_vocabulary: plc.nvtext.subword_tokenize.HashedVocabulary, + max_sequence_length: int = 64, + stride: int = 48, + do_lower: bool = True, + do_truncate: bool = False, + ) -> tuple[ColumnBase, ColumnBase, ColumnBase]: + """ + Subword tokenizes text series by using the pre-loaded hashed vocabulary + """ + result = plc.nvtext.subword_tokenize.subword_tokenize( + self.to_pylibcudf(mode="read"), + hashed_vocabulary, + max_sequence_length, + stride, + do_lower, + do_truncate, + ) + # return the 3 tensor 
components + tokens = type(self).from_pylibcudf(result[0]) + masks = type(self).from_pylibcudf(result[1]) + metadata = type(self).from_pylibcudf(result[2]) + return tokens, masks, metadata + + @acquire_spill_lock() + def tokenize_scalar(self, delimiter: cudf.Scalar) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def tokenize_column(self, delimiters: Self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def count_tokens_scalar(self, delimiter: cudf.Scalar) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_scalar( + self.to_pylibcudf(mode="read"), delimiter.device_value.c_value + ) + ) + + @acquire_spill_lock() + def count_tokens_column(self, delimiters: Self) -> NumericalColumn: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.count_tokens_column( + self.to_pylibcudf(mode="read"), + delimiters.to_pylibcudf(mode="read"), + ) + ) + + @acquire_spill_lock() + def character_tokenize(self) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.character_tokenize( + self.to_pylibcudf(mode="read") + ) + ) + + @acquire_spill_lock() + def tokenize_with_vocabulary( + self, + vocabulary: plc.nvtext.tokenize.TokenizeVocabulary, + delimiter: cudf.Scalar, + default_id: int, + ) -> Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.tokenize_with_vocabulary( + self.to_pylibcudf(mode="read"), + vocabulary, + delimiter.device_value.c_value, + default_id, + ) + ) + + @acquire_spill_lock() + def detokenize(self, indices: ColumnBase, separator: cudf.Scalar) -> 
Self: + return type(self).from_pylibcudf( # type: ignore[return-value] + plc.nvtext.tokenize.detokenize( + self.to_pylibcudf(mode="read"), + indices.to_pylibcudf(mode="read"), + separator.device_value.c_value, + ) + ) + def _modify_characters( self, method: Callable[[plc.Column], plc.Column] ) -> Self: diff --git a/python/cudf/cudf/core/subword_tokenizer.py b/python/cudf/cudf/core/subword_tokenizer.py index dda1f199078..479838ef2a8 100644 --- a/python/cudf/cudf/core/subword_tokenizer.py +++ b/python/cudf/cudf/core/subword_tokenizer.py @@ -8,10 +8,6 @@ import pylibcudf as plc -from cudf._lib.nvtext.subword_tokenize import ( - subword_tokenize_inmem_hash as cpp_subword_tokenize, -) - def _cast_to_appropriate_type(ar, cast_type): if cast_type == "cp": @@ -210,8 +206,7 @@ def __call__( stride = max_length - stride # behavior varies from subword_tokenize but maps with huggingface - input_ids, attention_mask, metadata = cpp_subword_tokenize( - text._column, + input_ids, attention_mask, metadata = text._column.subword_tokenize( self.vocab_file, max_sequence_length=max_length, stride=stride, diff --git a/python/cudf/cudf/core/tokenize_vocabulary.py b/python/cudf/cudf/core/tokenize_vocabulary.py index 1e31376cce8..fb8b9b3131c 100644 --- a/python/cudf/cudf/core/tokenize_vocabulary.py +++ b/python/cudf/cudf/core/tokenize_vocabulary.py @@ -5,9 +5,6 @@ import pylibcudf as plc import cudf -from cudf._lib.nvtext.tokenize import ( - tokenize_with_vocabulary as cpp_tokenize_with_vocabulary, -) class TokenizeVocabulary: @@ -20,7 +17,7 @@ class TokenizeVocabulary: Strings column of vocabulary terms """ - def __init__(self, vocabulary: "cudf.Series"): + def __init__(self, vocabulary: cudf.Series) -> None: self.vocabulary = plc.nvtext.tokenize.TokenizeVocabulary( vocabulary._column.to_pylibcudf(mode="read") ) @@ -46,8 +43,8 @@ def tokenize( if delimiter is None: delimiter = "" delim = cudf.Scalar(delimiter, dtype="str") - result = cpp_tokenize_with_vocabulary( - text._column, 
self.vocabulary, delim, default_id + result = text._column.tokenize_with_vocabulary( + self.vocabulary, delim, default_id ) return cudf.Series._from_column(result) From e2487ef53624d201d14f2f066c32c4a8fbd45f7c Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Thu, 5 Dec 2024 16:34:46 -0800 Subject: [PATCH 2/5] Address text failures --- python/cudf/cudf/core/column/lists.py | 16 ++++++++++++++++ python/cudf/cudf/core/column/string.py | 26 +++++--------------------- 2 files changed, 21 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index ea384888388..82a67b43bab 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -437,6 +437,22 @@ def segmented_gather(self, gather_map: ColumnBase) -> ColumnBase: ) ) + @acquire_spill_lock() + def word_minhash64(self, seeds: NumericalColumn) -> ListColumn: + result = plc.nvtext.minhash.word_minhash64( + self.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + + @acquire_spill_lock() + def word_minhash(self, seeds: NumericalColumn) -> Self: + result = plc.nvtext.minhash.word_minhash( + self.to_pylibcudf(mode="read"), + seeds.to_pylibcudf(mode="read"), + ) + return type(self).from_pylibcudf(result) # type: ignore[return-value] + class ListMethods(ColumnMethods): """ diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index e506ef0b9d7..fb0ed1637f1 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -5313,7 +5313,7 @@ def is_consonant(self, position) -> SeriesOrIndex: if can_convert_to_column(position): position = column.as_column(position) return self._return_or_inplace( - self._column.is_letter(False, column.as_column(position)) # type: ignore[arg-type] + self._column.is_letter(False, position) 
# type: ignore[arg-type] ) def is_vowel(self, position) -> SeriesOrIndex: @@ -5350,7 +5350,7 @@ def is_vowel(self, position) -> SeriesOrIndex: if can_convert_to_column(position): position = column.as_column(position) return self._return_or_inplace( - self._column.is_letter(True, column.as_column(position)) # type: ignore[arg-type] + self._column.is_letter(True, position) # type: ignore[arg-type] ) def edit_distance(self, targets) -> SeriesOrIndex: @@ -5657,7 +5657,7 @@ def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: raise ValueError( f"Expecting a Series with dtype uint32, got {type(seeds)}" ) - return self._return_or_inplace(self._column.word_minhash(seeds_column)) # type: ignore[arg-type] + return self._return_or_inplace(self._column.word_minhash(seeds_column)) # type: ignore[attr-defined] def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: """ @@ -5692,7 +5692,7 @@ def word_minhash64(self, seeds: ColumnLike | None = None) -> SeriesOrIndex: f"Expecting a Series with dtype uint64, got {type(seeds)}" ) return self._return_or_inplace( - self._column.word_minhash64(seeds_column) # type: ignore[arg-type] + self._column.word_minhash64(seeds_column) # type: ignore[attr-defined] ) def jaccard_index(self, input: cudf.Series, width: int) -> SeriesOrIndex: @@ -6332,22 +6332,6 @@ def minhash64_permuted( ) ) - @acquire_spill_lock() - def word_minhash(self, seeds: NumericalColumn) -> ListColumn: - result = plc.nvtext.minhash.word_minhash( - self.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return type(self).from_pylibcudf(result) # type: ignore[return-value] - - @acquire_spill_lock() - def word_minhash64(self, seeds: NumericalColumn) -> ListColumn: - result = plc.nvtext.minhash.word_minhash64( - self.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return type(self).from_pylibcudf(result) # type: ignore[return-value] - @acquire_spill_lock() def jaccard_index(self, other: Self, width: 
int) -> NumericalColumn: result = plc.nvtext.jaccard.jaccard_index( @@ -6473,7 +6457,7 @@ def filter_tokens( @acquire_spill_lock() def porter_stemmer_measure(self) -> NumericalColumn: return type(self).from_pylibcudf( # type: ignore[return-value] - plc.nvtext.tokenize.porter_stemmer_measure( + plc.nvtext.stemmer.porter_stemmer_measure( self.to_pylibcudf(mode="read") ) ) From 1ff4ee710d7f20181b76f5ecca9bebb72e66ce21 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:46:23 -0800 Subject: [PATCH 3/5] Remove nvtext and strings --- python/cudf/cudf/_lib/nvtext/minhash.pyx | 35 ----------------------- python/cudf/cudf/_lib/strings/__init__.py | 30 ------------------- 2 files changed, 65 deletions(-) delete mode 100644 python/cudf/cudf/_lib/nvtext/minhash.pyx delete mode 100644 python/cudf/cudf/_lib/strings/__init__.py diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx deleted file mode 100644 index 9f2b3f92502..00000000000 --- a/python/cudf/cudf/_lib/nvtext/minhash.pyx +++ /dev/null @@ -1,35 +0,0 @@ -# Copyright (c) 2023-2024, NVIDIA CORPORATION. 
- -from libc.stdint cimport uint32_t, uint64_t - -from cudf.core.buffer import acquire_spill_lock - -from cudf._lib.column cimport Column - -from pylibcudf import nvtext - - -@acquire_spill_lock() -def minhash(Column input, uint32_t seed, Column a, Column b, int width): - return Column.from_pylibcudf( - nvtext.minhash.minhash( - input.to_pylibcudf(mode="read"), - seed, - a.to_pylibcudf(mode="read"), - b.to_pylibcudf(mode="read"), - width, - ) - ) - - -@acquire_spill_lock() -def minhash64(Column input, uint64_t seed, Column a, Column b, int width): - return Column.from_pylibcudf( - nvtext.minhash.minhash64( - input.to_pylibcudf(mode="read"), - seed, - a.to_pylibcudf(mode="read"), - b.to_pylibcudf(mode="read"), - width, - ) - ) diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py deleted file mode 100644 index b9095a22a42..00000000000 --- a/python/cudf/cudf/_lib/strings/__init__.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. 
-from cudf._lib.nvtext.edit_distance import edit_distance, edit_distance_matrix -from cudf._lib.nvtext.generate_ngrams import ( - generate_character_ngrams, - generate_ngrams, - hash_character_ngrams, -) -from cudf._lib.nvtext.jaccard import jaccard_index -from cudf._lib.nvtext.minhash import ( - minhash, - minhash64, -) -from cudf._lib.nvtext.ngrams_tokenize import ngrams_tokenize -from cudf._lib.nvtext.normalize import normalize_characters, normalize_spaces -from cudf._lib.nvtext.replace import filter_tokens, replace_tokens -from cudf._lib.nvtext.stemmer import ( - LetterType, - is_letter, - is_letter_multi, - porter_stemmer_measure, -) -from cudf._lib.nvtext.tokenize import ( - _count_tokens_column, - _count_tokens_scalar, - _tokenize_column, - _tokenize_scalar, - character_tokenize, - detokenize, - tokenize_with_vocabulary, -) From 018fa54f2c2bf24769e4767a1b22f756574f9569 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 10 Dec 2024 15:48:50 -0800 Subject: [PATCH 4/5] Remove word_minhash --- python/cudf/cudf/core/column/lists.py | 16 ---------------- 1 file changed, 16 deletions(-) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index 04a3a84506a..ba98e28f6a2 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -441,22 +441,6 @@ def segmented_gather(self, gather_map: ColumnBase) -> ColumnBase: ) ) - @acquire_spill_lock() - def word_minhash64(self, seeds: NumericalColumn) -> ListColumn: - result = plc.nvtext.minhash.word_minhash64( - self.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return type(self).from_pylibcudf(result) # type: ignore[return-value] - - @acquire_spill_lock() - def word_minhash(self, seeds: NumericalColumn) -> Self: - result = plc.nvtext.minhash.word_minhash( - self.to_pylibcudf(mode="read"), - seeds.to_pylibcudf(mode="read"), - ) - return type(self).from_pylibcudf(result) # type: 
ignore[return-value] - class ListMethods(ColumnMethods): """ From 09148f03cde8736627379115737fc318c4b07efb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 10 Dec 2024 16:47:26 -0800 Subject: [PATCH 5/5] Remove _permuted --- python/cudf/cudf/core/column/string.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py index 95f14495af7..c021554f3bd 100644 --- a/python/cudf/cudf/core/column/string.py +++ b/python/cudf/cudf/core/column/string.py @@ -6135,7 +6135,7 @@ def minhash( width: int, ) -> ListColumn: return type(self).from_pylibcudf( # type: ignore[return-value] - plc.nvtext.minhash.minhash_permuted( + plc.nvtext.minhash.minhash( self.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"), @@ -6153,7 +6153,7 @@ def minhash64( width: int, ) -> ListColumn: return type(self).from_pylibcudf( # type: ignore[return-value] - plc.nvtext.minhash.minhash64_permuted( + plc.nvtext.minhash.minhash64( self.to_pylibcudf(mode="read"), seed, a.to_pylibcudf(mode="read"),