From 525eca0a668650537ff785135768b83ef3353617 Mon Sep 17 00:00:00 2001 From: Feng Jiang <106386742+Feng-Jiang28@users.noreply.github.com> Date: Thu, 18 Jul 2024 13:01:51 +0800 Subject: [PATCH] implement kernel for substring_index (#2205) * draft Signed-off-by: fejiang * jni binding Signed-off-by: fejiang * some change Signed-off-by: fejiang * some change Signed-off-by: fejiang * test files for substringindex Signed-off-by: fejiang * remove unwanted files Signed-off-by: fejiang * format it Signed-off-by: fejiang * modified ColumnView.java adding methods Signed-off-by: fejiang * modified ColumnView.java Signed-off-by: fejiang * modified Signed-off-by: fejiang * substringIndex.java Signed-off-by: fejiang * add java test Signed-off-by: fejiang * add java test Signed-off-by: fejiang * added java test Signed-off-by: fejiang * rename gpusubstringindex Signed-off-by: fejiang * name changed Signed-off-by: fejiang * add one more test case Signed-off-by: fejiang * cudf update Signed-off-by: fejiang * cudf conflict resolve Signed-off-by: fejiang * cpp test added Signed-off-by: fejiang * remove unwanted code Signed-off-by: fejiang * cudf updated and clang format Signed-off-by: fejiang * removed comments & dealing with formatting Signed-off-by: fejiang * CMakeLists formatting Signed-off-by: fejiang * testcases added Signed-off-by: fejiang * lexicographic order in CMakeLists Signed-off-by: fejiang * change delimiter type to Scalar Signed-off-by: fejiang * format Signed-off-by: fejiang * reuse slice_strings function Signed-off-by: fejiang --------- Signed-off-by: fejiang --- src/main/cpp/CMakeLists.txt | 2 + src/main/cpp/src/SubStringIndexJni.cpp | 37 ++++ src/main/cpp/src/substring_index.cu | 166 ++++++++++++++++++ src/main/cpp/src/substring_index.hpp | 40 +++++ src/main/cpp/tests/CMakeLists.txt | 4 + src/main/cpp/tests/substring_index.cpp | 54 ++++++ .../rapids/jni/GpuSubstringIndexUtils.java | 31 ++++ .../jni/GpuSubstringIndexUtilsTest.java | 76 ++++++++ 8 files changed, 410 insertions(+) create mode 100644 src/main/cpp/src/SubStringIndexJni.cpp create mode 100644 src/main/cpp/src/substring_index.cu create mode 100644 src/main/cpp/src/substring_index.hpp create mode 100644 src/main/cpp/tests/substring_index.cpp create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtilsTest.java diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 3c7181819c..72220e9360 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -197,6 +197,7 @@ add_library( src/RegexRewriteUtilsJni.cpp src/RowConversionJni.cpp src/SparkResourceAdaptorJni.cpp + src/SubStringIndexJni.cpp src/ZOrderJni.cpp src/bloom_filter.cu src/case_when.cu @@ -214,6 +215,7 @@ add_library( src/parse_uri.cu src/regex_rewrite_utils.cu src/row_conversion.cu + src/substring_index.cu src/timezones.cu src/utilities.cu src/xxhash64.cu diff --git a/src/main/cpp/src/SubStringIndexJni.cpp b/src/main/cpp/src/SubStringIndexJni.cpp new file mode 100644 index 0000000000..1e53166ab7 --- /dev/null +++ b/src/main/cpp/src/SubStringIndexJni.cpp @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cudf_jni_apis.hpp" +#include "substring_index.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_GpuSubstringIndexUtils_substringIndex( + JNIEnv* env, jclass, jlong strings_handle, jlong delimiter, jint count) +{ + JNI_NULL_CHECK(env, strings_handle, "strings column handle is null", 0); + JNI_NULL_CHECK(env, delimiter, "delimiter scalar handle is null", 0); + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(strings_handle); + auto const strings_column = cudf::strings_column_view{*input}; + cudf::string_scalar* ss_scalar = reinterpret_cast(delimiter); + return cudf::jni::release_as_jlong( + spark_rapids_jni::substring_index(strings_column, *ss_scalar, count)); + } + CATCH_STD(env, 0); +} +} // extern "C" diff --git a/src/main/cpp/src/substring_index.cu b/src/main/cpp/src/substring_index.cu new file mode 100644 index 0000000000..c685b3b7c9 --- /dev/null +++ b/src/main/cpp/src/substring_index.cu @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#include "substring_index.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include + +using namespace cudf; + +namespace spark_rapids_jni { + +namespace detail { + +namespace { + +/** + * @brief Compute slice indices for each string. + * + * When slice_strings is invoked with a delimiter string and a delimiter count, we need to + * compute the start and end indices of the substring. This function accomplishes that. + */ +template +void compute_substring_indices(column_device_view const& d_column, + DelimiterItrT const delim_itr, + size_type delimiter_count, + size_type* start_char_pos, + size_type* end_char_pos, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref) +{ + auto strings_count = d_column.size(); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + strings_count, + [delim_itr, delimiter_count, start_char_pos, end_char_pos, d_column] __device__(size_type idx) { + auto const& delim_val_pair = delim_itr[idx]; + auto const& delim_val = delim_val_pair.first; // Don't use it yet + + // If the column value for this row is null, result is null. + // If the delimiter count is 0, result is empty string. + // If the global delimiter or the row specific delimiter is invalid or if it is empty, row + // value is empty. + if (d_column.is_null(idx) || !delim_val_pair.second || delim_val.empty()) return; + auto const& col_val = d_column.element(idx); + + // If the column value for the row is empty, the row value is empty. + if (!col_val.empty()) { + auto const col_val_len = col_val.length(); + auto const delimiter_len = delim_val.length(); + + auto nsearches = (delimiter_count < 0) ? -delimiter_count : delimiter_count; + bool const left_to_right = (delimiter_count > 0); + + size_type start_pos = start_char_pos[idx]; + size_type end_pos = col_val_len; + size_type char_pos = -1; + + end_char_pos[idx] = col_val_len; + + for (auto i = 0; i < nsearches; ++i) { + char_pos = left_to_right ? col_val.find(delim_val, start_pos) + : col_val.rfind(delim_val, 0, end_pos); + if (char_pos == string_view::npos) return; + if (left_to_right) + start_pos = char_pos + delimiter_len; + else + end_pos = char_pos; + } + if (left_to_right) + end_char_pos[idx] = char_pos; + else + start_char_pos[idx] = end_pos + delimiter_len; + } + }); +} + +} // namespace + +template +std::unique_ptr substring_index(strings_column_view const& strings, + DelimiterItrT const delimiter_itr, + size_type count, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + auto strings_count = strings.size(); + // If there aren't any rows, return an empty strings column + if (strings_count == 0) { return make_empty_column(type_id::STRING); } + + // Compute the substring indices first + auto start_chars_pos_vec = make_column_from_scalar(numeric_scalar(0, true, stream), + strings_count, + stream, + rmm::mr::get_current_device_resource()); + auto stop_chars_pos_vec = make_column_from_scalar(numeric_scalar(0, true, stream), + strings_count, + stream, + rmm::mr::get_current_device_resource()); + + auto start_char_pos = start_chars_pos_vec->mutable_view().data(); + auto end_char_pos = stop_chars_pos_vec->mutable_view().data(); + + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + + // If delimiter count is 0, the output column will contain empty strings + if (count != 0) { + // Compute the substring indices first + compute_substring_indices( + d_column, delimiter_itr, count, start_char_pos, end_char_pos, stream, mr); + } + + return cudf::strings::slice_strings( + strings, start_chars_pos_vec->view(), stop_chars_pos_vec->view(), stream, mr); +} + +} // namespace detail + +// external API + +std::unique_ptr substring_index(strings_column_view const& strings, + string_scalar const& delimiter, + size_type count, + rmm::device_async_resource_ref mr) +{ + CUDF_FUNC_RANGE(); + return detail::substring_index(strings, + cudf::detail::make_pair_iterator(delimiter), + count, + cudf::get_default_stream(), + mr); +} +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/substring_index.hpp b/src/main/cpp/src/substring_index.hpp new file mode 100644 index 0000000000..aacfc9b813 --- /dev/null +++ b/src/main/cpp/src/substring_index.hpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace spark_rapids_jni { + +/** + * @brief Returns the substring of strings before count occurrence of the delimiter delim. + * + * @param strings Strings column + * @param delimiter The delimiter string used to slice string + * @param count Specify the occurrence of the delimiter + * @return A string column used to store the result + */ +std::unique_ptr substring_index( + cudf::strings_column_view const& strings, + cudf::string_scalar const& delimiter, + cudf::size_type count, + rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/tests/CMakeLists.txt b/src/main/cpp/tests/CMakeLists.txt index 55a7b7a887..244d18c903 100644 --- a/src/main/cpp/tests/CMakeLists.txt +++ b/src/main/cpp/tests/CMakeLists.txt @@ -77,3 +77,7 @@ ConfigureTest(UTILITIES ConfigureTest(PARSE_URI parse_uri.cpp) + +ConfigureTest(SUBSTRING_INDEX + substring_index.cpp) + diff --git a/src/main/cpp/tests/substring_index.cpp b/src/main/cpp/tests/substring_index.cpp new file mode 100644 index 0000000000..2f837bec0c --- /dev/null +++ b/src/main/cpp/tests/substring_index.cpp @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include + +using namespace cudf; + +struct SubstringIndexTests : public test::BaseFixture {}; + +TEST_F(SubstringIndexTests, ScalarDelimiter) +{ + auto col0 = test::strings_column_wrapper({"www.yahoo.com", + "www.apache..org", + "tennis...com", + "nvidia....com", + "google...........com", + "microsoft...c.....co..m"}); + + auto exp_results = test::strings_column_wrapper( + {"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."}); + + auto results = + spark_rapids_jni::substring_index(strings_column_view{col0}, string_scalar("."), 3); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); +} diff --git a/src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java new file mode 100644 index 0000000000..a8750919c9 --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.*; + +public class GpuSubstringIndexUtils { + static{ + NativeDepsLoader.loadNativeDeps(); + } + + public static ColumnVector substringIndex(ColumnView cv, Scalar delimiter, int count){ + return new ColumnVector(substringIndex(cv.getNativeView(), CudfAccessor.getScalarHandle(delimiter), count)); + } + + private static native long substringIndex(long columnView, long delimiter, int count) throws CudfException; +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtilsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtilsTest.java new file mode 100644 index 0000000000..9666893448 --- /dev/null +++ b/src/test/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtilsTest.java @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.AssertUtils; +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.Scalar; +import ai.rapids.cudf.Table; +import org.junit.jupiter.api.Test; + +import java.util.ArrayList; +import java.util.List; + +public class GpuSubstringIndexUtilsTest { + @Test + void gpuSubstringIndexTest(){ + Table.TestBuilder tb = new Table.TestBuilder(); + tb.column( "www.apache.org"); + tb.column("www.apache"); + tb.column("www"); + tb.column(""); + tb.column("org"); + tb.column("apache.org"); + tb.column("www.apache.org"); + tb.column(""); + tb.column("大千世界大"); + tb.column("www||apache"); + + try(Table expected = tb.build()){ + Table.TestBuilder tb2 = new Table.TestBuilder(); + tb2.column("www.apache.org"); + tb2.column("www.apache.org"); + tb2.column("www.apache.org"); + tb2.column("www.apache.org"); + tb2.column("www.apache.org"); + tb2.column("www.apache.org"); + tb2.column("www.apache.org"); + tb2.column(""); + tb2.column("大千世界大千世界"); + tb2.column("www||apache||org"); + + Scalar dotScalar = Scalar.fromString("."); + Scalar cnChar = Scalar.fromString("千"); + Scalar verticalBar = Scalar.fromString("||"); + Scalar[] delimiterArray = new Scalar[]{dotScalar, dotScalar, dotScalar, dotScalar,dotScalar, dotScalar, dotScalar, dotScalar, cnChar, verticalBar}; + int[] countArray = new int[]{3, 2, 1, 0, -1, -2, -3, -2, 2, 2}; + List result = new ArrayList<>(); + try (Table origTable = tb2.build()){ + for(int i = 0; i < origTable.getNumberOfColumns(); i++){ + ColumnVector string_col = origTable.getColumn(i); + result.add(GpuSubstringIndexUtils.substringIndex(string_col, delimiterArray[i], countArray[i])); + } + try (Table result_tbl = new Table( + result.toArray(new ColumnVector[result.size()]))){ + AssertUtils.assertTablesAreEqual(expected, result_tbl); + } + }finally { + result.forEach(ColumnVector::close); + } + } + } +}