-
Notifications
You must be signed in to change notification settings - Fork 66
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
implement kernel for substring_index (#2205)
* draft Signed-off-by: fejiang <fejiang@nvidia.com> * jni binding Signed-off-by: fejiang <fejiang@nvidia.com> * some change Signed-off-by: fejiang <fejiang@nvidia.com> * some change Signed-off-by: fejiang <fejiang@nvidia.com> * test files for substringindex Signed-off-by: fejiang <fejiang@nvidia.com> * remove unwanted files Signed-off-by: fejiang <fejiang@nvidia.com> * format it Signed-off-by: fejiang <fejiang@nvidia.com> * modified ColumnView.java adding methods Signed-off-by: fejiang <fejiang@nvidia.com> * modified ColumnView.java Signed-off-by: fejiang <fejiang@nvidia.com> * modified Signed-off-by: fejiang <fejiang@nvidia.com> * substringIndex.java Signed-off-by: fejiang <fejiang@nvidia.com> * add java test Signed-off-by: fejiang <fejiang@nvidia.com> * add java test Signed-off-by: fejiang <fejiang@nvidia.com> * added java test Signed-off-by: fejiang <fejiang@nvidia.com> * rename gpusubstringindex Signed-off-by: fejiang <fejiang@nvidia.com> * name changed Signed-off-by: fejiang <fejiang@nvidia.com> * add one more test case Signed-off-by: fejiang <fejiang@nvidia.com> * cudf update Signed-off-by: fejiang <fejiang@nvidia.com> * cudf conflict resolve Signed-off-by: fejiang <fejiang@nvidia.com> * cpp test added Signed-off-by: fejiang <fejiang@nvidia.com> * remove unwanted code Signed-off-by: fejiang <fejiang@nvidia.com> * cudf updated and clang format Signed-off-by: fejiang <fejiang@nvidia.com> * removed comments & dealing with formatting Signed-off-by: fejiang <fejiang@nvidia.com> * CMakeLists formatting Signed-off-by: fejiang <fejiang@nvidia.com> * testcases added Signed-off-by: fejiang <fejiang@nvidia.com> * lexicographic order in CMakeLists Signed-off-by: fejiang <fejiang@nvidia.com> * change delimiter type to Scalar Signed-off-by: fejiang <fejiang@nvidia.com> * format Signed-off-by: fejiang <fejiang@nvidia.com> * reuse slice_strings function Signed-off-by: fejiang <fejiang@nvidia.com> --------- Signed-off-by: fejiang <fejiang@nvidia.com>
- Loading branch information
1 parent
4b9ca1f
commit 525eca0
Showing
8 changed files
with
410 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
/* | ||
* Copyright (c) 2019-2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include "cudf_jni_apis.hpp" | ||
#include "substring_index.hpp" | ||
|
||
extern "C" { | ||
|
||
/**
 * @brief JNI entry point backing GpuSubstringIndexUtils.substringIndex.
 *
 * Reinterprets the two native handles as a cudf column view and a cudf string scalar, runs
 * spark_rapids_jni::substring_index on them and returns the resulting column as a jlong handle.
 *
 * @param env JNI environment handed to the null-check, device-selection and catch helpers
 * @param strings_handle Native address of the input cudf::column_view
 * @param delimiter Native address of the delimiter cudf::string_scalar
 * @param count Delimiter occurrence count forwarded unchanged to substring_index
 * @return Handle of the newly created column; 0 is the value supplied to the null-check and
 *         catch macros for their failure paths
 */
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_GpuSubstringIndexUtils_substringIndex(
  JNIEnv* env, jclass, jlong strings_handle, jlong delimiter, jint count)
{
  JNI_NULL_CHECK(env, strings_handle, "strings column handle is null", 0);
  JNI_NULL_CHECK(env, delimiter, "delimiter scalar handle is null", 0);
  try {
    cudf::jni::auto_set_device(env);

    // Wrap the input column first; the handle is only reinterpreted, never copied.
    auto const* input_view = reinterpret_cast<cudf::column_view const*>(strings_handle);
    cudf::strings_column_view const input_strings{*input_view};

    // The delimiter handle is trusted to point at a string scalar; no type check is done here.
    auto* delimiter_scalar = reinterpret_cast<cudf::string_scalar*>(delimiter);

    return cudf::jni::release_as_jlong(
      spark_rapids_jni::substring_index(input_strings, *delimiter_scalar, count));
  }
  CATCH_STD(env, 0);
}
} // extern "C" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,166 @@ | ||
/* | ||
* Copyright (c) 2019-2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#include "substring_index.hpp" | ||
|
||
#include <cudf/column/column.hpp> | ||
#include <cudf/column/column_device_view.cuh> | ||
#include <cudf/column/column_factories.hpp> | ||
#include <cudf/detail/indexalator.cuh> | ||
#include <cudf/detail/iterator.cuh> | ||
#include <cudf/detail/null_mask.hpp> | ||
#include <cudf/detail/nvtx/ranges.hpp> | ||
#include <cudf/scalar/scalar_device_view.cuh> | ||
#include <cudf/strings/detail/strings_children.cuh> | ||
#include <cudf/strings/slice.hpp> | ||
#include <cudf/strings/string_view.cuh> | ||
#include <cudf/strings/strings_column_view.hpp> | ||
#include <cudf/utilities/default_stream.hpp> | ||
|
||
#include <rmm/cuda_stream_view.hpp> | ||
|
||
#include <thrust/for_each.h> | ||
#include <thrust/iterator/constant_iterator.h> | ||
#include <thrust/iterator/counting_iterator.h> | ||
#include <thrust/transform.h> | ||
|
||
using namespace cudf; | ||
|
||
namespace spark_rapids_jni { | ||
|
||
namespace detail { | ||
|
||
namespace { | ||
|
||
/**
 * @brief Compute, for every row, the character range that substring_index should keep.
 *
 * For each row this writes into `start_char_pos` / `end_char_pos` the bounds of the substring
 * selected by the delimiter and the delimiter count. Rows that are null, whose delimiter is
 * invalid or empty, or whose string is empty are left untouched, so they keep whatever values
 * the caller pre-filled.
 *
 * @tparam DelimiterItrT Iterator yielding a (string_view, validity) pair per row
 * @param d_column Device view of the input strings column
 * @param delim_itr Iterator over the per-row delimiter pairs
 * @param delimiter_count Number of delimiter occurrences to skip; the sign picks the direction
 * @param start_char_pos Per-row start positions (read and written)
 * @param end_char_pos Per-row end positions (written)
 * @param stream CUDA stream the kernel is launched on
 */
template <typename DelimiterItrT>
void compute_substring_indices(column_device_view const& d_column,
                               DelimiterItrT const delim_itr,
                               size_type delimiter_count,
                               size_type* start_char_pos,
                               size_type* end_char_pos,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref)
{
  auto const num_rows = d_column.size();

  thrust::for_each_n(
    rmm::exec_policy(stream),
    thrust::make_counting_iterator<size_type>(0),
    num_rows,
    [d_column, delim_itr, delimiter_count, start_char_pos, end_char_pos] __device__(
      size_type row) {
      auto const& delim_entry = delim_itr[row];
      auto const& delim       = delim_entry.first;

      // Null input row, invalid delimiter or empty delimiter: keep the pre-filled positions.
      if (d_column.is_null(row)) { return; }
      if (!delim_entry.second) { return; }
      if (delim.empty()) { return; }

      auto const& str = d_column.element<string_view>(row);
      // An empty input string also keeps the pre-filled positions.
      if (str.empty()) { return; }

      auto const str_chars   = str.length();
      auto const delim_chars = delim.length();

      // Default to "keep through the end of the string"; this is what remains in place when
      // fewer delimiters than requested are found.
      end_char_pos[row] = str_chars;

      if (delimiter_count > 0) {
        // Scan left to right; each search resumes just past the previous match.
        size_type search_from = start_char_pos[row];
        size_type match_pos   = -1;
        for (size_type n = 0; n < delimiter_count; ++n) {
          match_pos = str.find(delim, search_from);
          if (match_pos == string_view::npos) { return; }
          search_from = match_pos + delim_chars;
        }
        // Keep everything before the last delimiter that was matched.
        end_char_pos[row] = match_pos;
      } else {
        // Scan right to left; each search is limited to the text before the previous match.
        auto const num_searches = -delimiter_count;
        size_type search_until  = str_chars;
        for (size_type n = 0; n < num_searches; ++n) {
          auto const match_pos = str.rfind(delim, 0, search_until);
          if (match_pos == string_view::npos) { return; }
          search_until = match_pos;
        }
        // Keep everything after the last delimiter that was matched.
        start_char_pos[row] = search_until + delim_chars;
      }
    });
}
|
||
} // namespace | ||
|
||
/**
 * @brief Builds per-row start/end character positions and slices the input with them.
 *
 * Both position columns start out filled with zeros. When `count` is non-zero they are
 * updated by compute_substring_indices before being handed to cudf::strings::slice_strings.
 *
 * @tparam DelimiterItrT Iterator yielding a (string_view, validity) pair per row
 * @param strings Input strings column
 * @param delimiter_itr Iterator over the per-row delimiter pairs
 * @param count Number of delimiter occurrences; 0 leaves both position columns at zero
 * @param stream CUDA stream used for allocations and kernel launches
 * @param mr Memory resource forwarded to the slicing call that builds the result
 * @return New strings column produced by slice_strings
 */
template <typename DelimiterItrT>
std::unique_ptr<column> substring_index(strings_column_view const& strings,
                                        DelimiterItrT const delimiter_itr,
                                        size_type count,
                                        rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
{
  auto const num_rows = strings.size();
  // Nothing to slice: hand back an empty strings column.
  if (num_rows == 0) { return make_empty_column(type_id::STRING); }

  // The position columns are temporaries, so they use the current device resource
  // rather than `mr`.
  auto const make_zeroed_positions = [&]() {
    return make_column_from_scalar(numeric_scalar<size_type>(0, true, stream),
                                   num_rows,
                                   stream,
                                   rmm::mr::get_current_device_resource());
  };
  auto begin_positions = make_zeroed_positions();
  auto end_positions   = make_zeroed_positions();

  auto* const d_begin = begin_positions->mutable_view().data<size_type>();
  auto* const d_end   = end_positions->mutable_view().data<size_type>();

  auto const d_strings_owner = column_device_view::create(strings.parent(), stream);
  auto const d_strings       = *d_strings_owner;

  // With a zero count the positions stay at [0, 0), i.e. every output row is an empty string.
  if (count != 0) {
    compute_substring_indices(d_strings, delimiter_itr, count, d_begin, d_end, stream, mr);
  }

  return cudf::strings::slice_strings(
    strings, begin_positions->view(), end_positions->view(), stream, mr);
}
|
||
} // namespace detail | ||
|
||
// external API | ||
|
||
/**
 * @brief Public entry point: applies substring_index with one scalar delimiter for all rows.
 *
 * Wraps the scalar in a pair iterator so the detail implementation sees a delimiter and its
 * validity for every row, and runs on the default cudf stream.
 *
 * @param strings Input strings column
 * @param delimiter Delimiter scalar shared by every row
 * @param count Number of delimiter occurrences forwarded to the detail implementation
 * @param mr Memory resource forwarded to the detail implementation
 * @return New strings column with the selected substrings
 */
std::unique_ptr<column> substring_index(strings_column_view const& strings,
                                        string_scalar const& delimiter,
                                        size_type count,
                                        rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();
  auto const delimiter_pairs = cudf::detail::make_pair_iterator<string_view>(delimiter);
  auto const stream          = cudf::get_default_stream();
  return detail::substring_index(strings, delimiter_pairs, count, stream, mr);
}
} // namespace spark_rapids_jni |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,40 @@ | ||
/* | ||
* Copyright (c) 2019-2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
#pragma once | ||
|
||
#include <cudf/column/column.hpp> | ||
#include <cudf/scalar/scalar.hpp> | ||
#include <cudf/strings/strings_column_view.hpp> | ||
|
||
#include <rmm/mr/device/per_device_resource.hpp> | ||
|
||
namespace spark_rapids_jni { | ||
|
||
/** | ||
* @brief Returns, for each row, the part of the string selected by counting occurrences of | ||
* a delimiter. | ||
* | ||
* - `count > 0`: everything before the `count`-th occurrence of `delimiter`, counting from | ||
*   the left. | ||
* - `count < 0`: everything after the `|count|`-th occurrence of `delimiter`, counting from | ||
*   the right. | ||
* - `count == 0`: the output rows are empty strings. | ||
* | ||
* If a row contains fewer than `|count|` occurrences of the delimiter, the whole string is | ||
* returned for that row. An invalid or empty delimiter produces empty strings. | ||
* NOTE(review): null input rows are expected to stay null in the output; this relies on the | ||
* slicing call used by the implementation, so confirm there. | ||
* | ||
* @param strings Strings column | ||
* @param delimiter The delimiter string used to slice each string | ||
* @param count Which occurrence of the delimiter to cut at; the sign selects the direction | ||
* @param mr Device memory resource forwarded to the call that builds the returned column | ||
* @return A new strings column holding the result | ||
*/ | ||
std::unique_ptr<cudf::column> substring_index( | ||
cudf::strings_column_view const& strings, | ||
cudf::string_scalar const& delimiter, | ||
cudf::size_type count, | ||
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource()); | ||
|
||
} // namespace spark_rapids_jni |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,54 @@ | ||
/* | ||
* Copyright (c) 2019-2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
#include <cudf_test/base_fixture.hpp> | ||
#include <cudf_test/column_utilities.hpp> | ||
#include <cudf_test/column_wrapper.hpp> | ||
|
||
#include <cudf/column/column_view.hpp> | ||
#include <cudf/scalar/scalar.hpp> | ||
#include <cudf/strings/slice.hpp> | ||
#include <cudf/strings/strings_column_view.hpp> | ||
|
||
#include <thrust/host_vector.h> | ||
#include <thrust/iterator/transform_iterator.h> | ||
#include <thrust/sequence.h> | ||
|
||
#include <substring_index.hpp> | ||
|
||
#include <string> | ||
#include <vector> | ||
|
||
using namespace cudf; | ||
|
||
struct SubstringIndexTests : public test::BaseFixture {}; | ||
|
||
TEST_F(SubstringIndexTests, ScalarDelimiter) | ||
{ | ||
auto col0 = test::strings_column_wrapper({"www.yahoo.com", | ||
"www.apache..org", | ||
"tennis...com", | ||
"nvidia....com", | ||
"google...........com", | ||
"microsoft...c.....co..m"}); | ||
|
||
auto exp_results = test::strings_column_wrapper( | ||
{"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."}); | ||
|
||
auto results = | ||
spark_rapids_jni::substring_index(strings_column_view{col0}, string_scalar("."), 3); | ||
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, exp_results); | ||
} |
31 changes: 31 additions & 0 deletions
31
src/main/java/com/nvidia/spark/rapids/jni/GpuSubstringIndexUtils.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
/* | ||
* Copyright (c) 2022-2024, NVIDIA CORPORATION. | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package com.nvidia.spark.rapids.jni; | ||
|
||
import ai.rapids.cudf.*; | ||
|
||
public class GpuSubstringIndexUtils { | ||
static{ | ||
NativeDepsLoader.loadNativeDeps(); | ||
} | ||
|
||
public static ColumnVector substringIndex(ColumnView cv, Scalar delimiter, int count){ | ||
return new ColumnVector(substringIndex(cv.getNativeView(), CudfAccessor.getScalarHandle(delimiter), count)); | ||
} | ||
|
||
private static native long substringIndex(long columnView, long delimiter, int count) throws CudfException; | ||
} |
Oops, something went wrong.