Skip to content

Commit

Permalink
implement kernel for substring_index (#2205)
Browse files Browse the repository at this point in the history
* draft

Signed-off-by: fejiang <fejiang@nvidia.com>

* jni binding

Signed-off-by: fejiang <fejiang@nvidia.com>

* some change

Signed-off-by: fejiang <fejiang@nvidia.com>

* some change

Signed-off-by: fejiang <fejiang@nvidia.com>

* test files for substringindex

Signed-off-by: fejiang <fejiang@nvidia.com>

* remove unwanted files

Signed-off-by: fejiang <fejiang@nvidia.com>

* format it

Signed-off-by: fejiang <fejiang@nvidia.com>

* modified ColumnView.java adding methods

Signed-off-by: fejiang <fejiang@nvidia.com>

* modified ColumnView.java

Signed-off-by: fejiang <fejiang@nvidia.com>

* modified

Signed-off-by: fejiang <fejiang@nvidia.com>

* substringIndex.java

Signed-off-by: fejiang <fejiang@nvidia.com>

* add java test

Signed-off-by: fejiang <fejiang@nvidia.com>

* add java test

Signed-off-by: fejiang <fejiang@nvidia.com>

* added java test

Signed-off-by: fejiang <fejiang@nvidia.com>

* rename gpusubstringindex

Signed-off-by: fejiang <fejiang@nvidia.com>

* name changed

Signed-off-by: fejiang <fejiang@nvidia.com>

* add one more test case

Signed-off-by: fejiang <fejiang@nvidia.com>

* cudf update

Signed-off-by: fejiang <fejiang@nvidia.com>

* cudf conflict resolve

Signed-off-by: fejiang <fejiang@nvidia.com>

* cpp test added

Signed-off-by: fejiang <fejiang@nvidia.com>

* remove unwanted code

Signed-off-by: fejiang <fejiang@nvidia.com>

* cudf updated and clang format

Signed-off-by: fejiang <fejiang@nvidia.com>

* removed comments & dealing with formatting

Signed-off-by: fejiang <fejiang@nvidia.com>

* CMakeLists formatting

Signed-off-by: fejiang <fejiang@nvidia.com>

* testcases added

Signed-off-by: fejiang <fejiang@nvidia.com>

* lexicographic order in CMakeLists

Signed-off-by: fejiang <fejiang@nvidia.com>

* change delimiter type to Scalar

Signed-off-by: fejiang <fejiang@nvidia.com>

* format

Signed-off-by: fejiang <fejiang@nvidia.com>

* reuse slice_strings function

Signed-off-by: fejiang <fejiang@nvidia.com>

---------

Signed-off-by: fejiang <fejiang@nvidia.com>
  • Loading branch information
Feng-Jiang28 authored Jul 18, 2024
1 parent 4b9ca1f commit 525eca0
Show file tree
Hide file tree
Showing 8 changed files with 410 additions and 0 deletions.
2 changes: 2 additions & 0 deletions src/main/cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@ add_library(
src/RegexRewriteUtilsJni.cpp
src/RowConversionJni.cpp
src/SparkResourceAdaptorJni.cpp
src/SubStringIndexJni.cpp
src/ZOrderJni.cpp
src/bloom_filter.cu
src/case_when.cu
Expand All @@ -214,6 +215,7 @@ add_library(
src/parse_uri.cu
src/regex_rewrite_utils.cu
src/row_conversion.cu
src/substring_index.cu
src/timezones.cu
src/utilities.cu
src/xxhash64.cu
Expand Down
37 changes: 37 additions & 0 deletions src/main/cpp/src/SubStringIndexJni.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "cudf_jni_apis.hpp"
#include "substring_index.hpp"

extern "C" {

/**
 * JNI entry point backing `GpuSubstringIndexUtils.substringIndex` on the Java side.
 *
 * @param env            JNI environment
 * @param strings_handle address of a native `cudf::column_view` holding the input strings
 * @param delimiter      address of a native `cudf::string_scalar` holding the delimiter
 * @param count          delimiter occurrence count forwarded to `substring_index`
 * @return handle of the newly created result column; the null-check and catch macros are
 *         given 0 as their fallback value
 */
JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_GpuSubstringIndexUtils_substringIndex(
  JNIEnv* env, jclass, jlong strings_handle, jlong delimiter, jint count)
{
  // Validate both native handles before doing any device work.
  JNI_NULL_CHECK(env, strings_handle, "strings column handle is null", 0);
  JNI_NULL_CHECK(env, delimiter, "delimiter scalar handle is null", 0);
  try {
    cudf::jni::auto_set_device(env);

    // The jlong arguments are raw native addresses; reinterpret them as the cudf objects
    // they point to.
    auto const* const column_ptr = reinterpret_cast<cudf::column_view const*>(strings_handle);
    cudf::strings_column_view const input_strings{*column_ptr};
    auto* const delimiter_scalar = reinterpret_cast<cudf::string_scalar*>(delimiter);

    // Ownership of the resulting column is released to the caller as a jlong handle.
    return cudf::jni::release_as_jlong(
      spark_rapids_jni::substring_index(input_strings, *delimiter_scalar, count));
  }
  CATCH_STD(env, 0);
}
} // extern "C"
166 changes: 166 additions & 0 deletions src/main/cpp/src/substring_index.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "substring_index.hpp"

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/detail/iterator.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/strings/detail/strings_children.cuh>
#include <cudf/strings/slice.hpp>
#include <cudf/strings/string_view.cuh>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

using namespace cudf;

namespace spark_rapids_jni {

namespace detail {

namespace {

/**
 * @brief Compute, for every row, the character range that `substring_index` should keep.
 *
 * For each non-null, non-empty row with a valid, non-empty delimiter this writes the
 * half-open character range `[start_char_pos[idx], end_char_pos[idx])`:
 *  - `delimiter_count > 0`: search left to right; the range ends at the position of the
 *    `delimiter_count`-th delimiter and keeps the incoming start position.
 *  - `delimiter_count < 0`: search right to left; the range starts right after the
 *    `|delimiter_count|`-th delimiter from the end and ends at the end of the string.
 *  - If the row holds fewer delimiters than requested, the range ends at the end of the
 *    string and the incoming start position is kept.
 *
 * Rows that are null or empty, and rows whose delimiter is invalid or empty, are not written
 * at all; they keep whatever values the caller stored in the two output arrays.
 *
 * @tparam DelimiterItrT Iterator yielding `(string_view, bool)` pairs: the delimiter for a
 *         row and whether it is valid.
 * @param d_column Device view of the input strings column
 * @param delim_itr Delimiter pair iterator, indexed by row
 * @param delimiter_count Signed number of delimiter occurrences to skip
 * @param start_char_pos In/out device array of per-row start positions; the incoming value is
 *        used as the initial search offset for left-to-right searches
 * @param end_char_pos Out device array of per-row end positions
 * @param stream CUDA stream the kernel is launched on
 */
template <typename DelimiterItrT>
void compute_substring_indices(column_device_view const& d_column,
                               DelimiterItrT const delim_itr,
                               size_type delimiter_count,
                               size_type* start_char_pos,
                               size_type* end_char_pos,
                               rmm::cuda_stream_view stream,
                               rmm::device_async_resource_ref)
{
  auto const strings_count = d_column.size();

  thrust::for_each_n(
    rmm::exec_policy(stream),
    thrust::make_counting_iterator<size_type>(0),
    strings_count,
    [delim_itr, delimiter_count, start_char_pos, end_char_pos, d_column] __device__(size_type idx) {
      auto const& delim_val_pair = delim_itr[idx];
      // Only meaningful when delim_val_pair.second (validity) is true; checked below.
      auto const& delim_val = delim_val_pair.first;

      // Null rows and rows with an invalid or empty delimiter are left untouched, so the
      // positions pre-filled by the caller decide their output.
      if (d_column.is_null(idx) || !delim_val_pair.second || delim_val.empty()) return;
      auto const& col_val = d_column.element<string_view>(idx);

      // An empty row value also keeps the caller-provided positions.
      if (!col_val.empty()) {
        auto const col_val_len   = col_val.length();
        auto const delimiter_len = delim_val.length();

        bool const left_to_right = (delimiter_count > 0);
        // Widen before negating: `-delimiter_count` evaluated in size_type overflows
        // (undefined behavior) when delimiter_count is the most negative representable value.
        auto const wide_count = static_cast<int64_t>(delimiter_count);
        auto const nsearches  = left_to_right ? wide_count : -wide_count;

        size_type start_pos = start_char_pos[idx];
        size_type end_pos   = col_val_len;
        size_type char_pos  = -1;

        // Default to "keep through the end of the string"; this is the final answer when
        // fewer than nsearches delimiters are found.
        end_char_pos[idx] = col_val_len;

        // NOTE(review): each search resumes past the whole previous match, so occurrences
        // never overlap (e.g. "aa" in "aaaa"). Confirm this matches the intended Spark
        // semantics for self-overlapping delimiters.
        for (int64_t i = 0; i < nsearches; ++i) {
          char_pos = left_to_right ? col_val.find(delim_val, start_pos)
                                   : col_val.rfind(delim_val, 0, end_pos);
          // Not enough delimiters in this row: keep the defaults set above.
          if (char_pos == string_view::npos) return;
          if (left_to_right)
            start_pos = char_pos + delimiter_len;
          else
            end_pos = char_pos;
        }
        if (left_to_right)
          end_char_pos[idx] = char_pos;  // stop right before the last delimiter found
        else
          start_char_pos[idx] = end_pos + delimiter_len;  // begin right after it
      }
    });
}

} // namespace

/**
 * @brief Stream-ordered implementation of `substring_index`.
 *
 * Builds two zero-filled position columns (start and stop, one entry per row), lets
 * `compute_substring_indices` fill them in when `count` is non-zero, and then slices the
 * input with those positions.
 *
 * @tparam DelimiterItrT Iterator yielding `(string_view, bool)` delimiter/validity pairs
 * @param strings Input strings column
 * @param delimiter_itr Delimiter pair iterator, indexed by row
 * @param count Signed number of delimiter occurrences; 0 leaves all positions at zero
 * @param stream CUDA stream used for allocations and kernel launches
 * @param mr Memory resource forwarded to the final slice that builds the result
 * @return New strings column produced by `cudf::strings::slice_strings`
 */
template <typename DelimiterItrT>
std::unique_ptr<column> substring_index(strings_column_view const& strings,
                                        DelimiterItrT const delimiter_itr,
                                        size_type count,
                                        rmm::cuda_stream_view stream,
                                        rmm::device_async_resource_ref mr)
{
  auto const num_rows = strings.size();

  // No rows in, no rows out.
  if (num_rows == 0) { return make_empty_column(type_id::STRING); }

  // Scratch columns holding the per-row slice bounds, both starting out as all zeros.
  // They are temporaries, so they are allocated from the current device resource and not
  // from `mr`.
  auto start_positions = make_column_from_scalar(numeric_scalar<size_type>(0, true, stream),
                                                 num_rows,
                                                 stream,
                                                 rmm::mr::get_current_device_resource());
  auto stop_positions  = make_column_from_scalar(numeric_scalar<size_type>(0, true, stream),
                                                 num_rows,
                                                 stream,
                                                 rmm::mr::get_current_device_resource());

  auto d_starts = start_positions->mutable_view().data<size_type>();
  auto d_stops  = stop_positions->mutable_view().data<size_type>();

  auto device_view_owner = column_device_view::create(strings.parent(), stream);
  auto d_strings         = *device_view_owner;

  // With a count of 0 the bounds stay at [0, 0), which is meant to produce empty strings;
  // otherwise let the kernel work out the real bounds.
  if (count != 0) {
    compute_substring_indices(d_strings, delimiter_itr, count, d_starts, d_stops, stream, mr);
  }

  return cudf::strings::slice_strings(
    strings, start_positions->view(), stop_positions->view(), stream, mr);
}

} // namespace detail

// external API

/**
 * @brief Public entry point: applies `substring_index` to every row of `strings` using a
 * single scalar delimiter.
 *
 * Runs on the cudf default stream and delegates to `detail::substring_index`.
 *
 * @param strings Input strings column
 * @param delimiter Scalar delimiter used for all rows
 * @param count Signed number of delimiter occurrences
 * @param mr Memory resource forwarded to the detail implementation
 * @return New strings column with the result
 */
std::unique_ptr<column> substring_index(strings_column_view const& strings,
                                        string_scalar const& delimiter,
                                        size_type count,
                                        rmm::device_async_resource_ref mr)
{
  CUDF_FUNC_RANGE();

  // Expose the scalar through a pair iterator so the detail code can index it by row and
  // read both the delimiter value and its validity.
  auto const delimiter_pairs = cudf::detail::make_pair_iterator<string_view>(delimiter);
  auto const stream          = cudf::get_default_stream();

  return detail::substring_index(strings, delimiter_pairs, count, stream, mr);
}
} // namespace spark_rapids_jni
40 changes: 40 additions & 0 deletions src/main/cpp/src/substring_index.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <rmm/mr/device/per_device_resource.hpp>

namespace spark_rapids_jni {

/**
 * @brief Returns, for each row, the substring before the `count`-th occurrence of `delimiter`.
 *
 * When `count` is positive, occurrences are counted from the left and everything before the
 * `count`-th delimiter is returned. When `count` is negative, occurrences are counted from
 * the right and everything after the `|count|`-th delimiter is returned. A non-empty row
 * that contains fewer than `|count|` occurrences of a non-empty delimiter is returned whole.
 * A `count` of 0 is intended to produce empty strings.
 *
 * @param strings Strings column
 * @param delimiter The delimiter string used to slice string
 * @param count Specify the occurrence of the delimiter; the sign selects the search direction
 * @param mr Device memory resource; forwarded to the slice operation that builds the result
 * @return A string column used to store the result
 */
std::unique_ptr<cudf::column> substring_index(
cudf::strings_column_view const& strings,
cudf::string_scalar const& delimiter,
cudf::size_type count,
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

} // namespace spark_rapids_jni
4 changes: 4 additions & 0 deletions src/main/cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -77,3 +77,7 @@ ConfigureTest(UTILITIES

ConfigureTest(PARSE_URI
parse_uri.cpp)

# Registers the SUBSTRING_INDEX test target, built from substring_index.cpp.
ConfigureTest(SUBSTRING_INDEX
substring_index.cpp)

54 changes: 54 additions & 0 deletions src/main/cpp/tests/substring_index.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>

#include <cudf/column/column_view.hpp>
#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/slice.hpp>
#include <cudf/strings/strings_column_view.hpp>

#include <thrust/host_vector.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/sequence.h>

#include <substring_index.hpp>

#include <string>
#include <vector>

using namespace cudf;

// Test fixture for the substring_index tests; adds nothing on top of cudf's BaseFixture.
struct SubstringIndexTests : public test::BaseFixture {};

// Splits on a single scalar delimiter "." with a positive count of 3: each row must be cut
// right before its third '.', and a row with only two dots must come back unchanged.
TEST_F(SubstringIndexTests, ScalarDelimiter)
{
  test::strings_column_wrapper input({"www.yahoo.com",
                                      "www.apache..org",
                                      "tennis...com",
                                      "nvidia....com",
                                      "google...........com",
                                      "microsoft...c.....co..m"});

  test::strings_column_wrapper expected(
    {"www.yahoo.com", "www.apache.", "tennis..", "nvidia..", "google..", "microsoft.."});

  auto const actual =
    spark_rapids_jni::substring_index(strings_column_view{input}, string_scalar("."), 3);

  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*actual, expected);
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
/*
* Copyright (c) 2022-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.jni;

import ai.rapids.cudf.*;

/**
 * Java-side binding for the native substring_index kernel.
 */
public class GpuSubstringIndexUtils {
  static {
    // Make sure the native library is loaded before any native method is called.
    NativeDepsLoader.loadNativeDeps();
  }

  /**
   * Applies substring_index to every row of a strings column.
   *
   * @param cv        column view holding the input strings
   * @param delimiter scalar holding the delimiter
   * @param count     delimiter occurrence count passed through to the native code
   * @return a new ColumnVector wrapping the handle returned by the native call
   */
  public static ColumnVector substringIndex(ColumnView cv, Scalar delimiter, int count) {
    long columnHandle = cv.getNativeView();
    long delimiterHandle = CudfAccessor.getScalarHandle(delimiter);
    long resultHandle = substringIndex(columnHandle, delimiterHandle, count);
    return new ColumnVector(resultHandle);
  }

  /** Native implementation; takes and returns raw native handles. */
  private static native long substringIndex(long columnView, long delimiter, int count)
      throws CudfException;
}
Loading

0 comments on commit 525eca0

Please sign in to comment.