Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix cudf::strings::replace_with_backrefs hang on empty match result #13418

Merged
merged 6 commits into from
May 25, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 9 additions & 8 deletions cpp/src/strings/replace/backref_re.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -61,14 +61,15 @@ struct backrefs_fn {
auto out_ptr = d_chars ? (d_chars + d_offsets[idx]) : nullptr;
size_type lpos = 0; // last byte position processed in d_str
size_type begin = 0; // first character position matching regex
size_type end = nchars; // last character position (exclusive)
size_type end = -1; // match through the end of the string

// copy input to output replacing strings as we go
while (prog.find(prog_idx, d_str, begin, end) > 0) // inits the begin/end vars
while ((begin <= nchars) &&
(prog.find(prog_idx, d_str, begin, end) > 0)) // inits the begin/end vars
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
{
auto spos = d_str.byte_offset(begin); // get offset for the
auto epos = d_str.byte_offset(end); // character position values;
nbytes += d_repl.size_bytes() - (epos - spos); // compute the output size
auto spos = d_str.byte_offset(begin); // get offset for the
auto epos = d_str.byte_offset(end); // character position values;
davidwendt marked this conversation as resolved.
Show resolved Hide resolved
nbytes += d_repl.size_bytes() - (epos - spos); // compute the output size

// copy the string data before the matched section
if (out_ptr) { out_ptr = copy_and_increment(out_ptr, in_ptr + lpos, spos - lpos); }
Expand All @@ -84,7 +85,7 @@ struct backrefs_fn {
}
// extract the specific group's string for this backref's index
auto extracted = prog.extract(prog_idx, d_str, begin, end, backref.first - 1);
if (!extracted || (extracted.value().second <= extracted.value().first)) {
if (!extracted || (extracted.value().second < extracted.value().first)) {
return; // no value for this backref number; that is ok
}
auto spos_extract = d_str.byte_offset(extracted.value().first); // convert
Expand All @@ -104,8 +105,8 @@ struct backrefs_fn {

// setup to match the next section
lpos = epos;
begin = end;
end = nchars;
begin = end + (begin == end);
end = -1;
}

// finally, copy remainder of input string
Expand Down
15 changes: 15 additions & 0 deletions cpp/tests/strings/replace_regex_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -354,6 +354,21 @@ TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexZeroIndexTest)
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

// https://github.com/rapidsai/cudf/issues/13404
TEST_F(StringsReplaceRegexTest, ReplaceBackrefsWithEmptyCapture)
{
cudf::test::strings_column_wrapper input({"one\ntwo", "three\n\n", "four\r\n"});
auto sv = cudf::strings_column_view(input);

auto pattern = std::string("(\r\n|\r)?$");
auto repl_template = std::string("[\\1]");

cudf::test::strings_column_wrapper expected({"one\ntwo[]", "three\n[]\n[]", "four[\r\n][]"});
auto prog = cudf::strings::regex_program::create(pattern);
auto results = cudf::strings::replace_with_backrefs(sv, *prog, repl_template);
CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*results, expected);
}

TEST_F(StringsReplaceRegexTest, ReplaceBackrefsRegexErrorTest)
{
cudf::test::strings_column_wrapper strings({"this string left intentionally blank"});
Expand Down