Skip to content

Commit

Permalink
Exclude all unicode line terminator characters from matching dot (#5424)
Browse files Browse the repository at this point in the history
* Exclude all unicode line terminator characters from matching dot

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Use existing terminatorChars seq, add unit tests

Signed-off-by: Anthony Chang <antchang@nvidia.com>
  • Loading branch information
anthony-chang authored May 6, 2022
1 parent c1aebfc commit 00282b3
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 8 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -565,7 +565,9 @@ class CudfRegexTranspiler(mode: RegexMode) {
case RegexChar(ch) => ch match {
case '.' =>
// workaround for https://github.com/rapidsai/cudf/issues/9619
RegexCharacterClass(negated = true, ListBuffer(RegexChar('\r'), RegexChar('\n')))
val terminatorChars = new ListBuffer[RegexCharacterClassComponent]()
terminatorChars ++= lineTerminatorChars.map(RegexChar)
RegexCharacterClass(negated = true, terminatorChars)
case '$' if mode == RegexSplitMode || mode == RegexReplaceMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/4533
throw new RegexUnsupportedException("line anchor $ is not supported in split or replace")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -277,10 +277,17 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
Seq("", "$", "$9.99"))
}

test("dot matches CR on GPU but not on CPU") {
// see https://github.com/rapidsai/cudf/issues/9619
val pattern = "1."
assertCpuGpuMatchesRegexpFind(Seq(pattern), Seq("1\r2", "1\n2", "1\r\n2"))
test("dot does not match all line terminators") {
  // see https://github.com/NVIDIA/spark-rapids/issues/5415
  // One plain control input ("123") followed by one input per line terminator sequence
  // placed between the digits, so `1.` is checked against each terminator in turn.
  val separators = Seq("\r", "\n", "\r\n", "\u0085", "\u2028", "\u2029")
  val inputs = "123" +: separators.map(sep => "1" + sep + "2")
  assertCpuGpuMatchesRegexpFind(Seq("1."), inputs)
}

test("dot does not match line terminator combinations") {
  // "abc" is the control input that `a.` matches. Every other input puts two line
  // terminators directly after the leading `a`, so java.util.regex (without DOTALL) does
  // not match `a.` there. The terminators covered are \n, \r, \u0085, \u2028 and \u2029.
  val pattern = Seq("a.")
  // The last input uses \u2028 (LINE SEPARATOR). It previously read \u2082 (SUBSCRIPT TWO),
  // an ordinary character, which left U+2028 untested in this combination test.
  val inputs = Seq("abc", "a\n\rb", "a\n\u0085b", "a\u2029\u0085b", "a\u2028\rb")
  assertCpuGpuMatchesRegexpFind(pattern, inputs)
}

test("character class with ranges") {
Expand All @@ -306,11 +313,11 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
"[0-9]{2}:[0-9]{2}:[0-9]{2})" +
"(.[1-9]*(?:0)?[1-9]+)?(.0*[1-9]+)?(?:.0*)?\\z"

// input and output should be identical except for `.` being replaced with `[^\r\n]` and
// `\z` being replaced with `$`
// input and output should be identical except for `.` being replaced
// with `[^\n\r\u0085\u2028\u2029]` and `\z` being replaced with `$`
doTranspileTest(TIMESTAMP_TRUNCATE_REGEX,
TIMESTAMP_TRUNCATE_REGEX
.replaceAll("\\.", "[^\r\n]")
.replaceAll("\\.", "[^\n\r\u0085\u2028\u2029]")
.replaceAll("\\\\z", "\\$"))
}

Expand Down

0 comments on commit 00282b3

Please sign in to comment.