Skip to content

Commit

Permalink
Add support for regular expressions containing octal digits greater than `\200` (#5443)
Browse files Browse the repository at this point in the history

* Convert octal digits \200 and above to direct unicode char

Signed-off-by: Anthony Chang <antchang@nvidia.com>

* Add integration test

Signed-off-by: Anthony Chang <antchang@nvidia.com>
  • Loading branch information
anthony-chang authored May 12, 2022
1 parent fa3e435 commit 456dfbe
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 17 deletions.
13 changes: 13 additions & 0 deletions integration_tests/src/main/python/string_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -867,6 +867,19 @@ def test_regexp_whitespace():
),
conf=_regexp_conf)

# Integration test for octal escapes in regular expressions: runs rlike,
# regexp_extract and regexp_replace with octal-escaped patterns through
# assert_gpu_and_cpu_are_equal_collect using the shared _regexp_conf.
# The octal values used sit on both sides of the 0o177 / 0o200 boundary.
def test_regexp_octal_digits():
# Input strings embed U+0000, U+0041, U+007F, U+0080 and U+00FF literally.
# NOTE(review): the trailing class is presumably meant to produce characters
# in the 0xa0-0xb0 range -- confirm how mk_str_gen interprets the doubled backslashes.
gen = mk_str_gen('[abcd]\u0000\u0041\u007f\u0080\u00ff[\\\\xa0-\\\\xb0][abcd]')
assert_gpu_and_cpu_are_equal_collect(
lambda spark: unary_op_df(spark, gen).selectExpr(
# octal 177 = 127 (U+007F)
'rlike(a, "\\\\0177")',
# octal 200 = 128 (U+0080)
'rlike(a, "\\\\0200")',
# octal 101 = 65 (U+0041, 'A')
'rlike(a, "\\\\0101")',
# octal 240 = 160 (0xa0)
'regexp_extract(a, "([a-d]+)\\\\0240([a-d]+)", 1)',
# octal 377 = 255 (U+00FF)
'regexp_replace(a, "\\\\0377", "")',
# octal 260 = 176 (0xb0)
'regexp_replace(a, "\\\\0260", "")',
),
conf=_regexp_conf)

def test_rlike():
gen = mk_str_gen('[abcd]{1,3}')
assert_gpu_and_cpu_are_equal_collect(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -620,12 +620,12 @@ class CudfRegexTranspiler(mode: RegexMode) {
} else {
digits
}
if (Integer.parseInt(octal, 8) >= 128) {
// see https://github.com/NVIDIA/spark-rapids/issues/4746
throw new RegexUnsupportedException(
"cuDF does not support octal digits 0o177 < n <= 0o377")
val codePoint = Integer.parseInt(octal, 8)
if (codePoint >= 128) {
RegexChar(codePoint.toChar)
} else {
RegexOctalChar(octal)
}
RegexOctalChar(octal)

case RegexHexDigit(digits) =>
val codePoint = Integer.parseInt(digits, 16)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -142,14 +142,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
"cuDF does not support null characters in regular expressions"))
}

test("cuDF does not support octal digits 0o177 < n <= 0o377") {
// see https://github.com/NVIDIA/spark-rapids/issues/4746
val patterns = Seq(raw"\0200", raw"\0377")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexFindMode,
"cuDF does not support octal digits 0o177 < n <= 0o377"))
}

test("cuDF does not support octal digits in character classes") {
// see https://github.com/NVIDIA/spark-rapids/issues/4862
val patterns = Seq(raw"[\02]", raw"[\012]", raw"[\0177]")
Expand All @@ -170,10 +162,11 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
)
}

test("octal digits < 0o177 - find") {
val patterns = Seq(raw"\07", raw"\077", raw"\0177", raw"\01772")
assertCpuGpuMatchesRegexpFind(patterns, Seq("", "\u0007", "a\u0007b",
"\u0007\u003f\u007f", "\u007f", "\u007f2"))
// Passes every octal-escape pattern and every input string to
// assertCpuGpuMatchesRegexpFind (by its name, a CPU-vs-GPU comparison of find
// results -- helper not visible here, confirm).
// Patterns cover values up to 0o177 (\07, \077, \0177) and values from 0o200 to
// 0o377 (\0200, \0376, \0377). \01772 and \02002 carry an extra trailing digit,
// presumably read as the three-digit escape followed by a literal '2' -- confirm.
test("octal digits - find") {
val patterns = Seq(raw"\07", raw"\077", raw"\0177", raw"\01772", raw"\0200",
raw"\0376", raw"\0377", raw"\02002")
// Inputs include the empty string plus strings containing U+0007, U+003F, U+007F,
// U+0080, U+00FE and U+00FF.
assertCpuGpuMatchesRegexpFind(patterns, Seq("", "\u0007", "a\u0007b", "a\u007fb",
"\u0007\u003f\u007f", "\u007f", "\u0080", "a\u00fe\u00ffb", "\u007f2"))
}

test("hex digits - find") {
Expand Down

0 comments on commit 456dfbe

Please sign in to comment.