diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py index 75522aa9ae5..81fe65027e2 100644 --- a/integration_tests/src/main/python/string_test.py +++ b/integration_tests/src/main/python/string_test.py @@ -881,6 +881,7 @@ def test_character_classes(): 'rlike(a, "[\n-\\]")', 'rlike(a, "[+--]")', 'regexp_extract(a, "[123]", 0)', + 'regexp_replace(a, "[\\\\0101-\\\\0132]", "@")', 'regexp_replace(a, "[\\\\x41-\\\\x5a]", "@")', ), conf=_regexp_conf) @@ -959,7 +960,9 @@ def test_regexp_octal_digits(): 'rlike(a, "\\\\0177")', 'rlike(a, "\\\\0200")', 'rlike(a, "\\\\0101")', + 'rlike(a, "[\\\\0240-\\\\0377]")', 'regexp_extract(a, "([a-d]+)\\\\0240([a-d]+)", 1)', + 'regexp_extract(a, "([a-d]+)[\\\\0141-\\\\0172]([a-d]+)", 0)', 'regexp_replace(a, "\\\\0377", "")', 'regexp_replace(a, "\\\\0260", "")', ), diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala index 4d04a551e81..b628ef3eec5 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala @@ -172,8 +172,9 @@ class RegexParser(pattern: String) { case Some('x') => consumeExpected('x') RegexChar(Integer.parseInt(parseHexDigit.a, 16).toChar) - case Some('0') => throw new RegexUnsupportedException( - "cuDF does not support octal digits in character classes") + case Some('0') => + consumeExpected('0') + RegexChar(Integer.parseInt(parseOctalDigit.a, 8).toChar) case Some(ch) => consumeExpected(ch) match { // List of character literals with an escape from here, under "Characters" @@ -933,12 +934,6 @@ class CudfRegexTranspiler(mode: RegexMode) { // - "[a[]" should match the literal characters "a" and "[" // - "[a-b[c-d]]" is supported by Java but not cuDF throw new RegexUnsupportedException("nested character classes are not supported") - case RegexEscaped(ch) if ch == '0' => - // see https://github.com/NVIDIA/spark-rapids/issues/4862 - // examples - // - "[\02] should match the character with code point 2" - throw new RegexUnsupportedException( - "cuDF does not support octal digits in character classes") case _ => } val components: Seq[RegexCharacterClassComponent] = characters diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala index 6105b81dc1b..2c44fb6160d 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionTranspilerSuite.scala @@ -150,16 +150,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { "cuDF does not support null characters in regular expressions")) } - test("cuDF does not support octal digits in character classes") { - // see https://github.com/NVIDIA/spark-rapids/issues/4862 - val patterns = Seq(raw"[\02]", raw"[\012]", raw"[\0177]") - patterns.foreach(pattern => - assertUnsupported(pattern, RegexFindMode, - "cuDF does not support octal digits in character classes" - ) - ) - } - test("octal digits - find") { val patterns = Seq(raw"\07", raw"\077", raw"\0177", raw"\01772", raw"\0200", raw"\0376", raw"\0377", raw"\02002") @@ -167,6 +157,12 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm { "\u0007\u003f\u007f", "\u007f", "\u0080", "a\u00fe\u00ffb", "\u007f2")) } + test("octal digit character classes") { + val patterns = Seq(raw"[\02]", raw"[\012]", raw"[\0177]", raw"[a-\0377]", raw"[\01-\0777]") + val inputs = Seq("", "\u0002", "a\u0012b\n\u0177c", "a[+\u00fe23z") + assertCpuGpuMatchesRegexpFind(patterns, inputs) + } + test("hex digits - find") { val patterns = Seq(raw"\x07", raw"\x3f", raw"\x7F", raw"\x7f", raw"\x{7}", raw"\x{0007f}", raw"\x80", raw"\xff", raw"\x{0008f}", raw"\x{10FFFF}", raw"\x{00eeee}")