Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Patterns such as (3?)+ should now fall back to CPU #4715

Merged
merged 10 commits into from
Feb 18, 2022
Original file line number Diff line number Diff line change
Expand Up @@ -610,6 +610,23 @@ class CudfRegexTranspiler(replace: Boolean) {
// example: "a*+"
throw new RegexUnsupportedException(nothingToRepeat)

case (RegexGroup(capture, term), SimpleQuantifier(ch)) if "+*".contains(ch) =>
// example: "(3?)+"
/**
 * Returns true when `e` is a repetition whose repeated term is a
 * [[RegexCharacterClassComponent]] (for example the `3?` inside `(3?)+`),
 * either directly or wrapped in a sequence that has exactly one part.
 *
 * @param e the already-rewritten AST node to inspect
 * @return true for a simple repetition as described above, false otherwise
 */
def isSimpleRepetition(e: RegexAST): Boolean = e match {
  // only the repeated term is inspected; the quantifier is not looked at
  case RegexRepetition(_: RegexCharacterClassComponent, _) => true
  // a one-element sequence is only a wrapper, so look through it
  case RegexSequence(parts) if parts.length == 1 => isSimpleRepetition(parts.head)
  case _ => false
}
val tr = rewrite(term)
if (isSimpleRepetition(tr)) {
// perhaps we could rewrite it here
throw new RegexUnsupportedException(nothingToRepeat)
}
RegexRepetition(RegexGroup(capture, tr), quantifier)
case _ =>
RegexRepetition(rewrite(base), quantifier)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,13 @@ class RegularExpressionParserSuite extends FunSuite {
RegexSequence(ListBuffer(RegexOctalChar("47"), RegexChar('7'))))
}

test("repetition with group containing simple repetition") {
  // "(3?)+" parses as: a '+' repetition of a capture group that holds "3?"
  val optionalThree = RegexRepetition(RegexChar('3'), SimpleQuantifier('?'))
  val capturingGroup = RegexGroup(capture = true, RegexSequence(ListBuffer(optionalThree)))
  val expected = RegexSequence(ListBuffer(
    RegexRepetition(capturingGroup, SimpleQuantifier('+'))))
  assert(parse("(3?)+") === expected)
}

test("group containing choice with repetition") {
assert(parse("(\t+|a)") == RegexSequence(ListBuffer(
RegexGroup(capture = true, RegexChoice(RegexSequence(ListBuffer(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,13 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
)
}

test("cuDF does not support single repetition both inside and outside of capture groups") {
  // see https://github.com/NVIDIA/spark-rapids/issues/4487
  // each pattern repeats a capture group whose only content is itself a repetition
  val patterns = Seq("(3?)+", "(3?)*", "(3*)+")
  patterns.foreach { pattern =>
    // the transpiler is expected to reject these with the "nothing to repeat" message
    assertUnsupported(pattern, replace = false, "nothing to repeat")
  }
}

test("cuDF does not support OR at BOL / EOL") {
val patterns = Seq("$|a", "^|a")
patterns.foreach(pattern => {
Expand Down