Skip to content

Commit

Permalink
Enable fuzz testing for Regular Expression repetitions and move remaining edge cases to CPU (#4885)
Browse files Browse the repository at this point in the history

* Handle remaining issues with repetitions and enable repetition fuzz tests

Signed-off-by: Navin Kumar <navink@nvidia.com>

* Add comment references to new issues created for string_split

Signed-off-by: Navin Kumar <navink@nvidia.com>
  • Loading branch information
NVnavkumar authored Mar 4, 2022
1 parent 5292cd5 commit 97ca886
Show file tree
Hide file tree
Showing 2 changed files with 37 additions and 17 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -708,15 +708,29 @@ class CudfRegexTranspiler(mode: RegexMode) {
throw new RegexUnsupportedException(
"regexp_replace on GPU does not support repetition with ? or *")

case (_, QuantifierVariableLength(0,None)) if mode == RegexReplaceMode =>
case (_, SimpleQuantifier(ch)) if mode == RegexSplitMode && "?*".contains(ch) =>
// example: pattern " ?", input "] b[", replace with "X":
// java: X]XXbX[X
// cuDF: XXXX] b[
// see https://github.com/NVIDIA/spark-rapids/issues/4884
throw new RegexUnsupportedException(
"regexp_split on GPU does not support repetition with ? or * consistently with Spark")

case (_, QuantifierVariableLength(0, _)) if mode == RegexReplaceMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/4468
throw new RegexUnsupportedException(
"regexp_replace on GPU does not support repetition with {0,}")
"regexp_replace on GPU does not support repetition with {0,} or {0,n}")

case (_, QuantifierVariableLength(0, _)) if mode == RegexSplitMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/4884
throw new RegexUnsupportedException(
"regexp_split on GPU does not support repetition with {0,} or {0,n} " +
"consistently with Spark")

case (_, QuantifierFixedLength(0)) | (_, QuantifierVariableLength(0, Some(0)))
case (_, QuantifierFixedLength(0))
if mode != RegexFindMode =>
throw new RegexUnsupportedException(
"regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}")
"regex_replace and regex_split on GPU do not support repetition with {0}")

case (RegexGroup(_, term), SimpleQuantifier(ch))
if "+*".contains(ch) && !isSupportedRepetitionBase(term) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("cuDF does not support single repetition both inside and outside of capture groups") {
// see https://github.com/NVIDIA/spark-rapids/issues/4487
val patterns = Seq("(3?)+", "(3?)*", "(3*)+", "((3?))+")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexFindMode, "nothing to repeat"))
Expand Down Expand Up @@ -328,6 +327,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("regexp_replace - character class repetition - ? and * - fall back to CPU") {
// see https://github.com/NVIDIA/spark-rapids/issues/4468
val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexReplaceMode,
Expand All @@ -336,28 +336,34 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
)
}

test("regexp_replace - character class repetition - {0,} - fall back to CPU") {
val patterns = Seq(raw"[1a-zA-Z]{0,}")
test("regexp_replace - character class repetition - {0,} or {0,n} - fall back to CPU") {
// see https://github.com/NVIDIA/spark-rapids/issues/4468
val patterns = Seq(raw"[1a-zA-Z]{0,}", raw"[1a-zA-Z]{0,2}")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexReplaceMode,
"regexp_replace on GPU does not support repetition with {0,}"
"regexp_replace on GPU does not support repetition with {0,} or {0,n}"
)
)
}

test("regexp_replace - fall back to CPU for {0} or {0,0}") {
val patterns = Seq("a{0}", raw"\02{0}", "a{0,0}", raw"\02{0,0}")
test("regexp_split - character class repetition - ? and * - fall back to CPU") {
// see https://github.com/NVIDIA/spark-rapids/issues/4884
val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexReplaceMode,
"regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}")
assertUnsupported(pattern, RegexSplitMode,
"regexp_split on GPU does not support repetition with ? or * " +
"consistently with Spark"
)
)
}

test("regexp_split - fall back to CPU for {0} or {0,0}") {
val patterns = Seq("a{0}", raw"\02{0}", "a{0,0}", raw"\02{0,0}")
test("regexp_split - fall back to CPU for {0,n}, or {0,}") {
// see https://github.com/NVIDIA/spark-rapids/issues/4884
val patterns = Seq("a{0,}", raw"\02{0,}", "a{0,2}", raw"\02{0,10}")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexSplitMode,
"regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}")
"regexp_split on GPU does not support repetition with {0,} or {0,n} " +
"consistently with Spark")
)
}

Expand Down Expand Up @@ -754,12 +760,12 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) {
() => predefinedCharacterClass,
() => group(depth),
() => boundaryMatch,
() => sequence(depth))
() => sequence(depth),
() => repetition(depth))
val generators = if (skipKnownIssues) {
baseGenerators
} else {
baseGenerators ++ Seq(
() => repetition(depth), // https://github.com/NVIDIA/spark-rapids/issues/4487
() => choice(depth)) // https://github.com/NVIDIA/spark-rapids/issues/4603
}
generators(rr.nextInt(generators.length))()
Expand Down

0 comments on commit 97ca886

Please sign in to comment.