Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable fuzz testing for Regular Expression repetitions and move remaining edge cases to CPU #4885

Merged
merged 3 commits into from
Mar 4, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -692,15 +692,29 @@ class CudfRegexTranspiler(mode: RegexMode) {
throw new RegexUnsupportedException(
"regexp_replace on GPU does not support repetition with ? or *")

case (_, QuantifierVariableLength(0,None)) if mode == RegexReplaceMode =>
case (_, SimpleQuantifier(ch)) if mode == RegexSplitMode && "?*".contains(ch) =>
// example: pattern " ?", input "] b[", replace with "X":
// java: X]XXbX[X
// cuDF: XXXX] b[
// see https://github.com/NVIDIA/spark-rapids/issues/4884
throw new RegexUnsupportedException(
"regexp_split on GPU does not support repetition with ? or * consistently with Spark")

case (_, QuantifierVariableLength(0, _)) if mode == RegexReplaceMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/4468
throw new RegexUnsupportedException(
"regexp_replace on GPU does not support repetition with {0,}")
"regexp_replace on GPU does not support repetition with {0,} or {0,n}")

case (_, QuantifierVariableLength(0, _)) if mode == RegexSplitMode =>
// see https://github.com/NVIDIA/spark-rapids/issues/4884
throw new RegexUnsupportedException(
"regexp_split on GPU does not support repetition with {0,} or {0,n} " +
"consistently with Spark")

case (_, QuantifierFixedLength(0)) | (_, QuantifierVariableLength(0, Some(0)))
case (_, QuantifierFixedLength(0))
if mode != RegexFindMode =>
throw new RegexUnsupportedException(
"regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}")
"regex_replace and regex_split on GPU do not support repetition with {0}")

case (RegexGroup(_, term), SimpleQuantifier(ch))
if "+*".contains(ch) && !isSupportedRepetitionBase(term) =>
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,6 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("cuDF does not support single repetition both inside and outside of capture groups") {
// see https://github.com/NVIDIA/spark-rapids/issues/4487
val patterns = Seq("(3?)+", "(3?)*", "(3*)+", "((3?))+")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexFindMode, "nothing to repeat"))
Expand Down Expand Up @@ -311,6 +310,7 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
}

test("regexp_replace - character class repetition - ? and * - fall back to CPU") {
// see https://github.com/NVIDIA/spark-rapids/issues/4468
val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexReplaceMode,
Expand All @@ -319,28 +319,34 @@ class RegularExpressionTranspilerSuite extends FunSuite with Arm {
)
}

test("regexp_replace - character class repetition - {0,} - fall back to CPU") {
val patterns = Seq(raw"[1a-zA-Z]{0,}")
test("regexp_replace - character class repetition - {0,} or {0,n} - fall back to CPU") {
// see https://github.com/NVIDIA/spark-rapids/issues/4468
val patterns = Seq(raw"[1a-zA-Z]{0,}", raw"[1a-zA-Z]{0,2}")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexReplaceMode,
"regexp_replace on GPU does not support repetition with {0,}"
"regexp_replace on GPU does not support repetition with {0,} or {0,n}"
)
)
}

test("regexp_replace - fall back to CPU for {0} or {0,0}") {
val patterns = Seq("a{0}", raw"\02{0}", "a{0,0}", raw"\02{0,0}")
test("regexp_split - character class repetition - ? and * - fall back to CPU") {
// see https://github.com/NVIDIA/spark-rapids/issues/4884
val patterns = Seq(raw"[1a-zA-Z]?", raw"[1a-zA-Z]*")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexReplaceMode,
"regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}")
assertUnsupported(pattern, RegexSplitMode,
"regexp_split on GPU does not support repetition with ? or * " +
"consistently with Spark"
)
)
}

test("regexp_split - fall back to CPU for {0} or {0,0}") {
val patterns = Seq("a{0}", raw"\02{0}", "a{0,0}", raw"\02{0,0}")
test("regexp_split - fall back to CPU for {0,n}, or {0,}") {
// see https://github.com/NVIDIA/spark-rapids/issues/4884
val patterns = Seq("a{0,}", raw"\02{0,}", "a{0,2}", raw"\02{0,10}")
patterns.foreach(pattern =>
assertUnsupported(pattern, RegexSplitMode,
"regex_replace and regex_split on GPU do not support repetition with {0} or {0,0}")
"regexp_split on GPU does not support repetition with {0,} or {0,n} " +
"consistently with Spark")
)
}

Expand Down Expand Up @@ -737,12 +743,12 @@ class FuzzRegExp(suggestedChars: String, skipKnownIssues: Boolean = true) {
() => predefinedCharacterClass,
() => group(depth),
() => boundaryMatch,
() => sequence(depth))
() => sequence(depth),
() => repetition(depth))
val generators = if (skipKnownIssues) {
baseGenerators
} else {
baseGenerators ++ Seq(
() => repetition(depth), // https://github.com/NVIDIA/spark-rapids/issues/4487
() => choice(depth)) // https://github.com/NVIDIA/spark-rapids/issues/4603
}
generators(rr.nextInt(generators.length))()
Expand Down