From 3b6bc16dbe7bc124911f860bd4869614b61448c6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 7 Jul 2021 14:09:06 -0600 Subject: [PATCH 1/8] Add support for timeParserPolicy=LEGACY when parsing dates Signed-off-by: Andy Grove --- docs/compatibility.md | 34 ++- .../sql/rapids/datetimeExpressions.scala | 251 ++++++++++++++-- .../nvidia/spark/rapids/CudfTestHelper.scala | 135 +++++++++ .../spark/rapids/ParseDateTimeSuite.scala | 281 ++++++++++++++++-- 4 files changed, 647 insertions(+), 54 deletions(-) create mode 100644 tests/src/test/scala/com/nvidia/spark/rapids/CudfTestHelper.scala diff --git a/docs/compatibility.md b/docs/compatibility.md index 7ddfadcb76f..a24c16c325f 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -353,7 +353,12 @@ the specified format string will fall into one of three categories: - Supported on GPU but may produce different results to Spark - Unsupported on GPU -The formats which are supported on GPU and 100% compatible with Spark are : +The formats which are supported on GPU vary depending on the setting for `timeParserPolicy`. + +### CORRECTED and EXCEPTION timeParserPolicy + +With timeParserPolicy set to `CORRECTED` or `EXCEPTION` (the default), the following formats are supported +on the GPU without requiring any additional settings. - `dd/MM/yyyy` - `yyyy/MM` @@ -366,10 +371,11 @@ The formats which are supported on GPU and 100% compatible with Spark are : - `dd-MM` - `dd/MM` -Examples of supported formats that may produce different results are: +Valid Spark date/time formats that do not appear in the list above may also be supported but have not been +extensively tested and may produce different results compared to the CPU. Known issues include: -- Trailing characters (including whitespace) may return a non-null value on GPU and Spark will - return null +- Valid dates and timestamps followed by trailing characters (including whitespace) may be parsed to non-null + values on GPU where Spark would treat the data as invalid and return null To attempt to use other formats on the GPU, set [`spark.rapids.sql.incompatibleDateFormats.enabled`](configs.md#sql.incompatibleDateFormats.enabled) @@ -391,6 +397,26 @@ Formats that contain any of the following words are unsupported and will fall ba "d", "S", "SS", "SSS", "SSSS", "SSSSS", "SSSSSSSSS", "SSSSSSS", "SSSSSSSS" ``` +### LEGACY timeParserPolicy + +With timeParserPolicy set to `LEGACY` and +[`spark.rapids.sql.incompatibleDateFormats.enabled`](configs.md#sql.incompatibleDateFormats.enabled) +set to `true`, and `spark.sql.ansi.enabled` set to `false`, the following formats are supported but not +guaranteed to produce the same results as the CPU: + +- `dd-MM-yyyy` +- `dd/MM/yyyy` +- `yyyy/MM/dd` +- `yyyy-MM-dd` +- `yyyy/MM/dd HH:mm:ss` +- `yyyy-MM-dd HH:mm:ss` + +LEGACY timeParserPolicy support has the following limitations when running on the GPU: + +- Only 4 digit years are supported +- The proleptic Gregorian calendar is used instead of the hybrid Julian+Gregorian calender + that Spark uses in legacy mode + ## Formatting dates and timestamps as strings When formatting dates and timestamps as strings using functions such as `from_unixtime`, only a diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index a3b63d446d7..b05b09e9c1f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ 
b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -25,6 +25,7 @@ import com.nvidia.spark.rapids.GpuOverrides.{extractStringLit, getTimeParserPoli import com.nvidia.spark.rapids.RapidsPluginImplicits._ import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, ExpectsInputTypes, Expression, ImplicitCastInputTypes, NullIntolerant, TimeZoneAwareExpression} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.unsafe.types.CalendarInterval @@ -398,7 +399,32 @@ abstract class UnixTimeExprMeta[A <: BinaryExpression with TimeZoneAwareExpressi case Some(rightLit) => sparkFormat = rightLit if (GpuOverrides.getTimeParserPolicy == LegacyTimeParserPolicy) { - willNotWorkOnGpu("legacyTimeParserPolicy LEGACY is not supported") + try { + // try and convert the format to cuDF format - this will throw an exception if + // the format contains unsupported characters or words + strfFormat = DateUtils.toStrf(sparkFormat, + expr.left.dataType == DataTypes.StringType) + // format parsed ok but we have no 100% compatible formats in LEGACY mode + if (GpuToTimestamp.LEGACY_COMPATIBLE_FORMATS.exists(_.format == sparkFormat)) { + // LEGACY support has a number of issues that mean we cannot guarantee + // compatibility with CPU + // - we can only support 4 digit years but Spark supports a wider range + // - we use a proleptic Gregorian calender but Spark uses a hybrid Julian+Gregorian + // calender in LEGACY mode + if (SQLConf.get.ansiEnabled) { + willNotWorkOnGpu("LEGACY format in ANSI mode is not supported on the GPU") + } else if (!conf.incompatDateFormats) { + willNotWorkOnGpu(s"LEGACY format '$sparkFormat' on the GPU is not guaranteed " + + s"to produce the same results as Spark on CPU. Set " + + s"${RapidsConf.INCOMPATIBLE_DATE_FORMATS.key}=true to force onto GPU.") + } + } else { + willNotWorkOnGpu(s"LEGACY format '$sparkFormat' is not supported on the GPU.") + } + } catch { + case e: TimestampFormatConversionException => + willNotWorkOnGpu(s"Failed to convert ${e.reason} ${e.getMessage}") + } } else { try { // try and convert the format to cuDF format - this will throw an exception if @@ -406,11 +432,11 @@ abstract class UnixTimeExprMeta[A <: BinaryExpression with TimeZoneAwareExpressi strfFormat = DateUtils.toStrf(sparkFormat, expr.left.dataType == DataTypes.StringType) // format parsed ok, so it is either compatible (tested/certified) or incompatible - if (!GpuToTimestamp.COMPATIBLE_FORMATS.contains(sparkFormat) && + if (!GpuToTimestamp.CORRECTED_COMPATIBLE_FORMATS.contains(sparkFormat) && !conf.incompatDateFormats) { - willNotWorkOnGpu(s"format '$sparkFormat' on the GPU is not guaranteed " + + willNotWorkOnGpu(s"CORRECTED format '$sparkFormat' on the GPU is not guaranteed " + s"to produce the same results as Spark on CPU. Set " + - s"spark.rapids.sql.incompatibleDateFormats.enabled=true to force onto GPU.") + s"${RapidsConf.INCOMPATIBLE_DATE_FORMATS.key}=true to force onto GPU.") } } catch { case e: TimestampFormatConversionException => @@ -430,8 +456,10 @@ object ExceptionTimeParserPolicy extends TimeParserPolicy object CorrectedTimeParserPolicy extends TimeParserPolicy object GpuToTimestamp extends Arm { - /** We are compatible with Spark for these formats */ - val COMPATIBLE_FORMATS = Seq( + // We are compatible with Spark for these formats when the timeParserPolicy is CORRECTED + // or EXCEPTION. 
It is possible that other formats may be supported but these are the only + // ones that we have tests for. + val CORRECTED_COMPATIBLE_FORMATS = Seq( "yyyy-MM-dd", "yyyy-MM", "yyyy/MM/dd", @@ -444,6 +472,70 @@ object GpuToTimestamp extends Arm { "dd/MM" ) + // We are compatible with Spark for these formats when the timeParserPolicy is LEGACY. It + // is possible that other formats may be supported but these are the only ones that we have + // tests for. + val LEGACY_COMPATIBLE_FORMATS = Seq( + LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false), + LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false), + LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false), + LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false), + LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true), + LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true) + ) + + /** remove whitespace before month and day */ + val REMOVE_WHITESPACE_FROM_MONTH_DAY: RegexReplace = + RegexReplace("(\\A\\d+)-([ ]*)(\\d+)-([ ]*)(\\d+)", "\\1-\\3-\\5") + + /** Regex rule to replace "yyyy-m-" with "yyyy-mm-" */ + val FIX_SINGLE_DIGIT_MONTH: RegexReplace = + RegexReplace("(\\A\\d+)-(\\d{1}-)", "\\1-0\\2") + + /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ + val FIX_SINGLE_DIGIT_DAY_1: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2})-(\\d{1}\\D+)", "\\1-0\\2") + + /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ + val FIX_SINGLE_DIGIT_DAY_2: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2})-(\\d{1})\\Z", "\\1-0\\2") + + /** Regex rule to replace "yyyy-mm-dd h-" with "yyyy-mm-dd hh-" */ + val FIX_SINGLE_DIGIT_HOUR_1: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2}) (\\d{1}:)", "\\1 0\\2") + + /** Regex rule to replace "yyyy-mm-ddTh-" with "yyyy-mm-ddThh-" */ + val FIX_SINGLE_DIGIT_HOUR_2: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2})T(\\d{1}:)", "\\1T0\\2") + + /** Regex rule to replace "yyyy-mm-dd[ T]hh-m-" with "yyyy-mm-dd[ T]hh-mm-" */ + val FIX_SINGLE_DIGIT_MINUTE: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}):(\\d{1}:)", "\\1:0\\2") + + /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ + val FIX_SINGLE_DIGIT_SECOND_1: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}):(\\d{1}\\D+)", "\\1:0\\2") + + /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ + val FIX_SINGLE_DIGIT_SECOND_2: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}):(\\d{1})\\Z", "\\1:0\\2") + + /** Convert dates to standard format */ + val FIX_DATES = Seq( + REMOVE_WHITESPACE_FROM_MONTH_DAY, + FIX_SINGLE_DIGIT_MONTH, + FIX_SINGLE_DIGIT_DAY_1, + FIX_SINGLE_DIGIT_DAY_2) + + /** Convert timestamps to standard format */ + val FIX_TIMESTAMPS = Seq( + FIX_SINGLE_DIGIT_HOUR_1, + FIX_SINGLE_DIGIT_HOUR_2, + FIX_SINGLE_DIGIT_MINUTE, + FIX_SINGLE_DIGIT_SECOND_1, + FIX_SINGLE_DIGIT_SECOND_2 + ) + def daysScalarSeconds(name: String): Scalar = { Scalar.timestampFromLong(DType.TIMESTAMP_SECONDS, DateUtils.specialDatesSeconds(name)) } @@ -459,7 +551,7 @@ object GpuToTimestamp extends Arm { } def isTimestamp(col: ColumnVector, sparkFormat: String, strfFormat: String) : ColumnVector = { - if (COMPATIBLE_FORMATS.contains(sparkFormat)) { + if (CORRECTED_COMPATIBLE_FORMATS.contains(sparkFormat)) { // the cuDF `is_timestamp` function is less restrictive than Spark's behavior for UnixTime // and ToUnixTime and will support parsing a subset of a string so we check the length of // the string as well 
which works well for fixed-length formats but if/when we want to @@ -489,12 +581,10 @@ object GpuToTimestamp extends Arm { daysScalar: String => Scalar, asTimestamp: (ColumnVector, String) => ColumnVector): ColumnVector = { - val isTimestamp = GpuToTimestamp.isTimestamp(lhs.getBase, sparkFormat, strfFormat) - // in addition to date/timestamp strings, we also need to check for special dates and null // values, since anything else is invalid and should throw an error or be converted to null // depending on the policy - withResource(isTimestamp) { isTimestamp => + withResource(isTimestamp(lhs.getBase, sparkFormat, strfFormat)) { isTimestamp => withResource(daysEqual(lhs.getBase, DateUtils.EPOCH)) { isEpoch => withResource(daysEqual(lhs.getBase, DateUtils.NOW)) { isNow => withResource(daysEqual(lhs.getBase, DateUtils.TODAY)) { isToday => @@ -534,8 +624,103 @@ object GpuToTimestamp extends Arm { } } } + + /** + * Parse string to timestamp when timeParserPolicy is LEGACY. This was the default behavior + * prior to Spark 3.0 + */ + def parseStringAsTimestampWithLegacyParserPolicy( + lhs: GpuColumnVector, + sparkFormat: String, + strfFormat: String, + dtype: DType, + asTimestamp: (ColumnVector, String) => ColumnVector): ColumnVector = { + + val format = LEGACY_COMPATIBLE_FORMATS.find(_.format == sparkFormat) + .getOrElse(throw new IllegalStateException(s"Unsupported format $sparkFormat")) + + val regexReplaceRules = if (format.isTimestamp) { + FIX_DATES ++ FIX_TIMESTAMPS + } else { + FIX_DATES + } + + val rulesWithSeparator = format.separator match { + case '/' => + regexReplaceRules.map { + case RegexReplace(pattern, backref) => + RegexReplace(pattern.replace('-', '/'), backref.replace('-', '/')) + } + case '-' => + regexReplaceRules + } + + val fixedUp = rulesWithSeparator + .foldLeft(rejectLeadingNewlineThenStrip(lhs))((cv, regexRule) => { + withResource(cv) { + _.stringReplaceWithBackrefs(regexRule.search, regexRule.replace) + } + }) + + if (format.isTimestamp) { + // special handling to ignore "yyyy-mm-dd hh:mm:" which Spark treats + // as null but cuDF supports + val timestampFormatNoSeconds = "\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\Z" + val formatWithSeparator = format.separator match { + case '/' => timestampFormatNoSeconds.replace('-', '/') + case '-' => timestampFormatNoSeconds + } + withResource(Scalar.fromNull(dtype)) { nullValue => + withResource(fixedUp.matchesRe(formatWithSeparator)) { hasNoSeconds => + withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => + hasNoSeconds.ifElse(nullValue, timestamp) + } + } + } + } else { + asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp) + } + } + + /** + * Filter out strings that have a newline before the first non-whitespace character + * and then strip all leading and trailing whitespace. + */ + private def rejectLeadingNewlineThenStrip(lhs: GpuColumnVector) = { + withResource(lhs.getBase.matchesRe("\\A[ \\t]*[\\n]+")) { hasLeadingNewline => + withResource(Scalar.fromNull(DType.STRING)) { nullValue => + withResource(lhs.getBase.strip()) { stripped => + hasLeadingNewline.ifElse(nullValue, stripped) + } + } + } + } + + /** + * Parse a string column to timestamp. 
+ */ + def asTimestampOrNull( + cv: ColumnVector, + dtype: DType, + strfFormat: String, + asTimestamp: (ColumnVector, String) => ColumnVector): ColumnVector = { + withResource(cv) { _ => + withResource(Scalar.fromNull(dtype)) { nullValue => + withResource(cv.isTimestamp(strfFormat)) { isTimestamp => + withResource(asTimestamp(cv, strfFormat)) { timestamp => + isTimestamp.ifElse(timestamp, nullValue) + } + } + } + } + } + } +case class LegacyParseFormat(format: String, separator: Char, isTimestamp: Boolean) + +case class RegexReplace(search: String, replace: String) + /** * A direct conversion of Spark's ToTimestamp class which converts time to UNIX timestamp by * first converting to microseconds and then dividing by the downScaleFactor @@ -572,13 +757,22 @@ abstract class GpuToTimestamp override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { val tmp = if (lhs.dataType == StringType) { // rhs is ignored we already parsed the format - parseStringAsTimestamp( - lhs, - sparkFormat, - strfFormat, - DType.TIMESTAMP_MICROSECONDS, - daysScalarMicros, - (col, strfFormat) => col.asTimestampMicroseconds(strfFormat)) + if (getTimeParserPolicy == LegacyTimeParserPolicy) { + parseStringAsTimestampWithLegacyParserPolicy( + lhs, + sparkFormat, + strfFormat, + DType.TIMESTAMP_MICROSECONDS, + (col, strfFormat) => col.asTimestampMicroseconds(strfFormat)) + } else { + parseStringAsTimestamp( + lhs, + sparkFormat, + strfFormat, + DType.TIMESTAMP_MICROSECONDS, + daysScalarMicros, + (col, strfFormat) => col.asTimestampMicroseconds(strfFormat)) + } } else { // Timestamp or DateType lhs.getBase.asTimestampMicroseconds() } @@ -614,13 +808,22 @@ abstract class GpuToTimestampImproved extends GpuToTimestamp { override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { val tmp = if (lhs.dataType == StringType) { // rhs is ignored we already parsed the format - parseStringAsTimestamp( - lhs, - sparkFormat, - strfFormat, - DType.TIMESTAMP_SECONDS, - daysScalarSeconds, - (col, strfFormat) => col.asTimestampSeconds(strfFormat)) + if (getTimeParserPolicy == LegacyTimeParserPolicy) { + parseStringAsTimestampWithLegacyParserPolicy( + lhs, + sparkFormat, + strfFormat, + DType.TIMESTAMP_SECONDS, + (col, strfFormat) => col.asTimestampSeconds(strfFormat)) + } else { + parseStringAsTimestamp( + lhs, + sparkFormat, + strfFormat, + DType.TIMESTAMP_SECONDS, + daysScalarSeconds, + (col, strfFormat) => col.asTimestampSeconds(strfFormat)) + } } else if (lhs.dataType() == DateType){ lhs.getBase.asTimestampSeconds() } else { // Timestamp diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CudfTestHelper.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CudfTestHelper.scala new file mode 100644 index 00000000000..bd18b229282 --- /dev/null +++ b/tests/src/test/scala/com/nvidia/spark/rapids/CudfTestHelper.scala @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids + +import ai.rapids.cudf.{ColumnView, DType, HostColumnVector, HostColumnVectorCore} +import org.junit.jupiter.api.Assertions.{assertArrayEquals, assertEquals} + +/** + * Convenience methods for testing cuDF calls directly. This code is largely copied + * from the cuDF Java test suite. + */ +object CudfTestHelper { + + /** + * Checks and asserts that passed in columns match + * + * @param expect The expected result column + * @param cv The input column + */ + def assertColumnsAreEqual(expect: ColumnView, cv: ColumnView): Unit = { + assertColumnsAreEqual(expect, cv, colName = "unnamed") + } + + /** + * Checks and asserts that passed in columns match + * + * @param expected The expected result column + * @param cv The input column + * @param colName The name of the column + */ + def assertColumnsAreEqual(expected: ColumnView, cv: ColumnView, colName: String): Unit = { + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount, cv, colName, + enableNullCheck = true) + } + + /** + * Checks and asserts that passed in host columns match + * + * @param expected The expected result host column + * @param cv The input host column + * @param colName The name of the host column + */ + def assertColumnsAreEqual(expected: HostColumnVector, + cv: HostColumnVector, colName: String): Unit = { + assertPartialColumnsAreEqual(expected, 0, + expected.getRowCount, cv, colName, enableNullCheck = true) + } + + def assertPartialColumnsAreEqual( + expected: ColumnView, + rowOffset: Long, + length: Long, + cv: ColumnView, + colName: String, + enableNullCheck: Boolean): Unit = { + try { + val hostExpected = expected.copyToHost + val hostcv = cv.copyToHost + try assertPartialColumnsAreEqual(hostExpected, rowOffset, length, + hostcv, colName, enableNullCheck) + finally { + if (hostExpected != null) hostExpected.close() + if (hostcv != null) hostcv.close() + } + } + } + + def assertPartialColumnsAreEqual( + expected: HostColumnVectorCore, + rowOffset: Long, length: Long, + cv: HostColumnVectorCore, + colName: String, enableNullCheck: Boolean): Unit = { + assertEquals(expected.getType, cv.getType, "Type For Column " + colName) + assertEquals(length, cv.getRowCount, "Row Count For Column " + colName) + assertEquals(expected.getNumChildren, cv.getNumChildren, "Child Count for Column " + colName) + if (enableNullCheck) assertEquals(expected.getNullCount, + cv.getNullCount, "Null Count For Column " + colName) + else { + // TODO add in a proper check when null counts are + // supported by serializing a partitioned column + } + + import ai.rapids.cudf.DType.DTypeEnum._ + + val `type`: DType = expected.getType + for (expectedRow <- rowOffset until (rowOffset + length)) { + val tableRow: Long = expectedRow - rowOffset + assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow), + "NULL for Column " + colName + " Row " + tableRow) + if (!expected.isNull(expectedRow)) `type`.getTypeId match { + case BOOL8 | INT8 | UINT8 => + assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow), + "Column " + colName + " Row " + tableRow) + + case INT16 | UINT16 => + assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow), + "Column " + colName + " Row " + tableRow) + + case INT32 | UINT32 | TIMESTAMP_DAYS | DURATION_DAYS | DECIMAL32 => + assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow), + "Column " + colName + " Row " + tableRow) + + case INT64 | UINT64 | + DURATION_MICROSECONDS | DURATION_MILLISECONDS | DURATION_NANOSECONDS | + DURATION_SECONDS | 
TIMESTAMP_MICROSECONDS | TIMESTAMP_MILLISECONDS | + TIMESTAMP_NANOSECONDS | TIMESTAMP_SECONDS | DECIMAL64 => + assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), + "Column " + colName + " Row " + tableRow) + + case STRING => + assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow), + "Column " + colName + " Row " + tableRow) + + case _ => + throw new IllegalArgumentException(`type` + " is not supported yet") + } + } + } + + +} diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index ba55a2daca1..14d2715746d 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -16,20 +16,28 @@ package com.nvidia.spark.rapids +import ai.rapids.cudf.ColumnVector import java.sql.{Date, Timestamp} - -import scala.collection.mutable.ListBuffer - import org.scalatest.BeforeAndAfterEach +import scala.collection.mutable.ListBuffer import org.apache.spark.SparkConf -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.functions.{col, to_date, to_timestamp, unix_timestamp} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY_1, FIX_SINGLE_DIGIT_DAY_2, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND_1, FIX_SINGLE_DIGIT_SECOND_2, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} +import org.apache.spark.sql.rapids.RegexReplace class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterEach { + private val CORRECTED_TIME_PARSER_POLICY: SparkConf = new SparkConf() + .set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED") + + private val LEGACY_TIME_PARSER_POLICY_CONF: SparkConf = new SparkConf() + .set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "LEGACY") + .set(RapidsConf.INCOMPATIBLE_DATE_FORMATS.key, "true") + override def beforeEach() { GpuOverrides.removeAllListeners() } @@ -51,61 +59,79 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE testSparkResultsAreEqual("to_date yyyy-MM-dd", datesAsStrings, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_date(col("c0"), "yyyy-MM-dd")) } + testSparkResultsAreEqual("to_date yyyy-MM-dd LEGACY", + datesAsStrings, + conf = LEGACY_TIME_PARSER_POLICY_CONF) { + df => df.withColumn("c1", to_date(col("c0"), "yyyy-MM-dd")) + } + + testSparkResultsAreEqual("to_date yyyy/MM/dd LEGACY", + datesAsStrings, + conf = LEGACY_TIME_PARSER_POLICY_CONF) { + df => df.withColumn("c1", to_date(col("c0"), "yyyy/MM/dd")) + } + testSparkResultsAreEqual("to_date dd/MM/yyyy", datesAsStrings, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { + df => df.withColumn("c1", to_date(col("c0"), "dd/MM/yyyy")) + } + + testSparkResultsAreEqual("to_date dd/MM/yyyy LEGACY", + datesAsStrings, + conf = LEGACY_TIME_PARSER_POLICY_CONF) { df => df.withColumn("c1", to_date(col("c0"), "dd/MM/yyyy")) } testSparkResultsAreEqual("to_date parse date", dates, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => 
df.withColumn("c1", to_date(col("c0"), "yyyy-MM-dd")) } testSparkResultsAreEqual("to_date parse timestamp", timestamps, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_date(col("c0"), "yyyy-MM-dd")) } testSparkResultsAreEqual("to_timestamp yyyy-MM-dd", timestampsAsStrings, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_timestamp(col("c0"), "yyyy-MM-dd")) } testSparkResultsAreEqual("to_timestamp dd/MM/yyyy", timestampsAsStrings, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_timestamp(col("c0"), "dd/MM/yyyy")) } testSparkResultsAreEqual("to_date default pattern", datesAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_date(col("c0"))) } testSparkResultsAreEqual("unix_timestamp parse date", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd")) } testSparkResultsAreEqual("unix_timestamp parse yyyy/MM", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy/MM")) } testSparkResultsAreEqual("to_unix_timestamp parse yyyy/MM", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => { df.createOrReplaceTempView("df") df.sqlContext.sql("SELECT c0, to_unix_timestamp(c0, 'yyyy/MM') FROM df") @@ -124,10 +150,22 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE testSparkResultsAreEqual("unix_timestamp parse timestamp", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { + df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd HH:mm:ss")) + } + + testSparkResultsAreEqual("unix_timestamp parse yyyy-MM-dd HH:mm:ss LEGACY", + timestampsAsStrings, + LEGACY_TIME_PARSER_POLICY_CONF) { df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd HH:mm:ss")) } + testSparkResultsAreEqual("unix_timestamp parse yyyy/MM/dd HH:mm:ss LEGACY", + timestampsAsStrings, + LEGACY_TIME_PARSER_POLICY_CONF) { + df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy/MM/dd HH:mm:ss")) + } + testSparkResultsAreEqual("unix_timestamp parse timestamp millis (fall back to CPU)", timestampsAsStrings, new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED") @@ -138,17 +176,18 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE testSparkResultsAreEqual("unix_timestamp parse timestamp default pattern", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", unix_timestamp(col("c0"))) } - test("fall back to CPU when policy is LEGACY") { + test("fall back to CPU when policy is LEGACY and unsupported format is used") { val e = intercept[IllegalArgumentException] { val df = withGpuSparkSession(spark => { + val formatString = "u" // we do not support this legacy format on GPU timestampsAsStrings(spark) 
.repartition(2) - .withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd HH:mm:ss")) - }, new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "LEGACY")) + .withColumn("c1", unix_timestamp(col("c0"), formatString)) + }, LEGACY_TIME_PARSER_POLICY_CONF) df.collect() } assert(e.getMessage.contains( @@ -169,7 +208,7 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE datesAsStrings(spark) .repartition(2) .withColumn("c1", to_date(col("c0"), "F")) - }, new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) + }, CORRECTED_TIME_PARSER_POLICY) df.collect() } assert(e.getMessage.contains( @@ -198,6 +237,161 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE assert(cpuNowSeconds <= gpuNowSeconds) } + test("Regex: Remove whitespace from month and day") { + testRegex(REMOVE_WHITESPACE_FROM_MONTH_DAY, + Seq("1- 1-1", "1-1- 1", "1- 1- 1", null), + Seq("1-1-1", "1-1-1", "1-1-1", null)) + } + + test("Regex: Fix single digit month") { + testRegex(FIX_SINGLE_DIGIT_MONTH, + Seq("1-2-3", "1111-2-3", null), + Seq("1-02-3", "1111-02-3", null)) + } + + test("Regex: Fix single digit day followed by non digit char") { + // single digit day followed by non digit + testRegex(FIX_SINGLE_DIGIT_DAY_1, + Seq("1111-02-3 ", "1111-02-3:", null), + Seq("1111-02-03 ", "1111-02-03:", null)) + } + + test("Regex: Fix single digit day at end of string") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_DAY_2, + Seq("1-02-3", "1111-02-3", "1111-02-03", null), + Seq("1-02-03", "1111-02-03", "1111-02-03", null)) + } + + test("Regex: Fix single digit hour 1") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_HOUR_1, + Seq("2001-12-31 1:2:3", "2001-12-31 1:22:33", null), + Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null)) + } + + test("Regex: Fix single digit hour 2") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_HOUR_2, + Seq("2001-12-31T1:2:3", "2001-12-31T1:22:33", null), + Seq("2001-12-31T01:2:3", "2001-12-31T01:22:33", null)) + } + + test("Regex: Fix single digit minute") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_MINUTE, + Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null), + Seq("2001-12-31 01:02:3", "2001-12-31 01:22:33", null)) + } + + test("Regex: Fix single digit second 1") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_SECOND_1, + Seq("2001-12-31 01:02:3:", "2001-12-31 01:22:33:", null), + Seq("2001-12-31 01:02:03:", "2001-12-31 01:22:33:", null)) + } + + test("Regex: Fix single digit second 2") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_SECOND_2, + Seq("2001-12-31 01:02:3", "2001-12-31 01:22:33", null), + Seq("2001-12-31 01:02:03", "2001-12-31 01:22:33", null)) + } + + test("Regex: Apply all date rules") { + // end to end test of all date rules being applied in sequence + val testPairs = Seq( + ("2001- 1-1", "2001-01-01"), + ("2001-1- 1", "2001-01-01"), + ("2001- 1- 1", "2001-01-01"), + ("1999-12-31", "1999-12-31"), + ("1999-2-31", "1999-02-31"), + ("1999-2-3:", "1999-02-03:"), + ("1999-2-3", "1999-02-03"), + ("1999-2-3 1:2:3.4", "1999-02-03 1:2:3.4"), + ("1999-2-3T1:2:3.4", "1999-02-03T1:2:3.4") + ) + val values = testPairs.map(_._1) + val expected = testPairs.map(_._2) + withResource(ColumnVector.fromStrings(values: _*)) { v => + withResource(ColumnVector.fromStrings(expected: _*)) { expected => + val actual = FIX_DATES.foldLeft(v.incRefCount())((a, b) => { + withResource(a) { + 
_.stringReplaceWithBackrefs(b.search, b.replace) + } + }) + withResource(actual) { _ => + CudfTestHelper.assertColumnsAreEqual(expected, actual) + } + } + } + } + + test("Regex: Apply all date and timestamp rules") { + // end to end test of all date and timestamp rules being applied in sequence + val testPairs = Seq( + ("2001- 1-1", "2001-01-01"), + ("2001-1- 1", "2001-01-01"), + ("2001- 1- 1", "2001-01-01"), + ("1999-12-31", "1999-12-31"), + ("1999-2-31", "1999-02-31"), + ("1999-2-3:", "1999-02-03:"), + ("1999-2-3", "1999-02-03"), + ("1999-2-3 1:2:3.4", "1999-02-03 01:02:03.4"), + ("1999-2-3T1:2:3.4", "1999-02-03T01:02:03.4") + ) + val values = testPairs.map(_._1) + val expected = testPairs.map(_._2) + withResource(ColumnVector.fromStrings(values: _*)) { v => + withResource(ColumnVector.fromStrings(expected: _*)) { expected => + val actual = (FIX_DATES ++ FIX_TIMESTAMPS).foldLeft(v.incRefCount())((a, b) => { + withResource(a) { + _.stringReplaceWithBackrefs(b.search, b.replace) + } + }) + withResource(actual) { _ => + CudfTestHelper.assertColumnsAreEqual(expected, actual) + } + } + } + } + + private def testRegex(rule: RegexReplace, values: Seq[String], expected: Seq[String]): Unit = { + withResource(ColumnVector.fromStrings(values: _*)) { v => + withResource(ColumnVector.fromStrings(expected: _*)) { expected => + withResource(v.stringReplaceWithBackrefs(rule.search, rule.replace)) { actual => + CudfTestHelper.assertColumnsAreEqual(expected, actual) + } + } + } + } + + // just show the failures so we don't have to manually parse all + // the output to find which ones failed + override def compareResults( + sort: Boolean, + maxFloatDiff: Double, + fromCpu: Array[Row], + fromGpu: Array[Row]): Unit = { + assert(fromCpu.length === fromGpu.length) + + val failures = fromCpu.zip(fromGpu).zipWithIndex.filterNot { + case ((cpu, gpu), _) => super.compare(cpu, gpu, 0.0001) + } + + if (failures.nonEmpty) { + val str = failures.map { + case ((cpu, gpu), i) => + s""" + |[#$i] CPU: $cpu + |[#$i] GPU: $gpu + | + |""".stripMargin + }.mkString("\n") + fail(s"Mismatch between CPU and GPU for the following rows:\n$str") + } + } + private def dates(spark: SparkSession) = { import spark.implicits._ dateValues.toDF("c0") @@ -225,12 +419,33 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE values.toDF("c0") } + private val singleDigits = Seq("1999-1-1 ", + "1999-1-1 11", + "1999-1-1 11:", + "1999-1-1 11:5", + "1999-1-1 11:59", + "1999-1-1 11:59:", + "1999-1-1 11:59:5", + "1999-1-1 11:59:59", + "1999-1-1", + "1999-1-1 ", + "1999-1-1 1", + "1999-1-1 1:", + "1999-1-1 1:2", + "1999-1-1 1:2:", + "1999-1-1 1:2:3", + "1999-1-1 1:2:3.", + "1999-1-1 1:12:3.", + "1999-1-1 11:12:3.", + "1999-1-1 11:2:3.", + "1999-1-1 11:2:13.", + "1999-1-1 1:2:3.4") + private val timestampValues = Seq( "", "null", null, "\n", - "1999-12-31 ", "1999-12-31 11", "1999-12-31 11:", "1999-12-31 11:5", @@ -238,12 +453,23 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE "1999-12-31 11:59:", "1999-12-31 11:59:5", "1999-12-31 11:59:59", + " 1999-12-31 11:59:59", + "\t1999-12-31 11:59:59", + "\t1999-12-31 11:59:59\n", "1999-12-31 11:59:59.", "1999-12-31 11:59:59.9", + " 1999-12-31 11:59:59.9", "1999-12-31 11:59:59.99", "1999-12-31 11:59:59.999", + "1999-12-31 11:59:59.9999", + "1999-12-31 11:59:59.99999", + "1999-12-31 11:59:59.999999", + "1999-12-31 11:59:59.9999999", + "1999-12-31 11:59:59.99999999", + "1999-12-31 11:59:59.999999999", "31/12/1999", "31/12/1999 11:59:59.999", + 
"1999-12-3", "1999-12-31", "1999/12/31", "1999-12", @@ -251,11 +477,13 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE "1975/06", "1975/06/18", "1975/06/18 06:48:57", - "1999-12-31\n", - "\t1999-12-31", - "\n1999-12-31", - "1999/12/31" - ) + "1999-12-29\n", + "\t1999-12-30", + " \n1999-12-31", + "1999/12/31", + "2001- 1-1", + "2001-1- 1", + "2001- 1- 1") ++ singleDigits ++ singleDigits.map(_.replace('-', '/')) private val dateValues = Seq( Date.valueOf("2020-07-24"), @@ -267,5 +495,6 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE Timestamp.valueOf("2015-07-25 02:02:02.2"), Timestamp.valueOf("1999-12-31 11:59:59.999") ) + } From fac5b01cd4192b271660ea6cbc40608ebf931d54 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 7 Jul 2021 16:01:07 -0600 Subject: [PATCH 2/8] Validate final string before conversion and handle more edge cases found during manual fuzzing Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 44 +++++++++++-------- .../spark/rapids/ParseDateTimeSuite.scala | 19 +++++--- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index b05b09e9c1f..c6b9428a6e5 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -476,17 +476,23 @@ object GpuToTimestamp extends Arm { // is possible that other formats may be supported but these are the only ones that we have // tests for. val LEGACY_COMPATIBLE_FORMATS = Seq( - LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false), - LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false), - LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false), - LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false), - LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true), - LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true) + LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false, + "\\A\\d{4}-\\d{2}-\\d{2}(\\D|\\s|\\Z)"), + LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false, + "\\A\\d{4}/\\d{2}/\\d{2}(\\D|\\s|\\Z)"), + LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false, + "\\A\\d{2}-\\d{2}-\\d{4}(\\D|\\s|\\Z)"), + LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false, + "\\A\\d{2}/\\d{2}/\\d{4}(\\D|\\s|\\Z)"), + LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true, + "\\A\\d{4}-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(\\D|\\s|\\Z)"), + LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true, + "\\A\\d{4}/\\d{2}/\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(\\D|\\s|\\Z)") ) /** remove whitespace before month and day */ val REMOVE_WHITESPACE_FROM_MONTH_DAY: RegexReplace = - RegexReplace("(\\A\\d+)-([ ]*)(\\d+)-([ ]*)(\\d+)", "\\1-\\3-\\5") + RegexReplace("(\\A\\d+)-([ \\t]*)(\\d+)-([ \\t]*)(\\d+)", "\\1-\\3-\\5") /** Regex rule to replace "yyyy-m-" with "yyyy-mm-" */ val FIX_SINGLE_DIGIT_MONTH: RegexReplace = @@ -494,7 +500,7 @@ object GpuToTimestamp extends Arm { /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ val FIX_SINGLE_DIGIT_DAY_1: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2})-(\\d{1}\\D+)", "\\1-0\\2") + RegexReplace("(\\A\\d+-\\d{2})-(\\d{1}[\\D\\s]+)", "\\1-0\\2") /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ val FIX_SINGLE_DIGIT_DAY_2: RegexReplace = @@ -663,22 +669,21 @@ object 
GpuToTimestamp extends Arm { }) if (format.isTimestamp) { - // special handling to ignore "yyyy-mm-dd hh:mm:" which Spark treats - // as null but cuDF supports - val timestampFormatNoSeconds = "\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\Z" - val formatWithSeparator = format.separator match { - case '/' => timestampFormatNoSeconds.replace('-', '/') - case '-' => timestampFormatNoSeconds - } withResource(Scalar.fromNull(dtype)) { nullValue => - withResource(fixedUp.matchesRe(formatWithSeparator)) { hasNoSeconds => + withResource(fixedUp.matchesRe(format.validRegex)) { hasNoSeconds => withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => - hasNoSeconds.ifElse(nullValue, timestamp) + hasNoSeconds.ifElse(timestamp, nullValue) } } } } else { - asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp) + withResource(Scalar.fromNull(dtype)) { nullValue => + withResource(fixedUp.matchesRe(format.validRegex)) { isValidDate => + withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => + isValidDate.ifElse(timestamp, nullValue) + } + } + } } } @@ -717,7 +722,8 @@ object GpuToTimestamp extends Arm { } -case class LegacyParseFormat(format: String, separator: Char, isTimestamp: Boolean) +case class LegacyParseFormat(format: String, separator: Char, isTimestamp: Boolean, + validRegex: String) case class RegexReplace(search: String, replace: String) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index 14d2715746d..06ab169748f 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -245,22 +245,22 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE test("Regex: Fix single digit month") { testRegex(FIX_SINGLE_DIGIT_MONTH, - Seq("1-2-3", "1111-2-3", null), - Seq("1-02-3", "1111-02-3", null)) + Seq("1-2-3", "1111-2-3", "2000-7-7\n9\t8568:\n", null), + Seq("1-02-3", "1111-02-3", "2000-07-7\n9\t8568:\n", null)) } test("Regex: Fix single digit day followed by non digit char") { // single digit day followed by non digit testRegex(FIX_SINGLE_DIGIT_DAY_1, - Seq("1111-02-3 ", "1111-02-3:", null), - Seq("1111-02-03 ", "1111-02-03:", null)) + Seq("1111-02-3 ", "1111-02-3:", "2000-03-192", "2000-07-7\n9\t8568:\n", null), + Seq("1111-02-03 ", "1111-02-03:", "2000-03-192", "2000-07-07\n9\t8568:\n", null)) } test("Regex: Fix single digit day at end of string") { // single digit day at end of string testRegex(FIX_SINGLE_DIGIT_DAY_2, - Seq("1-02-3", "1111-02-3", "1111-02-03", null), - Seq("1-02-03", "1111-02-03", "1111-02-03", null)) + Seq("1-02-3", "1111-02-3", "1111-02-03", "2000-03-192", null), + Seq("1-02-03", "1111-02-03", "1111-02-03", "2000-03-192", null)) } test("Regex: Fix single digit hour 1") { @@ -483,7 +483,12 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE "1999/12/31", "2001- 1-1", "2001-1- 1", - "2001- 1- 1") ++ singleDigits ++ singleDigits.map(_.replace('-', '/')) + "2001- 1- 1", + "2000-3-192", + // interesting test cases from fuzzing that original triggered differences between CPU and GPU + "2000-1- 099\n305\n 7-390--.0:-", + "2000-7-7\n9\t8568:\n", + "2000-\t5-2.7.44584.9935") ++ singleDigits ++ singleDigits.map(_.replace('-', '/')) private val dateValues = Seq( Date.valueOf("2020-07-24"), From 6f50d9f4b0ed4336b5b483e0467af42193a96465 Mon Sep 17 00:00:00 2001 From: 
Andy Grove Date: Wed, 7 Jul 2021 16:23:43 -0600 Subject: [PATCH 3/8] Remove duplicate code and add more code comments Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index c6b9428a6e5..fe0c565082f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -645,12 +645,16 @@ object GpuToTimestamp extends Arm { val format = LEGACY_COMPATIBLE_FORMATS.find(_.format == sparkFormat) .getOrElse(throw new IllegalStateException(s"Unsupported format $sparkFormat")) + // optimization to apply only the necessary rules depending on whether we are + // parsing to a date or timestamp val regexReplaceRules = if (format.isTimestamp) { FIX_DATES ++ FIX_TIMESTAMPS } else { FIX_DATES } + // we support date formats using either `-` or `/` to separate year, month, and day and the + // regex rules are written with '-' so we need to replace '-' with '/' here as necessary val rulesWithSeparator = format.separator match { case '/' => regexReplaceRules.map { @@ -661,6 +665,7 @@ object GpuToTimestamp extends Arm { regexReplaceRules } + // apply each rule in turn to the data val fixedUp = rulesWithSeparator .foldLeft(rejectLeadingNewlineThenStrip(lhs))((cv, regexRule) => { withResource(cv) { @@ -668,20 +673,12 @@ object GpuToTimestamp extends Arm { } }) - if (format.isTimestamp) { - withResource(Scalar.fromNull(dtype)) { nullValue => - withResource(fixedUp.matchesRe(format.validRegex)) { hasNoSeconds => - withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => - hasNoSeconds.ifElse(timestamp, nullValue) - } - } - } - } else { - withResource(Scalar.fromNull(dtype)) { nullValue => - withResource(fixedUp.matchesRe(format.validRegex)) { isValidDate => - withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => - isValidDate.ifElse(timestamp, nullValue) - } + // check the final value against a regex to determine if it is valid or not, so we produce + // null values for any invalid inputs + withResource(Scalar.fromNull(dtype)) { nullValue => + withResource(fixedUp.matchesRe(format.validRegex)) { isValidDate => + withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => + isValidDate.ifElse(timestamp, nullValue) } } } From a9d11f06bace74a7a1ef4a52dafc8c531a29a376 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Jul 2021 15:19:09 -0600 Subject: [PATCH 4/8] Use raw strings for improved readability Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index fe0c565082f..0af87547397 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -477,54 +477,54 @@ object GpuToTimestamp extends Arm { // tests for. 
val LEGACY_COMPATIBLE_FORMATS = Seq( LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false, - "\\A\\d{4}-\\d{2}-\\d{2}(\\D|\\s|\\Z)"), + raw"\A\d{4}-\d{2}-\d{2}(\D|\s|\Z)"), LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false, - "\\A\\d{4}/\\d{2}/\\d{2}(\\D|\\s|\\Z)"), + raw"\A\d{4}/\d{2}/\d{2}(\D|\s|\Z)"), LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false, - "\\A\\d{2}-\\d{2}-\\d{4}(\\D|\\s|\\Z)"), + raw"\A\d{2}-\d{2}-\d{4}(\D|\s|\Z)"), LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false, - "\\A\\d{2}/\\d{2}/\\d{4}(\\D|\\s|\\Z)"), + raw"\A\d{2}/\d{2}/\d{4}(\D|\s|\Z)"), LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true, - "\\A\\d{4}-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(\\D|\\s|\\Z)"), + raw"\A\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)"), LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true, - "\\A\\d{4}/\\d{2}/\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(\\D|\\s|\\Z)") + raw"\A\d{4}/\d{2}/\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)") ) /** remove whitespace before month and day */ val REMOVE_WHITESPACE_FROM_MONTH_DAY: RegexReplace = - RegexReplace("(\\A\\d+)-([ \\t]*)(\\d+)-([ \\t]*)(\\d+)", "\\1-\\3-\\5") + RegexReplace(raw"(\A\d+)-([ \t]*)(\d+)-([ \t]*)(\d+)", raw"\1-\3-\5") /** Regex rule to replace "yyyy-m-" with "yyyy-mm-" */ val FIX_SINGLE_DIGIT_MONTH: RegexReplace = - RegexReplace("(\\A\\d+)-(\\d{1}-)", "\\1-0\\2") + RegexReplace(raw"(\A\d+)-(\d{1}-)", raw"\1-0\2") /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ val FIX_SINGLE_DIGIT_DAY_1: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2})-(\\d{1}[\\D\\s]+)", "\\1-0\\2") + RegexReplace(raw"(\A\d+-\d{2})-(\d{1}[\D\s]+)", raw"\1-0\2") /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ val FIX_SINGLE_DIGIT_DAY_2: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2})-(\\d{1})\\Z", "\\1-0\\2") + RegexReplace(raw"(\A\d+-\d{2})-(\d{1})\Z", raw"\1-0\2") /** Regex rule to replace "yyyy-mm-dd h-" with "yyyy-mm-dd hh-" */ val FIX_SINGLE_DIGIT_HOUR_1: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2}) (\\d{1}:)", "\\1 0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2}) (\d{1}:)", raw"\1 0\2") /** Regex rule to replace "yyyy-mm-ddTh-" with "yyyy-mm-ddThh-" */ val FIX_SINGLE_DIGIT_HOUR_2: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2})T(\\d{1}:)", "\\1T0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2})T(\d{1}:)", raw"\1T0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh-m-" with "yyyy-mm-dd[ T]hh-mm-" */ val FIX_SINGLE_DIGIT_MINUTE: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}):(\\d{1}:)", "\\1:0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}):(\d{1}:)", raw"\1:0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ val FIX_SINGLE_DIGIT_SECOND_1: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}):(\\d{1}\\D+)", "\\1:0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1}\D+)", raw"\1:0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ val FIX_SINGLE_DIGIT_SECOND_2: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}):(\\d{1})\\Z", "\\1:0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})\Z", raw"\1:0\2") /** Convert dates to standard format */ val FIX_DATES = Seq( From 2e6fe5345851eab5de1f143d041364fb44e022e5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Jul 2021 17:20:29 -0600 Subject: [PATCH 5/8] Combine FIX_SINGLE_DIGIT_DAY patterns Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 21 +++++++------------ 
.../spark/rapids/ParseDateTimeSuite.scala | 6 +++--- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 0af87547397..d825fa108ad 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -499,30 +499,26 @@ object GpuToTimestamp extends Arm { RegexReplace(raw"(\A\d+)-(\d{1}-)", raw"\1-0\2") /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ - val FIX_SINGLE_DIGIT_DAY_1: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2})-(\d{1}[\D\s]+)", raw"\1-0\2") + val FIX_SINGLE_DIGIT_DAY: RegexReplace = + RegexReplace(raw"(\A\d+-\d{2})-(\d{1})([\D\s]|\Z)", raw"\1-0\2\3") - /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ - val FIX_SINGLE_DIGIT_DAY_2: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2})-(\d{1})\Z", raw"\1-0\2") - - /** Regex rule to replace "yyyy-mm-dd h-" with "yyyy-mm-dd hh-" */ + /** Regex rule to replace "yyyy-mm-dd h:" with "yyyy-mm-dd hh:" */ val FIX_SINGLE_DIGIT_HOUR_1: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2}) (\d{1}:)", raw"\1 0\2") - /** Regex rule to replace "yyyy-mm-ddTh-" with "yyyy-mm-ddThh-" */ + /** Regex rule to replace "yyyy-mm-ddTh:" with "yyyy-mm-ddThh:" */ val FIX_SINGLE_DIGIT_HOUR_2: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2})T(\d{1}:)", raw"\1T0\2") - /** Regex rule to replace "yyyy-mm-dd[ T]hh-m-" with "yyyy-mm-dd[ T]hh-mm-" */ + /** Regex rule to replace "yyyy-mm-dd[ T]hh:m:" with "yyyy-mm-dd[ T]hh:mm:" */ val FIX_SINGLE_DIGIT_MINUTE: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}):(\d{1}:)", raw"\1:0\2") - /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ + /** Regex rule to replace "yyyy-mm-dd[ T]hh:mm:s" with "yyyy-mm-dd[ T]hh:mm:ss" */ val FIX_SINGLE_DIGIT_SECOND_1: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1}\D+)", raw"\1:0\2") - /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ + /** Regex rule to replace "yyyy-mm-dd[ T]hh:mm:s" with "yyyy-mm-dd[ T]hh:mm:ss" */ val FIX_SINGLE_DIGIT_SECOND_2: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})\Z", raw"\1:0\2") @@ -530,8 +526,7 @@ object GpuToTimestamp extends Arm { val FIX_DATES = Seq( REMOVE_WHITESPACE_FROM_MONTH_DAY, FIX_SINGLE_DIGIT_MONTH, - FIX_SINGLE_DIGIT_DAY_1, - FIX_SINGLE_DIGIT_DAY_2) + FIX_SINGLE_DIGIT_DAY) /** Convert timestamps to standard format */ val FIX_TIMESTAMPS = Seq( diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index 06ab169748f..447b46b2931 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.functions.{col, to_date, to_timestamp, unix_timestamp} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY_1, FIX_SINGLE_DIGIT_DAY_2, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND_1, FIX_SINGLE_DIGIT_SECOND_2, FIX_TIMESTAMPS, 
REMOVE_WHITESPACE_FROM_MONTH_DAY} +import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND_1, FIX_SINGLE_DIGIT_SECOND_2, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} import org.apache.spark.sql.rapids.RegexReplace class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterEach { @@ -251,14 +251,14 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE test("Regex: Fix single digit day followed by non digit char") { // single digit day followed by non digit - testRegex(FIX_SINGLE_DIGIT_DAY_1, + testRegex(FIX_SINGLE_DIGIT_DAY, Seq("1111-02-3 ", "1111-02-3:", "2000-03-192", "2000-07-7\n9\t8568:\n", null), Seq("1111-02-03 ", "1111-02-03:", "2000-03-192", "2000-07-07\n9\t8568:\n", null)) } test("Regex: Fix single digit day at end of string") { // single digit day at end of string - testRegex(FIX_SINGLE_DIGIT_DAY_2, + testRegex(FIX_SINGLE_DIGIT_DAY, Seq("1-02-3", "1111-02-3", "1111-02-03", "2000-03-192", null), Seq("1-02-03", "1111-02-03", "1111-02-03", "2000-03-192", null)) } From 36ce995a6e27b31d863f6b8302b0685e2c8b1ff6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Jul 2021 17:26:02 -0600 Subject: [PATCH 6/8] Combine patterns for matching seconds and fix some incorrect comments Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 11 +++------- .../spark/rapids/ParseDateTimeSuite.scala | 22 +++++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index d825fa108ad..269db7bac6b 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -515,12 +515,8 @@ object GpuToTimestamp extends Arm { RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}):(\d{1}:)", raw"\1:0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh:mm:s" with "yyyy-mm-dd[ T]hh:mm:ss" */ - val FIX_SINGLE_DIGIT_SECOND_1: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1}\D+)", raw"\1:0\2") - - /** Regex rule to replace "yyyy-mm-dd[ T]hh:mm:s" with "yyyy-mm-dd[ T]hh:mm:ss" */ - val FIX_SINGLE_DIGIT_SECOND_2: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})\Z", raw"\1:0\2") + val FIX_SINGLE_DIGIT_SECOND: RegexReplace = + RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})([\D\s]|\Z)", raw"\1:0\2\3") /** Convert dates to standard format */ val FIX_DATES = Seq( @@ -533,8 +529,7 @@ object GpuToTimestamp extends Arm { FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, - FIX_SINGLE_DIGIT_SECOND_1, - FIX_SINGLE_DIGIT_SECOND_2 + FIX_SINGLE_DIGIT_SECOND ) def daysScalarSeconds(name: String): Scalar = { diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index 447b46b2931..ba39c2018c9 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.functions.{col, to_date, to_timestamp, unix_timestamp} 
import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND_1, FIX_SINGLE_DIGIT_SECOND_2, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} +import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} import org.apache.spark.sql.rapids.RegexReplace class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterEach { @@ -264,36 +264,36 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE } test("Regex: Fix single digit hour 1") { - // single digit day at end of string + // single digit hour with space separating date and time testRegex(FIX_SINGLE_DIGIT_HOUR_1, Seq("2001-12-31 1:2:3", "2001-12-31 1:22:33", null), Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null)) } test("Regex: Fix single digit hour 2") { - // single digit day at end of string + // single digit hour with 'T' separating date and time testRegex(FIX_SINGLE_DIGIT_HOUR_2, Seq("2001-12-31T1:2:3", "2001-12-31T1:22:33", null), Seq("2001-12-31T01:2:3", "2001-12-31T01:22:33", null)) } test("Regex: Fix single digit minute") { - // single digit day at end of string + // single digit minute at end of string testRegex(FIX_SINGLE_DIGIT_MINUTE, Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null), Seq("2001-12-31 01:02:3", "2001-12-31 01:22:33", null)) } - test("Regex: Fix single digit second 1") { - // single digit day at end of string - testRegex(FIX_SINGLE_DIGIT_SECOND_1, - Seq("2001-12-31 01:02:3:", "2001-12-31 01:22:33:", null), - Seq("2001-12-31 01:02:03:", "2001-12-31 01:22:33:", null)) + test("Regex: Fix single digit second followed by non digit") { + // single digit second followed by non digit + testRegex(FIX_SINGLE_DIGIT_SECOND, + Seq("2001-12-31 01:02:3:", "2001-12-31 01:22:33:", "2001-12-31 01:02:3 ", null), + Seq("2001-12-31 01:02:03:", "2001-12-31 01:22:33:", "2001-12-31 01:02:03 ", null)) } - test("Regex: Fix single digit second 2") { + test("Regex: Fix single digit second at end of string") { // single digit day at end of string - testRegex(FIX_SINGLE_DIGIT_SECOND_2, + testRegex(FIX_SINGLE_DIGIT_SECOND, Seq("2001-12-31 01:02:3", "2001-12-31 01:22:33", null), Seq("2001-12-31 01:02:03", "2001-12-31 01:22:33", null)) } From aac06e3f469241e9eb9224c8b60108efd99f5966 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Jul 2021 17:57:37 -0600 Subject: [PATCH 7/8] combine two more rules Signed-off-by: Andy Grove --- .../spark/sql/rapids/datetimeExpressions.scala | 13 ++++--------- .../nvidia/spark/rapids/ParseDateTimeSuite.scala | 11 ++++++----- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 269db7bac6b..4ceaaa9beb4 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -502,13 +502,9 @@ object GpuToTimestamp extends Arm { val FIX_SINGLE_DIGIT_DAY: RegexReplace = RegexReplace(raw"(\A\d+-\d{2})-(\d{1})([\D\s]|\Z)", raw"\1-0\2\3") - /** Regex rule to replace "yyyy-mm-dd h:" with 
"yyyy-mm-dd hh:" */ - val FIX_SINGLE_DIGIT_HOUR_1: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2}-\d{2}) (\d{1}:)", raw"\1 0\2") - - /** Regex rule to replace "yyyy-mm-ddTh:" with "yyyy-mm-ddThh:" */ - val FIX_SINGLE_DIGIT_HOUR_2: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2}-\d{2})T(\d{1}:)", raw"\1T0\2") + /** Regex rule to replace "yyyy-mm-dd[ T]h:" with "yyyy-mm-dd hh:" */ + val FIX_SINGLE_DIGIT_HOUR: RegexReplace = + RegexReplace(raw"(\A\d+-\d{2}-\d{2})[ T](\d{1}:)", raw"\1 0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh:m:" with "yyyy-mm-dd[ T]hh:mm:" */ val FIX_SINGLE_DIGIT_MINUTE: RegexReplace = @@ -526,8 +522,7 @@ object GpuToTimestamp extends Arm { /** Convert timestamps to standard format */ val FIX_TIMESTAMPS = Seq( - FIX_SINGLE_DIGIT_HOUR_1, - FIX_SINGLE_DIGIT_HOUR_2, + FIX_SINGLE_DIGIT_HOUR, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_SECOND ) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index ba39c2018c9..45ecbd8af1b 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.functions.{col, to_date, to_timestamp, unix_timestamp} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} +import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} import org.apache.spark.sql.rapids.RegexReplace class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterEach { @@ -265,16 +265,17 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE test("Regex: Fix single digit hour 1") { // single digit hour with space separating date and time - testRegex(FIX_SINGLE_DIGIT_HOUR_1, + testRegex(FIX_SINGLE_DIGIT_HOUR, Seq("2001-12-31 1:2:3", "2001-12-31 1:22:33", null), Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null)) } test("Regex: Fix single digit hour 2") { // single digit hour with 'T' separating date and time - testRegex(FIX_SINGLE_DIGIT_HOUR_2, + // note that the T gets replaced with whitespace in this case + testRegex(FIX_SINGLE_DIGIT_HOUR, Seq("2001-12-31T1:2:3", "2001-12-31T1:22:33", null), - Seq("2001-12-31T01:2:3", "2001-12-31T01:22:33", null)) + Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null)) } test("Regex: Fix single digit minute") { @@ -338,7 +339,7 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE ("1999-2-3:", "1999-02-03:"), ("1999-2-3", "1999-02-03"), ("1999-2-3 1:2:3.4", "1999-02-03 01:02:03.4"), - ("1999-2-3T1:2:3.4", "1999-02-03T01:02:03.4") + ("1999-2-3T1:2:3.4", "1999-02-03 01:02:03.4") ) val values = testPairs.map(_._1) val expected = testPairs.map(_._2) From 41208b4d1b8054590f89320b25c2749a38466776 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 9 Jul 2021 08:56:54 -0600 Subject: [PATCH 8/8] Change LEGACY_COMPATIBLE_FORMATS to Map Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 23 
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
index 4ceaaa9beb4..0a76b50e8de 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
@@ -405,7 +405,7 @@ abstract class UnixTimeExprMeta[A <: BinaryExpression with TimeZoneAwareExpressi
             strfFormat = DateUtils.toStrf(sparkFormat,
               expr.left.dataType == DataTypes.StringType)
             // format parsed ok but we have no 100% compatible formats in LEGACY mode
-            if (GpuToTimestamp.LEGACY_COMPATIBLE_FORMATS.exists(_.format == sparkFormat)) {
+            if (GpuToTimestamp.LEGACY_COMPATIBLE_FORMATS.contains(sparkFormat)) {
               // LEGACY support has a number of issues that mean we cannot guarantee
               // compatibility with CPU
               // - we can only support 4 digit years but Spark supports a wider range
@@ -475,18 +475,18 @@ object GpuToTimestamp extends Arm {
 
   // We are compatible with Spark for these formats when the timeParserPolicy is LEGACY. It
   // is possible that other formats may be supported but these are the only ones that we have
   // tests for.
-  val LEGACY_COMPATIBLE_FORMATS = Seq(
-    LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false,
+  val LEGACY_COMPATIBLE_FORMATS = Map(
+    "yyyy-MM-dd" -> LegacyParseFormat('-', isTimestamp = false,
       raw"\A\d{4}-\d{2}-\d{2}(\D|\s|\Z)"),
-    LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false,
+    "yyyy/MM/dd" -> LegacyParseFormat('/', isTimestamp = false,
       raw"\A\d{4}/\d{2}/\d{2}(\D|\s|\Z)"),
-    LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false,
+    "dd-MM-yyyy" -> LegacyParseFormat('-', isTimestamp = false,
       raw"\A\d{2}-\d{2}-\d{4}(\D|\s|\Z)"),
-    LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false,
+    "dd/MM/yyyy" -> LegacyParseFormat('/', isTimestamp = false,
       raw"\A\d{2}/\d{2}/\d{4}(\D|\s|\Z)"),
-    LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true,
+    "yyyy-MM-dd HH:mm:ss" -> LegacyParseFormat('-', isTimestamp = true,
       raw"\A\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)"),
-    LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true,
+    "yyyy/MM/dd HH:mm:ss" -> LegacyParseFormat('/', isTimestamp = true,
       raw"\A\d{4}/\d{2}/\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)")
   )
@@ -627,8 +627,8 @@ object GpuToTimestamp extends Arm {
       dtype: DType,
       asTimestamp: (ColumnVector, String) => ColumnVector): ColumnVector = {
 
-    val format = LEGACY_COMPATIBLE_FORMATS.find(_.format == sparkFormat)
-      .getOrElse(throw new IllegalStateException(s"Unsupported format $sparkFormat"))
+    val format = LEGACY_COMPATIBLE_FORMATS.getOrElse(sparkFormat,
+      throw new IllegalStateException(s"Unsupported format $sparkFormat"))
 
     // optimization to apply only the necessary rules depending on whether we are
     // parsing to a date or timestamp
@@ -704,8 +704,7 @@ object GpuToTimestamp extends Arm {
 }
 
-case class LegacyParseFormat(format: String, separator: Char, isTimestamp: Boolean,
-    validRegex: String)
+case class LegacyParseFormat(separator: Char, isTimestamp: Boolean, validRegex: String)
 
 case class RegexReplace(search: String, replace: String)
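
The rule consolidations in patches 6 and 7 are easier to follow outside the diffs, so here is a minimal JVM-side sketch of the combined FIX_SINGLE_DIGIT_HOUR and FIX_SINGLE_DIGIT_SECOND behaviour. This is an illustration only, not the plugin's execution path: the plugin stores these patterns as RegexReplace rules and applies them to string columns on the GPU, whereas this sketch uses java.util.regex, so the \1-style backreferences from the patch are rewritten in $1 form. The object and method names are invented for the example.

object SingleDigitPaddingSketch {
  // Pad a single-digit hour and normalize the ' ' or 'T' separator to a space,
  // mirroring the combined FIX_SINGLE_DIGIT_HOUR rule from patch 7.
  private val hourRule =
    (raw"(\A\d+-\d{2}-\d{2})[ T](\d{1}:)".r, "$1 0$2")

  // Pad a single-digit second whether it is followed by a non-digit or ends the
  // string, mirroring the combined FIX_SINGLE_DIGIT_SECOND rule from patch 6.
  private val secondRule =
    (raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})([\D\s]|\Z)".r, "$1:0$2$3")

  // Apply the rules in order, as a stand-in for the FIX_TIMESTAMPS sequence.
  def normalize(s: String): String =
    Seq(hourRule, secondRule).foldLeft(s) {
      case (acc, (re, repl)) => re.replaceAllIn(acc, repl)
    }

  def main(args: Array[String]): Unit = {
    // prints "2001-12-31 01:22:03": the hour is padded, the 'T' separator
    // becomes a space, and the trailing single-digit second is padded
    println(normalize("2001-12-31T1:22:3"))
  }
}

Running normalize("2001-12-31T1:22:3") reproduces the behaviour the updated tests expect from the two combined rules: one pass pads the hour and drops the 'T', and a second pass pads the final second.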
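The final patch's move from a Seq of LegacyParseFormat to a Map keyed by the Spark format string makes the support check a contains() call and the lookup a getOrElse. The stand-alone sketch below, again for illustration only and with invented names, shows how the stored validRegex could be used on the JVM to decide whether an input string is well-formed for a given format; the plugin itself performs this matching on GPU columns.

object LegacyFormatLookupSketch {
  // Copied shape of the plugin's case class, redeclared here so the sketch is self-contained.
  final case class LegacyParseFormat(separator: Char, isTimestamp: Boolean, validRegex: String)

  // Two of the six entries from the patch, keyed by Spark format string.
  val LEGACY_COMPATIBLE_FORMATS: Map[String, LegacyParseFormat] = Map(
    "yyyy-MM-dd" -> LegacyParseFormat('-', isTimestamp = false,
      raw"\A\d{4}-\d{2}-\d{2}(\D|\s|\Z)"),
    "yyyy-MM-dd HH:mm:ss" -> LegacyParseFormat('-', isTimestamp = true,
      raw"\A\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)"))

  // Map lookup replaces the old Seq.find scan; unsupported formats fail fast,
  // matching the getOrElse + IllegalStateException pattern in the patch.
  def isValid(sparkFormat: String, value: String): Boolean =
    LEGACY_COMPATIBLE_FORMATS.get(sparkFormat) match {
      case Some(fmt) => value != null && fmt.validRegex.r.findFirstIn(value).isDefined
      case None => throw new IllegalStateException(s"Unsupported format $sparkFormat")
    }

  def main(args: Array[String]): Unit = {
    println(isValid("yyyy-MM-dd", "1999-12-31")) // true
    println(isValid("yyyy-MM-dd", "1999-12-3"))  // false: the day is not zero-padded
  }
}

In this sketch an input like "1999-12-3" does not match the validRegex because the day has not been zero-padded; the FIX_DATES rules shown earlier in the series are what produce the padded form.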