From 3b6bc16dbe7bc124911f860bd4869614b61448c6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 7 Jul 2021 14:09:06 -0600 Subject: [PATCH 1/8] Add support for timeParserPolicy=LEGACY when parsing dates Signed-off-by: Andy Grove --- docs/compatibility.md | 34 ++- .../sql/rapids/datetimeExpressions.scala | 251 ++++++++++++++-- .../nvidia/spark/rapids/CudfTestHelper.scala | 135 +++++++++ .../spark/rapids/ParseDateTimeSuite.scala | 281 ++++++++++++++++-- 4 files changed, 647 insertions(+), 54 deletions(-) create mode 100644 tests/src/test/scala/com/nvidia/spark/rapids/CudfTestHelper.scala diff --git a/docs/compatibility.md b/docs/compatibility.md index 7ddfadcb76f..a24c16c325f 100644 --- a/docs/compatibility.md +++ b/docs/compatibility.md @@ -353,7 +353,12 @@ the specified format string will fall into one of three categories: - Supported on GPU but may produce different results to Spark - Unsupported on GPU -The formats which are supported on GPU and 100% compatible with Spark are : +The formats which are supported on GPU vary depending on the setting for `timeParserPolicy`. + +### CORRECTED and EXCEPTION timeParserPolicy + +With timeParserPolicy set to `CORRECTED` or `EXCEPTION` (the default), the following formats are supported +on the GPU without requiring any additional settings. - `dd/MM/yyyy` - `yyyy/MM` @@ -366,10 +371,11 @@ The formats which are supported on GPU and 100% compatible with Spark are : - `dd-MM` - `dd/MM` -Examples of supported formats that may produce different results are: +Valid Spark date/time formats that do not appear in the list above may also be supported but have not been +extensively tested and may produce different results compared to the CPU. Known issues include: -- Trailing characters (including whitespace) may return a non-null value on GPU and Spark will - return null +- Valid dates and timestamps followed by trailing characters (including whitespace) may be parsed to non-null + values on GPU where Spark would treat the data as invalid and return null To attempt to use other formats on the GPU, set [`spark.rapids.sql.incompatibleDateFormats.enabled`](configs.md#sql.incompatibleDateFormats.enabled) @@ -391,6 +397,26 @@ Formats that contain any of the following words are unsupported and will fall ba "d", "S", "SS", "SSS", "SSSS", "SSSSS", "SSSSSSSSS", "SSSSSSS", "SSSSSSSS" ``` +### LEGACY timeParserPolicy + +With timeParserPolicy set to `LEGACY` and +[`spark.rapids.sql.incompatibleDateFormats.enabled`](configs.md#sql.incompatibleDateFormats.enabled) +set to `true`, and `spark.sql.ansi.enabled` set to `false`, the following formats are supported but not +guaranteed to produce the same results as the CPU: + +- `dd-MM-yyyy` +- `dd/MM/yyyy` +- `yyyy/MM/dd` +- `yyyy-MM-dd` +- `yyyy/MM/dd HH:mm:ss` +- `yyyy-MM-dd HH:mm:ss` + +LEGACY timeParserPolicy support has the following limitations when running on the GPU: + +- Only 4 digit years are supported +- The proleptic Gregorian calendar is used instead of the hybrid Julian+Gregorian calender + that Spark uses in legacy mode + ## Formatting dates and timestamps as strings When formatting dates and timestamps as strings using functions such as `from_unixtime`, only a diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index a3b63d446d7..b05b09e9c1f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ 
b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -25,6 +25,7 @@ import com.nvidia.spark.rapids.GpuOverrides.{extractStringLit, getTimeParserPoli import com.nvidia.spark.rapids.RapidsPluginImplicits._ import org.apache.spark.sql.catalyst.expressions.{BinaryExpression, ExpectsInputTypes, Expression, ImplicitCastInputTypes, NullIntolerant, TimeZoneAwareExpression} +import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ import org.apache.spark.sql.vectorized.ColumnarBatch import org.apache.spark.unsafe.types.CalendarInterval @@ -398,7 +399,32 @@ abstract class UnixTimeExprMeta[A <: BinaryExpression with TimeZoneAwareExpressi case Some(rightLit) => sparkFormat = rightLit if (GpuOverrides.getTimeParserPolicy == LegacyTimeParserPolicy) { - willNotWorkOnGpu("legacyTimeParserPolicy LEGACY is not supported") + try { + // try and convert the format to cuDF format - this will throw an exception if + // the format contains unsupported characters or words + strfFormat = DateUtils.toStrf(sparkFormat, + expr.left.dataType == DataTypes.StringType) + // format parsed ok but we have no 100% compatible formats in LEGACY mode + if (GpuToTimestamp.LEGACY_COMPATIBLE_FORMATS.exists(_.format == sparkFormat)) { + // LEGACY support has a number of issues that mean we cannot guarantee + // compatibility with CPU + // - we can only support 4 digit years but Spark supports a wider range + // - we use a proleptic Gregorian calender but Spark uses a hybrid Julian+Gregorian + // calender in LEGACY mode + if (SQLConf.get.ansiEnabled) { + willNotWorkOnGpu("LEGACY format in ANSI mode is not supported on the GPU") + } else if (!conf.incompatDateFormats) { + willNotWorkOnGpu(s"LEGACY format '$sparkFormat' on the GPU is not guaranteed " + + s"to produce the same results as Spark on CPU. Set " + + s"${RapidsConf.INCOMPATIBLE_DATE_FORMATS.key}=true to force onto GPU.") + } + } else { + willNotWorkOnGpu(s"LEGACY format '$sparkFormat' is not supported on the GPU.") + } + } catch { + case e: TimestampFormatConversionException => + willNotWorkOnGpu(s"Failed to convert ${e.reason} ${e.getMessage}") + } } else { try { // try and convert the format to cuDF format - this will throw an exception if @@ -406,11 +432,11 @@ abstract class UnixTimeExprMeta[A <: BinaryExpression with TimeZoneAwareExpressi strfFormat = DateUtils.toStrf(sparkFormat, expr.left.dataType == DataTypes.StringType) // format parsed ok, so it is either compatible (tested/certified) or incompatible - if (!GpuToTimestamp.COMPATIBLE_FORMATS.contains(sparkFormat) && + if (!GpuToTimestamp.CORRECTED_COMPATIBLE_FORMATS.contains(sparkFormat) && !conf.incompatDateFormats) { - willNotWorkOnGpu(s"format '$sparkFormat' on the GPU is not guaranteed " + + willNotWorkOnGpu(s"CORRECTED format '$sparkFormat' on the GPU is not guaranteed " + s"to produce the same results as Spark on CPU. Set " + - s"spark.rapids.sql.incompatibleDateFormats.enabled=true to force onto GPU.") + s"${RapidsConf.INCOMPATIBLE_DATE_FORMATS.key}=true to force onto GPU.") } } catch { case e: TimestampFormatConversionException => @@ -430,8 +456,10 @@ object ExceptionTimeParserPolicy extends TimeParserPolicy object CorrectedTimeParserPolicy extends TimeParserPolicy object GpuToTimestamp extends Arm { - /** We are compatible with Spark for these formats */ - val COMPATIBLE_FORMATS = Seq( + // We are compatible with Spark for these formats when the timeParserPolicy is CORRECTED + // or EXCEPTION. 
It is possible that other formats may be supported but these are the only + // ones that we have tests for. + val CORRECTED_COMPATIBLE_FORMATS = Seq( "yyyy-MM-dd", "yyyy-MM", "yyyy/MM/dd", @@ -444,6 +472,70 @@ object GpuToTimestamp extends Arm { "dd/MM" ) + // We are compatible with Spark for these formats when the timeParserPolicy is LEGACY. It + // is possible that other formats may be supported but these are the only ones that we have + // tests for. + val LEGACY_COMPATIBLE_FORMATS = Seq( + LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false), + LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false), + LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false), + LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false), + LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true), + LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true) + ) + + /** remove whitespace before month and day */ + val REMOVE_WHITESPACE_FROM_MONTH_DAY: RegexReplace = + RegexReplace("(\\A\\d+)-([ ]*)(\\d+)-([ ]*)(\\d+)", "\\1-\\3-\\5") + + /** Regex rule to replace "yyyy-m-" with "yyyy-mm-" */ + val FIX_SINGLE_DIGIT_MONTH: RegexReplace = + RegexReplace("(\\A\\d+)-(\\d{1}-)", "\\1-0\\2") + + /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ + val FIX_SINGLE_DIGIT_DAY_1: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2})-(\\d{1}\\D+)", "\\1-0\\2") + + /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ + val FIX_SINGLE_DIGIT_DAY_2: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2})-(\\d{1})\\Z", "\\1-0\\2") + + /** Regex rule to replace "yyyy-mm-dd h-" with "yyyy-mm-dd hh-" */ + val FIX_SINGLE_DIGIT_HOUR_1: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2}) (\\d{1}:)", "\\1 0\\2") + + /** Regex rule to replace "yyyy-mm-ddTh-" with "yyyy-mm-ddThh-" */ + val FIX_SINGLE_DIGIT_HOUR_2: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2})T(\\d{1}:)", "\\1T0\\2") + + /** Regex rule to replace "yyyy-mm-dd[ T]hh-m-" with "yyyy-mm-dd[ T]hh-mm-" */ + val FIX_SINGLE_DIGIT_MINUTE: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}):(\\d{1}:)", "\\1:0\\2") + + /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ + val FIX_SINGLE_DIGIT_SECOND_1: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}):(\\d{1}\\D+)", "\\1:0\\2") + + /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ + val FIX_SINGLE_DIGIT_SECOND_2: RegexReplace = + RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}):(\\d{1})\\Z", "\\1:0\\2") + + /** Convert dates to standard format */ + val FIX_DATES = Seq( + REMOVE_WHITESPACE_FROM_MONTH_DAY, + FIX_SINGLE_DIGIT_MONTH, + FIX_SINGLE_DIGIT_DAY_1, + FIX_SINGLE_DIGIT_DAY_2) + + /** Convert timestamps to standard format */ + val FIX_TIMESTAMPS = Seq( + FIX_SINGLE_DIGIT_HOUR_1, + FIX_SINGLE_DIGIT_HOUR_2, + FIX_SINGLE_DIGIT_MINUTE, + FIX_SINGLE_DIGIT_SECOND_1, + FIX_SINGLE_DIGIT_SECOND_2 + ) + def daysScalarSeconds(name: String): Scalar = { Scalar.timestampFromLong(DType.TIMESTAMP_SECONDS, DateUtils.specialDatesSeconds(name)) } @@ -459,7 +551,7 @@ object GpuToTimestamp extends Arm { } def isTimestamp(col: ColumnVector, sparkFormat: String, strfFormat: String) : ColumnVector = { - if (COMPATIBLE_FORMATS.contains(sparkFormat)) { + if (CORRECTED_COMPATIBLE_FORMATS.contains(sparkFormat)) { // the cuDF `is_timestamp` function is less restrictive than Spark's behavior for UnixTime // and ToUnixTime and will support parsing a subset of a string so we check the length of // the string as well 
which works well for fixed-length formats but if/when we want to @@ -489,12 +581,10 @@ object GpuToTimestamp extends Arm { daysScalar: String => Scalar, asTimestamp: (ColumnVector, String) => ColumnVector): ColumnVector = { - val isTimestamp = GpuToTimestamp.isTimestamp(lhs.getBase, sparkFormat, strfFormat) - // in addition to date/timestamp strings, we also need to check for special dates and null // values, since anything else is invalid and should throw an error or be converted to null // depending on the policy - withResource(isTimestamp) { isTimestamp => + withResource(isTimestamp(lhs.getBase, sparkFormat, strfFormat)) { isTimestamp => withResource(daysEqual(lhs.getBase, DateUtils.EPOCH)) { isEpoch => withResource(daysEqual(lhs.getBase, DateUtils.NOW)) { isNow => withResource(daysEqual(lhs.getBase, DateUtils.TODAY)) { isToday => @@ -534,8 +624,103 @@ object GpuToTimestamp extends Arm { } } } + + /** + * Parse string to timestamp when timeParserPolicy is LEGACY. This was the default behavior + * prior to Spark 3.0 + */ + def parseStringAsTimestampWithLegacyParserPolicy( + lhs: GpuColumnVector, + sparkFormat: String, + strfFormat: String, + dtype: DType, + asTimestamp: (ColumnVector, String) => ColumnVector): ColumnVector = { + + val format = LEGACY_COMPATIBLE_FORMATS.find(_.format == sparkFormat) + .getOrElse(throw new IllegalStateException(s"Unsupported format $sparkFormat")) + + val regexReplaceRules = if (format.isTimestamp) { + FIX_DATES ++ FIX_TIMESTAMPS + } else { + FIX_DATES + } + + val rulesWithSeparator = format.separator match { + case '/' => + regexReplaceRules.map { + case RegexReplace(pattern, backref) => + RegexReplace(pattern.replace('-', '/'), backref.replace('-', '/')) + } + case '-' => + regexReplaceRules + } + + val fixedUp = rulesWithSeparator + .foldLeft(rejectLeadingNewlineThenStrip(lhs))((cv, regexRule) => { + withResource(cv) { + _.stringReplaceWithBackrefs(regexRule.search, regexRule.replace) + } + }) + + if (format.isTimestamp) { + // special handling to ignore "yyyy-mm-dd hh:mm:" which Spark treats + // as null but cuDF supports + val timestampFormatNoSeconds = "\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\Z" + val formatWithSeparator = format.separator match { + case '/' => timestampFormatNoSeconds.replace('-', '/') + case '-' => timestampFormatNoSeconds + } + withResource(Scalar.fromNull(dtype)) { nullValue => + withResource(fixedUp.matchesRe(formatWithSeparator)) { hasNoSeconds => + withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => + hasNoSeconds.ifElse(nullValue, timestamp) + } + } + } + } else { + asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp) + } + } + + /** + * Filter out strings that have a newline before the first non-whitespace character + * and then strip all leading and trailing whitespace. + */ + private def rejectLeadingNewlineThenStrip(lhs: GpuColumnVector) = { + withResource(lhs.getBase.matchesRe("\\A[ \\t]*[\\n]+")) { hasLeadingNewline => + withResource(Scalar.fromNull(DType.STRING)) { nullValue => + withResource(lhs.getBase.strip()) { stripped => + hasLeadingNewline.ifElse(nullValue, stripped) + } + } + } + } + + /** + * Parse a string column to timestamp. 
+ */ + def asTimestampOrNull( + cv: ColumnVector, + dtype: DType, + strfFormat: String, + asTimestamp: (ColumnVector, String) => ColumnVector): ColumnVector = { + withResource(cv) { _ => + withResource(Scalar.fromNull(dtype)) { nullValue => + withResource(cv.isTimestamp(strfFormat)) { isTimestamp => + withResource(asTimestamp(cv, strfFormat)) { timestamp => + isTimestamp.ifElse(timestamp, nullValue) + } + } + } + } + } + } +case class LegacyParseFormat(format: String, separator: Char, isTimestamp: Boolean) + +case class RegexReplace(search: String, replace: String) + /** * A direct conversion of Spark's ToTimestamp class which converts time to UNIX timestamp by * first converting to microseconds and then dividing by the downScaleFactor @@ -572,13 +757,22 @@ abstract class GpuToTimestamp override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { val tmp = if (lhs.dataType == StringType) { // rhs is ignored we already parsed the format - parseStringAsTimestamp( - lhs, - sparkFormat, - strfFormat, - DType.TIMESTAMP_MICROSECONDS, - daysScalarMicros, - (col, strfFormat) => col.asTimestampMicroseconds(strfFormat)) + if (getTimeParserPolicy == LegacyTimeParserPolicy) { + parseStringAsTimestampWithLegacyParserPolicy( + lhs, + sparkFormat, + strfFormat, + DType.TIMESTAMP_MICROSECONDS, + (col, strfFormat) => col.asTimestampMicroseconds(strfFormat)) + } else { + parseStringAsTimestamp( + lhs, + sparkFormat, + strfFormat, + DType.TIMESTAMP_MICROSECONDS, + daysScalarMicros, + (col, strfFormat) => col.asTimestampMicroseconds(strfFormat)) + } } else { // Timestamp or DateType lhs.getBase.asTimestampMicroseconds() } @@ -614,13 +808,22 @@ abstract class GpuToTimestampImproved extends GpuToTimestamp { override def doColumnar(lhs: GpuColumnVector, rhs: GpuScalar): ColumnVector = { val tmp = if (lhs.dataType == StringType) { // rhs is ignored we already parsed the format - parseStringAsTimestamp( - lhs, - sparkFormat, - strfFormat, - DType.TIMESTAMP_SECONDS, - daysScalarSeconds, - (col, strfFormat) => col.asTimestampSeconds(strfFormat)) + if (getTimeParserPolicy == LegacyTimeParserPolicy) { + parseStringAsTimestampWithLegacyParserPolicy( + lhs, + sparkFormat, + strfFormat, + DType.TIMESTAMP_SECONDS, + (col, strfFormat) => col.asTimestampSeconds(strfFormat)) + } else { + parseStringAsTimestamp( + lhs, + sparkFormat, + strfFormat, + DType.TIMESTAMP_SECONDS, + daysScalarSeconds, + (col, strfFormat) => col.asTimestampSeconds(strfFormat)) + } } else if (lhs.dataType() == DateType){ lhs.getBase.asTimestampSeconds() } else { // Timestamp diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/CudfTestHelper.scala b/tests/src/test/scala/com/nvidia/spark/rapids/CudfTestHelper.scala new file mode 100644 index 00000000000..bd18b229282 --- /dev/null +++ b/tests/src/test/scala/com/nvidia/spark/rapids/CudfTestHelper.scala @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids + +import ai.rapids.cudf.{ColumnView, DType, HostColumnVector, HostColumnVectorCore} +import org.junit.jupiter.api.Assertions.{assertArrayEquals, assertEquals} + +/** + * Convenience methods for testing cuDF calls directly. This code is largely copied + * from the cuDF Java test suite. + */ +object CudfTestHelper { + + /** + * Checks and asserts that passed in columns match + * + * @param expect The expected result column + * @param cv The input column + */ + def assertColumnsAreEqual(expect: ColumnView, cv: ColumnView): Unit = { + assertColumnsAreEqual(expect, cv, colName = "unnamed") + } + + /** + * Checks and asserts that passed in columns match + * + * @param expected The expected result column + * @param cv The input column + * @param colName The name of the column + */ + def assertColumnsAreEqual(expected: ColumnView, cv: ColumnView, colName: String): Unit = { + assertPartialColumnsAreEqual(expected, 0, expected.getRowCount, cv, colName, + enableNullCheck = true) + } + + /** + * Checks and asserts that passed in host columns match + * + * @param expected The expected result host column + * @param cv The input host column + * @param colName The name of the host column + */ + def assertColumnsAreEqual(expected: HostColumnVector, + cv: HostColumnVector, colName: String): Unit = { + assertPartialColumnsAreEqual(expected, 0, + expected.getRowCount, cv, colName, enableNullCheck = true) + } + + def assertPartialColumnsAreEqual( + expected: ColumnView, + rowOffset: Long, + length: Long, + cv: ColumnView, + colName: String, + enableNullCheck: Boolean): Unit = { + try { + val hostExpected = expected.copyToHost + val hostcv = cv.copyToHost + try assertPartialColumnsAreEqual(hostExpected, rowOffset, length, + hostcv, colName, enableNullCheck) + finally { + if (hostExpected != null) hostExpected.close() + if (hostcv != null) hostcv.close() + } + } + } + + def assertPartialColumnsAreEqual( + expected: HostColumnVectorCore, + rowOffset: Long, length: Long, + cv: HostColumnVectorCore, + colName: String, enableNullCheck: Boolean): Unit = { + assertEquals(expected.getType, cv.getType, "Type For Column " + colName) + assertEquals(length, cv.getRowCount, "Row Count For Column " + colName) + assertEquals(expected.getNumChildren, cv.getNumChildren, "Child Count for Column " + colName) + if (enableNullCheck) assertEquals(expected.getNullCount, + cv.getNullCount, "Null Count For Column " + colName) + else { + // TODO add in a proper check when null counts are + // supported by serializing a partitioned column + } + + import ai.rapids.cudf.DType.DTypeEnum._ + + val `type`: DType = expected.getType + for (expectedRow <- rowOffset until (rowOffset + length)) { + val tableRow: Long = expectedRow - rowOffset + assertEquals(expected.isNull(expectedRow), cv.isNull(tableRow), + "NULL for Column " + colName + " Row " + tableRow) + if (!expected.isNull(expectedRow)) `type`.getTypeId match { + case BOOL8 | INT8 | UINT8 => + assertEquals(expected.getByte(expectedRow), cv.getByte(tableRow), + "Column " + colName + " Row " + tableRow) + + case INT16 | UINT16 => + assertEquals(expected.getShort(expectedRow), cv.getShort(tableRow), + "Column " + colName + " Row " + tableRow) + + case INT32 | UINT32 | TIMESTAMP_DAYS | DURATION_DAYS | DECIMAL32 => + assertEquals(expected.getInt(expectedRow), cv.getInt(tableRow), + "Column " + colName + " Row " + tableRow) + + case INT64 | UINT64 | + DURATION_MICROSECONDS | DURATION_MILLISECONDS | DURATION_NANOSECONDS | + DURATION_SECONDS | 
TIMESTAMP_MICROSECONDS | TIMESTAMP_MILLISECONDS | + TIMESTAMP_NANOSECONDS | TIMESTAMP_SECONDS | DECIMAL64 => + assertEquals(expected.getLong(expectedRow), cv.getLong(tableRow), + "Column " + colName + " Row " + tableRow) + + case STRING => + assertArrayEquals(expected.getUTF8(expectedRow), cv.getUTF8(tableRow), + "Column " + colName + " Row " + tableRow) + + case _ => + throw new IllegalArgumentException(`type` + " is not supported yet") + } + } + } + + +} diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index ba55a2daca1..14d2715746d 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -16,20 +16,28 @@ package com.nvidia.spark.rapids +import ai.rapids.cudf.ColumnVector import java.sql.{Date, Timestamp} - -import scala.collection.mutable.ListBuffer - import org.scalatest.BeforeAndAfterEach +import scala.collection.mutable.ListBuffer import org.apache.spark.SparkConf -import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.functions.{col, to_date, to_timestamp, unix_timestamp} import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY_1, FIX_SINGLE_DIGIT_DAY_2, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND_1, FIX_SINGLE_DIGIT_SECOND_2, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} +import org.apache.spark.sql.rapids.RegexReplace class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterEach { + private val CORRECTED_TIME_PARSER_POLICY: SparkConf = new SparkConf() + .set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED") + + private val LEGACY_TIME_PARSER_POLICY_CONF: SparkConf = new SparkConf() + .set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "LEGACY") + .set(RapidsConf.INCOMPATIBLE_DATE_FORMATS.key, "true") + override def beforeEach() { GpuOverrides.removeAllListeners() } @@ -51,61 +59,79 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE testSparkResultsAreEqual("to_date yyyy-MM-dd", datesAsStrings, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_date(col("c0"), "yyyy-MM-dd")) } + testSparkResultsAreEqual("to_date yyyy-MM-dd LEGACY", + datesAsStrings, + conf = LEGACY_TIME_PARSER_POLICY_CONF) { + df => df.withColumn("c1", to_date(col("c0"), "yyyy-MM-dd")) + } + + testSparkResultsAreEqual("to_date yyyy/MM/dd LEGACY", + datesAsStrings, + conf = LEGACY_TIME_PARSER_POLICY_CONF) { + df => df.withColumn("c1", to_date(col("c0"), "yyyy/MM/dd")) + } + testSparkResultsAreEqual("to_date dd/MM/yyyy", datesAsStrings, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { + df => df.withColumn("c1", to_date(col("c0"), "dd/MM/yyyy")) + } + + testSparkResultsAreEqual("to_date dd/MM/yyyy LEGACY", + datesAsStrings, + conf = LEGACY_TIME_PARSER_POLICY_CONF) { df => df.withColumn("c1", to_date(col("c0"), "dd/MM/yyyy")) } testSparkResultsAreEqual("to_date parse date", dates, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => 
df.withColumn("c1", to_date(col("c0"), "yyyy-MM-dd")) } testSparkResultsAreEqual("to_date parse timestamp", timestamps, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_date(col("c0"), "yyyy-MM-dd")) } testSparkResultsAreEqual("to_timestamp yyyy-MM-dd", timestampsAsStrings, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_timestamp(col("c0"), "yyyy-MM-dd")) } testSparkResultsAreEqual("to_timestamp dd/MM/yyyy", timestampsAsStrings, - conf = new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + conf = CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_timestamp(col("c0"), "dd/MM/yyyy")) } testSparkResultsAreEqual("to_date default pattern", datesAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", to_date(col("c0"))) } testSparkResultsAreEqual("unix_timestamp parse date", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd")) } testSparkResultsAreEqual("unix_timestamp parse yyyy/MM", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy/MM")) } testSparkResultsAreEqual("to_unix_timestamp parse yyyy/MM", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => { df.createOrReplaceTempView("df") df.sqlContext.sql("SELECT c0, to_unix_timestamp(c0, 'yyyy/MM') FROM df") @@ -124,10 +150,22 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE testSparkResultsAreEqual("unix_timestamp parse timestamp", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { + df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd HH:mm:ss")) + } + + testSparkResultsAreEqual("unix_timestamp parse yyyy-MM-dd HH:mm:ss LEGACY", + timestampsAsStrings, + LEGACY_TIME_PARSER_POLICY_CONF) { df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd HH:mm:ss")) } + testSparkResultsAreEqual("unix_timestamp parse yyyy/MM/dd HH:mm:ss LEGACY", + timestampsAsStrings, + LEGACY_TIME_PARSER_POLICY_CONF) { + df => df.withColumn("c1", unix_timestamp(col("c0"), "yyyy/MM/dd HH:mm:ss")) + } + testSparkResultsAreEqual("unix_timestamp parse timestamp millis (fall back to CPU)", timestampsAsStrings, new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED") @@ -138,17 +176,18 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE testSparkResultsAreEqual("unix_timestamp parse timestamp default pattern", timestampsAsStrings, - new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) { + CORRECTED_TIME_PARSER_POLICY) { df => df.withColumn("c1", unix_timestamp(col("c0"))) } - test("fall back to CPU when policy is LEGACY") { + test("fall back to CPU when policy is LEGACY and unsupported format is used") { val e = intercept[IllegalArgumentException] { val df = withGpuSparkSession(spark => { + val formatString = "u" // we do not support this legacy format on GPU timestampsAsStrings(spark) 
.repartition(2) - .withColumn("c1", unix_timestamp(col("c0"), "yyyy-MM-dd HH:mm:ss")) - }, new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "LEGACY")) + .withColumn("c1", unix_timestamp(col("c0"), formatString)) + }, LEGACY_TIME_PARSER_POLICY_CONF) df.collect() } assert(e.getMessage.contains( @@ -169,7 +208,7 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE datesAsStrings(spark) .repartition(2) .withColumn("c1", to_date(col("c0"), "F")) - }, new SparkConf().set(SQLConf.LEGACY_TIME_PARSER_POLICY.key, "CORRECTED")) + }, CORRECTED_TIME_PARSER_POLICY) df.collect() } assert(e.getMessage.contains( @@ -198,6 +237,161 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE assert(cpuNowSeconds <= gpuNowSeconds) } + test("Regex: Remove whitespace from month and day") { + testRegex(REMOVE_WHITESPACE_FROM_MONTH_DAY, + Seq("1- 1-1", "1-1- 1", "1- 1- 1", null), + Seq("1-1-1", "1-1-1", "1-1-1", null)) + } + + test("Regex: Fix single digit month") { + testRegex(FIX_SINGLE_DIGIT_MONTH, + Seq("1-2-3", "1111-2-3", null), + Seq("1-02-3", "1111-02-3", null)) + } + + test("Regex: Fix single digit day followed by non digit char") { + // single digit day followed by non digit + testRegex(FIX_SINGLE_DIGIT_DAY_1, + Seq("1111-02-3 ", "1111-02-3:", null), + Seq("1111-02-03 ", "1111-02-03:", null)) + } + + test("Regex: Fix single digit day at end of string") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_DAY_2, + Seq("1-02-3", "1111-02-3", "1111-02-03", null), + Seq("1-02-03", "1111-02-03", "1111-02-03", null)) + } + + test("Regex: Fix single digit hour 1") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_HOUR_1, + Seq("2001-12-31 1:2:3", "2001-12-31 1:22:33", null), + Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null)) + } + + test("Regex: Fix single digit hour 2") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_HOUR_2, + Seq("2001-12-31T1:2:3", "2001-12-31T1:22:33", null), + Seq("2001-12-31T01:2:3", "2001-12-31T01:22:33", null)) + } + + test("Regex: Fix single digit minute") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_MINUTE, + Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null), + Seq("2001-12-31 01:02:3", "2001-12-31 01:22:33", null)) + } + + test("Regex: Fix single digit second 1") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_SECOND_1, + Seq("2001-12-31 01:02:3:", "2001-12-31 01:22:33:", null), + Seq("2001-12-31 01:02:03:", "2001-12-31 01:22:33:", null)) + } + + test("Regex: Fix single digit second 2") { + // single digit day at end of string + testRegex(FIX_SINGLE_DIGIT_SECOND_2, + Seq("2001-12-31 01:02:3", "2001-12-31 01:22:33", null), + Seq("2001-12-31 01:02:03", "2001-12-31 01:22:33", null)) + } + + test("Regex: Apply all date rules") { + // end to end test of all date rules being applied in sequence + val testPairs = Seq( + ("2001- 1-1", "2001-01-01"), + ("2001-1- 1", "2001-01-01"), + ("2001- 1- 1", "2001-01-01"), + ("1999-12-31", "1999-12-31"), + ("1999-2-31", "1999-02-31"), + ("1999-2-3:", "1999-02-03:"), + ("1999-2-3", "1999-02-03"), + ("1999-2-3 1:2:3.4", "1999-02-03 1:2:3.4"), + ("1999-2-3T1:2:3.4", "1999-02-03T1:2:3.4") + ) + val values = testPairs.map(_._1) + val expected = testPairs.map(_._2) + withResource(ColumnVector.fromStrings(values: _*)) { v => + withResource(ColumnVector.fromStrings(expected: _*)) { expected => + val actual = FIX_DATES.foldLeft(v.incRefCount())((a, b) => { + withResource(a) { + 
_.stringReplaceWithBackrefs(b.search, b.replace) + } + }) + withResource(actual) { _ => + CudfTestHelper.assertColumnsAreEqual(expected, actual) + } + } + } + } + + test("Regex: Apply all date and timestamp rules") { + // end to end test of all date and timestamp rules being applied in sequence + val testPairs = Seq( + ("2001- 1-1", "2001-01-01"), + ("2001-1- 1", "2001-01-01"), + ("2001- 1- 1", "2001-01-01"), + ("1999-12-31", "1999-12-31"), + ("1999-2-31", "1999-02-31"), + ("1999-2-3:", "1999-02-03:"), + ("1999-2-3", "1999-02-03"), + ("1999-2-3 1:2:3.4", "1999-02-03 01:02:03.4"), + ("1999-2-3T1:2:3.4", "1999-02-03T01:02:03.4") + ) + val values = testPairs.map(_._1) + val expected = testPairs.map(_._2) + withResource(ColumnVector.fromStrings(values: _*)) { v => + withResource(ColumnVector.fromStrings(expected: _*)) { expected => + val actual = (FIX_DATES ++ FIX_TIMESTAMPS).foldLeft(v.incRefCount())((a, b) => { + withResource(a) { + _.stringReplaceWithBackrefs(b.search, b.replace) + } + }) + withResource(actual) { _ => + CudfTestHelper.assertColumnsAreEqual(expected, actual) + } + } + } + } + + private def testRegex(rule: RegexReplace, values: Seq[String], expected: Seq[String]): Unit = { + withResource(ColumnVector.fromStrings(values: _*)) { v => + withResource(ColumnVector.fromStrings(expected: _*)) { expected => + withResource(v.stringReplaceWithBackrefs(rule.search, rule.replace)) { actual => + CudfTestHelper.assertColumnsAreEqual(expected, actual) + } + } + } + } + + // just show the failures so we don't have to manually parse all + // the output to find which ones failed + override def compareResults( + sort: Boolean, + maxFloatDiff: Double, + fromCpu: Array[Row], + fromGpu: Array[Row]): Unit = { + assert(fromCpu.length === fromGpu.length) + + val failures = fromCpu.zip(fromGpu).zipWithIndex.filterNot { + case ((cpu, gpu), _) => super.compare(cpu, gpu, 0.0001) + } + + if (failures.nonEmpty) { + val str = failures.map { + case ((cpu, gpu), i) => + s""" + |[#$i] CPU: $cpu + |[#$i] GPU: $gpu + | + |""".stripMargin + }.mkString("\n") + fail(s"Mismatch between CPU and GPU for the following rows:\n$str") + } + } + private def dates(spark: SparkSession) = { import spark.implicits._ dateValues.toDF("c0") @@ -225,12 +419,33 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE values.toDF("c0") } + private val singleDigits = Seq("1999-1-1 ", + "1999-1-1 11", + "1999-1-1 11:", + "1999-1-1 11:5", + "1999-1-1 11:59", + "1999-1-1 11:59:", + "1999-1-1 11:59:5", + "1999-1-1 11:59:59", + "1999-1-1", + "1999-1-1 ", + "1999-1-1 1", + "1999-1-1 1:", + "1999-1-1 1:2", + "1999-1-1 1:2:", + "1999-1-1 1:2:3", + "1999-1-1 1:2:3.", + "1999-1-1 1:12:3.", + "1999-1-1 11:12:3.", + "1999-1-1 11:2:3.", + "1999-1-1 11:2:13.", + "1999-1-1 1:2:3.4") + private val timestampValues = Seq( "", "null", null, "\n", - "1999-12-31 ", "1999-12-31 11", "1999-12-31 11:", "1999-12-31 11:5", @@ -238,12 +453,23 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE "1999-12-31 11:59:", "1999-12-31 11:59:5", "1999-12-31 11:59:59", + " 1999-12-31 11:59:59", + "\t1999-12-31 11:59:59", + "\t1999-12-31 11:59:59\n", "1999-12-31 11:59:59.", "1999-12-31 11:59:59.9", + " 1999-12-31 11:59:59.9", "1999-12-31 11:59:59.99", "1999-12-31 11:59:59.999", + "1999-12-31 11:59:59.9999", + "1999-12-31 11:59:59.99999", + "1999-12-31 11:59:59.999999", + "1999-12-31 11:59:59.9999999", + "1999-12-31 11:59:59.99999999", + "1999-12-31 11:59:59.999999999", "31/12/1999", "31/12/1999 11:59:59.999", + 
"1999-12-3", "1999-12-31", "1999/12/31", "1999-12", @@ -251,11 +477,13 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE "1975/06", "1975/06/18", "1975/06/18 06:48:57", - "1999-12-31\n", - "\t1999-12-31", - "\n1999-12-31", - "1999/12/31" - ) + "1999-12-29\n", + "\t1999-12-30", + " \n1999-12-31", + "1999/12/31", + "2001- 1-1", + "2001-1- 1", + "2001- 1- 1") ++ singleDigits ++ singleDigits.map(_.replace('-', '/')) private val dateValues = Seq( Date.valueOf("2020-07-24"), @@ -267,5 +495,6 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE Timestamp.valueOf("2015-07-25 02:02:02.2"), Timestamp.valueOf("1999-12-31 11:59:59.999") ) + } From fac5b01cd4192b271660ea6cbc40608ebf931d54 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Wed, 7 Jul 2021 16:01:07 -0600 Subject: [PATCH 2/8] Validate final string before conversion and handle more edge cases found during manual fuzzing Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 44 +++++++++++-------- .../spark/rapids/ParseDateTimeSuite.scala | 19 +++++--- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index b05b09e9c1f..c6b9428a6e5 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -476,17 +476,23 @@ object GpuToTimestamp extends Arm { // is possible that other formats may be supported but these are the only ones that we have // tests for. val LEGACY_COMPATIBLE_FORMATS = Seq( - LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false), - LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false), - LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false), - LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false), - LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true), - LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true) + LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false, + "\\A\\d{4}-\\d{2}-\\d{2}(\\D|\\s|\\Z)"), + LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false, + "\\A\\d{4}/\\d{2}/\\d{2}(\\D|\\s|\\Z)"), + LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false, + "\\A\\d{2}-\\d{2}-\\d{4}(\\D|\\s|\\Z)"), + LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false, + "\\A\\d{2}/\\d{2}/\\d{4}(\\D|\\s|\\Z)"), + LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true, + "\\A\\d{4}-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(\\D|\\s|\\Z)"), + LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true, + "\\A\\d{4}/\\d{2}/\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(\\D|\\s|\\Z)") ) /** remove whitespace before month and day */ val REMOVE_WHITESPACE_FROM_MONTH_DAY: RegexReplace = - RegexReplace("(\\A\\d+)-([ ]*)(\\d+)-([ ]*)(\\d+)", "\\1-\\3-\\5") + RegexReplace("(\\A\\d+)-([ \\t]*)(\\d+)-([ \\t]*)(\\d+)", "\\1-\\3-\\5") /** Regex rule to replace "yyyy-m-" with "yyyy-mm-" */ val FIX_SINGLE_DIGIT_MONTH: RegexReplace = @@ -494,7 +500,7 @@ object GpuToTimestamp extends Arm { /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ val FIX_SINGLE_DIGIT_DAY_1: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2})-(\\d{1}\\D+)", "\\1-0\\2") + RegexReplace("(\\A\\d+-\\d{2})-(\\d{1}[\\D\\s]+)", "\\1-0\\2") /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ val FIX_SINGLE_DIGIT_DAY_2: RegexReplace = @@ -663,22 +669,21 @@ object 
GpuToTimestamp extends Arm { }) if (format.isTimestamp) { - // special handling to ignore "yyyy-mm-dd hh:mm:" which Spark treats - // as null but cuDF supports - val timestampFormatNoSeconds = "\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\Z" - val formatWithSeparator = format.separator match { - case '/' => timestampFormatNoSeconds.replace('-', '/') - case '-' => timestampFormatNoSeconds - } withResource(Scalar.fromNull(dtype)) { nullValue => - withResource(fixedUp.matchesRe(formatWithSeparator)) { hasNoSeconds => + withResource(fixedUp.matchesRe(format.validRegex)) { hasNoSeconds => withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => - hasNoSeconds.ifElse(nullValue, timestamp) + hasNoSeconds.ifElse(timestamp, nullValue) } } } } else { - asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp) + withResource(Scalar.fromNull(dtype)) { nullValue => + withResource(fixedUp.matchesRe(format.validRegex)) { isValidDate => + withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => + isValidDate.ifElse(timestamp, nullValue) + } + } + } } } @@ -717,7 +722,8 @@ object GpuToTimestamp extends Arm { } -case class LegacyParseFormat(format: String, separator: Char, isTimestamp: Boolean) +case class LegacyParseFormat(format: String, separator: Char, isTimestamp: Boolean, + validRegex: String) case class RegexReplace(search: String, replace: String) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index 14d2715746d..06ab169748f 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -245,22 +245,22 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE test("Regex: Fix single digit month") { testRegex(FIX_SINGLE_DIGIT_MONTH, - Seq("1-2-3", "1111-2-3", null), - Seq("1-02-3", "1111-02-3", null)) + Seq("1-2-3", "1111-2-3", "2000-7-7\n9\t8568:\n", null), + Seq("1-02-3", "1111-02-3", "2000-07-7\n9\t8568:\n", null)) } test("Regex: Fix single digit day followed by non digit char") { // single digit day followed by non digit testRegex(FIX_SINGLE_DIGIT_DAY_1, - Seq("1111-02-3 ", "1111-02-3:", null), - Seq("1111-02-03 ", "1111-02-03:", null)) + Seq("1111-02-3 ", "1111-02-3:", "2000-03-192", "2000-07-7\n9\t8568:\n", null), + Seq("1111-02-03 ", "1111-02-03:", "2000-03-192", "2000-07-07\n9\t8568:\n", null)) } test("Regex: Fix single digit day at end of string") { // single digit day at end of string testRegex(FIX_SINGLE_DIGIT_DAY_2, - Seq("1-02-3", "1111-02-3", "1111-02-03", null), - Seq("1-02-03", "1111-02-03", "1111-02-03", null)) + Seq("1-02-3", "1111-02-3", "1111-02-03", "2000-03-192", null), + Seq("1-02-03", "1111-02-03", "1111-02-03", "2000-03-192", null)) } test("Regex: Fix single digit hour 1") { @@ -483,7 +483,12 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE "1999/12/31", "2001- 1-1", "2001-1- 1", - "2001- 1- 1") ++ singleDigits ++ singleDigits.map(_.replace('-', '/')) + "2001- 1- 1", + "2000-3-192", + // interesting test cases from fuzzing that original triggered differences between CPU and GPU + "2000-1- 099\n305\n 7-390--.0:-", + "2000-7-7\n9\t8568:\n", + "2000-\t5-2.7.44584.9935") ++ singleDigits ++ singleDigits.map(_.replace('-', '/')) private val dateValues = Seq( Date.valueOf("2020-07-24"), From 6f50d9f4b0ed4336b5b483e0467af42193a96465 Mon Sep 17 00:00:00 2001 From: 
Andy Grove Date: Wed, 7 Jul 2021 16:23:43 -0600 Subject: [PATCH 3/8] Remove duplicate code and add more code comments Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 25 ++++++++----------- 1 file changed, 11 insertions(+), 14 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index c6b9428a6e5..fe0c565082f 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -645,12 +645,16 @@ object GpuToTimestamp extends Arm { val format = LEGACY_COMPATIBLE_FORMATS.find(_.format == sparkFormat) .getOrElse(throw new IllegalStateException(s"Unsupported format $sparkFormat")) + // optimization to apply only the necessary rules depending on whether we are + // parsing to a date or timestamp val regexReplaceRules = if (format.isTimestamp) { FIX_DATES ++ FIX_TIMESTAMPS } else { FIX_DATES } + // we support date formats using either `-` or `/` to separate year, month, and day and the + // regex rules are written with '-' so we need to replace '-' with '/' here as necessary val rulesWithSeparator = format.separator match { case '/' => regexReplaceRules.map { @@ -661,6 +665,7 @@ object GpuToTimestamp extends Arm { regexReplaceRules } + // apply each rule in turn to the data val fixedUp = rulesWithSeparator .foldLeft(rejectLeadingNewlineThenStrip(lhs))((cv, regexRule) => { withResource(cv) { @@ -668,20 +673,12 @@ object GpuToTimestamp extends Arm { } }) - if (format.isTimestamp) { - withResource(Scalar.fromNull(dtype)) { nullValue => - withResource(fixedUp.matchesRe(format.validRegex)) { hasNoSeconds => - withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => - hasNoSeconds.ifElse(timestamp, nullValue) - } - } - } - } else { - withResource(Scalar.fromNull(dtype)) { nullValue => - withResource(fixedUp.matchesRe(format.validRegex)) { isValidDate => - withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => - isValidDate.ifElse(timestamp, nullValue) - } + // check the final value against a regex to determine if it is valid or not, so we produce + // null values for any invalid inputs + withResource(Scalar.fromNull(dtype)) { nullValue => + withResource(fixedUp.matchesRe(format.validRegex)) { isValidDate => + withResource(asTimestampOrNull(fixedUp, dtype, strfFormat, asTimestamp)) { timestamp => + isValidDate.ifElse(timestamp, nullValue) } } } From a9d11f06bace74a7a1ef4a52dafc8c531a29a376 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Jul 2021 15:19:09 -0600 Subject: [PATCH 4/8] Use raw strings for improved readability Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 30 +++++++++---------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index fe0c565082f..0af87547397 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -477,54 +477,54 @@ object GpuToTimestamp extends Arm { // tests for. 
val LEGACY_COMPATIBLE_FORMATS = Seq( LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false, - "\\A\\d{4}-\\d{2}-\\d{2}(\\D|\\s|\\Z)"), + raw"\A\d{4}-\d{2}-\d{2}(\D|\s|\Z)"), LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false, - "\\A\\d{4}/\\d{2}/\\d{2}(\\D|\\s|\\Z)"), + raw"\A\d{4}/\d{2}/\d{2}(\D|\s|\Z)"), LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false, - "\\A\\d{2}-\\d{2}-\\d{4}(\\D|\\s|\\Z)"), + raw"\A\d{2}-\d{2}-\d{4}(\D|\s|\Z)"), LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false, - "\\A\\d{2}/\\d{2}/\\d{4}(\\D|\\s|\\Z)"), + raw"\A\d{2}/\d{2}/\d{4}(\D|\s|\Z)"), LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true, - "\\A\\d{4}-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(\\D|\\s|\\Z)"), + raw"\A\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)"), LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true, - "\\A\\d{4}/\\d{2}/\\d{2}[ T]\\d{2}:\\d{2}:\\d{2}(\\D|\\s|\\Z)") + raw"\A\d{4}/\d{2}/\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)") ) /** remove whitespace before month and day */ val REMOVE_WHITESPACE_FROM_MONTH_DAY: RegexReplace = - RegexReplace("(\\A\\d+)-([ \\t]*)(\\d+)-([ \\t]*)(\\d+)", "\\1-\\3-\\5") + RegexReplace(raw"(\A\d+)-([ \t]*)(\d+)-([ \t]*)(\d+)", raw"\1-\3-\5") /** Regex rule to replace "yyyy-m-" with "yyyy-mm-" */ val FIX_SINGLE_DIGIT_MONTH: RegexReplace = - RegexReplace("(\\A\\d+)-(\\d{1}-)", "\\1-0\\2") + RegexReplace(raw"(\A\d+)-(\d{1}-)", raw"\1-0\2") /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ val FIX_SINGLE_DIGIT_DAY_1: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2})-(\\d{1}[\\D\\s]+)", "\\1-0\\2") + RegexReplace(raw"(\A\d+-\d{2})-(\d{1}[\D\s]+)", raw"\1-0\2") /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ val FIX_SINGLE_DIGIT_DAY_2: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2})-(\\d{1})\\Z", "\\1-0\\2") + RegexReplace(raw"(\A\d+-\d{2})-(\d{1})\Z", raw"\1-0\2") /** Regex rule to replace "yyyy-mm-dd h-" with "yyyy-mm-dd hh-" */ val FIX_SINGLE_DIGIT_HOUR_1: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2}) (\\d{1}:)", "\\1 0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2}) (\d{1}:)", raw"\1 0\2") /** Regex rule to replace "yyyy-mm-ddTh-" with "yyyy-mm-ddThh-" */ val FIX_SINGLE_DIGIT_HOUR_2: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2})T(\\d{1}:)", "\\1T0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2})T(\d{1}:)", raw"\1T0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh-m-" with "yyyy-mm-dd[ T]hh-mm-" */ val FIX_SINGLE_DIGIT_MINUTE: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}):(\\d{1}:)", "\\1:0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}):(\d{1}:)", raw"\1:0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ val FIX_SINGLE_DIGIT_SECOND_1: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}):(\\d{1}\\D+)", "\\1:0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1}\D+)", raw"\1:0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ val FIX_SINGLE_DIGIT_SECOND_2: RegexReplace = - RegexReplace("(\\A\\d+-\\d{2}-\\d{2}[ T]\\d{2}:\\d{2}):(\\d{1})\\Z", "\\1:0\\2") + RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})\Z", raw"\1:0\2") /** Convert dates to standard format */ val FIX_DATES = Seq( From 2e6fe5345851eab5de1f143d041364fb44e022e5 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Jul 2021 17:20:29 -0600 Subject: [PATCH 5/8] Combine FIX_SINGLE_DIGIT_DAY patterns Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 21 +++++++------------ 
.../spark/rapids/ParseDateTimeSuite.scala | 6 +++--- 2 files changed, 11 insertions(+), 16 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 0af87547397..d825fa108ad 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -499,30 +499,26 @@ object GpuToTimestamp extends Arm { RegexReplace(raw"(\A\d+)-(\d{1}-)", raw"\1-0\2") /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ - val FIX_SINGLE_DIGIT_DAY_1: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2})-(\d{1}[\D\s]+)", raw"\1-0\2") + val FIX_SINGLE_DIGIT_DAY: RegexReplace = + RegexReplace(raw"(\A\d+-\d{2})-(\d{1})([\D\s]|\Z)", raw"\1-0\2\3") - /** Regex rule to replace "yyyy-mm-d" with "yyyy-mm-dd" */ - val FIX_SINGLE_DIGIT_DAY_2: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2})-(\d{1})\Z", raw"\1-0\2") - - /** Regex rule to replace "yyyy-mm-dd h-" with "yyyy-mm-dd hh-" */ + /** Regex rule to replace "yyyy-mm-dd h:" with "yyyy-mm-dd hh:" */ val FIX_SINGLE_DIGIT_HOUR_1: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2}) (\d{1}:)", raw"\1 0\2") - /** Regex rule to replace "yyyy-mm-ddTh-" with "yyyy-mm-ddThh-" */ + /** Regex rule to replace "yyyy-mm-ddTh:" with "yyyy-mm-ddThh:" */ val FIX_SINGLE_DIGIT_HOUR_2: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2})T(\d{1}:)", raw"\1T0\2") - /** Regex rule to replace "yyyy-mm-dd[ T]hh-m-" with "yyyy-mm-dd[ T]hh-mm-" */ + /** Regex rule to replace "yyyy-mm-dd[ T]hh:m:" with "yyyy-mm-dd[ T]hh:mm:" */ val FIX_SINGLE_DIGIT_MINUTE: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}):(\d{1}:)", raw"\1:0\2") - /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ + /** Regex rule to replace "yyyy-mm-dd[ T]hh:mm:s" with "yyyy-mm-dd[ T]hh:mm:ss" */ val FIX_SINGLE_DIGIT_SECOND_1: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1}\D+)", raw"\1:0\2") - /** Regex rule to replace "yyyy-mm-dd[ T]hh-mm-s" with "yyyy-mm-dd[ T]hh-mm-ss" */ + /** Regex rule to replace "yyyy-mm-dd[ T]hh:mm:s" with "yyyy-mm-dd[ T]hh:mm:ss" */ val FIX_SINGLE_DIGIT_SECOND_2: RegexReplace = RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})\Z", raw"\1:0\2") @@ -530,8 +526,7 @@ object GpuToTimestamp extends Arm { val FIX_DATES = Seq( REMOVE_WHITESPACE_FROM_MONTH_DAY, FIX_SINGLE_DIGIT_MONTH, - FIX_SINGLE_DIGIT_DAY_1, - FIX_SINGLE_DIGIT_DAY_2) + FIX_SINGLE_DIGIT_DAY) /** Convert timestamps to standard format */ val FIX_TIMESTAMPS = Seq( diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index 06ab169748f..447b46b2931 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.functions.{col, to_date, to_timestamp, unix_timestamp} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY_1, FIX_SINGLE_DIGIT_DAY_2, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND_1, FIX_SINGLE_DIGIT_SECOND_2, FIX_TIMESTAMPS, 
REMOVE_WHITESPACE_FROM_MONTH_DAY} +import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND_1, FIX_SINGLE_DIGIT_SECOND_2, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} import org.apache.spark.sql.rapids.RegexReplace class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterEach { @@ -251,14 +251,14 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE test("Regex: Fix single digit day followed by non digit char") { // single digit day followed by non digit - testRegex(FIX_SINGLE_DIGIT_DAY_1, + testRegex(FIX_SINGLE_DIGIT_DAY, Seq("1111-02-3 ", "1111-02-3:", "2000-03-192", "2000-07-7\n9\t8568:\n", null), Seq("1111-02-03 ", "1111-02-03:", "2000-03-192", "2000-07-07\n9\t8568:\n", null)) } test("Regex: Fix single digit day at end of string") { // single digit day at end of string - testRegex(FIX_SINGLE_DIGIT_DAY_2, + testRegex(FIX_SINGLE_DIGIT_DAY, Seq("1-02-3", "1111-02-3", "1111-02-03", "2000-03-192", null), Seq("1-02-03", "1111-02-03", "1111-02-03", "2000-03-192", null)) } From 36ce995a6e27b31d863f6b8302b0685e2c8b1ff6 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Jul 2021 17:26:02 -0600 Subject: [PATCH 6/8] Combine patterns for matching seconds and fix some incorrect comments Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 11 +++------- .../spark/rapids/ParseDateTimeSuite.scala | 22 +++++++++---------- 2 files changed, 14 insertions(+), 19 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index d825fa108ad..269db7bac6b 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -515,12 +515,8 @@ object GpuToTimestamp extends Arm { RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}):(\d{1}:)", raw"\1:0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh:mm:s" with "yyyy-mm-dd[ T]hh:mm:ss" */ - val FIX_SINGLE_DIGIT_SECOND_1: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1}\D+)", raw"\1:0\2") - - /** Regex rule to replace "yyyy-mm-dd[ T]hh:mm:s" with "yyyy-mm-dd[ T]hh:mm:ss" */ - val FIX_SINGLE_DIGIT_SECOND_2: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})\Z", raw"\1:0\2") + val FIX_SINGLE_DIGIT_SECOND: RegexReplace = + RegexReplace(raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})([\D\s]|\Z)", raw"\1:0\2\3") /** Convert dates to standard format */ val FIX_DATES = Seq( @@ -533,8 +529,7 @@ object GpuToTimestamp extends Arm { FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, - FIX_SINGLE_DIGIT_SECOND_1, - FIX_SINGLE_DIGIT_SECOND_2 + FIX_SINGLE_DIGIT_SECOND ) def daysScalarSeconds(name: String): Scalar = { diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index 447b46b2931..ba39c2018c9 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.functions.{col, to_date, to_timestamp, unix_timestamp} 
import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND_1, FIX_SINGLE_DIGIT_SECOND_2, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} +import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} import org.apache.spark.sql.rapids.RegexReplace class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterEach { @@ -264,36 +264,36 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE } test("Regex: Fix single digit hour 1") { - // single digit day at end of string + // single digit hour with space separating date and time testRegex(FIX_SINGLE_DIGIT_HOUR_1, Seq("2001-12-31 1:2:3", "2001-12-31 1:22:33", null), Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null)) } test("Regex: Fix single digit hour 2") { - // single digit day at end of string + // single digit hour with 'T' separating date and time testRegex(FIX_SINGLE_DIGIT_HOUR_2, Seq("2001-12-31T1:2:3", "2001-12-31T1:22:33", null), Seq("2001-12-31T01:2:3", "2001-12-31T01:22:33", null)) } test("Regex: Fix single digit minute") { - // single digit day at end of string + // single digit minute at end of string testRegex(FIX_SINGLE_DIGIT_MINUTE, Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null), Seq("2001-12-31 01:02:3", "2001-12-31 01:22:33", null)) } - test("Regex: Fix single digit second 1") { - // single digit day at end of string - testRegex(FIX_SINGLE_DIGIT_SECOND_1, - Seq("2001-12-31 01:02:3:", "2001-12-31 01:22:33:", null), - Seq("2001-12-31 01:02:03:", "2001-12-31 01:22:33:", null)) + test("Regex: Fix single digit second followed by non digit") { + // single digit second followed by non digit + testRegex(FIX_SINGLE_DIGIT_SECOND, + Seq("2001-12-31 01:02:3:", "2001-12-31 01:22:33:", "2001-12-31 01:02:3 ", null), + Seq("2001-12-31 01:02:03:", "2001-12-31 01:22:33:", "2001-12-31 01:02:03 ", null)) } - test("Regex: Fix single digit second 2") { + test("Regex: Fix single digit second at end of string") { // single digit day at end of string - testRegex(FIX_SINGLE_DIGIT_SECOND_2, + testRegex(FIX_SINGLE_DIGIT_SECOND, Seq("2001-12-31 01:02:3", "2001-12-31 01:22:33", null), Seq("2001-12-31 01:02:03", "2001-12-31 01:22:33", null)) } From aac06e3f469241e9eb9224c8b60108efd99f5966 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Thu, 8 Jul 2021 17:57:37 -0600 Subject: [PATCH 7/8] combine two more rules Signed-off-by: Andy Grove --- .../spark/sql/rapids/datetimeExpressions.scala | 13 ++++--------- .../nvidia/spark/rapids/ParseDateTimeSuite.scala | 11 ++++++----- 2 files changed, 10 insertions(+), 14 deletions(-) diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala index 269db7bac6b..4ceaaa9beb4 100644 --- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala +++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala @@ -502,13 +502,9 @@ object GpuToTimestamp extends Arm { val FIX_SINGLE_DIGIT_DAY: RegexReplace = RegexReplace(raw"(\A\d+-\d{2})-(\d{1})([\D\s]|\Z)", raw"\1-0\2\3") - /** Regex rule to replace "yyyy-mm-dd h:" with 
"yyyy-mm-dd hh:" */ - val FIX_SINGLE_DIGIT_HOUR_1: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2}-\d{2}) (\d{1}:)", raw"\1 0\2") - - /** Regex rule to replace "yyyy-mm-ddTh:" with "yyyy-mm-ddThh:" */ - val FIX_SINGLE_DIGIT_HOUR_2: RegexReplace = - RegexReplace(raw"(\A\d+-\d{2}-\d{2})T(\d{1}:)", raw"\1T0\2") + /** Regex rule to replace "yyyy-mm-dd[ T]h:" with "yyyy-mm-dd hh:" */ + val FIX_SINGLE_DIGIT_HOUR: RegexReplace = + RegexReplace(raw"(\A\d+-\d{2}-\d{2})[ T](\d{1}:)", raw"\1 0\2") /** Regex rule to replace "yyyy-mm-dd[ T]hh:m:" with "yyyy-mm-dd[ T]hh:mm:" */ val FIX_SINGLE_DIGIT_MINUTE: RegexReplace = @@ -526,8 +522,7 @@ object GpuToTimestamp extends Arm { /** Convert timestamps to standard format */ val FIX_TIMESTAMPS = Seq( - FIX_SINGLE_DIGIT_HOUR_1, - FIX_SINGLE_DIGIT_HOUR_2, + FIX_SINGLE_DIGIT_HOUR, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_SECOND ) diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala index ba39c2018c9..45ecbd8af1b 100644 --- a/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala +++ b/tests/src/test/scala/com/nvidia/spark/rapids/ParseDateTimeSuite.scala @@ -26,7 +26,7 @@ import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.functions.{col, to_date, to_timestamp, unix_timestamp} import org.apache.spark.sql.internal.SQLConf -import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR_1, FIX_SINGLE_DIGIT_HOUR_2, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} +import org.apache.spark.sql.rapids.GpuToTimestamp.{FIX_DATES, FIX_SINGLE_DIGIT_DAY, FIX_SINGLE_DIGIT_HOUR, FIX_SINGLE_DIGIT_MINUTE, FIX_SINGLE_DIGIT_MONTH, FIX_SINGLE_DIGIT_SECOND, FIX_TIMESTAMPS, REMOVE_WHITESPACE_FROM_MONTH_DAY} import org.apache.spark.sql.rapids.RegexReplace class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterEach { @@ -265,16 +265,17 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE test("Regex: Fix single digit hour 1") { // single digit hour with space separating date and time - testRegex(FIX_SINGLE_DIGIT_HOUR_1, + testRegex(FIX_SINGLE_DIGIT_HOUR, Seq("2001-12-31 1:2:3", "2001-12-31 1:22:33", null), Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null)) } test("Regex: Fix single digit hour 2") { // single digit hour with 'T' separating date and time - testRegex(FIX_SINGLE_DIGIT_HOUR_2, + // note that the T gets replaced with whitespace in this case + testRegex(FIX_SINGLE_DIGIT_HOUR, Seq("2001-12-31T1:2:3", "2001-12-31T1:22:33", null), - Seq("2001-12-31T01:2:3", "2001-12-31T01:22:33", null)) + Seq("2001-12-31 01:2:3", "2001-12-31 01:22:33", null)) } test("Regex: Fix single digit minute") { @@ -338,7 +339,7 @@ class ParseDateTimeSuite extends SparkQueryCompareTestSuite with BeforeAndAfterE ("1999-2-3:", "1999-02-03:"), ("1999-2-3", "1999-02-03"), ("1999-2-3 1:2:3.4", "1999-02-03 01:02:03.4"), - ("1999-2-3T1:2:3.4", "1999-02-03T01:02:03.4") + ("1999-2-3T1:2:3.4", "1999-02-03 01:02:03.4") ) val values = testPairs.map(_._1) val expected = testPairs.map(_._2) From 41208b4d1b8054590f89320b25c2749a38466776 Mon Sep 17 00:00:00 2001 From: Andy Grove Date: Fri, 9 Jul 2021 08:56:54 -0600 Subject: [PATCH 8/8] Change LEGACY_COMPATIBLE_FORMATS to Map Signed-off-by: Andy Grove --- .../sql/rapids/datetimeExpressions.scala | 23 
 1 file changed, 11 insertions(+), 12 deletions(-)

diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
index 4ceaaa9beb4..0a76b50e8de 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/datetimeExpressions.scala
@@ -405,7 +405,7 @@ abstract class UnixTimeExprMeta[A <: BinaryExpression with TimeZoneAwareExpressi
             strfFormat = DateUtils.toStrf(sparkFormat,
               expr.left.dataType == DataTypes.StringType)
             // format parsed ok but we have no 100% compatible formats in LEGACY mode
-            if (GpuToTimestamp.LEGACY_COMPATIBLE_FORMATS.exists(_.format == sparkFormat)) {
+            if (GpuToTimestamp.LEGACY_COMPATIBLE_FORMATS.contains(sparkFormat)) {
               // LEGACY support has a number of issues that mean we cannot guarantee
               // compatibility with CPU
               // - we can only support 4 digit years but Spark supports a wider range
@@ -475,18 +475,18 @@ object GpuToTimestamp extends Arm {
 
   // We are compatible with Spark for these formats when the timeParserPolicy is LEGACY. It
   // is possible that other formats may be supported but these are the only ones that we have
   // tests for.
-  val LEGACY_COMPATIBLE_FORMATS = Seq(
-    LegacyParseFormat("yyyy-MM-dd", '-', isTimestamp = false,
+  val LEGACY_COMPATIBLE_FORMATS = Map(
+    "yyyy-MM-dd" -> LegacyParseFormat('-', isTimestamp = false,
       raw"\A\d{4}-\d{2}-\d{2}(\D|\s|\Z)"),
-    LegacyParseFormat("yyyy/MM/dd", '/', isTimestamp = false,
+    "yyyy/MM/dd" -> LegacyParseFormat('/', isTimestamp = false,
       raw"\A\d{4}/\d{2}/\d{2}(\D|\s|\Z)"),
-    LegacyParseFormat("dd-MM-yyyy", '-', isTimestamp = false,
+    "dd-MM-yyyy" -> LegacyParseFormat('-', isTimestamp = false,
       raw"\A\d{2}-\d{2}-\d{4}(\D|\s|\Z)"),
-    LegacyParseFormat("dd/MM/yyyy", '/', isTimestamp = false,
+    "dd/MM/yyyy" -> LegacyParseFormat('/', isTimestamp = false,
       raw"\A\d{2}/\d{2}/\d{4}(\D|\s|\Z)"),
-    LegacyParseFormat("yyyy-MM-dd HH:mm:ss", '-', isTimestamp = true,
+    "yyyy-MM-dd HH:mm:ss" -> LegacyParseFormat('-', isTimestamp = true,
       raw"\A\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)"),
-    LegacyParseFormat("yyyy/MM/dd HH:mm:ss", '/', isTimestamp = true,
+    "yyyy/MM/dd HH:mm:ss" -> LegacyParseFormat('/', isTimestamp = true,
       raw"\A\d{4}/\d{2}/\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)")
   )
@@ -627,8 +627,8 @@ object GpuToTimestamp extends Arm {
       dtype: DType,
       asTimestamp: (ColumnVector, String) => ColumnVector): ColumnVector = {
 
-    val format = LEGACY_COMPATIBLE_FORMATS.find(_.format == sparkFormat)
-      .getOrElse(throw new IllegalStateException(s"Unsupported format $sparkFormat"))
+    val format = LEGACY_COMPATIBLE_FORMATS.getOrElse(sparkFormat,
+      throw new IllegalStateException(s"Unsupported format $sparkFormat"))
 
     // optimization to apply only the necessary rules depending on whether we are
     // parsing to a date or timestamp
@@ -704,8 +704,7 @@ object GpuToTimestamp extends Arm {
 }
 
-case class LegacyParseFormat(format: String, separator: Char, isTimestamp: Boolean,
-    validRegex: String)
+case class LegacyParseFormat(separator: Char, isTimestamp: Boolean, validRegex: String)
 
 case class RegexReplace(search: String, replace: String)
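
The rule consolidations in patches 6 and 7 are easier to follow outside the diffs, so here is a minimal JVM-side sketch of the combined FIX_SINGLE_DIGIT_HOUR and FIX_SINGLE_DIGIT_SECOND behaviour. This is an illustration only, not the plugin's execution path: the plugin stores these patterns as RegexReplace rules and applies them to string columns on the GPU, whereas this sketch uses java.util.regex, so the \1-style backreferences from the patch are rewritten in $1 form. The object and method names are invented for the example.

object SingleDigitPaddingSketch {
  // Pad a single-digit hour and normalize the ' ' or 'T' separator to a space,
  // mirroring the combined FIX_SINGLE_DIGIT_HOUR rule from patch 7.
  private val hourRule =
    (raw"(\A\d+-\d{2}-\d{2})[ T](\d{1}:)".r, "$1 0$2")

  // Pad a single-digit second whether it is followed by a non-digit or ends the
  // string, mirroring the combined FIX_SINGLE_DIGIT_SECOND rule from patch 6.
  private val secondRule =
    (raw"(\A\d+-\d{2}-\d{2}[ T]\d{2}:\d{2}):(\d{1})([\D\s]|\Z)".r, "$1:0$2$3")

  // Apply the rules in order, as a stand-in for the FIX_TIMESTAMPS sequence.
  def normalize(s: String): String =
    Seq(hourRule, secondRule).foldLeft(s) {
      case (acc, (re, repl)) => re.replaceAllIn(acc, repl)
    }

  def main(args: Array[String]): Unit = {
    // prints "2001-12-31 01:22:03": the hour is padded, the 'T' separator
    // becomes a space, and the trailing single-digit second is padded
    println(normalize("2001-12-31T1:22:3"))
  }
}

Running normalize("2001-12-31T1:22:3") reproduces the behaviour the updated tests expect from the two combined rules: one pass pads the hour and drops the 'T', and a second pass pads the final second.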
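The final patch's move from a Seq of LegacyParseFormat to a Map keyed by the Spark format string makes the support check a contains() call and the lookup a getOrElse. The stand-alone sketch below, again for illustration only and with invented names, shows how the stored validRegex could be used on the JVM to decide whether an input string is well-formed for a given format; the plugin itself performs this matching on GPU columns.

object LegacyFormatLookupSketch {
  // Copied shape of the plugin's case class, redeclared here so the sketch is self-contained.
  final case class LegacyParseFormat(separator: Char, isTimestamp: Boolean, validRegex: String)

  // Two of the six entries from the patch, keyed by Spark format string.
  val LEGACY_COMPATIBLE_FORMATS: Map[String, LegacyParseFormat] = Map(
    "yyyy-MM-dd" -> LegacyParseFormat('-', isTimestamp = false,
      raw"\A\d{4}-\d{2}-\d{2}(\D|\s|\Z)"),
    "yyyy-MM-dd HH:mm:ss" -> LegacyParseFormat('-', isTimestamp = true,
      raw"\A\d{4}-\d{2}-\d{2}[ T]\d{2}:\d{2}:\d{2}(\D|\s|\Z)"))

  // Map lookup replaces the old Seq.find scan; unsupported formats fail fast,
  // matching the getOrElse + IllegalStateException pattern in the patch.
  def isValid(sparkFormat: String, value: String): Boolean =
    LEGACY_COMPATIBLE_FORMATS.get(sparkFormat) match {
      case Some(fmt) => value != null && fmt.validRegex.r.findFirstIn(value).isDefined
      case None => throw new IllegalStateException(s"Unsupported format $sparkFormat")
    }

  def main(args: Array[String]): Unit = {
    println(isValid("yyyy-MM-dd", "1999-12-31")) // true
    println(isValid("yyyy-MM-dd", "1999-12-3"))  // false: the day is not zero-padded
  }
}

In this sketch an input like "1999-12-3" does not match the validRegex because the day has not been zero-padded; the FIX_DATES rules shown earlier in the series are what produce the padded form.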