NVIDIA · revans2 · Mar 10, 2022 · Feb 18, 2022 · Feb 18, 2022 · Feb 18, 2022
diff --git a/docs/compatibility.md b/docs/compatibility.md
@@ -283,7 +283,6 @@ will produce a different result compared to the plugin.
 Due to inconsistencies between how CSV data is parsed CSV parsing is off by default.
 Each data type can be enabled or disabled independently using the following configs.
 
- * [spark.rapids.sql.csv.read.date.enabled](configs.md#sql.csv.read.date.enabled)
  * [spark.rapids.sql.csvTimestamps.enabled](configs.md#sql.csvTimestamps.enabled)
 
 If you know that your particular data type will be parsed correctly enough, you may enable each
@@ -323,17 +322,14 @@ Only a limited set of formats are supported when parsing dates.
 * `"MM/yyyy"`
 * `"MM-dd-yyyy"`
 * `"MM/dd/yyyy"`
+* `"dd-MM-yyyy"`
+* `"dd/MM/yyyy"`
 
 The reality is that all of these formats are supported at the same time. The plugin will only
 disable itself if you set a format that it does not support.
 
 As a workaround you can parse the column as a timestamp and then cast it to a date.
 
-Invalid dates in Spark, values that have the correct format, but the numbers produce invalid dates,
-can result in an exception by default, and how they are parsed can be controlled through a config.
-The RAPIDS Accelerator does not support any of this and will produce an incorrect date. Typically,
-one that overflowed.
-
 ### CSV Timestamps
 The CSV parser does not support time zones.  It will ignore any trailing time zone information,
 despite the format asking for a `XXX` or `[XXX]`. As such it is off by default and you can enable it

diff --git a/docs/configs.md b/docs/configs.md
@@ -67,7 +67,6 @@ Name | Description | Default Value
 <a name="sql.castStringToFloat.enabled"></a>spark.rapids.sql.castStringToFloat.enabled|When set to true, enables casting from strings to float types (float, double) on the GPU. Currently hex values aren't supported on the GPU. Also note that casting from string to float types on the GPU returns incorrect results when the string represents any number "1.7976931348623158E308" <= x < "1.7976931348623159E308" and "-1.7976931348623158E308" >= x > "-1.7976931348623159E308" in both these cases the GPU returns Double.MaxValue while CPU returns "+Infinity" and "-Infinity" respectively|false
 <a name="sql.castStringToTimestamp.enabled"></a>spark.rapids.sql.castStringToTimestamp.enabled|When set to true, casting from string to timestamp is supported on the GPU. The GPU only supports a subset of formats when casting strings to timestamps. Refer to the CAST documentation for more details.|false
 <a name="sql.concurrentGpuTasks"></a>spark.rapids.sql.concurrentGpuTasks|Set the number of tasks that can execute concurrently per GPU. Tasks may temporarily block when the number of concurrent tasks in the executor exceeds this amount. Allowing too many concurrent tasks on the same GPU may lead to GPU out of memory errors.|1
-<a name="sql.csv.read.date.enabled"></a>spark.rapids.sql.csv.read.date.enabled|Parsing invalid CSV dates produces different results from Spark|false
 <a name="sql.csvTimestamps.enabled"></a>spark.rapids.sql.csvTimestamps.enabled|When set to true, enables the CSV parser to read timestamps. The default output format for Spark includes a timezone at the end. Anything except the UTC timezone is not supported. Timestamps after 2038 and before 1902 are also not supported.|false
 <a name="sql.decimalOverflowGuarantees"></a>spark.rapids.sql.decimalOverflowGuarantees|FOR TESTING ONLY. DO NOT USE IN PRODUCTION. Please see the decimal section of the compatibility documents for more information on this config.|true
 <a name="sql.enabled"></a>spark.rapids.sql.enabled|Enable (true) or disable (false) sql operations on the GPU|true

diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py
@@ -14,7 +14,8 @@
 
 import pytest
 
-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_fallback_write, assert_cpu_and_gpu_are_equal_collect_with_capture
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_write, \
+    assert_cpu_and_gpu_are_equal_collect_with_capture, assert_gpu_fallback_collect
 from conftest import get_non_gpu_allowed
 from datetime import datetime, timezone
 from data_gen import *
@@ -167,15 +168,7 @@
     StructField('ignored_b', StringType())])
 
 _enable_all_types_conf = {'spark.rapids.sql.csvTimestamps.enabled': 'true',
-        'spark.rapids.sql.csv.read.bool.enabled': 'true',
-        'spark.rapids.sql.csv.read.date.enabled': 'true',
-        'spark.rapids.sql.csv.read.byte.enabled': 'true',
-        'spark.rapids.sql.csv.read.short.enabled': 'true',
-        'spark.rapids.sql.csv.read.integer.enabled': 'true',
-        'spark.rapids.sql.csv.read.long.enabled': 'true',
-        'spark.rapids.sql.csv.read.float.enabled': 'true',
-        'spark.rapids.sql.csv.read.double.enabled': 'true',
-        'spark.sql.legacy.timeParserPolicy': 'Corrected'}
+        'spark.sql.legacy.timeParserPolicy': 'CORRECTED'}
 
 def read_csv_df(data_path, schema, options = {}):
     def read_impl(spark):
@@ -200,8 +193,8 @@ def read_impl(spark):
 @pytest.mark.parametrize('name,schema,options', [
     ('Acquisition_2007Q3.txt', _acq_schema, {'sep': '|'}),
     ('Performance_2007Q3.txt_0', _perf_schema, {'sep': '|'}),
-    pytest.param('ts.csv', _date_schema, {}),
-    pytest.param('date.csv', _date_schema, {}, marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/1111')),
+    ('ts.csv', _date_schema, {}),
+    ('date.csv', _date_schema, {}),
     ('ts.csv', _ts_schema, {}),
     ('str.csv', _bad_str_schema, {'header': 'true'}),
     ('str.csv', _good_str_schema, {'header': 'true'}),
@@ -316,22 +309,74 @@ def test_csv_fallback(spark_tmp_path, read_func, disable_conf):
         'MM-yyyy', 'MM/yyyy', 'MM-dd-yyyy', 'MM/dd/yyyy']
 @pytest.mark.parametrize('date_format', csv_supported_date_formats, ids=idfn)
 @pytest.mark.parametrize('v1_enabled_list', ["", "csv"])
-def test_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_list):
+@pytest.mark.parametrize('ansi_enabled', ["true", "false"])
+@pytest.mark.parametrize('time_parser_policy', [
+    pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('BatchScanExec,FileSourceScanExec')),
+    'CORRECTED',
+    'EXCEPTION'
+])
+def test_date_formats_round_trip(spark_tmp_path, date_format, v1_enabled_list, ansi_enabled, time_parser_policy):
     gen = StructGen([('a', DateGen())], nullable=False)
     data_path = spark_tmp_path + '/CSV_DATA'
     schema = gen.data_type
-    updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.sources.useV1SourceList': v1_enabled_list})
+    updated_conf = copy_and_update(_enable_all_types_conf,
+       {'spark.sql.sources.useV1SourceList': v1_enabled_list,
+        'spark.sql.ansi.enabled': ansi_enabled,
+        'spark.rapids.sql.incompatibleDateFormats.enabled': True,
+        'spark.sql.legacy.timeParserPolicy': time_parser_policy})
     with_cpu_session(
             lambda spark : gen_df(spark, gen).write\
                     .option('dateFormat', date_format)\
                     .csv(data_path))
-    assert_gpu_and_cpu_are_equal_collect(
+    if time_parser_policy == 'LEGACY':
+        expected_class = 'FileSourceScanExec'
+        if v1_enabled_list == '':
+            expected_class = 'BatchScanExec'
+        assert_gpu_fallback_collect(
+            lambda spark : spark.read \
+                .schema(schema) \
+                .option('dateFormat', date_format) \
+                .csv(data_path),
+            expected_class,
+            conf=updated_conf)
+    else:
+        assert_gpu_and_cpu_are_equal_collect(
             lambda spark : spark.read\
                     .schema(schema)\
                     .option('dateFormat', date_format)\
                     .csv(data_path),
             conf=updated_conf)
 
+@pytest.mark.parametrize('filename', ["date.csv"])
+@pytest.mark.parametrize('v1_enabled_list', ["", "csv"])
+@pytest.mark.parametrize('ansi_enabled', ["true", "false"])
+@pytest.mark.parametrize('time_parser_policy', [
+    pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('BatchScanExec,FileSourceScanExec')),
+    'CORRECTED',
+    'EXCEPTION'
+])
+def test_read_valid_and_invalid_dates(std_input_path, filename, v1_enabled_list, ansi_enabled, time_parser_policy):
+    data_path = std_input_path + '/' + filename
+    updated_conf = copy_and_update(_enable_all_types_conf,
+                                   {'spark.sql.sources.useV1SourceList': v1_enabled_list,
+                                    'spark.sql.ansi.enabled': ansi_enabled,
+                                    'spark.rapids.sql.incompatibleDateFormats.enabled': True,
+                                    'spark.sql.legacy.timeParserPolicy': time_parser_policy})
+    if time_parser_policy == 'EXCEPTION':
+        assert_gpu_and_cpu_error(
+            lambda spark : spark.read \
+                .schema(_date_schema) \
+                .csv(data_path)
+                .collect(),
+            conf=updated_conf,
+            error_message='DateTimeException')
+    else:
+        assert_gpu_and_cpu_are_equal_collect(
+            lambda spark : spark.read \
+                .schema(_date_schema) \
+                .csv(data_path),
+            conf=updated_conf)
+
 csv_supported_ts_parts = ['', # Just the date
         "'T'HH:mm:ss.SSSXXX",
         "'T'HH:mm:ss[.SSS][XXX]",

diff --git a/integration_tests/src/main/python/json_test.py b/integration_tests/src/main/python/json_test.py
@@ -14,7 +14,7 @@
 
 import pytest
 
-from asserts import assert_gpu_and_cpu_are_equal_collect
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect
 from data_gen import *
 from conftest import is_databricks_runtime
 from marks import approximate_float, allow_non_gpu, ignore_order
@@ -66,6 +66,9 @@
 _decimal_10_3_schema = StructType([
     StructField('number', DecimalType(10, 3))])
 
+_date_schema = StructType([
+    StructField('number', DateType())])
+
 _string_schema = StructType([
     StructField('a', StringType())])
 
@@ -204,21 +207,81 @@ def test_json_ts_formats_round_trip(spark_tmp_path, date_format, ts_part, v1_ena
     'floats_invalid.json',
     pytest.param('floats_edge_cases.json', marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/4647')),
     'decimals.json',
+    'dates.json',
+    'dates_invalid.json',
 ])
-@pytest.mark.parametrize('schema', [_bool_schema, _byte_schema, _short_schema, _int_schema, _long_schema, _float_schema, _double_schema, _decimal_10_2_schema, _decimal_10_3_schema])
+@pytest.mark.parametrize('schema', [_bool_schema, _byte_schema, _short_schema, _int_schema, _long_schema, _float_schema, _double_schema, _decimal_10_2_schema, _decimal_10_3_schema, _date_schema])
 @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
 @pytest.mark.parametrize('allow_non_numeric_numbers', ["true", "false"])
 @pytest.mark.parametrize('allow_numeric_leading_zeros', ["true"])
 @pytest.mark.parametrize('ansi_enabled', ["true", "false"])
 def test_basic_json_read(std_input_path, filename, schema, read_func, allow_non_numeric_numbers, allow_numeric_leading_zeros, ansi_enabled):
-    updated_conf = copy_and_update(_enable_all_types_conf, {'spark.sql.ansi.enabled': ansi_enabled})
+    updated_conf = copy_and_update(_enable_all_types_conf,
+        {'spark.sql.ansi.enabled': ansi_enabled,
+         'spark.sql.legacy.timeParserPolicy': 'CORRECTED'})
     assert_gpu_and_cpu_are_equal_collect(
         read_func(std_input_path + '/' + filename,
         schema,
         { "allowNonNumericNumbers": allow_non_numeric_numbers,
           "allowNumericLeadingZeros": allow_numeric_leading_zeros}),
         conf=updated_conf)
 
+@approximate_float
+@pytest.mark.parametrize('filename', [
+    'dates.json',
+])
+@pytest.mark.parametrize('schema', [_date_schema])
+@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
+@pytest.mark.parametrize('ansi_enabled', ["true", "false"])
+@pytest.mark.parametrize('time_parser_policy', [
+    pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('FileSourceScanExec')),
+    'CORRECTED',
+    'EXCEPTION'
+])
+def test_json_read_valid_dates(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy):
+    updated_conf = copy_and_update(_enable_all_types_conf,
+                                   {'spark.sql.ansi.enabled': ansi_enabled,
+                                    'spark.sql.legacy.timeParserPolicy': time_parser_policy,
+                                    'spark.rapids.sql.incompatibleDateFormats.enabled': True})
+    f = read_func(std_input_path + '/' + filename, schema, {})
+    if time_parser_policy == 'LEGACY' and ansi_enabled == 'true':
+        assert_gpu_fallback_collect(
+            f,
+            'FileSourceScanExec',
+            conf=updated_conf)
+    else:
+        assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf)
+
+@approximate_float
+@pytest.mark.parametrize('filename', [
+    'dates_invalid.json',
+])
+@pytest.mark.parametrize('schema', [_date_schema])
+@pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
+@pytest.mark.parametrize('ansi_enabled', ["true", "false"])
+@pytest.mark.parametrize('time_parser_policy', [
+    pytest.param('LEGACY', marks=pytest.mark.allow_non_gpu('FileSourceScanExec')),
+    'CORRECTED',
+    'EXCEPTION'
+])
+def test_json_read_invalid_dates(std_input_path, filename, schema, read_func, ansi_enabled, time_parser_policy):
+    updated_conf = copy_and_update(_enable_all_types_conf,
+                                   {'spark.sql.ansi.enabled': ansi_enabled,
+                                    'spark.sql.legacy.timeParserPolicy': time_parser_policy })
+    f = read_func(std_input_path + '/' + filename, schema, {})
+    if time_parser_policy == 'EXCEPTION':
+        assert_gpu_and_cpu_error(
+            df_fun=lambda spark: f(spark).collect(),
+            conf=updated_conf,
+            error_message='DateTimeException')
+    elif time_parser_policy == 'LEGACY' and ansi_enabled == 'true':
+        assert_gpu_fallback_collect(
+            f,
+            'FileSourceScanExec',
+            conf=updated_conf)
+    else:
+        assert_gpu_and_cpu_are_equal_collect(f, conf=updated_conf)
+
 @pytest.mark.parametrize('schema', [_string_schema])
 @pytest.mark.parametrize('read_func', [read_json_df, read_json_sql])
 @pytest.mark.parametrize('allow_unquoted_chars', ["true"])

diff --git a/integration_tests/src/test/resources/dates.json b/integration_tests/src/test/resources/dates.json
@@ -0,0 +1,3 @@
+{ "number": "2020-09-16" }
+{ "number": "1581-01-01" }
+{ "number": "1583-01-01" }
diff --git a/integration_tests/src/test/resources/dates_invalid.json b/integration_tests/src/test/resources/dates_invalid.json
@@ -0,0 +1,2 @@
+{ "number": "2020-09-32" }
+{ "number": "2020-50-16" }
diff --git a/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala b/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala
@@ -47,7 +47,6 @@ class MortgageSparkSuite extends FunSuite {
       .config("spark.rapids.sql.test.enabled", false)
       .config("spark.rapids.sql.incompatibleOps.enabled", true)
       .config("spark.rapids.sql.hasNans", false)
-      .config("spark.rapids.sql.csv.read.date.enabled", true)
     val rapidsShuffle = ShimLoader.getRapidsShuffleManagerClass
     val prop = System.getProperty("rapids.shuffle.manager.override", "false")
     if (prop.equalsIgnoreCase("true")) {

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/DateUtils.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/DateUtils.scala
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -25,6 +25,8 @@ import com.nvidia.spark.rapids.VersionUtils.isSpark320OrLater
 
 import org.apache.spark.sql.catalyst.util.DateTimeUtils
 import org.apache.spark.sql.catalyst.util.DateTimeUtils.localDateToDays
+import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.rapids.{GpuToTimestamp, LegacyTimeParserPolicy}
 
 /**
  * Class for helper functions for Date
@@ -211,4 +213,55 @@ object DateUtils {
   }
 
   case class TimestampFormatConversionException(reason: String) extends Exception
+
+  def tagAndGetCudfFormat(
+      meta: RapidsMeta[_, _, _],
+      sparkFormat: String,
+      parseString: Boolean): String = {
+    var strfFormat = ""
+    if (GpuOverrides.getTimeParserPolicy == LegacyTimeParserPolicy) {
+      try {
+        // try and convert the format to cuDF format - this will throw an exception if
+        // the format contains unsupported characters or words
+        strfFormat = toStrf(sparkFormat, parseString)
+        // format parsed ok but we have no 100% compatible formats in LEGACY mode
+        if (GpuToTimestamp.LEGACY_COMPATIBLE_FORMATS.contains(sparkFormat)) {
+          // LEGACY support has a number of issues that mean we cannot guarantee
+          // compatibility with CPU
+          // - we can only support 4 digit years but Spark supports a wider range
+          // - we use a proleptic Gregorian calendar but Spark uses a hybrid Julian+Gregorian
+          //   calendar in LEGACY mode
+          if (SQLConf.get.ansiEnabled) {
+            meta.willNotWorkOnGpu("LEGACY format in ANSI mode is not supported on the GPU")
+          } else if (!meta.conf.incompatDateFormats) {
+            meta.willNotWorkOnGpu(s"LEGACY format '$sparkFormat' on the GPU is not guaranteed " +
+              s"to produce the same results as Spark on CPU. Set " +
+              s"${RapidsConf.INCOMPATIBLE_DATE_FORMATS.key}=true to force onto GPU.")
+          }
+        } else {
+          meta.willNotWorkOnGpu(s"LEGACY format '$sparkFormat' is not supported on the GPU.")
+        }
+      } catch {
+        case e: TimestampFormatConversionException =>
+          meta.willNotWorkOnGpu(s"Failed to convert ${e.reason} ${e.getMessage}")
+      }
+    } else {
+      try {
+        // try and convert the format to cuDF format - this will throw an exception if
+        // the format contains unsupported characters or words
+        strfFormat = toStrf(sparkFormat, parseString)
+        // format parsed ok, so it is either compatible (tested/certified) or incompatible
+        if (!GpuToTimestamp.CORRECTED_COMPATIBLE_FORMATS.contains(sparkFormat) &&
+          !meta.conf.incompatDateFormats) {
+          meta.willNotWorkOnGpu(s"CORRECTED format '$sparkFormat' on the GPU is not guaranteed " +
+            s"to produce the same results as Spark on CPU. Set " +
+            s"${RapidsConf.INCOMPATIBLE_DATE_FORMATS.key}=true to force onto GPU.")
+        }
+      } catch {
+        case e: TimestampFormatConversionException =>
+          meta.willNotWorkOnGpu(s"Failed to convert ${e.reason} ${e.getMessage}")
+      }
+    }
+    strfFormat
+  }
 }
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchScanExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuBatchScanExec.scala
@@ -39,6 +39,7 @@ import org.apache.spark.sql.execution.datasources.csv.CSVDataSource
 import org.apache.spark.sql.execution.datasources.v2._
 import org.apache.spark.sql.execution.datasources.v2.csv.CSVScan
 import org.apache.spark.sql.internal.SQLConf
+import org.apache.spark.sql.rapids.LegacyTimeParserPolicy
 import org.apache.spark.sql.types._
 import org.apache.spark.sql.util.CaseInsensitiveStringMap
 import org.apache.spark.sql.vectorized.ColumnarBatch
@@ -211,14 +212,14 @@ object GpuCSVScan {
     // parsedOptions.maxColumns was originally a performance optimization but is not used any more
 
     if (readSchema.map(_.dataType).contains(DateType)) {
-      if (!meta.conf.isCsvDateReadEnabled) {
-        meta.willNotWorkOnGpu("CSV reading is not 100% compatible when reading dates. " +
-            s"To enable it please set ${RapidsConf.ENABLE_READ_CSV_DATES} to true.")
+      if (GpuOverrides.getTimeParserPolicy == LegacyTimeParserPolicy) {
+        // Spark's CSV parser will parse the string "2020-50-16" to the date 2024/02/16 when
+        // timeParserPolicy is set to LEGACY mode, whereas we would reject it as an invalid
+        // date, so we fall back to CPU.
+        meta.willNotWorkOnGpu(s"GpuCSVScan does not support timeParserPolicy=LEGACY")
       }
       ShimLoader.getSparkShims.dateFormatInRead(parsedOptions).foreach { dateFormat =>
-        if (!supportedDateFormats.contains(dateFormat)) {
-          meta.willNotWorkOnGpu(s"the date format '${dateFormat}' is not supported'")
-        }
+        DateUtils.tagAndGetCudfFormat(meta, dateFormat, parseString = true)
       }
     }
 
@@ -412,4 +413,5 @@ class CSVPartitionReader(
     }
   }
 
+  override def dateFormat: String = parsedOptions.dateFormat
 }
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{ "number": "2020-09-32" }
		{ "number": "2020-50-16" }