NVIDIA · razajafri · Aug 25, 2021 · Aug 12, 2021 · Aug 16, 2021 · Aug 17, 2021
diff --git a/docs/supported_ops.md b/docs/supported_ops.md
@@ -582,9 +582,9 @@ Accelerator supports are described below.
 <td><b>NS</b></td>
 <td><b>NS</b></td>
 <td><b>NS</b></td>
-<td><em>PS<br/>Only supported for Parquet;<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested NULL, BINARY, CALENDAR, MAP, UDT</em></td>
-<td><b>NS</b></td>
-<td><em>PS<br/>Only supported for Parquet;<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested NULL, BINARY, CALENDAR, MAP, UDT</em></td>
+<td><em>PS<br/>Only supported for Parquet;<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested NULL, BINARY, CALENDAR, UDT</em></td>
+<td><em>PS<br/>Only supported for Parquet;<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested NULL, BINARY, CALENDAR, UDT</em></td>
+<td><em>PS<br/>Only supported for Parquet;<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested NULL, BINARY, CALENDAR, UDT</em></td>
 <td><b>NS</b></td>
 </tr>
 <tr>
@@ -15726,9 +15726,9 @@ dates or timestamps, or for a lack of type coercion support.
 <td> </td>
 <td><b>NS</b></td>
 <td> </td>
-<td><em>PS<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested BINARY, MAP, UDT</em></td>
-<td><b>NS</b></td>
-<td><em>PS<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested BINARY, MAP, UDT</em></td>
+<td><em>PS<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested BINARY, UDT</em></td>
+<td><em>PS<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested BINARY, UDT</em></td>
+<td><em>PS<br/>max nested DECIMAL precision of 18;<br/>UTC is only supported TZ for nested TIMESTAMP;<br/>missing nested BINARY, UDT</em></td>
 <td><b>NS</b></td>
 </tr>
 </table>
diff --git a/integration_tests/src/main/python/parquet_write_test.py b/integration_tests/src/main/python/parquet_write_test.py
@@ -20,6 +20,7 @@
 from marks import *
 from pyspark.sql.types import *
 from spark_session import with_cpu_session, with_gpu_session
+import pyspark.sql.functions as f
 import random
 
 # test with original parquet file reader, the multi-file parallel reader for cloud, and coalesce file reader for
@@ -35,19 +36,39 @@
               'spark.sql.legacy.parquet.int96RebaseModeInWrite': 'CORRECTED'}
 
 
+def limited_timestamp(nullable=True):
+    return TimestampGen(start=datetime(1677, 9, 22, tzinfo=timezone.utc), end=datetime(2262, 4, 11, tzinfo=timezone.utc),
+                        nullable=nullable)
+
 parquet_basic_gen =[byte_gen, short_gen, int_gen, long_gen, float_gen, double_gen,
                     string_gen, boolean_gen, date_gen,
                     # we are limiting TimestampGen to avoid overflowing the INT96 value
                     # see https://github.com/rapidsai/cudf/issues/8070
-                    TimestampGen(start=datetime(1677, 9, 22, tzinfo=timezone.utc), end=datetime(2262, 4, 11, tzinfo=timezone.utc))]
-parquet_struct_gen = [StructGen([['child'+str(ind), sub_gen] for ind, sub_gen in enumerate(parquet_basic_gen)]),
-                      StructGen([['child0', StructGen([[ 'child1', byte_gen]])]])]
-parquet_array_gen = [ArrayGen(sub_gen, max_length=10) for sub_gen in parquet_basic_gen + parquet_struct_gen] + \
-                    [ArrayGen(ArrayGen(sub_gen, max_length=10), max_length=10) for sub_gen in parquet_basic_gen + parquet_struct_gen]
-parquet_write_gens_list = [parquet_basic_gen + parquet_struct_gen + parquet_array_gen +
-                           [decimal_gen_default,
-                           decimal_gen_scale_precision, decimal_gen_same_scale_precision, decimal_gen_64bit]]
+                    limited_timestamp()]
+
+parquet_basic_map_gens = [MapGen(gen(nullable=False), gen()) for gen in
+                          [BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, DateGen,
+                           limited_timestamp]] + [simple_string_to_string_map_gen]
+
+parquet_struct_gen = [StructGen([['child' + str(ind), sub_gen] for ind, sub_gen in enumerate(parquet_basic_gen)]),
+                      StructGen([['child0', StructGen([['child1', byte_gen]])]]),
+                      StructGen([['child0', MapGen(StringGen(nullable=False), StringGen())], ['child1', IntegerGen()]])]
 
+parquet_array_gen = [ArrayGen(sub_gen, max_length=10) for sub_gen in parquet_basic_gen + parquet_struct_gen] + [
+    ArrayGen(ArrayGen(sub_gen, max_length=10), max_length=10) for sub_gen in parquet_basic_gen + parquet_struct_gen]
+
+parquet_map_gens_sample = parquet_basic_map_gens + [MapGen(StringGen(pattern='key_[0-9]', nullable=False),
+                                                           ArrayGen(string_gen), max_length=10),
+                                                    MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen,
+                                                           max_length=10),
+                                                    MapGen(StringGen(pattern='key_[0-9]', nullable=False),
+                                                           simple_string_to_string_map_gen)]
+
+parquet_map_gens = parquet_map_gens_sample + [
+    MapGen(StructGen([['child0', StringGen()], ['child1', StringGen()]], nullable=False), FloatGen()),
+    MapGen(StructGen([['child0', StringGen(nullable=True)]], nullable=False), StringGen())]
+parquet_write_gens_list = [
+    parquet_basic_gen + parquet_struct_gen + parquet_array_gen + parquet_decimal_gens + parquet_map_gens]
 parquet_ts_write_options = ['INT96', 'TIMESTAMP_MICROS', 'TIMESTAMP_MILLIS']
 
 @pytest.mark.parametrize('parquet_gens', parquet_write_gens_list, ids=idfn)
@@ -279,3 +300,22 @@ def test_buckets_write_fallback(spark_tmp_path, spark_tmp_table_factory):
             lambda spark, path: spark.read.parquet(path),
             data_path,
             'DataWritingCommandExec')
+
+# This test checks how the parquet writer behaves when a column has a validity mask but no nulls.
+# There is no straightforward way to do that besides creating a vector with nulls and then dropping the nulls:
+# cudf will create a vector with a null_mask even though we have just filtered them out.
+def test_write_map_nullable(spark_tmp_path):
+    data_path = spark_tmp_path + '/PARQUET_DATA'
+
+    def generate_map_with_empty_validity(spark, path):
+        gen_data = StructGen([['number', IntegerGen()], ['word', LongGen()]], nullable=False)
+        # rows with a null 'number' are dropped below, before 'number' is used as the map key
+        df = gen_df(spark, gen_data)
+        df_noNulls = df.filter("number is not null")
+        df_map = df_noNulls.withColumn("map", f.create_map(["number", "word"])).drop("number").drop("word")
+        df_map.coalesce(1).write.parquet(path)
+
+    assert_gpu_and_cpu_writes_are_equal_collect(
+            generate_map_with_empty_validity,
+            lambda spark, path: spark.read.parquet(path),
+            data_path)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -765,7 +765,7 @@ object GpuOverrides {
       cudfRead = (TypeSig.commonCudfTypes + TypeSig.DECIMAL_64 + TypeSig.STRUCT + TypeSig.ARRAY +
           TypeSig.MAP).nested(),
       cudfWrite = (TypeSig.commonCudfTypes + TypeSig.DECIMAL_64 + TypeSig.STRUCT +
-          TypeSig.ARRAY).nested(),
+          TypeSig.ARRAY + TypeSig.MAP).nested(),
       sparkSig = (TypeSig.atomics + TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP +
           TypeSig.UDT).nested())),
     (OrcFormatType, FileFormatChecks(
@@ -2987,9 +2987,10 @@ object GpuOverrides {
     exec[DataWritingCommandExec](
       "Writing data",
       ExecChecks((TypeSig.commonCudfTypes +
-        TypeSig.DECIMAL_64.withPsNote(TypeEnum.DECIMAL, "Only supported for Parquet") +
-        TypeSig.STRUCT.withPsNote(TypeEnum.STRUCT, "Only supported for Parquet") +
-        TypeSig.ARRAY.withPsNote(TypeEnum.ARRAY, "Only supported for Parquet")).nested(),
+          TypeSig.DECIMAL_64.withPsNote(TypeEnum.DECIMAL, "Only supported for Parquet") +
+          TypeSig.STRUCT.withPsNote(TypeEnum.STRUCT, "Only supported for Parquet") +
+          TypeSig.MAP.withPsNote(TypeEnum.MAP, "Only supported for Parquet") +
+          TypeSig.ARRAY.withPsNote(TypeEnum.ARRAY, "Only supported for Parquet")).nested(),
         TypeSig.all),
       (p, conf, parent, r) => new SparkPlanMeta[DataWritingCommandExec](p, conf, parent, r) {
         override val childDataWriteCmds: scala.Seq[DataWritingCommandMeta[_]] =

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuParquetFileFormat.scala
@@ -33,7 +33,7 @@ import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.internal.SQLConf.ParquetOutputTimestampType
 import org.apache.spark.sql.rapids.ColumnarWriteTaskStatsTracker
 import org.apache.spark.sql.rapids.execution.TrampolineUtil
-import org.apache.spark.sql.types.{ArrayType, DataTypes, DateType, Decimal, DecimalType, StructField, StructType, TimestampType}
+import org.apache.spark.sql.types.{ArrayType, DataType, DataTypes, DateType, Decimal, DecimalType, MapType, StructField, StructType, TimestampType}
 import org.apache.spark.sql.vectorized.ColumnarBatch
 
 object GpuParquetFileFormat {
@@ -105,29 +105,64 @@ object GpuParquetFileFormat {
     }
   }
 
-  def parquetWriterOptionsFromSchema[T <: NestedBuilder[_, _], V <: ParquetColumnWriterOptions]
-  (builder: ParquetColumnWriterOptions.NestedBuilder[T, V],
-   schema: StructType, writeInt96: Boolean): T = {
+  def parquetWriterOptionsFromField[T <: NestedBuilder[_, _], V <: ParquetColumnWriterOptions](
+      builder: ParquetColumnWriterOptions.NestedBuilder[T, V],
+      dataType: DataType,
+      name: String,
+      writeInt96: Boolean,
+      nullable: Boolean): T = {
+    dataType match {
+      case dt: DecimalType =>
+        builder.withDecimalColumn(name, dt.precision, nullable)
+      case TimestampType =>
+        builder.withTimestampColumn(name, writeInt96, nullable)
+      case s: StructType =>
+        builder.withStructColumn(
+          parquetWriterOptionsFromSchema(
+            // the struct takes the caller's nullable flag, since the parent may be a Map's key
+            // passing nullable = false
+            structBuilder(name, nullable),
+            s,
+            writeInt96).build())
+      case a: ArrayType =>
+        builder.withListColumn(
+          parquetWriterOptionsFromField(
+            // the list itself takes the caller's nullable flag, since the parent may be a Map's
+            // key passing nullable = false; the element column below is always nullable (true)
+            listBuilder(name, nullable),
+            a.elementType,
+            name,
+            writeInt96,
+            true).build())
+      case m: MapType =>
+        builder.withMapColumn(
+          mapColumn(name,
+            parquetWriterOptionsFromField(
+              ParquetWriterOptions.builder(),
+              m.keyType,
+              "key",
+              writeInt96,
+              false).build().getChildColumnOptions()(0),
+            parquetWriterOptionsFromField(
+              ParquetWriterOptions.builder(),
+              m.valueType,
+              "value",
+              writeInt96,
+              nullable).build().getChildColumnOptions()(0)))
+      case _ =>
+        builder.withColumns(nullable, name)
+    }
+    builder.asInstanceOf[T]
+  }
+
+  def parquetWriterOptionsFromSchema[T <: NestedBuilder[_, _], V <: ParquetColumnWriterOptions](
+      builder: ParquetColumnWriterOptions.NestedBuilder[T, V],
+      schema: StructType,
+      writeInt96: Boolean): T = {
     // TODO once https://github.com/rapidsai/cudf/issues/7654 is fixed go back to actually
     // setting if the output is nullable or not everywhere we have hard-coded nullable=true
     schema.foreach(field =>
-      field.dataType match {
-        case dt: DecimalType =>
-          builder.withDecimalColumn(field.name, dt.precision, true)
-        case TimestampType =>
-          builder.withTimestampColumn(field.name,
-            writeInt96, true)
-        case s: StructType =>
-          builder.withStructColumn(
-            parquetWriterOptionsFromSchema(structBuilder(field.name), s, writeInt96).build())
-        case a: ArrayType =>
-          builder.withListColumn(
-            parquetWriterOptionsFromSchema(listBuilder(field.name),
-              StructType(Array(StructField(field.name, a.elementType, true))), writeInt96)
-              .build())
-        case _ =>
-          builder.withColumns(true, field.name)
-      }
+      parquetWriterOptionsFromField(builder, field.dataType, field.name, writeInt96, true)
     )
     builder.asInstanceOf[T]
   }