From f21399f1b33045302cc1c38161d2354ba02bde18 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 6 Sep 2021 11:03:38 +0800 Subject: [PATCH 1/5] orc read map Signed-off-by: Chong Gao --- .../scala/com/nvidia/spark/rapids/GpuOrcScan.scala | 12 ++++++++++++ .../scala/com/nvidia/spark/rapids/GpuOverrides.scala | 2 +- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala index 9d6219d7f00..3e1c6742640 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala @@ -1020,6 +1020,18 @@ private case class GpuOrcFileFilterHandler( setMapping(id) updateMapping(prefixNew, children(i)) } + } else if(schema.getCategory == TypeDescription.Category.MAP) { + val children = schema.getChildren.asScala + for (i <- 0 until children.size) { + val prefixNew = if(i == 0) { + prefix + "._key" + } else { + prefix + "._value" + } + val id = fileSchema.findSubtype(prefixNew).getId + setMapping(id) + updateMapping(prefixNew, children(i)) + } } } updateMapping("", readerSchema, true) diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala index 4f205190062..065810d0aad 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala @@ -798,7 +798,7 @@ object GpuOverrides { TypeSig.UDT).nested())), (OrcFormatType, FileFormatChecks( cudfRead = (TypeSig.commonCudfTypes + TypeSig.ARRAY + TypeSig.DECIMAL_64 + - TypeSig.STRUCT).nested(), + TypeSig.STRUCT + TypeSig.MAP).nested(), cudfWrite = TypeSig.commonCudfTypes, sparkSig = (TypeSig.atomics + TypeSig.STRUCT + TypeSig.ARRAY + TypeSig.MAP + TypeSig.UDT).nested()))) From 552fc56b72615b815f8166e89d53afff1c9d9a45 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 6 Sep 2021 18:22:31 +0800 Subject: [PATCH 2/5] orc read map Signed-off-by: Chong Gao --- integration_tests/src/main/python/orc_test.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index 344714fba2f..22b02948cf9 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -82,9 +82,16 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, StructGen([['child0', byte_gen], ['child1', orc_basic_struct_gen]]), StructGen([['child0', ArrayGen(short_gen)], ['child1', double_gen]])] +all_basic_map_gens = [MapGen(f(nullable=False), f()) for f in [BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, TimestampGen]] + [simple_string_to_string_map_gen] +all_basic_map_gens += [DateGen(start=date(1590, 1, 1))] +orc_map_gens_sample = all_basic_map_gens + [MapGen(StringGen(pattern='key_[0-9]', nullable=False), ArrayGen(string_gen), max_length=10), + MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10), + MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen)] + orc_gens_list = [orc_basic_gens, orc_array_gens_sample, orc_struct_gens_sample, + orc_map_gens_sample, pytest.param([date_gen], marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/131')), pytest.param([timestamp_gen], marks=pytest.mark.xfail(reason='https://github.com/NVIDIA/spark-rapids/issues/131'))] From 40d669070aa4ad1993c2a53da755c7a5d91e5c9d Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Mon, 6 Sep 2021 18:43:52 +0800 Subject: [PATCH 3/5] orc read map Signed-off-by: Chong Gao --- docs/supported_ops.md | 6 +++--- tools/src/main/resources/supportedDataSource.csv | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docs/supported_ops.md b/docs/supported_ops.md index 204e7ac4cbf..a6c25210df5 100644 --- a/docs/supported_ops.md +++ b/docs/supported_ops.md @@ -16153,9 +16153,9 @@ dates or timestamps, or for a lack of type coercion support. NS -PS
max child DECIMAL precision of 18;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types BINARY, MAP, UDT
-NS -PS
max child DECIMAL precision of 18;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types BINARY, MAP, UDT
+PS
max child DECIMAL precision of 18;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types BINARY, UDT
+PS
max child DECIMAL precision of 18;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types BINARY, UDT
+PS
max child DECIMAL precision of 18;
UTC is only supported TZ for child TIMESTAMP;
unsupported child types BINARY, UDT
NS diff --git a/tools/src/main/resources/supportedDataSource.csv b/tools/src/main/resources/supportedDataSource.csv index 5e70a98d5d2..89016aa0e8d 100644 --- a/tools/src/main/resources/supportedDataSource.csv +++ b/tools/src/main/resources/supportedDataSource.csv @@ -1,6 +1,6 @@ Format,Direction,BOOLEAN,BYTE,SHORT,INT,LONG,FLOAT,DOUBLE,DATE,TIMESTAMP,STRING,DECIMAL,NULL,BINARY,CALENDAR,ARRAY,MAP,STRUCT,UDT CSV,read,CO,CO,CO,CO,CO,CO,CO,CO,CO,S,CO,NA,NS,NA,NA,NA,NA,NA -ORC,read,S,S,S,S,S,S,S,S,PS,S,CO,NA,NS,NA,PS,NS,PS,NS +ORC,read,S,S,S,S,S,S,S,S,PS,S,CO,NA,NS,NA,PS,PS,PS,NS ORC,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA Parquet,read,S,S,S,S,S,S,S,S,PS,S,CO,NA,NS,NA,PS,PS,PS,NS Parquet,write,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA,NA From 5610405bf16f0043259019038204ebb29e95d4a7 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Tue, 7 Sep 2021 11:43:36 +0800 Subject: [PATCH 4/5] orc read map Signed-off-by: Chong Gao --- integration_tests/src/main/python/orc_test.py | 17 ++++++++++++----- .../com/nvidia/spark/rapids/GpuOrcScan.scala | 6 +++--- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index 22b02948cf9..31f648eedaf 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -82,11 +82,18 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, StructGen([['child0', byte_gen], ['child1', orc_basic_struct_gen]]), StructGen([['child0', ArrayGen(short_gen)], ['child1', double_gen]])] -all_basic_map_gens = [MapGen(f(nullable=False), f()) for f in [BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, TimestampGen]] + [simple_string_to_string_map_gen] -all_basic_map_gens += [DateGen(start=date(1590, 1, 1))] -orc_map_gens_sample = all_basic_map_gens + [MapGen(StringGen(pattern='key_[0-9]', nullable=False), ArrayGen(string_gen), max_length=10), - MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10), - MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen)] +# similar with parquet map gens +orc_map_gens_sample = [MapGen(f(nullable=False), f()) for f in [ + BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, + lambda nullable=True: TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc), nullable=nullable)]] + \ + [simple_string_to_string_map_gen, + MapGen(StringGen(pattern='key_[0-9]', nullable=False), ArrayGen(string_gen), max_length=10), + MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10), + MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen)] +# test map(date, date) +orc_map_gens_sample += [MapGen(DateGen(start=date(1590, 1, 1), nullable=False), DateGen(start=date(1590, 1, 1)))] +# test map(struct, struct) +orc_map_gens_sample += [MapGen(StructGen([['child0', byte_gen], ['child1', long_gen]], nullable=False), StructGen([['child0', byte_gen], ['child1', long_gen]]))] orc_gens_list = [orc_basic_gens, orc_array_gens_sample, diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala index 3e1c6742640..9823fa4f5bd 100644 --- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala +++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScan.scala @@ -1001,7 +1001,7 @@ private case class GpuOrcFileFilterHandler( if (schema.getCategory == TypeDescription.Category.STRUCT) { val fieldNames = schema.getFieldNames.asScala val children = schema.getChildren.asScala - for (i <- 0 until children.size) { + for (i <- children.indices) { val prefixNew = if (isRoot) { fieldNames(i) } else { @@ -1014,7 +1014,7 @@ private case class GpuOrcFileFilterHandler( } } else if (schema.getCategory == TypeDescription.Category.LIST) { val children = schema.getChildren.asScala - for (i <- 0 until children.size) { + for (i <- children.indices) { val prefixNew = prefix + "._elem" val id = fileSchema.findSubtype(prefixNew).getId setMapping(id) @@ -1022,7 +1022,7 @@ private case class GpuOrcFileFilterHandler( } } else if(schema.getCategory == TypeDescription.Category.MAP) { val children = schema.getChildren.asScala - for (i <- 0 until children.size) { + for (i <- children.indices) { val prefixNew = if(i == 0) { prefix + "._key" } else { From a58e4f232a5ea076eca108ab6ef39f3c5a852993 Mon Sep 17 00:00:00 2001 From: Chong Gao Date: Wed, 8 Sep 2021 16:25:10 +0800 Subject: [PATCH 5/5] refactor test code Signed-off-by: Chong Gao --- integration_tests/src/main/python/orc_test.py | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py index 31f648eedaf..462f67c0437 100644 --- a/integration_tests/src/main/python/orc_test.py +++ b/integration_tests/src/main/python/orc_test.py @@ -82,18 +82,18 @@ def test_basic_read(std_input_path, name, read_func, v1_enabled_list, orc_impl, StructGen([['child0', byte_gen], ['child1', orc_basic_struct_gen]]), StructGen([['child0', ArrayGen(short_gen)], ['child1', double_gen]])] -# similar with parquet map gens -orc_map_gens_sample = [MapGen(f(nullable=False), f()) for f in [ +orc_basic_map_gens = [simple_string_to_string_map_gen] + [MapGen(f(nullable=False), f()) for f in [ BooleanGen, ByteGen, ShortGen, IntegerGen, LongGen, FloatGen, DoubleGen, - lambda nullable=True: TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc), nullable=nullable)]] + \ - [simple_string_to_string_map_gen, - MapGen(StringGen(pattern='key_[0-9]', nullable=False), ArrayGen(string_gen), max_length=10), - MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10), - MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen)] -# test map(date, date) -orc_map_gens_sample += [MapGen(DateGen(start=date(1590, 1, 1), nullable=False), DateGen(start=date(1590, 1, 1)))] -# test map(struct, struct) -orc_map_gens_sample += [MapGen(StructGen([['child0', byte_gen], ['child1', long_gen]], nullable=False), StructGen([['child0', byte_gen], ['child1', long_gen]]))] + lambda nullable=True: TimestampGen(start=datetime(1900, 1, 1, tzinfo=timezone.utc), nullable=nullable), + lambda nullable=True: DateGen(start=date(1590, 1, 1), nullable=nullable)]] + +# Some map gens, but not all because of nesting +orc_map_gens_sample = orc_basic_map_gens + [ + MapGen(StringGen(pattern='key_[0-9]', nullable=False), ArrayGen(string_gen), max_length=10), + MapGen(RepeatSeqGen(IntegerGen(nullable=False), 10), long_gen, max_length=10), + MapGen(StringGen(pattern='key_[0-9]', nullable=False), simple_string_to_string_map_gen), + MapGen(StructGen([['child0', byte_gen], ['child1', long_gen]], nullable=False), + StructGen([['child0', byte_gen], ['child1', long_gen]]))] orc_gens_list = [orc_basic_gens, orc_array_gens_sample,