From 8efb5b5730fb5c07522687117a35569eeb30f365 Mon Sep 17 00:00:00 2001
From: Allen Xu
Date: Fri, 14 Jun 2024 12:42:52 +0800
Subject: [PATCH] Update iceberg package version for Spark 3.2.x (#190)

* update iceberg package version for Spark 3.2.x

Signed-off-by: Allen Xu

* update README

Signed-off-by: Allen Xu

---------

Signed-off-by: Allen Xu
---
 nds/README.md                           | 6 ++++--
 nds/convert_submit_cpu_iceberg.template | 2 +-
 nds/maintenance_iceberg.template        | 2 +-
 nds/power_run_cpu_iceberg.template      | 2 +-
 nds/power_run_gpu_iceberg.template      | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/nds/README.md b/nds/README.md
index 94db74b..f4175f8 100644
--- a/nds/README.md
+++ b/nds/README.md
@@ -152,7 +152,8 @@ Parquet, Orc, Avro, JSON and Iceberg are supported for output data format at pre
 only Parquet and Orc are supported.
 
 Note: when exporting data from CSV to Iceberg, user needs to set necessary configs for Iceberg in submit template.
-e.g. [convert_submit_cpu_iceberg.template](./convert_submit_cpu_iceberg.template)
+e.g. [convert_submit_cpu_iceberg.template](./convert_submit_cpu_iceberg.template).
+To run Iceberg against different Spark versions, please modify the Iceberg package version accordingly in the template file.
 
 User can also specify `--tables` to convert specific table or tables. See argument details below.
 
@@ -405,7 +406,8 @@ update operations cannot be done atomically on raw Parquet/Orc files, so we use
 [Iceberg](https://iceberg.apache.org/) as dataset metadata manager to overcome the issue.
 
 Enabling Iceberg requires additional configuration. Please refer to [Iceberg Spark](https://iceberg.apache.org/docs/latest/getting-started/)
-for details. We also provide a Spark submit template with necessary Iceberg configs: [maintenance_iceberg.template](./maintenance_iceberg.template)
+for details. We also provide a Spark submit template with necessary Iceberg configs: [maintenance_iceberg.template](./maintenance_iceberg.template).
+To run Iceberg against different Spark versions, please modify the Iceberg package version accordingly in the template file.
 
 The data maintenance queries are in [data_maintenance](./data_maintenance)
 folder. `DF_*.sql` are DELETE queries while `LF_*.sql` are INSERT queries.
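The README note above relies on Iceberg's runtime artifact naming convention: the Maven coordinate encodes both the Spark minor version and the Scala binary version of the Spark build. A minimal sketch of assembling the coordinate for a given Spark line (variable names are illustrative; note that newer Spark lines require a correspondingly newer Iceberg release, e.g. the 0.13.x line is the first to ship a Spark 3.2 runtime):

  # Coordinate pattern:
  #   org.apache.iceberg:iceberg-spark-runtime-<spark.minor>_<scala.binary>:<iceberg.version>
  SPARK_MINOR=3.2         # Spark line the cluster runs (3.1, 3.2, ...)
  SCALA_BINARY=2.12       # Scala binary version of the Spark build
  ICEBERG_VERSION=0.13.2  # must be a release that provides a runtime for the chosen Spark line
  ICEBERG_PACKAGE="org.apache.iceberg:iceberg-spark-runtime-${SPARK_MINOR}_${SCALA_BINARY}:${ICEBERG_VERSION}"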
diff --git a/nds/convert_submit_cpu_iceberg.template b/nds/convert_submit_cpu_iceberg.template
index c961bfd..2440ac8 100644
--- a/nds/convert_submit_cpu_iceberg.template
+++ b/nds/convert_submit_cpu_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
                    "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
                    "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
                    "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
                    "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
                    "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
                    "--conf" "spark.sql.catalog.spark_catalog.type=hadoop")
diff --git a/nds/maintenance_iceberg.template b/nds/maintenance_iceberg.template
index d5b333e..a3f3fe6 100644
--- a/nds/maintenance_iceberg.template
+++ b/nds/maintenance_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
                    "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
                    "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
                    "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
                    "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
                    "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
                    "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
diff --git a/nds/power_run_cpu_iceberg.template b/nds/power_run_cpu_iceberg.template
index 16cdf64..006e6f0 100644
--- a/nds/power_run_cpu_iceberg.template
+++ b/nds/power_run_cpu_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
                    "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
                    "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
                    "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
                    "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
                    "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
                    "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
diff --git a/nds/power_run_gpu_iceberg.template b/nds/power_run_gpu_iceberg.template
index 164bdfa..f6e1c04 100644
--- a/nds/power_run_gpu_iceberg.template
+++ b/nds/power_run_gpu_iceberg.template
@@ -41,7 +41,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
                    "--conf" "spark.rapids.memory.host.spillStorageSize=32G"
                    "--conf" "spark.rapids.memory.pinnedPool.size=8g"
                    "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}"
-                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
                    "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
                    "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
                    "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
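Each template above only defines the SPARK_CONF bash array; it does nothing on its own. A minimal sketch of how such a template is typically consumed (the driver script name and trailing arguments below are placeholders, not part of this patch):

  # Source the template to populate SPARK_CONF, then expand the array onto
  # spark-submit; each "--conf"/"--packages" pair becomes a CLI argument.
  source ./convert_submit_cpu_iceberg.template
  spark-submit "${SPARK_CONF[@]}" your_job.py --your-args

Keeping the "${SPARK_CONF[@]}" expansion quoted is what preserves each key=value string as a single argument, which is why the templates store the configs as array elements rather than one flat string.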