Update iceberg package version for Spark 3.2.x (#190)
* update iceberg package version for Spark 3.2.x

Signed-off-by: Allen Xu <allxu@nvidia.com>

* update README

Signed-off-by: Allen Xu <allxu@nvidia.com>

---------

Signed-off-by: Allen Xu <allxu@nvidia.com>
wjxiz1992 authored Jun 14, 2024
1 parent a97870f commit 8efb5b5
Showing 5 changed files with 8 additions and 6 deletions.
6 changes: 4 additions & 2 deletions nds/README.md
@@ -152,7 +152,8 @@
 Parquet, Orc, Avro, JSON and Iceberg are supported for output data format at present,
 only Parquet and Orc are supported.
 Note: when exporting data from CSV to Iceberg, user needs to set necessary configs for Iceberg in submit template.
-e.g. [convert_submit_cpu_iceberg.template](./convert_submit_cpu_iceberg.template)
+e.g. [convert_submit_cpu_iceberg.template](./convert_submit_cpu_iceberg.template).
+To run Iceberg against different Spark versions, please modify the Iceberg package version accordingly in the template file.
 User can also specify `--tables` to convert specific table or tables. See argument details below.
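The added README note tells users to adjust the Iceberg package for their Spark version. As a minimal sketch of what that means in practice: the Maven coordinate the templates pass to `--packages` embeds the Spark minor version and Scala build in the artifact name, so switching Spark versions is a matter of editing those components. The variable names below are illustrative; only the 3.2 / 2.12 / 0.13.2 values come from this commit.

```shell
# Build the Iceberg runtime coordinate used by the templates' --packages flag.
# The artifact name pattern is iceberg-spark-runtime-<spark minor>_<scala>.
SPARK_MINOR="3.2"        # Spark minor version the cluster runs
SCALA_VERSION="2.12"     # Scala build of that Spark distribution
ICEBERG_VERSION="0.13.2" # Iceberg release pinned by this commit
ICEBERG_PACKAGE="org.apache.iceberg:iceberg-spark-runtime-${SPARK_MINOR}_${SCALA_VERSION}:${ICEBERG_VERSION}"
echo "${ICEBERG_PACKAGE}"
```

Changing `SPARK_MINOR` to another supported release (with a matching Iceberg runtime published for it) is the only edit the note asks for.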
@@ -405,7 +406,8 @@
 update operations cannot be done atomically on raw Parquet/Orc files, so we use
 [Iceberg](https://iceberg.apache.org/) as dataset metadata manager to overcome the issue.
 Enabling Iceberg requires additional configuration. Please refer to [Iceberg Spark](https://iceberg.apache.org/docs/latest/getting-started/)
-for details. We also provide a Spark submit template with necessary Iceberg configs: [maintenance_iceberg.template](./maintenance_iceberg.template)
+for details. We also provide a Spark submit template with necessary Iceberg configs: [maintenance_iceberg.template](./maintenance_iceberg.template).
+To run Iceberg against different Spark versions, please modify the Iceberg package version accordingly in the template file.
 The data maintenance queries are in [data_maintenance](./data_maintenance) folder. `DF_*.sql` are
 DELETE queries while `LF_*.sql` are INSERT queries.
2 changes: 1 addition & 1 deletion nds/convert_submit_cpu_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
 "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
 "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
 "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-"--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+"--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
 "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
 "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
 "--conf" "spark.sql.catalog.spark_catalog.type=hadoop")
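The templates define `SPARK_CONF` as a bash array of spark-submit arguments. As a hedged sketch of how such an array is typically consumed (the script name in the comment is illustrative, not part of this commit), the array is expanded element-by-element into the submit command:

```shell
# Minimal reconstruction of a template's SPARK_CONF array; the values here
# are placeholders standing in for the template's environment variables.
SPARK_MASTER="local[*]"
NUM_EXECUTORS="2"
EXECUTOR_MEMORY="8G"
SPARK_CONF=("--master" "${SPARK_MASTER}"
            "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
            "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
            "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2")
# A real invocation would splat the array into spark-submit, e.g.:
#   spark-submit "${SPARK_CONF[@]}" <benchmark script> <args>
echo "${#SPARK_CONF[@]}"
```

Quoting each element and expanding with `"${SPARK_CONF[@]}"` keeps arguments containing spaces intact, which is why the templates use an array rather than a flat string.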
2 changes: 1 addition & 1 deletion nds/maintenance_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
 "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
 "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
 "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-"--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+"--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
 "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
 "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
 "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
2 changes: 1 addition & 1 deletion nds/power_run_cpu_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
 "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
 "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
 "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-"--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+"--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
 "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
 "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
 "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
2 changes: 1 addition & 1 deletion nds/power_run_gpu_iceberg.template
@@ -41,7 +41,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
 "--conf" "spark.rapids.memory.host.spillStorageSize=32G"
 "--conf" "spark.rapids.memory.pinnedPool.size=8g"
 "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}"
-"--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+"--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
 "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
 "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
 "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
