From 8efb5b5730fb5c07522687117a35569eeb30f365 Mon Sep 17 00:00:00 2001
From: Allen Xu
Date: Fri, 14 Jun 2024 12:42:52 +0800
Subject: [PATCH] Update iceberg package version for Spark 3.2.x (#190)

* update iceberg package version for Spark 3.2.x

Signed-off-by: Allen Xu

* update README

Signed-off-by: Allen Xu

---------

Signed-off-by: Allen Xu
---
 nds/README.md                           | 6 ++++--
 nds/convert_submit_cpu_iceberg.template | 2 +-
 nds/maintenance_iceberg.template        | 2 +-
 nds/power_run_cpu_iceberg.template      | 2 +-
 nds/power_run_gpu_iceberg.template      | 2 +-
 5 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/nds/README.md b/nds/README.md
index 94db74b..f4175f8 100644
--- a/nds/README.md
+++ b/nds/README.md
@@ -152,7 +152,8 @@ Parquet, Orc, Avro, JSON and Iceberg are supported for output data format at pre
 only Parquet and Orc are supported.
 
 Note: when exporting data from CSV to Iceberg, user needs to set necessary configs for Iceberg in submit template.
-e.g. [convert_submit_cpu_iceberg.template](./convert_submit_cpu_iceberg.template)
+e.g. [convert_submit_cpu_iceberg.template](./convert_submit_cpu_iceberg.template).
+To run Iceberg against different Spark versions, please modify the Iceberg package version accordingly in the template file.
 
 User can also specify `--tables` to convert specific table or tables. See argument details below.
 
@@ -405,7 +406,8 @@ update operations cannot be done atomically on raw Parquet/Orc files, so we use
 [Iceberg](https://iceberg.apache.org/) as dataset metadata manager to overcome the issue.
 
 Enabling Iceberg requires additional configuration. Please refer to [Iceberg Spark](https://iceberg.apache.org/docs/latest/getting-started/)
-for details. We also provide a Spark submit template with necessary Iceberg configs: [maintenance_iceberg.template](./maintenance_iceberg.template)
+for details. We also provide a Spark submit template with necessary Iceberg configs: [maintenance_iceberg.template](./maintenance_iceberg.template).
+To run Iceberg against different Spark versions, please modify the Iceberg package version accordingly in the template file.
 
 The data maintenance queries are in [data_maintenance](./data_maintenance)
 folder. `DF_*.sql` are DELETE queries while `LF_*.sql` are INSERT queries.
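The README note above relies on Iceberg's runtime artifact naming convention: the Maven coordinate encodes both the Spark minor version and the Scala binary version of the Spark build. A minimal sketch of assembling the coordinate for a given Spark line (variable names are illustrative; note that newer Spark lines require a correspondingly newer Iceberg release, e.g. the 0.13.x line is the first to ship a Spark 3.2 runtime):

  # Coordinate pattern:
  #   org.apache.iceberg:iceberg-spark-runtime-<spark.minor>_<scala.binary>:<iceberg.version>
  SPARK_MINOR=3.2         # Spark line the cluster runs (3.1, 3.2, ...)
  SCALA_BINARY=2.12       # Scala binary version of the Spark build
  ICEBERG_VERSION=0.13.2  # must be a release that provides a runtime for the chosen Spark line
  ICEBERG_PACKAGE="org.apache.iceberg:iceberg-spark-runtime-${SPARK_MINOR}_${SCALA_BINARY}:${ICEBERG_VERSION}"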
diff --git a/nds/convert_submit_cpu_iceberg.template b/nds/convert_submit_cpu_iceberg.template
index c961bfd..2440ac8 100644
--- a/nds/convert_submit_cpu_iceberg.template
+++ b/nds/convert_submit_cpu_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
                    "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
                    "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
                    "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
                    "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
                    "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
                    "--conf" "spark.sql.catalog.spark_catalog.type=hadoop")
diff --git a/nds/maintenance_iceberg.template b/nds/maintenance_iceberg.template
index d5b333e..a3f3fe6 100644
--- a/nds/maintenance_iceberg.template
+++ b/nds/maintenance_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
                    "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
                    "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
                    "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
                    "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
                    "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
                    "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
diff --git a/nds/power_run_cpu_iceberg.template b/nds/power_run_cpu_iceberg.template
index 16cdf64..006e6f0 100644
--- a/nds/power_run_cpu_iceberg.template
+++ b/nds/power_run_cpu_iceberg.template
@@ -30,7 +30,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
                    "--conf" "spark.executor.instances=${NUM_EXECUTORS}"
                    "--conf" "spark.executor.memory=${EXECUTOR_MEMORY}"
                    "--conf" "spark.sql.shuffle.partitions=${SHUFFLE_PARTITIONS}"
-                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
                    "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
                    "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
                    "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
diff --git a/nds/power_run_gpu_iceberg.template b/nds/power_run_gpu_iceberg.template
index 164bdfa..f6e1c04 100644
--- a/nds/power_run_gpu_iceberg.template
+++ b/nds/power_run_gpu_iceberg.template
@@ -41,7 +41,7 @@ export SPARK_CONF=("--master" "${SPARK_MASTER}"
                    "--conf" "spark.rapids.memory.host.spillStorageSize=32G"
                    "--conf" "spark.rapids.memory.pinnedPool.size=8g"
                    "--conf" "spark.rapids.sql.concurrentGpuTasks=${CONCURRENT_GPU_TASKS}"
-                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.1_2.12:0.13.2"
+                   "--packages" "org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:0.13.2"
                    "--conf" "spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions"
                    "--conf" "spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog"
                    "--conf" "spark.sql.catalog.spark_catalog.type=hadoop"
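Each template above only defines the SPARK_CONF bash array; it does nothing on its own. A minimal sketch of how such a template is typically consumed (the driver script name and trailing arguments below are placeholders, not part of this patch):

  # Source the template to populate SPARK_CONF, then expand the array onto
  # spark-submit; each "--conf"/"--packages" pair becomes a CLI argument.
  source ./convert_submit_cpu_iceberg.template
  spark-submit "${SPARK_CONF[@]}" your_job.py --your-args

Keeping the "${SPARK_CONF[@]}" expansion quoted is what preserves each key=value string as a single argument, which is why the templates store the configs as array elements rather than one flat string.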