diff --git a/docs-new/.github/vale/styles/vocab/iceberg/accept.txt b/docs-new/.github/vale/styles/vocab/iceberg/accept.txt new file mode 100644 index 000000000000..edc4b8f29823 --- /dev/null +++ b/docs-new/.github/vale/styles/vocab/iceberg/accept.txt @@ -0,0 +1,39 @@ +API +Avro +[Bb]ackport +Cloudera +[Cc]anonicalize +[Cc]iphertext +[Cc]lasspath +[Cc]odec +[Dd]ev +Dremio +[Ee]ndian +Flink +GitHub # the 'H' must be uppercased +Gradle +[Jj]avadoc +Kryo(Serializer)? +[Ll]akehouse +[Mm]etastore(s|'s)? +[Nn]amespace +[Nn]ullable +Nessie +[Pp]laintext +[Pp]luggable +PostgreSQL # Postgres is a nickname +[Pp]ushdown +[Ss]erializable +[Ss]erverless +[Ss]ubstring +Trino +[Uu]nix # https://unix.stackexchange.com/a/532341 +UNIX +[Uu]npartitioned +[Uu]nparseable +[Uu]psert +(?i)uuid +ya?ml +YA?ML +[Zz]order +[Zz]std diff --git a/docs-new/.github/vale/styles/vocab/iceberg/reject.txt b/docs-new/.github/vale/styles/vocab/iceberg/reject.txt new file mode 100644 index 000000000000..7015fe0c4560 --- /dev/null +++ b/docs-new/.github/vale/styles/vocab/iceberg/reject.txt @@ -0,0 +1,15 @@ +# https://learn.microsoft.com/en-us/style-guide/a-z-word-list-term-collections/term-collections/date-time-terms#abbreviating-units-of-time +millis +nanos +\d[smnwy] +\dns +yrs +wks +secs +hrs +mos + +# https://learn.microsoft.com/en-us/style-guide/word-choice/avoid-jargon#testing-for-jargon +[Cc]onfig +[Pp]ythonic +[Ss]emver # semantic versioning https://semver.org/ diff --git a/docs-new/.gitignore b/docs-new/.gitignore new file mode 100644 index 000000000000..29b4103c3972 --- /dev/null +++ b/docs-new/.gitignore @@ -0,0 +1,120 @@ +## Temp remove for first phase +.github/workflows/ + +## MkDocs +/site/ + +## Vale +.github/vale/styles/* +!.github/vale/styles/vocab + +## MacOS + +# General +.DS_Store +.AppleDouble +.LSOverride + +# Icon must end with two \r +Icon + +# Thumbnails +._* + +# Files that might appear in the root of a volume +.DocumentRevisions-V100 +.fseventsd +.Spotlight-V100 +.TemporaryItems 
+.Trashes +.VolumeIcon.icns +.com.apple.timemachine.donotpresent + +# Directories potentially created on remote AFP share +.AppleDB +.AppleDesktop +Network Trash Folder +Temporary Items +.apdisk + +## Linux + +*~ + +# temporary files which can be created if a process still has a handle open of a deleted file +.fuse_hidden* + +# KDE directory preferences +.directory + +# Linux trash folder which might appear on any partition or disk +.Trash-* + +# .nfs files are created when an open file is removed but is still being accessed +.nfs* + +## Eclipse + +.metadata +bin/ +tmp/ +*.tmp +*.bak +*.swp +*~.nib +local.properties +.settings/ +.loadpath +.recommenders + +# External tool builders +.externalToolBuilders/ + +# Locally stored "Eclipse launch configurations" +*.launch + +# PyDev specific (Python IDE for Eclipse) +*.pydevproject + +# CDT-specific (C/C++ Development Tooling) +.cproject + +# CDT- autotools +.autotools + +# Java annotation processor (APT) +.factorypath + +# PDT-specific (PHP Development Tools) +.buildpath + +# sbteclipse plugin +.target + +# Tern plugin +.tern-project + +# TeXlipse plugin +.texlipse + +# STS (Spring Tool Suite) +.springBeans + +# Code Recommenders +.recommenders/ + +# Annotation Processing +.apt_generated/ +.apt_generated_test/ + +# Scala IDE specific (Scala & Java development for Eclipse) +.cache-main +.scala_dependencies +.worksheet + +# Project description file. +# Typically, this file would be tracked if it contains build/dependency configurations: +.project + + + diff --git a/docs-new/.vale.ini b/docs-new/.vale.ini new file mode 100644 index 000000000000..65a3deb2c682 --- /dev/null +++ b/docs-new/.vale.ini @@ -0,0 +1,48 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +StylesPath = .github/vale/styles + +Vocab = iceberg + +MinAlertLevel = suggestion + +Packages = Microsoft, alex + +[*] +BasedOnStyles = Vale +TokenIgnores = \{\{\s*[^\n$]+\s*\}\} + +#Temporarily ignore all errors + +Vale.Spelling = No +Vale.Repetition = No +Vale.Terms = No +Vale.Avoid = No + + +#[*{blogs,talks}.md] +#BasedOnStyles = Vale + +#Vale.Spelling = No +#Vale.Terms = No +#Vale.Avoid = No + +# Temporarily ignore Microsoft and alex styles. +# [*.{md,html}] +# BasedOnStyles = Vale, Microsoft, alex + diff --git a/docs-new/LICENSE b/docs-new/LICENSE new file mode 100644 index 000000000000..261eeb9e9f8b --- /dev/null +++ b/docs-new/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. 
For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/docs-new/README.md b/docs-new/README.md new file mode 100644 index 000000000000..8ff18029486b --- /dev/null +++ b/docs-new/README.md @@ -0,0 +1,70 @@ + + +# making-iceberg-docs + +Testbed for the Iceberg docs. + +## Requirements + +* Python >=3.9 +* pip + +## Install + +``` +python -m venv mkdocs_env +source mkdocs_env/bin/activate + +pip install -r requirements.txt +``` + +## Add git worktrees + +For now, I'm just adding a single version + +``` +git worktree add home/docs/1.3.1 docs-1.3.1 +git worktree add home/javadoc javadoc +``` + +## Build + +``` +mkdocs build +mkdocs serve +``` + +## Validate Links + +Due to the delayed aggregation of subdocs of `mkdocs-monorepo-plugin` there are some warnings that display for subdocs that mkdocs are not able to connect due to being off by a single directory when the version directory is added, which doesn't mirror the directory layout on disk. + +To ensure the links work, I am temporarily using a tool called linkchecker, which will traverse the links on the live site. +``` +pip install linkchecker + +./linkchecker http://localhost:8000 -r1 -Fcsv/link_warnings.csv +``` + + + +## Things to consider + + - Do not use static links to the public iceberg site. + - Only use relative links. If you want to reference the root (the directory where the main mkdocs.yml is located `home` in our case) use "spec.md" vs "/spec.md". 
Also, static sites should only reference the `docs/*` (see next point), but docs can reference the static content normally (e.g. `branching.md` page which is a versioned page linking to `spec.md` which is a static page). + - Avoid statically linking a specific version of the documentation ('latest', '1.3.1', etc...) unless it is absolutely relevant to the context being provided. This should almost never be the case unless referencing legacy functionality. + - When internally linking markdown files to other markdown files, [always use the `.md` suffix](https://github.com/mkdocs/mkdocs/issues/2456#issuecomment-881877986). That will indicate to mkdocs exactly how to treat that link depending on the mode the link is compiled with, e.g. if it becomes a /index.html or .html. Using the `.md` extension will work with either mode. diff --git a/docs-new/home/ASF.md b/docs-new/home/ASF.md new file mode 100644 index 000000000000..c9d5c898c5a8 --- /dev/null +++ b/docs-new/home/ASF.md @@ -0,0 +1,18 @@ + + +Apache Software Foundation links diff --git a/docs-new/home/about.md b/docs-new/home/about.md new file mode 100644 index 000000000000..b377ca1d1a22 --- /dev/null +++ b/docs-new/home/about.md @@ -0,0 +1,30 @@ +--- +Title: What is Iceberg? +--- + + +Iceberg is a high-performance format for huge analytic tables. Iceberg brings the reliability and simplicity of SQL tables to big data, while making it possible for engines like Spark, Trino, Flink, Presto, Hive and Impala to safely work with the same tables, at the same time. +
+
diff --git a/docs-new/home/assets/images/Iceberg-logo-wordmark.png b/docs-new/home/assets/images/Iceberg-logo-wordmark.png new file mode 100644 index 000000000000..6ee6cb4a9956 Binary files /dev/null and b/docs-new/home/assets/images/Iceberg-logo-wordmark.png differ diff --git a/docs-new/home/assets/images/Iceberg-logo.png b/docs-new/home/assets/images/Iceberg-logo.png new file mode 100644 index 000000000000..82f18a2ef1bf Binary files /dev/null and b/docs-new/home/assets/images/Iceberg-logo.png differ diff --git a/docs-new/home/assets/images/contact-bg.jpg b/docs-new/home/assets/images/contact-bg.jpg new file mode 100644 index 000000000000..8d240551e3fa Binary files /dev/null and b/docs-new/home/assets/images/contact-bg.jpg differ diff --git a/docs-new/home/assets/images/favicon-16x16.png b/docs-new/home/assets/images/favicon-16x16.png new file mode 100644 index 000000000000..d8c447d2a389 Binary files /dev/null and b/docs-new/home/assets/images/favicon-16x16.png differ diff --git a/docs-new/home/assets/images/favicon-32x32.png b/docs-new/home/assets/images/favicon-32x32.png new file mode 100644 index 000000000000..0f61caf936f7 Binary files /dev/null and b/docs-new/home/assets/images/favicon-32x32.png differ diff --git a/docs-new/home/assets/images/favicon-96x96.png b/docs-new/home/assets/images/favicon-96x96.png new file mode 100644 index 000000000000..5fa8e5ee9671 Binary files /dev/null and b/docs-new/home/assets/images/favicon-96x96.png differ diff --git a/docs-new/home/assets/images/favicon.ico b/docs-new/home/assets/images/favicon.ico new file mode 100644 index 000000000000..2a1daead1ba7 Binary files /dev/null and b/docs-new/home/assets/images/favicon.ico differ diff --git a/docs-new/home/assets/images/favicon.png b/docs-new/home/assets/images/favicon.png new file mode 100644 index 000000000000..5fa8e5ee9671 Binary files /dev/null and b/docs-new/home/assets/images/favicon.png differ diff --git a/docs-new/home/assets/images/iceberg-logo-icon.png 
b/docs-new/home/assets/images/iceberg-logo-icon.png new file mode 100644 index 000000000000..e4a99c3951e6 Binary files /dev/null and b/docs-new/home/assets/images/iceberg-logo-icon.png differ diff --git a/docs-new/home/assets/images/iceberg-metadata.png b/docs-new/home/assets/images/iceberg-metadata.png new file mode 100644 index 000000000000..48a1b0cee80e Binary files /dev/null and b/docs-new/home/assets/images/iceberg-metadata.png differ diff --git a/docs-new/home/assets/images/intro-bg.jpg b/docs-new/home/assets/images/intro-bg.jpg new file mode 100644 index 000000000000..8d240551e3fa Binary files /dev/null and b/docs-new/home/assets/images/intro-bg.jpg differ diff --git a/docs-new/home/assets/images/intro-bg.webp b/docs-new/home/assets/images/intro-bg.webp new file mode 100644 index 000000000000..1e67a759ef46 Binary files /dev/null and b/docs-new/home/assets/images/intro-bg.webp differ diff --git a/docs-new/home/benchmarks.md b/docs-new/home/benchmarks.md new file mode 100644 index 000000000000..636ad36b1646 --- /dev/null +++ b/docs-new/home/benchmarks.md @@ -0,0 +1,132 @@ +--- +title: "Benchmarks" +--- + + +## Available Benchmarks and how to run them + +Benchmarks are located under `<project-name>/jmh`. It is generally favorable to only run the tests of interest rather than running all available benchmarks. +Also note that JMH benchmarks run within the same JVM as the system-under-test, so results might vary between runs. + +## Running Benchmarks on GitHub + +It is possible to run one or more Benchmarks via the **JMH Benchmarks** GH action on your own fork of the Iceberg repo. 
This GH action takes the following inputs: +* The name of the repository that the benchmarks should be run against, such as `apache/iceberg` or `<user>/iceberg` +* The branch name to run benchmarks against, such as `master` or `my-cool-feature-branch` +* A list of comma-separated double-quoted Benchmark names, such as `"IcebergSourceFlatParquetDataReadBenchmark", "IcebergSourceFlatParquetDataFilterBenchmark", "IcebergSourceNestedListParquetDataWriteBenchmark"` + +Benchmark results will be uploaded once **all** benchmarks are done. + +It is worth noting that the GH runners have limited resources so the benchmark results should rather be seen as an indicator to guide developers in understanding code changes. +It is likely that there is variability in results across different runs, therefore the benchmark results shouldn't be used to form assumptions around production choices. + + +## Running Benchmarks locally + +Below are the existing benchmarks shown with the actual commands on how to run them locally. + + +### IcebergSourceNestedListParquetDataWriteBenchmark +A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceNestedListParquetDataWriteBenchmark -PjmhOutputPath=benchmark/iceberg-source-nested-list-parquet-data-write-benchmark-result.txt` + +### SparkParquetReadersNestedDataBenchmark +A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and Spark Parquet readers. 
To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=SparkParquetReadersNestedDataBenchmark -PjmhOutputPath=benchmark/spark-parquet-readers-nested-data-benchmark-result.txt` + +### SparkParquetWritersFlatDataBenchmark +A benchmark that evaluates the performance of writing Parquet data with a flat schema using Iceberg and Spark Parquet writers. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=SparkParquetWritersFlatDataBenchmark -PjmhOutputPath=benchmark/spark-parquet-writers-flat-data-benchmark-result.txt` + +### IcebergSourceFlatORCDataReadBenchmark +A benchmark that evaluates the performance of reading ORC data with a flat schema using Iceberg and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceFlatORCDataReadBenchmark -PjmhOutputPath=benchmark/iceberg-source-flat-orc-data-read-benchmark-result.txt` + +### SparkParquetReadersFlatDataBenchmark +A benchmark that evaluates the performance of reading Parquet data with a flat schema using Iceberg and Spark Parquet readers. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=SparkParquetReadersFlatDataBenchmark -PjmhOutputPath=benchmark/spark-parquet-readers-flat-data-benchmark-result.txt` + +### VectorizedReadDictionaryEncodedFlatParquetDataBenchmark +A benchmark to compare performance of reading Parquet dictionary encoded data with a flat schema using vectorized Iceberg read path and the built-in file source in Spark. 
To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=VectorizedReadDictionaryEncodedFlatParquetDataBenchmark -PjmhOutputPath=benchmark/vectorized-read-dict-encoded-flat-parquet-data-result.txt` + +### IcebergSourceNestedListORCDataWriteBenchmark +A benchmark that evaluates the performance of writing nested ORC data using Iceberg and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceNestedListORCDataWriteBenchmark -PjmhOutputPath=benchmark/iceberg-source-nested-list-orc-data-write-benchmark-result.txt` + +### VectorizedReadFlatParquetDataBenchmark +A benchmark to compare performance of reading Parquet data with a flat schema using vectorized Iceberg read path and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=VectorizedReadFlatParquetDataBenchmark -PjmhOutputPath=benchmark/vectorized-read-flat-parquet-data-result.txt` + +### IcebergSourceFlatParquetDataWriteBenchmark +A benchmark that evaluates the performance of writing Parquet data with a flat schema using Iceberg and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceFlatParquetDataWriteBenchmark -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-write-benchmark-result.txt` + +### IcebergSourceNestedAvroDataReadBenchmark +A benchmark that evaluates the performance of reading nested Avro data using Iceberg and the built-in file source in Spark. 
To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceNestedAvroDataReadBenchmark -PjmhOutputPath=benchmark/iceberg-source-nested-avro-data-read-benchmark-result.txt` + +### IcebergSourceFlatAvroDataReadBenchmark +A benchmark that evaluates the performance of reading Avro data with a flat schema using Iceberg and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceFlatAvroDataReadBenchmark -PjmhOutputPath=benchmark/iceberg-source-flat-avro-data-read-benchmark-result.txt` + +### IcebergSourceNestedParquetDataWriteBenchmark +A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceNestedParquetDataWriteBenchmark -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-write-benchmark-result.txt` + +### IcebergSourceNestedParquetDataReadBenchmark +A benchmark that evaluates the performance of reading nested Parquet data using Iceberg and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceNestedParquetDataReadBenchmark -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-read-benchmark-result.txt` + +### IcebergSourceNestedORCDataReadBenchmark +A benchmark that evaluates the performance of reading nested ORC data using Iceberg and the built-in file source in Spark. 
To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceNestedORCDataReadBenchmark -PjmhOutputPath=benchmark/iceberg-source-nested-orc-data-read-benchmark-result.txt` + +### IcebergSourceFlatParquetDataReadBenchmark +A benchmark that evaluates the performance of reading Parquet data with a flat schema using Iceberg and the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceFlatParquetDataReadBenchmark -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-read-benchmark-result.txt` + +### IcebergSourceFlatParquetDataFilterBenchmark +A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. This class uses a dataset with a flat schema, where the records are clustered according to the +column used in the filter predicate. The performance is compared to the built-in file source in Spark. To run this benchmark for either spark-2 or spark-3: + +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceFlatParquetDataFilterBenchmark -PjmhOutputPath=benchmark/iceberg-source-flat-parquet-data-filter-benchmark-result.txt` + +### IcebergSourceNestedParquetDataFilterBenchmark +A benchmark that evaluates the file skipping capabilities in the Spark data source for Iceberg. This class uses a dataset with nested data, where the records are clustered according to the +column used in the filter predicate. The performance is compared to the built-in file source in Spark. 
To run this benchmark for either spark-2 or spark-3: +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=IcebergSourceNestedParquetDataFilterBenchmark -PjmhOutputPath=benchmark/iceberg-source-nested-parquet-data-filter-benchmark-result.txt` + +### SparkParquetWritersNestedDataBenchmark +A benchmark that evaluates the performance of writing nested Parquet data using Iceberg and Spark Parquet writers. To run this benchmark for either spark-2 or spark-3: +`./gradlew :iceberg-spark:iceberg-spark[2|3]:jmh -PjmhIncludeRegex=SparkParquetWritersNestedDataBenchmark -PjmhOutputPath=benchmark/spark-parquet-writers-nested-data-benchmark-result.txt` diff --git a/docs-new/home/blogs.md b/docs-new/home/blogs.md new file mode 100644 index 000000000000..a5ef127235c5 --- /dev/null +++ b/docs-new/home/blogs.md @@ -0,0 +1,389 @@ +--- +title: "Blogs" +--- + + +## Iceberg Blogs + +Here is a list of company blogs that talk about Iceberg. The blogs are ordered from most recent to oldest. + + +### [From Hive Tables to Iceberg Tables: Hassle-Free](https://blog.cloudera.com/from-hive-tables-to-iceberg-tables-hassle-free/) +**Date**: July 14th, 2023, **Company**: Cloudera + +**Author**: [Srinivas Rishindra Pothireddi](https://www.linkedin.com/in/srinivas-rishindra/) + +### [12 Times Faster Query Planning With Iceberg Manifest Caching in Impala](https://blog.cloudera.com/12-times-faster-query-planning-with-iceberg-manifest-caching-in-impala/) +**Date**: July 13th, 2023, **Company**: Cloudera + +**Author**: [Riza Suminto](https://www.linkedin.com/in/rizasuminto/) + +### [How Bilibili Builds OLAP Data Lakehouse with Apache Iceberg](https://medium.com/@lirui.fudan/how-bilibili-builds-olap-data-lakehouse-with-apache-iceberg-9f3408e53f9) +**Date**: June 14th, 2023, **Company**: Bilibili + +**Author**: [Rui Li](https://www.linkedin.com/in/rui-li-19282979/) + +### [Introducing the Apache Iceberg Catalog Migration 
Tool](https://www.dremio.com/blog/introducing-the-apache-iceberg-catalog-migration-tool/) +**Date**: May 12th, 2022, **Company**: Dremio + +**Authors**: [Dipankar Mazumdar](https://www.linkedin.com/in/dipankar-mazumdar/) & [Ajantha Bhat](https://www.linkedin.com/in/ajanthabhat/) + +### [3 Ways to Use Python with Apache Iceberg](https://www.dremio.com/blog/3-ways-to-use-python-with-apache-iceberg/) +**Date**: April 12th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [3 Ways to Convert a Delta Lake Table Into an Apache Iceberg Table](https://www.dremio.com/blog/3-ways-to-convert-a-delta-lake-table-into-an-apache-iceberg-table/) +**Date**: April 3rd, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [How to Convert CSV Files into an Apache Iceberg table with Dremio](https://www.dremio.com/blog/how-to-convert-csv-files-into-an-apache-iceberg-table-with-dremio/) +**Date**: April 3rd, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Open Data Lakehouse powered by Iceberg for all your Data Warehouse needs](https://blog.cloudera.com/open-data-lakehouse-powered-by-iceberg-for-all-your-data-warehouse-needs/) +**Date**: April 3rd, 2023, **Company**: Cloudera + +**Authors**: [Zoltan Borok-Nagy](https://www.linkedin.com/in/zoltán-borók-nagy-7370a65b/), [Ayush Saxena](https://www.linkedin.com/in/ayush-saxena151/), [Tamas Mate](https://www.linkedin.com/in/tmater/), [Simhadri Govindappa](https://www.linkedin.com/in/simhadri-govindappa-1a7788148/) + +### [Exploring Branch & Tags in Apache Iceberg using Spark](https://www.dremio.com/blog/exploring-branch-tags-in-apache-iceberg-using-spark/) +**Date**: March 29th, 2022, **Company**: Dremio + +**Author**: [Dipankar Mazumdar](https://www.linkedin.com/in/dipankar-mazumdar/) + +### [Iceberg Tables: Catalog Support Now 
Available](https://www.snowflake.com/blog/iceberg-tables-catalog-support-available-now/) +**Date**: March 29th, 2023, **Company**: Snowflake + +**Authors**: [Ron Ortloff](https://www.linkedin.com/in/ron-ortloff/), [Dennis Huo](https://www.linkedin.com/in/dennis-huo-2aaba92a/) + +### [Dealing with Data Incidents Using the Rollback Feature in Apache Iceberg](https://www.dremio.com/blog/dealing-with-data-incidents-using-the-rollback-feature-in-apache-iceberg/) +**Date**: February 24th, 2022, **Company**: Dremio + +**Author**: [Dipankar Mazumdar](https://www.linkedin.com/in/dipankar-mazumdar/) + +### [Partition and File Pruning for Dremio’s Apache Iceberg-backed Reflections](https://www.dremio.com/blog/partition-and-file-pruning-for-dremios-apache-iceberg-backed-reflections/) +**Date**: February 8th, 2022, **Company**: Dremio + +**Author**: [Benny Chow](https://www.linkedin.com/in/bechow/) + +### [Understanding Iceberg Table Metadata](https://medium.com/snowflake/understanding-iceberg-table-metadata-b1209fbcc7c3) +**Date**: January 30th, 2023, **Company**: Snowflake + +**Author**: [Phani Raj](https://www.linkedin.com/in/phani-raj-9830a31b/) + +### [Creating and managing Apache Iceberg tables using serverless features and without coding](https://medium.com/snowflake/creating-and-managing-apache-iceberg-tables-using-serverless-features-and-without-coding-14d2198cf5b5) +**Date**: January 27th, 2023, **Company**: Snowflake + +**Author**: [Parag Jain](https://www.linkedin.com/in/paragjainsa/) + +### [Getting started with Apache Iceberg](https://medium.com/snowflake/getting-started-with-apache-iceberg-80f338921a31) +**Date**: January 27th, 2023, **Company**: Snowflake + +**Author**: [Jedidiah Rajbhushan](https://www.linkedin.com/in/jrajbhushan/) + +### [How Apache Iceberg enables ACID compliance for data lakes](https://medium.com/snowflake/how-apache-iceberg-enables-acid-compliance-for-data-lakes-9069ae783b60/) +**Date**: January 13th, 2023, **Company**: Snowflake +
+**Authors**: [Sumeet Tandure](https://www.linkedin.com/in/sumeettandure/) + +### [Multi-Cloud Open Lakehouse with Apache Iceberg in Cloudera Data Platform](https://blog.cloudera.com/implement-a-multi-cloud-open-lakehouse-with-apache-iceberg-in-cloudera-data-platform/) +**Date**: December 15th, 2022, **Company**: Cloudera + +**Authors**: [Bill Zhang](https://www.linkedin.com/in/billzhang01/), [Shaun Ahmadian](https://www.linkedin.com/in/ssahmadian/), [Zoltan Borok-Nagy](https://www.linkedin.com/in/zoltán-borók-nagy-7370a65b/), [Vincent Kulandaisamy](https://www.linkedin.com/in/vincentkulandaisamy/) + +### [Connecting Tableau to Apache Iceberg Tables with Dremio](https://www.dremio.com/blog/connecting-tableau-to-apache-iceberg-tables-with-dremio/) +**Date**: December 15th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Getting Started with Project Nessie, Apache Iceberg, and Apache Spark Using Docker](https://www.dremio.com/blog/getting-started-with-project-nessie-apache-iceberg-and-apache-spark-using-docker/) +**Date**: December 15th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Apache Iceberg FAQ](https://www.dremio.com/blog/apache-iceberg-faq/) +**Date**: December 14th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [A Notebook for getting started with Project Nessie, Apache Iceberg, and Apache Spark](https://www.dremio.com/blog/a-notebook-for-getting-started-with-project-nessie-apache-iceberg-and-apache-spark/) +**Date**: December 5th, 2022, **Company**: Dremio + +**Author**: [Dipankar Mazumdar](https://www.linkedin.com/in/dipankar-mazumdar/) + +### [Time Travel with Dremio and Apache Iceberg](https://www.dremio.com/blog/time-travel-with-dremio-and-apache-iceberg/) +**Date**: November 29th, 2022, **Company**: Dremio + +**Author**: [Michael 
Flower](https://www.linkedin.com/in/michael-flower-b0a3474/) + +### [Compaction in Apache Iceberg: Fine-Tuning Your Iceberg Table's Data Files](https://www.dremio.com/subsurface/compaction-in-apache-iceberg-fine-tuning-your-iceberg-tables-data-files/) +**Date**: November 9th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [The Life of a Read Query for Apache Iceberg Tables](https://www.dremio.com/subsurface/the-life-of-a-read-query-for-apache-iceberg-tables/) +**Date**: October 31st, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Puffins and Icebergs: Additional Stats for Apache Iceberg Tables](https://www.dremio.com/subsurface/puffins-and-icebergs-additional-stats-for-apache-iceberg-tables/) +**Date**: October 17th, 2022, **Company**: Dremio + +**Author**: [Dipankar Mazumdar](https://www.linkedin.com/in/dipankar-mazumdar/) + +### [Apache Iceberg and the Right to be Forgotten](https://www.dremio.com/subsurface/apache-iceberg-and-the-right-to-be-forgotten/) +**Date**: September 30th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Streaming Data into Apache Iceberg tables using AWS Kinesis and AWS Glue](https://www.dremio.com/subsurface/streaming-data-into-apache-iceberg-tables-using-aws-kinesis-and-aws-glue/) +**Date**: September 26th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Iceberg Flink Sink: Stream Directly into your Data Warehouse Tables](https://tabular.io/blog/flink-sink/) +**Date**: October 12, 2022, **Company**: Tabular + +**Author**: [Sam Redai](https://www.linkedin.com/in/sredai/) + +### [Partitioning for Correctness (and Performance)](https://tabular.io/blog/partitioning/) +**Date**: September 28, 2022, **Company**: Tabular + +**Author**: [Jason Reid](https://www.linkedin.com/in/jasonreid/) + +### [Ensuring High Performance 
at Any Scale with Apache Iceberg’s Object Store File Layout](https://www.dremio.com/subsurface/ensuring-high-performance-at-any-scale-with-apache-icebergs-object-store-file-layout/) +**Date**: September 20, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Introduction to Apache Iceberg Using Spark](https://www.dremio.com/subsurface/introduction-to-apache-iceberg-using-spark/) +**Date**: September 15, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [How Z-Ordering in Apache Iceberg Helps Improve Performance](https://www.dremio.com/subsurface/how-z-ordering-in-apache-iceberg-helps-improve-performance/) +**Date**: September 13th, 2022, **Company**: Dremio + +**Author**: [Dipankar Mazumdar](https://www.linkedin.com/in/dipankar-mazumdar/) + +### [Apache Iceberg 101 – Your Guide to Learning Apache Iceberg Concepts and Practices](https://www.dremio.com/subsurface/apache-iceberg-101-your-guide-to-learning-apache-iceberg-concepts-and-practices/) +**Date**: September 12th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [A Hands-On Look at the Structure of an Apache Iceberg Table](https://www.dremio.com/subsurface/a-hands-on-look-at-the-structure-of-an-apache-iceberg-table/) +**Date**: August 24, 2022, **Company**: Dremio + +**Author**: [Dipankar Mazumdar](https://www.linkedin.com/in/dipankar-mazumdar/) + +### [Future-Proof Partitioning and Fewer Table Rewrites with Apache Iceberg](https://www.dremio.com/subsurface/future-proof-partitioning-and-fewer-table-rewrites-with-apache-iceberg/) +**Date**: August 18, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [How to use Apache Iceberg in CDP's Open Lakehouse](https://blog.cloudera.com/how-to-use-apache-iceberg-in-cdps-open-lakehouse/) +**Date**: August 8th, 2022, **Company**: Cloudera + +**Authors**: [Bill 
Zhang](https://www.linkedin.com/in/billzhang01/), [Peter Ableda](https://www.linkedin.com/in/peterableda), [Shaun Ahmadian](https://www.linkedin.com/in/ssahmadian/), [Manish Maheshwari](https://www.linkedin.com/in/mmaheshwari/) + +### [Near Real-Time Ingestion For Trino](https://www.starburst.io/blog/near-real-time-ingestion-for-trino/) +**Date**: August 4th, 2022, **Company**: Starburst + +**Authors**: [Eric Hwang](https://www.linkedin.com/in/ericwhwang), [Monica Miller](https://www.linkedin.com/in/monica-d-miller), [Brian Zhan](https://www.linkedin.com/in/bzhan) + +### [How to implement Apache Iceberg in AWS Athena](https://big-data-demystified.ninja/2022/07/28/how-to-implement-apache-iceberg-in-aws-athena/) +**Date**: July 28th, 2022 + +**Author**: [Shneior Dicastro] + +### [Supercharge your Data Lakehouse with Apache Iceberg in Cloudera Data Platform](https://blog.cloudera.com/supercharge-your-data-lakehouse-with-apache-iceberg-in-cloudera-data-platform/) +**Date**: June 30th, 2022, **Company**: Cloudera + +**Authors**: [Bill Zhang](https://www.linkedin.com/in/billzhang01/), [Shaun Ahmadian](https://www.linkedin.com/in/ssahmadian/) + +### [Migrating a Hive Table to an Iceberg Table Hands-on Tutorial](https://www.dremio.com/subsurface/migrating-a-hive-table-to-an-iceberg-table-hands-on-tutorial/) +**Date**: June 6th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Fewer Accidental Full Table Scans Brought to You by Apache Iceberg’s Hidden Partitioning](https://www.dremio.com/subsurface/fewer-accidental-full-table-scans-brought-to-you-by-apache-icebergs-hidden-partitioning/) +**Date**: May 21st, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [An Introduction To The Iceberg Java API Part 2 - Table Scans](https://tabular.io/blog/java-api-part-2/) +**Date**: May 11th, 2022, **Company**: Tabular + +**Author**: [Sam 
Redai](https://www.linkedin.com/in/sredai/) + +### [Iceberg's Guiding Light: The Iceberg Open Table Format Specification](https://tabular.io/blog/iceberg-format-version/) +**Date**: April 26th, 2022, **Company**: Tabular + +**Author**: [Sam Redai](https://www.linkedin.com/in/sredai/) + +### [How to Migrate a Hive Table to an Iceberg Table](https://www.dremio.com/subsurface/how-to-migrate-a-hive-table-to-an-iceberg-table/) +**Date**: April 15th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [Using Iceberg's S3FileIO Implementation To Store Your Data In MinIO](https://tabular.io/blog/minio/) +**Date**: April 14th, 2022, **Company**: Tabular + +**Author**: [Sam Redai](https://www.linkedin.com/in/sredai/) + +### [Maintaining Iceberg Tables – Compaction, Expiring Snapshots, and More](https://www.dremio.com/subsurface/maintaining-iceberg-tables-compaction-expiring-snapshots-and-more/) +**Date**: April 7th, 2022, **Company**: Dremio + +**Author**: [Alex Merced](https://www.linkedin.com/in/alexmerced/) + +### [An Introduction To The Iceberg Java API - Part 1](https://tabular.io/blog/java-api-part-1/) +**Date**: April 1st, 2022, **Company**: Tabular + +**Author**: [Sam Redai](https://www.linkedin.com/in/sredai/) + +### [Integrated Audits: Streamlined Data Observability With Apache Iceberg](https://tabular.io/blog/integrated-audits/) +**Date**: March 2nd, 2022, **Company**: Tabular + +**Author**: [Sam Redai](https://www.linkedin.com/in/sredai/) + +### [Introducing Apache Iceberg in Cloudera Data Platform](https://blog.cloudera.com/introducing-apache-iceberg-in-cloudera-data-platform/) +**Date**: February 23rd, 2022, **Company**: Cloudera + +**Authors**: [Bill Zhang](https://www.linkedin.com/in/billzhang01/), [Peter Vary](https://www.linkedin.com/in/peter-vary/), [Marton Bod](https://www.linkedin.com/in/martonbod/), [Wing Yew Poon](https://github.com/wypoon) + +### [What's new in Iceberg 
0.13](https://tabular.io/blog/whats-new-in-iceberg-0.13/) +**Date**: February 22nd, 2022, **Company**: Tabular + +**Author**: [Ryan Blue](https://www.linkedin.com/in/rdblue/) + +### [Apache Iceberg Becomes Industry Open Standard with Ecosystem Adoption](https://www.dremio.com/apache-iceberg-becomes-industry-open-standard-with-ecosystem-adoption/) +**Date**: February 3rd, 2022, **Company**: Dremio + +**Author**: [Mark Lyons](https://www.linkedin.com/in/markclyons/) + +### [Docker, Spark, and Iceberg: The Fastest Way to Try Iceberg!](https://tabular.io/blog/docker-spark-and-iceberg/) +**Date**: February 2nd, 2022, **Company**: Tabular + +**Author**: [Sam Redai](https://www.linkedin.com/in/sredai/), [Kyle Bendickson](https://www.linkedin.com/in/kylebendickson/) + +### [Expanding the Data Cloud with Apache Iceberg](https://www.snowflake.com/blog/expanding-the-data-cloud-with-apache-iceberg/) +**Date**: January 21st, 2022, **Company**: Snowflake + +**Author**: [James Malone](https://www.linkedin.com/in/jamesamalone/) + +### [Iceberg FileIO: Cloud Native Tables](https://tabular.io/blog/iceberg-fileio/) +**Date**: December 16th, 2021, **Company**: Tabular + +**Author**: [Daniel Weeks](https://www.linkedin.com/in/daniel-weeks-a1946860/) + +### [Using Spark in EMR with Apache Iceberg](https://tabular.io/blog/emr-spark-and-iceberg/) +**Date**: December 10th, 2021, **Company**: Tabular + +**Author**: [Sam Redai](https://www.linkedin.com/in/sredai/) + +### [Using Flink CDC to synchronize data from MySQL sharding tables and build real-time data lake](https://ververica.github.io/flink-cdc-connectors/master/content/quickstart/build-real-time-data-lake-tutorial.html) +**Date**: November 11th, 2021, **Company**: Ververica, Alibaba Cloud + +**Author**: [Yuxia Luo](https://github.com/luoyuxia), [Jark Wu](https://github.com/wuchong), [Zheng Hu](https://www.linkedin.com/in/zheng-hu-37017683/) + +### [Metadata Indexing in Iceberg](https://tabular.io/blog/iceberg-metadata-indexing/) 
+**Date**: October 10th, 2021, **Company**: Tabular + +**Author**: [Ryan Blue](https://www.linkedin.com/in/rdblue/) + +### [Using Debezium to Create a Data Lake with Apache Iceberg](https://debezium.io/blog/2021/10/20/using-debezium-create-data-lake-with-apache-iceberg/) +**Date**: October 20th, 2021, **Company**: Memiiso Community + +**Author**: [Ismail Simsek](https://www.linkedin.com/in/ismailsimsek/) + +### [How to Analyze CDC Data in Iceberg Data Lake Using Flink](https://www.alibabacloud.com/blog/how-to-analyze-cdc-data-in-iceberg-data-lake-using-flink_597838) +**Date**: June 15th, 2021, **Company**: Alibaba Cloud Community + +**Author**: [Li Jinsong](https://www.linkedin.com/in/%E5%8A%B2%E6%9D%BE-%E6%9D%8E-48b54b101/), [Hu Zheng](https://www.linkedin.com/in/zheng-hu-37017683/), [Yang Weihai](https://www.linkedin.com/in/weihai-yang-697a16224/), [Peidan Li](https://www.linkedin.com/in/peidian-li-18938820a/) + +### [Apache Iceberg: An Architectural Look Under the Covers](https://www.dremio.com/apache-iceberg-an-architectural-look-under-the-covers/) +**Date**: July 6th, 2021, **Company**: Dremio + +**Author**: [Jason Hughes](https://www.linkedin.com/in/jasonhhughes/) + +### [Migrating to Apache Iceberg at Adobe Experience Platform](https://medium.com/adobetech/migrating-to-apache-iceberg-at-adobe-experience-platform-40fa80f8b8de) +**Date**: Jun 17th, 2021, **Company**: Adobe + +**Author**: [Romin Parekh](https://www.linkedin.com/in/rominparekh/), [Miao Wang](https://www.linkedin.com/in/miao-wang-0406a74/), [Shone Sadler](https://www.linkedin.com/in/shonesadler/) + +### [Flink + Iceberg: How to Construct a Whole-scenario Real-time Data Warehouse](https://www.alibabacloud.com/blog/flink-%2B-iceberg-how-to-construct-a-whole-scenario-real-time-data-warehouse_597824) +**Date**: Jun 8th, 2021, **Company**: Tencent + +**Author** [Shu (Simon Su) Su](https://www.linkedin.com/in/shu-su-62944994/) + +### [Trino on Ice III: Iceberg Concurrency Model, Snapshots, and the 
Iceberg Spec](https://blog.starburst.io/trino-on-ice-iii-iceberg-concurrency-model-snapshots-and-the-iceberg-spec) +**Date**: May 25th, 2021, **Company**: Starburst + +**Author**: [Brian Olsen](https://www.linkedin.com/in/bitsondatadev) + +### [Trino on Ice II: In-Place Table Evolution and Cloud Compatibility with Iceberg](https://blog.starburst.io/trino-on-ice-ii-in-place-table-evolution-and-cloud-compatibility-with-iceberg) +**Date**: May 11th, 2021, **Company**: Starburst + +**Author**: [Brian Olsen](https://www.linkedin.com/in/bitsondatadev) + +### [Trino On Ice I: A Gentle Introduction To Iceberg](https://blog.starburst.io/trino-on-ice-i-a-gentle-introduction-to-iceberg) +**Date**: Apr 27th, 2021, **Company**: Starburst + +**Author**: [Brian Olsen](https://www.linkedin.com/in/bitsondatadev) + +### [Apache Iceberg: A Different Table Design for Big Data](https://thenewstack.io/apache-iceberg-a-different-table-design-for-big-data/) +**Date**: Feb 1st, 2021, **Company**: thenewstack.io + +**Author**: [Susan Hall](https://thenewstack.io/author/susanhall/) + +### [A Short Introduction to Apache Iceberg](https://medium.com/expedia-group-tech/a-short-introduction-to-apache-iceberg-d34f628b6799) +**Date**: Jan 26th, 2021, **Company**: Expedia + +**Author**: [Christine Mathiesen](https://www.linkedin.com/in/christine-mathiesen-676a98159/) + +### [Taking Query Optimizations to the Next Level with Iceberg](https://medium.com/adobetech/taking-query-optimizations-to-the-next-level-with-iceberg-6c968b83cd6f) +**Date**: Jan 14th, 2021, **Company**: Adobe + +**Author**: [Gautam Kowshik](https://www.linkedin.com/in/gautamk/), [Xabriel J. 
Collazo Mojica](https://www.linkedin.com/in/xabriel/) + +### [FastIngest: Low-latency Gobblin with Apache Iceberg and ORC format](https://engineering.linkedin.com/blog/2021/fastingest-low-latency-gobblin) +**Date**: Jan 6th, 2021, **Company**: Linkedin + +**Author**: [Zihan Li](https://www.linkedin.com/in/zihan-li-0a8a15149/), [Sudarshan Vasudevan](https://www.linkedin.com/in/suddu/), [Lei Sun](https://www.linkedin.com/in/lei-s-a93138a0/), [Shirshanka Das](https://www.linkedin.com/in/shirshankadas/) + +### [High Throughput Ingestion with Iceberg](https://medium.com/adobetech/high-throughput-ingestion-with-iceberg-ccf7877a413f) +**Date**: Dec 22nd, 2020, **Company**: Adobe + +**Author**: [Andrei Ionescu](http://linkedin.com/in/andreiionescu), [Shone Sadler](https://www.linkedin.com/in/shonesadler/), [Anil Malkani](https://www.linkedin.com/in/anil-malkani-52861a/) + +### [Optimizing data warehouse storage](https://netflixtechblog.com/optimizing-data-warehouse-storage-7b94a48fdcbe) +**Date**: Dec 21st, 2020, **Company**: Netflix + +**Author**: [Anupom Syam](https://www.linkedin.com/in/anupom/) + +### [Iceberg at Adobe](https://medium.com/adobetech/iceberg-at-adobe-88cf1950e866) +**Date**: Dec 3rd, 2020, **Company**: Adobe + +**Author**: [Shone Sadler](https://www.linkedin.com/in/shonesadler/), [Romin Parekh](https://www.linkedin.com/in/rominparekh/), [Anil Malkani](https://www.linkedin.com/in/anil-malkani-52861a/) + +### [Bulldozer: Batch Data Moving from Data Warehouse to Online Key-Value Stores](https://netflixtechblog.com/bulldozer-batch-data-moving-from-data-warehouse-to-online-key-value-stores-41bac13863f8) +**Date**: Oct 27th, 2020, **Company**: Netflix + +**Author**: [Tianlong Chen](https://www.linkedin.com/in/tianlong-chen-39189b7a/), [Ioannis Papapanagiotou](https://www.linkedin.com/in/ipapapa/) diff --git a/docs-new/home/catalog.md b/docs-new/home/catalog.md new file mode 100644 index 000000000000..0480f9423322 --- /dev/null +++ b/docs-new/home/catalog.md @@ 
-0,0 +1,48 @@ +--- +title: "Iceberg Catalogs" +--- + + +# Iceberg Catalogs + +## Overview + +You may think of Iceberg as a format for managing data in a single table, but the Iceberg library needs a way to keep track of those tables by name. Tasks like creating, dropping, and renaming tables are the responsibility of a catalog. Catalogs manage a collection of tables that are usually grouped into namespaces. The most important responsibility of a catalog is tracking a table's current metadata, which is provided by the catalog when you load a table. + +The first step when using an Iceberg client is almost always initializing and configuring a catalog. The configured catalog is then used by compute engines to execute catalog operations. Multiple types of compute engines using a shared Iceberg catalog allows them to share a common data layer. + +A catalog is almost always configured through the processing engine which passes along a set of properties during initialization. Different processing engines have different ways to configure a catalog. When configuring a catalog, it’s always best to refer to the [Iceberg documentation](docs/latest/configuration.md#catalog-properties) as well as the docs for the specific processing engine being used. Ultimately, these configurations boil down to a common set of catalog properties that will be passed to configure the Iceberg catalog. + +## Catalog Implementations + +Iceberg catalogs are flexible and can be implemented using almost any backend system. They can be plugged into any Iceberg runtime, and allow any processing engine that supports Iceberg to load the tracked Iceberg tables. Iceberg also comes with a number of catalog implementations that are ready to use out of the box. 
+ +This includes: +- REST - a server-side catalog that’s exposed through a REST API +- Hive Metastore - tracks namespaces and tables using a Hive metastore +- JDBC - tracks namespaces and tables in a simple JDBC database +- Nessie - a transactional catalog that tracks namespaces and tables in a database with git-like version control + +There are more catalog types in addition to the ones listed here as well as custom catalogs that are developed to include specialized functionality. + +## Decoupling Using the REST Catalog + +The REST catalog was introduced in the Iceberg 0.14.0 release and provides greater control over how Iceberg catalogs are implemented. Instead of using technology-specific logic contained in the catalog clients, the implementation details of a REST catalog live on the catalog server. If you’re familiar with Hive, this is somewhat similar to the Hive thrift service that allows access to a Hive server over a single port. The server-side logic can be written in any language and use any custom technology, as long as the API follows the [Iceberg REST Open API specification](https://github.com/apache/iceberg/blob/master/open-api/rest-catalog-open-api.yaml). + +A great benefit of the REST catalog is that it allows you to use a single client to talk to any catalog backend. This increased flexibility makes +it easier to make custom catalogs compatible with engines like Athena or Starburst without requiring the inclusion of a Jar into the classpath. diff --git a/docs-new/home/community.md b/docs-new/home/community.md new file mode 100644 index 000000000000..bf5d4449b43e --- /dev/null +++ b/docs-new/home/community.md @@ -0,0 +1,106 @@ +--- +title: "Community" +--- + + +# Welcome! + +Apache Iceberg tracks issues in GitHub and prefers to receive contributions as pull requests. + +Community discussions happen primarily on the dev mailing list, on the apache-iceberg Slack workspace, and on specific GitHub issues.
+ +## Contribute + +See [Contributing](contribute.md) for more details on how to contribute to Iceberg. + +## Issues + +Issues are tracked in GitHub: + +* [View open issues][open-issues] +* [Open a new issue][new-issue] + +[open-issues]: https://github.com/apache/iceberg/issues +[new-issue]: https://github.com/apache/iceberg/issues/new + +## Slack + +We use the [Apache Iceberg workspace](https://apache-iceberg.slack.com/) on Slack. To be invited, follow [this invite link](https://join.slack.com/t/apache-iceberg/shared_invite/zt-1znkcg5zm-7_FE~pcox347XwZE3GNfPg). + +Please note that this link may occasionally break when Slack does an upgrade. If you encounter problems using it, please let us know by sending an email to <dev@iceberg.apache.org>. + +## Iceberg Community Events + +This calendar contains two calendar feeds: + +* Iceberg Community Events - Events such as conferences and meetups, aimed to educate and inspire Iceberg users. +* Iceberg Dev Events - Events such as the triweekly Iceberg sync, aimed to discuss the project roadmap and how to implement features. + +You can subscribe to either or both of these calendars by clicking the "+ Google Calendar" icon on the bottom right.
+ + + +## Mailing Lists + +Iceberg has four mailing lists: + +* **Developers**: <dev@iceberg.apache.org> -- used for community discussions + - [Subscribe](mailto:dev-subscribe@iceberg.apache.org) + - [Unsubscribe](mailto:dev-unsubscribe@iceberg.apache.org) + - [Archive](https://lists.apache.org/list.html?dev@iceberg.apache.org) +* **Commits**: <commits@iceberg.apache.org> -- distributes commit notifications + - [Subscribe](mailto:commits-subscribe@iceberg.apache.org) + - [Unsubscribe](mailto:commits-unsubscribe@iceberg.apache.org) + - [Archive](https://lists.apache.org/list.html?commits@iceberg.apache.org) +* **Issues**: <issues@iceberg.apache.org> -- GitHub issue tracking + - [Subscribe](mailto:issues-subscribe@iceberg.apache.org) + - [Unsubscribe](mailto:issues-unsubscribe@iceberg.apache.org) + - [Archive](https://lists.apache.org/list.html?issues@iceberg.apache.org) +* **Private**: <private@iceberg.apache.org> -- private list for the PMC to discuss sensitive issues related to the health of the project + - [Archive](https://lists.apache.org/list.html?private@iceberg.apache.org) + +## Community Guidelines + +### Apache Iceberg Community Guidelines + +The Apache Iceberg community is built on the principles described in the [Apache Way](https://www.apache.org/theapacheway/index.html) +and all who engage with the community are expected to be respectful, open, come with the best interests of the community in mind, +and abide by the Apache Foundation [Code of Conduct](https://www.apache.org/foundation/policies/conduct.html). + +### Participants with Corporate Interests + +A wide range of corporate entities have interests that overlap in both features and frameworks related to Iceberg and while we +encourage engagement and contributions, the community is not a venue for marketing, solicitation, or recruitment. + +Any vendor who wants to participate in the Apache Iceberg community Slack workspace should create a dedicated vendor channel +for their organization prefixed by `vendor-`.
+ +This space can be used to discuss features and integration with Iceberg related to the vendor offering. This space should not +be used to promote competing vendor products/services or disparage other vendor offerings. Discussion should be focused on +questions asked by the community and not to expand/introduce/redirect users to alternate offerings. + +### Marketing / Solicitation / Recruiting + +The Apache Iceberg community is a space for everyone to operate free of influence. The development lists, Slack workspace, +and GitHub should not be used to market products or services. Solicitation or overt promotion should not be performed in common +channels or through direct messages. + +Recruitment of community members should not be conducted through direct messages or community channels, but opportunities +related to contributing to or using Iceberg can be posted to the `#jobs` channel. + +For questions regarding any of the guidelines above, please contact a PMC member. diff --git a/docs-new/home/contribute.md b/docs-new/home/contribute.md new file mode 100644 index 000000000000..9b47fcbb283f --- /dev/null +++ b/docs-new/home/contribute.md @@ -0,0 +1,397 @@ +--- +title: "Contribute" +--- + +# Contributing + +On this page, you will find some guidelines on contributing to Apache Iceberg. Please keep in mind that none of +these are hard rules and they're meant as a collection of helpful suggestions to make contributing as seamless an +experience as possible. + +If you are thinking of contributing but first would like to discuss the change you wish to make, we welcome you to +head over to the [Community](community.md) page on the official Iceberg documentation site +to find a number of ways to connect with the community, including Slack and our mailing lists. Of course, always feel +free to just open a [new issue](https://github.com/apache/iceberg/issues/new) in the GitHub repo.
You can also check the following for a [good first issue](https://github.com/apache/iceberg/issues?q=is%3Aissue+is%3Aopen+label%3A%22good+first+issue%22). + +The Iceberg Project is hosted on GitHub at <https://github.com/apache/iceberg>. + +## Pull Request Process + +The Iceberg community prefers to receive contributions as [GitHub pull requests][github-pr-docs]. + +[View open pull requests][iceberg-prs] + + +[iceberg-prs]: https://github.com/apache/iceberg/pulls +[github-pr-docs]: https://help.github.com/articles/about-pull-requests/ + +* PRs are automatically labeled based on the content by our github-actions labeling action +* It's helpful to include a prefix in the summary that provides context to PR reviewers, such as `Build:`, `Docs:`, `Spark:`, `Flink:`, `Core:`, `API:` +* If a PR is related to an issue, adding `Closes #1234` in the PR description will automatically close the issue and helps keep the project clean +* If a PR is posted for visibility and isn't necessarily ready for review or merging, be sure to convert the PR to a draft + + +## Building the Project Locally + +Iceberg is built using Gradle with Java 8 or Java 11.
+ +* To invoke a build and run tests: `./gradlew build` +* To skip tests: `./gradlew build -x test -x integrationTest` +* To fix code style: `./gradlew spotlessApply` +* To build particular Spark/Flink Versions: `./gradlew build -DsparkVersions=3.2,3.3 -DflinkVersions=1.14` + +Iceberg table support is organized in library modules: + +* `iceberg-common` contains utility classes used in other modules +* `iceberg-api` contains the public Iceberg API +* `iceberg-core` contains implementations of the Iceberg API and support for Avro data files, **this is what processing engines should depend on** +* `iceberg-parquet` is an optional module for working with tables backed by Parquet files +* `iceberg-arrow` is an optional module for reading Parquet into Arrow memory +* `iceberg-orc` is an optional module for working with tables backed by ORC files +* `iceberg-hive-metastore` is an implementation of Iceberg tables backed by the Hive metastore Thrift client +* `iceberg-data` is an optional module for working with tables directly from JVM applications + +Iceberg also has modules for adding Iceberg support to processing engines: + +* `iceberg-spark` is an implementation of Spark's Datasource V2 API for Iceberg with submodules for each Spark version (use runtime jars for a shaded version) +* `iceberg-flink` contains classes for integrating with Apache Flink (use iceberg-flink-runtime for a shaded version) +* `iceberg-mr` contains an InputFormat and other classes for integrating with Apache Hive +* `iceberg-pig` is an implementation of Pig's LoadFunc API for Iceberg + +## Setting up IDE and Code Style + +### Configuring Code Formatter for Eclipse/IntelliJ + +Follow the instructions for [Eclipse](https://github.com/google/google-java-format#eclipse) or +[IntelliJ](https://github.com/google/google-java-format#intellij-android-studio-and-other-jetbrains-ides) to install the **google-java-format** plugin (note the required manual actions for IntelliJ).
+ + +## Semantic Versioning + +Apache Iceberg leverages [semantic versioning](https://semver.org/#semantic-versioning-200) to ensure compatibility +for developers and users of the Iceberg libraries as APIs and implementations evolve. +The requirements and guarantees provided depend on the subproject as described below: + +### Major Version Deprecations Required + +__Modules__ +`iceberg-api` + +The API subproject is the main interface for developers and users of the Iceberg API and therefore has the strongest +guarantees. +Evolution of the interfaces in this subproject is enforced by [Revapi](https://revapi.org/) and requires +explicit acknowledgement of API changes. +All public interfaces and classes require a deprecation cycle of one major version. +Any backward incompatible changes should be annotated as `@Deprecated` and removed in the next major release. +Backward compatible changes are allowed within major versions. + +### Minor Version Deprecations Required + +__Modules__ +`iceberg-common` +`iceberg-core` +`iceberg-data` +`iceberg-orc` +`iceberg-parquet` + +Changes to public interfaces and classes in the subprojects listed above require a deprecation cycle of one minor +release. +These projects contain common and internal code used by other projects and can evolve within a major release. +Minor release deprecation will provide other subprojects and external projects notice and opportunity to transition +to new implementations. + +### Minor Version Deprecations Discretionary + +__Modules__ (All modules not referenced above) + +Other modules are less likely to be extended directly and modifications should make a good faith effort to follow a +minor version deprecation cycle. +If there are significant structural or design changes that result in deprecations +being difficult to orchestrate, it is up to the committers to decide if deprecation is necessary. 
+ +## Deprecation Notices + +All interfaces, classes, and methods targeted for deprecation must include the following: + +1. `@Deprecated` annotation on the appropriate element +2. `@deprecated` javadoc comment including: the version for removal, the appropriate alternative for usage +3. Replacement of existing code paths that use the deprecated behavior + +Example: + +```java + /** + * Set the sequence number for this manifest entry. + * + * @param sequenceNumber a sequence number + * @deprecated since 1.0.0, will be removed in 1.1.0; use dataSequenceNumber() instead. + */ + @Deprecated + void sequenceNumber(long sequenceNumber); +``` + + +## Iceberg Code Contribution Guidelines + +### Style + +For Python, please use the tox command `tox -e format` to apply autoformatting to the project. + +Java code adheres to the [Google style](https://google.github.io/styleguide/javaguide.html), which will be verified via `./gradlew spotlessCheck` during builds. +In order to automatically fix Java code style issues, please use `./gradlew spotlessApply`. + +**NOTE**: The **google-java-format** plugin will always use the latest version of the **google-java-format**. However, `spotless` itself is configured to use **google-java-format** 1.7 +since that version is compatible with JDK 8. When formatting the code in the IDE, there is a slight chance that it will produce slightly different results. In such a case please run `./gradlew spotlessApply` +as CI will check the style against **google-java-format** 1.7. + +### Copyright + +Each file must include the Apache license information as a header. + +``` +Licensed to the Apache Software Foundation (ASF) under one +or more contributor license agreements. See the NOTICE file +distributed with this work for additional information +regarding copyright ownership. The ASF licenses this file +to you under the Apache License, Version 2.0 (the +"License"); you may not use this file except in compliance +with the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, +software distributed under the License is distributed on an +"AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +KIND, either express or implied. See the License for the +specific language governing permissions and limitations +under the License. +``` + +### Configuring Copyright for IntelliJ IDEA + +Every file needs to include the Apache license as a header. This can be automated in IntelliJ by +adding a Copyright profile: + +1. In the **Settings/Preferences** dialog go to **Editor → Copyright → Copyright Profiles**. +2. Add a new profile and name it **Apache**. +3. Add the following text as the license text: + + ``` + Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + ``` +4. Go to **Editor → Copyright** and choose the **Apache** profile as the default profile for this + project. +5. Click **Apply**. + +### Java style guidelines + +#### Method naming + +1. Make method names as short as possible, while being clear. Omit needless words. +2. Avoid `get` in method names, unless an object must be a Java bean. 
+ * In most cases, replace `get` with a more specific verb that describes what is happening in the method, like `find` or `fetch`. + * If there isn't a more specific verb or the method is a getter, omit `get` because it isn't helpful to readers and makes method names longer. +3. Where possible, use words and conjugations that form correct sentences in English when read + * For example, `Transform.preservesOrder()` reads correctly in an if statement: `if (transform.preservesOrder()) { ... }` + +#### Boolean arguments + +Avoid boolean arguments to methods that are not `private` to avoid confusing invocations like `sendMessage(false)`. It is better to create two methods with distinct names and behavior, even if both are implemented by one internal method. + +```java + // prefer exposing suppressFailure in method names + public void sendMessageIgnoreFailure() { + sendMessageInternal(true); + } + + public void sendMessage() { + sendMessageInternal(false); + } + + private void sendMessageInternal(boolean suppressFailure) { + ... + } +``` + +When passing boolean arguments to existing or external methods, use inline comments to help the reader understand actions without an IDE. + +```java + // BAD: it is not clear what false controls + dropTable(identifier, false); + + // GOOD: these uses of dropTable are clear to the reader + dropTable(identifier, true /* purge data */); + dropTable(identifier, purge); +``` + +#### Config naming + +1. Use `-` to link words in one concept + * For example, the preferred convention is `access-key-id` rather than `access.key.id` +2. Use `.` to create a hierarchy of config groups + * For example, `s3` in `s3.access-key-id`, `s3.secret-access-key` + +## Testing + +### AssertJ + +Prefer using [AssertJ](https://github.com/assertj/assertj) assertions as those provide a rich and intuitive set of strongly-typed assertions. +Checks can be expressed in a fluent way and [AssertJ](https://github.com/assertj/assertj) provides rich context when assertions fail. 
+Additionally, [AssertJ](https://github.com/assertj/assertj) has powerful testing capabilities on collections and exceptions. +Please refer to the [usage guide](https://assertj.github.io/doc/#assertj-core-assertions-guide) for additional examples. + +```java +// bad: will only say true != false when check fails +assertTrue(x instanceof Xyz); + +// better: will show type of x when check fails +assertThat(x).isInstanceOf(Xyz.class); + +// bad: will only say true != false when check fails +assertTrue(catalog.listNamespaces().containsAll(expected)); + +// better: will show content of expected and of catalog.listNamespaces() if check fails +assertThat(catalog.listNamespaces()).containsAll(expected); +``` +```java +// ok +assertNotNull(metadataFileLocations); +assertEquals(metadataFileLocations.size(), 4); + +// better: will show the content of metadataFileLocations if check fails +assertThat(metadataFileLocations).isNotNull().hasSize(4); + +// or +assertThat(metadataFileLocations).isNotNull().hasSameSizeAs(expected).hasSize(4); +``` + +```java +// bad +try { + catalog.createNamespace(deniedNamespace); + Assert.fail("this should fail"); +} catch (Exception e) { + assertEquals(AccessDeniedException.class, e.getClass()); + assertEquals("User 'testUser' has no permission to create namespace", e.getMessage()); +} + +// better +assertThatThrownBy(() -> catalog.createNamespace(deniedNamespace)) + .isInstanceOf(AccessDeniedException.class) + .hasMessage("User 'testUser' has no permission to create namespace"); +``` +Checks on exceptions should always make sure to assert that a particular exception message has occurred. + + +### Awaitility + +Avoid using `Thread.sleep()` in tests as it leads to long test durations and flaky behavior if a condition takes slightly longer than expected. 
+ +```java +deleteTablesAsync(); +Thread.sleep(3000L); +assertThat(tables()).isEmpty(); +``` + +A better alternative is using [Awaitility](https://github.com/awaitility/awaitility) to make sure `tables()` is eventually empty. The below example will run the check +with a default polling interval of **100 milliseconds**: + +```java +deleteTablesAsync(); +Awaitility.await("Tables were not deleted") + .atMost(5, TimeUnit.SECONDS) + .untilAsserted(() -> assertThat(tables()).isEmpty()); +``` + +Please refer to the [usage guide](https://github.com/awaitility/awaitility/wiki/Usage) of [Awaitility](https://github.com/awaitility/awaitility) for more usage examples. + + +### JUnit4 / JUnit5 + +Iceberg currently uses a mix of JUnit4 (`org.junit` imports) and JUnit5 (`org.junit.jupiter.api` imports) tests. To allow an easier migration to JUnit5 in the future, new test classes +that are being added to the codebase should be written purely in JUnit5 where possible. + + +## Running Benchmarks +Some PRs/changesets might require running benchmarks to determine whether they are affecting the baseline performance. Currently there is +no "push a single button to get a performance comparison" solution available, therefore one has to run JMH performance tests on their local machine and +post the results on the PR. + +See [Benchmarks](benchmarks.md) for a summary of available benchmarks and how to run them. + +## Website and Documentation Updates + +Currently, there is an [iceberg-docs](https://github.com/apache/iceberg-docs) repository +which contains the HTML/CSS and other files needed for the [Iceberg website](https://iceberg.apache.org/). +The [docs folder](https://github.com/apache/iceberg/tree/master/docs) in the Iceberg repository contains +the markdown content for the documentation site. All markdown changes should still be made +to this repository. + +### Submitting Pull Requests + +Changes to the markdown contents should be submitted directly to this repository. 
+ +Changes to the website appearance (e.g. HTML, CSS changes) should be submitted to the [iceberg-docs repository](https://github.com/apache/iceberg-docs) against the `main` branch. + +Changes to the documentation of old Iceberg versions should be submitted to the [iceberg-docs repository](https://github.com/apache/iceberg-docs) against the specific version branch. + +### Reporting Issues + +All issues related to the doc website should still be submitted to the [Iceberg repository](https://github.com/apache/iceberg). +The GitHub Issues feature of the [iceberg-docs repository](https://github.com/apache/iceberg-docs) is disabled. + +### Running Locally + +Clone the [iceberg-docs](https://github.com/apache/iceberg-docs) repository to run the website locally: +```shell +git clone git@github.com:apache/iceberg-docs.git +cd iceberg-docs +``` + +To start the landing page site locally, run: +```shell +cd landing-page && hugo serve +``` + +To start the documentation site locally, run: +```shell +cd docs && hugo serve +``` + +If you would like to see how the latest website looks based on the documentation in the Iceberg repository, you can copy docs to the iceberg-docs repository by: +```shell +rm -rf docs/content/docs +rm -rf landing-page/content/common +cp -r /docs/versioned docs/content/docs +cp -r /docs/common landing-page/content/common +``` diff --git a/docs-new/home/docs/latest/api.md b/docs-new/home/docs/latest/api.md new file mode 100644 index 000000000000..286f7bd2254d --- /dev/null +++ b/docs-new/home/docs/latest/api.md @@ -0,0 +1,256 @@ +--- +title: "Java API" +--- + + +# Iceberg Java API + +## Tables + +The main purpose of the Iceberg API is to manage table metadata, like schema, partition spec, metadata, and data files that store table data. + +Table metadata and operations are accessed through the `Table` interface. This interface will return table information. 
+ +### Table metadata + +The [`Table` interface](../../javadoc/{{ icebergVersion }}/index.html?org/apache/iceberg/Table.html) provides access to the table metadata: + +* `schema` returns the current table [schema](schemas.md) +* `spec` returns the current table partition spec +* `properties` returns a map of key-value [properties](configuration.md) +* `currentSnapshot` returns the current table snapshot +* `snapshots` returns all valid snapshots for the table +* `snapshot(id)` returns a specific snapshot by ID +* `location` returns the table's base location + +Tables also provide `refresh` to update the table to the latest version, and expose helpers: + +* `io` returns the `FileIO` used to read and write table files +* `locationProvider` returns a `LocationProvider` used to create paths for data and metadata files + + +### Scanning + +#### File level + +Iceberg table scans start by creating a `TableScan` object with `newScan`. + +```java +TableScan scan = table.newScan(); +``` + +To configure a scan, call `filter` and `select` on the `TableScan` to get a new `TableScan` with those changes. + +```java +TableScan filteredScan = scan.filter(Expressions.equal("id", 5)) +``` + +Calls to configuration methods create a new `TableScan` so that each `TableScan` is immutable and won't change unexpectedly if shared across threads. + +When a scan is configured, `planFiles`, `planTasks`, and `schema` are used to return files, tasks, and the read projection. + +```java +TableScan scan = table.newScan() + .filter(Expressions.equal("id", 5)) + .select("id", "data"); + +Schema projection = scan.schema(); +Iterable tasks = scan.planTasks(); +``` + +Use `asOfTime` or `useSnapshot` to configure the table snapshot for time travel queries. + +#### Row level + +Iceberg table scans start by creating a `ScanBuilder` object with `IcebergGenerics.read`. 
+ +```java +ScanBuilder scanBuilder = IcebergGenerics.read(table) +``` + +To configure a scan, call `where` and `select` on the `ScanBuilder` to get a new `ScanBuilder` with those changes. + +```java +scanBuilder.where(Expressions.equal("id", 5)) +``` + +When a scan is configured, call the `build` method to execute the scan. `build` returns a `CloseableIterable<Record>`: + +```java +CloseableIterable<Record> result = IcebergGenerics.read(table) + .where(Expressions.lessThan("id", 5)) + .build(); +``` +where `Record` is the Iceberg record class from the iceberg-data module, `org.apache.iceberg.data.Record`. + +### Update operations + +`Table` also exposes operations that update the table. These operations use a builder pattern, [`PendingUpdate`](../../javadoc/{{ icebergVersion }}/index.html?org/apache/iceberg/PendingUpdate.html), that commits when `PendingUpdate#commit` is called. + +For example, updating the table schema is done by calling `updateSchema`, adding updates to the builder, and finally calling `commit` to commit the pending changes to the table: + +```java +table.updateSchema() + .addColumn("count", Types.LongType.get()) + .commit(); +``` + +Available operations to update a table are: + +* `updateSchema` -- update the table schema +* `updateProperties` -- update table properties +* `updateLocation` -- update the table's base location +* `newAppend` -- used to append data files +* `newFastAppend` -- used to append data files, will not compact metadata +* `newOverwrite` -- used to append data files and remove files that are overwritten +* `newDelete` -- used to delete data files +* `newRewrite` -- used to rewrite data files; will replace existing files with new versions +* `newTransaction` -- create a new table-level transaction +* `rewriteManifests` -- rewrite manifest data by clustering files, for faster scan planning +* `rollback` -- rollback the table state to a specific snapshot + +### Transactions + +Transactions are used to commit multiple table changes in a single atomic operation. 
A transaction is used to create individual operations using factory methods, like `newAppend`, just like working with a `Table`. Operations created by a transaction are committed as a group when `commitTransaction` is called. + +For example, deleting and appending a file in the same transaction: +```java +Transaction t = table.newTransaction(); + +// commit operations to the transaction +t.newDelete().deleteFromRowFilter(filter).commit(); +t.newAppend().appendFile(data).commit(); + +// commit all the changes to the table +t.commitTransaction(); +``` + +## Types + +Iceberg data types are located in the [`org.apache.iceberg.types` package](../../javadoc/{{ icebergVersion }}/index.html?org/apache/iceberg/types/package-summary.html). + +### Primitives + +Primitive type instances are available from static methods in each type class. Types without parameters use `get`, and types like `decimal` use factory methods: + +```java +Types.IntegerType.get() // int +Types.DoubleType.get() // double +Types.DecimalType.of(9, 2) // decimal(9, 2) +``` + +### Nested types + +Structs, maps, and lists are created using factory methods in type classes. + +Like struct fields, map keys or values and list elements are tracked as nested fields. Nested fields track [field IDs](evolution.md#correctness) and nullability. + +Struct fields are created using `NestedField.optional` or `NestedField.required`. Map value and list element nullability is set in the map and list factory methods. 
+ +```java +// struct<1 id: int, 2 data: optional string> +StructType struct = StructType.of( + Types.NestedField.required(1, "id", Types.IntegerType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get()) + ) +``` +```java +// map<1 key: int, 2 value: optional string> +MapType map = MapType.ofOptional( + 1, 2, + Types.IntegerType.get(), + Types.StringType.get() + ) +``` +```java +// list<1 element: int> +ListType list = ListType.ofRequired(1, IntegerType.get()); +``` + + +## Expressions + +Iceberg's expressions are used to configure table scans. To create expressions, use the factory methods in [`Expressions`](../../javadoc/{{ icebergVersion }}/index.html?org/apache/iceberg/expressions/Expressions.html). + +Supported predicate expressions are: + +* `isNull` +* `notNull` +* `equal` +* `notEqual` +* `lessThan` +* `lessThanOrEqual` +* `greaterThan` +* `greaterThanOrEqual` +* `in` +* `notIn` +* `startsWith` +* `notStartsWith` + +Supported expression operations are: + +* `and` +* `or` +* `not` + +Constant expressions are: + +* `alwaysTrue` +* `alwaysFalse` + +### Expression binding + +When created, expressions are unbound. Before an expression is used, it will be bound to a data type to find the field ID the expression name represents, and to convert predicate literals. + +For example, before using the expression `lessThan("x", 10)`, Iceberg needs to determine which column `"x"` refers to and convert `10` to that column's data type. + +The same expression could be bound to the type `struct<1 x: long, 2 y: long>` or to `struct<11 x: int, 12 y: int>`; the resolved field ID and the literal's type differ depending on which type it is bound to. 
+ +### Expression example + +```java +table.newScan() + .filter(Expressions.greaterThanOrEqual("x", 5)) + .filter(Expressions.lessThan("x", 10)) +``` + + +## Modules + +Iceberg table support is organized in library modules: + +* `iceberg-common` contains utility classes used in other modules +* `iceberg-api` contains the public Iceberg API, including expressions, types, tables, and operations +* `iceberg-arrow` is an implementation of the Iceberg type system for reading and writing data stored in Iceberg tables using Apache Arrow as the in-memory data format +* `iceberg-aws` contains implementations of the Iceberg API to be used with tables stored on AWS S3 and/or for tables defined using the AWS Glue data catalog +* `iceberg-core` contains implementations of the Iceberg API and support for Avro data files, **this is what processing engines should depend on** +* `iceberg-parquet` is an optional module for working with tables backed by Parquet files +* `iceberg-orc` is an optional module for working with tables backed by ORC files (*experimental*) +* `iceberg-hive-metastore` is an implementation of Iceberg tables backed by the Hive metastore Thrift client + +Iceberg also has modules for adding Iceberg support to processing engines and associated tooling: + +* `iceberg-spark` is an implementation of Spark's Datasource V2 API for Iceberg with submodules for each Spark version (use runtime jars for a shaded version) +* `iceberg-flink` is an implementation of Flink's Table and DataStream API for Iceberg (use iceberg-flink-runtime for a shaded version) +* `iceberg-hive3` is an implementation of Hive 3 specific SerDe's for Timestamp, TimestampWithZone, and Date object inspectors (use iceberg-hive-runtime for a shaded version). 
+* `iceberg-mr` is an implementation of MapReduce and Hive InputFormats and SerDes for Iceberg (use iceberg-hive-runtime for a shaded version for use with Hive) +* `iceberg-nessie` is a module used to integrate Iceberg table metadata history and operations with [Project Nessie](https://projectnessie.org/) +* `iceberg-data` is a client library used to read Iceberg tables from JVM applications +* `iceberg-pig` is an implementation of Pig's LoadFunc API for Iceberg +* `iceberg-runtime` generates a shaded runtime jar for Spark to integrate with iceberg tables + diff --git a/docs-new/home/docs/latest/assets/images/audit-branch.png b/docs-new/home/docs/latest/assets/images/audit-branch.png new file mode 100644 index 000000000000..3d6506a513ca Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/audit-branch.png differ diff --git a/docs-new/home/docs/latest/assets/images/historical-snapshot-tag.png b/docs-new/home/docs/latest/assets/images/historical-snapshot-tag.png new file mode 100644 index 000000000000..6a6be3d53526 Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/historical-snapshot-tag.png differ diff --git a/docs-new/home/docs/latest/assets/images/iceberg-in-place-metadata-migration.png b/docs-new/home/docs/latest/assets/images/iceberg-in-place-metadata-migration.png new file mode 100644 index 000000000000..1ede320f2c23 Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/iceberg-in-place-metadata-migration.png differ diff --git a/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step1.png b/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step1.png new file mode 100644 index 000000000000..aae5166fdc9c Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step1.png differ diff --git a/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step2.png b/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step2.png new file mode 100644 
index 000000000000..13bb2444b67b Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step2.png differ diff --git a/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step3.png b/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step3.png new file mode 100644 index 000000000000..0175101a16ac Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/iceberg-migrateaction-step3.png differ diff --git a/docs-new/home/docs/latest/assets/images/iceberg-snapshotaction-step1.png b/docs-new/home/docs/latest/assets/images/iceberg-snapshotaction-step1.png new file mode 100644 index 000000000000..f66a3b284f14 Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/iceberg-snapshotaction-step1.png differ diff --git a/docs-new/home/docs/latest/assets/images/iceberg-snapshotaction-step2.png b/docs-new/home/docs/latest/assets/images/iceberg-snapshotaction-step2.png new file mode 100644 index 000000000000..5e255fffec5f Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/iceberg-snapshotaction-step2.png differ diff --git a/docs-new/home/docs/latest/assets/images/partition-spec-evolution.png b/docs-new/home/docs/latest/assets/images/partition-spec-evolution.png new file mode 100644 index 000000000000..0bc595f686e1 Binary files /dev/null and b/docs-new/home/docs/latest/assets/images/partition-spec-evolution.png differ diff --git a/docs-new/home/docs/latest/aws.md b/docs-new/home/docs/latest/aws.md new file mode 100644 index 000000000000..5a038c78fd49 --- /dev/null +++ b/docs-new/home/docs/latest/aws.md @@ -0,0 +1,686 @@ +--- +title: "AWS" +--- + + +# Iceberg AWS Integrations + +Iceberg provides integration with different AWS services through the `iceberg-aws` module. +This section describes how to use Iceberg with AWS. + +## Enabling AWS Integration + +The `iceberg-aws` module is bundled with Spark and Flink engine runtimes for all versions from `0.11.0` onwards. 
+However, the AWS clients are not bundled so that you can use the same client version as your application. +You will need to provide the AWS v2 SDK because that is what Iceberg depends on. +You can choose to use the [AWS SDK bundle](https://mvnrepository.com/artifact/software.amazon.awssdk/bundle), +or individual AWS client packages (Glue, S3, DynamoDB, KMS, STS) if you would like to have a minimal dependency footprint. + +All the default AWS clients use the [URL Connection HTTP Client](https://mvnrepository.com/artifact/software.amazon.awssdk/url-connection-client) +for HTTP connection management. +This dependency is not part of the AWS SDK bundle and needs to be added separately. +To choose a different HTTP client library such as [Apache HTTP Client](https://mvnrepository.com/artifact/software.amazon.awssdk/apache-client), +see the section [client customization](#aws-client-customization) for more details. + +All the AWS module features can be loaded through custom catalog properties; +see the documentation of each engine for how to load a custom catalog. +Here are some examples. 
+ +### Spark + +For example, to use AWS features with Spark 3.3 (with Scala 2.12) and AWS clients version 2.20.18, you can start the Spark SQL shell with: + +```sh +# add Iceberg dependency +ICEBERG_VERSION={{ icebergVersion }} +DEPENDENCIES="org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:$ICEBERG_VERSION" + +# add AWS dependency +AWS_SDK_VERSION=2.20.18 +AWS_MAVEN_GROUP=software.amazon.awssdk +AWS_PACKAGES=( + "bundle" +) +for pkg in "${AWS_PACKAGES[@]}"; do + DEPENDENCIES+=",$AWS_MAVEN_GROUP:$pkg:$AWS_SDK_VERSION" +done + +# start Spark SQL client shell +spark-sql --packages $DEPENDENCIES \ + --conf spark.sql.defaultCatalog=my_catalog \ + --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket/my/key/prefix \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \ + --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO +``` + +As you can see, in the shell command, we use `--packages` to specify the additional AWS bundle and HTTP client dependencies with their version as `2.20.18`. 
+ +### Flink + +To use AWS module with Flink, you can download the necessary dependencies and specify them when starting the Flink SQL client: + +```sh +# download Iceberg dependency +ICEBERG_VERSION={{ icebergVersion }} +MAVEN_URL=https://repo1.maven.org/maven2 +ICEBERG_MAVEN_URL=$MAVEN_URL/org/apache/iceberg +wget $ICEBERG_MAVEN_URL/iceberg-flink-runtime/$ICEBERG_VERSION/iceberg-flink-runtime-$ICEBERG_VERSION.jar + +# download AWS dependency +AWS_SDK_VERSION=2.20.18 +AWS_MAVEN_URL=$MAVEN_URL/software/amazon/awssdk +AWS_PACKAGES=( + "bundle" +) +for pkg in "${AWS_PACKAGES[@]}"; do + wget $AWS_MAVEN_URL/$pkg/$AWS_SDK_VERSION/$pkg-$AWS_SDK_VERSION.jar +done + +# start Flink SQL client shell +/path/to/bin/sql-client.sh embedded \ + -j iceberg-flink-runtime-$ICEBERG_VERSION.jar \ + -j bundle-$AWS_SDK_VERSION.jar \ + shell +``` + +With those dependencies, you can create a Flink catalog like the following: + +```sql +CREATE CATALOG my_catalog WITH ( + 'type'='iceberg', + 'warehouse'='s3://my-bucket/my/key/prefix', + 'catalog-impl'='org.apache.iceberg.aws.glue.GlueCatalog', + 'io-impl'='org.apache.iceberg.aws.s3.S3FileIO' +); +``` + +You can also specify the catalog configurations in `sql-client-defaults.yaml` to preload it: + +```yaml +catalogs: + - name: my_catalog + type: iceberg + warehouse: s3://my-bucket/my/key/prefix + catalog-impl: org.apache.iceberg.aws.glue.GlueCatalog + io-impl: org.apache.iceberg.aws.s3.S3FileIO +``` + +### Hive + +To use AWS module with Hive, you can download the necessary dependencies similar to the Flink example, +and then add them to the Hive classpath or add the jars at runtime in CLI: + +``` +add jar /my/path/to/iceberg-hive-runtime.jar; +add jar /my/path/to/aws/bundle.jar; +``` + +With those dependencies, you can register a Glue catalog and create external tables in Hive at runtime in CLI by: + +```sql +SET iceberg.engine.hive.enabled=true; +SET hive.vectorized.execution.enabled=false; +SET 
iceberg.catalog.glue.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog; +SET iceberg.catalog.glue.warehouse=s3://my-bucket/my/key/prefix; + +-- suppose you have an Iceberg table database_a.table_a created by GlueCatalog +CREATE EXTERNAL TABLE database_a.table_a +STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' +TBLPROPERTIES ('iceberg.catalog'='glue'); +``` + +You can also preload the catalog by setting the configurations above in `hive-site.xml`. + +## Catalogs + +There are multiple different options that users can choose to build an Iceberg catalog with AWS. + +### Glue Catalog + +Iceberg enables the use of [AWS Glue](https://aws.amazon.com/glue) as the `Catalog` implementation. +When used, an Iceberg namespace is stored as a [Glue Database](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-databases.html), +an Iceberg table is stored as a [Glue Table](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-tables.html), +and every Iceberg table version is stored as a [Glue TableVersion](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-api-catalog-tables.html#aws-glue-api-catalog-tables-TableVersion). +You can start using Glue catalog by specifying the `catalog-impl` as `org.apache.iceberg.aws.glue.GlueCatalog`, +just like what is shown in the [enabling AWS integration](#enabling-aws-integration) section above. +More details about loading the catalog can be found in individual engine pages, such as [Spark](spark-configuration.md#loading-a-custom-catalog) and [Flink](flink.md#adding-catalogs). + +#### Glue Catalog ID + +There is a unique Glue metastore in each AWS account and each AWS region. +By default, `GlueCatalog` chooses the Glue metastore to use based on the user's default AWS client credential and region setup. +You can specify the Glue catalog ID through `glue.id` catalog property to point to a Glue catalog in a different AWS account. +The Glue catalog ID is your numeric AWS account ID. 
+If the Glue catalog is in a different region, you should configure your AWS client to point to the correct region, +see more details in [AWS client customization](#aws-client-customization). + +#### Skip Archive + +AWS Glue has the ability to archive older table versions and a user can roll back the table to any historical version if needed. +By default, the Iceberg Glue Catalog will skip the archival of older table versions. +If a user wishes to archive older table versions, they can set `glue.skip-archive` to false. +Do note for streaming ingestion into Iceberg tables, setting `glue.skip-archive` to false will quickly create a lot of Glue table versions. +For more details, please read [Glue Quotas](https://docs.aws.amazon.com/general/latest/gr/glue.html) and the [UpdateTable API](https://docs.aws.amazon.com/glue/latest/webapi/API_UpdateTable.html). + +#### Skip Name Validation + +Allow user to skip name validation for table name and namespaces. +It is recommended to stick to Glue best practice in +https://docs.aws.amazon.com/athena/latest/ug/glue-best-practices.html to make sure operations are Hive compatible. +This is only added for users that have existing conventions using non-standard characters. When database name +and table name validation are skipped, there is no guarantee that downstream systems would all support the names. + +#### Optimistic Locking + +By default, Iceberg uses Glue's optimistic locking for concurrent updates to a table. +With optimistic locking, each table has a version id. +If users retrieve the table metadata, Iceberg records the version id of that table. +Users can update the table as long as the version ID on the server side remains unchanged. +Version mismatch occurs if someone else modified the table before you did, causing an update failure. +Iceberg then refreshes metadata and checks if there is a conflict. +If there is no commit conflict, the operation will be retried. 
+Optimistic locking guarantees atomic transactions of Iceberg tables in Glue. +It also prevents others from accidentally overwriting your changes. + +!!! info + Please use AWS SDK version >= 2.17.131 to leverage Glue's Optimistic Locking. + If the AWS SDK version is below 2.17.131, only an in-memory lock is used. To ensure atomic transactions, you need to set up a [DynamoDb Lock Manager](#dynamodb-lock-manager). + + +#### Warehouse Location + +Similar to all other catalog implementations, `warehouse` is a required catalog property to determine the root path of the data warehouse in storage. +By default, Glue only allows a warehouse location in S3 because of the use of `S3FileIO`. +To store data in a different local or cloud store, Glue catalog can switch to use `HadoopFileIO` or any custom FileIO by setting the `io-impl` catalog property. +Details about this feature can be found in the [custom FileIO](custom-catalog.md#custom-file-io-implementation) section. + +#### Table Location + +By default, the root location for a table `my_table` of namespace `my_ns` is at `my-warehouse-location/my_ns.db/my_table`. +This default root location can be changed at both namespace and table level. + +To use a different path prefix for all tables under a namespace, use AWS console or any AWS Glue client SDK you like to update the `locationUri` attribute of the corresponding Glue database. +For example, you can update the `locationUri` of `my_ns` to `s3://my-ns-bucket`, +then any newly created table will have a default root location under the new prefix. +For instance, a new table `my_table_2` will have its root location at `s3://my-ns-bucket/my_table_2`. + +To use a completely different root path for a specific table, set the `location` table property to the desired root path. 
+For example, in Spark SQL you can do: + +```sql +CREATE TABLE my_catalog.my_ns.my_table ( + id bigint, + data string, + category string) +USING iceberg +OPTIONS ('location'='s3://my-special-table-bucket') +PARTITIONED BY (category); +``` + +For engines like Spark that support the `LOCATION` keyword, the above SQL statement is equivalent to: + +```sql +CREATE TABLE my_catalog.my_ns.my_table ( + id bigint, + data string, + category string) +USING iceberg +LOCATION 's3://my-special-table-bucket' +PARTITIONED BY (category); +``` + +### DynamoDB Catalog + +Iceberg supports using a [DynamoDB](https://aws.amazon.com/dynamodb) table to record and manage database and table information. + +#### Configurations + +The DynamoDB catalog supports the following configurations: + +| Property | Default | Description | +| --------------------------------- | -------------------------------------------------- | ------------------------------------------------------ | +| dynamodb.table-name | iceberg | name of the DynamoDB table used by DynamoDbCatalog | + + +#### Internal Table Design + +The DynamoDB table is designed with the following columns: + +| Column | Key | Type | Description | +| ----------------- | --------------- | ----------- |--------------------------------------------------------------------- | +| identifier | partition key | string | table identifier such as `db1.table1`, or string `NAMESPACE` for namespaces | +| namespace | sort key | string | namespace name. A [global secondary index (GSI)](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GSI.html) is created with namespace as partition key, identifier as sort key, no other projected columns | +| v | | string | row version, used for optimistic locking | +| updated_at | | number | timestamp (milliseconds) of the last update | +| created_at | | number | timestamp (milliseconds) of the table creation | +| `p.<property_key>` | | string | Iceberg-defined table properties including `table_type`, `metadata_location` and `previous_metadata_location` or namespace properties | + +This design has the following benefits: + +1. it avoids a potential [hot partition issue](https://aws.amazon.com/premiumsupport/knowledge-center/dynamodb-table-throttled/) if there is heavy write traffic to the tables within the same namespace because the partition key is at the table level +2. namespace operations are clustered in a single partition to avoid affecting table commit operations +3. a sort key to partition key reverse GSI is used for list table operation, and all other operations are single row ops or single partition query. No full table scan is needed for any operation in the catalog. +4. a string UUID version field `v` is used instead of `updated_at` to avoid 2 processes committing at the same millisecond +5. multi-row transaction is used for `catalog.renameTable` to ensure idempotency +6. properties are flattened as top level columns so that users can add a custom GSI on any property field to customize the catalog. For example, users can store owner information as table property `owner`, and search tables by owner by adding a GSI on the `p.owner` column. + +### RDS JDBC Catalog + +Iceberg also supports the JDBC catalog which uses a table in a relational database to manage Iceberg tables. +You can configure the JDBC catalog to work with relational database services like [AWS RDS](https://aws.amazon.com/rds). +Read [the JDBC integration page](jdbc.md#jdbc-catalog) for guides and examples about using the JDBC catalog. +Read [this AWS documentation](https://docs.aws.amazon.com/AmazonRDS/latest/UserGuide/UsingWithRDS.IAMDBAuth.Connecting.Java.html) for more details about configuring the JDBC catalog with IAM authentication. + +### Which catalog to choose? + +With all the available options, we offer the following guidelines when choosing the right catalog to use for your application: + +1. 
if your organization has an existing Glue metastore or plans to use the AWS analytics ecosystem including Glue, [Athena](https://aws.amazon.com/athena), [EMR](https://aws.amazon.com/emr), [Redshift](https://aws.amazon.com/redshift) and [LakeFormation](https://aws.amazon.com/lake-formation), Glue catalog provides the easiest integration. +2. if your application requires frequent updates to tables or high read and write throughput (e.g. streaming write), the Glue and DynamoDB catalogs provide the best performance through optimistic locking. +3. if you would like to enforce access control for tables in a catalog, Glue tables can be managed as an [IAM resource](https://docs.aws.amazon.com/service-authorization/latest/reference/list_awsglue.html), whereas DynamoDB catalog tables can only be managed through [item-level permission](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/specifying-conditions.html), which is much more complicated. +4. if you would like to query tables based on table property information without the need to scan the entire catalog, DynamoDB catalog allows you to build secondary indexes for any arbitrary property field and provides efficient query performance. +5. if you would like to have the benefit of DynamoDB catalog while also connecting to Glue, you can enable [DynamoDB stream with Lambda trigger](https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/Streams.Lambda.Tutorial.html) to asynchronously update your Glue metastore with table information in the DynamoDB catalog. +6. if your organization already maintains an existing relational database in RDS or uses [serverless Aurora](https://aws.amazon.com/rds/aurora/serverless/) to manage tables, the JDBC catalog provides the easiest integration. 
+ +## DynamoDb Lock Manager + +[Amazon DynamoDB](https://aws.amazon.com/dynamodb) can be used by `HadoopCatalog` or `HadoopTables` so that for every commit, +the catalog first obtains a lock using a helper DynamoDB table and then tries to safely modify the Iceberg table. +This is necessary for a file system-based catalog to ensure atomic transactions in storage systems like S3 that do not provide file write mutual exclusion. + +This feature requires the following lock-related catalog properties: + +1. Set `lock-impl` as `org.apache.iceberg.aws.dynamodb.DynamoDbLockManager`. +2. Set `lock.table` as the DynamoDB table name you would like to use. If the lock table with the given name does not exist in DynamoDB, a new table is created with billing mode set as [pay-per-request](https://aws.amazon.com/blogs/aws/amazon-dynamodb-on-demand-no-capacity-planning-and-pay-per-request-pricing). + +Other lock-related catalog properties can also be used to adjust locking behaviors such as heartbeat interval. +For more details, please refer to [Lock catalog properties](configuration.md#lock-catalog-properties). + + +## S3 FileIO + +Iceberg allows users to write data to S3 through `S3FileIO`. +`GlueCatalog` by default uses this `FileIO`, and other catalogs can load this `FileIO` using the `io-impl` catalog property. + +### Progressive Multipart Upload + +`S3FileIO` implements a customized progressive multipart upload algorithm to upload data. +Data files are uploaded by parts in parallel as soon as each part is ready, +and each file part is deleted as soon as its upload process completes. +This provides maximized upload speed and minimized local disk usage during uploads. 
+Here are the configurations that users can tune related to this feature: + +| Property | Default | Description | +| --------------------------------- | -------------------------------------------------- | ------------------------------------------------------ | +| s3.multipart.num-threads | the available number of processors in the system | number of threads to use for uploading parts to S3 (shared across all output streams) | +| s3.multipart.part-size-bytes | 32MB | the size of a single part for multipart upload requests | +| s3.multipart.threshold | 1.5 | the threshold expressed as a factor times the multipart size at which to switch from uploading using a single put object request to uploading using multipart upload | +| s3.staging-dir | `java.io.tmpdir` property value | the directory to hold temporary files | + +### S3 Server Side Encryption + +`S3FileIO` supports all 3 S3 server side encryption modes: + +* [SSE-S3](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingServerSideEncryption.html): When you use Server-Side Encryption with Amazon S3-Managed Keys (SSE-S3), each object is encrypted with a unique key. As an additional safeguard, it encrypts the key itself with a master key that it regularly rotates. Amazon S3 server-side encryption uses one of the strongest block ciphers available, 256-bit Advanced Encryption Standard (AES-256), to encrypt your data. +* [SSE-KMS](https://docs.aws.amazon.com/AmazonS3/latest/dev/UsingKMSEncryption.html): Server-Side Encryption with Customer Master Keys (CMKs) Stored in AWS Key Management Service (SSE-KMS) is similar to SSE-S3, but with some additional benefits and charges for using this service. There are separate permissions for the use of a CMK that provides added protection against unauthorized access of your objects in Amazon S3. SSE-KMS also provides you with an audit trail that shows when your CMK was used and by whom. 
Additionally, you can create and manage customer managed CMKs or use AWS managed CMKs that are unique to you, your service, and your Region. +* [SSE-C](https://docs.aws.amazon.com/AmazonS3/latest/dev/ServerSideEncryptionCustomerKeys.html): With Server-Side Encryption with Customer-Provided Keys (SSE-C), you manage the encryption keys and Amazon S3 manages the encryption, as it writes to disks, and decryption when you access your objects. + +To enable server side encryption, use the following configuration properties: + +| Property | Default | Description | +| --------------------------------- | ---------------------------------------- | ------------------------------------------------------ | +| s3.sse.type | `none` | `none`, `s3`, `kms` or `custom` | +| s3.sse.key | `aws/s3` for `kms` type, null otherwise | A KMS Key ID or ARN for `kms` type, or a custom base-64 AES256 symmetric key for `custom` type. | +| s3.sse.md5 | null | If SSE type is `custom`, this value must be set as the base-64 MD5 digest of the symmetric key to ensure integrity. | + +### S3 Access Control List + +`S3FileIO` supports S3 access control list (ACL) for detailed access control. +User can choose the ACL level by setting the `s3.acl` property. +For more details, please read [S3 ACL Documentation](https://docs.aws.amazon.com/AmazonS3/latest/dev/acl-overview.html). + +### Object Store File Layout + +S3 and many other cloud storage services [throttle requests based on object prefix](https://aws.amazon.com/premiumsupport/knowledge-center/s3-request-limit-avoid-throttling/). +Data stored in S3 with a traditional Hive storage layout can face S3 request throttling as objects are stored under the same file path prefix. + +Iceberg by default uses the Hive storage layout but can be switched to use the `ObjectStoreLocationProvider`. +With `ObjectStoreLocationProvider`, a deterministic hash is generated for each stored file, with the hash appended +directly after the `write.data.path`. 
This ensures files written to S3 are equally distributed across multiple [prefixes](https://aws.amazon.com/premiumsupport/knowledge-center/s3-object-key-naming-pattern/) in the S3 bucket, resulting in minimized throttling and maximized throughput for S3-related IO operations. When using `ObjectStoreLocationProvider`, having a shared and short `write.data.path` across your Iceberg tables will improve performance. + +For more information on how S3 scales API QPS, check out the 2018 re:Invent session on [Best Practices for Amazon S3 and Amazon S3 Glacier](https://youtu.be/rHeTn9pHNKo?t=3219). At [53:39](https://youtu.be/rHeTn9pHNKo?t=3219) it covers how S3 scales/partitions and at [54:50](https://youtu.be/rHeTn9pHNKo?t=3290) it discusses the 30-60 minute wait time before new partitions are created. + +To use the `ObjectStoreLocationProvider`, add `'write.object-storage.enabled'=true` in the table's properties. +Below is an example Spark SQL command to create a table using the `ObjectStoreLocationProvider`: +```sql +CREATE TABLE my_catalog.my_ns.my_table ( + id bigint, + data string, + category string) +USING iceberg +OPTIONS ( + 'write.object-storage.enabled'=true, + 'write.data.path'='s3://my-table-data-bucket') +PARTITIONED BY (category); +``` + +We can then insert a single row into this new table: +```SQL +INSERT INTO my_catalog.my_ns.my_table VALUES (1, "Pizza", "orders"); +``` + +This will write the data to S3 with a hash (`2d3905f8`) appended directly after the `write.data.path`, ensuring reads to the table are spread evenly across [S3 bucket prefixes](https://docs.aws.amazon.com/AmazonS3/latest/userguide/optimizing-performance.html), and improving performance. +``` +s3://my-table-data-bucket/2d3905f8/my_ns.db/my_table/category=orders/00000-0-5affc076-96a4-48f2-9cd2-d5efbc9f0c94-00001.parquet +``` + +Note, the path resolution logic for `ObjectStoreLocationProvider` is `write.data.path` then `<tableLocation>/data`. 
+However, for the older versions up to 0.12.0, the logic is as follows: +- before 0.12.0, `write.object-storage.path` must be set. +- at 0.12.0, `write.object-storage.path` then `write.folder-storage.path` then `<tableLocation>/data`. + +For more details, please refer to the [LocationProvider Configuration](custom-catalog.md#custom-location-provider-implementation) section. + +### S3 Strong Consistency + +In November 2020, S3 announced [strong consistency](https://aws.amazon.com/s3/consistency/) for all read operations, and Iceberg is updated to fully leverage this feature. +There is no redundant consistency wait and check which might negatively impact performance during IO operations. + +### Hadoop S3A FileSystem + +Before `S3FileIO` was introduced, many Iceberg users chose to use `HadoopFileIO` to write data to S3 through the [S3A FileSystem](https://github.com/apache/hadoop/blob/trunk/hadoop-tools/hadoop-aws/src/main/java/org/apache/hadoop/fs/s3a/S3AFileSystem.java). +As introduced in the previous sections, `S3FileIO` adopts the latest AWS clients and S3 features for optimized security and performance + and is thus recommended for S3 use cases rather than the S3A FileSystem. + +`S3FileIO` writes data with the `s3://` URI scheme, but it is also compatible with schemes written by the S3A FileSystem. +This means for any table manifests containing `s3a://` or `s3n://` file paths, `S3FileIO` is still able to read them. +This feature allows people to easily switch from S3A to `S3FileIO`. + +If for any reason you have to use S3A, here are the instructions: + +1. To store data using S3A, specify the `warehouse` catalog property to be an S3A path, e.g. `s3a://my-bucket/my-warehouse` +2. For `HiveCatalog`, to also store metadata using S3A, specify the Hadoop configuration property `hive.metastore.warehouse.dir` to be an S3A path. +3. Add [hadoop-aws](https://mvnrepository.com/artifact/org.apache.hadoop/hadoop-aws) as a runtime dependency of your compute engine. +4. 
Configure AWS settings based on [hadoop-aws documentation](https://hadoop.apache.org/docs/current/hadoop-aws/tools/hadoop-aws/index.html) (make sure you check the version; S3A configuration varies a lot based on the version you use). + +### S3 Write Checksum Verification + +To ensure integrity of uploaded objects, checksum validations for S3 writes can be turned on by setting catalog property `s3.checksum-enabled` to `true`. +This is turned off by default. + +### S3 Tags + +Custom [tags](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-tagging.html) can be added to S3 objects while writing and deleting. +For example, to write S3 tags with Spark 3.3, you can start the Spark SQL shell with: +``` +spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket/my/key/prefix \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \ + --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO \ + --conf spark.sql.catalog.my_catalog.s3.write.tags.my_key1=my_val1 \ + --conf spark.sql.catalog.my_catalog.s3.write.tags.my_key2=my_val2 +``` +For the above example, the objects in S3 will be saved with tags: `my_key1=my_val1` and `my_key2=my_val2`. Do note that the specified write tags will be saved only during object creation. + +When the catalog property `s3.delete-enabled` is set to `false`, the objects are not hard-deleted from S3. +This is expected to be used in combination with S3 delete tagging, so objects are tagged and removed using [S3 lifecycle policy](https://docs.aws.amazon.com/AmazonS3/latest/userguide/object-lifecycle-mgmt.html). +The property is set to `true` by default. + +With the `s3.delete.tags` configuration, objects are tagged with the configured key-value pairs before deletion. +Users can configure tag-based object lifecycle policy at bucket level to transition objects to different tiers. 
+For example, to add S3 delete tags with Spark 3.3, you can start the Spark SQL shell with: + +``` +sh spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://iceberg-warehouse/s3-tagging \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \ + --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO \ + --conf spark.sql.catalog.my_catalog.s3.delete.tags.my_key3=my_val3 \ + --conf spark.sql.catalog.my_catalog.s3.delete-enabled=false +``` + +For the above example, the objects in S3 will be saved with tags: `my_key3=my_val3` before deletion. +Users can also use the catalog property `s3.delete.num-threads` to specify the number of threads to be used for adding delete tags to the S3 objects. + +When the catalog properties `s3.write.table-tag-enabled` and `s3.write.namespace-tag-enabled` are set to `true`, the objects in S3 will be saved with tags: `iceberg.table=<table-name>` and `iceberg.namespace=<namespace-name>`. +Users can define access and data retention policy per namespace or table based on these tags. +For example, to write table and namespace name as S3 tags with Spark 3.3, you can start the Spark SQL shell with: +``` +sh spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://iceberg-warehouse/s3-tagging \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \ + --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO \ + --conf spark.sql.catalog.my_catalog.s3.write.table-tag-enabled=true \ + --conf spark.sql.catalog.my_catalog.s3.write.namespace-tag-enabled=true +``` +For more details on tag restrictions, please refer to [User-Defined Tag Restrictions](https://docs.aws.amazon.com/awsaccountbilling/latest/aboutv2/allocation-tag-restrictions.html). 
+ +### S3 Access Points + +[Access Points](https://docs.aws.amazon.com/AmazonS3/latest/userguide/using-access-points.html) can be used to perform +S3 operations by specifying a mapping of bucket to access points. This is useful for multi-region access, cross-region access, +disaster recovery, etc. + +For using cross-region access points, we need to additionally set the `s3.use-arn-region-enabled` catalog property to +`true` to enable `S3FileIO` to make cross-region calls; it's not required for same / multi-region access points. + +For example, to use S3 access-point with Spark 3.3, you can start the Spark SQL shell with: +``` +spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket2/my/key/prefix \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \ + --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO \ + --conf spark.sql.catalog.my_catalog.s3.use-arn-region-enabled=false \ + --conf spark.sql.catalog.my_catalog.s3.access-points.my-bucket1=arn:aws:s3::123456789012:accesspoint:mfzwi23gnjvgw.mrap \ + --conf spark.sql.catalog.my_catalog.s3.access-points.my-bucket2=arn:aws:s3::123456789012:accesspoint:mfzwi23gnjvgw.mrap +``` +For the above example, the objects in S3 on `my-bucket1` and `my-bucket2` buckets will use `arn:aws:s3::123456789012:accesspoint:mfzwi23gnjvgw.mrap` +access-point for all S3 operations. + +For more details on using access-points, please refer to [Using access points with compatible Amazon S3 operations](https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-points-usage-examples.html). + +### S3 Acceleration + +[S3 Acceleration](https://aws.amazon.com/s3/transfer-acceleration/) can be used to speed up transfers to and from Amazon S3 by as much as 50-500% for long-distance transfer of larger objects. 
+ +To use S3 Acceleration, we need to set `s3.acceleration-enabled` catalog property to `true` to enable `S3FileIO` to make accelerated S3 calls. + +For example, to use S3 Acceleration with Spark 3.3, you can start the Spark SQL shell with: +``` +spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket2/my/key/prefix \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \ + --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO \ + --conf spark.sql.catalog.my_catalog.s3.acceleration-enabled=true +``` + +For more details on using S3 Acceleration, please refer to [Configuring fast, secure file transfers using Amazon S3 Transfer Acceleration](https://docs.aws.amazon.com/AmazonS3/latest/userguide/transfer-acceleration.html). + +### S3 Dual-stack + +[S3 Dual-stack](https://docs.aws.amazon.com/AmazonS3/latest/userguide/dual-stack-endpoints.html) allows a client to access an S3 bucket through a dual-stack endpoint. +When clients request a dual-stack endpoint, the bucket URL resolves to an IPv6 address if possible; otherwise it falls back to IPv4. + +To use S3 Dual-stack, we need to set `s3.dualstack-enabled` catalog property to `true` to enable `S3FileIO` to make dual-stack S3 calls. 
+ +For example, to use S3 Dual-stack with Spark 3.3, you can start the Spark SQL shell with: +``` +spark-sql --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket2/my/key/prefix \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \ + --conf spark.sql.catalog.my_catalog.io-impl=org.apache.iceberg.aws.s3.S3FileIO \ + --conf spark.sql.catalog.my_catalog.s3.dualstack-enabled=true +``` + +For more details on using S3 Dual-stack, please refer to [Using dual-stack endpoints from the AWS CLI and the AWS SDKs](https://docs.aws.amazon.com/AmazonS3/latest/userguide/dual-stack-endpoints.html#dual-stack-endpoints-cli). + +## AWS Client Customization + +Many organizations have customized their way of configuring AWS clients with their own credential provider, access proxy, retry strategy, etc. +Iceberg allows users to plug in their own implementation of `org.apache.iceberg.aws.AwsClientFactory` by setting the `client.factory` catalog property. + +### Cross-Account and Cross-Region Access + +It is a common use case for organizations to have a centralized AWS account for Glue metastore and S3 buckets, and use different AWS accounts and regions for different teams to access those resources. +In this case, a [cross-account IAM role](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_use.html) is needed to access those centralized resources. +Iceberg provides an AWS client factory `AssumeRoleAwsClientFactory` to support this common use case. +This also serves as an example for users who would like to implement their own AWS client factory. 
+ +This client factory has the following configurable catalog properties: + +| Property | Default | Description | +| --------------------------------- | ---------------------------------------- | ------------------------------------------------------ | +| client.assume-role.arn | null, requires user input | ARN of the role to assume, e.g. arn:aws:iam::123456789:role/myRoleToAssume | +| client.assume-role.region | null, requires user input | All AWS clients except the STS client will use the given region instead of the default region chain | +| client.assume-role.external-id | null | An optional [external ID](https://docs.aws.amazon.com/IAM/latest/UserGuide/id_roles_create_for-user_externalid.html) | +| client.assume-role.timeout-sec | 1 hour | Timeout of each assume role session. At the end of the timeout, a new set of role session credentials will be fetched through an STS client. | + +By using this client factory, an STS client is initialized with the default credential and region to assume the specified role. +The Glue, S3 and DynamoDB clients are then initialized with the assume-role credential and region to access resources. 
+Here is an example to start Spark shell with this client factory: + +```shell +spark-sql --packages org.apache.iceberg:iceberg-spark-runtime:{{ icebergVersion }},software.amazon.awssdk:bundle:2.20.18 \ + --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket/my/key/prefix \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog \ + --conf spark.sql.catalog.my_catalog.client.factory=org.apache.iceberg.aws.AssumeRoleAwsClientFactory \ + --conf spark.sql.catalog.my_catalog.client.assume-role.arn=arn:aws:iam::123456789:role/myRoleToAssume \ + --conf spark.sql.catalog.my_catalog.client.assume-role.region=ap-northeast-1 +``` + +### HTTP Client Configurations +AWS clients support two types of HTTP Client, [URL Connection HTTP Client](https://mvnrepository.com/artifact/software.amazon.awssdk/url-connection-client) +and [Apache HTTP Client](https://mvnrepository.com/artifact/software.amazon.awssdk/apache-client). +By default, AWS clients use **URL Connection** HTTP Client to communicate with the service. +This HTTP client optimizes for minimum dependencies and startup latency but supports less functionality than other implementations. +In contrast, Apache HTTP Client supports more functionalities and more customized settings, such as expect-continue handshake and TCP KeepAlive, at the cost of extra dependency and additional startup latency. + +For more details of configuration, see sections [URL Connection HTTP Client Configurations](#url-connection-http-client-configurations) and [Apache HTTP Client Configurations](#apache-http-client-configurations). + +Configure the following property to set the type of HTTP client: + +| Property | Default | Description | +|------------------|---------|------------------------------------------------------------------------------------------------------------| +| http-client.type | apache | Types of HTTP Client.<br/>`urlconnection`: URL Connection HTTP Client<br/>`apache`: Apache HTTP Client | + +#### URL Connection HTTP Client Configurations + +URL Connection HTTP Client has the following configurable properties: + +| Property | Default | Description | +|-------------------------------------------------|---------|------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| http-client.urlconnection.socket-timeout-ms | null | An optional [socket timeout](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/urlconnection/UrlConnectionHttpClient.Builder.html#socketTimeout(java.time.Duration)) in milliseconds | +| http-client.urlconnection.connection-timeout-ms | null | An optional [connection timeout](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/urlconnection/UrlConnectionHttpClient.Builder.html#connectionTimeout(java.time.Duration)) in milliseconds | + +Users can use catalog properties to override the defaults. 
For example, to configure the socket timeout for URL Connection HTTP Client when starting a spark shell, one can add: +```shell +--conf spark.sql.catalog.my_catalog.http-client.urlconnection.socket-timeout-ms=80 +``` + +#### Apache HTTP Client Configurations + +Apache HTTP Client has the following configurable properties: + +| Property | Default | Description | +|-------------------------------------------------------|---------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| http-client.apache.socket-timeout-ms | null | An optional [socket timeout](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#socketTimeout(java.time.Duration)) in milliseconds | +| http-client.apache.connection-timeout-ms | null | An optional [connection timeout](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#connectionTimeout(java.time.Duration)) in milliseconds | +| http-client.apache.connection-acquisition-timeout-ms | null | An optional [connection acquisition timeout](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#connectionAcquisitionTimeout(java.time.Duration)) in milliseconds | +| http-client.apache.connection-max-idle-time-ms | null | An optional [connection max idle timeout](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#connectionMaxIdleTime(java.time.Duration)) in milliseconds | +| http-client.apache.connection-time-to-live-ms | null | An optional [connection time to live](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#connectionTimeToLive(java.time.Duration)) in milliseconds | +| 
http-client.apache.expect-continue-enabled | null, disabled by default | An optional `true/false` setting that controls whether [expect continue](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#expectContinueEnabled(java.lang.Boolean)) is enabled | +| http-client.apache.max-connections | null | An optional [max connections](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#maxConnections(java.lang.Integer)) in integer | +| http-client.apache.tcp-keep-alive-enabled | null, disabled by default | An optional `true/false` setting that controls whether [tcp keep alive](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#tcpKeepAlive(java.lang.Boolean)) is enabled | +| http-client.apache.use-idle-connection-reaper-enabled | null, enabled by default | An optional `true/false` setting that controls whether [use idle connection reaper](https://sdk.amazonaws.com/java/api/latest/software/amazon/awssdk/http/apache/ApacheHttpClient.Builder.html#useIdleConnectionReaper(java.lang.Boolean)) is used | + +Users can use catalog properties to override the defaults. For example, to configure the max connections for Apache HTTP Client when starting a spark shell, one can add: +```shell +--conf spark.sql.catalog.my_catalog.http-client.apache.max-connections=5 +``` + +## Run Iceberg on AWS + +### Amazon Athena + +[Amazon Athena](https://aws.amazon.com/athena/) provides a serverless query engine that could be used to perform read, write, update and optimization tasks against Iceberg tables. +More details could be found [here](https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html). 
+ +### Amazon EMR + +[Amazon EMR](https://aws.amazon.com/emr/) can provision clusters with [Spark](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-spark.html) (EMR 6 for Spark 3, EMR 5 for Spark 2), +[Hive](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-hive.html), [Flink](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-flink.html), +[Trino](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-presto.html) that can run Iceberg. + +Starting with EMR version 6.5.0, EMR clusters can be configured to have the necessary Apache Iceberg dependencies installed without requiring bootstrap actions. +Please refer to the [official documentation](https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-iceberg-use-cluster.html) on how to create a cluster with Iceberg installed. + +For versions before 6.5.0, you can use a [bootstrap action](https://docs.aws.amazon.com/emr/latest/ManagementGuide/emr-plan-bootstrap.html) similar to the following to pre-install all necessary dependencies: +```sh +#!/bin/bash + +AWS_SDK_VERSION=2.20.18 +ICEBERG_VERSION={{ icebergVersion }} +MAVEN_URL=https://repo1.maven.org/maven2 +ICEBERG_MAVEN_URL=$MAVEN_URL/org/apache/iceberg +AWS_MAVEN_URL=$MAVEN_URL/software/amazon/awssdk +# NOTE: this is just an example shared class path between Spark and Flink, +# please choose a proper class path for production. 
+LIB_PATH=/usr/share/aws/aws-java-sdk/ + +AWS_PACKAGES=( + "bundle" +) + +ICEBERG_PACKAGES=( + "iceberg-spark-runtime-3.3_2.12" + "iceberg-flink-runtime" +) + +install_dependencies () { + install_path=$1 + download_url=$2 + version=$3 + shift 3 + pkgs=("$@") + for pkg in "${pkgs[@]}"; do + sudo wget -P $install_path $download_url/$pkg/$version/$pkg-$version.jar + done +} + +install_dependencies $LIB_PATH $ICEBERG_MAVEN_URL $ICEBERG_VERSION "${ICEBERG_PACKAGES[@]}" +install_dependencies $LIB_PATH $AWS_MAVEN_URL $AWS_SDK_VERSION "${AWS_PACKAGES[@]}" +``` + +### AWS Glue + +[AWS Glue](https://aws.amazon.com/glue/) provides a serverless data integration service +that could be used to perform read, write and update tasks against Iceberg tables. +More details could be found [here](https://docs.aws.amazon.com/glue/latest/dg/aws-glue-programming-etl-format-iceberg.html). + + +### AWS EKS + +[AWS Elastic Kubernetes Service (EKS)](https://aws.amazon.com/eks/) can be used to start any Spark, Flink, Hive, Presto or Trino clusters to work with Iceberg. +Search the [Iceberg blogs](../../blogs.md) page for tutorials around running Iceberg with Docker and Kubernetes. + +### Amazon Kinesis + +[Amazon Kinesis Data Analytics](https://aws.amazon.com/about-aws/whats-new/2019/11/you-can-now-run-fully-managed-apache-flink-applications-with-apache-kafka/) provides a platform +to run fully managed Apache Flink applications. You can include Iceberg in your application Jar and run it in the platform. diff --git a/docs-new/home/docs/latest/branching.md b/docs-new/home/docs/latest/branching.md new file mode 100644 index 000000000000..3ec28e628507 --- /dev/null +++ b/docs-new/home/docs/latest/branching.md @@ -0,0 +1,115 @@ +--- +title: "Branching and Tagging" +--- + + + +# Branching and Tagging + +## Overview + +Iceberg table metadata maintains a log of snapshots which represent the changes applied to a table. 
+Snapshots are fundamental in Iceberg as they are the basis for reader isolation and time travel queries. +For controlling metadata size and storage costs, Iceberg provides snapshot lifecycle management procedures such as [`expire_snapshots`](spark-procedures.md#expire-snapshots) for removing unused snapshots and data files that are no longer necessary, based on table snapshot retention properties. + +**For more sophisticated snapshot lifecycle management, Iceberg supports branches and tags which are named references to snapshots with their own independent lifecycles. This lifecycle is controlled by branch and tag level retention policies.** +Branches are independent lineages of snapshots and point to the head of the lineage. +Branches and tags have a maximum reference age property which controls when the reference to the snapshot itself should be expired. +Branches have retention properties which define the minimum number of snapshots to retain on a branch as well as the maximum age of individual snapshots to retain on the branch. +These properties are used when the expireSnapshots procedure is run. +For details on the algorithm for expireSnapshots, refer to the [spec](../../spec.md#snapshot-retention-policy). + +## Use Cases + +Branching and tagging can be used for handling GDPR requirements and retaining important historical snapshots for auditing. +Branches can also be used as part of data engineering workflows, for enabling experimental branches for testing and validating new jobs. +See below for some examples of how branching and tagging can facilitate these use cases. + +### Historical Tags + +Tags can be used for retaining important historical snapshots for auditing purposes. + +![Historical Tags](assets/images/historical-snapshot-tag.png) + +The above diagram demonstrates retaining important historical snapshots with the following retention policy, defined +via Spark SQL. + +1. Retain 1 snapshot per week for 1 month. 
This can be achieved by tagging the weekly snapshot and setting the tag retention to be a month. +snapshots will be kept, and the branch reference itself will be retained for 1 week. +```sql +-- Create a tag for the first end of week snapshot. Retain the snapshot for a week +ALTER TABLE prod.db.table CREATE TAG 'EOW-01' AS OF VERSION 7 RETAIN 7 DAYS +``` + +2. Retain 1 snapshot per month for 6 months. This can be achieved by tagging the monthly snapshot and setting the tag retention to be 6 months. +```sql +-- Create a tag for the first end of month snapshot. Retain the snapshot for 6 months +ALTER TABLE prod.db.table CREATE TAG 'EOM-01' AS OF VERSION 30 RETAIN 180 DAYS +``` + +3. Retain 1 snapshot per year forever. This can be achieved by tagging the annual snapshot. The default retention for branches and tags is forever. +```sql +-- Create a tag for the end of the year and retain it forever. +ALTER TABLE prod.db.table CREATE TAG 'EOY-2023' AS OF VERSION 365 +``` + +4. Create a temporary "test-branch" which is retained for 7 days and the latest 2 snapshots on the branch are retained. +```sql +-- Create a branch "test-branch" which will be retained for 7 days along with the latest 2 snapshots +ALTER TABLE prod.db.table CREATE BRANCH test-branch RETAIN 7 DAYS WITH RETENTION 2 SNAPSHOTS +``` + +### Audit Branch + +![Audit Branch](assets/images/audit-branch.png) + +The above diagram shows an example of using an audit branch for validating a write workflow. + +1. First ensure `write.wap.enabled` is set. +```sql +ALTER TABLE db.table SET TBLPROPERTIES ( + 'write.wap.enabled'='true' +) +``` +2. Create `audit-branch` starting from snapshot 3, which will be written to and retained for 1 week. +```sql +ALTER TABLE db.table CREATE BRANCH `audit-branch` AS OF VERSION 3 RETAIN 7 DAYS +``` +3. Writes are performed on a separate `audit-branch` independent from the main table history. 
+```sql +-- WAP Branch write +SET spark.wap.branch = 'audit-branch' +INSERT INTO prod.db.table VALUES (3, 'c') +``` +4. A validation workflow can validate (e.g. data quality) the state of `audit-branch`. +5. After validation, the main branch can be `fastForward` to the head of `audit-branch` to update the main table state. +```java +table.manageSnapshots().fastForward("main", "audit-branch").commit() +``` +6. The branch reference will be removed when `expireSnapshots` is run 1 week later. + +## Usage + +Creating, querying and writing to branches and tags are supported in the Iceberg Java library, and in Spark and Flink engine integrations. + +- [Iceberg Java Library](java-api-quickstart.md#branching-and-tagging) +- [Spark DDLs](spark-ddl.md#branching-and-tagging-ddl) +- [Spark Reads](spark-queries.md#time-travel) +- [Spark Branch Writes](spark-writes.md#writing-to-branches) +- [Flink Reads](flink-queries.md#reading-branches-and-tags-with-SQL) +- [Flink Branch Writes](flink-writes.md#branch-writes) diff --git a/docs-new/home/docs/latest/configuration.md b/docs-new/home/docs/latest/configuration.md new file mode 100644 index 000000000000..bc919150a3e1 --- /dev/null +++ b/docs-new/home/docs/latest/configuration.md @@ -0,0 +1,191 @@ +--- +title: "Configuration" +--- + + +# Configuration + +## Table properties + +Iceberg tables support table properties to configure table behavior, like the default split size for readers. 
+ +### Read properties + +| Property | Default | Description | +| --------------------------------- | ------------------ | ------------------------------------------------------ | +| read.split.target-size | 134217728 (128 MB) | Target size when combining data input splits | +| read.split.metadata-target-size | 33554432 (32 MB) | Target size when combining metadata input splits | +| read.split.planning-lookback | 10 | Number of bins to consider when combining input splits | +| read.split.open-file-cost | 4194304 (4 MB) | The estimated cost to open a file, used as a minimum weight when combining splits. | +| read.parquet.vectorization.enabled| true | Controls whether Parquet vectorized reads are used | +| read.parquet.vectorization.batch-size| 5000 | The batch size for parquet vectorized reads | +| read.orc.vectorization.enabled | false | Controls whether orc vectorized reads are used | +| read.orc.vectorization.batch-size | 5000 | The batch size for orc vectorized reads | + +### Write properties + +| Property | Default | Description | +|-----------------------------------------------------|----------------------------|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| write.format.default | parquet | Default file format for the table; parquet, avro, or orc | +| write.delete.format.default | data file format | Default delete file format for the table; parquet, avro, or orc | +| write.parquet.row-group-size-bytes | 134217728 (128 MB) | Parquet row group size | +| write.parquet.page-size-bytes | 1048576 (1 MB) | Parquet page size | +| write.parquet.page-row-limit | 20000 | Parquet page row limit | +| write.parquet.dict-size-bytes | 2097152 (2 MB) | Parquet dictionary page size | +| write.parquet.compression-codec | gzip | Parquet compression codec: zstd, brotli, lz4, gzip, snappy, uncompressed | +| 
write.parquet.compression-level | null | Parquet compression level | +| write.parquet.bloom-filter-enabled.column.col1 | (not set) | Enables writing a bloom filter for the column: col1 | +| write.parquet.bloom-filter-max-bytes | 1048576 (1 MB) | The maximum number of bytes for a bloom filter bitset | +| write.avro.compression-codec | gzip | Avro compression codec: gzip(deflate with 9 level), zstd, snappy, uncompressed | +| write.avro.compression-level | null | Avro compression level | +| write.orc.stripe-size-bytes | 67108864 (64 MB) | Define the default ORC stripe size, in bytes | +| write.orc.block-size-bytes | 268435456 (256 MB) | Define the default file system block size for ORC files | +| write.orc.compression-codec | zlib | ORC compression codec: zstd, lz4, lzo, zlib, snappy, none | +| write.orc.compression-strategy | speed | ORC compression strategy: speed, compression | +| write.orc.bloom.filter.columns | (not set) | Comma separated list of column names for which a Bloom filter must be created | +| write.orc.bloom.filter.fpp | 0.05 | False positive probability for Bloom filter (must > 0.0 and < 1.0) | +| write.location-provider.impl | null | Optional custom implementation for LocationProvider | +| write.metadata.compression-codec | none | Metadata compression codec; none or gzip | +| write.metadata.metrics.max-inferred-column-defaults | 100 | Defines the maximum number of columns for which metrics are collected | +| write.metadata.metrics.default | truncate(16) | Default metrics mode for all columns in the table; none, counts, truncate(length), or full | +| write.metadata.metrics.column.col1 | (not set) | Metrics mode for column 'col1' to allow per-column tuning; none, counts, truncate(length), or full | +| write.target-file-size-bytes | 536870912 (512 MB) | Controls the size of files generated to target about this many bytes | +| write.delete.target-file-size-bytes | 67108864 (64 MB) | Controls the size of delete files generated to target about this many 
bytes | +| write.distribution-mode | none | Defines distribution of write data: __none__: don't shuffle rows; __hash__: hash distribute by partition key ; __range__: range distribute by partition key or sort key if table has an SortOrder | +| write.delete.distribution-mode | hash | Defines distribution of write delete data | +| write.update.distribution-mode | hash | Defines distribution of write update data | +| write.merge.distribution-mode | none | Defines distribution of write merge data | +| write.wap.enabled | false | Enables write-audit-publish writes | +| write.summary.partition-limit | 0 | Includes partition-level summary stats in snapshot summaries if the changed partition count is less than this limit | +| write.metadata.delete-after-commit.enabled | false | Controls whether to delete the oldest **tracked** version metadata files after commit | +| write.metadata.previous-versions-max | 100 | The max number of previous version metadata files to keep before deleting after commit | +| write.spark.fanout.enabled | false | Enables the fanout writer in Spark that does not require data to be clustered; uses more memory | +| write.object-storage.enabled | false | Enables the object storage location provider that adds a hash component to file paths | +| write.data.path | table location + /data | Base location for data files | +| write.metadata.path | table location + /metadata | Base location for metadata files | +| write.delete.mode | copy-on-write | Mode used for delete commands: copy-on-write or merge-on-read (v2 only) | +| write.delete.isolation-level | serializable | Isolation level for delete commands: serializable or snapshot | +| write.update.mode | copy-on-write | Mode used for update commands: copy-on-write or merge-on-read (v2 only) | +| write.update.isolation-level | serializable | Isolation level for update commands: serializable or snapshot | +| write.merge.mode | copy-on-write | Mode used for merge commands: copy-on-write or merge-on-read (v2 only) 
| +| write.merge.isolation-level | serializable | Isolation level for merge commands: serializable or snapshot | + +### Table behavior properties + +| Property | Default | Description | +| ---------------------------------- | ---------------- | ------------------------------------------------------------- | +| commit.retry.num-retries | 4 | Number of times to retry a commit before failing | +| commit.retry.min-wait-ms | 100 | Minimum time in milliseconds to wait before retrying a commit | +| commit.retry.max-wait-ms | 60000 (1 min) | Maximum time in milliseconds to wait before retrying a commit | +| commit.retry.total-timeout-ms | 1800000 (30 min) | Total retry timeout period in milliseconds for a commit | +| commit.status-check.num-retries | 3 | Number of times to check whether a commit succeeded after a connection is lost before failing due to an unknown commit state | +| commit.status-check.min-wait-ms | 1000 (1s) | Minimum time in milliseconds to wait before retrying a status-check | +| commit.status-check.max-wait-ms | 60000 (1 min) | Maximum time in milliseconds to wait before retrying a status-check | +| commit.status-check.total-timeout-ms| 1800000 (30 min) | Total timeout period in which the commit status-check must succeed, in milliseconds | +| commit.manifest.target-size-bytes | 8388608 (8 MB) | Target size when merging manifest files | +| commit.manifest.min-count-to-merge | 100 | Minimum number of manifests to accumulate before merging | +| commit.manifest-merge.enabled | true | Controls whether to automatically merge manifests on writes | +| history.expire.max-snapshot-age-ms | 432000000 (5 days) | Default max age of snapshots to keep on the table and all of its branches while expiring snapshots | +| history.expire.min-snapshots-to-keep | 1 | Default min number of snapshots to keep on the table and all of its branches while expiring snapshots | +| history.expire.max-ref-age-ms | `Long.MAX_VALUE` (forever) | For snapshot references except the `main` 
branch, default max age of snapshot references to keep while expiring snapshots. The `main` branch never expires. | + +### Reserved table properties +Reserved table properties are only used to control behaviors when creating or updating a table. +The value of these properties are not persisted as a part of the table metadata. + +| Property | Default | Description | +| -------------- | -------- | ------------------------------------------------------------- | +| format-version | 1 | Table's format version (can be 1 or 2) as defined in the [Spec](../../spec.md#format-versioning). | + +### Compatibility flags + +| Property | Default | Description | +| --------------------------------------------- | -------- | ------------------------------------------------------------- | +| compatibility.snapshot-id-inheritance.enabled | false | Enables committing snapshots without explicit snapshot IDs | + +## Catalog properties + +Iceberg catalogs support using catalog properties to configure catalog behaviors. Here is a list of commonly used catalog properties: + +| Property | Default | Description | +| --------------------------------- | ------------------ | ------------------------------------------------------ | +| catalog-impl | null | a custom `Catalog` implementation to use by an engine | +| io-impl | null | a custom `FileIO` implementation to use in a catalog | +| warehouse | null | the root path of the data warehouse | +| uri | null | a URI string, such as Hive metastore URI | +| clients | 2 | client pool size | +| cache-enabled | true | Whether to cache catalog entries | +| cache.expiration-interval-ms | 30000 | How long catalog entries are locally cached, in milliseconds; 0 disables caching, negative values disable expiration | + +`HadoopCatalog` and `HiveCatalog` can access the properties in their constructors. +Any other custom catalog can access the properties by implementing `Catalog.initialize(catalogName, catalogProperties)`. 
+The properties can be manually constructed or passed in from a compute engine like Spark or Flink. +Spark uses its session properties as catalog properties, see more details in the [Spark configuration](spark-configuration.md#catalog-configuration) section. +Flink passes in catalog properties through `CREATE CATALOG` statement, see more details in the [Flink](flink.md#adding-catalogs) section. + +### Lock catalog properties + +Here are the catalog properties related to locking. They are used by some catalog implementations to control the locking behavior during commits. + +| Property | Default | Description | +| --------------------------------- | ------------------ | ------------------------------------------------------ | +| lock-impl | null | a custom implementation of the lock manager, the actual interface depends on the catalog used | +| lock.table | null | an auxiliary table for locking, such as in [AWS DynamoDB lock manager](aws.md#dynamodb-lock-manager) | +| lock.acquire-interval-ms | 5000 (5 s) | the interval to wait between each attempt to acquire a lock | +| lock.acquire-timeout-ms | 180000 (3 min) | the maximum time to try acquiring a lock | +| lock.heartbeat-interval-ms | 3000 (3 s) | the interval to wait between each heartbeat after acquiring a lock | +| lock.heartbeat-timeout-ms | 15000 (15 s) | the maximum time without a heartbeat to consider a lock expired | + + +## Hadoop configuration + +The following properties from the Hadoop configuration are used by the Hive Metastore connector. +The HMS table locking is a 2-step process: +1. Lock Creation: Create lock in HMS and queue for acquisition +2. 
Lock Check: Check if the lock was successfully acquired + +| Property | Default | Description | +|-------------------------------------------|-----------------|------------------------------------------------------------------------------| +| iceberg.hive.client-pool-size | 5 | The size of the Hive client pool when tracking tables in HMS | +| iceberg.hive.lock-creation-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to create a lock in the HMS | +| iceberg.hive.lock-creation-min-wait-ms | 50 | Minimum time in milliseconds between retries of creating the lock in the HMS | +| iceberg.hive.lock-creation-max-wait-ms | 5000 | Maximum time in milliseconds between retries of creating the lock in the HMS | +| iceberg.hive.lock-timeout-ms | 180000 (3 min) | Maximum time in milliseconds to acquire a lock | +| iceberg.hive.lock-check-min-wait-ms | 50 | Minimum time in milliseconds between checking the acquisition of the lock | +| iceberg.hive.lock-check-max-wait-ms | 5000 | Maximum time in milliseconds between checking the acquisition of the lock | +| iceberg.hive.lock-heartbeat-interval-ms | 240000 (4 min) | The heartbeat interval for the HMS locks. | +| iceberg.hive.metadata-refresh-max-retries | 2 | Maximum number of retries when the metadata file is missing | +| iceberg.hive.table-level-lock-evict-ms | 600000 (10 min) | The timeout for the JVM table lock | +| iceberg.engine.hive.lock-enabled | true | Use HMS locks to ensure atomicity of commits | + +Note: `iceberg.hive.lock-check-max-wait-ms` and `iceberg.hive.lock-heartbeat-interval-ms` should be less than the [transaction timeout](https://cwiki.apache.org/confluence/display/Hive/Configuration+Properties#ConfigurationProperties-hive.txn.timeout) +of the Hive Metastore (`hive.txn.timeout` or `metastore.txn.timeout` in the newer versions). Otherwise, the heartbeats on the lock (which happen during the lock checks) would end up expiring in the +Hive Metastore before the lock is retried from Iceberg. 
+ +Warn: Setting `iceberg.engine.hive.lock-enabled`=`false` will cause HiveCatalog to commit to tables without using Hive locks. +This should only be set to `false` if all of the following conditions are met: + - [HIVE-26882](https://issues.apache.org/jira/browse/HIVE-26882) +is available on the Hive Metastore server + - All other HiveCatalogs committing to tables that this HiveCatalog commits to are also on Iceberg 1.3 or later + - All other HiveCatalogs committing to tables that this HiveCatalog commits to have also disabled Hive locks on commit. + +**Failing to ensure these conditions risks corrupting the table.** + +Even with `iceberg.engine.hive.lock-enabled` set to `false`, a HiveCatalog can still use locks for individual tables by setting the table property `engine.hive.lock-enabled`=`true`. +This is useful in the case where other HiveCatalogs cannot be upgraded and set to commit without using Hive locks. + diff --git a/docs-new/home/docs/latest/custom-catalog.md b/docs-new/home/docs/latest/custom-catalog.md new file mode 100644 index 000000000000..bfcb786cb449 --- /dev/null +++ b/docs-new/home/docs/latest/custom-catalog.md @@ -0,0 +1,268 @@ +--- +title: "Java Custom Catalog" +--- + + +# Custom Catalog + +It's possible to read an Iceberg table either from an HDFS path or from a Hive table. It's also possible to use a custom metastore in place of Hive. The steps to do that are as follows. 
+ +- [Custom TableOperations](#custom-table-operations-implementation) +- [Custom Catalog](#custom-catalog-implementation) +- [Custom FileIO](#custom-file-io-implementation) +- [Custom LocationProvider](#custom-location-provider-implementation) +- [Custom IcebergSource](#custom-icebergsource) + +### Custom table operations implementation +Extend `BaseMetastoreTableOperations` to provide implementation on how to read and write metadata + +Example: +```java +class CustomTableOperations extends BaseMetastoreTableOperations { + private String dbName; + private String tableName; + private Configuration conf; + private FileIO fileIO; + + protected CustomTableOperations(Configuration conf, String dbName, String tableName) { + this.conf = conf; + this.dbName = dbName; + this.tableName = tableName; + } + + // The doRefresh method should provide implementation on how to get the metadata location + @Override + public void doRefresh() { + + // Example custom service which returns the metadata location given a dbName and tableName + String metadataLocation = CustomService.getMetadataForTable(conf, dbName, tableName); + + // When updating from a metadata file location, call the helper method + refreshFromMetadataLocation(metadataLocation); + + } + + // The doCommit method should provide implementation on how to update with metadata location atomically + @Override + public void doCommit(TableMetadata base, TableMetadata metadata) { + String oldMetadataLocation = base.location(); + + // Write new metadata using helper method + String newMetadataLocation = writeNewMetadata(metadata, currentVersion() + 1); + + // Example custom service which updates the metadata location for the given db and table atomically + CustomService.updateMetadataLocation(dbName, tableName, oldMetadataLocation, newMetadataLocation); + + } + + // The io method provides a FileIO which is used to read and write the table metadata files + @Override + public FileIO io() { + if (fileIO == null) { + fileIO = new 
HadoopFileIO(conf); + } + return fileIO; + } +} +``` + +A `TableOperations` instance is usually obtained by calling `Catalog.newTableOps(TableIdentifier)`. +See the next section about implementing and loading a custom catalog. + +### Custom catalog implementation +Extend `BaseMetastoreCatalog` to provide default warehouse locations and instantiate `CustomTableOperations` + +Example: +```java +public class CustomCatalog extends BaseMetastoreCatalog { + + private Configuration configuration; + + // must have a no-arg constructor to be dynamically loaded + // initialize(String name, Map properties) will be called to complete initialization + public CustomCatalog() { + } + + public CustomCatalog(Configuration configuration) { + this.configuration = configuration; + } + + @Override + protected TableOperations newTableOps(TableIdentifier tableIdentifier) { + String dbName = tableIdentifier.namespace().level(0); + String tableName = tableIdentifier.name(); + // instantiate the CustomTableOperations + return new CustomTableOperations(configuration, dbName, tableName); + } + + @Override + protected String defaultWarehouseLocation(TableIdentifier tableIdentifier) { + + // Can choose to use any other configuration name + String tableLocation = configuration.get("custom.iceberg.warehouse.location"); + + // Can be an s3 or hdfs path + if (tableLocation == null) { + throw new RuntimeException("custom.iceberg.warehouse.location configuration not set!"); + } + + return String.format( + "%s/%s.db/%s", tableLocation, + tableIdentifier.namespace().levels()[0], + tableIdentifier.name()); + } + + @Override + public boolean dropTable(TableIdentifier identifier, boolean purge) { + // Example service to delete table + CustomService.deleteTable(identifier.namespace().level(0), identifier.name()); + } + + @Override + public void renameTable(TableIdentifier from, TableIdentifier to) { + Preconditions.checkArgument(from.namespace().level(0).equals(to.namespace().level(0)), + "Cannot move 
table between databases"); + // Example service to rename table + CustomService.renameTable(from.namespace().level(0), from.name(), to.name()); + } + + // implement this method to read catalog name and properties during initialization + public void initialize(String name, Map properties) { + } +} +``` + +Catalog implementations can be dynamically loaded in most compute engines. +For Spark and Flink, you can specify the `catalog-impl` catalog property to load it. +Read the [Configuration](configuration.md#catalog-properties) section for more details. +For MapReduce, implement `org.apache.iceberg.mr.CatalogLoader` and set Hadoop property `iceberg.mr.catalog.loader.class` to load it. +If your catalog must read Hadoop configuration to access certain environment properties, make your catalog implement `org.apache.hadoop.conf.Configurable`. + +### Custom file IO implementation + +Extend `FileIO` and provide implementation to read and write data files + +Example: +```java +public class CustomFileIO implements FileIO { + + // must have a no-arg constructor to be dynamically loaded + // initialize(Map properties) will be called to complete initialization + public CustomFileIO() { + } + + @Override + public InputFile newInputFile(String s) { + // you also need to implement the InputFile interface for a custom input file + return new CustomInputFile(s); + } + + @Override + public OutputFile newOutputFile(String s) { + // you also need to implement the OutputFile interface for a custom output file + return new CustomOutputFile(s); + } + + @Override + public void deleteFile(String path) { + Path toDelete = new Path(path); + FileSystem fs = Util.getFs(toDelete); + try { + fs.delete(toDelete, false /* not recursive */); + } catch (IOException e) { + throw new RuntimeIOException(e, "Failed to delete file: %s", path); + } + } + + // implement this method to read catalog properties during initialization + public void initialize(Map properties) { + } +} +``` + +If you are already 
implementing your own catalog, you can implement `TableOperations.io()` to use your custom `FileIO`. +In addition, custom `FileIO` implementations can also be dynamically loaded in `HadoopCatalog` and `HiveCatalog` by specifying the `io-impl` catalog property. +Read the [Configuration](configuration.md#catalog-properties) section for more details. +If your `FileIO` must read Hadoop configuration to access certain environment properties, make your `FileIO` implement `org.apache.hadoop.conf.Configurable`. + +### Custom location provider implementation + +Extend `LocationProvider` and provide implementation to determine the file path to write data + +Example: +```java +public class CustomLocationProvider implements LocationProvider { + + private String tableLocation; + + // must have a 2-arg constructor like this, or a no-arg constructor + public CustomLocationProvider(String tableLocation, Map properties) { + this.tableLocation = tableLocation; + } + + @Override + public String newDataLocation(String filename) { + // can use any custom method to generate a file path given a file name + return String.format("%s/%s/%s", tableLocation, UUID.randomUUID().toString(), filename); + } + + @Override + public String newDataLocation(PartitionSpec spec, StructLike partitionData, String filename) { + // can use any custom method to generate a file path given a partition info and file name + return newDataLocation(filename); + } +} +``` + +If you are already implementing your own catalog, you can override `TableOperations.locationProvider()` to use your custom default `LocationProvider`. 
+To use a different custom location provider for a specific table, specify the implementation when creating the table using table property `write.location-provider.impl` + +Example: +```sql +CREATE TABLE hive.default.my_table ( + id bigint, + data string, + category string) +USING iceberg +OPTIONS ( + 'write.location-provider.impl'='com.my.CustomLocationProvider' +) +PARTITIONED BY (category); +``` + +### Custom IcebergSource +Extend `IcebergSource` and provide implementation to read from `CustomCatalog` + +Example: +```java +public class CustomIcebergSource extends IcebergSource { + + @Override + protected Table findTable(DataSourceOptions options, Configuration conf) { + Optional path = options.get("path"); + Preconditions.checkArgument(path.isPresent(), "Cannot open table: path is not set"); + + // Read table from CustomCatalog + CustomCatalog catalog = new CustomCatalog(conf); + TableIdentifier tableIdentifier = TableIdentifier.parse(path.get()); + return catalog.loadTable(tableIdentifier); + } +} +``` + +Register the `CustomIcebergSource` by updating `META-INF/services/org.apache.spark.sql.sources.DataSourceRegister` with its fully qualified name diff --git a/docs-new/home/docs/latest/dell.md b/docs-new/home/docs/latest/dell.md new file mode 100644 index 000000000000..ddeebeca657d --- /dev/null +++ b/docs-new/home/docs/latest/dell.md @@ -0,0 +1,130 @@ +--- +title: "Dell" +--- + + + +# Iceberg Dell Integration + +## Dell ECS Integration + +Iceberg can be used with Dell's Enterprise Object Storage (ECS) by using the ECS catalog since 0.15.0. + +See [Dell ECS](https://www.dell.com/en-us/dt/storage/ecs/index.htm) for more information on Dell ECS. 
+ +### Parameters + +When using Dell ECS with Iceberg, these configuration parameters are required: + +| Name | Description | +| ------------------------ | --------------------------------- | +| ecs.s3.endpoint | ECS S3 service endpoint | +| ecs.s3.access-key-id | ECS Username | +| ecs.s3.secret-access-key | S3 Secret Key | +| warehouse | The location of data and metadata | + +The warehouse should use the following formats: + +| Example | Description | +| -------------------------- | --------------------------------------------------------------- | +| ecs://bucket-a | Use the whole bucket as the data | +| ecs://bucket-a/ | Use the whole bucket as the data. The last `/` is ignored. | +| ecs://bucket-a/namespace-a | Use a prefix to access the data only in this specific namespace | + +The Iceberg `runtime` jar supports different versions of Spark and Flink. You should pick the correct version. + +Even though the [Dell ECS client](https://github.com/EMCECS/ecs-object-client-java) jar is backward compatible, Dell EMC still recommends using the latest version of the client. 
+ +### Spark + +To use the Dell ECS catalog with Spark 3.2.1, you should create a Spark session like: + +```bash +ICEBERG_VERSION=0.15.0 +SPARK_VERSION=3.2_2.12 +ECS_CLIENT_VERSION=3.3.2 + +DEPENDENCIES="org.apache.iceberg:iceberg-spark-runtime-${SPARK_VERSION}:${ICEBERG_VERSION},\ +org.apache.iceberg:iceberg-dell:${ICEBERG_VERSION},\ +com.emc.ecs:object-client-bundle:${ECS_CLIENT_VERSION}" + +spark-sql --packages ${DEPENDENCIES} \ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=ecs://bucket-a/namespace-a \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.dell.ecs.EcsCatalog \ + --conf spark.sql.catalog.my_catalog.ecs.s3.endpoint=http://10.x.x.x:9020 \ + --conf spark.sql.catalog.my_catalog.ecs.s3.access-key-id= \ + --conf spark.sql.catalog.my_catalog.ecs.s3.secret-access-key= +``` + +Then, use `my_catalog` to access the data in ECS. You can use `SHOW NAMESPACES IN my_catalog` and `SHOW TABLES IN my_catalog` to fetch the namespaces and tables of the catalog. + +The related problems of catalog usage: + +1. The `SparkSession.catalog` won't access the 3rd-party catalog of Spark in both Python and Scala, so please use DDL SQL to list all tables and namespaces. + + +### Flink + +To use the Dell ECS catalog with Flink, you must first create a Flink environment. + +```bash +# HADOOP_HOME is your hadoop root directory after unpacking the binary package. 
+export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath` + +# download Iceberg dependency +MAVEN_URL=https://repo1.maven.org/maven2 +ICEBERG_VERSION=0.15.0 +FLINK_VERSION=1.14 +wget ${MAVEN_URL}/org/apache/iceberg/iceberg-flink-runtime-${FLINK_VERSION}/${ICEBERG_VERSION}/iceberg-flink-runtime-${FLINK_VERSION}-${ICEBERG_VERSION}.jar +wget ${MAVEN_URL}/org/apache/iceberg/iceberg-dell/${ICEBERG_VERSION}/iceberg-dell-${ICEBERG_VERSION}.jar + +# download ECS object client +ECS_CLIENT_VERSION=3.3.2 +wget ${MAVEN_URL}/com/emc/ecs/object-client-bundle/${ECS_CLIENT_VERSION}/object-client-bundle-${ECS_CLIENT_VERSION}.jar + +# open the SQL client. +/path/to/bin/sql-client.sh embedded \ + -j iceberg-flink-runtime-${FLINK_VERSION}-${ICEBERG_VERSION}.jar \ + -j iceberg-dell-${ICEBERG_VERSION}.jar \ + -j object-client-bundle-${ECS_CLIENT_VERSION}.jar \ + shell +``` + +Then, use Flink SQL to create a catalog named `my_catalog`: + +```SQL +CREATE CATALOG my_catalog WITH ( + 'type'='iceberg', + 'warehouse' = 'ecs://bucket-a/namespace-a', + 'catalog-impl'='org.apache.iceberg.dell.ecs.EcsCatalog', + 'ecs.s3.endpoint' = 'http://10.x.x.x:9020', + 'ecs.s3.access-key-id' = '', + 'ecs.s3.secret-access-key' = '') +``` + +Then, you can run `USE CATALOG my_catalog`, `SHOW DATABASES`, and `SHOW TABLES` to fetch the namespaces and tables of the catalog. + +### Limitations + +When you use the catalog with Dell ECS only, you should care about these limitations: + +1. `RENAME` statements are supported without other protections. When you try to rename a table, you need to guarantee all commits are finished in the original table. +2. `RENAME` statements only rename the table without moving any data files. This can lead to a table's data being stored in a path outside of the configured warehouse path. +3. The CAS operations used by table commits are based on the checksum of the object. There is a very small probability of a checksum conflict. 
diff --git a/docs-new/home/docs/latest/delta-lake-migration.md b/docs-new/home/docs/latest/delta-lake-migration.md new file mode 100644 index 000000000000..45f4f09593ce --- /dev/null +++ b/docs-new/home/docs/latest/delta-lake-migration.md @@ -0,0 +1,118 @@ +--- +title: "Delta Lake Migration" +--- + + +# Delta Lake Table Migration +Delta Lake is a table format that supports Parquet file format and provides time travel and versioning features. When migrating data from Delta Lake to Iceberg, +it is common to migrate all snapshots to maintain the history of the data. + +Currently, Iceberg supports the Snapshot Table action for migrating from Delta Lake to Iceberg tables. +Since Delta Lake tables maintain transactions, all available transactions will be committed to the new Iceberg table as transactions in order. +For Delta Lake tables, any additional data files added after the initial migration will be included in their corresponding transactions and subsequently added to the new Iceberg table using the Add Transaction action. +The Add Transaction action, a variant of the Add File action, is still under development. + +## Enabling Migration from Delta Lake to Iceberg +The `iceberg-delta-lake` module is not bundled with Spark and Flink engine runtimes. 
To enable the Delta Lake migration features, the minimum required dependencies are: + + - [iceberg-delta-lake](https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-delta-lake/1.2.1/iceberg-delta-lake-1.2.1.jar) + - [delta-standalone-0.6.0](https://repo1.maven.org/maven2/io/delta/delta-standalone_2.13/0.6.0/delta-standalone_2.13-0.6.0.jar) + - [delta-storage-2.2.0](https://repo1.maven.org/maven2/io/delta/delta-storage/2.2.0/delta-storage-2.2.0.jar) + +### Compatibilities +The module is built and tested with `Delta Standalone:0.6.0` and supports Delta Lake tables with the following protocol version: +* `minReaderVersion`: 1 +* `minWriterVersion`: 2 + +Please refer to [Delta Lake Table Protocol Versioning](https://docs.delta.io/latest/versioning.html) for more details about Delta Lake protocol versions. + +### API +The `iceberg-delta-lake` module provides an interface named `DeltaLakeToIcebergMigrationActionsProvider`, which contains actions that help convert from Delta Lake to Iceberg. +The supported actions are: +* `snapshotDeltaLakeTable`: snapshot an existing Delta Lake table to an Iceberg table + +### Default Implementation +The `iceberg-delta-lake` module also provides a default implementation of the interface which can be accessed by +```java +DeltaLakeToIcebergMigrationActionsProvider defaultActions = DeltaLakeToIcebergMigrationActionsProvider.defaultActions() +``` + +## Snapshot Delta Lake Table to Iceberg +The action `snapshotDeltaLakeTable` reads the Delta Lake table's transactions and converts them to a new Iceberg table with the same schema and partitioning in one iceberg transaction. +The original Delta Lake table remains unchanged. + +The newly created table can be changed or written to without affecting the source table, but the snapshot uses the original table's data files. +Existing data files are added to the Iceberg table's metadata and can be read using a name-to-id mapping created from the original table schema. 
+ +When inserts or overwrites run on the snapshot, new files are placed in the snapshot table's location. The location defaults to that +of the source Delta Lake table. Users can also specify a different location for the snapshot table. + +!!! info + Because tables created by `snapshotDeltaLakeTable` are not the sole owners of their data files, they are prohibited from + actions like `expire_snapshots` which would physically delete data files. Iceberg deletes, which only affect metadata, + are still allowed. In addition, any operations which affect the original data files will disrupt the Snapshot's + integrity. DELETE statements executed against the original Delta Lake table will remove original data files and the + `snapshotDeltaLakeTable` table will no longer be able to access them. + + +#### Usage +| Required Input | Configured By | Description | +|------------------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|---------------------------------------------------------------------------------| +| Source Table Location | Argument [`sourceTableLocation`](../../javadoc/latest/org/apache/iceberg/delta/DeltaLakeToIcebergMigrationActionsProvider.html#snapshotDeltaLakeTable(java.lang.String)) | The location of the source Delta Lake table | +| New Iceberg Table Identifier | Configuration API [`as`](../../javadoc/latest/org/apache/iceberg/delta/SnapshotDeltaLakeTable.html#as(org.apache.iceberg.catalog.TableIdentifier)) | The identifier specifies the namespace and table name for the new iceberg table | +| Iceberg Catalog | Configuration API [`icebergCatalog`](../../javadoc/latest/org/apache/iceberg/delta/SnapshotDeltaLakeTable.html#icebergCatalog(org.apache.iceberg.catalog.Catalog)) | The catalog used to create the new iceberg table | +| Hadoop Configuration | Configuration API 
[`deltaLakeConfiguration`](../../javadoc/latest/org/apache/iceberg/delta/SnapshotDeltaLakeTable.html#deltaLakeConfiguration(org.apache.hadoop.conf.Configuration)) | The Hadoop Configuration used to read the source Delta Lake table. | + +For detailed usage and other optional configurations, please refer to the [SnapshotDeltaLakeTable API](../../javadoc/latest/org/apache/iceberg/delta/SnapshotDeltaLakeTable.html) + +#### Output +| Output Name | Type | Description | +| ------------|------|-------------| +| `imported_files_count` | long | Number of files added to the new table | + +#### Added Table Properties +The following table properties are added to the Iceberg table to be created by default: + +| Property Name | Value | Description | +|-------------------------------|-------------------------------------------|--------------------------------------------------------------------| +| `snapshot_source` | `delta` | Indicates that the table is snapshot from a delta lake table | +| `original_location` | location of the delta lake table | The absolute path to the location of the original delta lake table | +| `schema.name-mapping.default` | JSON name mapping derived from the schema | The name mapping string used to read Delta Lake table's data files | + +#### Examples +```java +import org.apache.iceberg.catalog.TableIdentifier; +import org.apache.iceberg.catalog.Catalog; +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.delta.DeltaLakeToIcebergMigrationActionsProvider; + +String sourceDeltaLakeTableLocation = "s3://my-bucket/delta-table"; +String destTableLocation = "s3://my-bucket/iceberg-table"; +TableIdentifier destTableIdentifier = TableIdentifier.of("my_db", "my_table"); +Catalog icebergCatalog = ...; // Iceberg Catalog fetched from engines like Spark or created via CatalogUtil.loadCatalog +Configuration hadoopConf = ...; // Hadoop Configuration fetched from engines like Spark and have proper file system configuration to access the Delta Lake 
table. + +DeltaLakeToIcebergMigrationActionsProvider.defaultActions() + .snapshotDeltaLakeTable(sourceDeltaLakeTableLocation) + .as(destTableIdentifier) + .icebergCatalog(icebergCatalog) + .tableLocation(destTableLocation) + .deltaLakeConfiguration(hadoopConf) + .tableProperty("my_property", "my_value") + .execute(); +``` diff --git a/docs-new/home/docs/latest/evolution.md b/docs-new/home/docs/latest/evolution.md new file mode 100644 index 000000000000..98bdbd27c0c5 --- /dev/null +++ b/docs-new/home/docs/latest/evolution.md @@ -0,0 +1,101 @@ +--- +title: Evolution +--- + + +# Evolution + +Iceberg supports **in-place table evolution**. You can [evolve a table schema](#schema-evolution) just like SQL -- even in nested structures -- or [change partition layout](#partition-evolution) when data volume changes. Iceberg does not require costly distractions, like rewriting table data or migrating to a new table. + +For example, Hive table partitioning cannot change so moving from a daily partition layout to an hourly partition layout requires a new table. And because queries are dependent on partitions, queries must be rewritten for the new table. In some cases, even changes as simple as renaming a column are either not supported, or can cause [data correctness](#correctness) problems. + +## Schema evolution + +Iceberg supports the following schema evolution changes: + +* **Add** -- add a new column to the table or to a nested struct +* **Drop** -- remove an existing column from the table or a nested struct +* **Rename** -- rename an existing column or field in a nested struct +* **Update** -- widen the type of a column, struct field, map key, map value, or list element +* **Reorder** -- change the order of columns or fields in a nested struct + +Iceberg schema updates are **metadata changes**, so no data files need to be rewritten to perform the update. + +Note that map keys do not support adding or dropping struct fields that would change equality. 
+ +### Correctness + +Iceberg guarantees that **schema evolution changes are independent and free of side-effects**, without rewriting files: + +1. Added columns never read existing values from another column. +2. Dropping a column or field does not change the values in any other column. +3. Updating a column or field does not change values in any other column. +4. Changing the order of columns or fields in a struct does not change the values associated with a column or field name. + +Iceberg uses unique IDs to track each column in a table. When you add a column, it is assigned a new ID so existing data is never used by mistake. + +* Formats that track columns by name can inadvertently un-delete a column if a name is reused, which violates #1. +* Formats that track columns by position cannot delete columns without changing the names that are used for each column, which violates #2. + + +## Partition evolution + +Iceberg table partitioning can be updated in an existing table because queries do not reference partition values directly. + +When you evolve a partition spec, the old data written with an earlier spec remains unchanged. New data is written using the new spec in a new layout. Metadata for each of the partition versions is kept separately. Because of this, when you start writing queries, you get split planning. This is where each partition layout plans files separately using the filter it derives for that specific partition layout. Here's a visual representation of a contrived example: + +![Partition evolution diagram](assets/images/partition-spec-evolution.png) +*The data for 2008 is partitioned by month. Starting from 2009 the table is updated so that the data is instead partitioned by day. Both partitioning layouts are able to coexist in the same table.* + +Iceberg uses [hidden partitioning](partitioning.md), so you don't *need* to write queries for a specific partition layout to be fast. 
Instead, you can write queries that select the data you need, and Iceberg automatically prunes out files that don't contain matching data. + +Partition evolution is a metadata operation and does not eagerly rewrite files. + +Iceberg's Java table API provides `updateSpec` API to update partition spec. +For example, the following code could be used to update the partition spec to add a new partition field that places `id` column values into 8 buckets and remove an existing partition field `category`: + +```java +Table sampleTable = ...; +sampleTable.updateSpec() + .addField(bucket("id", 8)) + .removeField("category") + .commit(); +``` + +Spark supports updating partition spec through its `ALTER TABLE` SQL statement, see more details in [Spark SQL](spark-ddl.md#alter-table-add-partition-field). + +## Sort order evolution + +Similar to partition spec, Iceberg sort order can also be updated in an existing table. +When you evolve a sort order, the old data written with an earlier order remains unchanged. +Engines can always choose to write data in the latest sort order or unsorted when sorting is prohibitively expensive. + +Iceberg's Java table API provides `replaceSortOrder` API to update sort order. +For example, the following code could be used to create a new sort order +with `id` column sorted in ascending order with nulls last, +and `category` column sorted in descending order with nulls first: + +```java +Table sampleTable = ...; +sampleTable.replaceSortOrder() + .asc("id", NullOrder.NULLS_LAST) + .desc("category", NullOrder.NULLS_FIRST) + .commit(); +``` + +Spark supports updating sort order through its `ALTER TABLE` SQL statement, see more details in [Spark SQL](spark-ddl.md#alter-table-write-ordered-by). 
diff --git a/docs-new/home/docs/latest/flink-actions.md b/docs-new/home/docs/latest/flink-actions.md new file mode 100644 index 000000000000..ca67ef0e5f8f --- /dev/null +++ b/docs-new/home/docs/latest/flink-actions.md @@ -0,0 +1,35 @@ +--- +title: "Flink Actions" +--- + + +## Rewrite files action. + +Iceberg provides API to rewrite small files into large files by submitting Flink batch jobs. The behavior of this Flink action is the same as Spark's [rewriteDataFiles](maintenance.md#compact-data-files). + +```java +import org.apache.iceberg.flink.actions.Actions; + +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path"); +Table table = tableLoader.loadTable(); +RewriteDataFilesActionResult result = Actions.forTable(table) + .rewriteDataFiles() + .execute(); +``` + +For more details of the rewrite files action, please refer to [RewriteDataFilesAction](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/flink/actions/RewriteDataFilesAction.html) diff --git a/docs-new/home/docs/latest/flink-configuration.md b/docs-new/home/docs/latest/flink-configuration.md new file mode 100644 index 000000000000..d3725947d902 --- /dev/null +++ b/docs-new/home/docs/latest/flink-configuration.md @@ -0,0 +1,158 @@ +--- +title: "Flink Configuration" +--- + + +# Flink Configuration + +## Catalog Configuration + +A catalog is created and named by executing the following query (replace `` with your catalog name and +``=`` with catalog implementation config): + +```sql +CREATE CATALOG WITH ( + 'type'='iceberg', + ``=`` +); +``` + +The following properties can be set globally and are not limited to a specific catalog implementation: + +| Property | Required | Values | Description | +| ---------------------------- |----------| -------------------------- |----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| type | ✔️ | iceberg | Must 
be `iceberg`. | +| catalog-type | | `hive`, `hadoop` or `rest` | `hive`, `hadoop` or `rest` for built-in catalogs, or left unset for custom catalog implementations using catalog-impl. | +| catalog-impl | | | The fully-qualified class name of a custom catalog implementation. Must be set if `catalog-type` is unset. | +| property-version | | | Version number to describe the property version. This property can be used for backwards compatibility in case the property format changes. The current property version is `1`. | +| cache-enabled | | `true` or `false` | Whether to enable catalog cache, default value is `true`. | +| cache.expiration-interval-ms | | | How long catalog entries are locally cached, in milliseconds; negative values like `-1` will disable expiration, value 0 is not allowed to set. default value is `-1`. | + +The following properties can be set if using the Hive catalog: + +| Property | Required | Values | Description | +| --------------- |----------| ------ |--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| uri | ✔️ | | The Hive metastore's thrift URI. | +| clients | | | The Hive metastore client pool size, default value is 2. | +| warehouse | | | The Hive warehouse location, users should specify this path if neither set the `hive-conf-dir` to specify a location containing a `hive-site.xml` configuration file nor add a correct `hive-site.xml` to classpath. | +| hive-conf-dir | | | Path to a directory containing a `hive-site.xml` configuration file which will be used to provide custom Hive configuration values. 
The value of `hive.metastore.warehouse.dir` from `/hive-site.xml` (or hive configure file from classpath) will be overwritten with the `warehouse` value if setting both `hive-conf-dir` and `warehouse` when creating iceberg catalog. | +| hadoop-conf-dir | | | Path to a directory containing `core-site.xml` and `hdfs-site.xml` configuration files which will be used to provide custom Hadoop configuration values. | + +The following properties can be set if using the Hadoop catalog: + +| Property | Required | Values | Description | +| --------- |-------------| ------ | ---------------------------------------------------------- | +| warehouse | ✔️ | | The HDFS directory to store metadata files and data files. | + +The following properties can be set if using the REST catalog: + +| Property | Required | Values | Description | +| ---------- |----------| ------ |-----------------------------------------------------------------------------| +| uri | ✔️ | | The URL to the REST Catalog. | +| credential | | | A credential to exchange for a token in the OAuth2 client credentials flow. | +| token | | | A token which will be used to interact with the server. | + + +## Runtime configuration + +### Read options + +Flink read options are passed when configuring the Flink IcebergSource: + +``` +IcebergSource.forRowData() + .tableLoader(TableLoader.fromCatalog(...)) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(true) + .streamingStartingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .startSnapshotId(3821550127947089987L) + .monitorInterval(Duration.ofMillis(10L)) // or .set("monitor-interval", "10s") \ set(FlinkReadOptions.MONITOR_INTERVAL, "10s") + .build() +``` + +For Flink SQL, read options can be passed in via SQL hints like this: + +``` +SELECT * FROM tableName /*+ OPTIONS('monitor-interval'='10s') */ +... +``` + +Options can be passed in via Flink configuration, which will be applied to current session. 
Note that not all options support this mode. + +``` +env.getConfig() + .getConfiguration() + .set(FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION, 1000L); +... +``` + +`Read option` has the highest priority, followed by `Flink configuration` and then `Table property`. + +| Read option | Flink configuration | Table property | Default | Description | +| --------------------------- | --------------------------------------------- | ---------------------------- | -------------------------------- | ------------------------------------------------------------ | +| snapshot-id | N/A | N/A | null | For time travel in batch mode. Read data from the specified snapshot-id. | +| case-sensitive | connector.iceberg.case-sensitive | N/A | false | If true, match column name in a case sensitive way. | +| as-of-timestamp | N/A | N/A | null | For time travel in batch mode. Read data from the most recent snapshot as of the given time in milliseconds. | +| starting-strategy | connector.iceberg.starting-strategy | N/A | INCREMENTAL_FROM_LATEST_SNAPSHOT | Starting strategy for streaming execution. TABLE_SCAN_THEN_INCREMENTAL: Do a regular table scan then switch to the incremental mode. The incremental mode starts from the current snapshot exclusive. INCREMENTAL_FROM_LATEST_SNAPSHOT: Start incremental mode from the latest snapshot inclusive. If it is an empty map, all future append snapshots should be discovered. INCREMENTAL_FROM_EARLIEST_SNAPSHOT: Start incremental mode from the earliest snapshot inclusive. If it is an empty map, all future append snapshots should be discovered. INCREMENTAL_FROM_SNAPSHOT_ID: Start incremental mode from a snapshot with a specific id inclusive. INCREMENTAL_FROM_SNAPSHOT_TIMESTAMP: Start incremental mode from a snapshot with a specific timestamp inclusive. If the timestamp is between two snapshots, it should start from the snapshot after the timestamp. Just for FIP27 Source. 
| +| start-snapshot-timestamp | N/A | N/A | null | Start to read data from the most recent snapshot as of the given time in milliseconds. | +| start-snapshot-id | N/A | N/A | null | Start to read data from the specified snapshot-id. | +| end-snapshot-id | N/A | N/A | The latest snapshot id | Specifies the end snapshot. | +| branch | N/A | N/A | main | Specifies the branch to read from in batch mode | +| tag | N/A | N/A | null | Specifies the tag to read from in batch mode | +| start-tag | N/A | N/A | null | Specifies the starting tag to read from for incremental reads | +| end-tag | N/A | N/A | null | Specifies the ending tag to read from for incremental reads | +| split-size | connector.iceberg.split-size | read.split.target-size | 128 MB | Target size when combining input splits. | +| split-lookback | connector.iceberg.split-lookback | read.split.planning-lookback | 10 | Number of bins to consider when combining input splits. | +| split-file-open-cost | connector.iceberg.split-file-open-cost | read.split.open-file-cost | 4MB | The estimated cost to open a file, used as a minimum weight when combining splits. | +| streaming | connector.iceberg.streaming | N/A | false | Sets whether the current task runs in streaming or batch mode. | +| monitor-interval | connector.iceberg.monitor-interval | N/A | 60s | Monitor interval to discover splits from new snapshots. Applicable only for streaming read. | +| include-column-stats | connector.iceberg.include-column-stats | N/A | false | Create a new scan from this that loads the column stats with each data file. Column stats include: value count, null value count, lower bounds, and upper bounds. | +| max-planning-snapshot-count | connector.iceberg.max-planning-snapshot-count | N/A | Integer.MAX_VALUE | Max number of snapshots limited per split enumeration. Applicable only to streaming read. | +| limit | connector.iceberg.limit | N/A | -1 | Limited output number of rows. 
| + + +### Write options + +Flink write options are passed when configuring the FlinkSink, like this: + +``` +FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .set("write-format", "orc") + .set(FlinkWriteOptions.OVERWRITE_MODE, "true"); +``` + +For Flink SQL, write options can be passed in via SQL hints like this: + +``` +INSERT INTO tableName /*+ OPTIONS('upsert-enabled'='true') */ +... +``` + +| Flink option | Default | Description | +| ---------------------- | ------------------------------------------ | ------------------------------------------------------------ | +| write-format | Table write.format.default | File format to use for this write operation; parquet, avro, or orc | +| target-file-size-bytes | As per table property | Overrides this table's write.target-file-size-bytes | +| upsert-enabled | Table write.upsert.enabled | Overrides this table's write.upsert.enabled | +| overwrite-enabled | false | Overwrite the table's data, overwrite mode shouldn't be enabled when configuring to use UPSERT data stream. 
| +| distribution-mode | Table write.distribution-mode | Overrides this table's write.distribution-mode | +| compression-codec | Table write.(fileformat).compression-codec | Overrides this table's compression codec for this write | +| compression-level | Table write.(fileformat).compression-level | Overrides this table's compression level for Parquet and Avro tables for this write | +| compression-strategy | Table write.orc.compression-strategy | Overrides this table's compression strategy for ORC tables for this write | +| write-parallelism | Upstream operator parallelism | Overrides the writer parallelism | diff --git a/docs-new/home/docs/latest/flink-connector.md b/docs-new/home/docs/latest/flink-connector.md new file mode 100644 index 000000000000..025e9aee92ea --- /dev/null +++ b/docs-new/home/docs/latest/flink-connector.md @@ -0,0 +1,141 @@ +--- +title: "Flink Connector" +--- + + +# Flink Connector +Apache Flink supports creating Iceberg table directly without creating the explicit Flink catalog in Flink SQL. That means we can just create an iceberg table by specifying `'connector'='iceberg'` table option in Flink SQL which is similar to usage in the Flink official [document](https://nightlies.apache.org/flink/flink-docs-release-1.13/docs/connectors/table/overview/). + +In Flink, the SQL `CREATE TABLE test (..) WITH ('connector'='iceberg', ...)` will create a Flink table in current Flink catalog (use [GenericInMemoryCatalog](https://ci.apache.org/projects/flink/flink-docs-release-1.13/docs/dev/table/catalogs/#genericinmemorycatalog) by default), +which is just mapping to the underlying iceberg table instead of maintaining iceberg table directly in current Flink catalog. + +To create the table in Flink SQL by using SQL syntax `CREATE TABLE test (..) WITH ('connector'='iceberg', ...)`, Flink iceberg connector provides the following table properties: + +* `connector`: Use the constant `iceberg`. +* `catalog-name`: User-specified catalog name. 
It's required because the connector doesn't have any default value. +* `catalog-type`: `hive` or `hadoop` for built-in catalogs (defaults to `hive`), or left unset for custom catalog implementations using `catalog-impl`. +* `catalog-impl`: The fully-qualified class name of a custom catalog implementation. Must be set if `catalog-type` is unset. See also [custom catalog](flink.md#adding-catalogs) for more details. +* `catalog-database`: The iceberg database name in the backend catalog, use the current flink database name by default. +* `catalog-table`: The iceberg table name in the backend catalog. Defaults to the table name in the flink `CREATE TABLE` sentence. + +## Table managed in Hive catalog. + +Before executing the following SQL, please make sure you've configured the Flink SQL client correctly according to the [quick start documentation](flink.md). + +The following SQL will create a Flink table in the current Flink catalog, which maps to the iceberg table `default_database.flink_table` managed in iceberg catalog. + +```sql +CREATE TABLE flink_table ( + id BIGINT, + data STRING +) WITH ( + 'connector'='iceberg', + 'catalog-name'='hive_prod', + 'uri'='thrift://localhost:9083', + 'warehouse'='hdfs://nn:8020/path/to/warehouse' +); +``` + +If you want to create a Flink table mapping to a different iceberg table managed in Hive catalog (such as `hive_db.hive_iceberg_table` in Hive), then you can create Flink table as follows: + +```sql +CREATE TABLE flink_table ( + id BIGINT, + data STRING +) WITH ( + 'connector'='iceberg', + 'catalog-name'='hive_prod', + 'catalog-database'='hive_db', + 'catalog-table'='hive_iceberg_table', + 'uri'='thrift://localhost:9083', + 'warehouse'='hdfs://nn:8020/path/to/warehouse' +); +``` + +!!! info + The underlying catalog database (`hive_db` in the above example) will be created automatically if it does not exist when writing records into the Flink table. 
+ + +## Table managed in hadoop catalog + +The following SQL will create a Flink table in current Flink catalog, which maps to the iceberg table `default_database.flink_table` managed in hadoop catalog. + +```sql +CREATE TABLE flink_table ( + id BIGINT, + data STRING +) WITH ( + 'connector'='iceberg', + 'catalog-name'='hadoop_prod', + 'catalog-type'='hadoop', + 'warehouse'='hdfs://nn:8020/path/to/warehouse' +); +``` + +## Table managed in custom catalog + +The following SQL will create a Flink table in current Flink catalog, which maps to the iceberg table `default_database.flink_table` managed in +a custom catalog of type `com.my.custom.CatalogImpl`. + +```sql +CREATE TABLE flink_table ( + id BIGINT, + data STRING +) WITH ( + 'connector'='iceberg', + 'catalog-name'='custom_prod', + 'catalog-impl'='com.my.custom.CatalogImpl', + -- More table properties for the customized catalog + 'my-additional-catalog-config'='my-value', + ... +); +``` + +Please check sections under the Integrations tab for all custom catalogs. + +## A complete example. + +Take the Hive catalog as an example: + +```sql +CREATE TABLE flink_table ( + id BIGINT, + data STRING +) WITH ( + 'connector'='iceberg', + 'catalog-name'='hive_prod', + 'uri'='thrift://localhost:9083', + 'warehouse'='file:///path/to/warehouse' +); + +INSERT INTO flink_table VALUES (1, 'AAA'), (2, 'BBB'), (3, 'CCC'); + +SET execution.result-mode=tableau; +SELECT * FROM flink_table; + ++----+------+ +| id | data | ++----+------+ +| 1 | AAA | +| 2 | BBB | +| 3 | CCC | ++----+------+ +3 rows in set +``` + +For more details, please refer to the Iceberg [Flink documentation](flink.md). 
diff --git a/docs-new/home/docs/latest/flink-ddl.md b/docs-new/home/docs/latest/flink-ddl.md new file mode 100644 index 000000000000..fc44b1b7a54d --- /dev/null +++ b/docs-new/home/docs/latest/flink-ddl.md @@ -0,0 +1,206 @@ +--- +title: "Flink DDL" +--- + + +## DDL commands + +### `CREATE Catalog` + +#### Hive catalog + +This creates an Iceberg catalog named `hive_catalog` that can be configured using `'catalog-type'='hive'`, which loads tables from Hive metastore: + +```sql +CREATE CATALOG hive_catalog WITH ( + 'type'='iceberg', + 'catalog-type'='hive', + 'uri'='thrift://localhost:9083', + 'clients'='5', + 'property-version'='1', + 'warehouse'='hdfs://nn:8020/warehouse/path' +); +``` + +The following properties can be set if using the Hive catalog: + +* `uri`: The Hive metastore's thrift URI. (Required) +* `clients`: The Hive metastore client pool size, default value is 2. (Optional) +* `warehouse`: The Hive warehouse location. Users should specify this path if they neither set `hive-conf-dir` to a location containing a `hive-site.xml` configuration file nor add a correct `hive-site.xml` to the classpath. +* `hive-conf-dir`: Path to a directory containing a `hive-site.xml` configuration file which will be used to provide custom Hive configuration values. The value of `hive.metastore.warehouse.dir` from `<hive-conf-dir>/hive-site.xml` (or the Hive configuration file from the classpath) will be overwritten with the `warehouse` value if both `hive-conf-dir` and `warehouse` are set when creating the iceberg catalog. +* `hadoop-conf-dir`: Path to a directory containing `core-site.xml` and `hdfs-site.xml` configuration files which will be used to provide custom Hadoop configuration values. 
+ +#### Hadoop catalog + +Iceberg also supports a directory-based catalog in HDFS that can be configured using `'catalog-type'='hadoop'`: + +```sql +CREATE CATALOG hadoop_catalog WITH ( + 'type'='iceberg', + 'catalog-type'='hadoop', + 'warehouse'='hdfs://nn:8020/warehouse/path', + 'property-version'='1' +); +``` + +The following properties can be set if using the Hadoop catalog: + +* `warehouse`: The HDFS directory to store metadata files and data files. (Required) + +Execute the sql command `USE CATALOG hadoop_catalog` to set the current catalog. + +#### REST catalog + +This creates an iceberg catalog named `rest_catalog` that can be configured using `'catalog-type'='rest'`, which loads tables from a REST catalog: + +```sql +CREATE CATALOG rest_catalog WITH ( + 'type'='iceberg', + 'catalog-type'='rest', + 'uri'='https://localhost/' +); +``` + +The following properties can be set if using the REST catalog: + +* `uri`: The URL to the REST Catalog (Required) +* `credential`: A credential to exchange for a token in the OAuth2 client credentials flow (Optional) +* `token`: A token which will be used to interact with the server (Optional) + +#### Custom catalog + +Flink also supports loading a custom Iceberg `Catalog` implementation by specifying the `catalog-impl` property: + +```sql +CREATE CATALOG my_catalog WITH ( + 'type'='iceberg', + 'catalog-impl'='com.my.custom.CatalogImpl', + 'my-additional-catalog-config'='my-value' +); +``` + +#### Create through YAML config + +Catalogs can be registered in `sql-client-defaults.yaml` before starting the SQL client. + +```yaml +catalogs: + - name: my_catalog + type: iceberg + catalog-type: hadoop + warehouse: hdfs://nn:8020/warehouse/path +``` + +#### Create through SQL Files + +The Flink SQL Client supports the `-i` startup option to execute an initialization SQL file to set up environment when starting up the SQL Client. 
+ +```sql +-- define available catalogs +CREATE CATALOG hive_catalog WITH ( + 'type'='iceberg', + 'catalog-type'='hive', + 'uri'='thrift://localhost:9083', + 'warehouse'='hdfs://nn:8020/warehouse/path' +); + +USE CATALOG hive_catalog; +``` + +Use the `-i <init.sql>` option to initialize the SQL Client session: + +```bash +/path/to/bin/sql-client.sh -i /path/to/init.sql +``` + +### `CREATE DATABASE` + +By default, Iceberg will use the `default` database in Flink. Use the following example to create a separate database in order to avoid creating tables under the `default` database: + +```sql +CREATE DATABASE iceberg_db; +USE iceberg_db; +``` + +### `CREATE TABLE` + +```sql +CREATE TABLE `hive_catalog`.`default`.`sample` ( + id BIGINT COMMENT 'unique id', + data STRING +); +``` + +Table create commands support the commonly used [Flink create clauses](https://nightlies.apache.org/flink/flink-docs-master/docs/dev/table/sql/create/) including: + +* `PARTITIONED BY (column1, column2, ...)` to configure partitioning. Flink does not yet support hidden partitioning. +* `COMMENT 'table document'` to set a table description. +* `WITH ('key'='value', ...)` to set [table configuration](configuration.md) which will be stored in Iceberg table properties. + +Currently, it does not support computed columns, primary keys, watermark definitions, etc. + +### `PARTITIONED BY` + +To create a partitioned table, use `PARTITIONED BY`: + +```sql +CREATE TABLE `hive_catalog`.`default`.`sample` ( + id BIGINT COMMENT 'unique id', + data STRING +) PARTITIONED BY (data); +``` + +Iceberg supports hidden partitioning, but Flink doesn't support partitioning by a function on columns, so there is no way to support hidden partitioning in Flink DDL. + +### `CREATE TABLE LIKE` + +To create a table with the same schema, partitioning, and table properties as another table, use `CREATE TABLE LIKE`. 
+ +```sql +CREATE TABLE `hive_catalog`.`default`.`sample` ( + id BIGINT COMMENT 'unique id', + data STRING +); + +CREATE TABLE `hive_catalog`.`default`.`sample_like` LIKE `hive_catalog`.`default`.`sample`; +``` + +For more details, refer to the [Flink `CREATE TABLE` documentation](https://nightlies.apache.org/flink/flink-docs-release-1.16/docs/dev/table/sql/create/). + + +### `ALTER TABLE` + +Iceberg only supports altering table properties: + +```sql +ALTER TABLE `hive_catalog`.`default`.`sample` SET ('write.format.default'='avro') +``` + +### `ALTER TABLE .. RENAME TO` + +```sql +ALTER TABLE `hive_catalog`.`default`.`sample` RENAME TO `hive_catalog`.`default`.`new_sample`; +``` + +### `DROP TABLE` + +To delete a table, run: + +```sql +DROP TABLE `hive_catalog`.`default`.`sample`; +``` diff --git a/docs-new/home/docs/latest/flink-queries.md b/docs-new/home/docs/latest/flink-queries.md new file mode 100644 index 000000000000..c33d498587d5 --- /dev/null +++ b/docs-new/home/docs/latest/flink-queries.md @@ -0,0 +1,489 @@ +--- +title: "Flink Queries" +--- + + +# Flink Queries + +Iceberg supports streaming and batch reads with [Apache Flink](https://flink.apache.org/)'s DataStream API and Table API. + +## Reading with SQL + +Iceberg supports both streaming and batch reads in Flink. 
Execute the following sql command to switch execution mode from `streaming` to `batch`, and vice versa: + +```sql +-- Execute the flink job in streaming mode for current session context +SET execution.runtime-mode = streaming; + +-- Execute the flink job in batch mode for current session context +SET execution.runtime-mode = batch; +``` + +### Flink batch read + +Submit a Flink __batch__ job using the following sentences: + +```sql +-- Execute the flink job in batch mode for current session context +SET execution.runtime-mode = batch; +SELECT * FROM sample; +``` + +### Flink streaming read + +Iceberg supports processing incremental data in Flink streaming jobs which starts from a historical snapshot-id: + +```sql +-- Submit the flink job in streaming mode for current session. +SET execution.runtime-mode = streaming; + +-- Enable this switch because streaming read SQL will provide few job options in flink SQL hint options. +SET table.dynamic-table-options.enabled=true; + +-- Read all the records from the iceberg current snapshot, and then read incremental data starting from that snapshot. +SELECT * FROM sample /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/ ; + +-- Read all incremental data starting from the snapshot-id '3821550127947089987' (records from this snapshot will be excluded). +SELECT * FROM sample /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'start-snapshot-id'='3821550127947089987')*/ ; +``` + +There are some options that could be set in Flink SQL hint options for streaming job, see [read options](#read-options) for details. + +### FLIP-27 source for SQL + +Here are the SQL settings for the [FLIP-27](https://cwiki.apache.org/confluence/display/FLINK/FLIP-27%3A+Refactor+Source+Interface) source. All other SQL settings and options documented above are applicable to the FLIP-27 source. + +```sql +-- Opt in the FLIP-27 source. Default is false. 
+SET table.exec.iceberg.use-flip27-source = true; +``` + +### Reading branches and tags with SQL +Branches and tags can be read via SQL by specifying options. For more details +refer to [Flink Configuration](flink-configuration.md#read-options). + +```sql +--- Read from branch b1 +SELECT * FROM table /*+ OPTIONS('branch'='b1') */ ; + +--- Read from tag t1 +SELECT * FROM table /*+ OPTIONS('tag'='t1') */; + +--- Incremental scan from tag t1 to tag t2 +SELECT * FROM table /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'start-tag'='t1', 'end-tag'='t2') */; +``` + +## Reading with DataStream + +Iceberg supports both streaming and batch reads through the Java API. + +### Batch Read + +This example will read all records from an iceberg table and then print them to the stdout console in a flink batch job: + +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path"); +DataStream batch = FlinkSource.forRowData() + .env(env) + .tableLoader(tableLoader) + .streaming(false) + .build(); + +// Print all records to stdout. +batch.print(); + +// Submit and execute this batch read job. +env.execute("Test Iceberg Batch Read"); +``` + +### Streaming read + +This example will read incremental records which start from snapshot-id '3821550127947089987' and print them to the stdout console in a flink streaming job: + +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path"); +DataStream stream = FlinkSource.forRowData() + .env(env) + .tableLoader(tableLoader) + .streaming(true) + .startSnapshotId(3821550127947089987L) + .build(); + +// Print all records to stdout. +stream.print(); + +// Submit and execute this streaming read job. 
+env.execute("Test Iceberg Streaming Read"); +``` + +There are other options that can be set, please see the [FlinkSource#Builder](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/flink/source/FlinkSource.html). + +## Reading with DataStream (FLIP-27 source) + +[FLIP-27 source interface](https://cwiki.apache.org/confluence/display/FLINK/FLIP-27%3A+Refactor+Source+Interface) +was introduced in Flink 1.12. It aims to solve several shortcomings of the old `SourceFunction` +streaming source interface. It also unifies the source interfaces for both batch and streaming executions. +Most source connectors (like Kafka, file) in Flink repo have migrated to the FLIP-27 interface. +Flink is planning to deprecate the old `SourceFunction` interface in the near future. + +A FLIP-27 based Flink `IcebergSource` is added in `iceberg-flink` module. The FLIP-27 `IcebergSource` is currently an experimental feature. + +### Batch Read + +This example will read all records from iceberg table and then print to the stdout console in flink batch job: + +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path"); + +IcebergSource source = IcebergSource.forRowData() + .tableLoader(tableLoader) + .assignerFactory(new SimpleSplitAssignerFactory()) + .build(); + +DataStream batch = env.fromSource( + source, + WatermarkStrategy.noWatermarks(), + "My Iceberg Source", + TypeInformation.of(RowData.class)); + +// Print all records to stdout. +batch.print(); + +// Submit and execute this batch read job. +env.execute("Test Iceberg Batch Read"); +``` + +### Streaming read + +This example will start the streaming read from the latest table snapshot (inclusive). +Every 60s, it polls Iceberg table to discover new append-only snapshots. +CDC read is not supported yet. 
+ +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path"); + +IcebergSource source = IcebergSource.forRowData() + .tableLoader(tableLoader) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(true) + .streamingStartingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .monitorInterval(Duration.ofSeconds(60)) + .build() + +DataStream stream = env.fromSource( + source, + WatermarkStrategy.noWatermarks(), + "My Iceberg Source", + TypeInformation.of(RowData.class)); + +// Print all records to stdout. +stream.print(); + +// Submit and execute this streaming read job. +env.execute("Test Iceberg Streaming Read"); +``` + +Other options can be set through the Java API; please see the +[IcebergSource#Builder](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/flink/source/IcebergSource.html). + +### Reading branches and tags with DataStream +Branches and tags can also be read via the DataStream API. + +```java +StreamExecutionEnvironment env = StreamExecutionEnvironment.createLocalEnvironment(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path"); +// Read from branch +DataStream batch = FlinkSource.forRowData() + .env(env) + .tableLoader(tableLoader) + .branch("test-branch") + .streaming(false) + .build(); + +// Read from tag +DataStream batch = FlinkSource.forRowData() + .env(env) + .tableLoader(tableLoader) + .tag("test-tag") + .streaming(false) + .build(); + +// Streaming read from start-tag +DataStream batch = FlinkSource.forRowData() + .env(env) + .tableLoader(tableLoader) + .streaming(true) + .startTag("test-tag") + .build(); +``` + +### Read as Avro GenericRecord + +FLIP-27 Iceberg source provides `AvroGenericRecordReaderFunction` that converts +Flink `RowData` to Avro `GenericRecord`. 
You can use the converter to read from +an Iceberg table as an Avro GenericRecord DataStream. + +Please make sure `flink-avro` jar is included in the classpath. +Also `iceberg-flink-runtime` shaded bundle jar can't be used +because the runtime jar shades the avro package. +Please use non-shaded `iceberg-flink` jar instead. + +```java +TableLoader tableLoader = ...; +Table table; +try (TableLoader loader = tableLoader) { + loader.open(); + table = loader.loadTable(); +} + +AvroGenericRecordReaderFunction readerFunction = AvroGenericRecordReaderFunction.fromTable(table); + +IcebergSource source = + IcebergSource.builder() + .tableLoader(tableLoader) + .readerFunction(readerFunction) + .assignerFactory(new SimpleSplitAssignerFactory()) + ... + .build(); + +DataStream stream = env.fromSource(source, WatermarkStrategy.noWatermarks(), + "Iceberg Source as Avro GenericRecord", new GenericRecordAvroTypeInfo(avroSchema)); +``` + +## Options + +### Read options + +Flink read options are passed when configuring the Flink IcebergSource: + +``` +IcebergSource.forRowData() + .tableLoader(TableLoader.fromCatalog(...)) + .assignerFactory(new SimpleSplitAssignerFactory()) + .streaming(true) + .streamingStartingStrategy(StreamingStartingStrategy.INCREMENTAL_FROM_LATEST_SNAPSHOT) + .startSnapshotId(3821550127947089987L) + .monitorInterval(Duration.ofMillis(10L)) // or .set("monitor-interval", "10s") \ set(FlinkReadOptions.MONITOR_INTERVAL, "10s") + .build() +``` + +For Flink SQL, read options can be passed in via SQL hints like this: + +``` +SELECT * FROM tableName /*+ OPTIONS('monitor-interval'='10s') */ +... +``` + +Options can be passed in via Flink configuration, which will be applied to the current session. Note that not all options support this mode. + +``` +env.getConfig() + .getConfiguration() + .set(FlinkReadOptions.SPLIT_FILE_OPEN_COST_OPTION, 1000L); +... 
+``` + +Check out all the options here: [read-options](flink-configuration.md#read-options) + +## Inspecting tables + +To inspect a table's history, snapshots, and other metadata, Iceberg supports metadata tables. + +Metadata tables are identified by adding the metadata table name after the original table name. For example, history for `db.table` is read using `db.table$history`. + +### History + +To show table history: + +```sql +SELECT * FROM prod.db.table$history; +``` + +| made_current_at | snapshot_id | parent_id | is_current_ancestor | +| ----------------------- | ------------------- | ------------------- | ------------------- | +| 2019-02-08 03:29:51.215 | 5781947118336215154 | NULL | true | +| 2019-02-08 03:47:55.948 | 5179299526185056830 | 5781947118336215154 | true | +| 2019-02-09 16:24:30.13 | 296410040247533544 | 5179299526185056830 | false | +| 2019-02-09 16:32:47.336 | 2999875608062437330 | 5179299526185056830 | true | +| 2019-02-09 19:42:03.919 | 8924558786060583479 | 2999875608062437330 | true | +| 2019-02-09 19:49:16.343 | 6536733823181975045 | 8924558786060583479 | true | + +!!! info + **This shows a commit that was rolled back.** In this example, snapshot 296410040247533544 and 2999875608062437330 have the same parent snapshot 5179299526185056830. Snapshot 296410040247533544 was rolled back and is *not* an ancestor of the current table state. 
+ + +### Metadata Log Entries + +To show table metadata log entries: + +```sql +SELECT * from prod.db.table$metadata_log_entries; +``` + +| timestamp | file | latest_snapshot_id | latest_schema_id | latest_sequence_number | +| ----------------------- | ------------------------------------------------------------ | ------------------ | ---------------- | ---------------------- | +| 2022-07-28 10:43:52.93 | s3://.../table/metadata/00000-9441e604-b3c2-498a-a45a-6320e8ab9006.metadata.json | null | null | null | +| 2022-07-28 10:43:57.487 | s3://.../table/metadata/00001-f30823df-b745-4a0a-b293-7532e0c99986.metadata.json | 170260833677645300 | 0 | 1 | +| 2022-07-28 10:43:58.25 | s3://.../table/metadata/00002-2cc2837a-02dc-4687-acc1-b4d86ea486f4.metadata.json | 958906493976709774 | 0 | 2 | + +### Snapshots + +To show the valid snapshots for a table: + +```sql +SELECT * FROM prod.db.table$snapshots; +``` + +| committed_at | snapshot_id | parent_id | operation | manifest_list | summary | +| ----------------------- | -------------- | --------- | --------- | -------------------------------------------------- | ------------------------------------------------------------ | +| 2019-02-08 03:29:51.215 | 57897183625154 | null | append | s3://.../table/metadata/snap-57897183625154-1.avro | { added-records -> 2478404, total-records -> 2478404, added-data-files -> 438, total-data-files -> 438, flink.job-id -> 2e274eecb503d85369fb390e8956c813 } | + +You can also join snapshots to table history. 
For example, this query will show table history, with the application ID that wrote each snapshot: + +```sql +select + h.made_current_at, + s.operation, + h.snapshot_id, + h.is_current_ancestor, + s.summary['flink.job-id'] +from prod.db.table$history h +join prod.db.table$snapshots s + on h.snapshot_id = s.snapshot_id +order by made_current_at +``` + +| made_current_at | operation | snapshot_id | is_current_ancestor | summary[flink.job-id] | +| ----------------------- | --------- | -------------- | ------------------- | -------------------------------- | +| 2019-02-08 03:29:51.215 | append | 57897183625154 | true | 2e274eecb503d85369fb390e8956c813 | + +### Files + +To show a table's current data files: + +```sql +SELECT * FROM prod.db.table$files; +``` + +| content | file_path | file_format | spec_id | partition | record_count | file_size_in_bytes | column_sizes | value_counts | null_value_counts | nan_value_counts | lower_bounds | upper_bounds | key_metadata | split_offsets | equality_ids | sort_order_id | +| ------- | ------------------------------------------------------------ | ----------- | ------- | ---------------- | ------------ | ------------------ | ------------------ | ---------------- | ----------------- | ---------------- | --------------- | --------------- | ------------ | ------------- | ------------ | ------------- | +| 0 | s3:/.../table/data/00000-3-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 01} | 1 | 597 | [1 -> 90, 2 -> 62] | [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> c] | [1 -> , 2 -> c] | null | [4] | null | null | +| 0 | s3:/.../table/data/00001-4-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 02} | 1 | 597 | [1 -> 90, 2 -> 62] | [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> b] | [1 -> , 2 -> b] | null | [4] | null | null | +| 0 | s3:/.../table/data/00002-5-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 03} | 1 | 597 | [1 -> 90, 2 -> 62] 
| [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> a] | [1 -> , 2 -> a] | null | [4] | null | null | + +### Manifests + +To show a table's current file manifests: + +```sql +SELECT * FROM prod.db.table$manifests; +``` + +| path | length | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count | partition_summaries | +| ------------------------------------------------------------ | ------ | ----------------- | ------------------- | ---------------------- | ------------------------- | ------------------------ | ------------------------------------ | +| s3://.../table/metadata/45b5290b-ee61-4788-b324-b1e2735c0e10-m0.avro | 4479 | 0 | 6668963634911763636 | 8 | 0 | 0 | [[false,null,2019-05-13,2019-05-15]] | + +Note: + +1. Fields within `partition_summaries` column of the manifests table correspond to `field_summary` structs within [manifest list](../../spec.md#manifest-lists), with the following order: + - `contains_null` + - `contains_nan` + - `lower_bound` + - `upper_bound` +2. `contains_nan` could return null, which indicates that this information is not available from the file's metadata. + This usually occurs when reading from V1 table, where `contains_nan` is not populated. + +### Partitions + +To show a table's current partitions: + +```sql +SELECT * FROM prod.db.table$partitions; +``` + +| partition | record_count | file_count | spec_id | +| -------------- | ------------ | ---------- | ------- | +| {20211001, 11} | 1 | 1 | 0 | +| {20211002, 11} | 1 | 1 | 0 | +| {20211001, 10} | 1 | 1 | 0 | +| {20211002, 10} | 1 | 1 | 0 | + +Note: +For unpartitioned tables, the partitions table will contain only the record_count and file_count columns. + +### All Metadata Tables + +These tables are unions of the metadata tables specific to the current snapshot, and return metadata across all snapshots. + +!!! 
danger + The "all" metadata tables may produce more than one row per data file or manifest file because metadata files may be part of more than one table snapshot. + + +#### All Data Files + +To show all of the table's data files and each file's metadata: + +```sql +SELECT * FROM prod.db.table$all_data_files; +``` + +| content | file_path | file_format | partition | record_count | file_size_in_bytes | column_sizes | value_counts | null_value_counts | nan_value_counts | lower_bounds | upper_bounds | key_metadata | split_offsets | equality_ids | sort_order_id | +| ------- | ------------------------------------------------------------ | ----------- | ---------- | ------------ | ------------------ | ------------------ | ------------------ | ----------------- | ---------------- | ----------------------- | ----------------------- | ------------ | ------------- | ------------ | ------------- | +| 0 | s3://.../dt=20210102/00000-0-756e2512-49ae-45bb-aae3-c0ca475e7879-00001.parquet | PARQUET | {20210102} | 14 | 2444 | {1 -> 94, 2 -> 17} | {1 -> 14, 2 -> 14} | {1 -> 0, 2 -> 0} | {} | {1 -> 1, 2 -> 20210102} | {1 -> 2, 2 -> 20210102} | null | [4] | null | 0 | +| 0 | s3://.../dt=20210103/00000-0-26222098-032f-472b-8ea5-651a55b21210-00001.parquet | PARQUET | {20210103} | 14 | 2444 | {1 -> 94, 2 -> 17} | {1 -> 14, 2 -> 14} | {1 -> 0, 2 -> 0} | {} | {1 -> 1, 2 -> 20210103} | {1 -> 3, 2 -> 20210103} | null | [4] | null | 0 | +| 0 | s3://.../dt=20210104/00000-0-a3bb1927-88eb-4f1c-bc6e-19076b0d952e-00001.parquet | PARQUET | {20210104} | 14 | 2444 | {1 -> 94, 2 -> 17} | {1 -> 14, 2 -> 14} | {1 -> 0, 2 -> 0} | {} | {1 -> 1, 2 -> 20210104} | {1 -> 3, 2 -> 20210104} | null | [4] | null | 0 | + +#### All Manifests + +To show all of the table's manifest files: + +```sql +SELECT * FROM prod.db.table$all_manifests; +``` + +| path | length | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count | partition_summaries | +| 
------------------------------------------------------------ | ------ | ----------------- | ------------------- | ---------------------- | ------------------------- | ------------------------ | ------------------------------------ | +| s3://.../metadata/a85f78c5-3222-4b37-b7e4-faf944425d48-m0.avro | 6376 | 0 | 6272782676904868561 | 2 | 0 | 0 | [{false, false, 20210101, 20210101}] | + +Note: + +1. Fields within `partition_summaries` column of the manifests table correspond to `field_summary` structs within [manifest list](../../spec.md#manifest-lists), with the following order: + - `contains_null` + - `contains_nan` + - `lower_bound` + - `upper_bound` +2. `contains_nan` could return null, which indicates that this information is not available from the file's metadata. + This usually occurs when reading from V1 table, where `contains_nan` is not populated. + +### References + +To show a table's known snapshot references: + +```sql +SELECT * FROM prod.db.table$refs; +``` + +| name | type | snapshot_id | max_reference_age_in_ms | min_snapshots_to_keep | max_snapshot_age_in_ms | +| ------- | ------ | ------------------- | ----------------------- | --------------------- | ---------------------- | +| main | BRANCH | 4686954189838128572 | 10 | 20 | 30 | +| testTag | TAG | 4686954189838128572 | 10 | null | null | + diff --git a/docs-new/home/docs/latest/flink-writes.md b/docs-new/home/docs/latest/flink-writes.md new file mode 100644 index 000000000000..9c45f805f29c --- /dev/null +++ b/docs-new/home/docs/latest/flink-writes.md @@ -0,0 +1,265 @@ +--- +title: "Flink Writes" +--- + +# Flink Writes + +Iceberg supports batch and streaming writes with [Apache Flink](https://flink.apache.org/)'s DataStream API and Table API. + +## Writing with SQL + +Iceberg supports both `INSERT INTO` and `INSERT OVERWRITE`. 
+ +### `INSERT INTO` + +To append new data to a table with a Flink streaming job, use `INSERT INTO`: + +```sql +INSERT INTO `hive_catalog`.`default`.`sample` VALUES (1, 'a'); +INSERT INTO `hive_catalog`.`default`.`sample` SELECT id, data from other_kafka_table; +``` + +### `INSERT OVERWRITE` + +To replace data in the table with the result of a query, use `INSERT OVERWRITE` in a batch job (Flink streaming jobs do not support `INSERT OVERWRITE`). Overwrites are atomic operations for Iceberg tables. + +Partitions that have rows produced by the SELECT query will be replaced, for example: + +```sql +INSERT OVERWRITE sample VALUES (1, 'a'); +``` + +Iceberg also supports overwriting given partitions by the `select` values: + +```sql +INSERT OVERWRITE `hive_catalog`.`default`.`sample` PARTITION(data='a') SELECT 6; +``` + +For a partitioned iceberg table, when all the partition columns are set to a value in the `PARTITION` clause, it is inserting into a static partition; otherwise, if only some partition columns (a prefix of all partition columns) are set to a value in the `PARTITION` clause, it is writing the query result into a dynamic partition. +For an unpartitioned iceberg table, its data will be completely overwritten by `INSERT OVERWRITE`. + +### `UPSERT` + +Iceberg supports `UPSERT` based on the primary key when writing data into v2 table format. There are two ways to enable upsert. + +1. Enable the `UPSERT` mode as table-level property `write.upsert.enabled`. Here is an example SQL statement to set the table property when creating a table. It would be applied for all write paths to this table (batch or streaming) unless overridden by write options as described later. + +```sql +CREATE TABLE `hive_catalog`.`default`.`sample` ( + `id` INT UNIQUE COMMENT 'unique id', + `data` STRING NOT NULL, + PRIMARY KEY(`id`) NOT ENFORCED +) with ('format-version'='2', 'write.upsert.enabled'='true'); +``` + +2. 
Enabling `UPSERT` mode using `upsert-enabled` in the [write options](#write-options) provides more flexibility than a table level config. Note that you still need to use v2 table format and specify the primary key when creating the table. + +```sql +INSERT INTO tableName /*+ OPTIONS('upsert-enabled'='true') */ +... +``` + +!!! info + OVERWRITE and UPSERT can't be set together. In UPSERT mode, if the table is partitioned, the partition fields should be included in equality fields. + + + + +## Writing with DataStream + +Iceberg supports writing to Iceberg tables from different DataStream inputs. + + +### Appending data. + +Flink supports writing `DataStream<RowData>` and `DataStream<Row>` to the sink iceberg table natively. + +```java +StreamExecutionEnvironment env = ...; + +DataStream input = ... ; +Configuration hadoopConf = new Configuration(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path", hadoopConf); + +FlinkSink.forRowData(input) + .tableLoader(tableLoader) + .append(); + +env.execute("Test Iceberg DataStream"); +``` + +The iceberg API also allows users to write a generic `DataStream<T>` to an iceberg table. More examples can be found in this [unit test](https://github.com/apache/iceberg/blob/master/flink/v1.16/flink/src/test/java/org/apache/iceberg/flink/sink/TestFlinkIcebergSink.java). + +### Overwrite data + +Set the `overwrite` flag in FlinkSink builder to overwrite the data in existing iceberg tables: + +```java +StreamExecutionEnvironment env = ...; + +DataStream input = ... ; +Configuration hadoopConf = new Configuration(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path", hadoopConf); + +FlinkSink.forRowData(input) + .tableLoader(tableLoader) + .overwrite(true) + .append(); + +env.execute("Test Iceberg DataStream"); +``` + +### Upsert data + +Set the `upsert` flag in FlinkSink builder to upsert the data in existing iceberg table. The table must use v2 table format and have a primary key. 
+ +```java +StreamExecutionEnvironment env = ...; + +DataStream input = ... ; +Configuration hadoopConf = new Configuration(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path", hadoopConf); + +FlinkSink.forRowData(input) + .tableLoader(tableLoader) + .upsert(true) + .append(); + +env.execute("Test Iceberg DataStream"); +``` + +!!! info + OVERWRITE and UPSERT can't be set together. In UPSERT mode, if the table is partitioned, the partition fields should be included in equality fields. + + +### Write with Avro GenericRecord + +Flink Iceberg sink provides `AvroGenericRecordToRowDataMapper` that converts +Avro `GenericRecord` to Flink `RowData`. You can use the mapper to write +Avro GenericRecord DataStream to Iceberg. + +Please make sure `flink-avro` jar is included in the classpath. +Also `iceberg-flink-runtime` shaded bundle jar can't be used +because the runtime jar shades the avro package. +Please use non-shaded `iceberg-flink` jar instead. + +```java +DataStream dataStream = ...; + +Schema icebergSchema = table.schema(); + + +// The Avro schema converted from Iceberg schema can't be used +// due to precision difference between how Iceberg schema (micro) +// and Flink AvroToRowDataConverters (milli) deal with time type. +// Instead, use the Avro schema defined directly. +// See AvroGenericRecordToRowDataMapper Javadoc for more details. 
+org.apache.avro.Schema avroSchema = AvroSchemaUtil.convert(icebergSchema, table.name()); + +GenericRecordAvroTypeInfo avroTypeInfo = new GenericRecordAvroTypeInfo(avroSchema); +RowType rowType = FlinkSchemaUtil.convert(icebergSchema); + +FlinkSink.builderFor( + dataStream, + AvroGenericRecordToRowDataMapper.forAvroSchema(avroSchema), + FlinkCompatibilityUtil.toTypeInfo(rowType)) + .table(table) + .tableLoader(tableLoader) + .append(); +``` + +### Branch Writes +Writing to branches in Iceberg tables is also supported via the `toBranch` API in `FlinkSink`. +For more information on branches please refer to [branches](branching.md). +```java +FlinkSink.forRowData(input) + .tableLoader(tableLoader) + .toBranch("audit-branch") + .append(); +``` + +### Metrics + +The following Flink metrics are provided by the Flink Iceberg sink. + +Parallel writer metrics are added under the sub group of `IcebergStreamWriter`. +They should have the following key-value tags. + +* table: full table name (like iceberg.my_db.my_table) +* subtask_index: writer subtask index starting from 0 + +| Metric name | Metric type | Description | +| ------------------------- |------------|-----------------------------------------------------------------------------------------------------| +| lastFlushDurationMs | Gauge | The duration (in milliseconds) that writer subtasks take to flush and upload the files during checkpoint. | +| flushedDataFiles | Counter | Number of data files flushed and uploaded. | +| flushedDeleteFiles | Counter | Number of delete files flushed and uploaded. | +| flushedReferencedDataFiles| Counter | Number of data files referenced by the flushed delete files. | +| dataFilesSizeHistogram | Histogram | Histogram distribution of data file sizes (in bytes). | +| deleteFilesSizeHistogram | Histogram | Histogram distribution of delete file sizes (in bytes). | + +Committer metrics are added under the sub group of `IcebergFilesCommitter`. +They should have the following key-value tags. 
+ +* table: full table name (like iceberg.my_db.my_table) + +| Metric name | Metric type | Description | +|---------------------------------|--------|----------------------------------------------------------------------------| +| lastCheckpointDurationMs | Gauge | The duration (in milliseconds) that the committer operator takes to checkpoint its state. | +| lastCommitDurationMs | Gauge | The duration (in milliseconds) that the Iceberg table commit takes. | +| committedDataFilesCount | Counter | Number of data files committed. | +| committedDataFilesRecordCount | Counter | Number of records contained in the committed data files. | +| committedDataFilesByteCount | Counter | Number of bytes contained in the committed data files. | +| committedDeleteFilesCount | Counter | Number of delete files committed. | +| committedDeleteFilesRecordCount | Counter | Number of records contained in the committed delete files. | +| committedDeleteFilesByteCount | Counter | Number of bytes contained in the committed delete files. | +| elapsedSecondsSinceLastSuccessfulCommit| Gauge | Elapsed time (in seconds) since last successful Iceberg commit. | + +`elapsedSecondsSinceLastSuccessfulCommit` is an ideal alerting metric +to detect failed or missing Iceberg commits. + +* The Iceberg commit happens after a successful Flink checkpoint, in the `notifyCheckpointComplete` callback. + It could happen that Iceberg commits fail (for whatever reason) while Flink checkpoints succeed. +* It could also happen that `notifyCheckpointComplete` wasn't triggered (because of a bug). + As a result, there won't be any Iceberg commits attempted. + +If the checkpoint interval (and expected Iceberg commit interval) is 5 minutes, set up an alert with a rule like `elapsedSecondsSinceLastSuccessfulCommit > 60 minutes` to detect failed or missing Iceberg commits in the past hour. 
+ + + +## Options + +### Write options + +Flink write options are passed when configuring the FlinkSink, like this: + +```java +FlinkSink.Builder builder = FlinkSink.forRow(dataStream, SimpleDataUtil.FLINK_SCHEMA) + .table(table) + .tableLoader(tableLoader) + .set("write-format", "orc") + .set(FlinkWriteOptions.OVERWRITE_MODE, "true"); +``` + +For Flink SQL, write options can be passed in via SQL hints like this: + +```sql +INSERT INTO tableName /*+ OPTIONS('upsert-enabled'='true') */ +... +``` + +Check out all the options here: [write-options](flink-configuration.md#write-options) diff --git a/docs-new/home/docs/latest/flink.md b/docs-new/home/docs/latest/flink.md new file mode 100644 index 000000000000..a906fd8db7b0 --- /dev/null +++ b/docs-new/home/docs/latest/flink.md @@ -0,0 +1,393 @@ +--- +title: "Flink Getting Started" +--- + + +# Flink + +Apache Iceberg supports both [Apache Flink](https://flink.apache.org/)'s DataStream API and Table API. See the [Multi-Engine Support#apache-flink](../../multi-engine-support.md#apache-flink) page for the integration of Apache Flink. 
+ +| Feature support | Flink | Notes | +| ----------------------------------------------------------- |-------|----------------------------------------------------------------------------------------| +| [SQL create catalog](#creating-catalogs-and-using-catalogs) | ✔️ | | +| [SQL create database](#create-database) | ✔️ | | +| [SQL create table](#create-table) | ✔️ | | +| [SQL create table like](#create-table-like) | ✔️ | | +| [SQL alter table](#alter-table) | ✔️ | Only support altering table properties, column and partition changes are not supported | +| [SQL drop_table](#drop-table) | ✔️ | | +| [SQL select](#querying-with-sql) | ✔️ | Support both streaming and batch mode | +| [SQL insert into](#insert-into) | ✔️ ️ | Support both streaming and batch mode | +| [SQL insert overwrite](#insert-overwrite) | ✔️ ️ | | +| [DataStream read](#reading-with-datastream) | ✔️ ️ | | +| [DataStream append](#appending-data) | ✔️ ️ | | +| [DataStream overwrite](#overwrite-data) | ✔️ ️ | | +| [Metadata tables](#inspecting-tables) | ✔️ | | +| [Rewrite files action](#rewrite-files-action) | ✔️ ️ | | + +## Preparation when using Flink SQL Client + +To create Iceberg table in Flink, it is recommended to use [Flink SQL Client](https://ci.apache.org/projects/flink/flink-docs-stable/dev/table/sqlClient.html) as it's easier for users to understand the concepts. + +Download Flink from the [Apache download page](https://flink.apache.org/downloads.html). Iceberg uses Scala 2.12 when compiling the Apache `iceberg-flink-runtime` jar, so it's recommended to use Flink 1.16 bundled with Scala 2.12. 
+ +```bash +FLINK_VERSION=1.16.1 +SCALA_VERSION=2.12 +APACHE_FLINK_URL=https://archive.apache.org/dist/flink/ +wget ${APACHE_FLINK_URL}/flink-${FLINK_VERSION}/flink-${FLINK_VERSION}-bin-scala_${SCALA_VERSION}.tgz +tar xzvf flink-${FLINK_VERSION}-bin-scala_${SCALA_VERSION}.tgz +``` + +Start a standalone Flink cluster within Hadoop environment: + +```bash +# HADOOP_HOME is your hadoop root directory after unpack the binary package. +APACHE_HADOOP_URL=https://archive.apache.org/dist/hadoop/ +HADOOP_VERSION=2.8.5 +wget ${APACHE_HADOOP_URL}/common/hadoop-${HADOOP_VERSION}/hadoop-${HADOOP_VERSION}.tar.gz +tar xzvf hadoop-${HADOOP_VERSION}.tar.gz +HADOOP_HOME=`pwd`/hadoop-${HADOOP_VERSION} + +export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath` + +# Start the flink standalone cluster +./bin/start-cluster.sh +``` + +Start the Flink SQL client. There is a separate `flink-runtime` module in the Iceberg project to generate a bundled jar, which could be loaded by Flink SQL client directly. To build the `flink-runtime` bundled jar manually, build the `iceberg` project, and it will generate the jar under `/flink-runtime/build/libs`. Or download the `flink-runtime` jar from the [Apache repository](https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.16/{{ icebergVersion }}/). + +```bash +# HADOOP_HOME is your hadoop root directory after unpack the binary package. +export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath` + +./bin/sql-client.sh embedded -j /iceberg-flink-runtime-1.16-{{ icebergVersion }}.jar shell +``` + +By default, Iceberg ships with Hadoop jars for Hadoop catalog. To use Hive catalog, load the Hive jars when opening the Flink SQL client. Fortunately, Flink has provided a [bundled hive jar](https://repo.maven.apache.org/maven2/org/apache/flink/flink-sql-connector-hive-2.3.9_2.12/1.16.1/flink-sql-connector-hive-2.3.9_2.12-1.16.1.jar) for the SQL client. 
An example of how to download the dependencies and get started: + +```bash +# HADOOP_HOME is your hadoop root directory after unpack the binary package. +export HADOOP_CLASSPATH=`$HADOOP_HOME/bin/hadoop classpath` + +ICEBERG_VERSION={{ icebergVersion }} +MAVEN_URL=https://repo1.maven.org/maven2 +ICEBERG_MAVEN_URL=${MAVEN_URL}/org/apache/iceberg +ICEBERG_PACKAGE=iceberg-flink-runtime +wget ${ICEBERG_MAVEN_URL}/${ICEBERG_PACKAGE}-${FLINK_VERSION_MAJOR}/${ICEBERG_VERSION}/${ICEBERG_PACKAGE}-${FLINK_VERSION_MAJOR}-${ICEBERG_VERSION}.jar -P lib/ + +HIVE_VERSION=2.3.9 +SCALA_VERSION=2.12 +FLINK_VERSION=1.16.1 +FLINK_CONNECTOR_URL=${MAVEN_URL}/org/apache/flink +FLINK_CONNECTOR_PACKAGE=flink-sql-connector-hive +wget ${FLINK_CONNECTOR_URL}/${FLINK_CONNECTOR_PACKAGE}-${HIVE_VERSION}_${SCALA_VERSION}/${FLINK_VERSION}/${FLINK_CONNECTOR_PACKAGE}-${HIVE_VERSION}_${SCALA_VERSION}-${FLINK_VERSION}.jar + +./bin/sql-client.sh embedded shell +``` + +## Flink's Python API + +!!! info + PyFlink 1.16.1 [does not work on macOS with an M1 CPU](https://issues.apache.org/jira/browse/FLINK-28786) + + +Install the Apache Flink dependency using `pip`: + +```bash +pip install apache-flink==1.16.1 +``` + +Provide a `file://` path to the `iceberg-flink-runtime` jar, which can be obtained by building the project and looking at `<iceberg-root-dir>/flink-runtime/build/libs`, or downloading it from the [Apache official repository](https://repo.maven.apache.org/maven2/org/apache/iceberg/iceberg-flink-runtime/). Third-party jars can be added to `pyflink` via: + +- `env.add_jars("file:///my/jar/path/connector.jar")` +- `table_env.get_config().get_configuration().set_string("pipeline.jars", "file:///my/jar/path/connector.jar")` + +This is also mentioned in the official [docs](https://ci.apache.org/projects/flink/flink-docs-release-1.16/docs/dev/python/dependency_management/). 
The example below uses `env.add_jars(..)`: + +```python +import os + +from pyflink.datastream import StreamExecutionEnvironment + +env = StreamExecutionEnvironment.get_execution_environment() +iceberg_flink_runtime_jar = os.path.join(os.getcwd(), "iceberg-flink-runtime-1.16-{{ icebergVersion }}.jar") + +env.add_jars("file://{}".format(iceberg_flink_runtime_jar)) +``` + +Next, create a `StreamTableEnvironment` and execute Flink SQL statements. The below example shows how to create a custom catalog via the Python Table API: + +```python +from pyflink.table import StreamTableEnvironment +table_env = StreamTableEnvironment.create(env) +table_env.execute_sql(""" +CREATE CATALOG my_catalog WITH ( + 'type'='iceberg', + 'catalog-impl'='com.my.custom.CatalogImpl', + 'my-additional-catalog-config'='my-value' +) +""") +``` + +Run a query: + +```python +(table_env + .sql_query("SELECT PULocationID, DOLocationID, passenger_count FROM my_catalog.nyc.taxis LIMIT 5") + .execute() + .print()) +``` + +``` ++----+----------------------+----------------------+--------------------------------+ +| op | PULocationID | DOLocationID | passenger_count | ++----+----------------------+----------------------+--------------------------------+ +| +I | 249 | 48 | 1.0 | +| +I | 132 | 233 | 1.0 | +| +I | 164 | 107 | 1.0 | +| +I | 90 | 229 | 1.0 | +| +I | 137 | 249 | 1.0 | ++----+----------------------+----------------------+--------------------------------+ +5 rows in set +``` + +For more details, please refer to the [Python Table API](https://ci.apache.org/projects/flink/flink-docs-release-1.16/docs/dev/python/table/intro_to_table_api/). + +## Adding catalogs. + +Flink support to create catalogs by using Flink SQL. 
+ +### Catalog Configuration + +A catalog is created and named by executing the following query (replace `<catalog_name>` with your catalog name and +`<config_key>`=`<config_value>` with catalog implementation config): + +```sql +CREATE CATALOG <catalog_name> WITH ( + 'type'='iceberg', + `<config_key>`=`<config_value>` +); +``` + +The following properties can be set globally and are not limited to a specific catalog implementation: + +* `type`: Must be `iceberg`. (Required) +* `catalog-type`: `hive`, `hadoop` or `rest` for built-in catalogs, or left unset for custom catalog implementations using `catalog-impl`. (Optional) +* `catalog-impl`: The fully-qualified class name of a custom catalog implementation. Must be set if `catalog-type` is unset. (Optional) +* `property-version`: Version number to describe the property version. This property can be used for backwards compatibility in case the property format changes. The current property version is `1`. (Optional) +* `cache-enabled`: Whether to enable catalog cache, default value is `true`. (Optional) +* `cache.expiration-interval-ms`: How long catalog entries are locally cached, in milliseconds; negative values like `-1` will disable expiration, and a value of 0 is not allowed. The default value is `-1`. (Optional) + +### Hive catalog + +This creates an Iceberg catalog named `hive_catalog` that can be configured using `'catalog-type'='hive'`, which loads tables from Hive metastore: + +```sql +CREATE CATALOG hive_catalog WITH ( + 'type'='iceberg', + 'catalog-type'='hive', + 'uri'='thrift://localhost:9083', + 'clients'='5', + 'property-version'='1', + 'warehouse'='hdfs://nn:8020/warehouse/path' +); +``` + +The following properties can be set if using the Hive catalog: + +* `uri`: The Hive metastore's thrift URI. (Required) +* `clients`: The Hive metastore client pool size, default value is 2. 
(Optional) +* `warehouse`: The Hive warehouse location, users should specify this path if neither set the `hive-conf-dir` to specify a location containing a `hive-site.xml` configuration file nor add a correct `hive-site.xml` to classpath. +* `hive-conf-dir`: Path to a directory containing a `hive-site.xml` configuration file which will be used to provide custom Hive configuration values. The value of `hive.metastore.warehouse.dir` from `/hive-site.xml` (or hive configure file from classpath) will be overwritten with the `warehouse` value if setting both `hive-conf-dir` and `warehouse` when creating iceberg catalog. +* `hadoop-conf-dir`: Path to a directory containing `core-site.xml` and `hdfs-site.xml` configuration files which will be used to provide custom Hadoop configuration values. + +## Creating a table + +```sql +CREATE TABLE `hive_catalog`.`default`.`sample` ( + id BIGINT COMMENT 'unique id', + data STRING +); +``` + +## Writing + +To append new data to a table with a Flink streaming job, use `INSERT INTO`: + +```sql +INSERT INTO `hive_catalog`.`default`.`sample` VALUES (1, 'a'); +INSERT INTO `hive_catalog`.`default`.`sample` SELECT id, data from other_kafka_table; +``` + +To replace data in the table with the result of a query, use `INSERT OVERWRITE` in batch job (flink streaming job does not support `INSERT OVERWRITE`). Overwrites are atomic operations for Iceberg tables. + +Partitions that have rows produced by the SELECT query will be replaced, for example: + +```sql +INSERT OVERWRITE `hive_catalog`.`default`.`sample` VALUES (1, 'a'); +``` + +Iceberg also support overwriting given partitions by the `select` values: + +```sql +INSERT OVERWRITE `hive_catalog`.`default`.`sample` PARTITION(data='a') SELECT 6; +``` + +Flink supports writing `DataStream` and `DataStream` to the sink iceberg table natively. + +```java +StreamExecutionEnvironment env = ...; + +DataStream input = ... 
; +Configuration hadoopConf = new Configuration(); +TableLoader tableLoader = TableLoader.fromHadoopTable("hdfs://nn:8020/warehouse/path", hadoopConf); + +FlinkSink.forRowData(input) + .tableLoader(tableLoader) + .append(); + +env.execute("Test Iceberg DataStream"); +``` + +### Branch Writes +Writing to branches in Iceberg tables is also supported via the `toBranch` API in `FlinkSink` +For more information on branches please refer to [branches](branching.md). +```java +FlinkSink.forRowData(input) + .tableLoader(tableLoader) + .toBranch("audit-branch") + .append(); +``` + +## Reading + +Submit a Flink __batch__ job using the following sentences: + +```sql +-- Execute the flink job in batch mode for current session context +SET execution.runtime-mode = batch; +SELECT * FROM `hive_catalog`.`default`.`sample`; +``` + +Iceberg supports processing incremental data in flink __streaming__ jobs which starts from a historical snapshot-id: + +```sql +-- Submit the flink job in streaming mode for current session. +SET execution.runtime-mode = streaming; + +-- Enable this switch because streaming read SQL will provide few job options in flink SQL hint options. +SET table.dynamic-table-options.enabled=true; + +-- Read all the records from the iceberg current snapshot, and then read incremental data starting from that snapshot. +SELECT * FROM `hive_catalog`.`default`.`sample` /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s')*/ ; + +-- Read all incremental data starting from the snapshot-id '3821550127947089987' (records from this snapshot will be excluded). +SELECT * FROM `hive_catalog`.`default`.`sample` /*+ OPTIONS('streaming'='true', 'monitor-interval'='1s', 'start-snapshot-id'='3821550127947089987')*/ ; +``` + +SQL is also the recommended way to inspect tables. 
To view all of the snapshots in a table, use the snapshots metadata table: + +```sql +SELECT * FROM `hive_catalog`.`default`.`sample`.`snapshots` +``` + +Iceberg support streaming or batch read in Java API: + +``` +DataStream batch = FlinkSource.forRowData() + .env(env) + .tableLoader(tableLoader) + .streaming(false) + .build(); +``` + + + + +## Type conversion + +Iceberg's integration for Flink automatically converts between Flink and Iceberg types. When writing to a table with types that are not supported by Flink, like UUID, Iceberg will accept and convert values from the Flink type. + +### Flink to Iceberg + +Flink types are converted to Iceberg types according to the following table: + +| Flink | Iceberg | Notes | +| ------------------- | -------------------------- | ------------- | +| boolean | boolean | | +| tinyint | integer | | +| smallint | integer | | +| integer | integer | | +| bigint | long | | +| float | float | | +| double | double | | +| char | string | | +| varchar | string | | +| string | string | | +| binary | binary | | +| varbinary | fixed | | +| decimal | decimal | | +| date | date | | +| time | time | | +| timestamp | timestamp without timezone | | +| timestamp_ltz | timestamp with timezone | | +| array | list | | +| map | map | | +| multiset | map | | +| row | struct | | +| raw | | Not supported | +| interval | | Not supported | +| structured | | Not supported | +| timestamp with zone | | Not supported | +| distinct | | Not supported | +| null | | Not supported | +| symbol | | Not supported | +| logical | | Not supported | + +### Iceberg to Flink + +Iceberg types are converted to Flink types according to the following table: + +| Iceberg | Flink | +| -------------------------- | --------------------- | +| boolean | boolean | +| struct | row | +| list | array | +| map | map | +| integer | integer | +| long | bigint | +| float | float | +| double | double | +| date | date | +| time | time | +| timestamp without timezone | timestamp(6) | +| 
timestamp with timezone | timestamp_ltz(6) | +| string | varchar(2147483647) | +| uuid | binary(16) | +| fixed(N) | binary(N) | +| binary | varbinary(2147483647) | +| decimal(P, S) | decimal(P, S) | + +## Future improvements + +There are some features that are not yet supported in the current Flink Iceberg integration work: + +* Don't support creating iceberg table with hidden partitioning. [Discussion](http://mail-archives.apache.org/mod_mbox/flink-dev/202008.mbox/%3cCABi+2jQCo3MsOa4+ywaxV5J-Z8TGKNZDX-pQLYB-dG+dVUMiMw@mail.gmail.com%3e) on the Flink mailing list. +* Don't support creating iceberg table with computed column. +* Don't support creating iceberg table with watermark. +* Don't support adding columns, removing columns, renaming columns, changing columns. [FLINK-19062](https://issues.apache.org/jira/browse/FLINK-19062) is tracking this. diff --git a/docs-new/home/docs/latest/hive-migration.md b/docs-new/home/docs/latest/hive-migration.md new file mode 100644 index 000000000000..e14f1c72d2aa --- /dev/null +++ b/docs-new/home/docs/latest/hive-migration.md @@ -0,0 +1,55 @@ +--- +title: "Hive Migration" +--- + + +# Hive Table Migration +Apache Hive supports ORC, Parquet, and Avro file formats that could be migrated to Iceberg. +When migrating data to an Iceberg table, which provides versioning and transactional updates, only the most recent data files need to be migrated. + +Iceberg supports all three migration actions: Snapshot Table, Migrate Table, and Add Files for migrating from Hive tables to Iceberg tables. Since Hive tables do not maintain snapshots, +the migration process essentially involves creating a new Iceberg table with the existing schema and committing all data files across all partitions to the new Iceberg table. +After the initial migration, any new data files are added to the new Iceberg table using the Add Files action. 
+ +## Enabling Migration from Hive to Iceberg +The Hive table migration actions are supported by the Spark Integration module via Spark Procedures. +The procedures are bundled in the Spark runtime jar, which is available in the [Iceberg Release Downloads](../../releases.md#downloads). + +## Snapshot Hive Table to Iceberg +To snapshot a Hive table, users can run the following Spark SQL: +```sql +CALL catalog_name.system.snapshot('db.source', 'db.dest') +``` +See [Spark Procedure: snapshot](spark-procedures.md#snapshot) for more details. + +## Migrate Hive Table To Iceberg +To migrate a Hive table to Iceberg, users can run the following Spark SQL: +```sql +CALL catalog_name.system.migrate('db.sample') +``` +See [Spark Procedure: migrate](spark-procedures.md#migrate) for more details. + +## Add Files From Hive Table to Iceberg +To add data files from a Hive table to a given Iceberg table, users can run the following Spark SQL: +```sql +CALL spark_catalog.system.add_files( +table => 'db.tbl', +source_table => 'db.src_tbl' +) +``` +See [Spark Procedure: add_files](spark-procedures.md#add_files) for more details. diff --git a/docs-new/home/docs/latest/hive.md b/docs-new/home/docs/latest/hive.md new file mode 100644 index 000000000000..7246ab644e8a --- /dev/null +++ b/docs-new/home/docs/latest/hive.md @@ -0,0 +1,590 @@ +--- +title: "Hive" +--- + + +# Hive + +Iceberg supports reading and writing Iceberg tables through [Hive](https://hive.apache.org) by using +a [StorageHandler](https://cwiki.apache.org/confluence/display/Hive/StorageHandlers). + +## Feature support +Iceberg compatibility with Hive 2.x and Hive 3.1.2/3 supports the following features: + +* Creating a table +* Dropping a table +* Reading a table +* Inserting into a table (INSERT INTO) + +!!! warning + DML operations work only with MapReduce execution engine. 
+ + +With Hive version 4.0.0-alpha-2 and above, +the Iceberg integration when using HiveCatalog supports the following additional features: + +* Altering a table with expiring snapshots. +* Create a table like an existing table (CTLT table) +* Support adding parquet compression type via Table properties [Compression types](https://spark.apache.org/docs/2.4.3/sql-data-sources-parquet.html#configuration) +* Altering a table metadata location +* Supporting table rollback +* Honors sort orders on existing tables when writing a table [Sort orders specification](../../spec.md#sort-orders) + +With Hive version 4.0.0-alpha-1 and above, +the Iceberg integration when using HiveCatalog supports the following additional features: + +* Creating an Iceberg identity-partitioned table +* Creating an Iceberg table with any partition spec, including the various transforms supported by Iceberg +* Creating a table from an existing table (CTAS table) +* Altering a table while keeping Iceberg and Hive schemas in sync +* Altering the partition schema (updating columns) +* Altering the partition schema by specifying partition transforms +* Truncating a table +* Migrating tables in Avro, Parquet, or ORC (Non-ACID) format to Iceberg +* Reading the schema of a table +* Querying Iceberg metadata tables +* Time travel applications +* Inserting into a table (INSERT INTO) +* Inserting data overwriting existing data (INSERT OVERWRITE) + +!!! warning + DML operations work only with Tez execution engine. + + +## Enabling Iceberg support in Hive + +### Hive 4.0.0-alpha-1 + +Hive 4.0.0-alpha-1 comes with the Iceberg 0.13.1 included. No additional downloads or jars are needed. + +### Hive 2.3.x, Hive 3.1.x + +In order to use Hive 2.3.x or Hive 3.1.x, you must load the Iceberg-Hive runtime jar and enable Iceberg support, either globally or for an individual table using a table property. 
+ +#### Loading runtime jar + +To enable Iceberg support in Hive, the `HiveIcebergStorageHandler` and supporting classes need to be made available on +Hive's classpath. These are provided by the `iceberg-hive-runtime` jar file. For example, if using the Hive shell, this +can be achieved by issuing a statement like so: + +``` +add jar /path/to/iceberg-hive-runtime.jar; +``` + +There are many others ways to achieve this including adding the jar file to Hive's auxiliary classpath so it is +available by default. Please refer to Hive's documentation for more information. + +#### Enabling support + +If the Iceberg storage handler is not in Hive's classpath, then Hive cannot load or update the metadata for an Iceberg +table when the storage handler is set. To avoid the appearance of broken tables in Hive, Iceberg will not add the +storage handler to a table unless Hive support is enabled. The storage handler is kept in sync (added or removed) every +time Hive engine support for the table is updated, i.e. turned on or off in the table properties. There are two ways to +enable Hive support: globally in Hadoop Configuration and per-table using a table property. + +##### Hadoop configuration + +To enable Hive support globally for an application, set `iceberg.engine.hive.enabled=true` in its Hadoop configuration. +For example, setting this in the `hive-site.xml` loaded by Spark will enable the storage handler for all tables created +by Spark. + +!!! danger + Starting with Apache Iceberg 0.11.0, when using Hive with Tez you also have to disable vectorization (hive.vectorized.execution.enabled=false). + + +##### Table property configuration + +Alternatively, the property `engine.hive.enabled` can be set to `true` and added to the table properties when creating +the Iceberg table. 
Here is an example of doing it programmatically: + +```java +Catalog catalog=...; + Map tableProperties=Maps.newHashMap(); + tableProperties.put(TableProperties.ENGINE_HIVE_ENABLED,"true"); // engine.hive.enabled=true + catalog.createTable(tableId,schema,spec,tableProperties); +``` + +The table level configuration overrides the global Hadoop configuration. + +##### Hive on Tez configuration + +To use the Tez engine on Hive `3.1.2` or later, Tez needs to be upgraded to >= `0.10.1` which contains a necessary fix [TEZ-4248](https://issues.apache.org/jira/browse/TEZ-4248). + +To use the Tez engine on Hive `2.3.x`, you will need to manually build Tez from the `branch-0.9` branch due to a +backwards incompatibility issue with Tez `0.10.1`. + +In both cases, you will also need to set the following property in the `tez-site.xml` configuration file: `tez.mrreader.config.update.properties=hive.io.file.readcolumn.names,hive.io.file.readcolumn.ids`. + +## Catalog Management + +### Global Hive catalog + +From the Hive engine's perspective, there is only one global data catalog that is defined in the Hadoop configuration in +the runtime environment. In contrast, Iceberg supports multiple different data catalog types such as Hive, Hadoop, AWS +Glue, or custom catalog implementations. Iceberg also allows loading a table directly based on its path in the file +system. Those tables do not belong to any catalog. Users might want to read these cross-catalog and path-based tables +through the Hive engine for use cases like join. + +To support this, a table in the Hive metastore can represent three different ways of loading an Iceberg table, depending +on the table's `iceberg.catalog` property: + +1. The table will be loaded using a `HiveCatalog` that corresponds to the metastore configured in the Hive environment + if no `iceberg.catalog` is set +2. The table will be loaded using a custom catalog if `iceberg.catalog` is set to a catalog name (see below) +3. 
The table can be loaded directly using the table's root location if `iceberg.catalog` is set + to `location_based_table` + +For cases 2 and 3 above, users can create an overlay of an Iceberg table in the Hive metastore, so that different table +types can work together in the same Hive environment. See [CREATE EXTERNAL TABLE](#create-external-table-overlaying-an-existing-iceberg-table) +and [CREATE TABLE](#create-table) for more details. + +### Custom Iceberg catalogs + +To globally register different catalogs, set the following Hadoop configurations: + +| Config Key | Description | +| --------------------------------------------- | ------------------------------------------------------ | +| iceberg.catalog..type | type of catalog: `hive`, `hadoop`, or left unset if using a custom catalog | +| iceberg.catalog..catalog-impl | catalog implementation, must not be null if type is empty | +| iceberg.catalog.. | any config key and value pairs for the catalog | + +Here are some examples using Hive CLI: + +Register a `HiveCatalog` called `another_hive`: + +``` +SET iceberg.catalog.another_hive.type=hive; +SET iceberg.catalog.another_hive.uri=thrift://example.com:9083; +SET iceberg.catalog.another_hive.clients=10; +SET iceberg.catalog.another_hive.warehouse=hdfs://example.com:8020/warehouse; +``` + +Register a `HadoopCatalog` called `hadoop`: + +``` +SET iceberg.catalog.hadoop.type=hadoop; +SET iceberg.catalog.hadoop.warehouse=hdfs://example.com:8020/warehouse; +``` + +Register an AWS `GlueCatalog` called `glue`: + +``` +SET iceberg.catalog.glue.catalog-impl=org.apache.iceberg.aws.glue.GlueCatalog; +SET iceberg.catalog.glue.warehouse=s3://my-bucket/my/key/prefix; +SET iceberg.catalog.glue.lock.table=myGlueLockTable; +``` + +## DDL Commands + +Not all the features below are supported with Hive 2.3.x and Hive 3.1.x. Please refer to the +[Feature support](#feature-support) paragraph for further details. 
+ +One generally applicable difference is that Hive 4.0.0-alpha-1 provides the possibility to use +`STORED BY ICEBERG` instead of the old `STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'` + +### CREATE TABLE + +#### Non partitioned tables + +The Hive `CREATE EXTERNAL TABLE` command creates an Iceberg table when you specify the storage handler as follows: + +```sql +CREATE EXTERNAL TABLE x (i int) STORED BY ICEBERG; +``` + +If you want to create external tables using CREATE TABLE, configure the MetaStoreMetadataTransformer on the cluster, +and `CREATE TABLE` commands are transformed to create external tables. For example: + +```sql +CREATE TABLE x (i int) STORED BY ICEBERG; +``` + +You can specify the default file format (Avro, Parquet, ORC) at the time of the table creation. +The default is Parquet: + +```sql +CREATE TABLE x (i int) STORED BY ICEBERG STORED AS ORC; +``` + +#### Partitioned tables +You can create Iceberg partitioned tables using a command familiar to those who create non-Iceberg tables: + +```sql +CREATE TABLE x (i int) PARTITIONED BY (j int) STORED BY ICEBERG; +``` + +!!! info + The resulting table does not create partitions in HMS, but instead, converts partition data into Iceberg identity partitions. 
+ + +Use the DESCRIBE command to get information about the Iceberg identity partitions: + +```sql +DESCRIBE x; +``` +The result is: + +| col_name | data_type | comment +| ---------------------------------- | -------------- | ------- +| i | int | +| j | int | +| | NULL | NULL +| # Partition Transform Information | NULL | NULL +| # col_name | transform_type | NULL +| j | IDENTITY | NULL + +You can create Iceberg partitions using the following Iceberg partition specification syntax +(supported only from Hive 4.0.0-alpha-1): + +```sql +CREATE TABLE x (i int, ts timestamp) PARTITIONED BY SPEC (month(ts), bucket(2, i)) STORED AS ICEBERG; +DESCRIBE x; +``` +The result is: + +| col_name | data_type | comment +| ---------------------------------- | -------------- | ------- +| i | int | +| ts | timestamp | +| | NULL | NULL +| # Partition Transform Information | NULL | NULL +| # col_name | transform_type | NULL +| ts | MONTH | NULL +| i | BUCKET\[2\] | NULL + +The supported transformations for Hive are the same as for Spark: +* years(ts): partition by year +* months(ts): partition by month +* days(ts) or date(ts): equivalent to dateint partitioning +* hours(ts) or date_hour(ts): equivalent to dateint and hour partitioning +* bucket(N, col): partition by hashed value mod N buckets +* truncate(L, col): partition by value truncated to L + - Strings are truncated to the given length + - Integers and longs truncate to bins: truncate(10, i) produces partitions 0, 10, 20, 30, + +!!! info + The resulting table does not create partitions in HMS, but instead, converts partition data into Iceberg partitions. + + +### CREATE TABLE AS SELECT + +`CREATE TABLE AS SELECT` operation resembles the native Hive operation with a single important difference. +The Iceberg table and the corresponding Hive table are created at the beginning of the query execution. +The data is inserted / committed when the query finishes. So for a transient period the table already exists but contains no data. 
+ +```sql +CREATE TABLE target PARTITIONED BY SPEC (year(year_field), identity_field) STORED BY ICEBERG AS + SELECT * FROM source; +``` + +### CREATE TABLE LIKE TABLE + +```sql +CREATE TABLE target LIKE source STORED BY ICEBERG; +``` + +### CREATE EXTERNAL TABLE overlaying an existing Iceberg table + +The `CREATE EXTERNAL TABLE` command is used to overlay a Hive table "on top of" an existing Iceberg table. Iceberg +tables are created using either a [`Catalog`](../../javadoc/{{ icebergVersion }}/index.html?org/apache/iceberg/catalog/Catalog.html), or an implementation of the [`Tables`](../../javadoc/{{ icebergVersion }}/index.html?org/apache/iceberg/Tables.html) interface, and Hive needs to be configured accordingly to operate on these different types of table. + +#### Hive catalog tables + +As described before, tables created by the `HiveCatalog` with Hive engine feature enabled are directly visible by the +Hive engine, so there is no need to create an overlay. + +#### Custom catalog tables + +For a table in a registered catalog, specify the catalog name in the statement using table property `iceberg.catalog`. +For example, the SQL below creates an overlay for a table in a `hadoop` type catalog named `hadoop_cat`: + +```sql +SET +iceberg.catalog.hadoop_cat.type=hadoop; +SET +iceberg.catalog.hadoop_cat.warehouse=hdfs://example.com:8020/hadoop_cat; + +CREATE +EXTERNAL TABLE database_a.table_a +STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' +TBLPROPERTIES ('iceberg.catalog'='hadoop_cat'); +``` + +When `iceberg.catalog` is missing from both table properties and the global Hadoop configuration, `HiveCatalog` will be +used as default. + +#### Path-based Hadoop tables + +Iceberg tables created using `HadoopTables` are stored entirely in a directory in a filesystem like HDFS. These tables +are considered to have no catalog. To indicate that, set `iceberg.catalog` property to `location_based_table`. 
For +example: + +```sql +CREATE +EXTERNAL TABLE table_a +STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' +LOCATION 'hdfs://some_bucket/some_path/table_a' +TBLPROPERTIES ('iceberg.catalog'='location_based_table'); +``` + +#### CREATE TABLE overlaying an existing Iceberg table + +You can also create a new table that is managed by a custom catalog. For example, the following code creates a table in +a custom Hadoop catalog: + +```sql +SET +iceberg.catalog.hadoop_cat.type=hadoop; +SET +iceberg.catalog.hadoop_cat.warehouse=hdfs://example.com:8020/hadoop_cat; + +CREATE TABLE database_a.table_a +( + id bigint, + name string +) PARTITIONED BY ( + dept string +) STORED BY 'org.apache.iceberg.mr.hive.HiveIcebergStorageHandler' +TBLPROPERTIES ('iceberg.catalog'='hadoop_cat'); +``` + +!!! danger + If the table to create already exists in the custom catalog, this will create a managed overlay + table. This means technically you can omit the `EXTERNAL` keyword when creating an overlay table. However, this is **not + recommended** because creating managed overlay tables could pose a risk to the shared data files in case of accidental + drop table commands from the Hive side, which would unintentionally remove all the data in the table. + + +### ALTER TABLE +#### Table properties +For HiveCatalog tables the Iceberg table properties and the Hive table properties stored in HMS are kept in sync. + +!!! info + IMPORTANT: This feature is not available for other Catalog implementations. + +```sql +ALTER TABLE t SET TBLPROPERTIES('...'='...'); +``` + +#### Schema evolution +The Hive table schema is kept in sync with the Iceberg table. If an outside source (Impala/Spark/Java API/etc) +changes the schema, the Hive table immediately reflects the changes.
You alter the table schema using Hive commands: + +* Add a column +```sql +ALTER TABLE orders ADD COLUMNS (nickname string); +``` +* Rename a column +```sql +ALTER TABLE orders CHANGE COLUMN item fruit string; +``` +* Reorder columns +```sql +ALTER TABLE orders CHANGE COLUMN quantity quantity int AFTER price; +``` +* Change a column type - only if Iceberg defines the column type change as safe +```sql +ALTER TABLE orders CHANGE COLUMN price price long; +``` +* Drop column by using REPLACE COLUMN to remove the old column +```sql +ALTER TABLE orders REPLACE COLUMNS (remaining string); +``` +!!! info + Note that dropping columns is the only thing REPLACE COLUMNS can be used for, + i.e. if columns are specified out-of-order an error will be thrown signalling this limitation. + + +#### Partition evolution +You change the partitioning schema using the following commands: +* Change the partitioning schema to new identity partitions: +```sql +ALTER TABLE default.customers SET PARTITION SPEC (last_name); +``` +* Alternatively, provide a partition specification: +```sql +ALTER TABLE order SET PARTITION SPEC (month(ts)); +``` +#### Table migration +You can migrate Avro / Parquet / ORC external tables to Iceberg tables using the following command: +```sql +ALTER TABLE t SET TBLPROPERTIES ('storage_handler'='org.apache.iceberg.mr.hive.HiveIcebergStorageHandler'); +``` +During the migration the data files are not changed, only the appropriate Iceberg metadata files are created. +After the migration, handle the table as a normal Iceberg table. + +### TRUNCATE TABLE +The following command truncates the Iceberg table: +```sql +TRUNCATE TABLE t; +``` +Using a partition specification is not allowed. + +### DROP TABLE + +Tables can be dropped using the `DROP TABLE` command: + +```sql +DROP TABLE [IF EXISTS] table_name [PURGE]; +``` + +### METADATA LOCATION + +The metadata location (snapshot location) can only be changed if the new path contains the exact same metadata json.
+It can be done only after migrating the table to Iceberg; the two operations cannot be done in one step. + +```sql +ALTER TABLE t set TBLPROPERTIES ('metadata_location'='/hivemetadata/00003-a1ada2b8-fc86-4b5b-8c91-400b6b46d0f2.metadata.json'); +``` + +## DML Commands + +### SELECT +Select statements work the same on Iceberg tables in Hive. You will see the Iceberg benefits over Hive in compilation and execution: +* **No file system listings** - especially important on blob stores, like S3 +* **No partition listing** from the Metastore +* **Advanced partition filtering** - the partition keys are not needed in the queries when they could be calculated +* Could handle **higher number of partitions** than normal Hive tables + +Here are the feature highlights for Iceberg Hive read support: +1. **Predicate pushdown**: Pushdown of the Hive SQL `WHERE` clause has been implemented so that these filters are used at the Iceberg `TableScan` level as well as by the Parquet and ORC Readers. +2. **Column projection**: Columns from the Hive SQL `SELECT` clause are projected down to the Iceberg readers to reduce the number of columns read. +3. **Hive query engines**: + - With Hive 2.3.x, 3.1.x both the MapReduce and Tez query execution engines are supported. + - With Hive 4.0.0-alpha-1 Tez query execution engine is supported. + +Some of the advanced / little used optimizations are not yet implemented for Iceberg tables, so you should check your individual queries. +Also currently the statistics stored in the MetaStore are used for query planning. This is something we are planning to improve in the future. + +### INSERT INTO + +Hive supports the standard single-table INSERT INTO operation: + +```sql +INSERT INTO table_a +VALUES ('a', 1); +INSERT INTO table_a +SELECT...; +``` + +Multi-table insert is also supported, but it will not be atomic. Commits occur one table at a time.
+Partial changes will be visible during the commit process and failures can leave partial changes committed. +Changes within a single table will remain atomic. + +Here is an example of inserting into multiple tables at once in Hive SQL: + +```sql +FROM customers + INSERT INTO target1 SELECT customer_id, first_name + INSERT INTO target2 SELECT last_name, customer_id; +``` + +### INSERT OVERWRITE +INSERT OVERWRITE can replace data in the table with the result of a query. Overwrites are atomic operations for Iceberg tables. +For nonpartitioned tables the content of the table is always removed. For partitioned tables the partitions +that have rows produced by the SELECT query will be replaced. +```sql +INSERT OVERWRITE TABLE target SELECT * FROM source; +``` + +### QUERYING METADATA TABLES +Hive supports querying of the Iceberg Metadata tables. The tables could be used as normal +Hive tables, so it is possible to use projections / joins / filters / etc. +To reference a metadata table the full name of the table should be used, like: +`<DB_NAME>.<TABLE_NAME>.<METADATA_TABLE_NAME>`. + +Currently the following metadata tables are available in Hive: +* files +* entries +* snapshots +* manifests +* partitions + +```sql +SELECT * FROM default.table_a.files; +``` + +### TIMETRAVEL +Hive supports snapshot id based and time based timetravel queries. +For these views it is possible to use projections / joins / filters / etc. +The function is available with the following syntax: +```sql +SELECT * FROM table_a FOR SYSTEM_TIME AS OF '2021-08-09 10:35:57'; +SELECT * FROM table_a FOR SYSTEM_VERSION AS OF 1234567; +``` + +You can expire snapshots of an Iceberg table using an ALTER TABLE query from Hive. You should periodically expire snapshots to delete data files that are no longer needed, and reduce the size of table metadata. + +Each write to an Iceberg table from Hive creates a new snapshot, or version, of a table. Snapshots can be used for time-travel queries, or the table can be rolled back to any valid snapshot.
Snapshots accumulate until they are expired by the expire_snapshots operation. +Enter a query to expire snapshots having the following timestamp: `2021-12-09 05:39:18.689000000` +```sql +ALTER TABLE test_table EXECUTE expire_snapshots('2021-12-09 05:39:18.689000000'); +``` + +### Type compatibility + +Hive and Iceberg support different sets of types. Iceberg can perform type conversion automatically, but not for all +combinations, so you may want to understand the type conversion in Iceberg prior to designing the types of columns in +your tables. You can enable auto-conversion through Hadoop configuration (not enabled by default): + +| Config key | Default | Description | +| -----------------------------------------| --------------------------- | --------------------------------------------------- | +| iceberg.mr.schema.auto.conversion | false | if Hive should perform type auto-conversion | + +### Hive type to Iceberg type + +This type conversion table describes how Hive types are converted to the Iceberg types. The conversion applies to both +creating an Iceberg table and writing to an Iceberg table via Hive. + +| Hive | Iceberg | Notes | +|------------------|-------------------------|-------| +| boolean | boolean | | +| short | integer | auto-conversion | +| byte | integer | auto-conversion | +| integer | integer | | +| long | long | | +| float | float | | +| double | double | | +| date | date | | +| timestamp | timestamp without timezone | | +| timestamplocaltz | timestamp with timezone | Hive 3 only | +| interval_year_month | | not supported | +| interval_day_time | | not supported | +| char | string | auto-conversion | +| varchar | string | auto-conversion | +| string | string | | +| binary | binary | | +| decimal | decimal | | +| struct | struct | | +| list | list | | +| map | map | | +| union | | not supported | + +### Table rollback + +Roll back an Iceberg table's data to the state at an older table snapshot.
+ +Rollback to the last snapshot before a specific timestamp + +```sql +ALTER TABLE ice_t EXECUTE ROLLBACK('2022-05-12 00:00:00') +``` + +Rollback to a specific snapshot ID +```sql +ALTER TABLE ice_t EXECUTE ROLLBACK(1111); +``` diff --git a/docs-new/home/docs/latest/index.md b/docs-new/home/docs/latest/index.md new file mode 100644 index 000000000000..19dc954b51b3 --- /dev/null +++ b/docs-new/home/docs/latest/index.md @@ -0,0 +1,52 @@ +--- +title: "Introduction" +--- + + +# Documentation + +**Apache Iceberg is an open table format for huge analytic datasets.** Iceberg adds tables to compute engines including Spark, Trino, PrestoDB, Flink, Hive and Impala using a high-performance table format that works just like a SQL table. + +### User experience + +Iceberg avoids unpleasant surprises. Schema evolution works and won't inadvertently un-delete data. Users don't need to know about partitioning to get fast queries. + +* [Schema evolution](evolution.md#schema-evolution) supports add, drop, update, or rename, and has [no side-effects](evolution.md#correctness) +* [Hidden partitioning](partitioning.md) prevents user mistakes that cause silently incorrect results or extremely slow queries +* [Partition layout evolution](evolution.md#partition-evolution) can update the layout of a table as data volume or query patterns change +* [Time travel](spark-queries.md#time-travel) enables reproducible queries that use exactly the same table snapshot, or lets users easily examine changes +* Version rollback allows users to quickly correct problems by resetting tables to a good state + +### Reliability and performance + +Iceberg was built for huge tables. Iceberg is used in production where a single table can contain tens of petabytes of data and even these huge tables can be read without a distributed SQL engine. 
+ +* [Scan planning is fast](performance.md#scan-planning) -- a distributed SQL engine isn't needed to read a table or find files +* [Advanced filtering](performance.md#data-filtering) -- data files are pruned with partition and column-level stats, using table metadata + +Iceberg was designed to solve correctness problems in eventually-consistent cloud object stores. + +* [Works with any cloud store](reliability.md) and reduces NN congestion when in HDFS, by avoiding listing and renames +* [Serializable isolation](reliability.md) -- table changes are atomic and readers never see partial or uncommitted changes +* [Multiple concurrent writers](reliability.md#concurrent-write-operations) use optimistic concurrency and will retry to ensure that compatible updates succeed, even when writes conflict + +### Open standard + +Iceberg has been designed and developed to be an open community standard with a [specification](../../spec.md) to ensure compatibility across languages and implementations. + +[Apache Iceberg is open source](../../community.md), and is developed at the [Apache Software Foundation](https://www.apache.org/). diff --git a/docs-new/home/docs/latest/java-api-quickstart.md b/docs-new/home/docs/latest/java-api-quickstart.md new file mode 100644 index 000000000000..cd6d647b62b1 --- /dev/null +++ b/docs-new/home/docs/latest/java-api-quickstart.md @@ -0,0 +1,317 @@ +--- +title: "Java Quickstart" +--- + + +# Java API Quickstart + +## Create a table + +Tables are created using either a [`Catalog`](../../javadoc/{{ icebergVersion }}/index.html?org/apache/iceberg/catalog/Catalog.html) or an implementation of the [`Tables`](../../javadoc/{{ icebergVersion }}/index.html?org/apache/iceberg/Tables.html) interface. + +### Using a Hive catalog + +The Hive catalog connects to a Hive metastore to keep track of Iceberg tables. +You can initialize a Hive catalog with a name and some properties. 
+(see: [Catalog properties](configuration.md#catalog-properties)) + +**Note:** Currently, `setConf` is always required for hive catalogs, but this will change in the future. + +```java +import org.apache.iceberg.hive.HiveCatalog; + +HiveCatalog catalog = new HiveCatalog(); +catalog.setConf(spark.sparkContext().hadoopConfiguration()); // Configure using Spark's Hadoop configuration + +Map properties = new HashMap(); +properties.put("warehouse", "..."); +properties.put("uri", "..."); + +catalog.initialize("hive", properties); +``` + +The `Catalog` interface defines methods for working with tables, like `createTable`, `loadTable`, `renameTable`, and `dropTable`. `HiveCatalog` implements the `Catalog` interface. + +To create a table, pass an `Identifier` and a `Schema` along with other initial metadata: + +```java +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; + +TableIdentifier name = TableIdentifier.of("logging", "logs"); +Table table = catalog.createTable(name, schema, spec); + +// or to load an existing table, use the following line +// Table table = catalog.loadTable(name); +``` + +The logs [schema](#create-a-schema) and [partition spec](#create-a-partition-spec) are created below. + + +### Using a Hadoop catalog + +A Hadoop catalog doesn't need to connect to a Hive MetaStore, but can only be used with HDFS or similar file systems that support atomic rename. Concurrent writes with a Hadoop catalog are not safe with a local FS or S3. To create a Hadoop catalog: + +```java +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.hadoop.HadoopCatalog; + +Configuration conf = new Configuration(); +String warehousePath = "hdfs://host:8020/warehouse_path"; +HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath); +``` + +Like the Hive catalog, `HadoopCatalog` implements `Catalog`, so it also has methods for working with tables, like `createTable`, `loadTable`, and `dropTable`. 
+ +This example creates a table with Hadoop catalog: + +```java +import org.apache.iceberg.Table; +import org.apache.iceberg.catalog.TableIdentifier; + +TableIdentifier name = TableIdentifier.of("logging", "logs"); +Table table = catalog.createTable(name, schema, spec); + +// or to load an existing table, use the following line +// Table table = catalog.loadTable(name); +``` + +The logs [schema](#create-a-schema) and [partition spec](#create-a-partition-spec) are created below. + + +### Using Hadoop tables + +Iceberg also supports tables that are stored in a directory in HDFS. Concurrent writes with a Hadoop tables are not safe when stored in the local FS or S3. Directory tables don't support all catalog operations, like rename, so they use the `Tables` interface instead of `Catalog`. + +To create a table in HDFS, use `HadoopTables`: + +```java +import org.apache.hadoop.conf.Configuration; +import org.apache.iceberg.hadoop.HadoopTables; +import org.apache.iceberg.Table; + +Configuration conf = new Configuration(); +HadoopTables tables = new HadoopTables(conf); +Table table = tables.create(schema, spec, table_location); + +// or to load an existing table, use the following line +// Table table = tables.load(table_location); +``` + +!!! danger + Hadoop tables shouldn't be used with file systems that do not support atomic rename. Iceberg relies on rename to synchronize concurrent commits for directory tables. + + +### Tables in Spark + +Spark uses both `HiveCatalog` and `HadoopTables` to load tables. Hive is used when the identifier passed to `load` or `save` is not a path, otherwise Spark assumes it is a path-based table. 
+ +To read and write to tables from Spark see: + +* [SQL queries in Spark](spark-queries.md#querying-with-sql) +* [`INSERT INTO` in Spark](spark-writes.md#insert-into) +* [`MERGE INTO` in Spark](spark-writes.md#merge-into) + + +## Schemas + +### Create a schema + +This example creates a schema for a `logs` table: + +```java +import org.apache.iceberg.Schema; +import org.apache.iceberg.types.Types; + +Schema schema = new Schema( + Types.NestedField.required(1, "level", Types.StringType.get()), + Types.NestedField.required(2, "event_time", Types.TimestampType.withZone()), + Types.NestedField.required(3, "message", Types.StringType.get()), + Types.NestedField.optional(4, "call_stack", Types.ListType.ofRequired(5, Types.StringType.get())) + ); +``` + +When using the Iceberg API directly, type IDs are required. Conversions from other schema formats, like Spark, Avro, and Parquet will automatically assign new IDs. + +When a table is created, all IDs in the schema are re-assigned to ensure uniqueness. + +### Convert a schema from Avro + +To create an Iceberg schema from an existing Avro schema, use converters in `AvroSchemaUtil`: + +```java +import org.apache.avro.Schema; +import org.apache.avro.Schema.Parser; +import org.apache.iceberg.avro.AvroSchemaUtil; + +Schema avroSchema = new Parser().parse("{\"type\": \"record\" , ... }"); +Schema icebergSchema = AvroSchemaUtil.toIceberg(avroSchema); +``` + +### Convert a schema from Spark + +To create an Iceberg schema from an existing table, use converters in `SparkSchemaUtil`: + +```java +import org.apache.iceberg.spark.SparkSchemaUtil; + +Schema schema = SparkSchemaUtil.schemaForTable(sparkSession, table_name); +``` + +## Partitioning + +### Create a partition spec + +Partition specs describe how Iceberg should group records into data files. Partition specs are created for a table's schema using a builder. 
+ +This example creates a partition spec for the `logs` table that partitions records by the hour of the log event's timestamp and by log level: + +```java +import org.apache.iceberg.PartitionSpec; + +PartitionSpec spec = PartitionSpec.builderFor(schema) + .hour("event_time") + .identity("level") + .build(); +``` + +For more information on the different partition transforms that Iceberg offers, visit [this page](../../spec.md#partitioning). + +## Branching and Tagging + +### Creating branches and tags + +New branches and tags can be created via the Java library's ManageSnapshots API. + +```java + +/* Create a branch test-branch which is retained for 1 week, and the latest 2 snapshots on test-branch will always be retained. +Snapshots on test-branch which are created within the last hour will also be retained. */ + +String branch = "test-branch"; +table.manageSnapshots() + .createBranch(branch, 3) + .setMinSnapshotsToKeep(branch, 2) + .setMaxSnapshotAgeMs(branch, 3600000) + .setMaxRefAgeMs(branch, 604800000) + .commit(); + +// Create a tag historical-tag at snapshot 10 which is retained for a day +String tag = "historical-tag" +table.manageSnapshots() + .createTag(tag, 10) + .setMaxRefAgeMs(tag, 86400000) + .commit(); +``` + +### Committing to branches + +Writing to a branch can be performed by specifying `toBranch` in the operation. For the full list refer to [UpdateOperations](api.md#update-operations). +```java +// Append FILE_A to branch test-branch +String branch = "test-branch"; + +table.newAppend() + .appendFile(FILE_A) + .toBranch(branch) + .commit(); + + +// Perform row level updates on "test-branch" +table.newRowDelta() + .addRows(DATA_FILE) + .addDeletes(DELETES) + .toBranch(branch) + .commit(); + + +// Perform a rewrite operation replacing small_file_1 and small_file_2 on "test-branch" with compacted_file. 
+table.newRewrite() + .rewriteFiles(ImmutableSet.of(small_file_1,small_file_2), ImmutableSet.of(compacted_file)) + .toBranch(branch) + .commit(); + +``` + +### Reading from branches and tags +Reading from a branch or tag can be done as usual via the Table Scan API, by passing in a branch or tag in the `useRef` API. When a branch is passed in, the snapshot that's used is the head of the branch. Note that currently reading from a branch and specifying an `asOfSnapshotId` in the scan is not supported. + +```java +// Read from the head snapshot of test-branch +TableScan branchRead = table.newScan().useRef("test-branch"); + +// Read from the snapshot referenced by audit-tag +Table tagRead = table.newScan().useRef("audit-tag"); +``` + +### Replacing and fast forwarding branches and tags + +The snapshots which existing branches and tags point to can be updated via the `replace` APIs. The fast forward operation is similar to git fast-forwarding. Fast forward can be used to advance a target branch to the head of a source branch or a tag when the target branch is an ancestor of the source. For both fast forward and replace, retention properties of the target branch are maintained by default. + +```java + +// Update "test-branch" to point to snapshot 4 +table.manageSnapshots() + .replaceBranch(branch, 4) + .commit() + +String tag = "audit-tag"; +// Replace "audit-tag" to point to snapshot 3 and update its retention +table.manageSnapshots() + .replaceBranch(tag, 4) + .setMaxRefAgeMs(1000) + .commit() + + +``` + +### Updating retention properties + +Retention properties for branches and tags can be updated as well. +Use the setMaxRefAgeMs for updating the retention property of the branch or tag itself. Branch snapshot retention properties can be updated via the `setMinSnapshotsToKeep` and `setMaxSnapshotAgeMs` APIs. 
+ +```java +String branch = "test-branch"; +// Update retention properties for test-branch +table.manageSnapshots() + .setMinSnapshotsToKeep(branch, 10) + .setMaxSnapshotAgeMs(branch, 7200000) + .setMaxRefAgeMs(branch, 604800000) + .commit(); + +// Update retention properties for test-tag +table.manageSnapshots() + .setMaxRefAgeMs("test-tag", 604800000) + .commit(); +``` + +### Removing branches and tags + +Branches and tags can be removed via the `removeBranch` and `removeTag` APIs respectively. + +```java +// Remove test-branch +table.manageSnapshots() + .removeBranch("test-branch") + .commit() + +// Remove test-tag +table.manageSnapshots() + .removeTag("test-tag") + .commit() +``` diff --git a/docs-new/home/docs/latest/jdbc.md b/docs-new/home/docs/latest/jdbc.md new file mode 100644 index 000000000000..7b525fb7ee77 --- /dev/null +++ b/docs-new/home/docs/latest/jdbc.md @@ -0,0 +1,70 @@ +--- +title: "JDBC" +--- + + +# Iceberg JDBC Integration + +## JDBC Catalog + +Iceberg supports using a table in a relational database to manage Iceberg tables through JDBC. +The database that JDBC connects to must support atomic transactions to allow the JDBC catalog implementation to +properly support atomic Iceberg table commits and read serializable isolation. + +### Configurations + +Because each database and database service provider might require different configurations, +the JDBC catalog allows arbitrary configurations through: + +| Property | Default | Description | +| -------------------- | --------------------------------- | ------------------------------------------------------ | +| uri | | the JDBC connection string | +| `jdbc.<property_key>` 
| | any key value pairs to configure the JDBC connection | + +### Examples + + +#### Spark + +You can start a Spark session with a MySQL JDBC connection using the following configurations: + +```shell +spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{ icebergVersion }} \ + --conf spark.sql.catalog.my_catalog=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.my_catalog.warehouse=s3://my-bucket/my/key/prefix \ + --conf spark.sql.catalog.my_catalog.catalog-impl=org.apache.iceberg.jdbc.JdbcCatalog \ + --conf spark.sql.catalog.my_catalog.uri=jdbc:mysql://test.1234567890.us-west-2.rds.amazonaws.com:3306/default \ + --conf spark.sql.catalog.my_catalog.jdbc.verifyServerCertificate=true \ + --conf spark.sql.catalog.my_catalog.jdbc.useSSL=true \ + --conf spark.sql.catalog.my_catalog.jdbc.user=admin \ + --conf spark.sql.catalog.my_catalog.jdbc.password=pass +``` + +#### Java API + +```java +Class.forName("com.mysql.cj.jdbc.Driver"); // ensure JDBC driver is at runtime classpath +Map properties = new HashMap<>(); +properties.put(CatalogProperties.CATALOG_IMPL, JdbcCatalog.class.getName()); +properties.put(CatalogProperties.URI, "jdbc:mysql://localhost:3306/test"); +properties.put(JdbcCatalog.PROPERTY_PREFIX + "user", "admin"); +properties.put(JdbcCatalog.PROPERTY_PREFIX + "password", "pass"); +properties.put(CatalogProperties.WAREHOUSE_LOCATION, "s3://warehouse/path"); +Configuration hadoopConf = new Configuration(); // configs if you use HadoopFileIO +JdbcCatalog catalog = CatalogUtil.buildIcebergCatalog("test_jdbc_catalog", properties, hadoopConf); +``` diff --git a/docs-new/home/docs/latest/maintenance.md b/docs-new/home/docs/latest/maintenance.md new file mode 100644 index 000000000000..dd6bf7abdb5d --- /dev/null +++ b/docs-new/home/docs/latest/maintenance.md @@ -0,0 +1,157 @@ +--- +title: Maintenance +--- + + +# Maintenance + +!!! info + Maintenance operations require the `Table` instance. 
Please refer to the [Java API quickstart](java-api-quickstart.md#create-a-table) page to see how to load an existing table. + +## Recommended Maintenance + +### Expire Snapshots + +Each write to an Iceberg table creates a new _snapshot_, or version, of a table. Snapshots can be used for time-travel queries, or the table can be rolled back to any valid snapshot. + +Snapshots accumulate until they are expired by the [`expireSnapshots`](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/Table.html#expireSnapshots--) operation. Regularly expiring snapshots is recommended to delete data files that are no longer needed, and to keep the size of table metadata small. + +This example expires snapshots that are older than 1 day: + +```java +Table table = ... +long tsToExpire = System.currentTimeMillis() - (1000 * 60 * 60 * 24); // 1 day +table.expireSnapshots() + .expireOlderThan(tsToExpire) + .commit(); +``` + +See the [`ExpireSnapshots` Javadoc](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/ExpireSnapshots.html) to see more configuration options. + +There is also a Spark action that can run table expiration in parallel for large tables: + +```java +Table table = ... +SparkActions + .get() + .expireSnapshots(table) + .expireOlderThan(tsToExpire) + .execute(); +``` + +Expiring old snapshots removes them from metadata, so they are no longer available for time travel queries. + +!!! info + Data files are not deleted until they are no longer referenced by a snapshot that may be used for time travel or rollback. + Regularly expiring snapshots deletes unused data files. + + +### Remove old metadata files + +Iceberg keeps track of table metadata using JSON files. Each change to a table produces a new metadata file to provide atomicity. + +Old metadata files are kept for history by default. Tables with frequent commits, like those written by streaming jobs, may need to regularly clean metadata files.
+ +To automatically clean metadata files, set `write.metadata.delete-after-commit.enabled=true` in table properties. This will keep some metadata files (up to `write.metadata.previous-versions-max`) and will delete the oldest metadata file after each new one is created. + +| Property | Description | +| -------------------------------------------- |--------------------------------------------------------------------------| +| `write.metadata.delete-after-commit.enabled` | Whether to delete old **tracked** metadata files after each table commit | +| `write.metadata.previous-versions-max` | The number of old metadata files to keep | + +Note that this will only delete metadata files that are **tracked** in the metadata log and will not delete orphaned metadata files. +Example: With `write.metadata.delete-after-commit.enabled=false` and `write.metadata.previous-versions-max=10`, one will have 10 tracked metadata files and 90 orphaned metadata files after 100 commits. +Configuring `write.metadata.delete-after-commit.enabled=true` and `write.metadata.previous-versions-max=20` will not automatically delete metadata files. Tracked metadata files would be deleted again when reaching `write.metadata.previous-versions-max=20`. + +See [table write properties](configuration.md#write-properties) for more details. + +### Delete orphan files + +In Spark and other distributed processing engines, task or job failures can leave files that are not referenced by table metadata, and in some cases normal snapshot expiration may not be able to determine a file is no longer needed and delete it. + +To clean up these "orphan" files under a table location, use the `deleteOrphanFiles` action. + +```java +Table table = ... +SparkActions + .get() + .deleteOrphanFiles(table) + .execute(); +``` + +See the [DeleteOrphanFiles Javadoc](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/actions/DeleteOrphanFiles.html) to see more configuration options. 
+ +This action may take a long time to finish if you have lots of files in data and metadata directories. It is recommended to execute this periodically, but you may not need to execute this often. + +!!! info + It is dangerous to remove orphan files with a retention interval shorter than the time expected for any write to complete because it + might corrupt the table if in-progress files are considered orphaned and are deleted. The default interval is 3 days. + + +!!! info + Iceberg uses the string representations of paths when determining which files need to be removed. On some file systems, + the path can change over time, but it still represents the same file. For example, if you change authorities for an HDFS cluster, + none of the old path URLs used during creation will match those that appear in a current listing. *This will lead to data loss when + RemoveOrphanFiles is run*. Please be sure the entries in your MetadataTables match those listed by the Hadoop + FileSystem API to avoid unintentional deletion. + + +## Optional Maintenance + +Some tables require additional maintenance. For example, streaming queries may produce small data files that should be [compacted into larger files](#compact-data-files). And some tables can benefit from [rewriting manifest files](#rewrite-manifests) to make locating data for queries much faster. + +### Compact data files + +Iceberg tracks each data file in a table. More data files lead to more metadata stored in manifest files, and small data files cause an unnecessary amount of metadata and less efficient queries from file open costs. + +Iceberg can compact data files in parallel using Spark with the `rewriteDataFiles` action. This will combine small files into larger files to reduce metadata overhead and runtime file open cost. + +```java +Table table = ... 
+SparkActions + .get() + .rewriteDataFiles(table) + .filter(Expressions.equal("date", "2020-08-18")) + .option("target-file-size-bytes", Long.toString(500 * 1024 * 1024)) // 500 MB + .execute(); +``` + +The `files` metadata table is useful for inspecting data file sizes and determining when to compact partitions. + +See the [`RewriteDataFiles` Javadoc](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/actions/RewriteDataFiles.html) to see more configuration options. + +### Rewrite manifests + +Iceberg uses metadata in its manifest list and manifest files to speed up query planning and to prune unnecessary data files. The metadata tree functions as an index over a table's data. + +Manifests in the metadata tree are automatically compacted in the order they are added, which makes queries faster when the write pattern aligns with read filters. For example, writing hourly-partitioned data as it arrives is aligned with time range query filters. + +When a table's write pattern doesn't align with the query pattern, metadata can be rewritten to re-group data files into manifests using `rewriteManifests` or the `rewriteManifests` action (for parallel rewrites using Spark). + +This example rewrites small manifests and groups data files by the first partition field. + +```java +Table table = ... +SparkActions + .get() + .rewriteManifests(table) + .rewriteIf(file -> file.length() < 10 * 1024 * 1024) // 10 MB + .execute(); +``` + +See the [`RewriteManifests` Javadoc](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/actions/RewriteManifests.html) to see more configuration options. diff --git a/docs-new/home/docs/latest/mkdocs.yml b/docs-new/home/docs/latest/mkdocs.yml new file mode 100644 index 000000000000..4e3012528ff5 --- /dev/null +++ b/docs-new/home/docs/latest/mkdocs.yml @@ -0,0 +1,71 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +site_name: docs/latest +docs_dir: . + +plugins: + - search + +nav: + - index.md + - Tables: + - branching.md + - configuration.md + - evolution.md + - maintenance.md + - partitioning.md + - performance.md + - reliability.md + - schemas.md + - Spark: + - spark-getting-started.md + - spark-configuration.md + - spark-ddl.md + - spark-procedures.md + - spark-queries.md + - spark-structured-streaming.md + - spark-writes.md + - Flink: + - flink.md + - flink-connector.md + - flink-ddl.md + - flink-queries.md + - flink-writes.md + - flink-actions.md + - flink-configuration.md + - hive.md + - Trino: https://trino.io/docs/current/connector/iceberg.html + - Clickhouse: https://clickhouse.com/docs/en/engines/table-engines/integrations/iceberg + - Presto: https://prestodb.io/docs/current/connector/iceberg.html + - Dremio: https://docs.dremio.com/data-formats/apache-iceberg/ + - Starrocks: https://docs.starrocks.io/en-us/latest/data_source/catalog/iceberg_catalog + - Amazon Athena: https://docs.aws.amazon.com/athena/latest/ug/querying-iceberg.html + - Amazon EMR: https://docs.aws.amazon.com/emr/latest/ReleaseGuide/emr-iceberg-use-cluster.html + - Impala: https://impala.apache.org/docs/build/html/topics/impala_iceberg.html + - Doris: https://doris.apache.org/docs/dev/lakehouse/multi-catalog/iceberg + 
- Integrations: + - aws.md + - dell.md + - jdbc.md + - nessie.md + - API: + - java-api-quickstart.md + - api.md + - custom-catalog.md + - Javadoc: ../../javadoc/latest/ + - PyIceberg: https://py.iceberg.apache.org/ diff --git a/docs-new/home/docs/latest/nessie.md b/docs-new/home/docs/latest/nessie.md new file mode 100644 index 000000000000..809a3309eb81 --- /dev/null +++ b/docs-new/home/docs/latest/nessie.md @@ -0,0 +1,160 @@ +--- +title: "Nessie" +--- + + +# Iceberg Nessie Integration + +Iceberg provides integration with Nessie through the `iceberg-nessie` module. +This section describes how to use Iceberg with Nessie. Nessie provides several key features on top of Iceberg: + +* multi-table transactions +* git-like operations (eg branches, tags, commits) +* hive-like metastore capabilities + +See [Project Nessie](https://projectnessie.org) for more information on Nessie. Nessie requires a server to run, see +[Getting Started](https://projectnessie.org/try/) to start a Nessie server. + +## Enabling Nessie Catalog + +The `iceberg-nessie` module is bundled with Spark and Flink runtimes for all versions from `0.11.0`. To get started +with Nessie (with spark-3.3) and Iceberg simply add the Iceberg runtime to your process. Eg: `spark-sql --packages +org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:{{ icebergVersion }}`. + +## Spark SQL Extensions + +From Spark 3.1 and above, Nessie SQL extensions can be used to manage the Nessie repo as shown below. +Example for Spark 3.3 with scala 2.12: + +``` +bin/spark-sql + --packages "org.apache.iceberg:iceberg-spark-runtime-3.3_2.12:{{ icebergVersion }},org.projectnessie.nessie-integrations:nessie-spark-extensions-3.3_2.12:{{ nessieVersion }}" + --conf spark.sql.extensions="org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions" + --conf +``` +Please refer to the [Nessie SQL extension document](https://projectnessie.org/tools/sql/) to learn more about it. 
+ +## Nessie Catalog + +One major feature introduced in release `0.11.0` is the ability to easily interact with a [Custom Catalog](custom-catalog.md) from Spark and Flink. See [Spark Configuration](spark-configuration.md#catalog-configuration) + and [Flink Configuration](flink.md#custom-catalog) for instructions for adding a custom catalog to Iceberg. + +To use the Nessie Catalog the following properties are required: + +* `warehouse`. Like most other catalogs the warehouse property is a file path to where this catalog should store tables. +* `uri`. This is the Nessie server base uri. Eg `http://localhost:19120/api/v1`. +* `ref` (optional). This is the Nessie branch or tag you want to work in. + +To run directly in Java this looks like: + +``` java +Map options = new HashMap<>(); +options.put("warehouse", "/path/to/warehouse"); +options.put("ref", "main"); +options.put("uri", "https://localhost:19120/api/v1"); +Catalog nessieCatalog = CatalogUtil.loadCatalog("org.apache.iceberg.nessie.NessieCatalog", "nessie", options, hadoopConfig); +``` + +and in Spark: + +``` java +conf.set("spark.sql.catalog.nessie.warehouse", "/path/to/warehouse"); +conf.set("spark.sql.catalog.nessie.uri", "http://localhost:19120/api/v1") +conf.set("spark.sql.catalog.nessie.ref", "main") +conf.set("spark.sql.catalog.nessie.catalog-impl", "org.apache.iceberg.nessie.NessieCatalog") +conf.set("spark.sql.catalog.nessie", "org.apache.iceberg.spark.SparkCatalog") +conf.set("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions,org.projectnessie.spark.extensions.NessieSparkSessionExtensions") +``` +This is how it looks in Flink via the Python API (additional details can be found [here](flink.md#preparation-when-using-flinks-python-api)): +```python +import os +from pyflink.datastream import StreamExecutionEnvironment +from pyflink.table import StreamTableEnvironment + +env = StreamExecutionEnvironment.get_execution_environment() +iceberg_flink_runtime_jar = 
os.path.join(os.getcwd(), "iceberg-flink-runtime-{{ icebergVersion }}.jar") +env.add_jars("file://{}".format(iceberg_flink_runtime_jar)) +table_env = StreamTableEnvironment.create(env) + +table_env.execute_sql("CREATE CATALOG nessie_catalog WITH (" + "'type'='iceberg', " + "'catalog-impl'='org.apache.iceberg.nessie.NessieCatalog', " + "'uri'='http://localhost:19120/api/v1', " + "'ref'='main', " + "'warehouse'='/path/to/warehouse')") +``` + +There is nothing special above about the `nessie` name. A spark catalog can have any name, the important parts are the +settings for the `catalog-impl` and the required config to start Nessie correctly. +Once you have a Nessie catalog you have access to your entire Nessie repo. You can then perform create/delete/merge +operations on branches and perform commits on branches. Each Iceberg table in a Nessie Catalog is identified by an +arbitrary length namespace and table name (eg `data.base.name.table`). These namespaces are implicit and don't need to +be created separately. Any transaction on a Nessie enabled Iceberg table is a single commit in Nessie. Nessie commits +can encompass an arbitrary number of actions on an arbitrary number of tables, however in Iceberg this will be limited +to the set of single table transactions currently available. + +Further operations such as merges, viewing the commit log or diffs are performed by direct interaction with the +`NessieClient` in java or by using the python client or cli. See [Nessie CLI](https://projectnessie.org/tools/cli/) for +more details on the CLI and [Spark Guide](https://projectnessie.org/tools/iceberg/spark/) for a more complete description of +Nessie functionality. + +## Nessie and Iceberg + +For most cases Nessie acts just like any other Catalog for Iceberg: providing a logical organization of a set of tables +and providing atomicity to transactions. However, using Nessie opens up other interesting possibilities. 
When using Nessie with +Iceberg every Iceberg transaction becomes a Nessie commit. This history can be listed, merged or cherry-picked across branches. + +### Loosely coupled transactions + +By creating a branch and performing a set of operations on that branch you can approximate a multi-table transaction. +A sequence of commits can be performed on the newly created branch and then merged back into the main branch atomically. +This gives the appearance of a series of connected changes being exposed to the main branch simultaneously. While downstream +consumers will see multiple transactions appear at once this isn't a true multi-table transaction on the database. It is +effectively a fast-forward merge of multiple commits (in git language) and each operation from the branch is its own distinct +transaction and commit. This is different from a real multi-table transaction where all changes would be in the same commit. +This does allow multiple applications to take part in modifying a branch and for this distributed set of transactions to be +exposed to the downstream users simultaneously. + + +### Experimentation + +Changes to a table can be tested in a branch before merging back into main. This is particularly useful when performing +large changes like schema evolution or partition evolution. A partition evolution could be performed in a branch and you +would be able to test out the change (eg performance benchmarks) before merging it. This provides great flexibility in +performing on-line table modifications and testing without interrupting downstream use cases. If the changes are +incorrect or not performant the branch can be dropped without being merged. + +### Further use cases + +Please see the [Nessie Documentation](https://projectnessie.org/features/) for further descriptions of +Nessie features. + +!!! danger + Regular table maintenance in Iceberg is complicated when using nessie. 
Please consult + [Management Services](https://projectnessie.org/features/management/) before performing any + [table maintenance](maintenance.md). + + +## Example + +Please have a look at the [Nessie Demos repo](https://github.com/projectnessie/nessie-demos) +for different examples of Nessie and Iceberg in action together. + +## Future Improvements + +* Iceberg multi-table transactions. Changes to multiple Iceberg tables in the same transaction, isolation levels etc diff --git a/docs-new/home/docs/latest/partitioning.md b/docs-new/home/docs/latest/partitioning.md new file mode 100644 index 000000000000..757daebd515e --- /dev/null +++ b/docs-new/home/docs/latest/partitioning.md @@ -0,0 +1,95 @@ +--- +title: Partitioning +--- + + +# Partitioning + +## What is partitioning? + +Partitioning is a way to make queries faster by grouping similar rows together when writing. + +For example, queries for log entries from a `logs` table would usually include a time range, like this query for logs between 10 AM and 12 PM: + +```sql +SELECT level, message FROM logs +WHERE event_time BETWEEN '2018-12-01 10:00:00' AND '2018-12-01 12:00:00' +``` + +Configuring the `logs` table to partition by the date of `event_time` will group log events into files with the same event date. Iceberg keeps track of that date and will use it to skip files for other dates that don't have useful data. + +Iceberg can partition timestamps by year, month, day, and hour granularity. It can also use a categorical column, like `level` in this logs example, to store rows together and speed up queries. + + +## What does Iceberg do differently? + +Other table formats like Hive support partitioning, but Iceberg supports *hidden partitioning*. + +* Iceberg handles the tedious and error-prone task of producing partition values for rows in a table. +* Iceberg avoids reading unnecessary partitions automatically. Consumers don't need to know how the table is partitioned and add extra filters to their queries. 
+* Iceberg partition layouts can evolve as needed. + +### Partitioning in Hive + +To demonstrate the difference, consider how Hive would handle a `logs` table. + +In Hive, partitions are explicit and appear as a column, so the `logs` table would have a column called `event_date`. When writing, an insert needs to supply the data for the `event_date` column: + +```sql +INSERT INTO logs PARTITION (event_date) + SELECT level, message, event_time, format_time(event_time, 'YYYY-MM-dd') + FROM unstructured_log_source +``` + +Similarly, queries that search through the `logs` table must have an `event_date` filter in addition to an `event_time` filter. + +```sql +SELECT level, count(1) as count FROM logs +WHERE event_time BETWEEN '2018-12-01 10:00:00' AND '2018-12-01 12:00:00' + AND event_date = '2018-12-01' +``` + +If the `event_date` filter were missing, Hive would scan through every file in the table because it doesn't know that the `event_time` column is related to the `event_date` column. + +### Problems with Hive partitioning + +Hive must be given partition values. In the logs example, it doesn't know the relationship between `event_time` and `event_date`. 
+ +This leads to several problems: + +* Hive can't validate partition values -- it is up to the writer to produce the correct value + - Using the wrong format, `2018-12-01` instead of `20181201`, produces silently incorrect results, not query failures + - Using the wrong source column, like `processing_time`, or time zone also causes incorrect results, not failures +* It is up to the user to write queries correctly + - Using the wrong format also leads to silently incorrect results + - Users that don't understand a table's physical layout get needlessly slow queries -- Hive can't translate filters automatically +* Working queries are tied to the table's partitioning scheme, so partitioning configuration cannot be changed without breaking queries + +### Iceberg's hidden partitioning + +Iceberg produces partition values by taking a column value and optionally transforming it. Iceberg is responsible for converting `event_time` into `event_date`, and keeps track of the relationship. + +Table partitioning is configured using these relationships. The `logs` table would be partitioned by `date(event_time)` and `level`. + +Because Iceberg doesn't require user-maintained partition columns, it can hide partitioning. Partition values are produced correctly every time and always used to speed up queries, when possible. Producers and consumers wouldn't even see `event_date`. + +Most importantly, queries no longer depend on a table's physical layout. With a separation between physical and logical, Iceberg tables can evolve partition schemes over time as data volume changes. Misconfigured tables can be fixed without an expensive migration. + +For details about all the supported hidden partition transformations, see the [Partition Transforms](../../spec.md#partition-transforms) section. + +For details about updating a table's partition spec, see the [partition evolution](evolution.md#partition-evolution) section. 
diff --git a/docs-new/home/docs/latest/performance.md b/docs-new/home/docs/latest/performance.md new file mode 100644 index 000000000000..cbe870347a1d --- /dev/null +++ b/docs-new/home/docs/latest/performance.md @@ -0,0 +1,55 @@ +--- +title: Performance +--- + + +# Performance + +* Iceberg is designed for huge tables and is used in production where a *single table* can contain tens of petabytes of data. +* Even multi-petabyte tables can be read from a single node, without needing a distributed SQL engine to sift through table metadata. + +## Scan planning + +Scan planning is the process of finding the files in a table that are needed for a query. + +Planning in an Iceberg table fits on a single node because Iceberg's metadata can be used to prune *metadata* files that aren't needed, in addition to filtering *data* files that don't contain matching data. + +Fast scan planning from a single node enables: + +* Lower latency SQL queries -- by eliminating a distributed scan to plan a distributed scan +* Access from any client -- stand-alone processes can read data directly from Iceberg tables + +### Metadata filtering + +Iceberg uses two levels of metadata to track the files in a snapshot. + +* **Manifest files** store a list of data files, along with each data file's partition data and column-level stats +* A **manifest list** stores the snapshot's list of manifests, along with the range of values for each partition field + +For fast scan planning, Iceberg first filters manifests using the partition value ranges in the manifest list. Then, it reads each manifest to get data files. With this scheme, the manifest list acts as an index over the manifest files, making it possible to plan without reading all manifests. + +In addition to partition value ranges, a manifest list also stores the number of files added or deleted in a manifest to speed up operations like snapshot expiration. 
+ +### Data filtering + +Manifest files include a tuple of partition data and column-level stats for each data file. + +During planning, query predicates are automatically converted to predicates on the partition data and applied first to filter data files. Next, column-level value counts, null counts, lower bounds, and upper bounds are used to eliminate files that cannot match the query predicate. + +By using upper and lower bounds to filter data files at planning time, Iceberg uses clustered data to eliminate splits without running tasks. In some cases, this is a [10x performance improvement](https://conferences.oreilly.com/strata/strata-ny-2018/cdn.oreillystatic.com/en/assets/1/event/278/Introducing%20Iceberg_%20Tables%20designed%20for%20object%20stores%20Presentation.pdf +). diff --git a/docs-new/home/docs/latest/reliability.md b/docs-new/home/docs/latest/reliability.md new file mode 100644 index 000000000000..7628b017c872 --- /dev/null +++ b/docs-new/home/docs/latest/reliability.md @@ -0,0 +1,68 @@ +--- +title: Reliability +--- + + +# Reliability + +Iceberg was designed to solve correctness problems that affect Hive tables running in S3. + +Hive tables track data files using both a central metastore for partitions and a file system for individual files. This makes atomic changes to a table's contents impossible, and eventually consistent stores like S3 may return incorrect results due to the use of listing files to reconstruct the state of a table. It also requires job planning to make many slow listing calls: O(n) with the number of partitions. + +Iceberg tracks the complete list of data files in each [snapshot](../../terms.md#snapshot) using a persistent tree structure. Every write or delete produces a new snapshot that reuses as much of the previous snapshot's metadata tree as possible to avoid high write volumes. + +Valid snapshots in an Iceberg table are stored in the table metadata file, along with a reference to the current snapshot. 
Commits replace the path of the current table metadata file using an atomic operation. This ensures that all updates to table data and metadata are atomic, and is the basis for [serializable isolation](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable). + +This results in improved reliability guarantees: + +* **Serializable isolation**: All table changes occur in a linear history of atomic table updates +* **Reliable reads**: Readers always use a consistent snapshot of the table without holding a lock +* **Version history and rollback**: Table snapshots are kept as history and tables can roll back if a job produces bad data +* **Safe file-level operations**. By supporting atomic changes, Iceberg enables new use cases, like safely compacting small files and safely appending late data to tables + +This design also has performance benefits: + +* **O(1) RPCs to plan**: Instead of listing O(n) directories in a table to plan a job, reading a snapshot requires O(1) RPC calls +* **Distributed planning**: File pruning and predicate push-down is distributed to jobs, removing the metastore as a bottleneck +* **Finer granularity partitioning**: Distributed planning and O(1) RPC calls remove the current barriers to finer-grained partitioning + + +## Concurrent write operations + +Iceberg supports multiple concurrent writes using optimistic concurrency. + +Each writer assumes that no other writers are operating and writes out new table metadata for an operation. Then, the writer attempts to commit by atomically swapping the new table metadata file for the existing metadata file. + +If the atomic swap fails because another writer has committed, the failed writer retries by writing a new metadata tree based on the new current table state. + +### Cost of retries + +Writers avoid expensive retry operations by structuring changes so that work can be reused across retries. 
+ +For example, appends usually create a new manifest file for the appended data files, which can be added to the table without rewriting the manifest on every attempt. + +### Retry validation + +Commits are structured as assumptions and actions. After a conflict, a writer checks that the assumptions are met by the current table state. If the assumptions are met, then it is safe to re-apply the actions and commit. + +For example, a compaction might rewrite `file_a.avro` and `file_b.avro` as `merged.parquet`. This is safe to commit as long as the table still contains both `file_a.avro` and `file_b.avro`. If either file was deleted by a conflicting commit, then the operation must fail. Otherwise, it is safe to remove the source files and add the merged file. + + +## Compatibility + +By avoiding file listing and rename operations, Iceberg tables are compatible with any object store. No consistent listing is required. diff --git a/docs-new/home/docs/latest/schemas.md b/docs-new/home/docs/latest/schemas.md new file mode 100644 index 000000000000..a22c9ba7a331 --- /dev/null +++ b/docs-new/home/docs/latest/schemas.md @@ -0,0 +1,44 @@ +--- +title: Schemas +--- + + +# Schemas + +Iceberg tables support the following types: + +| Type | Description | Notes | +|--------------------|--------------------------------------------------------------------------|--------------------------------------------------| +| **`boolean`** | True or false | | +| **`int`** | 32-bit signed integers | Can promote to `long` | +| **`long`** | 64-bit signed integers | | +| **`float`** | [32-bit IEEE 754](https://en.wikipedia.org/wiki/IEEE_754) floating point | Can promote to `double` | +| **`double`** | [64-bit IEEE 754](https://en.wikipedia.org/wiki/IEEE_754) floating point | | +| **`decimal(P,S)`** | Fixed-point decimal; precision P, scale S | Scale is fixed and precision must be 38 or less | +| **`date`** | Calendar date without timezone or time | | +| **`time`** | Time of day without date, 
timezone | Stored as microseconds | +| **`timestamp`** | Timestamp without timezone | Stored as microseconds | +| **`timestamptz`** | Timestamp with timezone | Stored as microseconds | +| **`string`** | Arbitrary-length character sequences | Encoded with UTF-8 | +| **`fixed(L)`** | Fixed-length byte array of length L | | +| **`binary`** | Arbitrary-length byte array | | +| **`struct<...>`** | A record with named fields of any data type | | +| **`list`** | A list with elements of any data type | | +| **`map`** | A map with keys and values of any data type | | + +Iceberg tracks each field in a table schema using an ID that is never reused in a table. See [correctness guarantees](evolution.md#correctness) for more information. diff --git a/docs-new/home/docs/latest/spark-configuration.md b/docs-new/home/docs/latest/spark-configuration.md new file mode 100644 index 000000000000..a5447dccdacd --- /dev/null +++ b/docs-new/home/docs/latest/spark-configuration.md @@ -0,0 +1,188 @@ +--- +title: "Configuration" +--- + + +# Spark Configuration + +## Catalogs + +Spark adds an API to plug in table catalogs that are used to load, create, and manage Iceberg tables. Spark catalogs are configured by setting Spark properties under `spark.sql.catalog`. 
+ +This creates an Iceberg catalog named `hive_prod` that loads tables from a Hive metastore: + +```plain +spark.sql.catalog.hive_prod = org.apache.iceberg.spark.SparkCatalog +spark.sql.catalog.hive_prod.type = hive +spark.sql.catalog.hive_prod.uri = thrift://metastore-host:port +# omit uri to use the same URI as Spark: hive.metastore.uris in hive-site.xml +``` + +Below is an example for a REST catalog named `rest_prod` that loads tables from REST URL `http://localhost:8080`: + +```plain +spark.sql.catalog.rest_prod = org.apache.iceberg.spark.SparkCatalog +spark.sql.catalog.rest_prod.type = rest +spark.sql.catalog.rest_prod.uri = http://localhost:8080 +``` + +Iceberg also supports a directory-based catalog in HDFS that can be configured using `type=hadoop`: + +```plain +spark.sql.catalog.hadoop_prod = org.apache.iceberg.spark.SparkCatalog +spark.sql.catalog.hadoop_prod.type = hadoop +spark.sql.catalog.hadoop_prod.warehouse = hdfs://nn:8020/warehouse/path +``` + +!!! info + The Hive-based catalog only loads Iceberg tables. To load non-Iceberg tables in the same Hive metastore, use a [session catalog](#replacing-the-session-catalog). + + +### Catalog configuration + +A catalog is created and named by adding a property `spark.sql.catalog.(catalog-name)` with an implementation class for its value. + +Iceberg supplies two implementations: + +* `org.apache.iceberg.spark.SparkCatalog` supports a Hive Metastore or a Hadoop warehouse as a catalog +* `org.apache.iceberg.spark.SparkSessionCatalog` adds support for Iceberg tables to Spark's built-in catalog, and delegates to the built-in catalog for non-Iceberg tables + +Both catalogs are configured using properties nested under the catalog name. 
Common configuration properties for Hive and Hadoop are: + +| Property | Values | Description | +| -------------------------------------------------- | ----------------------------- | -------------------------------------------------------------------- | +| spark.sql.catalog._catalog-name_.type | `hive`, `hadoop` or `rest` | The underlying Iceberg catalog implementation, `HiveCatalog`, `HadoopCatalog`, `RESTCatalog` or left unset if using a custom catalog | +| spark.sql.catalog._catalog-name_.catalog-impl | | The custom Iceberg catalog implementation. If `type` is null, `catalog-impl` must not be null. | +| spark.sql.catalog._catalog-name_.io-impl | | The custom FileIO implementation. | +| spark.sql.catalog._catalog-name_.metrics-reporter-impl | | The custom MetricsReporter implementation. | +| spark.sql.catalog._catalog-name_.default-namespace | default | The default current namespace for the catalog | +| spark.sql.catalog._catalog-name_.uri | thrift://host:port | Hive metastore URL for hive typed catalog, REST URL for REST typed catalog | +| spark.sql.catalog._catalog-name_.warehouse | hdfs://nn:8020/warehouse/path | Base path for the warehouse directory | +| spark.sql.catalog._catalog-name_.cache-enabled | `true` or `false` | Whether to enable catalog cache, default value is `true` | +| spark.sql.catalog._catalog-name_.cache.expiration-interval-ms | `30000` (30 seconds) | Duration after which cached catalog entries are expired; Only effective if `cache-enabled` is `true`. `-1` disables cache expiration and `0` disables caching entirely, irrespective of `cache-enabled`. 
Default is `30000` (30 seconds) | +| spark.sql.catalog._catalog-name_.table-default._propertyKey_ | | Default Iceberg table property value for property key _propertyKey_, which will be set on tables created by this catalog if not overridden | +| spark.sql.catalog._catalog-name_.table-override._propertyKey_ | | Enforced Iceberg table property value for property key _propertyKey_, which cannot be overridden by the user | + +Additional properties can be found in common [catalog configuration](configuration.md#catalog-properties). + + +### Using catalogs + +Catalog names are used in SQL queries to identify a table. In the examples above, `hive_prod` and `hadoop_prod` can be used to prefix database and table names that will be loaded from those catalogs. + +```sql +SELECT * FROM hive_prod.db.table -- load db.table from catalog hive_prod +``` + +Spark 3 keeps track of the current catalog and namespace, which can be omitted from table names. + +```sql +USE hive_prod.db; +SELECT * FROM table -- load db.table from catalog hive_prod +``` + +To see the current catalog and namespace, run `SHOW CURRENT NAMESPACE`. + +### Replacing the session catalog + +To add Iceberg table support to Spark's built-in catalog, configure `spark_catalog` to use Iceberg's `SparkSessionCatalog`. + +```plain +spark.sql.catalog.spark_catalog = org.apache.iceberg.spark.SparkSessionCatalog +spark.sql.catalog.spark_catalog.type = hive +``` + +Spark's built-in catalog supports existing v1 and v2 tables tracked in a Hive Metastore. This configures Spark to use Iceberg's `SparkSessionCatalog` as a wrapper around that session catalog. When a table is not an Iceberg table, the built-in catalog will be used to load it instead. + +This configuration can use the same Hive Metastore for both Iceberg and non-Iceberg tables. 
+ +### Using catalog specific Hadoop configuration values + +Similar to configuring Hadoop properties by using `spark.hadoop.*`, it's possible to set per-catalog Hadoop configuration values when using Spark by adding the property for the catalog with the prefix `spark.sql.catalog.(catalog-name).hadoop.*`. These properties will take precedence over values configured globally using `spark.hadoop.*` and will only affect Iceberg tables. + +```plain +spark.sql.catalog.hadoop_prod.hadoop.fs.s3a.endpoint = http://aws-local:9000 +``` + +### Loading a custom catalog + +Spark supports loading a custom Iceberg `Catalog` implementation by specifying the `catalog-impl` property. Here is an example: + +```plain +spark.sql.catalog.custom_prod = org.apache.iceberg.spark.SparkCatalog +spark.sql.catalog.custom_prod.catalog-impl = com.my.custom.CatalogImpl +spark.sql.catalog.custom_prod.my-additional-catalog-config = my-value +``` + +## SQL Extensions + +Iceberg 0.11.0 and later add an extension module to Spark to add new SQL commands, like `CALL` for stored procedures or `ALTER TABLE ... WRITE ORDERED BY`. 
+ +Using those SQL commands requires adding Iceberg extensions to your Spark environment using the following Spark property: + + +| Spark extensions property | Iceberg extensions implementation | +|---------------------------|---------------------------------------------------------------------| +| `spark.sql.extensions` | `org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions` | + +## Runtime configuration + +### Read options + +Spark read options are passed when configuring the DataFrameReader, like this: + +```scala +// time travel +spark.read + .option("snapshot-id", 10963874102873L) + .table("catalog.db.table") +``` + +| Spark option | Default | Description | +| --------------- | --------------------- | ----------------------------------------------------------------------------------------- | +| snapshot-id | (latest) | Snapshot ID of the table snapshot to read | +| as-of-timestamp | (latest) | A timestamp in milliseconds; the snapshot used will be the snapshot current at this time. 
| +| split-size | As per table property | Overrides this table's read.split.target-size and read.split.metadata-target-size | +| lookback | As per table property | Overrides this table's read.split.planning-lookback | +| file-open-cost | As per table property | Overrides this table's read.split.open-file-cost | +| vectorization-enabled | As per table property | Overrides this table's read.parquet.vectorization.enabled | +| batch-size | As per table property | Overrides this table's read.parquet.vectorization.batch-size | +| stream-from-timestamp | (none) | A timestamp in milliseconds to stream from; if before the oldest known ancestor snapshot, the oldest will be used | + +### Write options + +Spark write options are passed when configuring the DataFrameWriter, like this: + +```scala +// write with Avro instead of Parquet +df.write + .option("write-format", "avro") + .option("snapshot-property.key", "value") + .insertInto("catalog.db.table") +``` + +| Spark option | Default | Description | +| ---------------------- | -------------------------- | ------------------------------------------------------------ | +| write-format | Table write.format.default | File format to use for this write operation; parquet, avro, or orc | +| target-file-size-bytes | As per table property | Overrides this table's write.target-file-size-bytes | +| check-nullability | true | Sets the nullable check on fields | +| snapshot-property._custom-key_ | null | Adds an entry with custom-key and corresponding value in the snapshot summary | +| fanout-enabled | false | Overrides this table's write.spark.fanout.enabled | +| check-ordering | true | Checks if input schema and table schema are same | +| isolation-level | null | Desired isolation level for Dataframe overwrite operations. `null` => no checks (for idempotent writes), `serializable` => check for concurrent inserts or deletes in destination partitions, `snapshot` => checks for concurrent deletes in destination partitions. 
| +| validate-from-snapshot-id | null | If isolation level is set, id of base snapshot from which to check concurrent write conflicts into a table. Should be the snapshot before any reads from the table. Can be obtained via [Table API](api.md#table-metadata) or [Snapshots table](spark-queries.md#snapshots). If null, the table's oldest known snapshot is used. | diff --git a/docs-new/home/docs/latest/spark-ddl.md b/docs-new/home/docs/latest/spark-ddl.md new file mode 100644 index 000000000000..8e84a98b3938 --- /dev/null +++ b/docs-new/home/docs/latest/spark-ddl.md @@ -0,0 +1,521 @@ +--- +title: "DDL" +--- + + +# Spark DDL + +To use Iceberg in Spark, first configure [Spark catalogs](spark-configuration.md). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. + +## `CREATE TABLE` + +Spark 3 can create tables in any Iceberg catalog with the clause `USING iceberg`: + +```sql +CREATE TABLE prod.db.sample ( + id bigint COMMENT 'unique id', + data string) +USING iceberg +``` + +Iceberg will convert the column type in Spark to the corresponding Iceberg type. See the section on [type compatibility on creating table](spark-writes.md#spark-type-to-iceberg-type) for details. + +Table create commands, including CTAS and RTAS, support the full range of Spark create clauses, including: + +* `PARTITIONED BY (partition-expressions)` to configure partitioning +* `LOCATION '(fully-qualified-uri)'` to set the table location +* `COMMENT 'table documentation'` to set a table description +* `TBLPROPERTIES ('key'='value', ...)` to set [table configuration](configuration.md) + +Create commands may also set the default format with the `USING` clause. This is only supported for `SparkCatalog` because Spark handles the `USING` clause differently for the built-in catalog. 
+ +### `PARTITIONED BY` + +To create a partitioned table, use `PARTITIONED BY`: + +```sql +CREATE TABLE prod.db.sample ( + id bigint, + data string, + category string) +USING iceberg +PARTITIONED BY (category) +``` + +The `PARTITIONED BY` clause supports transform expressions to create [hidden partitions](partitioning.md). + +```sql +CREATE TABLE prod.db.sample ( + id bigint, + data string, + category string, + ts timestamp) +USING iceberg +PARTITIONED BY (bucket(16, id), days(ts), category) +``` + +Supported transformations are: + +* `years(ts)`: partition by year +* `months(ts)`: partition by month +* `days(ts)` or `date(ts)`: equivalent to dateint partitioning +* `hours(ts)` or `date_hour(ts)`: equivalent to dateint and hour partitioning +* `bucket(N, col)`: partition by hashed value mod N buckets +* `truncate(L, col)`: partition by value truncated to L + * Strings are truncated to the given length + * Integers and longs truncate to bins: `truncate(10, i)` produces partitions 0, 10, 20, 30, ... + +## `CREATE TABLE ... AS SELECT` + +Iceberg supports CTAS as an atomic operation when using a [`SparkCatalog`](spark-configuration.md#catalog-configuration). CTAS is supported, but is not atomic when using [`SparkSessionCatalog`](spark-configuration.md#replacing-the-session-catalog). + +```sql +CREATE TABLE prod.db.sample +USING iceberg +AS SELECT ... +``` + +The newly created table won't inherit the partition spec and table properties from the source table in the `SELECT`. Use `PARTITIONED BY` and `TBLPROPERTIES` in CTAS to declare the partition spec and table properties for the new table. + +```sql +CREATE TABLE prod.db.sample +USING iceberg +PARTITIONED BY (part) +TBLPROPERTIES ('key'='value') +AS SELECT ... +``` + +## `REPLACE TABLE ... AS SELECT` + +Iceberg supports RTAS as an atomic operation when using a [`SparkCatalog`](spark-configuration.md#catalog-configuration). 
RTAS is supported, but is not atomic when using [`SparkSessionCatalog`](spark-configuration.md#replacing-the-session-catalog). + +Atomic table replacement creates a new snapshot with the results of the `SELECT` query, but keeps table history. + +```sql +REPLACE TABLE prod.db.sample +USING iceberg +AS SELECT ... +``` +```sql +REPLACE TABLE prod.db.sample +USING iceberg +PARTITIONED BY (part) +TBLPROPERTIES ('key'='value') +AS SELECT ... +``` +```sql +CREATE OR REPLACE TABLE prod.db.sample +USING iceberg +AS SELECT ... +``` + +The schema and partition spec will be replaced if changed. To avoid modifying the table's schema and partitioning, use `INSERT OVERWRITE` instead of `REPLACE TABLE`. +The new table properties in the `REPLACE TABLE` command will be merged with any existing table properties. Existing table properties are updated if changed; otherwise they are preserved. + +## `DROP TABLE` + +The drop table behavior changed in 0.14. + +Prior to 0.14, running `DROP TABLE` would remove the table from the catalog and delete the table contents as well. + +From 0.14 onwards, `DROP TABLE` only removes the table from the catalog. +To delete the table contents as well, use `DROP TABLE PURGE`. 
+ +### `DROP TABLE` + +To drop the table from the catalog, run: + +```sql +DROP TABLE prod.db.sample +``` + +### `DROP TABLE PURGE` + +To drop the table from the catalog and delete the table's contents, run: + +```sql +DROP TABLE prod.db.sample PURGE +``` + +## `ALTER TABLE` + +Iceberg has full `ALTER TABLE` support in Spark 3, including: + +* Renaming a table +* Setting or removing table properties +* Adding, deleting, and renaming columns +* Adding, deleting, and renaming nested fields +* Reordering top-level columns and nested struct fields +* Widening the type of `int`, `float`, and `decimal` fields +* Making required columns optional + +In addition, [SQL extensions](spark-configuration.md#sql-extensions) can be used to add support for partition evolution and setting a table's write order + +### `ALTER TABLE ... RENAME TO` + +```sql +ALTER TABLE prod.db.sample RENAME TO prod.db.new_name +``` + +### `ALTER TABLE ... SET TBLPROPERTIES` + +```sql +ALTER TABLE prod.db.sample SET TBLPROPERTIES ( + 'read.split.target-size'='268435456' +) +``` + +Iceberg uses table properties to control table behavior. For a list of available properties, see [Table configuration](configuration.md). + +`UNSET` is used to remove properties: + +```sql +ALTER TABLE prod.db.sample UNSET TBLPROPERTIES ('read.split.target-size') +``` + +`SET TBLPROPERTIES` can also be used to set the table comment (description): + +```sql +ALTER TABLE prod.db.sample SET TBLPROPERTIES ( + 'comment' = 'A table comment.' +) +``` + +### `ALTER TABLE ... ADD COLUMN` + +To add a column to Iceberg, use the `ADD COLUMNS` clause with `ALTER TABLE`: + +```sql +ALTER TABLE prod.db.sample +ADD COLUMNS ( + new_column string comment 'new_column docs' + ) +``` + +Multiple columns can be added at the same time, separated by commas. 
+ +Nested columns should be identified using the full column name: + +```sql +-- create a struct column +ALTER TABLE prod.db.sample +ADD COLUMN point struct<x: double, y: double>; + +-- add a field to the struct +ALTER TABLE prod.db.sample +ADD COLUMN point.z double +``` + +```sql +-- create a nested array column of struct +ALTER TABLE prod.db.sample +ADD COLUMN points array<struct<x: double, y: double>>; + +-- add a field to the struct within an array. Using keyword 'element' to access the array's element column. +ALTER TABLE prod.db.sample +ADD COLUMN points.element.z double +``` + +```sql +-- create a map column of struct key and struct value +ALTER TABLE prod.db.sample +ADD COLUMN points map<struct<x: int>, struct<a: int>>; + +-- add a field to the value struct in a map. Using keyword 'value' to access the map's value column. +ALTER TABLE prod.db.sample +ADD COLUMN points.value.b int +``` + +Note: Altering a map 'key' column by adding columns is not allowed. Only map values can be updated. + +Add columns in any position by adding `FIRST` or `AFTER` clauses: + +```sql +ALTER TABLE prod.db.sample +ADD COLUMN new_column bigint AFTER other_column +``` + +```sql +ALTER TABLE prod.db.sample +ADD COLUMN nested.new_column bigint FIRST +``` + +### `ALTER TABLE ... RENAME COLUMN` + +Iceberg allows any field to be renamed. To rename a field, use `RENAME COLUMN`: + +```sql +ALTER TABLE prod.db.sample RENAME COLUMN data TO payload +ALTER TABLE prod.db.sample RENAME COLUMN location.lat TO latitude +``` + +Note that nested rename commands only rename the leaf field. The above command renames `location.lat` to `location.latitude`. + +### `ALTER TABLE ... ALTER COLUMN` + +Alter column is used to widen types, make a field optional, set comments, and reorder fields. + +Iceberg allows updating column types if the update is safe. 
Safe updates are: + +* `int` to `bigint` +* `float` to `double` +* `decimal(P,S)` to `decimal(P2,S)` when P2 > P (scale cannot change) + +```sql +ALTER TABLE prod.db.sample ALTER COLUMN measurement TYPE double +``` + +To add or remove columns from a struct, use `ADD COLUMN` or `DROP COLUMN` with a nested column name. + +Column comments can also be updated using `ALTER COLUMN`: + +```sql +ALTER TABLE prod.db.sample ALTER COLUMN measurement TYPE double COMMENT 'unit is bytes per second' +ALTER TABLE prod.db.sample ALTER COLUMN measurement COMMENT 'unit is kilobytes per second' +``` + +Iceberg allows reordering top-level columns or columns in a struct using `FIRST` and `AFTER` clauses: + +```sql +ALTER TABLE prod.db.sample ALTER COLUMN col FIRST +``` +```sql +ALTER TABLE prod.db.sample ALTER COLUMN nested.col AFTER other_col +``` + +Nullability can be changed using `SET NOT NULL` and `DROP NOT NULL`: + +```sql +ALTER TABLE prod.db.sample ALTER COLUMN id DROP NOT NULL +``` + +!!! info + `ALTER COLUMN` is not used to update `struct` types. Use `ADD COLUMN` and `DROP COLUMN` to add or remove struct fields. + + + +### `ALTER TABLE ... DROP COLUMN` + +To drop columns, use `ALTER TABLE ... DROP COLUMN`: + +```sql +ALTER TABLE prod.db.sample DROP COLUMN id +ALTER TABLE prod.db.sample DROP COLUMN point.z +``` + +## `ALTER TABLE` SQL extensions + +These commands are available in Spark 3 when using Iceberg [SQL extensions](spark-configuration.md#sql-extensions). + +### `ALTER TABLE ... 
ADD PARTITION FIELD` + +Iceberg supports adding new partition fields to a spec using `ADD PARTITION FIELD`: + +```sql +ALTER TABLE prod.db.sample ADD PARTITION FIELD catalog -- identity transform +``` + +[Partition transforms](#partitioned-by) are also supported: + +```sql +ALTER TABLE prod.db.sample ADD PARTITION FIELD bucket(16, id) +ALTER TABLE prod.db.sample ADD PARTITION FIELD truncate(4, data) +ALTER TABLE prod.db.sample ADD PARTITION FIELD years(ts) +-- use optional AS keyword to specify a custom name for the partition field +ALTER TABLE prod.db.sample ADD PARTITION FIELD bucket(16, id) AS shard +``` + +Adding a partition field is a metadata operation and does not change any of the existing table data. New data will be written with the new partitioning, but existing data will remain in the old partition layout. Old data files will have null values for the new partition fields in metadata tables. + +Dynamic partition overwrite behavior will change when the table's partitioning changes because dynamic overwrite replaces partitions implicitly. To overwrite explicitly, use the new `DataFrameWriterV2` API. + +!!! note + To migrate from daily to hourly partitioning with transforms, it is not necessary to drop the daily partition field. Keeping the field ensures existing metadata table queries continue to work. + + +!!! danger + **Dynamic partition overwrite behavior will change** when partitioning changes + For example, if you partition by days and move to partitioning by hours, overwrites will overwrite hourly partitions but not days anymore. + + +### `ALTER TABLE ... 
DROP PARTITION FIELD` + +Partition fields can be removed using `DROP PARTITION FIELD`: + +```sql +ALTER TABLE prod.db.sample DROP PARTITION FIELD catalog +ALTER TABLE prod.db.sample DROP PARTITION FIELD bucket(16, id) +ALTER TABLE prod.db.sample DROP PARTITION FIELD truncate(4, data) +ALTER TABLE prod.db.sample DROP PARTITION FIELD years(ts) +ALTER TABLE prod.db.sample DROP PARTITION FIELD shard +``` + +Note that although the partition is removed, the column will still exist in the table schema. + +Dropping a partition field is a metadata operation and does not change any of the existing table data. New data will be written with the new partitioning, but existing data will remain in the old partition layout. + +!!! danger + **Dynamic partition overwrite behavior will change** when partitioning changes + For example, if you partition by days and move to partitioning by hours, overwrites will overwrite hourly partitions but not days anymore. + + +!!! danger + Be careful when dropping a partition field because it will change the schema of metadata tables, like `files`, and may cause metadata queries to fail or produce different results. + + +### `ALTER TABLE ... REPLACE PARTITION FIELD` + +A partition field can be replaced by a new partition field in a single metadata update by using `REPLACE PARTITION FIELD`: + +```sql +ALTER TABLE prod.db.sample REPLACE PARTITION FIELD ts_day WITH days(ts) +-- use optional AS keyword to specify a custom name for the new partition field +ALTER TABLE prod.db.sample REPLACE PARTITION FIELD ts_day WITH days(ts) AS day_of_ts +``` + +### `ALTER TABLE ... WRITE ORDERED BY` + +Iceberg tables can be configured with a sort order that is used to automatically sort data that is written to the table in some engines. For example, `MERGE INTO` in Spark will use the table ordering. 
+ +To set the write order for a table, use `WRITE ORDERED BY`: + +```sql +ALTER TABLE prod.db.sample WRITE ORDERED BY category, id +-- use optional ASC/DESC keyword to specify sort order of each field (default ASC) +ALTER TABLE prod.db.sample WRITE ORDERED BY category ASC, id DESC +-- use optional NULLS FIRST/NULLS LAST keyword to specify null order of each field (default FIRST) +ALTER TABLE prod.db.sample WRITE ORDERED BY category ASC NULLS LAST, id DESC NULLS FIRST +``` + +!!! info + Table write order does not guarantee data order for queries. It only affects how data is written to the table. + + +`WRITE ORDERED BY` sets a global ordering where rows are ordered across tasks, like using `ORDER BY` in an `INSERT` command: + +```sql +INSERT INTO prod.db.sample +SELECT id, data, category, ts FROM another_table +ORDER BY ts, category +``` + +To order within each task, not across tasks, use `LOCALLY ORDERED BY`: + +```sql +ALTER TABLE prod.db.sample WRITE LOCALLY ORDERED BY category, id +``` + +### `ALTER TABLE ... WRITE DISTRIBUTED BY PARTITION` + +`WRITE DISTRIBUTED BY PARTITION` will request that each partition is handled by one writer; the default implementation is hash distribution. + +```sql +ALTER TABLE prod.db.sample WRITE DISTRIBUTED BY PARTITION +``` + +`DISTRIBUTED BY PARTITION` and `LOCALLY ORDERED BY` may be used together, to distribute by partition and locally order rows within each task. + +```sql +ALTER TABLE prod.db.sample WRITE DISTRIBUTED BY PARTITION LOCALLY ORDERED BY category, id +``` + +### `ALTER TABLE ... SET IDENTIFIER FIELDS` + +Iceberg supports setting identifier fields to a spec using `SET IDENTIFIER FIELDS`: + +```sql +ALTER TABLE prod.db.sample SET IDENTIFIER FIELDS id +-- single column +ALTER TABLE prod.db.sample SET IDENTIFIER FIELDS id, data +-- multiple columns +``` + +Identifier fields must be `NOT NULL`. A later `ALTER` statement will overwrite the previous setting. + +### `ALTER TABLE ... 
DROP IDENTIFIER FIELDS` + +Identifier fields can be removed using `DROP IDENTIFIER FIELDS`: + +```sql +ALTER TABLE prod.db.sample DROP IDENTIFIER FIELDS id +-- single column +ALTER TABLE prod.db.sample DROP IDENTIFIER FIELDS id, data +-- multiple columns +``` + +Note that although the identifier is removed, the column will still exist in the table schema. + +### Branching and Tagging DDL + +#### `ALTER TABLE ... CREATE BRANCH` + +Branches can be created via the `CREATE BRANCH` statement, which includes +the snapshot to create the branch at and an optional retention clause. + +```sql +-- CREATE audit-branch at snapshot 1234 with default retention. +ALTER TABLE prod.db.sample CREATE BRANCH audit-branch +AS OF VERSION 1234 + +-- CREATE audit-branch at snapshot 1234, retain audit-branch for 30 days, and retain the latest 3 snapshots and 2 days worth of snapshots +ALTER TABLE prod.db.sample CREATE BRANCH audit-branch +AS OF VERSION 1234 RETAIN 30 DAYS +WITH RETENTION 3 SNAPSHOTS 2 DAYS +``` + + +#### `ALTER TABLE ... CREATE TAG` + +Tags can be created via the `CREATE TAG` statement, which includes +the snapshot to create the tag at and an optional retention clause. + +```sql +-- CREATE historical-tag at snapshot 1234 with default retention. +ALTER TABLE prod.db.sample CREATE TAG historical-tag AS OF VERSION 1234 + +-- CREATE historical-tag at snapshot 1234 and retain it for 1 year. +ALTER TABLE prod.db.sample CREATE TAG historical-tag +AS OF VERSION 1234 RETAIN 365 DAYS +``` + +#### `ALTER TABLE ... REPLACE BRANCH` + +The snapshot which a branch references can be updated via +the `REPLACE BRANCH` statement. Retention can also be updated in this statement. + +```sql +-- REPLACE audit-branch to reference snapshot 4567 and update the retention to 60 days +ALTER TABLE prod.db.sample REPLACE BRANCH audit-branch +AS OF VERSION 4567 RETAIN 60 DAYS +``` + +#### `ALTER TABLE ... 
DROP BRANCH` + +Branches can be removed via the `DROP BRANCH` statement. + +```sql +ALTER TABLE prod.db.sample DROP BRANCH audit-branch +``` + +#### `ALTER TABLE ... DROP TAG` + +Tags can be removed via the `DROP TAG` statement. + +```sql +ALTER TABLE prod.db.sample DROP TAG historical-tag +``` diff --git a/docs-new/home/docs/latest/spark-getting-started.md b/docs-new/home/docs/latest/spark-getting-started.md new file mode 100644 index 000000000000..0c2c63e0760e --- /dev/null +++ b/docs-new/home/docs/latest/spark-getting-started.md @@ -0,0 +1,138 @@ +--- +title: "Getting Started" +--- + + +# Getting Started + +The latest version of Iceberg is [{{ icebergVersion }}](../../releases.md). + +Spark is currently the most feature-rich compute engine for Iceberg operations. +We recommend getting started with Spark to understand Iceberg concepts and features with examples. +You can also view documentation for using Iceberg with other compute engines on the [Multi-Engine Support](../../multi-engine-support.md) page. + +## Using Iceberg in Spark 3 + +To use Iceberg in a Spark shell, use the `--packages` option: + +```sh +spark-shell --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{ icebergVersion }} +``` + +!!! info + If you want to include Iceberg in your Spark installation, add the [`iceberg-spark-runtime-3.2_2.12` Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.2_2.12-{{ icebergVersion }}.jar) to Spark's `jars` folder. + + +### Adding catalogs + +Iceberg comes with [catalogs](spark-configuration.md#catalogs) that enable SQL commands to manage tables and load them by name. Catalogs are configured using properties under `spark.sql.catalog.(catalog_name)`. 
+ +This command creates a path-based catalog named `local` for tables under `$PWD/warehouse` and adds support for Iceberg tables to Spark's built-in catalog: + +```sh +spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{ icebergVersion }}\ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf spark.sql.catalog.local.warehouse=$PWD/warehouse +``` + +### Creating a table + +To create your first Iceberg table in Spark, use the `spark-sql` shell or `spark.sql(...)` to run a [`CREATE TABLE`](spark-ddl.md#create-table) command: + +```sql +-- local is the path-based catalog defined above +CREATE TABLE local.db.table (id bigint, data string) USING iceberg +``` + +Iceberg catalogs support the full range of SQL DDL commands, including: + +* [`CREATE TABLE ... PARTITIONED BY`](spark-ddl.md#create-table) +* [`CREATE TABLE ... 
AS SELECT`](spark-ddl.md#create-table-as-select) +* [`ALTER TABLE`](spark-ddl.md#alter-table) +* [`DROP TABLE`](spark-ddl.md#drop-table) + +### Writing + +Once your table is created, insert data using [`INSERT INTO`](spark-writes.md#insert-into): + +```sql +INSERT INTO local.db.table VALUES (1, 'a'), (2, 'b'), (3, 'c'); +INSERT INTO local.db.table SELECT id, data FROM source WHERE length(data) = 1; +``` + +Iceberg also adds row-level SQL updates to Spark, [`MERGE INTO`](spark-writes.md#merge-into) and [`DELETE FROM`](spark-writes.md#delete-from): + +```sql +MERGE INTO local.db.target t USING (SELECT * FROM updates) u ON t.id = u.id +WHEN MATCHED THEN UPDATE SET t.count = t.count + u.count +WHEN NOT MATCHED THEN INSERT * +``` + +Iceberg supports writing DataFrames using the new [v2 DataFrame write API](spark-writes.md#writing-with-dataframes): + +```scala +spark.table("source").select("id", "data") + .writeTo("local.db.table").append() +``` + +The old `write` API is supported, but _not_ recommended. + +### Reading + +To read with SQL, use an Iceberg table name in a `SELECT` query: + +```sql +SELECT count(1) as count, data +FROM local.db.table +GROUP BY data +``` + +SQL is also the recommended way to [inspect tables](spark-queries.md#inspecting-tables). To view all of the snapshots in a table, use the `snapshots` metadata table: +```sql +SELECT * FROM local.db.table.snapshots +``` +``` ++-------------------------+----------------+-----------+-----------+----------------------------------------------------+-----+ +| committed_at | snapshot_id | parent_id | operation | manifest_list | ... | ++-------------------------+----------------+-----------+-----------+----------------------------------------------------+-----+ +| 2019-02-08 03:29:51.215 | 57897183625154 | null | append | s3://.../table/metadata/snap-57897183625154-1.avro | ... | +| | | | | | ... | +| | | | | | ... | +| ... | ... | ... | ... | ... | ... 
| ++-------------------------+----------------+-----------+-----------+----------------------------------------------------+-----+ +``` + +[DataFrame reads](spark-queries.md#querying-with-dataframes) are supported and can now reference tables by name using `spark.table`: + +```scala +val df = spark.table("local.db.table") +df.count() +``` + +### Next steps + +Next, you can learn more about Iceberg tables in Spark: + +* [DDL commands](spark-ddl.md): `CREATE`, `ALTER`, and `DROP` +* [Querying data](spark-queries.md): `SELECT` queries and metadata tables +* [Writing data](spark-writes.md): `INSERT INTO` and `MERGE INTO` +* [Maintaining tables](spark-procedures.md) with stored procedures diff --git a/docs-new/home/docs/latest/spark-procedures.md b/docs-new/home/docs/latest/spark-procedures.md new file mode 100644 index 000000000000..919b6de9451b --- /dev/null +++ b/docs-new/home/docs/latest/spark-procedures.md @@ -0,0 +1,698 @@ +--- +title: "Procedures" +--- + + +# Spark Procedures + +To use Iceberg in Spark, first configure [Spark catalogs](spark-configuration.md). Stored procedures are only available when using [Iceberg SQL extensions](spark-configuration.md#sql-extensions) in Spark 3. + +## Usage + +Procedures can be used from any configured Iceberg catalog with `CALL`. All procedures are in the namespace `system`. + +`CALL` supports passing arguments by name (recommended) or by position. Mixing position and named arguments is not supported. + +### Named arguments + +All procedure arguments are named. When passing arguments by name, arguments can be in any order and any optional argument can be omitted. + +```sql +CALL catalog_name.system.procedure_name(arg_name_2 => arg_2, arg_name_1 => arg_1) +``` + +### Positional arguments + +When passing arguments by position, only the ending arguments may be omitted if they are optional. + +```sql +CALL catalog_name.system.procedure_name(arg_1, arg_2, ... 
arg_n) +``` + +## Snapshot management + +### `rollback_to_snapshot` + +Roll back a table to a specific snapshot ID. + +To roll back to a specific time, use [`rollback_to_timestamp`](#rollback_to_timestamp). + +!!! info + This procedure invalidates all cached Spark plans that reference the affected table. + + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to update | +| `snapshot_id` | ✔️ | long | Snapshot ID to rollback to | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `previous_snapshot_id` | long | The current snapshot ID before the rollback | +| `current_snapshot_id` | long | The new current snapshot ID | + +#### Example + +Roll back table `db.sample` to snapshot ID `1`: + +```sql +CALL catalog_name.system.rollback_to_snapshot('db.sample', 1) +``` + +### `rollback_to_timestamp` + +Roll back a table to the snapshot that was current at some time. + +!!! info + This procedure invalidates all cached Spark plans that reference the affected table. + + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to update | +| `timestamp` | ✔️ | timestamp | A timestamp to rollback to | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `previous_snapshot_id` | long | The current snapshot ID before the rollback | +| `current_snapshot_id` | long | The new current snapshot ID | + +#### Example + +Roll back `db.sample` to a specific day and time. +```sql +CALL catalog_name.system.rollback_to_timestamp('db.sample', TIMESTAMP '2021-06-30 00:00:00.000') +``` + +### `set_current_snapshot` + +Sets the current snapshot ID for a table. + +Unlike rollback, the snapshot is not required to be an ancestor of the current table state. + +!!! 
info + This procedure invalidates all cached Spark plans that reference the affected table. + + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to update | +| `snapshot_id` | ✔️ | long | Snapshot ID to set as current | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `previous_snapshot_id` | long | The current snapshot ID before the rollback | +| `current_snapshot_id` | long | The new current snapshot ID | + +#### Example + +Set the current snapshot for `db.sample` to 1: +```sql +CALL catalog_name.system.set_current_snapshot('db.sample', 1) +``` + +### `cherrypick_snapshot` + +Cherry-picks changes from a snapshot into the current table state. + +Cherry-picking creates a new snapshot from an existing snapshot without altering or removing the original. + +Only append and dynamic overwrite snapshots can be cherry-picked. + +!!! info + This procedure invalidates all cached Spark plans that reference the affected table. + + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to update | +| `snapshot_id` | ✔️ | long | The snapshot ID to cherry-pick | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `source_snapshot_id` | long | The table's current snapshot before the cherry-pick | +| `current_snapshot_id` | long | The snapshot ID created by applying the cherry-pick | + +#### Examples + +Cherry-pick snapshot 1 +```sql +CALL catalog_name.system.cherrypick_snapshot('my_table', 1) +``` + +Cherry-pick snapshot 1 with named args +```sql +CALL catalog_name.system.cherrypick_snapshot(snapshot_id => 1, table => 'my_table' ) +``` + + +## Metadata management + +Many [maintenance actions](maintenance.md) can be performed using Iceberg stored procedures. 
+ +### `expire_snapshots` + +Each write/update/delete/upsert/compaction in Iceberg produces a new snapshot while keeping the old data and metadata +around for snapshot isolation and time travel. The `expire_snapshots` procedure can be used to remove older snapshots +and their files which are no longer needed. + +This procedure will remove old snapshots and data files which are uniquely required by those old snapshots. This means +the `expire_snapshots` procedure will never remove files which are still required by a non-expired snapshot. + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to update | +| `older_than` | ️ | timestamp | Timestamp before which snapshots will be removed (Default: 5 days ago) | +| `retain_last` | | int | Number of ancestor snapshots to preserve regardless of `older_than` (defaults to 1) | +| `max_concurrent_deletes` | | int | Size of the thread pool used for delete file actions (by default, no thread pool is used) | +| `stream_results` | | boolean | When true, deletion files will be sent to Spark driver by RDD partition (by default, all the files will be sent to Spark driver). This option is recommended to set to `true` to prevent Spark driver OOM from large file size | +| `snapshot_ids` | | array of long | Array of snapshot IDs to expire. | + +If `older_than` and `retain_last` are omitted, the table's [expiration properties](configuration.md#table-behavior-properties) will be used. 
+ +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `deleted_data_files_count` | long | Number of data files deleted by this operation | +| `deleted_position_delete_files_count` | long | Number of position delete files deleted by this operation | +| `deleted_equality_delete_files_count` | long | Number of equality delete files deleted by this operation | +| `deleted_manifest_files_count` | long | Number of manifest files deleted by this operation | +| `deleted_manifest_lists_count` | long | Number of manifest List files deleted by this operation | + +#### Examples + +Remove snapshots older than specific day and time, but retain the last 100 snapshots: + +```sql +CALL hive_prod.system.expire_snapshots('db.sample', TIMESTAMP '2021-06-30 00:00:00.000', 100) +``` + +Remove snapshots with snapshot ID `123` (note that this snapshot ID should not be the current snapshot): + +```sql +CALL hive_prod.system.expire_snapshots(table => 'db.sample', snapshot_ids => ARRAY(123)) +``` + +### `remove_orphan_files` + +Used to remove files which are not referenced in any metadata files of an Iceberg table and can thus be considered "orphaned". + +#### Usage + +| Argument Name | Required? 
| Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to clean | +| `older_than` | ️ | timestamp | Remove orphan files created before this timestamp (Defaults to 3 days ago) | +| `location` | | string | Directory to look for files in (defaults to the table's location) | +| `dry_run` | | boolean | When true, don't actually remove files (defaults to false) | +| `max_concurrent_deletes` | | int | Size of the thread pool used for delete file actions (by default, no thread pool is used) | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `orphan_file_location` | String | The path to each file determined to be an orphan by this command | + +#### Examples + +List all the files that are candidates for removal by performing a dry run of the `remove_orphan_files` command on this table without actually removing them: +```sql +CALL catalog_name.system.remove_orphan_files(table => 'db.sample', dry_run => true) +``` + +Remove any files in the `tablelocation/data` folder which are not known to the table `db.sample`. +```sql +CALL catalog_name.system.remove_orphan_files(table => 'db.sample', location => 'tablelocation/data') +``` + +### `rewrite_data_files` + +Iceberg tracks each data file in a table. More data files leads to more metadata stored in manifest files, and small data files causes an unnecessary amount of metadata and less efficient queries from file open costs. + +Iceberg can compact data files in parallel using Spark with the `rewriteDataFiles` action. This will combine small files into larger files to reduce metadata overhead and runtime file open cost. + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to update | +| `strategy` | | string | Name of the strategy - binpack or sort. 
Defaults to binpack strategy | +| `sort_order` | | string | For Zorder use a comma separated list of columns within zorder(). (Supported in Spark 3.2 and Above) Example: zorder(c1,c2,c3).
Otherwise, use comma-separated sort orders in the format (ColumnName SortDirection NullOrder).
SortDirection can be ASC or DESC, and NullOrder can be NULLS FIRST or NULLS LAST.
Defaults to the table's sort order | +| `options` | ️ | map | Options to be used for actions| +| `where` | ️ | string | predicate as a string used for filtering the files. Note that all files that may contain data matching the filter will be selected for rewriting| + + +See the [`RewriteDataFiles` Javadoc](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/actions/RewriteDataFiles.html#field.summary), +
[`BinPackStrategy` Javadoc](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/actions/BinPackStrategy.html#field.summary) +and
[`SortStrategy` Javadoc](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/actions/SortStrategy.html#field.summary) +for a list of all the supported options for this action. + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `rewritten_data_files_count` | int | Number of data files which were rewritten by this command | +| `added_data_files_count` | int | Number of new data files which were written by this command | + +#### Examples + +Rewrite the data files in table `db.sample` using the default rewrite algorithm of bin-packing to combine small files +and also split large files according to the default write size of the table. +```sql +CALL catalog_name.system.rewrite_data_files('db.sample') +``` + +Rewrite the data files in table `db.sample` by sorting all the data on id and name +using the same defaults as bin-pack to determine which files to rewrite. +```sql +CALL catalog_name.system.rewrite_data_files(table => 'db.sample', strategy => 'sort', sort_order => 'id DESC NULLS LAST,name ASC NULLS FIRST') +``` + +Rewrite the data files in table `db.sample` by Z-ordering on columns c1 and c2, +using the same defaults as bin-pack to determine which files to rewrite. +```sql +CALL catalog_name.system.rewrite_data_files(table => 'db.sample', strategy => 'sort', sort_order => 'zorder(c1,c2)') +``` + +Rewrite the data files in table `db.sample` using bin-pack strategy in any partition where 2 or more files need to be rewritten. +```sql +CALL catalog_name.system.rewrite_data_files(table => 'db.sample', options => map('min-input-files','2')) +``` + +Rewrite the data files in table `db.sample` and select the files that may contain data matching the filter (id = 3 and name = "foo") to be rewritten. +```sql +CALL catalog_name.system.rewrite_data_files(table => 'db.sample', where => 'id = 3 and name = "foo"') +``` + +### `rewrite_manifests` + +Rewrite manifests for a table to optimize scan planning. 
+ +Data files in manifests are sorted by fields in the partition spec. This procedure runs in parallel using a Spark job. + +See the [`RewriteManifests` Javadoc](../../javadoc/{{ icebergVersion }}/org/apache/iceberg/actions/RewriteManifests.html) +for more configuration options. + +!!! info + This procedure invalidates all cached Spark plans that reference the affected table. + + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to update | +| `use_caching` | ️ | boolean | Use Spark caching during operation (defaults to true) | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `rewritten_manifests_count` | int | Number of manifests which were re-written by this command | +| `added_manifests_count` | int | Number of new manifest files which were written by this command | + +#### Examples + +Rewrite the manifests in table `db.sample` and align manifest files with table partitioning. +```sql +CALL catalog_name.system.rewrite_manifests('db.sample') +``` + +Rewrite the manifests in table `db.sample` and disable the use of Spark caching. This could be done to avoid memory issues on executors. +```sql +CALL catalog_name.system.rewrite_manifests('db.sample', false) +``` + +## Table migration + +The `snapshot` and `migrate` procedures help test and migrate existing Hive or Spark tables to Iceberg. + +### `snapshot` + +Create a light-weight temporary copy of a table for testing, without changing the source table. + +The newly created table can be changed or written to without affecting the source table, but the snapshot uses the original table's data files. + +When inserts or overwrites run on the snapshot, new files are placed in the snapshot table's location rather than the original table location. + +When finished testing a snapshot table, clean it up by running `DROP TABLE`. + +!!! 
info + Because tables created by `snapshot` are not the sole owners of their data files, they are prohibited from + actions like `expire_snapshots` which would physically delete data files. Iceberg deletes, which only affect metadata, + are still allowed. In addition, any operations which affect the original data files will disrupt the Snapshot's + integrity. DELETE statements executed against the original Hive table will remove original data files and the + `snapshot` table will no longer be able to access them. + + +See [`migrate`](#migrate) to replace an existing table with an Iceberg table. + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `source_table`| ✔️ | string | Name of the table to snapshot | +| `table` | ✔️ | string | Name of the new Iceberg table to create | +| `location` | | string | Table location for the new table (delegated to the catalog by default) | +| `properties` | ️ | map | Properties to add to the newly created table | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `imported_files_count` | long | Number of files added to the new table | + +#### Examples + +Make an isolated Iceberg table which references table `db.sample` named `db.snap` at the +catalog's default location for `db.snap`. +```sql +CALL catalog_name.system.snapshot('db.sample', 'db.snap') +``` + +Make an isolated Iceberg table which references table `db.sample` named `db.snap` at +a manually specified location `/tmp/temptable/`. +```sql +CALL catalog_name.system.snapshot('db.sample', 'db.snap', '/tmp/temptable/') +``` + +### `migrate` + +Replace a table with an Iceberg table, loaded with the source's data files. + +Table schema, partitioning, properties, and location will be copied from the source table. + +Migrate will fail if any table partition uses an unsupported format. Supported formats are Avro, Parquet, and ORC. 
+Existing data files are added to the Iceberg table's metadata and can be read using a name-to-id mapping created from the original table schema. + +To leave the original table intact while testing, use [`snapshot`](#snapshot) to create a new temporary table that shares source data files and schema. + +By default, the original table is retained with the name `table_BACKUP_`. + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to migrate | +| `properties` | ️ | map | Properties for the new Iceberg table | +| `drop_backup` | | boolean | When true, the original table will not be retained as backup (defaults to false) | + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `migrated_files_count` | long | Number of files appended to the Iceberg table | + +#### Examples + +Migrate the table `db.sample` in Spark's default catalog to an Iceberg table and add a property 'foo' set to 'bar': + +```sql +CALL catalog_name.system.migrate('spark_catalog.db.sample', map('foo', 'bar')) +``` + +Migrate `db.sample` in the current catalog to an Iceberg table without adding any additional properties: +```sql +CALL catalog_name.system.migrate('db.sample') +``` + +### `add_files` + +Attempts to directly add files from a Hive or file-based table into a given Iceberg table. Unlike `migrate` or +`snapshot`, `add_files` can import files from a specific partition or partitions and does not create a new Iceberg table. +This command will create metadata for the new files and will not move them. This procedure will not analyze the schema +of the files to determine if they actually match the schema of the Iceberg table. Upon completion, the Iceberg table +will then treat these files as if they are part of the set of files owned by Iceberg. This means any subsequent +`expire_snapshots` calls will be able to physically delete the added files. 
This method should not be used if +`migrate` or `snapshot` are possible. + +#### Usage + +| Argument Name | Required? | Type | Description | +|-------------------------|-----------|---------------------|-----------------------------------------------------------------------------------------------------| +| `table` | ✔️ | string | Table which will have files added to | +| `source_table` | ✔️ | string | Table where files should come from, paths are also possible in the form of \`file_format\`.\`path\` | +| `partition_filter` | ️ | map | A map of partitions in the source table to import from | +| `check_duplicate_files` | ️ | boolean | Whether to prevent files existing in the table from being added (defaults to true) | + +Warning: Schema is not validated; adding files with a different schema to the Iceberg table will cause issues. + +Warning: Files added by this method can be physically deleted by Iceberg operations. + +#### Output + +| Output Name | Type | Description | +|---------------------------|------|---------------------------------------------------| +| `added_files_count` | long | The number of files added by this command | +| `changed_partition_count` | long | The number of partitions changed by this command | + +!!! warning + changed_partition_count will be 0 when table property `compatibility.snapshot-id-inheritance.enabled` is set to true + +#### Examples + +Add the files from table `db.src_tbl`, a Hive or Spark table registered in the session Catalog, to Iceberg table +`db.tbl`. Only add files that exist within partitions where `part_col_1` is equal to `A`. +```sql +CALL spark_catalog.system.add_files( +table => 'db.tbl', +source_table => 'db.src_tbl', +partition_filter => map('part_col_1', 'A') +) +``` + +Add files from a `parquet` file-based table at location `path/to/table` to the Iceberg table `db.tbl`. Add all +files regardless of what partition they belong to. 
+```sql +CALL spark_catalog.system.add_files( + table => 'db.tbl', + source_table => '`parquet`.`path/to/table`' +) +``` + +### `register_table` + +Creates a catalog entry for a metadata.json file which already exists but does not have a corresponding catalog identifier. + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Table which is to be registered | +| `metadata_file`| ✔️ | string | Metadata file which is to be registered as a new catalog identifier | + +!!! warning + Having the same metadata.json registered in more than one catalog can lead to missing updates, loss of data, and table corruption. + Only use this procedure when the table is no longer registered in an existing catalog, or you are moving a table between catalogs. + + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `current_snapshot_id` | long | The current snapshot ID of the newly registered Iceberg table | +| `total_records_count` | long | Total records count of the newly registered Iceberg table | +| `total_data_files_count` | long | Total data files count of the newly registered Iceberg table | + +#### Examples + +Register a new table as `db.tbl` to `spark_catalog` pointing to metadata.json file `path/to/metadata/file.json`. +```sql +CALL spark_catalog.system.register_table( + table => 'db.tbl', + metadata_file => 'path/to/metadata/file.json' +) +``` + +## Metadata information + +### `ancestors_of` + +Report the live snapshot IDs of parents of a specified snapshot + +#### Usage + +| Argument Name | Required? 
| Type | Description | +|---------------|-----------|------|-------------| +| `table` | ✔️ | string | Name of the table to report live snapshot IDs | +| `snapshot_id` | ️ | long | Use a specified snapshot to get the live snapshot IDs of parents | + +> tip : Using snapshot_id +> +> Given snapshots history with roll back to B and addition of C' -> D' +> ```shell +> A -> B - > C -> D +> \ -> C' -> (D') +> ``` +> Not specifying the snapshot ID would return A -> B -> C' -> D', while providing the snapshot ID of +> D as an argument would return A-> B -> C -> D + +#### Output + +| Output Name | Type | Description | +| ------------|------|-------------| +| `snapshot_id` | long | the ancestor snapshot id | +| `timestamp` | long | snapshot creation time | + +#### Examples + +Get all the snapshot ancestors of current snapshots(default) +```sql +CALL spark_catalog.system.ancestors_of('db.tbl') +``` + +Get all the snapshot ancestors by a particular snapshot +```sql +CALL spark_catalog.system.ancestors_of('db.tbl', 1) +CALL spark_catalog.system.ancestors_of(snapshot_id => 1, table => 'db.tbl') +``` + +## Change Data Capture + +### `create_changelog_view` + +Creates a view that contains the changes from a given table. + +#### Usage + +| Argument Name | Required? | Type | Description | +|---------------|----------|------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| `table` | ✔️ | string | Name of the source table for the changelog | +| `changelog_view` | | string | Name of the view to create | +| `options` | | map | A map of Spark read options to use | +|`compute_updates`| | boolean | Whether to compute pre/post update images (see below for more information). Defaults to false. | +|`identifier_columns`| | array | The list of identifier columns to compute updates. 
If the argument `compute_updates` is set to true and `identifier_columns` are not provided, the table’s current identifier fields will be used to compute updates. | +|`remove_carryovers`| | boolean | Whether to remove carry-over rows (see below for more information). Defaults to true. | + +Here is a list of commonly used Spark read options: +* `start-snapshot-id`: the exclusive start snapshot ID. If not provided, it reads from the table’s first snapshot inclusively. +* `end-snapshot-id`: the inclusive end snapshot id, default to table's current snapshot. +* `start-timestamp`: the exclusive start timestamp. If not provided, it reads from the table’s first snapshot inclusively. +* `end-timestamp`: the inclusive end timestamp, default to table's current snapshot. + +#### Output +| Output Name | Type | Description | +| ------------|------|----------------------------------------| +| `changelog_view` | string | The name of the created changelog view | + +#### Examples + +Create a changelog view `tbl_changes` based on the changes that happened between snapshot `1` (exclusive) and `2` (inclusive). +```sql +CALL spark_catalog.system.create_changelog_view( + table => 'db.tbl', + options => map('start-snapshot-id','1','end-snapshot-id', '2') +) +``` + +Create a changelog view `my_changelog_view` based on the changes that happened between timestamp `1678335750489` (exclusive) and `1678992105265` (inclusive). +```sql +CALL spark_catalog.system.create_changelog_view( + table => 'db.tbl', + options => map('start-timestamp','1678335750489','end-timestamp', '1678992105265'), + changelog_view => 'my_changelog_view' +) +``` + +Create a changelog view that computes updates based on the identifier columns `id` and `name`. 
+```sql +CALL spark_catalog.system.create_changelog_view( + table => 'db.tbl', + options => map('start-snapshot-id','1','end-snapshot-id', '2'), + identifier_columns => array('id', 'name') +) +``` + +Once the changelog view is created, you can query the view to see the changes that happened between the snapshots. +```sql +SELECT * FROM tbl_changes +``` +```sql +SELECT * FROM tbl_changes where _change_type = 'INSERT' AND id = 3 ORDER BY _change_ordinal +``` +Please note that the changelog view includes Change Data Capture (CDC) metadata columns +that provide additional information about the changes being tracked. These columns are: +- `_change_type`: the type of change. It has one of the following values: `INSERT`, `DELETE`, `UPDATE_BEFORE`, or `UPDATE_AFTER`. +- `_change_ordinal`: the order of changes +- `_commit_snapshot_id`: the snapshot ID where the change occurred + +Here is an example of corresponding results. It shows that the first snapshot inserted 2 records, and the +second snapshot deleted 1 record. + +| id | name |_change_type | _change_ordinal | _commit_snapshot_id | +|---|--------|---|---|---| +|1 | Alice |INSERT |0 |5390529835796506035| +|2 | Bob |INSERT |0 |5390529835796506035| +|1 | Alice |DELETE |1 |8764748981452218370| + +#### Carry-over Rows + +The procedure removes the carry-over rows by default. Carry-over rows are the result of row-level operations (`MERGE`, `UPDATE` and `DELETE`) +when using copy-on-write. For example, given a file which contains row1 `(id=1, name='Alice')` and row2 `(id=2, name='Bob')`, +a copy-on-write delete of row2 would require erasing this file and preserving row1 in a new file. The changelog table +reports this as the following pair of rows, despite it not being an actual change to the table. + +| id | name | _change_type | +|-----|-------|--------------| +| 1 | Alice | DELETE | +| 1 | Alice | INSERT | + +By default, this view finds the carry-over rows and removes them from the result. 
Users can disable this +behavior by setting the `remove_carryovers` option to `false`. + +#### Pre/Post Update Images + +The procedure computes the pre/post update images if configured. Pre/post update images are converted from a +pair of a delete row and an insert row. Identifier columns are used for determining whether an insert and a delete record +refer to the same row. If the two records share the same values for the identifier columns, they are considered to be before +and after states of the same row. You can either set identifier fields in the table schema or input them as the procedure parameters. + +The following example shows pre/post update images computation with an identifier column (`id`), where a row deletion +and an insertion with the same `id` are treated as a single update operation. Specifically, suppose we have the following pair of rows: + +| id | name | _change_type | +|-----|--------|--------------| +| 3 | Robert | DELETE | +| 3 | Dan | INSERT | + +In this case, the procedure marks the row before the update as an `UPDATE_BEFORE` image and the row after the update +as an `UPDATE_AFTER` image, resulting in the following pre/post update images: + +| id | name | _change_type | +|-----|--------|--------------| +| 3 | Robert | UPDATE_BEFORE| +| 3 | Dan | UPDATE_AFTER | diff --git a/docs-new/home/docs/latest/spark-queries.md b/docs-new/home/docs/latest/spark-queries.md new file mode 100644 index 000000000000..90a3573b6a76 --- /dev/null +++ b/docs-new/home/docs/latest/spark-queries.md @@ -0,0 +1,403 @@ +--- +title: "Queries" +--- + + +# Spark Queries + +To use Iceberg in Spark, first configure [Spark catalogs](spark-configuration.md). Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. + +## Querying with SQL + +In Spark 3, tables use identifiers that include a [catalog name](spark-configuration.md#using-catalogs). 
+ +```sql +SELECT * FROM prod.db.table; -- catalog: prod, namespace: db, table: table +``` + +Metadata tables, like `history` and `snapshots`, can use the Iceberg table name as a namespace. + +For example, to read from the `files` metadata table for `prod.db.table`: + +```sql +SELECT * FROM prod.db.table.files; +``` + +| content | file_path | file_format | spec_id | partition | record_count | file_size_in_bytes | column_sizes | value_counts | null_value_counts | nan_value_counts | lower_bounds | upper_bounds | key_metadata | split_offsets | equality_ids | sort_order_id | +| ------- | ----------------------------------------------------------------------- | ----------- | ------- | ---------------- | ------------ | ------------------ | ------------------ | ---------------- | ----------------- | ---------------- | --------------- | --------------- | ------------ | ------------- | ------------ | ------------- | +| 0 | s3:/.../table/data/00000-3-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 01} | 1 | 597 | [1 -> 90, 2 -> 62] | [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> c] | [1 -> , 2 -> c] | null | [4] | null | null | +| 0 | s3:/.../table/data/00001-4-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 02} | 1 | 597 | [1 -> 90, 2 -> 62] | [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> b] | [1 -> , 2 -> b] | null | [4] | null | null | +| 0 | s3:/.../table/data/00002-5-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 03} | 1 | 597 | [1 -> 90, 2 -> 62] | [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> a] | [1 -> , 2 -> a] | null | [4] | null | null | + +## Querying with DataFrames + +To load a table as a DataFrame, use `table`: + +```scala +val df = spark.table("prod.db.table") +``` + +### Catalogs with DataFrameReader + +Paths and table names can be loaded with Spark's `DataFrameReader` interface. How tables are loaded depends on how +the identifier is specified. 
When using `spark.read.format("iceberg").load(table)` or `spark.table(table)` the `table` +variable can take a number of forms as listed below: + +* `file:///path/to/table`: loads a HadoopTable at given path +* `tablename`: loads `currentCatalog.currentNamespace.tablename` +* `catalog.tablename`: loads `tablename` from the specified catalog. +* `namespace.tablename`: loads `namespace.tablename` from current catalog +* `catalog.namespace.tablename`: loads `namespace.tablename` from the specified catalog. +* `namespace1.namespace2.tablename`: loads `namespace1.namespace2.tablename` from current catalog + +The above list is in order of priority. For example: a matching catalog will take priority over any namespace resolution. + + +### Time travel + +#### SQL + +Spark 3.3 and later supports time travel in SQL queries using `TIMESTAMP AS OF` or `VERSION AS OF` clauses. +The `VERSION AS OF` clause can contain a long snapshot ID or a string branch or tag name. + +!!! info + Note: If the name of a branch or tag is the same as a snapshot ID, then the snapshot which is selected for time travel is the snapshot + with the given snapshot ID. For example, consider the case where there is a tag named '1' and it references snapshot with ID 2. + If the version travel clause is `VERSION AS OF '1'`, time travel will be done to the snapshot with ID 1. + If this is not desired, rename the tag or branch with a well-defined prefix such as 'snapshot-1'. 
+ + +```sql +-- time travel to October 26, 1986 at 01:21:00 +SELECT * FROM prod.db.table TIMESTAMP AS OF '1986-10-26 01:21:00'; + +-- time travel to snapshot with id 10963874102873L +SELECT * FROM prod.db.table VERSION AS OF 10963874102873; + +-- time travel to the head snapshot of audit-branch +SELECT * FROM prod.db.table VERSION AS OF 'audit-branch'; + +-- time travel to the snapshot referenced by the tag historical-snapshot +SELECT * FROM prod.db.table VERSION AS OF 'historical-snapshot'; +``` + +In addition, `FOR SYSTEM_TIME AS OF` and `FOR SYSTEM_VERSION AS OF` clauses are also supported: + +```sql +SELECT * FROM prod.db.table FOR SYSTEM_TIME AS OF '1986-10-26 01:21:00'; +SELECT * FROM prod.db.table FOR SYSTEM_VERSION AS OF 10963874102873; +SELECT * FROM prod.db.table FOR SYSTEM_VERSION AS OF 'audit-branch'; +SELECT * FROM prod.db.table FOR SYSTEM_VERSION AS OF 'historical-snapshot'; +``` + +Timestamps may also be supplied as a Unix timestamp, in seconds: + +```sql +-- timestamp in seconds +SELECT * FROM prod.db.table TIMESTAMP AS OF 499162860; +SELECT * FROM prod.db.table FOR SYSTEM_TIME AS OF 499162860; +``` + +#### DataFrame + +To select a specific table snapshot or the snapshot at some time in the DataFrame API, Iceberg supports four Spark read options: + +* `snapshot-id` selects a specific table snapshot +* `as-of-timestamp` selects the current snapshot at a timestamp, in milliseconds +* `branch` selects the head snapshot of the specified branch. Note that currently branch cannot be combined with as-of-timestamp. +* `tag` selects the snapshot associated with the specified tag. Tags cannot be combined with `as-of-timestamp`. 
+ +```scala +// time travel to October 26, 1986 at 01:21:00 +spark.read + .option("as-of-timestamp", "499162860000") + .format("iceberg") + .load("path/to/table") +``` + +```scala +// time travel to snapshot with ID 10963874102873L +spark.read + .option("snapshot-id", 10963874102873L) + .format("iceberg") + .load("path/to/table") +``` + +```scala +// time travel to tag historical-snapshot +spark.read + .option(SparkReadOptions.TAG, "historical-snapshot") + .format("iceberg") + .load("path/to/table") +``` + +```scala +// time travel to the head snapshot of audit-branch +spark.read + .option(SparkReadOptions.BRANCH, "audit-branch") + .format("iceberg") + .load("path/to/table") +``` + +!!! info + Spark 3.0 and earlier versions do not support using `option` with `table` in DataFrameReader commands. All options will be silently + ignored. Do not use `table` when attempting to time-travel or use other options. See [SPARK-32592](https://issues.apache.org/jira/browse/SPARK-32592). + + + +### Incremental read + +To read appended data incrementally, use: + +* `start-snapshot-id` Start snapshot ID used in incremental scans (exclusive). +* `end-snapshot-id` End snapshot ID used in incremental scans (inclusive). This is optional. Omitting it will default to the current snapshot. + +```scala +// get the data added after start-snapshot-id (10963874102873L) until end-snapshot-id (63874143573109L) +spark.read() + .format("iceberg") + .option("start-snapshot-id", "10963874102873") + .option("end-snapshot-id", "63874143573109") + .load("path/to/table") +``` + +!!! info + Currently gets only the data from `append` operation. Cannot support `replace`, `overwrite`, `delete` operations. + Incremental read works with both V1 and V2 format-version. + Incremental read is not supported by Spark's SQL syntax. + + +## Inspecting tables + +To inspect a table's history, snapshots, and other metadata, Iceberg supports metadata tables. 
+ +Metadata tables are identified by adding the metadata table name after the original table name. For example, history for `db.table` is read using `db.table.history`. + +!!! info + For Spark 3, prior to 3.2, the Spark [session catalog](spark-configuration.md#replacing-the-session-catalog) does not support table names with multipart identifiers such as `catalog.database.table.metadata`. As a workaround, configure an `org.apache.iceberg.spark.SparkCatalog`, or use the Spark `DataFrameReader` API. + + +### History + +To show table history: + +```sql +SELECT * FROM prod.db.table.history; +``` + +| made_current_at | snapshot_id | parent_id | is_current_ancestor | +| -- | -- | -- | -- | +| 2019-02-08 03:29:51.215 | 5781947118336215154 | NULL | true | +| 2019-02-08 03:47:55.948 | 5179299526185056830 | 5781947118336215154 | true | +| 2019-02-09 16:24:30.13 | 296410040247533544 | 5179299526185056830 | false | +| 2019-02-09 16:32:47.336 | 2999875608062437330 | 5179299526185056830 | true | +| 2019-02-09 19:42:03.919 | 8924558786060583479 | 2999875608062437330 | true | +| 2019-02-09 19:49:16.343 | 6536733823181975045 | 8924558786060583479 | true | + +!!! info + **This shows a commit that was rolled back.** The example has two snapshots with the same parent, and one is *not* an ancestor of the current table state. 
+ + +### Metadata Log Entries + +To show table metadata log entries: + +```sql +SELECT * from prod.db.table.metadata_log_entries; +``` + +| timestamp | file | latest_snapshot_id | latest_schema_id | latest_sequence_number | +| -- | -- | -- | -- | -- | +| 2022-07-28 10:43:52.93 | s3://.../table/metadata/00000-9441e604-b3c2-498a-a45a-6320e8ab9006.metadata.json | null | null | null | +| 2022-07-28 10:43:57.487 | s3://.../table/metadata/00001-f30823df-b745-4a0a-b293-7532e0c99986.metadata.json | 170260833677645300 | 0 | 1 | +| 2022-07-28 10:43:58.25 | s3://.../table/metadata/00002-2cc2837a-02dc-4687-acc1-b4d86ea486f4.metadata.json | 958906493976709774 | 0 | 2 | + +### Snapshots + +To show the valid snapshots for a table: + +```sql +SELECT * FROM prod.db.table.snapshots; +``` + +| committed_at | snapshot_id | parent_id | operation | manifest_list | summary | +| -- | -- | -- | -- | -- | -- | +| 2019-02-08 03:29:51.215 | 57897183625154 | null | append | s3://.../table/metadata/snap-57897183625154-1.avro | { added-records -> 2478404, total-records -> 2478404, added-data-files -> 438, total-data-files -> 438, spark.app.id -> application_1520379288616_155055 } | + +You can also join snapshots to table history. 
For example, this query will show table history, with the application ID that wrote each snapshot: + +```sql +select + h.made_current_at, + s.operation, + h.snapshot_id, + h.is_current_ancestor, + s.summary['spark.app.id'] +from prod.db.table.history h +join prod.db.table.snapshots s + on h.snapshot_id = s.snapshot_id +order by made_current_at +``` + +| made_current_at | operation | snapshot_id | is_current_ancestor | summary[spark.app.id] | +| -- | -- | -- | -- | -- | +| 2019-02-08 03:29:51.215 | append | 57897183625154 | true | application_1520379288616_155055 | +| 2019-02-09 16:24:30.13 | delete | 29641004024753 | false | application_1520379288616_151109 | +| 2019-02-09 16:32:47.336 | append | 57897183625154 | true | application_1520379288616_155055 | +| 2019-02-08 03:47:55.948 | overwrite | 51792995261850 | true | application_1520379288616_152431 | + +### Files + +To show a table's current data files: + +```sql +SELECT * FROM prod.db.table.files; +``` + +|content|file_path |file_format|spec_id|partition|record_count|file_size_in_bytes|column_sizes |value_counts |null_value_counts|nan_value_counts|lower_bounds |upper_bounds |key_metadata|split_offsets|equality_ids|sort_order_id| +| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | +| 0 | s3:/.../table/data/00000-3-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 01} | 1 | 597 | [1 -> 90, 2 -> 62] | [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> c] | [1 -> , 2 -> c] | null | [4] | null | null | +| 0 | s3:/.../table/data/00001-4-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 02} | 1 | 597 | [1 -> 90, 2 -> 62] | [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> b] | [1 -> , 2 -> b] | null | [4] | null | null | +| 0 | s3:/.../table/data/00002-5-8d6d60e8-d427-4809-bcf0-f5d45a4aad96.parquet | PARQUET | 0 | {1999-01-01, 03} | 1 | 597 | [1 -> 90, 2 -> 62] | [1 -> 1, 2 -> 1] | [1 -> 0, 2 -> 0] | [] | [1 -> , 2 -> a] | [1 
-> , 2 -> a] | null | [4] | null | null | + +### Manifests + +To show a table's current file manifests: + +```sql +SELECT * FROM prod.db.table.manifests; +``` + +| path | length | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count | partition_summaries | +| -- | -- | -- | -- | -- | -- | -- | -- | +| s3://.../table/metadata/45b5290b-ee61-4788-b324-b1e2735c0e10-m0.avro | 4479 | 0 | 6668963634911763636 | 8 | 0 | 0 | [[false,null,2019-05-13,2019-05-15]] | + +Note: +1. Fields within `partition_summaries` column of the manifests table correspond to `field_summary` structs within [manifest list](../../spec.md#manifest-lists), with the following order: + - `contains_null` + - `contains_nan` + - `lower_bound` + - `upper_bound` +2. `contains_nan` could return null, which indicates that this information is not available from the file's metadata. + This usually occurs when reading from V1 table, where `contains_nan` is not populated. + +### Partitions + +To show a table's current partitions: + +```sql +SELECT * FROM prod.db.table.partitions; +``` + +| partition | record_count | file_count | spec_id | +| -- | -- | -- | -- | +| {20211001, 11}| 1| 1| 0| +| {20211002, 11}| 1| 1| 0| +| {20211001, 10}| 1| 1| 0| +| {20211002, 10}| 1| 1| 0| + +Note: +1. For unpartitioned tables, the partitions table will contain only the record_count and file_count columns. + +2. The partitions metadata table shows partitions with data files or delete files in the current snapshot. However, delete files are not applied, and so in some cases partitions may be shown even though all their data rows are marked deleted by delete files. + +### All Metadata Tables + +These tables are unions of the metadata tables specific to the current snapshot, and return metadata across all snapshots. + +!!! 
danger + The "all" metadata tables may produce more than one row per data file or manifest file because metadata files may be part of more than one table snapshot. + + +#### All Data Files + +To show all of the table's data files and each file's metadata: + +```sql +SELECT * FROM prod.db.table.all_data_files; +``` + +| content | file_path | file_format | partition | record_count | file_size_in_bytes | column_sizes| value_counts | null_value_counts | nan_value_counts| lower_bounds| upper_bounds|key_metadata|split_offsets|equality_ids|sort_order_id| +| -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | -- | +| 0|s3://.../dt=20210102/00000-0-756e2512-49ae-45bb-aae3-c0ca475e7879-00001.parquet| PARQUET|{20210102}| 14| 2444|{1 -> 94, 2 -> 17}|{1 -> 14, 2 -> 14}| {1 -> 0, 2 -> 0}| {}|{1 -> 1, 2 -> 20210102}|{1 -> 2, 2 -> 20210102}| null| [4]| null| 0| +| 0|s3://.../dt=20210103/00000-0-26222098-032f-472b-8ea5-651a55b21210-00001.parquet| PARQUET|{20210103}| 14| 2444|{1 -> 94, 2 -> 17}|{1 -> 14, 2 -> 14}| {1 -> 0, 2 -> 0}| {}|{1 -> 1, 2 -> 20210103}|{1 -> 3, 2 -> 20210103}| null| [4]| null| 0| +| 0|s3://.../dt=20210104/00000-0-a3bb1927-88eb-4f1c-bc6e-19076b0d952e-00001.parquet| PARQUET|{20210104}| 14| 2444|{1 -> 94, 2 -> 17}|{1 -> 14, 2 -> 14}| {1 -> 0, 2 -> 0}| {}|{1 -> 1, 2 -> 20210104}|{1 -> 3, 2 -> 20210104}| null| [4]| null| 0| + +#### All Manifests + +To show all of the table's manifest files: + +```sql +SELECT * FROM prod.db.table.all_manifests; +``` + +| path | length | partition_spec_id | added_snapshot_id | added_data_files_count | existing_data_files_count | deleted_data_files_count| partition_summaries| +| -- | -- | -- | -- | -- | -- | -- | -- | +| s3://.../metadata/a85f78c5-3222-4b37-b7e4-faf944425d48-m0.avro | 6376 | 0 | 6272782676904868561 | 2 | 0 | 0 |[{false, false, 20210101, 20210101}]| + +Note: +1. 
Fields within `partition_summaries` column of the manifests table correspond to `field_summary` structs within [manifest list](../../spec.md#manifest-lists), with the following order: + - `contains_null` + - `contains_nan` + - `lower_bound` + - `upper_bound` +2. `contains_nan` could return null, which indicates that this information is not available from the file's metadata. + This usually occurs when reading from a V1 table, where `contains_nan` is not populated. + +### References + +To show a table's known snapshot references: + +```sql +SELECT * FROM prod.db.table.refs; +``` + +| name | type | snapshot_id | max_reference_age_in_ms | min_snapshots_to_keep | max_snapshot_age_in_ms | +| -- | -- | -- | -- | -- | -- | +| main | BRANCH | 4686954189838128572 | 10 | 20 | 30 | +| testTag | TAG | 4686954189838128572 | 10 | null | null | + +### Inspecting with DataFrames + +Metadata tables can be loaded using the DataFrameReader API: + +```scala +// named metastore table +spark.read.format("iceberg").load("db.table.files") +// Hadoop path table +spark.read.format("iceberg").load("hdfs://nn:8020/path/to/table#files") +``` + +### Time Travel with Metadata Tables + +To inspect a table's metadata with the time travel feature: + +```sql +-- get the table's file manifests at timestamp Sep 20, 2021 08:00:00 +SELECT * FROM prod.db.table.manifests TIMESTAMP AS OF '2021-09-20 08:00:00'; + +-- get the table's partitions with snapshot id 10963874102873L +SELECT * FROM prod.db.table.partitions VERSION AS OF 10963874102873; +``` + +Metadata tables can also be inspected with time travel using the DataFrameReader API: + +```scala +// load the table's file metadata at snapshot-id 10963874102873 as DataFrame +spark.read.format("iceberg").option("snapshot-id", 10963874102873L).load("db.table.files") +``` diff --git a/docs-new/home/docs/latest/spark-structured-streaming.md b/docs-new/home/docs/latest/spark-structured-streaming.md new file mode 100644 index 000000000000..b3858972cbc3 --- 
/dev/null +++ b/docs-new/home/docs/latest/spark-structured-streaming.md @@ -0,0 +1,121 @@ +--- +title: "Structured Streaming" +--- + + +# Spark Structured Streaming + +Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. Spark DSv2 is an evolving API +with different levels of support in Spark versions. + +As of Spark 3, DataFrame reads and writes are supported. + +## Streaming Reads + +Iceberg supports processing incremental data in Spark Structured Streaming jobs, starting from a historical timestamp: + +```scala +val df = spark.readStream + .format("iceberg") + .option("stream-from-timestamp", Long.toString(streamStartTimestamp)) + .load("database.table_name") +``` + +!!! warning + Iceberg only supports reading data from append snapshots. Overwrite snapshots cannot be processed and will cause an exception by default. Overwrites may be ignored by setting `streaming-skip-overwrite-snapshots=true`. Similarly, delete snapshots will cause an exception by default, and deletes may be ignored by setting `streaming-skip-delete-snapshots=true`. + +## Streaming Writes + +To write values from a streaming query to an Iceberg table, use `DataStreamWriter`: + +```scala +val tableIdentifier: String = ... +data.writeStream + .format("iceberg") + .outputMode("append") + .trigger(Trigger.ProcessingTime(1, TimeUnit.MINUTES)) + .option("path", tableIdentifier) + .option("checkpointLocation", checkpointPath) + .start() +``` + +The `tableIdentifier` can be: + +* The fully-qualified path to an HDFS table, like `hdfs://nn:8020/path/to/table` +* A table name if the table is tracked by a catalog, like `database.table_name` + +Iceberg doesn't support "continuous processing", as it doesn't provide the interface to "commit" the output. 
+ +Iceberg supports `append` and `complete` output modes: + +* `append`: appends the rows of every micro-batch to the table +* `complete`: replaces the table contents every micro-batch + +The table should be created prior to starting the streaming query. Refer to [SQL create table](spark-ddl.md#create-table) +on the Spark page to see how to create the Iceberg table. + +### Writing against partitioned table + +Iceberg requires the data to be sorted according to the partition spec per task (Spark partition) prior to writing +against a partitioned table. For batch queries you're encouraged to do an explicit sort to fulfill the requirement +(see [here](spark-writes.md#writing-distribution-modes)), but the approach would bring additional latency as +repartition and sort are considered heavy operations for streaming workloads. To avoid additional latency, you can +enable the fanout writer to eliminate the requirement. + +```scala +val tableIdentifier: String = ... +data.writeStream + .format("iceberg") + .outputMode("append") + .trigger(Trigger.ProcessingTime(1, TimeUnit.MINUTES)) + .option("path", tableIdentifier) + .option("fanout-enabled", "true") + .option("checkpointLocation", checkpointPath) + .start() +``` + +The fanout writer opens files per partition value and doesn't close these files until the write task is finished. +This functionality is discouraged for batch queries, as an explicit sort of the output rows isn't expensive for batch workloads. + +## Maintenance for streaming tables + +Streaming queries can create new table versions quickly, which creates lots of table metadata to track those versions. +Maintaining metadata by tuning the rate of commits, expiring old snapshots, and automatically cleaning up metadata files +is highly recommended. + +### Tune the rate of commits + +Having a high rate of commits produces lots of data files, manifests, and snapshots, which makes the table hard +to maintain. 
We encourage a trigger interval of at least 1 minute, increasing the interval if needed. + +The triggers section in the [Structured Streaming Programming Guide](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) +documents how to configure the interval. + +### Expire old snapshots + +Each micro-batch written to a table produces a new snapshot, which is tracked in table metadata until it is expired to remove the metadata and any data files that are no longer needed. Snapshots accumulate quickly with frequent commits, so it is highly recommended that tables written by streaming queries are [regularly maintained](maintenance.md#expire-snapshots). + +### Compacting data files + +The amount of data written in a micro-batch is typically small, which can cause the table metadata to track lots of small files. [Compacting small files into larger files](maintenance.md#compact-data-files) reduces the metadata needed by the table, and increases query efficiency. + +### Rewrite manifests + +To optimize write latency on streaming workloads, Iceberg may write the new snapshot with a "fast" append that does not automatically compact manifests. +This could lead to lots of small manifest files. Manifests can be [rewritten to optimize queries and to compact](maintenance.md#rewrite-manifests). + diff --git a/docs-new/home/docs/latest/spark-writes.md b/docs-new/home/docs/latest/spark-writes.md new file mode 100644 index 000000000000..52d7e23f644c --- /dev/null +++ b/docs-new/home/docs/latest/spark-writes.md @@ -0,0 +1,442 @@ +--- +title: "Writes" +--- + + +# Spark Writes + +To use Iceberg in Spark, first configure [Spark catalogs](spark-configuration.md). + +Some plans are only available when using [Iceberg SQL extensions](spark-configuration.md#sql-extensions) in Spark 3. + +Iceberg uses Apache Spark's DataSourceV2 API for data source and catalog implementations. 
Spark DSv2 is an evolving API with different levels of support in Spark versions: + +| Feature support | Spark 3 | Notes | +|--------------------------------------------------|-----------|----------------------------------------------| +| [SQL insert into](#insert-into) | ✔️ | | +| [SQL merge into](#merge-into) | ✔️ | ⚠ Requires Iceberg Spark extensions | +| [SQL insert overwrite](#insert-overwrite) | ✔️ | | +| [SQL delete from](#delete-from) | ✔️ | ⚠ Row-level delete requires Spark extensions | +| [SQL update](#update) | ✔️ | ⚠ Requires Iceberg Spark extensions | +| [DataFrame append](#appending-data) | ✔️ | | +| [DataFrame overwrite](#overwriting-data) | ✔️ | | +| [DataFrame CTAS and RTAS](#creating-tables) | ✔️ | | + + +## Writing with SQL + +Spark 3 supports SQL `INSERT INTO`, `MERGE INTO`, and `INSERT OVERWRITE`, as well as the new `DataFrameWriterV2` API. + +### `INSERT INTO` + +To append new data to a table, use `INSERT INTO`. + +```sql +INSERT INTO prod.db.table VALUES (1, 'a'), (2, 'b') +``` +```sql +INSERT INTO prod.db.table SELECT ... +``` + +### `MERGE INTO` + +Spark 3 added support for `MERGE INTO` queries that can express row-level updates. + +Iceberg supports `MERGE INTO` by rewriting data files that contain rows that need to be updated in an `overwrite` commit. + +**`MERGE INTO` is recommended instead of `INSERT OVERWRITE`** because Iceberg can replace only the affected data files, and because the data overwritten by a dynamic overwrite may change if the table's partitioning changes. + + +#### `MERGE INTO` syntax + +`MERGE INTO` updates a table, called the _target_ table, using a set of updates from another query, called the _source_. The update for a row in the target table is found using the `ON` clause that is like a join condition. + +```sql +MERGE INTO prod.db.target t -- a target table +USING (SELECT ...) s -- the source updates +ON t.id = s.id -- condition to find updates for target rows +WHEN ... 
-- updates +``` + +Updates to rows in the target table are listed using `WHEN MATCHED ... THEN ...`. Multiple `MATCHED` clauses can be added with conditions that determine when each match should be applied. The first matching expression is used. + +```sql +WHEN MATCHED AND s.op = 'delete' THEN DELETE +WHEN MATCHED AND t.count IS NULL AND s.op = 'increment' THEN UPDATE SET t.count = 0 +WHEN MATCHED AND s.op = 'increment' THEN UPDATE SET t.count = t.count + 1 +``` + +Source rows (updates) that do not match can be inserted: + +```sql +WHEN NOT MATCHED THEN INSERT * +``` + +Inserts also support additional conditions: + +```sql +WHEN NOT MATCHED AND s.event_time > still_valid_threshold THEN INSERT (id, count) VALUES (s.id, 1) +``` + +Only one record in the source data can update any given row of the target table, or else an error will be thrown. + + +### `INSERT OVERWRITE` + +`INSERT OVERWRITE` can replace data in the table with the result of a query. Overwrites are atomic operations for Iceberg tables. + +The partitions that will be replaced by `INSERT OVERWRITE` depend on Spark's partition overwrite mode and the partitioning of a table. `MERGE INTO` can rewrite only affected data files and has more easily understood behavior, so it is recommended instead of `INSERT OVERWRITE`. + +#### Overwrite behavior + +Spark's default overwrite mode is **static**, but **dynamic overwrite mode is recommended when writing to Iceberg tables.** Static overwrite mode determines which partitions to overwrite in a table by converting the `PARTITION` clause to a filter, but the `PARTITION` clause can only reference table columns. + +Dynamic overwrite mode is configured by setting `spark.sql.sources.partitionOverwriteMode=dynamic`. 
+ +To demonstrate the behavior of dynamic and static overwrites, consider a `logs` table defined by the following DDL: + +```sql +CREATE TABLE prod.my_app.logs ( + uuid string NOT NULL, + level string NOT NULL, + ts timestamp NOT NULL, + message string) +USING iceberg +PARTITIONED BY (level, hours(ts)) +``` + +#### Dynamic overwrite + +When Spark's overwrite mode is dynamic, partitions that have rows produced by the `SELECT` query will be replaced. + +For example, this query removes duplicate log events from the example `logs` table. + +```sql +INSERT OVERWRITE prod.my_app.logs +SELECT uuid, first(level), first(ts), first(message) +FROM prod.my_app.logs +WHERE cast(ts as date) = '2020-07-01' +GROUP BY uuid +``` + +In dynamic mode, this will replace any partition with rows in the `SELECT` result. Because the date of all rows is restricted to 1 July, only hours of that day will be replaced. + +#### Static overwrite + +When Spark's overwrite mode is static, the `PARTITION` clause is converted to a filter that is used to delete from the table. If the `PARTITION` clause is omitted, all partitions will be replaced. + +Because there is no `PARTITION` clause in the query above, it will drop all existing rows in the table when run in static mode, but will only write the logs from 1 July. + +To overwrite just the partitions that were loaded, add a `PARTITION` clause that aligns with the `SELECT` query filter: + +```sql +INSERT OVERWRITE prod.my_app.logs +PARTITION (level = 'INFO') +SELECT uuid, first(level), first(ts), first(message) +FROM prod.my_app.logs +WHERE level = 'INFO' +GROUP BY uuid +``` + +Note that this mode cannot replace hourly partitions like the dynamic example query because the `PARTITION` clause can only reference table columns, not hidden partitions. + +### `DELETE FROM` + +Spark 3 added support for `DELETE FROM` queries to remove data from tables. + +Delete queries accept a filter to match rows to delete. 
+ +```sql +DELETE FROM prod.db.table +WHERE ts >= '2020-05-01 00:00:00' and ts < '2020-06-01 00:00:00' + +DELETE FROM prod.db.all_events +WHERE session_time < (SELECT min(session_time) FROM prod.db.good_events) + +DELETE FROM prod.db.orders AS t1 +WHERE EXISTS (SELECT oid FROM prod.db.returned_orders WHERE t1.oid = oid) +``` + +If the delete filter matches entire partitions of the table, Iceberg will perform a metadata-only delete. If the filter matches individual rows of a table, then Iceberg will rewrite only the affected data files. + +### `UPDATE` + +Spark 3.1 added support for `UPDATE` queries that update matching rows in tables. + +Update queries accept a filter to match rows to update. + +```sql +UPDATE prod.db.table +SET c1 = 'update_c1', c2 = 'update_c2' +WHERE ts >= '2020-05-01 00:00:00' and ts < '2020-06-01 00:00:00' + +UPDATE prod.db.all_events +SET session_time = 0, ignored = true +WHERE session_time < (SELECT min(session_time) FROM prod.db.good_events) + +UPDATE prod.db.orders AS t1 +SET order_status = 'returned' +WHERE EXISTS (SELECT oid FROM prod.db.returned_orders WHERE t1.oid = oid) +``` + +For more complex row-level updates based on incoming data, see the section on `MERGE INTO`. + +## Writing to Branches +Branch writes can be performed via SQL by providing a branch identifier, `branch_yourBranch` in the operation. +Branch writes can also be performed as part of a write-audit-publish (WAP) workflow by specifying the `spark.wap.branch` config. +Note WAP branch and branch identifier cannot both be specified. +Also, the branch must exist before performing the write. +The operation does **not** create the branch if it does not exist. +For more information on branches please refer to [branches](branching.md). + +```sql +-- INSERT (1,' a') (2, 'b') into the audit branch. +INSERT INTO prod.db.table.branch_audit VALUES (1, 'a'), (2, 'b'); + +-- MERGE INTO audit branch +MERGE INTO prod.db.table.branch_audit t +USING (SELECT ...) 
s +ON t.id = s.id +WHEN ... + +-- UPDATE audit branch +UPDATE prod.db.table.branch_audit AS t1 +SET val = 'c' + +-- DELETE FROM audit branch +DELETE FROM prod.db.table.branch_audit WHERE id = 2; + +-- WAP Branch write +SET spark.wap.branch = audit-branch +INSERT INTO prod.db.table VALUES (3, 'c'); +``` + +## Writing with DataFrames + +Spark 3 introduced the new `DataFrameWriterV2` API for writing to tables using data frames. The v2 API is recommended for several reasons: + +* CTAS, RTAS, and overwrite by filter are supported +* All operations consistently write columns to a table by name +* Hidden partition expressions are supported in `partitionedBy` +* Overwrite behavior is explicit, either dynamic or by a user-supplied filter +* The behavior of each operation corresponds to SQL statements + - `df.writeTo(t).create()` is equivalent to `CREATE TABLE AS SELECT` + - `df.writeTo(t).replace()` is equivalent to `REPLACE TABLE AS SELECT` + - `df.writeTo(t).append()` is equivalent to `INSERT INTO` + - `df.writeTo(t).overwritePartitions()` is equivalent to dynamic `INSERT OVERWRITE` + +The v1 DataFrame `write` API is still supported, but is not recommended. + +!!! danger + When writing with the v1 DataFrame API in Spark 3, use `saveAsTable` or `insertInto` to load tables with a catalog. + Using `format("iceberg")` loads an isolated table reference that will not automatically refresh tables used by queries. + + + +### Appending data + +To append a dataframe to an Iceberg table, use `append`: + +```scala +val data: DataFrame = ... +data.writeTo("prod.db.table").append() +``` + +### Overwriting data + +To overwrite partitions dynamically, use `overwritePartitions()`: + +```scala +val data: DataFrame = ... 
+data.writeTo("prod.db.table").overwritePartitions() +``` + +To explicitly overwrite partitions, use `overwrite` to supply a filter: + +```scala +data.writeTo("prod.db.table").overwrite($"level" === "INFO") +``` + +### Creating tables + +To run a CTAS or RTAS, use `create`, `replace`, or `createOrReplace` operations: + +```scala +val data: DataFrame = ... +data.writeTo("prod.db.table").create() +``` + +If you have replaced the default Spark catalog (`spark_catalog`) with Iceberg's `SparkSessionCatalog`, do: + +```scala +val data: DataFrame = ... +data.writeTo("db.table").using("iceberg").create() +``` + +Create and replace operations support table configuration methods, like `partitionedBy` and `tableProperty`: + +```scala +data.writeTo("prod.db.table") + .tableProperty("write.format.default", "orc") + .partitionedBy($"level", days($"ts")) + .createOrReplace() +``` + +The Iceberg table location can also be specified by the `location` table property: + +```scala +data.writeTo("prod.db.table") + .tableProperty("location", "/path/to/location") + .createOrReplace() +``` + +## Writing Distribution Modes + +Iceberg's default Spark writers require that the data in each spark task is clustered by partition values. This +distribution is required to minimize the number of file handles that are held open while writing. By default, starting +in Iceberg 1.2.0, Iceberg also requests that Spark pre-sort data to be written to fit this distribution. The +request to Spark is done through the table property `write.distribution-mode` with the value `hash`. + +Let's go through writing the data against below sample table: + +```sql +CREATE TABLE prod.db.sample ( + id bigint, + data string, + category string, + ts timestamp) +USING iceberg +PARTITIONED BY (days(ts), category) +``` + +To write data to the sample table, data needs to be sorted by `days(ts), category` but this is taken care +of automatically by the default `hash` distribution. 
Previously this would have required manually sorting, but this +is no longer the case. + +```sql +INSERT INTO prod.db.sample +SELECT id, data, category, ts FROM another_table +``` + + +There are 3 options for `write.distribution-mode`: + +* `none` - This is the previous default for Iceberg. +This mode does not request any shuffles or sort to be performed automatically by Spark. Because no work is done +automatically by Spark, the data must be *manually* sorted by partition value. The data must be sorted either within +each Spark task, or globally within the entire dataset. A global sort will minimize the number of output files. +A sort can be avoided by using the Spark [write fanout](#write-properties) property but this will cause all +file handles to remain open until each write task has completed. +* `hash` - This mode is the new default and requests that Spark uses a hash-based exchange to shuffle the incoming +write data before writing. +Practically, this means that each row is hashed based on the row's partition value and then placed +in a corresponding Spark task based upon that value. Further division and coalescing of tasks may take place because of +[Spark's Adaptive Query planning](#controlling-file-sizes). +* `range` - This mode requests that Spark perform a range-based exchange to shuffle the data before writing. +This is a two-stage procedure which is more expensive than the `hash` mode. The first stage samples the data to +be written based on the partition and sort columns. The second stage uses the range information to shuffle the input data into Spark +tasks. Each task gets an exclusive range of the input data which clusters the data by partition and also globally sorts. +While this is more expensive than the hash distribution, the global ordering can be beneficial for read performance if +sorted columns are used during queries. This mode is used by default if a table is created with a +sort-order. 
Further division and coalescing of tasks may take place because of +[Spark's Adaptive Query planning](#controlling-file-sizes). + + +## Controlling File Sizes + +When writing data to Iceberg with Spark, it's important to note that Spark cannot write a file larger than a Spark +task and a file cannot span an Iceberg partition boundary. This means that although Iceberg will always roll over a file +when it grows to [`write.target-file-size-bytes`](configuration.md#write-properties), that will not happen unless the Spark task is +large enough. The size of the file created on disk will also be much smaller than the Spark task +since the on-disk data will be both compressed and in columnar format as opposed to Spark's uncompressed row +representation. This means a 100 megabyte Spark task will create a file much smaller than 100 megabytes even if that +task is writing to a single Iceberg partition. If the task writes to multiple partitions, the files will be even +smaller than that. + +To control what data ends up in each Spark task use a [`write distribution mode`](#writing-distribution-modes) +or manually repartition the data. + +To adjust Spark's task size it is important to become familiar with Spark's various Adaptive Query Execution (AQE) +parameters. When the `write.distribution-mode` is not `none`, AQE will control the coalescing and splitting of Spark +tasks during the exchange to try to create tasks of `spark.sql.adaptive.advisoryPartitionSizeInBytes` size. These +settings will also affect any user-performed re-partitions or sorts. +It is important again to note that this is the in-memory Spark row size and not the on-disk +columnar-compressed size, so a larger value than the target file size will need to be specified. The ratio of +in-memory size to on-disk size is data-dependent. Future work in Spark should allow Iceberg to automatically adjust this +parameter at write time to match the `write.target-file-size-bytes`. 
+ +## Type compatibility + +Spark and Iceberg support different sets of types. Iceberg does the type conversion automatically, but not for all combinations, +so you may want to understand the type conversion in Iceberg prior to designing the types of columns in your tables. + +### Spark type to Iceberg type + +This type conversion table describes how Spark types are converted to the Iceberg types. The conversion applies both when creating an Iceberg table and when writing to an Iceberg table via Spark. + +| Spark | Iceberg | Notes | +|-----------------|-------------------------|-------| +| boolean | boolean | | +| short | integer | | +| byte | integer | | +| integer | integer | | +| long | long | | +| float | float | | +| double | double | | +| date | date | | +| timestamp | timestamp with timezone | | +| char | string | | +| varchar | string | | +| string | string | | +| binary | binary | | +| decimal | decimal | | +| struct | struct | | +| array | list | | +| map | map | | + +!!! info + The table describes the conversion applied when creating a table. In fact, broader support is applied on write. Here are some points on write: + + * Iceberg numeric types (`integer`, `long`, `float`, `double`, `decimal`) support promotion during writes. For example, you can write Spark types `short`, `byte`, `integer`, `long` to Iceberg type `long`. + * You can write to Iceberg `fixed` type using Spark `binary` type. Note that an assertion on the length will be performed. + + +### Iceberg type to Spark type + +This type conversion table describes how Iceberg types are converted to the Spark types. The conversion applies when reading from an Iceberg table via Spark. 
+ +| Iceberg | Spark | Note | +|----------------------------|-------------------------|---------------| +| boolean | boolean | | +| integer | integer | | +| long | long | | +| float | float | | +| double | double | | +| date | date | | +| time | | Not supported | +| timestamp with timezone | timestamp | | +| timestamp without timezone | | Not supported | +| string | string | | +| uuid | string | | +| fixed | binary | | +| binary | binary | | +| decimal | decimal | | +| struct | struct | | +| list | array | | +| map | map | | + diff --git a/docs-new/home/docs/latest/table-migration.md b/docs-new/home/docs/latest/table-migration.md new file mode 100644 index 000000000000..2e56d378c531 --- /dev/null +++ b/docs-new/home/docs/latest/table-migration.md @@ -0,0 +1,74 @@ +--- +title: "Overview" +--- + + +# Table Migration +Apache Iceberg supports converting existing tables in other formats to Iceberg tables. This section introduces the general concept of table migration, its approaches, and existing implementations in Iceberg. + +## Migration Approaches +There are two methods for executing table migration: full data migration and in-place metadata migration. + +Full data migration involves copying all data files from the source table to the new Iceberg table. This method makes the new table fully isolated from the source table, but is slower and doubles the space. +In practice, users can use operations like [Create-Table-As-Select](spark-ddl.md#create-table-as-select), [INSERT](spark-writes.md#insert-into), and Change-Data-Capture pipelines to perform such migration. + +In-place metadata migration preserves the existing data files while incorporating Iceberg metadata on top of them. +This method is not only faster but also eliminates the need for data duplication. However, the new table and the source table are not fully isolated. In other words, if any processes vacuum data files from the source table, the new table will also be affected. 
+ +This document describes in-place metadata migration in more detail. + +![In-Place Metadata Migration](assets/images/iceberg-in-place-metadata-migration.png) + +Apache Iceberg supports the in-place metadata migration approach, which includes three important actions: **Snapshot Table**, **Migrate Table**, and **Add Files**. + +## Snapshot Table +The Snapshot Table action creates a new Iceberg table with a different name and with the same schema and partitioning as the source table, leaving the source table unchanged during and after the action. + +- Create a new Iceberg table with the same metadata (schema, partition spec, etc.) as the source table and a different name. Readers and writers on the source table can continue to work. + +![Snapshot Table Step 1](assets/images/iceberg-snapshotaction-step1.png) + +- Commit all data files across all partitions to the new Iceberg table. The source table remains unchanged. Readers can be switched to the new Iceberg table. + +![Snapshot Table Step 2](assets/images/iceberg-snapshotaction-step2.png) + +- Eventually, all writers can be switched to the new Iceberg table. Once all writers are transitioned to the new Iceberg table, the migration process will be considered complete. + +## Migrate Table +The Migrate Table action also creates a new Iceberg table with the same schema and partitioning as the source table. However, during the action execution, it locks and drops the source table from the catalog. +Consequently, Migrate Table requires all modifications working on the source table to be stopped before the action is performed. + +- Stop all writers interacting with the source table. Readers that also support Iceberg may continue reading. + +![Migrate Table Step 1](assets/images/iceberg-migrateaction-step1.png) + +- Create a new Iceberg table with the same identifier and metadata (schema, partition spec, etc.) as the source table. Rename the source table for a backup in case of failure and rollback. 
+ +![Migrate Table Step 2](assets/images/iceberg-migrateaction-step2.png) + +- Commit all data files across all partitions to the new Iceberg table. Drop the source table. Writers can start writing to the new Iceberg table. + +![Migrate Table Step 3](assets/images/iceberg-migrateaction-step3.png) + +## Add Files +After the initial step (either Snapshot Table or Migrate Table), it is common to find some data files that have not been migrated. These files often originate from concurrent writers who continue writing to the source table during or after the migration process. +In practice, these files can be new data files in Hive tables or new snapshots (versions) of Delta Lake tables. The Add Files action is essential for incorporating these files into the Iceberg table. + +# Migrating From Different Table Formats +* [From Hive to Iceberg](hive-migration.md) +* [From Delta Lake to Iceberg](delta-lake-migration.md) diff --git a/docs-new/home/fileio.md b/docs-new/home/fileio.md new file mode 100644 index 000000000000..454e70b0eaad --- /dev/null +++ b/docs-new/home/fileio.md @@ -0,0 +1,44 @@ +--- +title: "FileIO" +--- + + +# Iceberg FileIO + +## Overview + +Iceberg comes with a flexible abstraction around reading and writing data and metadata files. The FileIO interface allows the Iceberg library to communicate with the underlying storage layer. FileIO is used for all metadata operations during the job planning and commit stages. + +## Iceberg Files + +The metadata for an Iceberg table tracks the absolute path for data files which allows greater abstraction over the physical layout. Additionally, changes to table state are performed by writing new metadata files and never involve renaming files. This allows a much smaller set of requirements for file operations. The essential functionality for a FileIO implementation is that it can read files, write files, and seek to any position within a stream. 
+ +## Usage in Processing Engines + +The responsibility of reading and writing data files lies with the processing engines and happens during task execution. However, after data files are written, processing engines use FileIO to write new Iceberg metadata files that capture the new state of the table. A blog post that provides a deeper understanding of FileIO is +[Iceberg FileIO: Cloud Native Tables](https://tabular.io/blog/iceberg-fileio/) + +Different FileIO implementations are used depending on the type of storage. Iceberg comes with a set of FileIO implementations for popular storage providers. +- Amazon S3 +- Google Cloud Storage +- Object Service Storage (including https) +- Dell Enterprise Cloud Storage +- Hadoop (adapts any Hadoop FileSystem implementation) + +As an example, take a look at the blog post [Using Iceberg's S3FileIO Implementation to Store Your Data in MinIO](https://tabular.io/blog/minio/) +which walks through how to use the Amazon S3 FileIO with MinIO. diff --git a/docs-new/home/gcm-stream-spec.md b/docs-new/home/gcm-stream-spec.md new file mode 100644 index 000000000000..d2074e02ab6d --- /dev/null +++ b/docs-new/home/gcm-stream-spec.md @@ -0,0 +1,85 @@ +--- +title: "AES GCM Stream Spec" +--- + + +# AES GCM Stream file format extension + +## Background and Motivation + +Iceberg supports a number of data file formats. Two of these formats (Parquet and ORC) have built-in encryption capabilities, that allow to protect sensitive information in the data files. However, besides the data files, Iceberg tables also have metadata files, that keep sensitive information too (e.g., min/max values in manifest files, or bloom filter bitsets in puffin files). Metadata file formats (AVRO, JSON, Puffin) don't have encryption support. + +Moreover, with the exception of Parquet, no Iceberg data or metadata file format supports integrity verification, required for end-to-end tamper proofing of Iceberg tables. 
+ +This document specifies details of a simple file format extension that adds encryption and tamper-proofing to any existing file format. + +## Goals + +* Metadata encryption: enable encryption of manifests, manifest lists, snapshots and stats. +* Avro data encryption: enable encryption of data files in tables that use the Avro format. +* Support read splitting: enable seekable decrypting streams that can be used with splittable formats like Avro. +* Tamper proofing of Iceberg data and metadata files. + +## Overview + +The output stream, produced by a metadata or data writer, is split into equal-size blocks (plus a last block that can be shorter). Each block is enciphered (encrypted/signed) with a given encryption key, and stored in a file in the AES GCM Stream format. Upon reading, the stored cipherblocks are verified for integrity; then decrypted and passed to metadata or data readers. + +## Encryption algorithm + +AES GCM Stream uses the standard AES GCM cipher, and supports all AES key sizes: 128, 192 and 256 bits. + +AES GCM is an authenticated encryption algorithm. Besides data confidentiality (encryption), it supports two levels of integrity verification (authentication): of the data (default), and of the data combined with an optional AAD (“additional authenticated data”). An AAD is a free text to be authenticated, together with the data. The structure of AES GCM Stream AADs is described below. + +AES GCM requires a unique vector to be provided for each encrypted block. In this document, the unique input to GCM encryption is called a nonce (“number used once”). AES GCM Stream encryption uses the RBG-based (random bit generator) nonce construction as defined in section 8.2.2 of the NIST SP 800-38D document. For each encrypted block, AES GCM Stream generates a unique nonce with a length of 12 bytes (96 bits). + +## Format specification + +### File structure + +The AES GCM Stream files have the following structure + +``` +Magic BlockLength CipherBlock₁ CipherBlock₂ ... 
CipherBlockₙ +``` + +where + +- `Magic` is four bytes 0x41, 0x47, 0x53, 0x31 ("AGS1", short for: AES GCM Stream, version 1) +- `BlockLength` is a four-byte (little endian) integer keeping the length of the equal-size split blocks before encryption. The length is specified in bytes. +- `CipherBlockᵢ` is the i-th enciphered block in the file, with the structure defined below. + +### Cipher Block structure + +Cipher blocks have the following structure + +| nonce | ciphertext | tag | +|-------|------------|-----| + +where + +- `nonce` is the AES GCM nonce, with a length of 12 bytes. +- `ciphertext` is the encrypted block. Its length is identical to the length of the block before encryption ("plaintext"). The length of all plaintext blocks, except the last, is `BlockLength` bytes. The last block has a non-zero length <= `BlockLength`. +- `tag` is the AES GCM tag, with a length of 16 bytes. + +AES GCM Stream encrypts all blocks by the GCM cipher, without padding. The AES GCM cipher must be implemented by a cryptographic provider according to the NIST SP 800-38D specification. In AES GCM Stream, an input to the GCM cipher is an AES encryption key, a nonce, a plaintext and an AAD (described below). The output is a ciphertext with the length equal to that of plaintext, and a 16-byte authentication tag used to verify the ciphertext and AAD integrity. + +### Additional Authenticated Data + +The AES GCM cipher protects against byte replacement inside a ciphertext block - but, without an AAD, it can't prevent replacement of one ciphertext block with another (encrypted with the same key). AES GCM Stream leverages AADs to protect against swapping ciphertext blocks inside a file or between files. AES GCM Stream can also protect against swapping full files - for example, replacement of a metadata file with an old version. AADs are built to reflect the identity of a file and of the blocks inside the file. 
+ +AES GCM Stream constructs a block AAD from two components: an AAD prefix - a string provided by Iceberg for the file (with the file ID), and an AAD suffix - the block sequence number in the file, as an int in a 4-byte little-endian form. The block AAD is a direct concatenation of the prefix and suffix parts. diff --git a/docs-new/home/hive-quickstart.md b/docs-new/home/hive-quickstart.md new file mode 100644 index 000000000000..57cc02157a53 --- /dev/null +++ b/docs-new/home/hive-quickstart.md @@ -0,0 +1,115 @@ +--- +title: "Hive and Iceberg Quickstart" +--- + + + +## Hive and Iceberg Quickstart + +This guide will get you up and running with an Iceberg and Hive environment, including sample code to +highlight some powerful features. You can learn more about Iceberg's Hive runtime by checking out the [Hive](docs/latest/hive.md) section. + +- [Docker Images](#docker-images) +- [Creating a Table](#creating-a-table) +- [Writing Data to a Table](#writing-data-to-a-table) +- [Reading Data from a Table](#reading-data-from-a-table) +- [Next Steps](#next-steps) + +### Docker Images + +The fastest way to get started is to use [Apache Hive images](https://hub.docker.com/r/apache/hive) +which provides a SQL-like interface to create and query Iceberg tables from your laptop. You need to install the [Docker Desktop](https://www.docker.com/products/docker-desktop/). + +Take a look at the Tags tab in [Apache Hive docker images](https://hub.docker.com/r/apache/hive/tags?page=1&ordering=-last_updated) to see the available Hive versions. + +Set the version variable. +```sh +export HIVE_VERSION=4.0.0-alpha-2 +``` + +Start the container, using the option `--platform linux/amd64` for a Mac with an M-Series chip: +```sh +docker run -d --platform linux/amd64 -p 10000:10000 -p 10002:10002 --env SERVICE_NAME=hiveserver2 --name hive4 apache/hive:${HIVE_VERSION} +``` + +The docker run command above configures Hive to use the embedded derby database for Hive Metastore. 
Hive Metastore functions as the Iceberg catalog to locate Iceberg files, which can be anywhere. + +Give HiveServer (HS2) a little time to come up in the docker container, and then start the Hive Beeline client using the following command to connect with the HS2 containers you already started: +```sh +docker exec -it hive4 beeline -u 'jdbc:hive2://localhost:10000/' +``` + +The hive prompt appears: +```sh +0: jdbc:hive2://localhost:10000> +``` + +You can now run SQL queries to create Iceberg tables and query the tables. +```sql +show databases; +``` + +### Creating a Table + +To create your first Iceberg table in Hive, run a [`CREATE TABLE`](docs/latest/hive.md#create-table) command. Let's create a table +using `nyc.taxis` where `nyc` is the database name and `taxis` is the table name. +```sql +CREATE DATABASE nyc; +``` +```sql +CREATE TABLE nyc.taxis +( + trip_id bigint, + trip_distance float, + fare_amount double, + store_and_fwd_flag string +) +PARTITIONED BY (vendor_id bigint) STORED BY ICEBERG; +``` +Iceberg catalogs support the full range of SQL DDL commands, including: + +* [`CREATE TABLE`](docs/latest/hive.md#create-table) +* [`CREATE TABLE AS SELECT`](docs/latest/hive.md#create-table-as-select) +* [`CREATE TABLE LIKE TABLE`](docs/latest/hive.md#create-table-like-table) +* [`ALTER TABLE`](docs/latest/hive.md#alter-table) +* [`DROP TABLE`](docs/latest/hive.md#drop-table) + +### Writing Data to a Table + +After your table is created, you can insert records. +```sql +INSERT INTO nyc.taxis +VALUES (1000371, 1.8, 15.32, 'N', 1), (1000372, 2.5, 22.15, 'N', 2), (1000373, 0.9, 9.01, 'N', 2), (1000374, 8.4, 42.13, 'Y', 1); +``` + +### Reading Data from a Table + +To read a table, simply use the Iceberg table's name. +```sql +SELECT * FROM nyc.taxis; +``` + +### Next steps + +#### Adding Iceberg to Hive + +If you already have a Hive 4.0.0-alpha-1, or later, environment, it comes with the Iceberg 0.13.1 included. No additional downloads or jars are needed. 
If you have a Hive 2.3.x or Hive 3.1.x environment see [Enabling Iceberg support in Hive](docs/latest/hive.md#enabling-iceberg-support-in-hive). + +#### Learn More + +To learn more about setting up a database other than Derby, see [Apache Hive Quick Start](https://hive.apache.org/developement/quickstart/). You can also [set up a standalone metastore, HS2 and Postgres](https://github.com/apache/hive/blob/master/packaging/src/docker/docker-compose.yml). Now that you're up and running with Iceberg and Hive, check out the [Iceberg-Hive docs](docs/latest/hive.md) to learn more! diff --git a/docs-new/home/how-to-release.md b/docs-new/home/how-to-release.md new file mode 100644 index 000000000000..e2d9ae4ceec3 --- /dev/null +++ b/docs-new/home/how-to-release.md @@ -0,0 +1,516 @@ +--- +title: "How To Release" +--- + + +## Introduction + +This page walks you through the release process of the Iceberg project. [Here](https://www.apache.org/legal/release-policy.html) you can read about the release process in general for an Apache project. + +Decisions about releases are made by three groups: + +* Release Manager: Does the work of creating the release, signing it, counting [votes](#voting), announcing the release and so on. Requires the assistance of a committer for some steps. +* The community: Performs the discussion of whether it is the right time to create a release and what that release should contain. The community can also cast non-binding votes on the release. +* PMC: Gives binding votes on the release. + +This page describes the procedures that the release manager and voting PMC members take during the release process. 
+ +## Setup + +To create a release candidate, you will need: + +* Apache LDAP credentials for Nexus and SVN +* A [GPG key for signing](https://www.apache.org/dev/release-signing#generate), published in [KEYS](https://dist.apache.org/repos/dist/dev/iceberg/KEYS) + +If you have not published your GPG key yet, you must publish it before sending the vote email by doing: + +```shell +svn co https://dist.apache.org/repos/dist/dev/iceberg icebergsvn +cd icebergsvn +echo "" >> KEYS # append a newline +gpg --list-sigs >> KEYS # append signatures +gpg --armor --export >> KEYS # append public key block +svn commit -m "add key for " +``` + +### Nexus access + +Nexus credentials are configured in your personal `~/.gradle/gradle.properties` file using `mavenUser` and `mavenPassword`: + +``` +mavenUser=yourApacheID +mavenPassword=SomePassword +``` + +### PGP signing + +The release scripts use the command-line `gpg` utility so that signing can use the gpg-agent and does not require writing your private key's passphrase to a configuration file. + +To configure Gradle to sign convenience binary artifacts, add the following settings to `~/.gradle/gradle.properties`: + +``` +signing.gnupg.keyName=Your Name (CODE SIGNING KEY) +``` + +To use `gpg` instead of `gpg2`, also set `signing.gnupg.executable=gpg` + +For more information, see the Gradle [signing documentation](https://docs.gradle.org/current/userguide/signing_plugin.html#sec:signatory_credentials). + +### Apache repository + +The release should be executed against `https://github.com/apache/iceberg.git` instead of any fork. +Set it as remote with name `apache` for release if it is not already set up. + +## Creating a release candidate + +### Initiate a discussion about the release with the community + +This step can be useful to gather ongoing patches that the community thinks should be in the upcoming release. 
+ +The communication can be started via a [DISCUSS] mail on the dev@ channel and the desired tickets can be added to the github milestone of the next release. + +Note, creating a milestone in github requires a committer. However, a non-committer can assign tasks to a milestone if added to the list of collaborators in [.asf.yaml](https://github.com/apache/iceberg/blob/master/.asf.yaml) + +The release status is discussed during each community sync meeting. Release manager should join the meeting to report status and discuss any release blocker. + +### Build the source release + +To create the source release artifacts, run the `source-release.sh` script with the release version and release candidate number: + +```bash +dev/source-release.sh -v 0.13.0 -r 0 -k +``` + +Example console output: + +```text +Preparing source for apache-iceberg-0.13.0-rc1 +Adding version.txt and tagging release... +[master ca8bb7d0] Add version.txt for release 0.13.0 + 1 file changed, 1 insertion(+) + create mode 100644 version.txt +Pushing apache-iceberg-0.13.0-rc1 to origin... +Enumerating objects: 5, done. +Counting objects: 100% (5/5), done. +Delta compression using up to 12 threads +Compressing objects: 100% (3/3), done. +Writing objects: 100% (4/4), 433 bytes | 433.00 KiB/s, done. +Total 4 (delta 1), reused 0 (delta 0) +remote: Resolving deltas: 100% (1/1), completed with 1 local object. +To https://github.com/apache/iceberg.git + * [new tag] apache-iceberg-0.13.0-rc1 -> apache-iceberg-0.13.0-rc1 +Creating tarball using commit ca8bb7d0821f35bbcfa79a39841be8fb630ac3e5 +Signing the tarball... +Checking out Iceberg RC subversion repo... +Checked out revision 52260. +Adding tarball to the Iceberg distribution Subversion repo... 
+A tmp/apache-iceberg-0.13.0-rc1 +A tmp/apache-iceberg-0.13.0-rc1/apache-iceberg-0.13.0.tar.gz.asc +A (bin) tmp/apache-iceberg-0.13.0-rc1/apache-iceberg-0.13.0.tar.gz +A tmp/apache-iceberg-0.13.0-rc1/apache-iceberg-0.13.0.tar.gz.sha512 +Adding tmp/apache-iceberg-0.13.0-rc1 +Adding (bin) tmp/apache-iceberg-0.13.0-rc1/apache-iceberg-0.13.0.tar.gz +Adding tmp/apache-iceberg-0.13.0-rc1/apache-iceberg-0.13.0.tar.gz.asc +Adding tmp/apache-iceberg-0.13.0-rc1/apache-iceberg-0.13.0.tar.gz.sha512 +Transmitting file data ...done +Committing transaction... +Committed revision 52261. +Creating release-announcement-email.txt... +Success! The release candidate is available here: + https://dist.apache.org/repos/dist/dev/iceberg/apache-iceberg-0.13.0-rc1 + +Commit SHA1: ca8bb7d0821f35bbcfa79a39841be8fb630ac3e5 + +We have generated a release announcement email for you here: +/Users/jackye/iceberg/release-announcement-email.txt + +Please note that you must update the Nexus repository URL +contained in the mail before sending it out. +``` + +The source release script will create a candidate tag based on the HEAD revision in git and will prepare the release tarball, signature, and checksum files. It will also upload the source artifacts to SVN. + +Note the commit SHA1 and candidate location because those will be added to the vote thread. + +Once the source release is ready, use it to stage convenience binary artifacts in Nexus. + +### Build and stage convenience binaries + +Convenience binaries are created using the source release tarball from the last step. + +Untar the source release and go into the release directory: + +```bash +tar xzf apache-iceberg-0.13.0.tar.gz +cd apache-iceberg-0.13.0 +``` + +To build and publish the convenience binaries, run the `dev/stage-binaries.sh` script. This will push to a release staging repository. + +Disable Gradle parallelism by setting `org.gradle.parallel=false` in `gradle.properties`. 
+ +``` +dev/stage-binaries.sh +``` + +Next, you need to close the staging repository: + +1. Go to [Nexus](https://repository.apache.org/) and log in +2. In the menu on the left, choose "Staging Repositories" +3. Select the Iceberg repository + * If multiple staging repositories are created after running the script, verify that gradle parallelism is disabled and try again. +4. At the top, select "Close" and follow the instructions + * In the comment field use "Apache Iceberg <version> RC<num>" + +### Start a VOTE thread + +The last step for a candidate is to create a VOTE thread on the dev mailing list. +The email template is already generated in `release-announcement-email.txt` with some details filled. + +Example title subject: + +```text +[VOTE] Release Apache Iceberg RC +``` + +Example content: + +```text +Hi everyone, + +I propose the following RC to be released as official Apache Iceberg release. + +The commit id is +* This corresponds to the tag: apache-iceberg--rc +* https://github.com/apache/iceberg/commits/apache-iceberg--rc +* https://github.com/apache/iceberg/tree/ + +The release tarball, signature, and checksums are here: +* https://dist.apache.org/repos/dist/dev/iceberg/apache-iceberg--rc/ + +You can find the KEYS file here: +* https://dist.apache.org/repos/dist/dev/iceberg/KEYS + +Convenience binary artifacts are staged in Nexus. The Maven repository URL is: +* https://repository.apache.org/content/repositories/orgapacheiceberg-/ + +This release includes important changes that I should have summarized here, but I'm lazy. + +Please download, verify, and test. + +Please vote in the next 72 hours. (Weekends excluded) + +[ ] +1 Release this as Apache Iceberg +[ ] +0 +[ ] -1 Do not release this because... + +Only PMC members have binding votes, but other community members are encouraged to cast +non-binding votes. This vote will pass if there are 3 binding +1 votes and more binding ++1 votes than -1 votes. 
+``` + +When a candidate is passed or rejected, reply with the voting result: + +```text +Subject: [RESULT][VOTE] Release Apache Iceberg RC +``` + +```text +Thanks everyone who participated in the vote for Release Apache Iceberg RC. + +The vote result is: + ++1: 3 (binding), 5 (non-binding) ++0: 0 (binding), 0 (non-binding) +-1: 0 (binding), 0 (non-binding) + +Therefore, the release candidate is passed/rejected. +``` + + +### Finishing the release + +After the release vote has passed, you need to release the last candidate's artifacts. + +But note that releasing the artifacts should happen around the same time the new docs are released +so make sure the [documentation changes](#documentation-release) +are prepared when going through the below steps. + +#### Publishing the release + +First, copy the source release directory to releases: + +```bash +mkdir iceberg +cd iceberg +svn co https://dist.apache.org/repos/dist/dev/iceberg candidates +svn co https://dist.apache.org/repos/dist/release/iceberg releases +cp -r candidates/apache-iceberg--rcN/ releases/apache-iceberg- +cd releases +svn add apache-iceberg- +svn ci -m 'Iceberg: Add release ' +``` + +!!! Note +The above step requires PMC privileges to execute. + +Next, add a release tag to the git repository based on the passing candidate tag: + +```bash +git tag -am 'Release Apache Iceberg ' apache-iceberg- apache-iceberg--rcN +``` + +Then release the candidate repository in [Nexus](https://repository.apache.org/#stagingRepositories). + +#### Announcing the release + +To announce the release, wait until Maven central has mirrored the Apache binaries, then update the Iceberg site and send an announcement email: + +```text +[ANNOUNCE] Apache Iceberg release +``` +```text +I'm pleased to announce the release of Apache Iceberg ! + +Apache Iceberg is an open table format for huge analytic datasets. 
Iceberg +delivers high query performance for tables with tens of petabytes of data, +along with atomic commits, concurrent writes, and SQL-compatible table +evolution. + +This release can be downloaded from: https://www.apache.org/dyn/closer.cgi/iceberg// + +Java artifacts are available from Maven Central. + +Thanks to everyone for contributing! +``` + +#### Update revapi + +Create a PR in the `iceberg` repo to make revapi run on the new release. For an example see [this PR](https://github.com/apache/iceberg/pull/6275). + +#### Update github issue template + +Create a PR in the `iceberg` repo to add the new version to the github issue template. For an example see [this PR](https://github.com/apache/iceberg/pull/6287). + +### Documentation Release + +Documentation needs to be updated as a part of an Iceberg release after a release candidate is passed. +The commands described below assume you are in a directory containing a local clone of the `iceberg-docs` +repository and `iceberg` repository. Adjust the commands accordingly if it is not the case. Note that all +changes in `iceberg` need to happen against the `master` branch and changes in `iceberg-docs` need to happen +against the `main` branch. + +#### Common documentation update + +1. To start the release process, run the following steps in the `iceberg-docs` repository to copy docs over: +```shell +cp -r ../iceberg/format/* ../iceberg-docs/landing-page/content/common/ +``` +2. Change into the `iceberg-docs` repository and create a branch. +```shell +cd ../iceberg-docs +git checkout -b +``` +3. Commit, push, and open a PR against the `iceberg-docs` repo (`` -> `main`) + +#### Versioned documentation update + +Once the common docs changes have been merged into `main`, the next step is to update the versioned docs. + +1. In the `iceberg-docs` repository, cut a new branch using the version number as the branch name +```shell +cd ../iceberg-docs +git checkout -b +git push --set-upstream apache +``` +2. 
Copy the versioned docs from the `iceberg` repo into the `iceberg-docs` repo +```shell +rm -rf ../iceberg-docs/docs/content +cp -r ../iceberg/docs ../iceberg-docs/docs/content +``` +3. Commit the changes and open a PR against the `` branch in the `iceberg-docs` repo + +#### Javadoc update + +In the `iceberg` repository, generate the javadoc for your release and copy it to the `javadoc` folder in the `iceberg-docs` repo: +```shell +cd ../iceberg +./gradlew refreshJavadoc +rm -rf ../iceberg-docs/javadoc +cp -r site/docs/javadoc/ ../iceberg-docs/javadoc +``` + +The resulting changes in `iceberg-docs` should be approved in a separate PR. + +#### Update the latest branch + +Since `main` is currently the same as the version branch, one needs to rebase the `latest` branch against `main`: + +```shell +git checkout latest +git rebase main +git push apache latest +``` + +#### Set latest version in iceberg-docs repo + +The last step is to update the `main` branch in `iceberg-docs` to set the latest version. +A PR needs to be published in the `iceberg-docs` repository with the following changes: +1. Update variable `latestVersions.iceberg` to the new release version in `landing-page/config.toml` +2. Update variable `latestVersions.iceberg` to the new release version and +`versions.nessie` to the version of `org.projectnessie.nessie:*` from [versions.props](https://github.com/apache/iceberg/blob/master/versions.props) in `docs/config.toml` +3. Update list `versions` with the new release in `landing-page/config.toml` +4. Update list `versions` with the new release in `docs/config.toml` +5. Mark the current latest release notes to past releases under `landing-page/content/common/release-notes.md` +6. Add release notes for the new release version in `landing-page/content/common/release-notes.md` + +# How to Verify a Release + +Each Apache Iceberg release is validated by the community by holding a vote. 
A community release manager +will prepare a release candidate and call a vote on the Iceberg +[dev list](community.md#mailing-lists). +To validate the release candidate, community members will test it out in their downstream projects and environments. +It's recommended to report the Java, Scala, Spark, Flink and Hive versions you have tested against when you vote. + +In addition to testing in downstream projects, community members also check the release's signatures, checksums, and +license documentation. + +## Validating a source release candidate + +Release announcements include links to the following: + +- **A source tarball** +- **A signature (.asc)** +- **A checksum (.sha512)** +- **KEYS file** +- **GitHub change comparison** + +After downloading the source tarball, signature, checksum, and KEYS file, here are instructions on how to +verify signatures, checksums, and documentation. + +### Verifying Signatures + +First, import the keys. +```bash +curl https://dist.apache.org/repos/dist/dev/iceberg/KEYS -o KEYS +gpg --import KEYS +``` + +Next, verify the `.asc` file. +```bash +gpg --verify apache-iceberg-{{ icebergVersion }}.tar.gz.asc +``` + +### Verifying Checksums + +```bash +shasum -a 512 --check apache-iceberg-{{ icebergVersion }}.tar.gz.sha512 +``` + +### Verifying License Documentation + +Untar the archive and change into the source directory. +```bash +tar xzf apache-iceberg-{{ icebergVersion }}.tar.gz +cd apache-iceberg-{{ icebergVersion }} +``` + +Run RAT checks to validate license headers. +```bash +dev/check-license +``` + +### Verifying Build and Test + +To verify that the release candidate builds properly, run the following command. +```bash +./gradlew build +``` + +## Testing release binaries + +Release announcements will also include a maven repository location. You can use this +location to test downstream dependencies by adding it to your maven or gradle build. 
+ +To use the release in your maven build, add the following to your `POM` or `settings.xml`: +```xml +... +  <repositories> +    <repository> +      <id>iceberg-release-candidate</id> +      <name>Iceberg Release Candidate</name> +      <url>${MAVEN_URL}</url> +    </repository> +  </repositories> +... +``` + +To use the release in your gradle build, add the following to your `build.gradle`: +```groovy +repositories { +    mavenCentral() +    maven { +        url "${MAVEN_URL}" +    } +} +``` + +!!! Note +    Replace `${MAVEN_URL}` with the URL provided in the release announcement + +### Verifying with Spark + +To verify using Spark, start a `spark-shell` with a command like the following (use the appropriate +spark-runtime jar for the Spark installation): +```bash +spark-shell \ +    --conf spark.jars.repositories=${MAVEN_URL} \ +    --packages org.apache.iceberg:iceberg-spark3-runtime:{{ icebergVersion }} \ +    --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ +    --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ +    --conf spark.sql.catalog.local.type=hadoop \ +    --conf spark.sql.catalog.local.warehouse=${LOCAL_WAREHOUSE_PATH} \ +    --conf spark.sql.catalog.local.default-namespace=default \ +    --conf spark.sql.defaultCatalog=local +``` + +### Verifying with Flink + +To verify using Flink, start a Flink SQL Client with the following command: +```bash +wget ${MAVEN_URL}/iceberg-flink-runtime/{{ icebergVersion }}/iceberg-flink-runtime-{{ icebergVersion }}.jar + +sql-client.sh embedded \ +    -j iceberg-flink-runtime-{{ icebergVersion }}.jar \ +    -j ${FLINK_CONNECTOR_PACKAGE}-${HIVE_VERSION}_${SCALA_VERSION}-${FLINK_VERSION}.jar \ +    shell +``` + +## Voting + +Votes are cast by replying to the release candidate announcement email on the dev mailing list +with either `+1`, `0`, or `-1`. + +> [ ] +1 Release this as Apache Iceberg {{ icebergVersion }} +[ ] +0 +[ ] -1 Do not release this because... + +In addition to your vote, it's customary to specify if your vote is binding or non-binding. 
Only members +of the Project Management Committee have formally binding votes. If you're unsure, you can specify that your +vote is non-binding. To read more about voting in the Apache framework, checkout the +[Voting](https://www.apache.org/foundation/voting.html) information page on the Apache foundation's website. diff --git a/docs-new/home/index.md b/docs-new/home/index.md new file mode 100644 index 000000000000..ff7576fac9a3 --- /dev/null +++ b/docs-new/home/index.md @@ -0,0 +1,108 @@ +--- +title: "Intro to Apache Iceberg" +--- + + + + + +
+
+
+
+ +## Apache Iceberg + +### The open table format for analytic datasets. + +* [Community](community.md) +* [GitHub](https://github.com/apache/iceberg) +* [Slack](https://github.com/apache/iceberg) + +## What is Iceberg? + +Iceberg is a high-performance format for huge analytic tables. Iceberg brings the reliability and simplicity of SQL tables to big data, while making it possible for engines like Spark, Trino, Flink, Presto, Hive and Impala to safely work with the same tables, at the same time. + +[Learn More](https://iceberg.apache.org/getting-started) + +## Expressive SQL + +Iceberg supports flexible SQL commands to merge new data, update existing rows, and perform targeted deletes. Iceberg can eagerly rewrite data files for read performance, or it can use delete deltas for faster updates. +[Learn More](https://iceberg.apache.org/docs/latest/spark-writes/) + +
+
MERGE INTO prod.nyc.taxis pt USING (SELECT * FROM staging.nyc.taxis) st ON pt.id = st.id WHEN NOT MATCHED THEN INSERT *; Done! +
+
+ +## Full Schema Evolution + +Schema evolution just works. Adding a column won't bring back "zombie"🧟 data. Columns can be renamed and reordered. Best of all, schema changes never require rewriting your table. + +[Learn More](https://iceberg.apache.org/docs/latest/evolution/) + +
+
ALTER TABLE taxis ALTER COLUMN trip_distance TYPE double; Done! ALTER TABLE taxis ALTER COLUMN trip_distance AFTER fare; Done! ALTER TABLE taxis RENAME COLUMN trip_distance TO distance; Done! +
+
+ +## Hidden Partitioning + +Iceberg handles the tedious and error-prone task of producing partition values for rows in a table and skips unnecessary partitions and files automatically. No extra filters are needed for fast queries, and table layout can be updated as data or queries change. + +[Learn More](https://iceberg.apache.org/docs/latest/partitioning/#icebergs-hidden-partitioning) + +
+
+ + +
+
+ +## Time Travel and Rollback + +Time-travel enables reproducible queries that use exactly the same table snapshot, or lets users easily examine changes. Version rollback allows users to quickly correct problems by resetting tables to a good state. +[Learn More](https://iceberg.apache.org/docs/latest/spark-queries/#time-travel) + +
+
SELECT count(*) FROM nyc.taxis 2,853,020 SELECT count(*) FROM nyc.taxis FOR VERSION AS OF 2188465307835585443 2,798,371 SELECT count(*) FROM nyc.taxis FOR TIMESTAMP AS OF TIMESTAMP '2022-01-01 00:00:00.000000 Z' 2,798,371 +
+
+ +## Data Compaction + +Data compaction is supported out-of-the-box and you can choose from different rewrite strategies such as bin-packing or sorting to optimize file layout and size. + +
+
CALL system.rewrite_data_files("nyc.taxis"); +
+
+ + diff --git a/docs-new/home/multi-engine-support.md b/docs-new/home/multi-engine-support.md new file mode 100644 index 000000000000..75477ffc7106 --- /dev/null +++ b/docs-new/home/multi-engine-support.md @@ -0,0 +1,116 @@ +--- +title: "Multi-Engine Support" +--- + + +# Multi-Engine Support + +Apache Iceberg is an open standard for huge analytic tables that can be used by any processing engine. +The community continuously improves Iceberg core library components to enable integrations with different compute engines that power analytics, business intelligence, machine learning, etc. +Connectors for Spark, Flink and Hive are maintained in the main Iceberg repository. + +## Multi-Version Support + +Processing engine connectors maintained in the iceberg repository are built for multiple versions. + +For Spark and Flink, each new version that introduces backwards incompatible upgrade has its dedicated integration codebase and release artifacts. +For example, the code for Iceberg Spark 3.1 integration is under `/spark/v3.1` and the code for Iceberg Spark 3.2 integration is under `/spark/v3.2`. +Different artifacts (`iceberg-spark-3.1_2.12` and `iceberg-spark-3.2_2.12`) are released for users to consume. +By doing this, changes across versions are isolated. +New features in Iceberg could be developed against the latest features of an engine without breaking support of old APIs in past engine versions. + +For Hive, Hive 2 uses the `iceberg-mr` package for Iceberg integration, and Hive 3 requires an additional dependency of the `iceberg-hive3` package. + +### Runtime Jar + +Iceberg provides a runtime connector jar for each supported version of Spark, Flink and Hive. +When using Iceberg with these engines, the runtime jar is the only addition to the classpath needed in addition to vendor dependencies. +For example, to use Iceberg with Spark 3.2 and AWS integrations, `iceberg-spark-runtime-3.2_2.12` and AWS SDK dependencies are needed for the Spark installation. 
+ +Spark and Flink provide different runtime jars for each supported engine version. +Hive 2 and Hive 3 currently share the same runtime jar. +The runtime jar names and latest version download links are listed in [the tables below](#current-engine-version-lifecycle-status). + +### Engine Version Lifecycle + +Each engine version undergoes the following lifecycle stages: + +1. **Beta**: a new engine version is supported, but still in the experimental stage. Maybe the engine version itself is still in preview (e.g. Spark `3.0.0-preview`), or the engine does not yet have full feature compatibility compared to old versions. This stage allows Iceberg to release an engine version support without the need to wait for feature parity, shortening the release time. +2. **Maintained**: an engine version is actively maintained by the community. Users can expect parity for most features across all the maintained versions. If a feature has to leverage some new engine functionalities that older versions don't have, then feature parity across maintained versions is not guaranteed. +3. **Deprecated**: an engine version is no longer actively maintained. People who are still interested in the version can backport any necessary feature or bug fix from newer versions, but the community will not spend effort in achieving feature parity. Iceberg recommends users to move towards a newer version. Contributions to a deprecated version are expected to diminish over time, so that eventually no change is added to a deprecated version. +4. **End-of-life**: a vote can be initiated in the community to fully remove a deprecated version out of the Iceberg repository to mark its end of life. 
+ +## Current Engine Version Lifecycle Status + +### Apache Spark + +| Version | Lifecycle Stage | Initial Iceberg Support | Latest Iceberg Support | Latest Runtime Jar | +| ---------- | ------------------ | ----------------------- |------------------------| ------------------ | +| 2.4 | End of Life | 0.7.0-incubating | 1.2.1 | [iceberg-spark-runtime-2.4](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-2.4/1.2.1/iceberg-spark-runtime-2.4-1.2.1.jar) | +| 3.0 | End of Life | 0.9.0 | 1.0.0 | [iceberg-spark-runtime-3.0_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.0_2.12/1.0.0/iceberg-spark-runtime-3.0_2.12-1.0.0.jar) | +| 3.1 | Deprecated | 0.12.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.1_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.1_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.1_2.12-{{ icebergVersion }}.jar) [1] | +| 3.2 | Maintained | 0.13.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.2_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.2_2.12-{{ icebergVersion }}.jar) | +| 3.3 | Maintained | 0.14.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.3_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.3_2.12-{{ icebergVersion }}.jar) | +| 3.4 | Maintained | 1.3.0 | {{ icebergVersion }} | [iceberg-spark-runtime-3.4_2.12](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.4_2.12-{{ icebergVersion }}.jar) | + +* [1] Spark 3.1 shares the same runtime jar `iceberg-spark3-runtime` with Spark 3.0 before Iceberg 0.13.0 + +### Apache Flink + +Based on the guideline of the Flink community, only the latest 2 minor versions are 
actively maintained. +Users should continuously upgrade their Flink version to stay up-to-date. + +| Version | Lifecycle Stage | Initial Iceberg Support | Latest Iceberg Support | Latest Runtime Jar | +| ------- | --------------- | ----------------------- | ---------------------- | ------------------------------------------------------------ | +| 1.11 | End of Life | 0.9.0 | 0.12.1 | [iceberg-flink-runtime](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime/0.12.1/iceberg-flink-runtime-0.12.1.jar) | +| 1.12 | End of Life | 0.12.0 | 0.13.1 | [iceberg-flink-runtime-1.12](https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.12/0.13.2/iceberg-flink-runtime-1.12-0.13.2.jar) [3] | +| 1.13 | End of Life | 0.13.0 | 1.0.0 | [iceberg-flink-runtime-1.13](https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.13/1.0.0/iceberg-flink-runtime-1.13-1.0.0.jar) | +| 1.14 | End of Life | 0.13.0 | 1.2.0 | [iceberg-flink-runtime-1.14](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.14/1.2.0/iceberg-flink-runtime-1.14-1.2.0.jar) | +| 1.15 | Deprecated | 0.14.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.15](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.15/{{ icebergVersion }}/iceberg-flink-runtime-1.15-{{ icebergVersion }}.jar) | +| 1.16 | Maintained | 1.1.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.16](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.16/{{ icebergVersion }}/iceberg-flink-runtime-1.16-{{ icebergVersion }}.jar) | +| 1.17 | Maintained | 1.3.0 | {{ icebergVersion }} | [iceberg-flink-runtime-1.17](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.17/{{ icebergVersion }}/iceberg-flink-runtime-1.17-{{ icebergVersion }}.jar) | + +* [3] Flink 1.12 shares the same runtime jar `iceberg-flink-runtime` with Flink 1.11 
before Iceberg 0.13.0 + +### Apache Hive + +| Version | Recommended minor version | Lifecycle Stage | Initial Iceberg Support | Latest Iceberg Support | Latest Runtime Jar | +| -------------- | ------------------------- | ----------------- | ----------------------- | ---------------------- | ------------------ | +| 2 | 2.3.8 | Maintained | 0.8.0-incubating | {{ icebergVersion }} | [iceberg-hive-runtime](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/{{ icebergVersion }}/iceberg-hive-runtime-{{ icebergVersion }}.jar) | +| 3 | 3.1.2 | Maintained | 0.10.0 | {{ icebergVersion }} | [iceberg-hive-runtime](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/{{ icebergVersion }}/iceberg-hive-runtime-{{ icebergVersion }}.jar) | + +## Developer Guide + +### Maintaining existing engine versions + +Iceberg recommends the following for developers who are maintaining existing engine versions: + +1. New features should always be prioritized first in the latest version, which is either a maintained or beta version. +2. For features that could be backported, contributors are encouraged to either perform backports to all maintained versions, or at least create some issues to track the backport. +3. If the change is small enough, updating all versions in a single PR is acceptable. Otherwise, using separated PRs for each version is recommended. + +### Supporting new engines + +Iceberg recommends new engines to build support by importing the Iceberg libraries to the engine's project. +This allows the Iceberg support to evolve with the engine. +Projects such as [Trino](https://trino.io/docs/current/connector/iceberg.html) and [Presto](https://prestodb.io/docs/current/connector/iceberg.html) are good examples of such support strategy. + +In this approach, an Iceberg version upgrade is needed for an engine to consume new Iceberg features. 
+To facilitate engine development against unreleased Iceberg features, a daily snapshot is published in the [Apache snapshot repository](https://repository.apache.org/content/repositories/snapshots/org/apache/iceberg/). + +If bringing an engine directly to the Iceberg main repository is needed, please raise a discussion thread in the [Iceberg community](community.md). diff --git a/docs-new/home/puffin-spec.md b/docs-new/home/puffin-spec.md new file mode 100644 index 000000000000..1fe9e5b9bc56 --- /dev/null +++ b/docs-new/home/puffin-spec.md @@ -0,0 +1,143 @@ +--- +title: "Puffin Spec" +--- + + +# Puffin file format + +This is a specification for Puffin, a file format designed to store +information such as indexes and statistics about data managed in an +Iceberg table that cannot be stored directly within the Iceberg manifest. A +Puffin file contains arbitrary pieces of information (here called "blobs"), +along with metadata necessary to interpret them. The blobs supported by Iceberg +are documented at [Blob types](#blob-types). + +## Format specification + +A file conforming to the Puffin file format specification should have the structure +as described below. + +### Versions + +Currently, there is a single version of the Puffin file format, described below. + +### File structure + +The Puffin file has the following structure + +``` +Magic Blob₁ Blob₂ ... Blobₙ Footer +``` + +where + +- `Magic` is four bytes 0x50, 0x46, 0x41, 0x31 (short for: Puffin _Fratercula + arctica_, version 1), +- `Blobᵢ` is i-th blob contained in the file, to be interpreted by application + according to the footer, +- `Footer` is defined below. 
+ +### Footer structure + +Footer has the following structure + +``` +Magic FooterPayload FooterPayloadSize Flags Magic +``` + +where + +- `Magic`: four bytes, same as at the beginning of the file +- `FooterPayload`: optionally compressed, UTF-8 encoded JSON payload describing the + blobs in the file, with the structure described below +- `FooterPayloadSize`: a length in bytes of the `FooterPayload` (after compression, + if compressed), stored as 4 byte integer +- `Flags`: 4 bytes for boolean flags + - byte 0 (first) + - bit 0 (lowest bit): whether `FooterPayload` is compressed + - all other bits are reserved for future use and should be set to 0 on write + - all other bytes are reserved for future use and should be set to 0 on write + +A 4 byte integer is always signed, in a two's complement representation, stored +little-endian. + +### Footer Payload + +Footer payload bytes is either uncompressed or LZ4-compressed (as a single +[LZ4 compression frame](https://github.com/lz4/lz4/blob/77d1b93f72628af7bbde0243b4bba9205c3138d9/doc/lz4_Frame_format.md) +with content size present), UTF-8 encoded JSON payload representing a single +`FileMetadata` object. + +#### FileMetadata + +`FileMetadata` has the following fields + + +| Field Name | Field Type | Required | Description | +| ---------- | --------------------------------------- | -------- | ----------- | +| blobs | list of BlobMetadata objects | yes | +| properties | JSON object with string property values | no | storage for arbitrary meta-information, like writer identification/version. See [Common properties](#common-properties) for properties that are recommended to be set by a writer. 
+ +#### BlobMetadata + +`BlobMetadata` has the following fields + +| Field Name | Field Type | Required | Description | +|-------------------|-----------------------------------------|----------| ----------- | +| type | JSON string | yes | See [Blob types](#blob-types) +| fields | JSON list of ints | yes | List of field IDs the blob was computed for; the order of items is used to compute sketches stored in the blob. +| snapshot-id | JSON long | yes | ID of the Iceberg table's snapshot the blob was computed from. +| sequence-number | JSON long | yes | Sequence number of the Iceberg table's snapshot the blob was computed from. +| offset | JSON long | yes | The offset in the file where the blob contents start +| length | JSON long | yes | The length of the blob stored in the file (after compression, if compressed) +| compression-codec | JSON string | no | See [Compression codecs](#compression-codecs). If omitted, the data is assumed to be uncompressed. +| properties | JSON object with string property values | no | storage for arbitrary meta-information about the blob + +### Blob types + +The blobs can be of a type listed below + +#### `apache-datasketches-theta-v1` blob type + +A serialized form of a "compact" Theta sketch produced by the [Apache +DataSketches](https://datasketches.apache.org/) library. The sketch is obtained by +constructing Alpha family sketch with default seed, and feeding it with individual +distinct values converted to bytes using Iceberg's single-value serialization. + +The blob metadata for this blob may include following properties: + +- `ndv`: estimate of number of distinct values, derived from the sketch. + +### Compression codecs + +The data can also be uncompressed. If it is compressed the codec should be one of +codecs listed below. For maximal interoperability, other codecs are not supported. 
+ +| Codec name | Description | +|------------|-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------| +| lz4 | Single [LZ4 compression frame](https://github.com/lz4/lz4/blob/77d1b93f72628af7bbde0243b4bba9205c3138d9/doc/lz4_Frame_format.md), with content size present | +| zstd | Single [Zstandard compression frame](https://github.com/facebook/zstd/blob/8af64f41161f6c2e0ba842006fe238c664a6a437/doc/zstd_compression_format.md#zstandard-frames), with content size present | +__ + +### Common properties + +When writing a Puffin file it is recommended to set the following fields in the +[FileMetadata](#filemetadata)'s `properties` field. + +- `created-by` - human-readable identification of the application writing the file, + along with its version. Example "Trino version 381". diff --git a/docs-new/home/releases.md b/docs-new/home/releases.md new file mode 100644 index 000000000000..264773b46dc7 --- /dev/null +++ b/docs-new/home/releases.md @@ -0,0 +1,777 @@ +--- +title: "Releases" +--- + + +## Downloads + +The latest version of Iceberg is [{{ icebergVersion }}](https://github.com/apache/iceberg/releases/tag/apache-iceberg-{{ icebergVersion }}). 
+ +* [{{ icebergVersion }} source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-{{ icebergVersion }}/apache-iceberg-{{ icebergVersion }}.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-{{ icebergVersion }}/apache-iceberg-{{ icebergVersion }}.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-{{ icebergVersion }}/apache-iceberg-{{ icebergVersion }}.tar.gz.sha512) +* [{{ icebergVersion }} Spark 3.4\_2.12 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.4_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.4_2.12-{{ icebergVersion }}.jar) -- [3.4\_2.13](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.4_2.13/{{ icebergVersion }}/iceberg-spark-runtime-3.4_2.13-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Spark 3.3\_2.12 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.3_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.3_2.12-{{ icebergVersion }}.jar) -- [3.3\_2.13](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.3_2.13/{{ icebergVersion }}/iceberg-spark-runtime-3.3_2.13-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Spark 3.2\_2.12 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.2_2.12-{{ icebergVersion }}.jar) -- [3.2\_2.13](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.13/{{ icebergVersion }}/iceberg-spark-runtime-3.2_2.13-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Spark 3.1 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.1_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.1_2.12-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Flink 1.17 runtime 
Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.17/{{ icebergVersion }}/iceberg-flink-runtime-1.17-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Flink 1.16 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.16/{{ icebergVersion }}/iceberg-flink-runtime-1.16-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Flink 1.15 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.15/{{ icebergVersion }}/iceberg-flink-runtime-1.15-{{ icebergVersion }}.jar) +* [{{ icebergVersion }} Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/{{ icebergVersion }}/iceberg-hive-runtime-{{ icebergVersion }}.jar) + +To use Iceberg in Spark or Flink, download the runtime JAR for your engine version and add it to the jars folder of your installation. + +To use Iceberg in Hive 2 or Hive 3, download the Hive runtime JAR and add it to Hive using `ADD JAR`. + +### Gradle + +To add a dependency on Iceberg in Gradle, add the following to `build.gradle`: + +``` +dependencies { + compile 'org.apache.iceberg:iceberg-core:{{ icebergVersion }}' +} +``` + +You may also want to include `iceberg-parquet` for Parquet file support. + +### Maven + +To add a dependency on Iceberg in Maven, add the following to your `pom.xml`: + +``` + + ... + + org.apache.iceberg + iceberg-core + {{ icebergVersion }} + + ... + +``` + +## 1.3.1 release + +Apache Iceberg 1.3.1 was released on July 25, 2023. +The 1.3.1 release addresses various issues identified in the 1.3.0 release. 
+ +* Core + - Table Metadata parser now accepts null for fields: current-snapshot-id, properties, and snapshots ([\#8064](https://github.com/apache/iceberg/pull/8064)) +* Hive + - Fix HiveCatalog deleting metadata on failures in checking lock status ([\#7931](https://github.com/apache/iceberg/pull/7931)) +* Spark + - Fix RewritePositionDeleteFiles failure for certain partition types ([\#8059](https://github.com/apache/iceberg/pull/8059)) + - Fix RewriteDataFiles concurrency edge-case on commit timeouts ([\#7933](https://github.com/apache/iceberg/pull/7933)) + - Fix partition-level DELETE operations for WAP branches ([\#7900](https://github.com/apache/iceberg/pull/7900)) +* Flink + - FlinkCatalog creation no longer creates the default database ([\#8039](https://github.com/apache/iceberg/pull/8039)) + +## Past releases + +### 1.3.0 release + +Apache Iceberg 1.3.0 was released on May 30th, 2023. +The 1.3.0 release adds a variety of new features and bug fixes. + +* Core + - Expose file and data sequence numbers in ContentFile ([\#7555](https://github.com/apache/iceberg/pull/7555)) + - Improve bit density in object storage layout ([\#7128](https://github.com/apache/iceberg/pull/7128)) + - Store split offsets for delete files ([\#7011](https://github.com/apache/iceberg/pull/7011)) + - Readable metrics in entries metadata table ([\#7539](https://github.com/apache/iceberg/pull/7539)) + - Delete file stats in partitions metadata table ([\#6661](https://github.com/apache/iceberg/pull/6661)) + - Optimized vectorized reads for Parquet Decimal ([\#3249](https://github.com/apache/iceberg/pull/3249)) + - Vectorized reads for Parquet INT96 timestamps in imported data ([\#6962](https://github.com/apache/iceberg/pull/6962)) + - Support selected vector with ORC row and batch readers ([\#7197](https://github.com/apache/iceberg/pull/7197)) + - Clean up expired metastore clients ([\#7310](https://github.com/apache/iceberg/pull/7310)) + - Support for deleting old partition spec columns 
in V1 tables ([\#7398](https://github.com/apache/iceberg/pull/7398)) +* Spark + - Initial support for Spark 3.4 + - Removed integration for Spark 2.4 + - Support for storage-partitioned joins with mismatching keys in Spark 3.4 (MERGE commands) ([\#7424](https://github.com/apache/iceberg/pull/7424)) + - Support for TimestampNTZ in Spark 3.4 ([\#7553](https://github.com/apache/iceberg/pull/7553)) + - Ability to handle skew during writes in Spark 3.4 ([\#7520](https://github.com/apache/iceberg/pull/7520)) + - Ability to coalesce small tasks during writes in Spark 3.4 ([\#7532](https://github.com/apache/iceberg/pull/7532)) + - Distribution and ordering enhancements in Spark 3.4 ([\#7637](https://github.com/apache/iceberg/pull/7637)) + - Action for rewriting position deletes ([\#7389](https://github.com/apache/iceberg/pull/7389)) + - Procedure for rewriting position deletes ([\#7572](https://github.com/apache/iceberg/pull/7572)) + - Avoid local sort for MERGE cardinality check ([\#7558](https://github.com/apache/iceberg/pull/7558)) + - Support for rate limits in Structured Streaming ([\#4479](https://github.com/apache/iceberg/pull/4479)) + - Read and write support for UUIDs ([\#7399](https://github.com/apache/iceberg/pull/7399)) + - Concurrent compaction is enabled by default ([\#6907](https://github.com/apache/iceberg/pull/6907)) + - Support for metadata columns in changelog tables ([\#7152](https://github.com/apache/iceberg/pull/7152)) + - Add file group failure info for data compaction ([\#7361](https://github.com/apache/iceberg/pull/7361)) +* Flink + - Initial support for Flink 1.17 + - Removed integration for Flink 1.14 + - Data statistics operator to collect traffic distribution for guiding smart shuffling ([\#6382](https://github.com/apache/iceberg/pull/6382)) + - Data statistics operator sends local data statistics to coordinator and receives aggregated data statistics from coordinator for smart shuffling ([\#7269](https://github.com/apache/iceberg/pull/7269)) + 
- Exposed write parallelism in SQL hints ([\#7039](https://github.com/apache/iceberg/pull/7039)) + - Row-level filtering ([\#7109](https://github.com/apache/iceberg/pull/7109)) + - Use starting sequence number by default when rewriting data files ([\#7218](https://github.com/apache/iceberg/pull/7218)) + - Config for max allowed consecutive planning failures in IcebergSource before failing the job ([\#7571](https://github.com/apache/iceberg/pull/7571)) +* Vendor Integrations + - AWS: Use Apache HTTP client as default AWS HTTP client ([\#7119](https://github.com/apache/iceberg/pull/7119)) + - AWS: Prevent token refresh scheduling on every sign request ([\#7270](https://github.com/apache/iceberg/pull/7270)) + - AWS: Disable local credentials if remote signing is enabled ([\#7230](https://github.com/apache/iceberg/pull/7230)) +* Dependencies + - Bump Arrow to 12.0.0 + - Bump ORC to 1.8.3 + - Bump Parquet to 1.13.1 + - Bump Nessie to 0.59.0 + +### 1.2.1 release + +Apache Iceberg 1.2.1 was released on April 11th, 2023. +The 1.2.1 release is a patch release to address various issues identified in the prior release. 
+Here is an overview: + +* Core +  - REST: fix previous locations for refs-only load [\#7284](https://github.com/apache/iceberg/pull/7284) +  - Parse snapshot-id as long in remove-statistics update [\#7235](https://github.com/apache/iceberg/pull/7235) +* Spark +  - Broadcast table instead of file IO in rewrite manifests [\#7263](https://github.com/apache/iceberg/pull/7263) +  - Revert "Spark: Add "Iceberg" prefix to SparkTable name string for SparkUI" [\#7273](https://github.com/apache/iceberg/pull/7273) +* AWS +  - Make AuthSession cache static [\#7289](https://github.com/apache/iceberg/pull/7289) +  - Abort S3 input stream on close if not EOS [\#7262](https://github.com/apache/iceberg/pull/7262) +  - Disable local credentials if remote signing is enabled [\#7230](https://github.com/apache/iceberg/pull/7230) +  - Prevent token refresh scheduling on every sign request [\#7270](https://github.com/apache/iceberg/pull/7270) +  - S3 Credentials provider support in DefaultAwsClientFactory [\#7066](https://github.com/apache/iceberg/pull/7066) + +### 1.2.0 release + +Apache Iceberg 1.2.0 was released on March 20th, 2023. +The 1.2.0 release adds a variety of new features and bug fixes. 
+Here is an overview: + +* Core +  - Added AES GCM encryption stream spec ([\#5432](https://github.com/apache/iceberg/pull/5432)) +  - Added support for Delta Lake to Iceberg table conversion ([\#6449](https://github.com/apache/iceberg/pull/6449), [\#6880](https://github.com/apache/iceberg/pull/6880)) +  - Added support for `position_deletes` metadata table ([\#6365](https://github.com/apache/iceberg/pull/6365), [\#6716](https://github.com/apache/iceberg/pull/6716)) +  - Added support for scan and commit metrics reporter that is pluggable through catalog ([\#6404](https://github.com/apache/iceberg/pull/6404), [\#6246](https://github.com/apache/iceberg/pull/6246), [\#6410](https://github.com/apache/iceberg/pull/6410)) +  - Added support for branch commit for all operations ([\#4926](https://github.com/apache/iceberg/pull/4926), [\#5010](https://github.com/apache/iceberg/pull/5010)) +  - Added `FileIO` support for ORC readers and writers ([\#6293](https://github.com/apache/iceberg/pull/6293)) +  - Updated all actions to leverage bulk delete whenever possible ([\#6682](https://github.com/apache/iceberg/pull/6682)) +  - Updated snapshot ID definition in Puffin spec to support statistics file reuse ([\#6272](https://github.com/apache/iceberg/pull/6267)) +  - Added human-readable metrics information in `files` metadata table ([\#5376](https://github.com/apache/iceberg/pull/5376)) +  - Fixed incorrect Parquet row group skipping when min and max values are `NaN` ([\#6517](https://github.com/apache/iceberg/pull/6517)) +  - Fixed a bug that location provider could generate paths with double slash (`//`) which is not compatible in a Hadoop file system ([\#6777](https://github.com/apache/iceberg/pull/6777)) +  - Fixed metadata table time travel failure for tables that performed schema evolution ([\#6980](https://github.com/apache/iceberg/pull/6980)) +* Spark +  - Added time range query support for changelog table ([\#6350](https://github.com/apache/iceberg/pull/6350)) +  - Added changelog 
view procedure for v1 table ([\#6012](https://github.com/apache/iceberg/pull/6012)) + - Added support for storage partition joins to improve read and write performance ([\#6371](https://github.com/apache/iceberg/pull/6371)) + - Updated default Arrow environment settings to improve read performance ([\#6550](https://github.com/apache/iceberg/pull/6550)) + - Added aggregate pushdown support for `min`, `max` and `count` to improve read performance ([\#6622](https://github.com/apache/iceberg/pull/6622)) + - Updated default distribution mode settings to improve write performance ([\#6828](https://github.com/apache/iceberg/pull/6828), [\#6838](https://github.com/apache/iceberg/pull/6838)) + - Updated DELETE to perform metadata-only update whenever possible to improve write performance ([\#6899](https://github.com/apache/iceberg/pull/6899)) + - Improved predicate pushdown support for write operations ([\#6636](https://github.com/apache/iceberg/pull/6633)) + - Added support for reading a branch or tag through table identifier and `VERSION AS OF` (a.k.a. 
`FOR SYSTEM_VERSION AS OF`) SQL syntax ([\#6717](https://github.com/apache/iceberg/pull/6717), [\#6575](https://github.com/apache/iceberg/pull/6575)) + - Added support for writing to a branch through identifier or through write-audit-publish (WAP) workflow settings ([\#6965](https://github.com/apache/iceberg/pull/6965), [\#7050](https://github.com/apache/iceberg/pull/7050)) + - Added DDL SQL extensions to create, replace and drop a branch or tag ([\#6638](https://github.com/apache/iceberg/pull/6638), [\#6637](https://github.com/apache/iceberg/pull/6637), [\#6752](https://github.com/apache/iceberg/pull/6752), [\#6807](https://github.com/apache/iceberg/pull/6807)) + - Added UDFs for `years`, `months`, `days` and `hours` transforms ([\#6207](https://github.com/apache/iceberg/pull/6207), [\#6261](https://github.com/apache/iceberg/pull/6261), [\#6300](https://github.com/apache/iceberg/pull/6300), [\#6339](https://github.com/apache/iceberg/pull/6339)) + - Added partition related stats for `add_files` procedure result ([\#6797](https://github.com/apache/iceberg/pull/6797)) + - Fixed a bug that `rewrite_manifests` procedure produced a new manifest even when there was no rewrite performed ([\#6659](https://github.com/apache/iceberg/pull/6695)) + - Fixed a bug that statistics files were not cleaned up in `expire_snapshots` procedure ([\#6090](https://github.com/apache/iceberg/pull/6090)) +* Flink + - Added support for metadata tables ([\#6222](https://github.com/apache/iceberg/pull/6222)) + - Added support for read options in Flink source ([\#5967](https://github.com/apache/iceberg/pull/5967)) + - Added support for reading and writing Avro `GenericRecord` ([\#6557](https://github.com/apache/iceberg/pull/6557), [\#6584](https://github.com/apache/iceberg/pull/6584)) + - Added support for reading a branch or tag and write to a branch ([\#6660](https://github.com/apache/iceberg/pull/6660), [\#5029](https://github.com/apache/iceberg/pull/5029)) + - Added throttling support for 
streaming read ([\#6299](https://github.com/apache/iceberg/pull/6299)) + - Added support for multiple sinks for the same table in the same job ([\#6528](https://github.com/apache/iceberg/pull/6528)) + - Fixed a bug that metrics config was not applied to equality and position deletes ([\#6271](https://github.com/apache/iceberg/pull/6271), [\#6313](https://github.com/apache/iceberg/pull/6313)) +* Vendor Integrations + - Added Snowflake catalog integration ([\#6428](https://github.com/apache/iceberg/pull/6428)) + - Added AWS sigV4 authentication support for REST catalog ([\#6951](https://github.com/apache/iceberg/pull/6951)) + - Added support for AWS S3 remote signing ([\#6169](https://github.com/apache/iceberg/pull/6169), [\#6835](https://github.com/apache/iceberg/pull/6835), [\#7080](https://github.com/apache/iceberg/pull/7080)) + - Updated AWS Glue catalog to skip table version archive by default ([\#6919](https://github.com/apache/iceberg/pull/6916)) + - Updated AWS Glue catalog to not require a warehouse location ([\#6586](https://github.com/apache/iceberg/pull/6586)) + - Fixed a bug that a bucket-only AWS S3 location such as `s3://my-bucket` could not be parsed ([\#6352](https://github.com/apache/iceberg/pull/6352)) + - Fixed a bug that unnecessary HTTP client dependencies had to be included to use any AWS integration ([\#6746](https://github.com/apache/iceberg/pull/6746)) + - Fixed a bug that AWS Glue catalog did not respect custom catalog ID when determining default warehouse location ([\#6223](https://github.com/apache/iceberg/pull/6223)) + - Fixes a bug that AWS DynamoDB catalog namespace listing result was incomplete ([\#6823](https://github.com/apache/iceberg/pull/6823)) +* Dependencies + - Upgraded ORC to 1.8.1 ([\#6349](https://github.com/apache/iceberg/pull/6349)) + - Upgraded Jackson to 2.14.1 ([\#6168](https://github.com/apache/iceberg/pull/6168)) + - Upgraded AWS SDK V2 to 2.20.18 ([\#7003](https://github.com/apache/iceberg/pull/7003)) + - Upgraded 
Nessie to 0.50.0 ([\#6875](https://github.com/apache/iceberg/pull/6875)) + +For more details, please visit [Github](https://github.com/apache/iceberg/releases/tag/apache-iceberg-1.2.0). + + +### 1.1.0 release + +Apache Iceberg 1.1.0 was released on November 28th, 2022. +The 1.1.0 release deprecates various pre-1.0.0 methods, +and adds a variety of new features. +Here is an overview: + +* Core + - Puffin statistics have been [added to the Table API](https://github.com/apache/iceberg/pull/4945) + - Support for [Table scan reporting](https://github.com/apache/iceberg/pull/5268), which enables collection of statistics of the table scans. + - [Add file sequence number to ManifestEntry](https://github.com/apache/iceberg/pull/6002) + - [Support register table](https://github.com/apache/iceberg/pull/5037) for all the catalogs (previously it was only for Hive) + - [Support performing merge appends and delete files on branches](https://github.com/apache/iceberg/pull/5618) + - [Improved Expire Snapshots FileCleanupStrategy](https://github.com/apache/iceberg/pull/5669) + - [SnapshotProducer supports branch writes](https://github.com/apache/iceberg/pull/4926) +* Spark + - [Support for aggregate expressions](https://github.com/apache/iceberg/pull/5961) + - [SparkChangelogTable for querying changelogs](https://github.com/apache/iceberg/pull/5740) + - Dropped support for Apache Spark 3.0 +* Flink + - [FLIP-27 reader is supported in SQL](https://github.com/apache/iceberg/pull/5318) + - Added support for Flink 1.16, dropped support for Flink 1.13 +* Dependencies + - [AWS SDK: 2.17.257](https://github.com/apache/iceberg/pull/5612) + - [Nessie: 0.44](https://github.com/apache/iceberg/pull/6008) + - [Apache ORC: 1.8.0](https://github.com/apache/iceberg/pull/5699) (Also, supports [setting bloom filters on row groups](https://github.com/apache/iceberg/pull/5313/files)) + +For more details, please visit [Github](https://github.com/apache/iceberg/releases/tag/apache-iceberg-1.1.0). 
+ +### 1.0.0 release + +The 1.0.0 release officially guarantees the stability of the Iceberg API. + +Iceberg's API has been largely stable since very early releases and has been integrated with many processing engines, but was still released under a 0.y.z version number indicating that breaking changes may happen. From 1.0.0 forward, the project will follow semver in the public API module, iceberg-api. + +This release removes deprecated APIs that are no longer part of the API. To make transitioning to the new release easier, it is based on the 0.14.1 release with only important bug fixes: + +* Increase metrics limit to 100 columns ([#5933](https://github.com/apache/iceberg/pull/5933)) +* Bump Spark patch versions for CVE-2022-33891 ([#5292](https://github.com/apache/iceberg/pull/5292)) +* Exclude Scala from Spark runtime Jars ([#5884](https://github.com/apache/iceberg/pull/5884)) + +### 0.14.1 release + +This release includes all bug fixes from the 0.14.x patch releases. + +#### Notable bug fixes + +* API + - API: Fix ID assignment in schema merging ([#5395](https://github.com/apache/iceberg/pull/5395)) +* Core + - Core: Fix snapshot log with intermediate transaction snapshots ([#5568](https://github.com/apache/iceberg/pull/5568)) + - Core: Fix exception handling in BaseTaskWriter ([#5683](https://github.com/apache/iceberg/pull/5683)) + - Core: Support deleting tables without metadata files ([#5510](https://github.com/apache/iceberg/pull/5510)) + - Core: Add CommitStateUnknownException handling to REST ([#5694](https://github.com/apache/iceberg/pull/5694)) +* Spark + - Spark: Fix stats in rewrite metadata action ([#5691](https://github.com/apache/iceberg/pull/5691)) +* File Formats + - Parquet: Close zstd input stream early to avoid memory pressure ([#5681](https://github.com/apache/iceberg/pull/5681)) +* Vendor Integrations + - Core, AWS: Fix Kryo serialization failure for FileIO ([#5437](https://github.com/apache/iceberg/pull/5437)) + - AWS: S3OutputStream - 
failure to close should persist on subsequent close calls ([#5311](https://github.com/apache/iceberg/pull/5311)) + +### 0.14.0 release + +Apache Iceberg 0.14.0 was released on 16 July 2022. + +#### Highlights + +* Added several [performance improvements](#performance-improvements) for scan planning and Spark queries +* Added a common REST catalog client that uses change-based commits to resolve commit conflicts on the service side +* Added support for Spark 3.3, including `AS OF` syntax for SQL time travel queries +* Added support for Scala 2.13 with Spark 3.2 or later +* Added merge-on-read support for MERGE and UPDATE queries in Spark 3.2 or later +* Added support to rewrite partitions using zorder +* Added support for Flink 1.15 and dropped support for Flink 1.12 +* Added a spec and implementation for Puffin, a format for large stats and index blobs, like [Theta sketches](https://datasketches.apache.org/docs/Theta/InverseEstimate.html) or bloom filters +* Added new interfaces for consuming data incrementally (both append and changelog scans) +* Added support for bulk operations and ranged reads to FileIO interfaces +* Added more metadata tables to show delete files in the metadata tree + +#### High-level features + +* API + - Added IcebergBuild to expose Iceberg version and build information + - Added binary compatibility checking to the build ([#4638](https://github.com/apache/iceberg/pull/4638), [#4798](https://github.com/apache/iceberg/pull/4798)) + - Added a new IncrementalAppendScan interface and planner implementation ([#4580](https://github.com/apache/iceberg/pull/4580)) + - Added a new IncrementalChangelogScan interface ([#4870](https://github.com/apache/iceberg/pull/4870)) + - Refactored the ScanTask hierarchy to create new task types for changelog scans ([#5077](https://github.com/apache/iceberg/pull/5077)) + - Added expression sanitizer ([#4672](https://github.com/apache/iceberg/pull/4672)) + - Added utility to check expression equivalence 
([#4947](https://github.com/apache/iceberg/pull/4947)) + - Added support for serializing FileIO instances using initialization properties ([#5178](https://github.com/apache/iceberg/pull/5178)) + - Updated Snapshot methods to accept a FileIO to read metadata files, deprecated old methods ([#4873](https://github.com/apache/iceberg/pull/4873)) + - Added optional interfaces to FileIO, for batch deletes ([#4052](https://github.com/apache/iceberg/pull/4052)), prefix operations ([#5096](https://github.com/apache/iceberg/pull/5096)), and ranged reads ([#4608](https://github.com/apache/iceberg/pull/4608)) +* Core + - Added a common client for REST-based catalog services that uses a change-based protocol ([#4320](https://github.com/apache/iceberg/pull/4320), [#4319](https://github.com/apache/iceberg/pull/4319)) + - Added Puffin, a file format for statistics and index payloads or sketches ([#4944](https://github.com/apache/iceberg/pull/4944), [#4537](https://github.com/apache/iceberg/pull/4537)) + - Added snapshot references to track tags and branches ([#4019](https://github.com/apache/iceberg/pull/4019)) + - ManageSnapshots now supports multiple operations using transactions, and added branch and tag operations ([#4128](https://github.com/apache/iceberg/pull/4128), [#4071](https://github.com/apache/iceberg/pull/4071)) + - ReplacePartitions and OverwriteFiles now support serializable isolation ([#2925](https://github.com/apache/iceberg/pull/2925), [#4052](https://github.com/apache/iceberg/pull/4052)) + - Added new metadata tables: `data_files` ([#4336](https://github.com/apache/iceberg/pull/4336)), `delete_files` ([#4243](https://github.com/apache/iceberg/pull/4243)), `all_delete_files`, and `all_files` ([#4694](https://github.com/apache/iceberg/pull/4694)) + - Added deleted files to the `files` metadata table ([#4336](https://github.com/apache/iceberg/pull/4336)) and delete file counts to the `manifests` table ([#4764](https://github.com/apache/iceberg/pull/4764)) + - Added 
support for predicate pushdown for the `all_data_files` metadata table ([#4382](https://github.com/apache/iceberg/pull/4382)) and the `all_manifests` table ([#4736](https://github.com/apache/iceberg/pull/4736)) + - Added support for catalogs to default table properties on creation ([#4011](https://github.com/apache/iceberg/pull/4011)) + - Updated sort order construction to ensure all partition fields are added to avoid partition closed failures ([#5131](https://github.com/apache/iceberg/pull/5131)) +* Spark + - Spark 3.3 is now supported ([#5056](https://github.com/apache/iceberg/pull/5056)) + - Added SQL time travel using `AS OF` syntax in Spark 3.3 ([#5156](https://github.com/apache/iceberg/pull/5156)) + - Scala 2.13 is now supported for Spark 3.2 and 3.3 ([#4009](https://github.com/apache/iceberg/pull/4009)) + - Added support for the `mergeSchema` option for DataFrame writes ([#4154](https://github.com/apache/iceberg/pull/4154)) + - MERGE and UPDATE queries now support the lazy / merge-on-read strategy ([#3984](https://github.com/apache/iceberg/pull/3984), [#4047](https://github.com/apache/iceberg/pull/4047)) + - Added zorder rewrite strategy to the `rewrite_data_files` stored procedure and action ([#3983](https://github.com/apache/iceberg/pull/3983), [#4902](https://github.com/apache/iceberg/pull/4902)) + - Added a `register_table` stored procedure to create tables from metadata JSON files ([#4810](https://github.com/apache/iceberg/pull/4810)) + - Added a `publish_changes` stored procedure to publish staged commits by ID ([#4715](https://github.com/apache/iceberg/pull/4715)) + - Added `CommitMetadata` helper class to set snapshot summary properties from SQL ([#4956](https://github.com/apache/iceberg/pull/4956)) + - Added support to supply a file listing to remove orphan data files procedure and action ([#4503](https://github.com/apache/iceberg/pull/4503)) + - Added FileIO metrics to the Spark UI ([#4030](https://github.com/apache/iceberg/pull/4030), 
[#4050](https://github.com/apache/iceberg/pull/4050)) + - DROP TABLE now supports the PURGE flag ([#3056](https://github.com/apache/iceberg/pull/3056)) + - Added support for custom isolation level for dynamic partition overwrites ([#2925](https://github.com/apache/iceberg/pull/2925)) and filter overwrites ([#4293](https://github.com/apache/iceberg/pull/4293)) + - Schema identifier fields are now shown in table properties ([#4475](https://github.com/apache/iceberg/pull/4475)) + - Abort cleanup now supports parallel execution ([#4704](https://github.com/apache/iceberg/pull/4704)) +* Flink + - Flink 1.15 is now supported ([#4553](https://github.com/apache/iceberg/pull/4553)) + - Flink 1.12 support was removed ([#4551](https://github.com/apache/iceberg/pull/4551)) + - Added a FLIP-27 source and builder to 1.14 and 1.15 ([#5109](https://github.com/apache/iceberg/pull/5109)) + - Added an option to set the monitor interval ([#4887](https://github.com/apache/iceberg/pull/4887)) and an option to limit the number of snapshots in a streaming read planning operation ([#4943](https://github.com/apache/iceberg/pull/4943)) + - Added support for write options, like `write-format` to Flink sink builder ([#3998](https://github.com/apache/iceberg/pull/3998)) + - Added support for task locality when reading from HDFS ([#3817](https://github.com/apache/iceberg/pull/3817)) + - Use Hadoop configuration files from `hadoop-conf-dir` property ([#4622](https://github.com/apache/iceberg/pull/4622)) +* Vendor integrations + - Added Dell ECS integration ([#3376](https://github.com/apache/iceberg/pull/3376), [#4221](https://github.com/apache/iceberg/pull/4221)) + - JDBC catalog now supports namespace properties ([#3275](https://github.com/apache/iceberg/pull/3275)) + - AWS Glue catalog supports native Glue locking ([#4166](https://github.com/apache/iceberg/pull/4166)) + - AWS S3FileIO supports using S3 access points ([#4334](https://github.com/apache/iceberg/pull/4334)), bulk operations 
([#4052](https://github.com/apache/iceberg/pull/4052), [#5096](https://github.com/apache/iceberg/pull/5096)), ranged reads ([#4608](https://github.com/apache/iceberg/pull/4608)), and tagging at write time or in place of deletes ([#4259](https://github.com/apache/iceberg/pull/4259), [#4342](https://github.com/apache/iceberg/pull/4342)) + - AWS GlueCatalog supports passing LakeFormation credentials ([#4280](https://github.com/apache/iceberg/pull/4280)) + - AWS DynamoDB catalog and lock supports overriding the DynamoDB endpoint ([#4726](https://github.com/apache/iceberg/pull/4726)) + - Nessie now supports namespaces and namespace properties ([#4385](https://github.com/apache/iceberg/pull/4385), [#4610](https://github.com/apache/iceberg/pull/4610)) + - Nessie now passes most common catalog tests ([#4392](https://github.com/apache/iceberg/pull/4392)) +* Parquet + - Added support for row group skipping using Parquet bloom filters ([#4938](https://github.com/apache/iceberg/pull/4938)) + - Added table configuration options for writing Parquet bloom filters ([#5035](https://github.com/apache/iceberg/pull/5035)) +* ORC + - Support file rolling at a target file size ([#4419](https://github.com/apache/iceberg/pull/4419)) + - Support table compression settings, `write.orc.compression-codec` and `write.orc.compression-strategy` ([#4273](https://github.com/apache/iceberg/pull/4273)) + +#### Performance improvements + +* Core + - Fixed manifest file handling in scan planning to open manifests in the planning threadpool ([#5206](https://github.com/apache/iceberg/pull/5206)) + - Avoided an extra S3 HEAD request by passing file length when opening manifest files ([#5207](https://github.com/apache/iceberg/pull/5207)) + - Refactored Arrow vectorized readers to avoid extra dictionary copies ([#5137](https://github.com/apache/iceberg/pull/5137)) + - Improved Arrow decimal handling to improve decimal performance ([#5168](https://github.com/apache/iceberg/pull/5168), 
[#5198](https://github.com/apache/iceberg/pull/5198)) + - Added support for Avro files with Zstd compression ([#4083](https://github.com/apache/iceberg/pull/4083)) + - Column metrics are now disabled by default after the first 32 columns ([#3959](https://github.com/apache/iceberg/pull/3959), [#5215](https://github.com/apache/iceberg/pull/5215)) + - Updated delete filters to copy row wrappers to avoid expensive type analysis ([#5249](https://github.com/apache/iceberg/pull/5249)) + - Snapshot expiration supports parallel execution ([#4148](https://github.com/apache/iceberg/pull/4148)) + - Manifest updates can use a custom thread pool ([#4146](https://github.com/apache/iceberg/pull/4146)) +* Spark + - Parquet vectorized reads are enabled by default ([#4196](https://github.com/apache/iceberg/pull/4196)) + - Scan statistics now adjust row counts for split data files ([#4446](https://github.com/apache/iceberg/pull/4446)) + - Implemented `SupportsReportStatistics` in `ScanBuilder` to work around SPARK-38962 ([#5136](https://github.com/apache/iceberg/pull/5136)) + - Updated Spark tables to avoid expensive (and inaccurate) size estimation ([#5225](https://github.com/apache/iceberg/pull/5225)) +* Flink + - Operators will now use a worker pool per job ([#4177](https://github.com/apache/iceberg/pull/4177)) + - Fixed `ClassCastException` thrown when reading arrays from Parquet ([#4432](https://github.com/apache/iceberg/pull/4432)) +* Hive + - Added vectorized Parquet reads for Hive 3 ([#3980](https://github.com/apache/iceberg/pull/3980)) + - Improved generic reader performance using copy instead of create ([#4218](https://github.com/apache/iceberg/pull/4218)) + +#### Notable bug fixes + +This release includes all bug fixes from the 0.13.x patch releases. 
+ +* Core + - Fixed an exception thrown when metadata-only deletes encounter delete files that are partially matched ([#4304](https://github.com/apache/iceberg/pull/4304)) + - Fixed transaction retries for changes without validations, like schema updates, that could ignore an update ([#4464](https://github.com/apache/iceberg/pull/4464)) + - Fixed failures when reading metadata tables with evolved partition specs ([#4520](https://github.com/apache/iceberg/pull/4520), [#4560](https://github.com/apache/iceberg/pull/4560)) + - Fixed delete files dropped when a manifest is rewritten following a format version upgrade ([#4514](https://github.com/apache/iceberg/pull/4514)) + - Fixed missing metadata files resulting from an OOM during commit cleanup ([#4673](https://github.com/apache/iceberg/pull/4673)) + - Updated logging to use sanitized expressions to avoid leaking values ([#4672](https://github.com/apache/iceberg/pull/4672)) +* Spark + - Fixed Spark to skip calling abort when CommitStateUnknownException is thrown ([#4687](https://github.com/apache/iceberg/pull/4687)) + - Fixed MERGE commands with mixed case identifiers ([#4848](https://github.com/apache/iceberg/pull/4848)) +* Flink + - Fixed table property update failures when tables have a primary key ([#4561](https://github.com/apache/iceberg/pull/4561)) +* Integrations + - JDBC catalog behavior has been updated to pass common catalog tests ([#4220](https://github.com/apache/iceberg/pull/4220), [#4231](https://github.com/apache/iceberg/pull/4231)) + +#### Dependency changes + +* Updated Apache Avro to 1.10.2 (previously 1.10.1) +* Updated Apache Parquet to 1.12.3 (previously 1.12.2) +* Updated Apache ORC to 1.7.5 (previously 1.7.2) +* Updated Apache Arrow to 7.0.0 (previously 6.0.0) +* Updated AWS SDK to 2.17.131 (previously 2.15.7) +* Updated Nessie to 0.30.0 (previously 0.18.0) +* Updated Caffeine to 2.9.3 (previously 2.8.4) + +### 0.13.2 + +Apache Iceberg 0.13.2 was released on June 15th, 2022. 
+ +* Git tag: [0.13.2](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.13.2) +* [0.13.2 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.13.2/apache-iceberg-0.13.2.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.13.2/apache-iceberg-0.13.2.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.13.2/apache-iceberg-0.13.2.tar.gz.sha512) +* [0.13.2 Spark 3.2 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/0.13.2/iceberg-spark-runtime-3.2_2.12-0.13.2.jar) +* [0.13.2 Spark 3.1 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.1_2.12/0.13.2/iceberg-spark-runtime-3.1_2.12-0.13.2.jar) +* [0.13.2 Spark 3.0 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.13.2/iceberg-spark3-runtime-0.13.2.jar) +* [0.13.2 Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.13.2/iceberg-spark-runtime-0.13.2.jar) +* [0.13.2 Flink 1.14 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.14/0.13.2/iceberg-flink-runtime-1.14-0.13.2.jar) +* [0.13.2 Flink 1.13 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.13/0.13.2/iceberg-flink-runtime-1.13-0.13.2.jar) +* [0.13.2 Flink 1.12 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.12/0.13.2/iceberg-flink-runtime-1.12-0.13.2.jar) +* [0.13.2 Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/0.13.2/iceberg-hive-runtime-0.13.2.jar) + +**Important bug fixes and changes:** + +* **Core** + * [\#4673](https://github.com/apache/iceberg/pull/4673) fixes table corruption from OOM during commit cleanup + * 
[\#4514](https://github.com/apache/iceberg/pull/4514) row delta delete files were dropped in sequential commits after table format updated to v2 + * [\#4464](https://github.com/apache/iceberg/pull/4464) fixes an issue were conflicting transactions have been ignored during a commit + * [\#4520](https://github.com/apache/iceberg/pull/4520) fixes an issue with wrong table predicate filtering with evolved partition specs +* **Spark** + * [\#4663](https://github.com/apache/iceberg/pull/4663) fixes NPEs in Spark value converter + * [\#4687](https://github.com/apache/iceberg/pull/4687) fixes an issue with incorrect aborts when non-runtime exceptions were thrown in Spark +* **Flink** + * Note that there's a correctness issue when using upsert mode in Flink 1.12. Given that Flink 1.12 is deprecated, it was decided to not fix this bug but rather log a warning (see also [\#4754](https://github.com/apache/iceberg/pull/4754)). +* **Nessie** + * [\#4509](https://github.com/apache/iceberg/pull/4509) fixes a NPE that occurred when accessing refreshed tables in NessieCatalog + + +A more exhaustive list of changes is available under the [0.13.2 release milestone](https://github.com/apache/iceberg/milestone/18?closed=1). + +### 0.13.1 + +Apache Iceberg 0.13.1 was released on February 14th, 2022. 
+ +* Git tag: [0.13.1](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.13.1) +* [0.13.1 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.13.1/apache-iceberg-0.13.1.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.13.1/apache-iceberg-0.13.1.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.13.1/apache-iceberg-0.13.1.tar.gz.sha512) +* [0.13.1 Spark 3.2 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/0.13.1/iceberg-spark-runtime-3.2_2.12-0.13.1.jar) +* [0.13.1 Spark 3.1 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.1_2.12/0.13.1/iceberg-spark-runtime-3.1_2.12-0.13.1.jar) +* [0.13.1 Spark 3.0 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.13.1/iceberg-spark3-runtime-0.13.1.jar) +* [0.13.1 Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.13.1/iceberg-spark-runtime-0.13.1.jar) +* [0.13.1 Flink 1.14 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.14/0.13.1/iceberg-flink-runtime-1.14-0.13.1.jar) +* [0.13.1 Flink 1.13 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.13/0.13.1/iceberg-flink-runtime-1.13-0.13.1.jar) +* [0.13.1 Flink 1.12 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.12/0.13.1/iceberg-flink-runtime-1.12-0.13.1.jar) +* [0.13.1 Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/0.13.1/iceberg-hive-runtime-0.13.1.jar) + +**Important bug fixes:** + +* **Spark** + * [\#4023](https://github.com/apache/iceberg/pull/4023) fixes predicate pushdown in row-level operations for merge conditions in Spark 3.2. 
+ Prior to the fix, filters would not be extracted and targeted merge conditions were not pushed down leading to degraded performance + for these targeted merge operations. + * [\#4024](https://github.com/apache/iceberg/pull/4024) fixes table creation in the root namespace of a Hadoop Catalog. + +* **Flink** + * [\#3986](https://github.com/apache/iceberg/pull/3986) fixes manifest location collisions when there are multiple committers + in the same Flink job. + + +### 0.13.0 + +Apache Iceberg 0.13.0 was released on February 4th, 2022. + +* Git tag: [0.13.0](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.13.0) +* [0.13.0 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.13.0/apache-iceberg-0.13.0.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.13.0/apache-iceberg-0.13.0.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.13.0/apache-iceberg-0.13.0.tar.gz.sha512) +* [0.13.0 Spark 3.2 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/0.13.0/iceberg-spark-runtime-3.2_2.12-0.13.0.jar) +* [0.13.0 Spark 3.1 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.1_2.12/0.13.0/iceberg-spark-runtime-3.1_2.12-0.13.0.jar) +* [0.13.0 Spark 3.0 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.13.0/iceberg-spark3-runtime-0.13.0.jar) +* [0.13.0 Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.13.0/iceberg-spark-runtime-0.13.0.jar) +* [0.13.0 Flink 1.14 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.14/0.13.0/iceberg-flink-runtime-1.14-0.13.0.jar) +* [0.13.0 Flink 1.13 runtime 
Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.13/0.13.0/iceberg-flink-runtime-1.13-0.13.0.jar) +* [0.13.0 Flink 1.12 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime-1.12/0.13.0/iceberg-flink-runtime-1.12-0.13.0.jar) +* [0.13.0 Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/0.13.0/iceberg-hive-runtime-0.13.0.jar) + +**High-level features:** + +* **Core** + * Catalog caching now supports cache expiration through catalog property `cache.expiration-interval-ms` [[\#3543](https://github.com/apache/iceberg/pull/3543)] + * Catalog now supports registration of Iceberg table from a given metadata file location [[\#3851](https://github.com/apache/iceberg/pull/3851)] + * Hadoop catalog can be used with S3 and other file systems safely by using a lock manager [[\#3663](https://github.com/apache/iceberg/pull/3663)] +* **Vendor Integrations** + * Google Cloud Storage (GCS) `FileIO` is supported with optimized read and write using GCS streaming transfer [[\#3711](https://github.com/apache/iceberg/pull/3711)] + * Aliyun Object Storage Service (OSS) `FileIO` is supported [[\#3553](https://github.com/apache/iceberg/pull/3553)] + * Any S3-compatible storage (e.g. 
MinIO) can now be accessed through AWS `S3FileIO` with custom endpoint and credential configurations [[\#3656](https://github.com/apache/iceberg/pull/3656)] [[\#3658](https://github.com/apache/iceberg/pull/3658)] + * AWS `S3FileIO` now supports server-side checksum validation [[\#3813](https://github.com/apache/iceberg/pull/3813)] + * AWS `GlueCatalog` now displays more table information including table location, description [[\#3467](https://github.com/apache/iceberg/pull/3467)] and columns [[\#3888](https://github.com/apache/iceberg/pull/3888)] + * Using multiple `FileIO`s based on file path scheme is supported by configuring a `ResolvingFileIO` [[\#3593](https://github.com/apache/iceberg/pull/3593)] +* **Spark** + * Spark 3.2 is supported [[\#3335](https://github.com/apache/iceberg/pull/3335)] with merge-on-read `DELETE` [[\#3970](https://github.com/apache/iceberg/pull/3970)] + * `RewriteDataFiles` action now supports sort-based table optimization [[\#2829](https://github.com/apache/iceberg/pull/2829)] and merge-on-read delete compaction [[\#3454](https://github.com/apache/iceberg/pull/3454)]. 
The corresponding Spark call procedure `rewrite_data_files` is also supported [[\#3375](https://github.com/apache/iceberg/pull/3375)] + * Time travel queries now use snapshot schema instead of the table's latest schema [[\#3722](https://github.com/apache/iceberg/pull/3722)] + * Spark vectorized reads now support row-level deletes [[\#3557](https://github.com/apache/iceberg/pull/3557)] [[\#3287](https://github.com/apache/iceberg/pull/3287)] + * `add_files` procedure now skips duplicated files by default (can be turned off with the `check_duplicate_files` flag) [[\#2779](https://github.com/apache/iceberg/issues/2779)], skips folders without files [[\#3455](https://github.com/apache/iceberg/issues/3455)] and partitions with `null` values [[\#3778](https://github.com/apache/iceberg/issues/3778)] instead of throwing exception, and supports partition pruning for faster table import [[\#3745](https://github.com/apache/iceberg/issues/3745)] +* **Flink** + * Flink 1.13 and 1.14 are supported [[\#3116](https://github.com/apache/iceberg/pull/3116)] [[\#3434](https://github.com/apache/iceberg/pull/3434)] + * Flink connector is supported [[\#2666](https://github.com/apache/iceberg/pull/2666)] + * Upsert write option is supported [[\#2863](https://github.com/apache/iceberg/pull/2863)] +* **Hive** + * Table listing in Hive catalog can now skip non-Iceberg tables by disabling flag `list-all-tables` [[\#3908](https://github.com/apache/iceberg/pull/3908)] + * Hive tables imported to Iceberg can now be read by `IcebergInputFormat` [[\#3312](https://github.com/apache/iceberg/pull/3312)] +* **File Formats** + * ORC now supports writing delete files [[\#3248](https://github.com/apache/iceberg/pull/3248)] [[\#3250](https://github.com/apache/iceberg/pull/3250)] [[\#3366](https://github.com/apache/iceberg/pull/3366)] + +**Important bug fixes:** + +* **Core** + * Iceberg new data file root path is configured through `write.data.path` going forward. 
`write.folder-storage.path` and `write.object-storage.path` are deprecated [[\#3094](https://github.com/apache/iceberg/pull/3094)] + * Catalog commit status is `UNKNOWN` instead of `FAILURE` when new metadata location cannot be found in snapshot history [[\#3717](https://github.com/apache/iceberg/pull/3717)] + * Dropping table now also deletes old metadata files instead of leaving them stranded [[\#3622](https://github.com/apache/iceberg/pull/3622)] + * `history` and `snapshots` metadata tables can now query tables with no current snapshot instead of returning empty [[\#3812](https://github.com/apache/iceberg/pull/3812)] +* **Vendor Integrations** + * Using cloud service integrations such as AWS `GlueCatalog` and `S3FileIO` no longer fails when missing Hadoop dependencies in the execution environment [[\#3590](https://github.com/apache/iceberg/pull/3590)] + * AWS clients are now auto-closed when related `FileIO` or `Catalog` is closed. There is no need to close the AWS clients separately [[\#2878](https://github.com/apache/iceberg/pull/2878)] +* **Spark** + * For Spark >= 3.1, `REFRESH TABLE` can now be used with Spark session catalog instead of throwing exception [[\#3072](https://github.com/apache/iceberg/pull/3072)] + * Insert overwrite mode now skips partitions with 0 records instead of failing the write operation [[\#2895](https://github.com/apache/iceberg/issues/2895)] + * Spark snapshot expiration action now supports custom `FileIO` instead of just `HadoopFileIO` [[\#3089](https://github.com/apache/iceberg/pull/3089)] + * `REPLACE TABLE AS SELECT` can now work with tables with columns that have changed partition transform. 
Each old partition field of the same column is converted to a void transform with a different name [[\#3421](https://github.com/apache/iceberg/issues/3421)] + * Spark SQL filters containing binary or fixed literals can now be pushed down instead of throwing exception [[\#3728](https://github.com/apache/iceberg/pull/3728)] +* **Flink** + * A `ValidationException` will be thrown if a user configures both `catalog-type` and `catalog-impl`. Previously it chose to use `catalog-type`. The new behavior brings Flink consistent with Spark and Hive [[\#3308](https://github.com/apache/iceberg/issues/3308)] + * Changelog tables can now be queried without `RowData` serialization issues [[\#3240](https://github.com/apache/iceberg/pull/3240)] + * `java.sql.Time` data type can now be written without data overflow problem [[\#3740](https://github.com/apache/iceberg/pull/3740)] + * Avro position delete files can now be read without encountering `NullPointerException` [[\#3540](https://github.com/apache/iceberg/pull/3540)] +* **Hive** + * Hive catalog can now be initialized with a `null` Hadoop configuration instead of throwing exception [[\#3252](https://github.com/apache/iceberg/pull/3252)] + * Table creation can now succeed instead of throwing exception when some columns do not have comments [[\#3531](https://github.com/apache/iceberg/pull/3531)] +* **File Formats** + * Parquet file writing issue is fixed for string data with over 16 unparseable chars (e.g. high/low surrogates) [[\#3760](https://github.com/apache/iceberg/pull/3760)] + * ORC vectorized read is now configured using `read.orc.vectorization.batch-size` instead of `read.parquet.vectorization.batch-size` [[\#3133](https://github.com/apache/iceberg/pull/3133)] + +**Other notable changes:** + +* The community has finalized the long-term strategy of Spark, Flink and Hive support. See [Multi-Engine Support](multi-engine-support.md) page for more details. 
+ +### 0.12.1 + +Apache Iceberg 0.12.1 was released on November 8th, 2021. + +* Git tag: [0.12.1](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.12.1) +* [0.12.1 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.12.1/apache-iceberg-0.12.1.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.12.1/apache-iceberg-0.12.1.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.12.1/apache-iceberg-0.12.1.tar.gz.sha512) +* [0.12.1 Spark 3.x runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.12.1/iceberg-spark3-runtime-0.12.1.jar) +* [0.12.1 Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.12.1/iceberg-spark-runtime-0.12.1.jar) +* [0.12.1 Flink runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime/0.12.1/iceberg-flink-runtime-0.12.1.jar) +* [0.12.1 Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/0.12.1/iceberg-hive-runtime-0.12.1.jar) + +Important bug fixes and changes: + +* [\#3258](https://github.com/apache/iceberg/pull/3258) fixes validation failures that occurred after snapshot expiration when writing Flink CDC streams to Iceberg tables. +* [\#3264](https://github.com/apache/iceberg/pull/3264) fixes reading projected map columns from Parquet files written before Parquet 1.11.1. +* [\#3195](https://github.com/apache/iceberg/pull/3195) allows validating that commits that produce row-level deltas don't conflict with concurrently added files. Ensures users can maintain serializable isolation for update and delete operations, including merge operations. +* [\#3199](https://github.com/apache/iceberg/pull/3199) allows validating that commits that overwrite files don't conflict with concurrently added files. 
Ensures users can maintain serializable isolation for overwrite operations. +* [\#3135](https://github.com/apache/iceberg/pull/3135) fixes equality-deletes using `DATE`, `TIMESTAMP`, and `TIME` types. +* [\#3078](https://github.com/apache/iceberg/pull/3078) prevents the JDBC catalog from overwriting the `jdbc.user` property if any property called user exists in the environment. +* [\#3035](https://github.com/apache/iceberg/pull/3035) fixes drop namespace calls with the DynamoDB catalog. +* [\#3273](https://github.com/apache/iceberg/pull/3273) fixes importing Avro files via `add_files` by correctly setting the number of records. +* [\#3332](https://github.com/apache/iceberg/pull/3332) fixes importing ORC files with float or double columns in `add_files`. + +A more exhaustive list of changes is available under the [0.12.1 release milestone](https://github.com/apache/iceberg/milestone/15?closed=1). + +### 0.12.0 + +Apache Iceberg 0.12.0 was released on August 15, 2021. It consists of 395 commits authored by 74 contributors over a 139-day period. 
+ +* Git tag: [0.12.0](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.12.0) +* [0.12.0 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.12.0/apache-iceberg-0.12.0.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.12.0/apache-iceberg-0.12.0.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.12.0/apache-iceberg-0.12.0.tar.gz.sha512) +* [0.12.0 Spark 3.x runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.12.0/iceberg-spark3-runtime-0.12.0.jar) +* [0.12.0 Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.12.0/iceberg-spark-runtime-0.12.0.jar) +* [0.12.0 Flink runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime/0.12.0/iceberg-flink-runtime-0.12.0.jar) +* [0.12.0 Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/0.12.0/iceberg-hive-runtime-0.12.0.jar) + +**High-level features:** + +* **Core** + * Allow Iceberg schemas to specify one or more columns as row identifiers [[\#2465](https://github.com/apache/iceberg/pull/2465)]. Note that this is a prerequisite for supporting upserts in Flink. + * Added JDBC [[\#1870](https://github.com/apache/iceberg/pull/1870)] and DynamoDB [[\#2688](https://github.com/apache/iceberg/pull/2688)] catalog implementations. + * Added predicate pushdown for partitions and files metadata tables [[\#2358](https://github.com/apache/iceberg/pull/2358), [\#2926](https://github.com/apache/iceberg/pull/2926)]. + * Added a new, more flexible compaction action for Spark that can support different strategies such as bin packing and sorting. [[\#2501](https://github.com/apache/iceberg/pull/2501), [\#2609](https://github.com/apache/iceberg/pull/2609)]. 
+ * Added the ability to upgrade to v2 or create a v2 table using the table property `format-version=2` [[\#2887](https://github.com/apache/iceberg/pull/2887)]. + * Added support for nulls in StructLike collections [[\#2929](https://github.com/apache/iceberg/pull/2929)]. + * Added `key_metadata` field to manifest lists for encryption [[\#2675](https://github.com/apache/iceberg/pull/2675)]. +* **Flink** + * Added support for SQL primary keys [[\#2410](https://github.com/apache/iceberg/pull/2410)]. +* **Hive** + * Added the ability to set the catalog at the table level in the Hive Metastore. This makes it possible to write queries that reference tables from multiple catalogs [[\#2129](https://github.com/apache/iceberg/pull/2129)]. + * As a result of [[\#2129](https://github.com/apache/iceberg/pull/2129)], deprecated the configuration property `iceberg.mr.catalog` which was previously used to configure the Iceberg catalog in MapReduce and Hive [[\#2565](https://github.com/apache/iceberg/pull/2565)]. + * Added table-level JVM lock on commits [[\#2547](https://github.com/apache/iceberg/pull/2547)]. + * Added support for Hive's vectorized ORC reader [[\#2613](https://github.com/apache/iceberg/pull/2613)]. +* **Spark** + * Added `SET` and `DROP IDENTIFIER FIELDS` clauses to `ALTER TABLE` so people don't have to look up the DDL [[\#2560](https://github.com/apache/iceberg/pull/2560)]. + * Added support for `ALTER TABLE REPLACE PARTITION FIELD` DDL [[\#2365](https://github.com/apache/iceberg/pull/2365)]. + * Added support for micro-batch streaming reads for structured streaming in Spark3 [[\#2660](https://github.com/apache/iceberg/pull/2660)]. + * Improved the performance of importing a Hive table by not loading all partitions from Hive and instead pushing the partition filter to the Metastore [[\#2777](https://github.com/apache/iceberg/pull/2777)]. 
+ * Added support for `UPDATE` statements in Spark [[\#2193](https://github.com/apache/iceberg/pull/2193), [\#2206](https://github.com/apache/iceberg/pull/2206)]. + * Added support for Spark 3.1 [[\#2512](https://github.com/apache/iceberg/pull/2512)]. + * Added `RemoveReachableFiles` action [[\#2415](https://github.com/apache/iceberg/pull/2415)]. + * Added `add_files` stored procedure [[\#2210](https://github.com/apache/iceberg/pull/2210)]. + * Refactored Actions API and added a new entry point. + * Added support for Hadoop configuration overrides [[\#2922](https://github.com/apache/iceberg/pull/2922)]. + * Added support for the `TIMESTAMP WITHOUT TIMEZONE` type in Spark [[\#2757](https://github.com/apache/iceberg/pull/2757)]. + * Added validation that files referenced by row-level deletes are not concurrently rewritten [[\#2308](https://github.com/apache/iceberg/pull/2308)]. + + +**Important bug fixes:** + +* **Core** + * Fixed string bucketing with non-BMP characters [[\#2849](https://github.com/apache/iceberg/pull/2849)]. + * Fixed Parquet dictionary filtering with fixed-length byte arrays and decimals [[\#2551](https://github.com/apache/iceberg/pull/2551)]. + * Fixed a problem with the configuration of HiveCatalog [[\#2550](https://github.com/apache/iceberg/pull/2550)]. + * Fixed partition field IDs in table replacement [[\#2906](https://github.com/apache/iceberg/pull/2906)]. +* **Hive** + * Enabled dropping HMS tables even if the metadata on disk gets corrupted [[\#2583](https://github.com/apache/iceberg/pull/2583)]. +* **Parquet** + * Fixed Parquet row group filters when types are promoted from `int` to `long` or from `float` to `double` [[\#2232](https://github.com/apache/iceberg/pull/2232)] +* **Spark** + * Fixed `MERGE INTO` in Spark when used with `SinglePartition` partitioning [[\#2584](https://github.com/apache/iceberg/pull/2584)]. + * Fixed nested struct pruning in Spark [[\#2877](https://github.com/apache/iceberg/pull/2877)]. 
+ * Fixed NaN handling for float and double metrics [[\#2464](https://github.com/apache/iceberg/pull/2464)]. + * Fixed Kryo serialization for data and delete files [[\#2343](https://github.com/apache/iceberg/pull/2343)]. + +**Other notable changes:** + +* The Iceberg Community [voted to approve](https://mail-archives.apache.org/mod_mbox/iceberg-dev/202107.mbox/%3cCAMwmD1-k1gnShK=wQ0PD88it6cg9mY7Y1hKHjDZ7L-jcDzpyZA@mail.gmail.com%3e) version 2 of the Apache Iceberg Format Specification. The differences between version 1 and 2 of the specification are documented [here](spec.md#version-2). +* Bugfixes and stability improvements for NessieCatalog. +* Improvements and fixes for Iceberg's Python library. +* Added a vectorized reader for Apache Arrow [[\#2286](https://github.com/apache/iceberg/pull/2286)]. +* The following Iceberg dependencies were upgraded: + * Hive 2.3.8 [[\#2110](https://github.com/apache/iceberg/pull/2110)]. + * Avro 1.10.1 [[\#1648](https://github.com/apache/iceberg/pull/1648)]. + * Parquet 1.12.0 [[\#2441](https://github.com/apache/iceberg/pull/2441)]. 
+ +### 0.11.1 + +* Git tag: [0.11.1](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.11.1) +* [0.11.1 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.11.1/apache-iceberg-0.11.1.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.11.1/apache-iceberg-0.11.1.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.11.1/apache-iceberg-0.11.1.tar.gz.sha512) +* [0.11.1 Spark 3.0 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.11.1/iceberg-spark3-runtime-0.11.1.jar) +* [0.11.1 Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.11.1/iceberg-spark-runtime-0.11.1.jar) +* [0.11.1 Flink runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime/0.11.1/iceberg-flink-runtime-0.11.1.jar) +* [0.11.1 Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/0.11.1/iceberg-hive-runtime-0.11.1.jar) + +Important bug fixes: + +* [\#2367](https://github.com/apache/iceberg/pull/2367) prohibits deleting data files when tables are dropped if GC is disabled. +* [\#2196](https://github.com/apache/iceberg/pull/2196) fixes data loss after compaction when large files are split into multiple parts and only some parts are combined with other files. +* [\#2232](https://github.com/apache/iceberg/pull/2232) fixes row group filters with promoted types in Parquet. +* [\#2267](https://github.com/apache/iceberg/pull/2267) avoids listing non-Iceberg tables in Glue. +* [\#2254](https://github.com/apache/iceberg/pull/2254) fixes predicate pushdown for Date in Hive. +* [\#2126](https://github.com/apache/iceberg/pull/2126) fixes writing of Date, Decimal, Time, UUID types in Hive. +* [\#2241](https://github.com/apache/iceberg/pull/2241) fixes vectorized ORC reads with metadata columns in Spark. 
+* [\#2154](https://github.com/apache/iceberg/pull/2154) refreshes the relation cache in DELETE and MERGE operations in Spark. + +### 0.11.0 + +* Git tag: [0.11.0](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.11.0) +* [0.11.0 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.11.0/apache-iceberg-0.11.0.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.11.0/apache-iceberg-0.11.0.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.11.0/apache-iceberg-0.11.0.tar.gz.sha512) +* [0.11.0 Spark 3.0 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.11.0/iceberg-spark3-runtime-0.11.0.jar) +* [0.11.0 Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.11.0/iceberg-spark-runtime-0.11.0.jar) +* [0.11.0 Flink runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime/0.11.0/iceberg-flink-runtime-0.11.0.jar) +* [0.11.0 Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/0.11.0/iceberg-hive-runtime-0.11.0.jar) + +High-level features: + +* **Core API** now supports partition spec and sort order evolution +* **Spark 3** now supports the following SQL extensions: + * MERGE INTO (experimental) + * DELETE FROM (experimental) + * ALTER TABLE ... ADD/DROP PARTITION + * ALTER TABLE ... 
WRITE ORDERED BY + * Invoke stored procedures using CALL +* **Flink** now supports streaming reads, CDC writes (experimental), and filter pushdown +* **AWS module** is added to support better integration with AWS, with [AWS Glue catalog](https://aws.amazon.com/glue/) support and dedicated S3 FileIO implementation +* **Nessie module** is added to support integration with [project Nessie](https://projectnessie.org/) + +Important bug fixes: + +* [\#1981](https://github.com/apache/iceberg/pull/1981) fixes bug that date and timestamp transforms were producing incorrect values for dates and times before 1970. Before the fix, negative values were incorrectly transformed by date and timestamp transforms to 1 larger than the correct value. For example, `day(1969-12-31 10:00:00)` produced 0 instead of -1. The fix is backwards compatible, which means predicate projection can still work with the incorrectly transformed partitions written using older versions. +* [\#2091](https://github.com/apache/iceberg/pull/2091) fixes `ClassCastException` for type promotion `int` to `long` and `float` to `double` during Parquet vectorized read. Now Arrow vector is created by looking at Parquet file schema instead of Iceberg schema for `int` and `float` fields. +* [\#1998](https://github.com/apache/iceberg/pull/1998) fixes bug in `HiveTableOperation` that `unlock` is not called if new metadata cannot be deleted. Now it is guaranteed that `unlock` is always called for Hive catalog users. +* [\#1979](https://github.com/apache/iceberg/pull/1979) fixes table listing failure in Hadoop catalog when user does not have permission to some tables. Now the tables with no permission are ignored in listing. +* [\#1798](https://github.com/apache/iceberg/pull/1798) fixes scan task failure when encountering duplicate entries of data files. Spark and Flink readers can now ignore duplicated entries in data files for each scan task. 
+* [\#1785](https://github.com/apache/iceberg/pull/1785) fixes invalidation of metadata tables in `CachingCatalog`. When a table is dropped, all the metadata tables associated with it are also invalidated in the cache. +* [\#1960](https://github.com/apache/iceberg/pull/1960) fixes bug that ORC writer does not read metrics config and always use the default. Now customized metrics config is respected. + +Other notable changes: + +* NaN counts are now supported in metadata +* Shared catalog properties are added in core library to standardize catalog level configurations +* Spark and Flink now support dynamically loading customized `Catalog` and `FileIO` implementations +* Spark 2 now supports loading tables from other catalogs, like Spark 3 +* Spark 3 now supports catalog names in DataFrameReader when using Iceberg as a format +* Flink now uses the number of Iceberg read splits as its job parallelism to improve performance and save resource. +* Hive (experimental) now supports INSERT INTO, case insensitive query, projection pushdown, create DDL with schema and auto type conversion +* ORC now supports reading tinyint, smallint, char, varchar types +* Avro to Iceberg schema conversion now preserves field docs + + + +### 0.10.0 + +* Git tag: [0.10.0](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.10.0) +* [0.10.0 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.10.0/apache-iceberg-0.10.0.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.10.0/apache-iceberg-0.10.0.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.10.0/apache-iceberg-0.10.0.tar.gz.sha512) +* [0.10.0 Spark 3.0 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.10.0/iceberg-spark3-runtime-0.10.0.jar) +* [0.10.0 Spark 2.4 runtime 
Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.10.0/iceberg-spark-runtime-0.10.0.jar) +* [0.10.0 Flink runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-flink-runtime/0.10.0/iceberg-flink-runtime-0.10.0.jar) +* [0.10.0 Hive runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-hive-runtime/0.10.0/iceberg-hive-runtime-0.10.0.jar) + +High-level features: + +* **Format v2 support** for building row-level operations (`MERGE INTO`) in processing engines + * Note: format v2 is not yet finalized and does not have a forward-compatibility guarantee +* **Flink integration** for writing to Iceberg tables and reading from Iceberg tables (reading supports batch mode only) +* **Hive integration** for reading from Iceberg tables, with filter pushdown (experimental; configuration may change) + +Important bug fixes: + +* [\#1706](https://github.com/apache/iceberg/pull/1706) fixes non-vectorized ORC reads in Spark that incorrectly skipped rows +* [\#1536](https://github.com/apache/iceberg/pull/1536) fixes ORC conversion of `notIn` and `notEqual` to match null values +* [\#1722](https://github.com/apache/iceberg/pull/1722) fixes `Expressions.notNull` returning an `isNull` predicate; API only, method was not used by processing engines +* [\#1736](https://github.com/apache/iceberg/pull/1736) fixes `IllegalArgumentException` in vectorized Spark reads with negative decimal values +* [\#1666](https://github.com/apache/iceberg/pull/1666) fixes file lengths returned by the ORC writer, using compressed size rather than uncompressed size +* [\#1674](https://github.com/apache/iceberg/pull/1674) removes catalog expiration in HiveCatalogs +* [\#1545](https://github.com/apache/iceberg/pull/1545) automatically refreshes tables in Spark when not caching table instances + +Other notable changes: + +* The `iceberg-hive` module has been renamed to `iceberg-hive-metastore` to avoid 
confusion +* Spark 3 is based on 3.0.1 that includes the fix for [SPARK-32168](https://issues.apache.org/jira/browse/SPARK-32168) +* Hadoop tables will recover from version hint corruption +* Tables can be configured with a required sort order +* Data file locations can be customized with a dynamically loaded `LocationProvider` +* ORC file imports can apply a name mapping for stats + + +A more exhaustive list of changes is available under the [0.10.0 release milestone](https://github.com/apache/iceberg/milestone/10?closed=1). + +### 0.9.1 + +* Git tag: [0.9.1](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.9.1) +* [0.9.1 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.9.1/apache-iceberg-0.9.1.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.9.1/apache-iceberg-0.9.1.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.9.1/apache-iceberg-0.9.1.tar.gz.sha512) +* [0.9.1 Spark 3.0 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.9.1/iceberg-spark3-runtime-0.9.1.jar) +* [0.9.1 Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.9.1/iceberg-spark-runtime-0.9.1.jar) + +### 0.9.0 + +* Git tag: [0.9.0](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.9.0) +* [0.9.0 source tar.gz](https://www.apache.org/dyn/closer.cgi/iceberg/apache-iceberg-0.9.0/apache-iceberg-0.9.0.tar.gz) -- [signature](https://downloads.apache.org/iceberg/apache-iceberg-0.9.0/apache-iceberg-0.9.0.tar.gz.asc) -- [sha512](https://downloads.apache.org/iceberg/apache-iceberg-0.9.0/apache-iceberg-0.9.0.tar.gz.sha512) +* [0.9.0 Spark 3.0 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark3-runtime/0.9.0/iceberg-spark3-runtime-0.9.0.jar) +* [0.9.0 Spark 2.4 runtime 
Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.9.0/iceberg-spark-runtime-0.9.0.jar) + +### 0.8.0 + +* Git tag: [apache-iceberg-0.8.0-incubating](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.8.0-incubating) +* [0.8.0-incubating source tar.gz](https://www.apache.org/dyn/closer.cgi/incubator/iceberg/apache-iceberg-0.8.0-incubating/apache-iceberg-0.8.0-incubating.tar.gz) -- [signature](https://downloads.apache.org/incubator/iceberg/apache-iceberg-0.8.0-incubating/apache-iceberg-0.8.0-incubating.tar.gz.asc) -- [sha512](https://downloads.apache.org/incubator/iceberg/apache-iceberg-0.8.0-incubating/apache-iceberg-0.8.0-incubating.tar.gz.sha512) +* [0.8.0-incubating Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.8.0-incubating/iceberg-spark-runtime-0.8.0-incubating.jar) + + +### 0.7.0 + +* Git tag: [apache-iceberg-0.7.0-incubating](https://github.com/apache/iceberg/releases/tag/apache-iceberg-0.7.0-incubating) +* [0.7.0-incubating source tar.gz](https://www.apache.org/dyn/closer.cgi/incubator/iceberg/apache-iceberg-0.7.0-incubating/apache-iceberg-0.7.0-incubating.tar.gz) -- [signature](https://dist.apache.org/repos/dist/release/incubator/iceberg/apache-iceberg-0.7.0-incubating/apache-iceberg-0.7.0-incubating.tar.gz.asc) -- [sha512](https://dist.apache.org/repos/dist/release/incubator/iceberg/apache-iceberg-0.7.0-incubating/apache-iceberg-0.7.0-incubating.tar.gz.sha512) +* [0.7.0-incubating Spark 2.4 runtime Jar](https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime/0.7.0-incubating/iceberg-spark-runtime-0.7.0-incubating.jar) + diff --git a/docs-new/home/roadmap.md b/docs-new/home/roadmap.md new file mode 100644 index 000000000000..c0b0efe9dd8d --- /dev/null +++ b/docs-new/home/roadmap.md @@ -0,0 +1,47 @@ +--- +title: "Roadmap" +--- + + +# Roadmap Overview + +This roadmap outlines projects that the Iceberg 
community is working on, their priority, and a rough size estimate. +This is based on the latest [community priority discussion](https://lists.apache.org/thread.html/r84e80216c259c81f824c6971504c321cd8c785774c489d52d4fc123f%40%3Cdev.iceberg.apache.org%3E). +Each high-level item links to a GitHub project board that tracks the current status. +Related design docs will be linked on the planning boards. + +# Priority 1 + +* API: [Iceberg 1.0.0](https://github.com/apache/iceberg/projects/3) [medium] +* Python: [Pythonic refactor](https://github.com/apache/iceberg/projects/7) [medium] +* Spec: [Z-ordering / Space-filling curves](https://github.com/apache/iceberg/projects/16) [medium] +* Spec: [Snapshot tagging and branching](https://github.com/apache/iceberg/projects/4) [small] +* Views: [Spec](https://github.com/apache/iceberg/projects/6) [medium] +* Puffin: [Implement statistics information in table snapshot](https://github.com/apache/iceberg/pull/4741) [medium] +* Flink: [FLIP-27 based Iceberg source](https://github.com/apache/iceberg/projects/23) [large] + +# Priority 2 + +* ORC: [Support delete files stored as ORC](https://github.com/apache/iceberg/projects/13) [small] +* Spark: [DSv2 streaming improvements](https://github.com/apache/iceberg/projects/2) [small] +* Flink: [Inline file compaction](https://github.com/apache/iceberg/projects/14) [small] +* Flink: [Support UPSERT](https://github.com/apache/iceberg/projects/15) [small] +* Spec: [Secondary indexes](https://github.com/apache/iceberg/projects/17) [large] +* Spec v3: [Encryption](https://github.com/apache/iceberg/projects/5) [large] +* Spec v3: [Relative paths](https://github.com/apache/iceberg/projects/18) [large] +* Spec v3: [Default field values](https://github.com/apache/iceberg/projects/19) [medium] diff --git a/docs-new/home/security.md b/docs-new/home/security.md new file mode 100644 index 000000000000..ee0850263912 --- /dev/null +++ b/docs-new/home/security.md @@ -0,0 +1,33 @@ +--- +title: "Security" 
+--- + + +# Reporting Security Issues + +The Apache Iceberg Project uses the standard process outlined by the [Apache +Security Team](https://www.apache.org/security/) for reporting vulnerabilities. +Note that vulnerabilities should not be publicly disclosed until the project has +responded. + +To report a possible security vulnerability, please email security@iceberg.apache.org. + + +# Verifying Signed Releases + +Please refer to the instructions on the [Release Verification](https://www.apache.org/info/verification.html) page. diff --git a/docs-new/home/spark-quickstart.md b/docs-new/home/spark-quickstart.md new file mode 100644 index 000000000000..cd5efdb44c00 --- /dev/null +++ b/docs-new/home/spark-quickstart.md @@ -0,0 +1,342 @@ +--- +title: "Spark and Iceberg Quickstart" +--- + + +## Spark and Iceberg Quickstart + +This guide will get you up and running with an Iceberg and Spark environment, including sample code to +highlight some powerful features. You can learn more about Iceberg's Spark runtime by checking out the [Spark](docs/latest/spark-ddl.md) section. + +- [Docker-Compose](#docker-compose) +- [Creating a table](#creating-a-table) +- [Writing Data to a Table](#writing-data-to-a-table) +- [Reading Data from a Table](#reading-data-from-a-table) +- [Adding A Catalog](#adding-a-catalog) +- [Next Steps](#next-steps) + +### Docker-Compose + +The fastest way to get started is to use a docker-compose file that uses the [tabulario/spark-iceberg](https://hub.docker.com/r/tabulario/spark-iceberg) image +which contains a local Spark cluster with a configured Iceberg catalog. To use this, you'll need to install the [Docker CLI](https://docs.docker.com/get-docker/) as well as the [Docker Compose CLI](https://github.com/docker/compose-cli/blob/main/INSTALL.md). 
+ +Once you have those, save the yaml below into a file named `docker-compose.yml`: + +```yaml +version: "3" + +services: + spark-iceberg: + image: tabulario/spark-iceberg + container_name: spark-iceberg + build: spark/ + networks: + iceberg_net: + depends_on: + - rest + - minio + volumes: + - ./warehouse:/home/iceberg/warehouse + - ./notebooks:/home/iceberg/notebooks/notebooks + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + ports: + - 8888:8888 + - 8080:8080 + - 10000:10000 + - 10001:10001 + rest: + image: tabulario/iceberg-rest + container_name: iceberg-rest + networks: + iceberg_net: + ports: + - 8181:8181 + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + - CATALOG_WAREHOUSE=s3://warehouse/ + - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO + - CATALOG_S3_ENDPOINT=http://minio:9000 + minio: + image: minio/minio + container_name: minio + environment: + - MINIO_ROOT_USER=admin + - MINIO_ROOT_PASSWORD=password + - MINIO_DOMAIN=minio + networks: + iceberg_net: + aliases: + - warehouse.minio + ports: + - 9001:9001 + - 9000:9000 + command: ["server", "/data", "--console-address", ":9001"] + mc: + depends_on: + - minio + image: minio/mc + container_name: mc + networks: + iceberg_net: + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + entrypoint: > + /bin/sh -c " + until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done; + /usr/bin/mc rm -r --force minio/warehouse; + /usr/bin/mc mb minio/warehouse; + /usr/bin/mc policy set public minio/warehouse; + tail -f /dev/null + " +networks: + iceberg_net: + +``` + +Next, start up the docker containers with this command: +```sh +docker-compose up +``` + +You can then run any of the following commands to start a Spark session. 
+ +=== "SparkSQL" + + ``` sh + docker exec -it spark-iceberg spark-sql + ``` + +=== "Spark-Shell" + + ``` sh + docker exec -it spark-iceberg spark-shell + ``` + +=== "PySpark" + + ``` sh + docker exec -it spark-iceberg pyspark + ``` + +!!! note + + You can also launch a notebook server by running `docker exec -it spark-iceberg notebook`. + The notebook server will be available at [http://localhost:8888](http://localhost:8888) + +### Creating a table + +To create your first Iceberg table in Spark, run a [`CREATE TABLE`](docs/latest/spark-ddl.md#create-table) command. Let's create a table +using `demo.nyc.taxis` where `demo` is the catalog name, `nyc` is the database name, and `taxis` is the table name. + + +=== "SparkSQL" + + ```sql + CREATE TABLE demo.nyc.taxis + ( + vendor_id bigint, + trip_id bigint, + trip_distance float, + fare_amount double, + store_and_fwd_flag string + ) + PARTITIONED BY (vendor_id); + ``` + +=== "Spark-Shell" + + ```scala + import org.apache.spark.sql.types._ + import org.apache.spark.sql.Row + val schema = StructType( Array( + StructField("vendor_id", LongType,true), + StructField("trip_id", LongType,true), + StructField("trip_distance", FloatType,true), + StructField("fare_amount", DoubleType,true), + StructField("store_and_fwd_flag", StringType,true) + )) + val df = spark.createDataFrame(spark.sparkContext.emptyRDD[Row],schema) + df.writeTo("demo.nyc.taxis").create() + ``` + +=== "PySpark" + + ```py + from pyspark.sql.types import DoubleType, FloatType, LongType, StructType,StructField, StringType + schema = StructType([ + StructField("vendor_id", LongType(), True), + StructField("trip_id", LongType(), True), + StructField("trip_distance", FloatType(), True), + StructField("fare_amount", DoubleType(), True), + StructField("store_and_fwd_flag", StringType(), True) + ]) + + df = spark.createDataFrame([], schema) + df.writeTo("demo.nyc.taxis").create() + ``` + + +Iceberg catalogs support the full range of SQL DDL commands, including: + +* 
[`CREATE TABLE ... PARTITIONED BY`](docs/latest/spark-ddl.md#create-table) +* [`CREATE TABLE ... AS SELECT`](docs/latest/spark-ddl.md#create-table--as-select) +* [`ALTER TABLE`](docs/latest/spark-ddl.md#alter-table) +* [`DROP TABLE`](docs/latest/spark-ddl.md#drop-table) + +### Writing Data to a Table + +Once your table is created, you can insert records. + +=== "SparkSQL" + + ```sql + INSERT INTO demo.nyc.taxis + VALUES (1, 1000371, 1.8, 15.32, 'N'), (2, 1000372, 2.5, 22.15, 'N'), (2, 1000373, 0.9, 9.01, 'N'), (1, 1000374, 8.4, 42.13, 'Y'); + ``` + +=== "Spark-Shell" + + ```scala + import org.apache.spark.sql.Row + + val schema = spark.table("demo.nyc.taxis").schema + val data = Seq( + Row(1: Long, 1000371: Long, 1.8f: Float, 15.32: Double, "N": String), + Row(2: Long, 1000372: Long, 2.5f: Float, 22.15: Double, "N": String), + Row(2: Long, 1000373: Long, 0.9f: Float, 9.01: Double, "N": String), + Row(1: Long, 1000374: Long, 8.4f: Float, 42.13: Double, "Y": String) + ) + val df = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) + df.writeTo("demo.nyc.taxis").append() + ``` + +=== "PySpark" + + ```py + schema = spark.table("demo.nyc.taxis").schema + data = [ + (1, 1000371, 1.8, 15.32, "N"), + (2, 1000372, 2.5, 22.15, "N"), + (2, 1000373, 0.9, 9.01, "N"), + (1, 1000374, 8.4, 42.13, "Y") + ] + df = spark.createDataFrame(data, schema) + df.writeTo("demo.nyc.taxis").append() + ``` + +### Reading Data from a Table + +To read a table, simply use the Iceberg table's name. + + +=== "SparkSQL" + + ```sql + SELECT * FROM demo.nyc.taxis; + ``` + +=== "Spark-Shell" + + ```scala + val df = spark.table("demo.nyc.taxis").show() + ``` + +=== "PySpark" + + ```py + df = spark.table("demo.nyc.taxis").show() + ``` + +### Adding A Catalog + +Iceberg has several catalog back-ends that can be used to track tables, like JDBC, Hive MetaStore and Glue. +Catalogs are configured using properties under `spark.sql.catalog.(catalog_name)`. 
In this guide, +we use a Hadoop (path-based) catalog, but you can follow these instructions to configure other catalog types. To learn more, check out +the [Catalog](docs/latest/spark-configuration.md#catalogs) page in the Spark section. + +This configuration creates a path-based catalog named `local` for tables under `$PWD/warehouse` and adds support for Iceberg tables to Spark's built-in catalog. + +=== "CLI" + + ```sh + spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{ icebergVersion }}\ + --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ + --conf spark.sql.catalog.spark_catalog.type=hive \ + --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ + --conf spark.sql.catalog.local.type=hadoop \ + --conf spark.sql.catalog.local.warehouse=$PWD/warehouse \ + --conf spark.sql.defaultCatalog=local + ``` + +=== "spark-defaults.conf" + + ```sh + spark.jars.packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{ icebergVersion }} + spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + spark.sql.catalog.spark_catalog org.apache.iceberg.spark.SparkSessionCatalog + spark.sql.catalog.spark_catalog.type hive + spark.sql.catalog.local org.apache.iceberg.spark.SparkCatalog + spark.sql.catalog.local.type hadoop + spark.sql.catalog.local.warehouse $PWD/warehouse + spark.sql.defaultCatalog local + ``` + +!!! note + If your Iceberg catalog is not set as the default catalog, you will have to switch to it by executing `USE local;` + +### Next steps + +#### Adding Iceberg to Spark + +If you already have a Spark environment, you can add Iceberg, using the `--packages` option. 
+ +=== "SparkSQL" + + ```sh + spark-sql --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{ icebergVersion }} + ``` + +=== "Spark-Shell" + + ```sh + spark-shell --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{ icebergVersion }} + ``` + +=== "PySpark" + + ```sh + pyspark --packages org.apache.iceberg:iceberg-spark-runtime-3.2_2.12:{{ icebergVersion }} + ``` + +!!! note + If you want to include Iceberg in your Spark installation, add the Iceberg Spark runtime to Spark's `jars` folder. + You can download the runtime by visiting the [Releases](releases.md) page. + +[spark-runtime-jar]: https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-3.2_2.12/{{ icebergVersion }}/iceberg-spark-runtime-3.2_2.12-{{ icebergVersion }}.jar + +#### Learn More + +Now that you're up and running with Iceberg and Spark, check out the [Iceberg-Spark docs](docs/latest/spark-ddl.md) to learn more! diff --git a/docs-new/home/spec.md b/docs-new/home/spec.md new file mode 100644 index 000000000000..e1c1ed1b49c0 --- /dev/null +++ b/docs-new/home/spec.md @@ -0,0 +1,1259 @@ +--- +title: "Spec" +url: spec +toc: true +disableSidebar: true +--- + + +# Iceberg Table Spec + +This is a specification for the Iceberg table format that is designed to manage a large, slow-changing collection of files in a distributed file system or key-value store as a table. + +## Format Versioning + +Versions 1 and 2 of the Iceberg spec are complete and adopted by the community. + +The format version number is incremented when new features are added that will break forward-compatibility---that is, when older readers would not read newer table features correctly. Tables may continue to be written with an older version of the spec to ensure compatibility by not using features that are not yet implemented by processing engines. 
+ +#### Version 1: Analytic Data Tables + +Version 1 of the Iceberg spec defines how to manage large analytic tables using immutable file formats: Parquet, Avro, and ORC. + +All version 1 data and metadata files are valid after upgrading a table to version 2. [Appendix E](#version-2) documents how to default version 2 fields when reading version 1 metadata. + +#### Version 2: Row-level Deletes + +Version 2 of the Iceberg spec adds row-level updates and deletes for analytic tables with immutable files. + +The primary change in version 2 adds delete files to encode rows that are deleted in existing data files. This version can be used to delete or replace individual rows in immutable data files without rewriting the files. + +In addition to row-level deletes, version 2 makes some requirements stricter for writers. The full set of changes are listed in [Appendix E](#version-2). + + +## Goals + +* **Serializable isolation** -- Reads will be isolated from concurrent writes and always use a committed snapshot of a table’s data. Writes will support removing and adding files in a single operation and are never partially visible. Readers will not acquire locks. +* **Speed** -- Operations will use O(1) remote calls to plan the files for a scan and not O(n) where n grows with the size of the table, like the number of partitions or files. +* **Scale** -- Job planning will be handled primarily by clients and not bottleneck on a central metadata store. Metadata will include information needed for cost-based optimization. +* **Evolution** -- Tables will support full schema and partition spec evolution. Schema evolution supports safe column add, drop, reorder and rename, including in nested structures. +* **Dependable types** -- Tables will provide well-defined and dependable support for a core set of types. +* **Storage separation** -- Partitioning will be table configuration. Reads will be planned using predicates on data values, not partition values. 
Tables will support evolving partition schemes. +* **Formats** -- Underlying data file formats will support identical schema evolution rules and types. Both read-optimized and write-optimized formats will be available. + +## Overview + +![Iceberg snapshot structure](assets/images/iceberg-metadata.png) + +This table format tracks individual data files in a table instead of directories. This allows writers to create data files in-place and only adds files to the table in an explicit commit. + +Table state is maintained in metadata files. All changes to table state create a new metadata file and replace the old metadata with an atomic swap. The table metadata file tracks the table schema, partitioning config, custom properties, and snapshots of the table contents. A snapshot represents the state of a table at some time and is used to access the complete set of data files in the table. + +Data files in snapshots are tracked by one or more manifest files that contain a row for each data file in the table, the file's partition data, and its metrics. The data in a snapshot is the union of all files in its manifests. Manifest files are reused across snapshots to avoid rewriting metadata that is slow-changing. Manifests can track data files with any subset of a table and are not associated with partitions. + +The manifests that make up a snapshot are stored in a manifest list file. Each manifest list stores metadata about manifests, including partition stats and data file counts. These stats are used to avoid reading manifests that are not required for an operation. + +#### Optimistic Concurrency + +An atomic swap of one table metadata file for another provides the basis for serializable isolation. Readers use the snapshot that was current when they load the table metadata and are not affected by changes until they refresh and pick up a new metadata location. 
+ +Writers create table metadata files optimistically, assuming that the current version will not be changed before the writer's commit. Once a writer has created an update, it commits by swapping the table’s metadata file pointer from the base version to the new version. + +If the snapshot on which an update is based is no longer current, the writer must retry the update based on the new current version. Some operations support retry by re-applying metadata changes and committing, under well-defined conditions. For example, a change that rewrites files can be applied to a new table snapshot if all of the rewritten files are still in the table. + +The conditions required by a write to successfully commit determine the isolation level. Writers can select what to validate and can make different isolation guarantees. + +#### Sequence Numbers + +The relative age of data and delete files relies on a sequence number that is assigned to every successful commit. When a snapshot is created for a commit, it is optimistically assigned the next sequence number, and it is written into the snapshot's metadata. If the commit fails and must be retried, the sequence number is reassigned and written into new snapshot metadata. + +All manifests, data files, and delete files created for a snapshot inherit the snapshot's sequence number. Manifest file metadata in the manifest list stores a manifest's sequence number. New data and metadata file entries are written with `null` in place of a sequence number, which is replaced with the manifest's sequence number at read time. When a data or delete file is written to a new manifest (as "existing"), the inherited sequence number is written to ensure it does not change after it is first inherited. + +Inheriting the sequence number from manifest metadata allows writing a new manifest once and reusing it in commit retries. 
To change a sequence number for a retry, only the manifest list must be rewritten -- which would be rewritten anyway with the latest set of manifests. + + +#### Row-level Deletes + +Row-level deletes are stored in delete files. + +There are two ways to encode a row-level delete: + +* [_Position deletes_](#position-delete-files) mark a row deleted by data file path and the row position in the data file +* [_Equality deletes_](#equality-delete-files) mark a row deleted by one or more column values, like `id = 5` + +Like data files, delete files are tracked by partition. In general, a delete file must be applied to older data files with the same partition; see [Scan Planning](#scan-planning) for details. Column metrics can be used to determine whether a delete file's rows overlap the contents of a data file or a scan range. + + +#### File System Operations + +Iceberg only requires that file systems support the following operations: + +* **In-place write** -- Files are not moved or altered once they are written. +* **Seekable reads** -- Data file formats require seek support. +* **Deletes** -- Tables delete files that are no longer used. + +These requirements are compatible with object stores, like S3. + +Tables do not require random-access writes. Once written, data and metadata files are immutable until they are deleted. + +Tables do not require rename, except for tables that use atomic rename to implement the commit operation for new metadata files. + + +## Specification + +#### Terms + +* **Schema** -- Names and types of fields in a table. +* **Partition spec** -- A definition of how partition values are derived from data fields. +* **Snapshot** -- The state of a table at some point in time, including the set of all data files. +* **Manifest list** -- A file that lists manifest files; one per snapshot. +* **Manifest** -- A file that lists data or delete files; a subset of a snapshot. +* **Data file** -- A file that contains rows of a table. 
+* **Delete file** -- A file that encodes rows of a table that are deleted by position or data values. + +#### Writer requirements + +Some tables in this spec have columns that specify requirements for v1 and v2 tables. These requirements are intended for writers when adding metadata files to a table with the given version. + +| Requirement | Write behavior | +|-------------|----------------| +| (blank) | The field should be omitted | +| _optional_ | The field can be written | +| _required_ | The field must be written | + +Readers should be more permissive because v1 metadata files are allowed in v2 tables so that tables can be upgraded to v2 without rewriting the metadata tree. For manifest list and manifest files, this table shows the expected v2 read behavior: + +| v1 | v2 | v2 read behavior | +|------------|------------|------------------| +| | _optional_ | Read the field as _optional_ | +| | _required_ | Read the field as _optional_; it may be missing in v1 files | +| _optional_ | | Ignore the field | +| _optional_ | _optional_ | Read the field as _optional_ | +| _optional_ | _required_ | Read the field as _optional_; it may be missing in v1 files | +| _required_ | | Ignore the field | +| _required_ | _optional_ | Read the field as _optional_ | +| _required_ | _required_ | Fill in a default or throw an exception if the field is missing | + +Readers may be more strict for metadata JSON files because the JSON files are not reused and will always match the table version. Required v2 fields that were not present in v1 or optional in v1 may be handled as required fields. For example, a v2 table that is missing `last-sequence-number` can throw an exception. + +### Schemas and Data Types + +A table's **schema** is a list of named columns. All data types are either primitives or nested types, which are maps, lists, or structs. A table schema is also a struct type. + +For the representations of these types in Avro, ORC, and Parquet file formats, see Appendix A. 
+ +#### Nested Types + +A **`struct`** is a tuple of typed values. Each field in the tuple is named and has an integer id that is unique in the table schema. Each field can be either optional or required, meaning that values can (or cannot) be null. Fields may be any type. Fields may have an optional comment or doc string. Fields can have [default values](#default-values). + +A **`list`** is a collection of values with some element type. The element field has an integer id that is unique in the table schema. Elements can be either optional or required. Element types may be any type. + +A **`map`** is a collection of key-value pairs with a key type and a value type. Both the key field and value field each have an integer id that is unique in the table schema. Map keys are required and map values can be either optional or required. Both map keys and map values may be any type, including nested types. + +#### Primitive Types + +| Primitive type | Description | Requirements | +|--------------------|--------------------------------------------------------------------------|--------------------------------------------------| +| **`boolean`** | True or false | | +| **`int`** | 32-bit signed integers | Can promote to `long` | +| **`long`** | 64-bit signed integers | | +| **`float`** | [32-bit IEEE 754](https://en.wikipedia.org/wiki/IEEE_754) floating point | Can promote to double | +| **`double`** | [64-bit IEEE 754](https://en.wikipedia.org/wiki/IEEE_754) floating point | | +| **`decimal(P,S)`** | Fixed-point decimal; precision P, scale S | Scale is fixed [1], precision must be 38 or less | +| **`date`** | Calendar date without timezone or time | | +| **`time`** | Time of day without date, timezone | Microsecond precision [2] | +| **`timestamp`** | Timestamp without timezone | Microsecond precision [2] | +| **`timestamptz`** | Timestamp with timezone | Stored as UTC [2] | +| **`string`** | Arbitrary-length character sequences | Encoded with UTF-8 [3] | +| **`uuid`** | 
Universally unique identifiers | Should use 16-byte fixed | +| **`fixed(L)`** | Fixed-length byte array of length L | | +| **`binary`** | Arbitrary-length byte array | | + +Notes: + +1. Decimal scale is fixed and cannot be changed by schema evolution. Precision can only be widened. +2. All time and timestamp values are stored with microsecond precision. + - Timestamps _with time zone_ represent a point in time: values are stored as UTC and do not retain a source time zone (`2017-11-16 17:10:34 PST` is stored/retrieved as `2017-11-17 01:10:34 UTC` and these values are considered identical). + - Timestamps _without time zone_ represent a date and time of day regardless of zone: the time value is independent of zone adjustments (`2017-11-16 17:10:34` is always retrieved as `2017-11-16 17:10:34`). Timestamp values are stored as a long that encodes microseconds from the unix epoch. +3. Character strings must be stored as UTF-8 encoded byte arrays. + +For details on how to serialize a schema to JSON, see Appendix C. + + +#### Default values + +Default values can be tracked for struct fields (both nested structs and the top-level schema's struct). There can be two defaults with a field: +- `initial-default` is used to populate the field's value for all records that were written before the field was added to the schema +- `write-default` is used to populate the field's value for any records written after the field was added to the schema, if the writer does not supply the field's value + +The `initial-default` is set only when a field is added to an existing schema. The `write-default` is initially set to the same value as `initial-default` and can be changed through schema evolution. If either default is not set for an optional field, then the default value is null for compatibility with older spec versions. + +The `initial-default` and `write-default` produce SQL default value behavior, without rewriting data files. 
SQL default value behavior when a field is added handles all existing rows as though the rows were written with the new field's default value. Default value changes may only affect future records and all known fields are written into data files. Omitting a known field when writing a data file is never allowed. The write default for a field must be written if a field is not supplied to a write. If the write default for a required field is not set, the writer must fail. + +Default values are attributes of fields in schemas and serialized with fields in the JSON format. See [Appendix C](#appendix-c-json-serialization). + + +#### Schema Evolution + +Schemas may be evolved by type promotion or adding, deleting, renaming, or reordering fields in structs (both nested structs and the top-level schema’s struct). + +Evolution applies changes to the table's current schema to produce a new schema that is identified by a unique schema ID, is added to the table's list of schemas, and is set as the table's current schema. + +Valid type promotions are: + +* `int` to `long` +* `float` to `double` +* `decimal(P, S)` to `decimal(P', S)` if `P' > P` -- widen the precision of decimal types. + +Any struct, including a top-level schema, can evolve through deleting fields, adding new fields, renaming existing fields, reordering existing fields, or promoting a primitive using the valid type promotions. Adding a new field assigns a new ID for that field and for any nested fields. Renaming an existing field must change the name, but not the field ID. Deleting a field removes it from the current schema. Field deletion cannot be rolled back unless the field was nullable or if the current snapshot has not changed. + +Grouping a subset of a struct’s fields into a nested struct is **not** allowed, nor is moving fields from a nested struct into its immediate parent struct (`struct<a, b, c> ↔ struct<a, struct<b, c>>`). 
Evolving primitive types to structs is **not** allowed, nor is evolving a single-field struct to a primitive (`map<string, int> ↔ map<string, struct<int>>`). + +Struct evolution requires the following rules for default values: +* The `initial-default` must be set when a field is added and cannot change +* The `write-default` must be set when a field is added and may change +* When a required field is added, both defaults must be set to a non-null value +* When an optional field is added, the defaults may be null and should be explicitly set +* When a new field is added to a struct with a default value, updating the struct's default is optional +* If a field value is missing from a struct's `initial-default`, the field's `initial-default` must be used for the field +* If a field value is missing from a struct's `write-default`, the field's `write-default` must be used for the field + + +#### Column Projection + +Columns in Iceberg data files are selected by field id. The table schema's column names and order may change after a data file is written, and projection must be done using field ids. If a field id is missing from a data file, its value for each row should be `null`. + +For example, a file may be written with schema `1: a int, 2: b string, 3: c double` and read using projection schema `3: measurement, 2: name, 4: a`. This must select file columns `c` (renamed to `measurement`), `b` (now called `name`), and a column of `null` values called `a`; in that order. + +Tables may also define a property `schema.name-mapping.default` with a JSON name mapping containing a list of field mapping objects. These mappings provide fallback field ids to be used when a data file does not contain field id information. Each object should contain: + +* `names`: A required list of 0 or more names for a field. +* `field-id`: An optional Iceberg field ID used when a field's name is present in `names` +* `fields`: An optional list of field mappings for child fields of structs, maps, and lists. 
+ +Field mapping fields are constrained by the following rules: + +* A name may contain `.` but this refers to a literal name, not a nested field. For example, `a.b` refers to a field named `a.b`, not child field `b` of field `a`. +* Each child field should be defined with their own field mapping under `fields`. +* Multiple values for `names` may be mapped to a single field ID to support cases where a field may have different names in different data files. For example, all Avro field aliases should be listed in `names`. +* Fields which exist only in the Iceberg schema and not in imported data files may use an empty `names` list. +* Fields that exist in imported files but not in the Iceberg schema may omit `field-id`. +* List types should contain a mapping in `fields` for `element`. +* Map types should contain mappings in `fields` for `key` and `value`. +* Struct types should contain mappings in `fields` for their child fields. + +For details on serialization, see [Appendix C](#name-mapping-serialization). + +#### Identifier Field IDs + +A schema can optionally track the set of primitive fields that identify rows in a table, using the property `identifier-field-ids` (see JSON encoding in Appendix C). + +Two rows are the "same"---that is, the rows represent the same entity---if the identifier fields are equal. However, uniqueness of rows by this identifier is not guaranteed or required by Iceberg and it is the responsibility of processing engines or data providers to enforce. + +Identifier fields may be nested in structs but cannot be nested within maps or lists. Float, double, and optional fields cannot be used as identifier fields and a nested field cannot be used as an identifier field if it is nested in an optional struct, to avoid null values in identifiers. + + +#### Reserved Field IDs + +Iceberg tables must not use field ids greater than 2147483447 (`Integer.MAX_VALUE - 200`). 
This id range is reserved for metadata columns that can be used in user data schemas, like the `_file` column that holds the file path in which a row was stored. + +The set of metadata columns is: + +| Field id, name | Type | Description | +|-----------------------------|---------------|-------------| +| **`2147483646 _file`** | `string` | Path of the file in which a row is stored | +| **`2147483645 _pos`** | `long` | Ordinal position of a row in the source data file | +| **`2147483644 _deleted`** | `boolean` | Whether the row has been deleted | +| **`2147483643 _spec_id`** | `int` | Spec ID used to track the file containing a row | +| **`2147483642 _partition`** | `struct` | Partition to which a row belongs | +| **`2147483546 file_path`** | `string` | Path of a file, used in position-based delete files | +| **`2147483545 pos`** | `long` | Ordinal position of a row, used in position-based delete files | +| **`2147483544 row`** | `struct<...>` | Deleted row values, used in position-based delete files | + + +### Partitioning + +Data files are stored in manifests with a tuple of partition values that are used in scans to filter out files that cannot contain records that match the scan’s filter predicate. Partition values for a data file must be the same for all records stored in the data file. (Manifests store data files from any partition, as long as the partition spec is the same for the data files.) + +Tables are configured with a **partition spec** that defines how to produce a tuple of partition values from a record. A partition spec has a list of fields that consist of: + +* A **source column id** from the table’s schema +* A **partition field id** that is used to identify a partition field and is unique within a partition spec. In v2 table metadata, it is unique across all partition specs. 
+* A **transform** that is applied to the source column to produce a partition value +* A **partition name** + +The source column, selected by id, must be a primitive type and cannot be contained in a map or list, but may be nested in a struct. For details on how to serialize a partition spec to JSON, see Appendix C. + +Partition specs capture the transform from table data to partition values. This is used to transform predicates to partition predicates, in addition to transforming data values. Deriving partition predicates from column predicates on the table data is used to separate the logical queries from physical storage: the partitioning can change and the correct partition filters are always derived from column predicates. This simplifies queries because users don’t have to supply both logical predicates and partition predicates. For more information, see Scan Planning below. + + +#### Partition Transforms + +| Transform name | Description | Source types | Result type | +|-------------------|--------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------|-------------| +| **`identity`** | Source value, unmodified | Any | Source type | +| **`bucket[N]`** | Hash of value, mod `N` (see below) | `int`, `long`, `decimal`, `date`, `time`, `timestamp`, `timestamptz`, `string`, `uuid`, `fixed`, `binary` | `int` | +| **`truncate[W]`** | Value truncated to width `W` (see below) | `int`, `long`, `decimal`, `string` | Source type | +| **`year`** | Extract a date or timestamp year, as years from 1970 | `date`, `timestamp`, `timestamptz` | `int` | +| **`month`** | Extract a date or timestamp month, as months from 1970-01-01 | `date`, `timestamp`, `timestamptz` | `int` | +| **`day`** | Extract a date or timestamp day, as days from 1970-01-01 | `date`, `timestamp`, `timestamptz` | `int` | +| **`hour`** | Extract a timestamp hour, as hours from 1970-01-01 00:00:00 | 
`timestamp`, `timestamptz` | `int` | +| **`void`** | Always produces `null` | Any | Source type or `int` | + +All transforms must return `null` for a `null` input value. + +The `void` transform may be used to replace the transform in an existing partition field so that the field is effectively dropped in v1 tables. See partition evolution below. + + +#### Bucket Transform Details + +Bucket partition transforms use a 32-bit hash of the source value. The 32-bit hash implementation is the 32-bit Murmur3 hash, x86 variant, seeded with 0. + +Transforms are parameterized by a number of buckets [1], `N`. The hash mod `N` must produce a positive value by first discarding the sign bit of the hash value. In pseudo-code, the function is: + +``` + def bucket_N(x) = (murmur3_x86_32_hash(x) & Integer.MAX_VALUE) % N +``` + +Notes: + +1. Changing the number of buckets as a table grows is possible by evolving the partition spec. + +For hash function details by type, see Appendix B. + + +#### Truncate Transform Details + +| **Type** | **Config** | **Truncate specification** | **Examples** | +|---------------|-----------------------|------------------------------------------------------------------|----------------------------------| +| **`int`** | `W`, width | `v - (v % W)` remainders must be positive [1] | `W=10`: `1` → `0`, `-1` → `-10` | +| **`long`** | `W`, width | `v - (v % W)` remainders must be positive [1] | `W=10`: `1` → `0`, `-1` → `-10` | +| **`decimal`** | `W`, width (no scale) | `scaled_W = decimal(W, scale(v))` `v - (v % scaled_W)` [1, 2] | `W=50`, `s=2`: `10.65` → `10.50` | +| **`string`** | `L`, length | Substring of length `L`: `v.substring(0, L)` [3] | `L=3`: `iceberg` → `ice` | + +Notes: + +1. The remainder, `v % W`, must be positive. For languages where `%` can produce negative values, the correct truncate function is: `v - (((v % W) + W) % W)` +2. 
The width, `W`, used to truncate decimal values is applied using the scale of the decimal column to avoid additional (and potentially conflicting) parameters. +3. Strings are truncated to a valid UTF-8 string with no more than `L` code points. + + +#### Partition Evolution + +Table partitioning can be evolved by adding, removing, renaming, or reordering partition spec fields. + +Changing a partition spec produces a new spec identified by a unique spec ID that is added to the table's list of partition specs and may be set as the table's default spec. + +When evolving a spec, changes should not cause partition field IDs to change because the partition field IDs are used as the partition tuple field IDs in manifest files. + +In v2, partition field IDs must be explicitly tracked for each partition field. New IDs are assigned based on the last assigned partition ID in table metadata. + +In v1, partition field IDs were not tracked, but were assigned sequentially starting at 1000 in the reference implementation. This assignment caused problems when reading metadata tables based on manifest files from multiple specs because partition fields with the same ID may contain different data types. For compatibility with old versions, the following rules are recommended for partition evolution in v1 tables: + +1. Do not reorder partition fields +2. Do not drop partition fields; instead replace the field's transform with the `void` transform +3. Only add partition fields at the end of the previous partition spec + + +### Sorting + +Users can sort their data within partitions by columns to gain performance. The information on how the data is sorted can be declared per data or delete file, by a **sort order**. + +A sort order is defined by a sort order id and a list of sort fields. The order of the sort fields within the list defines the order in which the sort is applied to the data. 
Each sort field consists of: + +* A **source column id** from the table's schema +* A **transform** that is used to produce values to be sorted on from the source column. This is the same transform as described in [partition transforms](#partition-transforms). +* A **sort direction**, that can only be either `asc` or `desc` +* A **null order** that describes the order of null values when sorted. Can only be either `nulls-first` or `nulls-last` + +Order id `0` is reserved for the unsorted order. + +Sorting floating-point numbers should produce the following behavior: `-NaN` < `-Infinity` < `-value` < `-0` < `0` < `value` < `Infinity` < `NaN`. This aligns with the implementation of Java floating-point types comparisons. + +A data or delete file is associated with a sort order by the sort order's id within [a manifest](#manifests). Therefore, the table must declare all the sort orders for lookup. A table could also be configured with a default sort order id, indicating how the new data should be sorted by default. Writers should use this default sort order to sort the data on write, but are not required to if the default order is prohibitively expensive, as it would be for streaming writes. + + +### Manifests + +A manifest is an immutable Avro file that lists data files or delete files, along with each file’s partition data tuple, metrics, and tracking information. One or more manifest files are used to store a [snapshot](#snapshots), which tracks all of the files in a table at some point in time. Manifests are tracked by a [manifest list](#manifest-lists) for each table snapshot. + +A manifest is a valid Iceberg data file: files must use valid Iceberg formats, schemas, and column projection. + +A manifest may store either data files or delete files, but not both because manifests that contain delete files are scanned first during job planning. Whether a manifest is a data manifest or a delete manifest is stored in manifest metadata. 
+ +A manifest stores files for a single partition spec. When a table’s partition spec changes, old files remain in the older manifest and newer files are written to a new manifest. This is required because a manifest file’s schema is based on its partition spec (see below). The partition spec of each manifest is also used to transform predicates on the table's data rows into predicates on partition values that are used during job planning to select files from a manifest. + +A manifest file must store the partition spec and other metadata as properties in the Avro file's key-value metadata: + +| v1 | v2 | Key | Value | +|------------|------------|---------------------|------------------------------------------------------------------------------| +| _required_ | _required_ | `schema` | JSON representation of the table schema at the time the manifest was written | +| _optional_ | _required_ | `schema-id` | ID of the schema used to write the manifest as a string | +| _required_ | _required_ | `partition-spec` | JSON fields representation of the partition spec used to write the manifest | +| _optional_ | _required_ | `partition-spec-id` | ID of the partition spec used to write the manifest as a string | +| _optional_ | _required_ | `format-version` | Table format version number of the manifest as a string | +| | _required_ | `content` | Type of content files tracked by the manifest: "data" or "deletes" | + +The schema of a manifest file is a struct called `manifest_entry` with the following fields: + +| v1 | v2 | Field id, name | Type | Description | +| ---------- | ---------- |-------------------------------|-----------------------------------------------------------|-------------| +| _required_ | _required_ | **`0 status`** | `int` with meaning: `0: EXISTING` `1: ADDED` `2: DELETED` | Used to track additions and deletions. Deletes are informational only and not used in scans. 
| +| _required_ | _optional_ | **`1 snapshot_id`** | `long` | Snapshot id where the file was added, or deleted if status is 2. Inherited when null. | +| | _optional_ | **`3 sequence_number`** | `long` | Data sequence number of the file. Inherited when null and status is 1 (added). | +| | _optional_ | **`4 file_sequence_number`** | `long` | File sequence number indicating when the file was added. Inherited when null and status is 1 (added). | +| _required_ | _required_ | **`2 data_file`** | `data_file` `struct` (see below) | File path, partition tuple, metrics, ... | + +`data_file` is a struct with the following fields: + +| v1 | v2 | Field id, name | Type | Description | +| ---------- | ---------- |-----------------------------------|------------------------------|-------------| +| | _required_ | **`134 content`** | `int` with meaning: `0: DATA`, `1: POSITION DELETES`, `2: EQUALITY DELETES` | Type of content stored by the data file: data, equality deletes, or position deletes (all v1 files are data files) | +| _required_ | _required_ | **`100 file_path`** | `string` | Full URI for the file with FS scheme | +| _required_ | _required_ | **`101 file_format`** | `string` | String file format name, avro, orc or parquet | +| _required_ | _required_ | **`102 partition`** | `struct<...>` | Partition data tuple, schema based on the partition spec output using partition field ids for the struct field ids | +| _required_ | _required_ | **`103 record_count`** | `long` | Number of records in this file | +| _required_ | _required_ | **`104 file_size_in_bytes`** | `long` | Total file size in bytes | +| _required_ | | ~~**`105 block_size_in_bytes`**~~ | `long` | **Deprecated. Always write a default in v1. Do not write in v2.** | +| _optional_ | | ~~**`106 file_ordinal`**~~ | `int` | **Deprecated. Do not write.** | +| _optional_ | | ~~**`107 sort_columns`**~~ | `list<112: int>` | **Deprecated. 
Do not write.** | +| _optional_ | _optional_ | **`108 column_sizes`** | `map<117: int, 118: long>` | Map from column id to the total size on disk of all regions that store the column. Does not include bytes necessary to read other columns, like footers. Leave null for row-oriented formats (Avro) | +| _optional_ | _optional_ | **`109 value_counts`** | `map<119: int, 120: long>` | Map from column id to number of values in the column (including null and NaN values) | +| _optional_ | _optional_ | **`110 null_value_counts`** | `map<121: int, 122: long>` | Map from column id to number of null values in the column | +| _optional_ | _optional_ | **`137 nan_value_counts`** | `map<138: int, 139: long>` | Map from column id to number of NaN values in the column | +| _optional_ | _optional_ | **`111 distinct_counts`** | `map<123: int, 124: long>` | Map from column id to number of distinct values in the column; distinct counts must be derived using values in the file by counting or using sketches, but not using methods like merging existing distinct counts | +| _optional_ | _optional_ | **`125 lower_bounds`** | `map<126: int, 127: binary>` | Map from column id to lower bound in the column serialized as binary [1]. Each value must be less than or equal to all non-null, non-NaN values in the column for the file [2] | +| _optional_ | _optional_ | **`128 upper_bounds`** | `map<129: int, 130: binary>` | Map from column id to upper bound in the column serialized as binary [1]. Each value must be greater than or equal to all non-null, non-NaN values in the column for the file [2] | +| _optional_ | _optional_ | **`131 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | +| _optional_ | _optional_ | **`132 split_offsets`** | `list<133: long>` | Split offsets for the data file. For example, all row group offsets in a Parquet file. 
Must be sorted ascending | +| | _optional_ | **`135 equality_ids`** | `list<136: int>` | Field ids used to determine row equality in equality delete files. Required when `content=2` and should be null otherwise. Fields with ids listed in this column must be present in the delete file | +| _optional_ | _optional_ | **`140 sort_order_id`** | `int` | ID representing sort order for this file [3]. | + +Notes: + +1. Single-value serialization for lower and upper bounds is detailed in Appendix D. +2. For `float` and `double`, the value `-0.0` must precede `+0.0`, as in the IEEE 754 `totalOrder` predicate. NaNs are not permitted as lower or upper bounds. +3. If sort order ID is missing or unknown, then the order is assumed to be unsorted. Only data files and equality delete files should be written with a non-null order id. [Position deletes](#position-delete-files) are required to be sorted by file and position, not a table order, and should set sort order id to null. Readers must ignore sort order id for position delete files. +4. The following field ids are reserved on `data_file`: 141. + +The `partition` struct stores the tuple of partition values for each file. Its type is derived from the partition fields of the partition spec used to write the manifest file. In v2, the partition struct's field ids must match the ids from the partition spec. + +The column metrics maps are used when filtering to select both data and delete files. For delete files, the metrics must store bounds and counts for all deleted rows, or must be omitted. Storing metrics for deleted rows ensures that the values can be used during job planning to find delete files that must be merged during a scan. + + +#### Manifest Entry Fields + +The manifest entry fields are used to keep track of the snapshot in which files were added or logically deleted. The `data_file` struct is nested inside of the manifest entry so that it can be easily passed to job planning without the manifest entry fields. 
+ +When a file is added to the dataset, its manifest entry should store the snapshot ID in which the file was added and set status to 1 (added). + +When a file is replaced or deleted from the dataset, its manifest entry fields store the snapshot ID in which the file was deleted and status 2 (deleted). The file may be deleted from the file system when the snapshot in which it was deleted is garbage collected, assuming that older snapshots have also been garbage collected [1]. + +Iceberg v2 adds data and file sequence numbers to the entry and makes the snapshot ID optional. Values for these fields are inherited from manifest metadata when `null`. That is, if the field is `null` for an entry, then the entry must inherit its value from the manifest file's metadata, stored in the manifest list. +The `sequence_number` field represents the data sequence number and must never change after a file is added to the dataset. The data sequence number represents a relative age of the file content and should be used for planning which delete files apply to a data file. +The `file_sequence_number` field represents the sequence number of the snapshot that added the file and must also remain unchanged upon assigning at commit. The file sequence number can't be used for pruning delete files as the data within the file may have an older data sequence number. +The data and file sequence numbers are inherited only if the entry status is 1 (added). If the entry status is 0 (existing) or 2 (deleted), the entry must include both sequence numbers explicitly. + +Notes: + +1. Technically, data files can be deleted when the last snapshot that contains the file as “live” data is garbage collected. But this is harder to detect and requires finding the diff of multiple snapshots. It is easier to track what files are deleted in a snapshot and delete them when that snapshot expires. It is not recommended to add a deleted file back to a table. 
Adding a deleted file can lead to edge cases where incremental deletes can break table snapshots. +2. Manifest list files are required in v2, so that the `sequence_number` and `snapshot_id` to inherit are always available. + +#### Sequence Number Inheritance + +Manifests track the sequence number when a data or delete file was added to the table. + +When adding a new file, its data and file sequence numbers are set to `null` because the snapshot's sequence number is not assigned until the snapshot is successfully committed. When reading, sequence numbers are inherited by replacing `null` with the manifest's sequence number from the manifest list. +It is also possible to add a new file with data that logically belongs to an older sequence number. In that case, the data sequence number must be provided explicitly and not inherited. However, the file sequence number must always be assigned when the snapshot is successfully committed. + +When writing an existing file to a new manifest or marking an existing file as deleted, the data and file sequence numbers must be non-null and set to the original values that were either inherited or provided at commit time. + +Inheriting sequence numbers through the metadata tree allows writing a new manifest without a known sequence number, so that a manifest can be written once and reused in commit retries. To change a sequence number for a retry, only the manifest list must be rewritten. + +When reading v1 manifests with no sequence number column, sequence numbers for all files must default to 0. + + +### Snapshots + +A snapshot consists of the following fields: + +| v1 | v2 | Field | Description | +| ---------- | ---------- | ------------------------ | ----------- | +| _required_ | _required_ | **`snapshot-id`** | A unique long ID | +| _optional_ | _optional_ | **`parent-snapshot-id`** | The snapshot ID of the snapshot's parent. 
Omitted for any snapshot with no parent | +| | _required_ | **`sequence-number`** | A monotonically increasing long that tracks the order of changes to a table | +| _required_ | _required_ | **`timestamp-ms`** | A timestamp when the snapshot was created, used for garbage collection and table inspection | +| _optional_ | _required_ | **`manifest-list`** | The location of a manifest list for this snapshot that tracks manifest files with additional metadata | +| _optional_ | | **`manifests`** | A list of manifest file locations. Must be omitted if `manifest-list` is present | +| _optional_ | _required_ | **`summary`** | A string map that summarizes the snapshot changes, including `operation` (see below) | +| _optional_ | _optional_ | **`schema-id`** | ID of the table's current schema when the snapshot was created | + +The snapshot summary's `operation` field is used by some operations, like snapshot expiration, to skip processing certain snapshots. Possible `operation` values are: + +* `append` -- Only data files were added and no files were removed. +* `replace` -- Data and delete files were added and removed without changing table data; i.e., compaction, changing the data file format, or relocating data files. +* `overwrite` -- Data and delete files were added and removed in a logical overwrite operation. +* `delete` -- Data files were removed and their contents logically deleted and/or delete files were added to delete rows. + +Data and delete files for a snapshot can be stored in more than one manifest. This enables: + +* Appends can add a new manifest to minimize the amount of data written, instead of adding new records by rewriting and appending to an existing manifest. (This is called a “fast append”.) +* Tables can use multiple partition specs. A table’s partition configuration can evolve if, for example, its data volume changes. 
Each manifest uses a single partition spec, and queries do not need to change because partition filters are derived from data predicates. +* Large tables can be split across multiple manifests so that implementations can parallelize job planning or reduce the cost of rewriting a manifest. + +Manifests for a snapshot are tracked by a manifest list. + +Valid snapshots are stored as a list in table metadata. For serialization, see Appendix C. + + +#### Manifest Lists + +Snapshots are embedded in table metadata, but the list of manifests for a snapshot is stored in a separate manifest list file. + +A new manifest list is written for each attempt to commit a snapshot because the list of manifests always changes to produce a new snapshot. When a manifest list is written, the (optimistic) sequence number of the snapshot is written for all new manifest files tracked by the list. + +A manifest list includes summary metadata that can be used to avoid scanning all of the manifests in a snapshot when planning a table scan. This includes the number of added, existing, and deleted files, and a summary of values for each field of the partition spec used to write the manifest. + +A manifest list is a valid Iceberg data file: files must use valid Iceberg formats, schemas, and column projection. 
+ +Manifest list files store `manifest_file`, a struct with the following fields: + +| v1 | v2 | Field id, name | Type | Description | +| ---------- | ---------- |--------------------------------|---------------------------------------------|-------------| +| _required_ | _required_ | **`500 manifest_path`** | `string` | Location of the manifest file | +| _required_ | _required_ | **`501 manifest_length`** | `long` | Length of the manifest file in bytes | +| _required_ | _required_ | **`502 partition_spec_id`** | `int` | ID of a partition spec used to write the manifest; must be listed in table metadata `partition-specs` | +| | _required_ | **`517 content`** | `int` with meaning: `0: data`, `1: deletes` | The type of files tracked by the manifest, either data or delete files; 0 for all v1 manifests | +| | _required_ | **`515 sequence_number`** | `long` | The sequence number when the manifest was added to the table; use 0 when reading v1 manifest lists | +| | _required_ | **`516 min_sequence_number`** | `long` | The minimum data sequence number of all live data or delete files in the manifest; use 0 when reading v1 manifest lists | +| _required_ | _required_ | **`503 added_snapshot_id`** | `long` | ID of the snapshot where the manifest file was added | +| _optional_ | _required_ | **`504 added_files_count`** | `int` | Number of entries in the manifest that have status `ADDED` (1), when `null` this is assumed to be non-zero | +| _optional_ | _required_ | **`505 existing_files_count`** | `int` | Number of entries in the manifest that have status `EXISTING` (0), when `null` this is assumed to be non-zero | +| _optional_ | _required_ | **`506 deleted_files_count`** | `int` | Number of entries in the manifest that have status `DELETED` (2), when `null` this is assumed to be non-zero | +| _optional_ | _required_ | **`512 added_rows_count`** | `long` | Number of rows in all files in the manifest that have status `ADDED`, when `null` this is assumed to be non-zero | +| 
_optional_ | _required_ | **`513 existing_rows_count`** | `long` | Number of rows in all files in the manifest that have status `EXISTING`, when `null` this is assumed to be non-zero | +| _optional_ | _required_ | **`514 deleted_rows_count`** | `long` | Number of rows in all files in the manifest that have status `DELETED`, when `null` this is assumed to be non-zero | +| _optional_ | _optional_ | **`507 partitions`** | `list<508: field_summary>` (see below) | A list of field summaries for each partition field in the spec. Each field in the list corresponds to a field in the manifest file’s partition spec. | +| _optional_ | _optional_ | **`519 key_metadata`** | `binary` | Implementation-specific key metadata for encryption | + +`field_summary` is a struct with the following fields: + +| v1 | v2 | Field id, name | Type | Description | +| ---------- | ---------- |-------------------------|---------------|-------------| +| _required_ | _required_ | **`509 contains_null`** | `boolean` | Whether the manifest contains at least one partition with a null value for the field | +| _optional_ | _optional_ | **`518 contains_nan`** | `boolean` | Whether the manifest contains at least one partition with a NaN value for the field | +| _optional_ | _optional_ | **`510 lower_bound`** | `bytes` [1] | Lower bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] | +| _optional_ | _optional_ | **`511 upper_bound`** | `bytes` [1] | Upper bound for the non-null, non-NaN values in the partition field, or null if all values are null or NaN [2] | + +Notes: + +1. Lower and upper bounds are serialized to bytes using the single-object serialization in Appendix D. The type used to encode the value is the type of the partition field data. +2. If -0.0 is a value of the partition field, the `lower_bound` must not be +0.0, and if +0.0 is a value of the partition field, the `upper_bound` must not be -0.0. 
+ +#### Scan Planning + +Scans are planned by reading the manifest files for the current snapshot. Deleted entries in data and delete manifests (those marked with status "DELETED") are not used in a scan. + +Manifests that contain no matching files, determined using either file counts or partition summaries, may be skipped. + +For each manifest, scan predicates, which filter data rows, are converted to partition predicates, which filter data and delete files. These partition predicates are used to select the data and delete files in the manifest. This conversion uses the partition spec used to write the manifest file. + +Scan predicates are converted to partition predicates using an _inclusive projection_: if a scan predicate matches a row, then the partition predicate must match that row’s partition. This is called _inclusive_ [1] because rows that do not match the scan predicate may be included in the scan by the partition predicate. + +For example, an `events` table with a timestamp column named `ts` that is partitioned by `ts_day=day(ts)` is queried by users with ranges over the timestamp column: `ts > X`. The inclusive projection is `ts_day >= day(X)`, which is used to select files that may have matching rows. Note that, in most cases, timestamps just before `X` will be included in the scan because the file contains rows that match the predicate and rows that do not match the predicate. + +Scan predicates are also used to filter data and delete files using column bounds and counts that are stored by field id in manifests. The same filter logic can be used for both data and delete files because both store metrics of the rows either inserted or deleted. If metrics show that a delete file has no rows that match a scan predicate, it may be ignored just as a data file would be ignored [2]. + +Data files that match the query filter must be read by the scan. 
+ +Note that for any snapshot, all file paths marked with "ADDED" or "EXISTING" may appear at most once across all manifest files in the snapshot. If a file path appears more than once, the results of the scan are undefined. Reader implementations may raise an error in this case, but are not required to do so. + + +Delete files that match the query filter must be applied to data files at read time, limited by the scope of the delete file using the following rules. + +* A _position_ delete file must be applied to a data file when all of the following are true: + - The data file's data sequence number is _less than or equal to_ the delete file's data sequence number + - The data file's partition (both spec and partition values) is equal to the delete file's partition +* An _equality_ delete file must be applied to a data file when all of the following are true: + - The data file's data sequence number is _strictly less than_ the delete's data sequence number + - The data file's partition (both spec and partition values) is equal to the delete file's partition _or_ the delete file's partition spec is unpartitioned + +In general, deletes are applied only to data files that are older and in the same partition, except for two special cases: + +* Equality delete files stored with an unpartitioned spec are applied as global deletes. Otherwise, delete files do not apply to files in other partitions. +* Position delete files must be applied to data files from the same commit, when the data and delete file data sequence numbers are equal. This allows deleting rows that were added in the same commit. + + +Notes: + +1. An alternative, *strict projection*, creates a partition predicate that will match a file if all of the rows in the file must match the scan predicate. These projections are used to calculate the residual predicates for each file in a scan. +2. 
For example, if `file_a` has rows with `id` between 1 and 10 and a delete file contains rows with `id` between 1 and 4, a scan for `id = 9` may ignore the delete file because none of the deletes can match a row that will be selected. + +#### Snapshot Reference + +Iceberg tables keep track of branches and tags using snapshot references. +Tags are labels for individual snapshots. Branches are mutable named references that can be updated by committing a new snapshot as the branch's referenced snapshot using the [Commit Conflict Resolution and Retry](#commit-conflict-resolution-and-retry) procedures. + +The snapshot reference object records all the information of a reference including snapshot ID, reference type and [Snapshot Retention Policy](#snapshot-retention-policy). + +| v1 | v2 | Field name | Type | Description | +| ---------- | ---------- | ---------------------------- | --------- | ----------- | +| _required_ | _required_ | **`snapshot-id`** | `long` | A reference's snapshot ID. The tagged snapshot or latest snapshot of a branch. | +| _required_ | _required_ | **`type`** | `string` | Type of the reference, `tag` or `branch` | +| _optional_ | _optional_ | **`min-snapshots-to-keep`** | `int` | For `branch` type only, a positive number for the minimum number of snapshots to keep in a branch while expiring snapshots. Defaults to table property `history.expire.min-snapshots-to-keep`. | +| _optional_ | _optional_ | **`max-snapshot-age-ms`** | `long` | For `branch` type only, a positive number for the max age of snapshots to keep when expiring, including the latest snapshot. Defaults to table property `history.expire.max-snapshot-age-ms`. | +| _optional_ | _optional_ | **`max-ref-age-ms`** | `long` | For snapshot references except the `main` branch, a positive number for the max age of the snapshot reference to keep while expiring snapshots. Defaults to table property `history.expire.max-ref-age-ms`. The `main` branch never expires. 
| + +Valid snapshot references are stored as the values of the `refs` map in table metadata. For serialization, see Appendix C. + +#### Snapshot Retention Policy + +Table snapshots expire and are removed from metadata to allow removed or replaced data files to be physically deleted. +The snapshot expiration procedure removes snapshots from table metadata and applies the table's retention policy. +Retention policy can be configured both globally and on snapshot reference through properties `min-snapshots-to-keep`, `max-snapshot-age-ms` and `max-ref-age-ms`. + +When expiring snapshots, retention policies in table and snapshot references are evaluated in the following way: + +1. Start with an empty set of snapshots to retain +2. Remove any refs (other than main) where the referenced snapshot is older than `max-ref-age-ms` +3. For each branch and tag, add the referenced snapshot to the retained set +4. For each branch, add its ancestors to the retained set until: + 1. The snapshot is older than `max-snapshot-age-ms`, AND + 2. The snapshot is not one of the first `min-snapshots-to-keep` in the branch (including the branch's referenced snapshot) +5. Expire any snapshot not in the set of snapshots to retain. + +### Table Metadata + +Table metadata is stored as JSON. Each table metadata change creates a new table metadata file that is committed by an atomic operation. This operation is used to ensure that a new version of table metadata replaces the version on which it was based. This produces a linear history of table versions and ensures that concurrent writes are not lost. + +The atomic operation used to commit metadata depends on how tables are tracked and is not standardized by this spec. See the sections below for examples. 
+ +#### Table Metadata Fields + +Table metadata consists of the following fields: + +| v1 | v2 | Field | Description | +| ---------- | ---------- | ----- | ----------- | +| _required_ | _required_ | **`format-version`** | An integer version number for the format. Currently, this can be 1 or 2 based on the spec. Implementations must throw an exception if a table's version is higher than the supported version. | +| _optional_ | _required_ | **`table-uuid`** | A UUID that identifies the table, generated when the table is created. Implementations must throw an exception if a table's UUID does not match the expected UUID after refreshing metadata. | +| _required_ | _required_ | **`location`**| The table's base location. This is used by writers to determine where to store data files, manifest files, and table metadata files. | +| | _required_ | **`last-sequence-number`**| The table's highest assigned sequence number, a monotonically increasing long that tracks the order of snapshots in a table. | +| _required_ | _required_ | **`last-updated-ms`**| Timestamp in milliseconds from the unix epoch when the table was last updated. Each table metadata file should update this field just before writing. | +| _required_ | _required_ | **`last-column-id`**| An integer; the highest assigned column ID for the table. This is used to ensure columns are always assigned an unused ID when evolving schemas. | +| _required_ | | **`schema`**| The table’s current schema. (**Deprecated**: use `schemas` and `current-schema-id` instead) | +| _optional_ | _required_ | **`schemas`**| A list of schemas, stored as objects with `schema-id`. | +| _optional_ | _required_ | **`current-schema-id`**| ID of the table's current schema. | +| _required_ | | **`partition-spec`**| The table’s current partition spec, stored as only fields. Note that this is used by writers to partition data, but is not used when reading because reads use the specs stored in manifest files. 
(**Deprecated**: use `partition-specs` and `default-spec-id` instead) | +| _optional_ | _required_ | **`partition-specs`**| A list of partition specs, stored as full partition spec objects. | +| _optional_ | _required_ | **`default-spec-id`**| ID of the "current" spec that writers should use by default. | +| _optional_ | _required_ | **`last-partition-id`**| An integer; the highest assigned partition field ID across all partition specs for the table. This is used to ensure partition fields are always assigned an unused ID when evolving specs. | +| _optional_ | _optional_ | **`properties`**| A string to string map of table properties. This is used to control settings that affect reading and writing and is not intended to be used for arbitrary metadata. For example, `commit.retry.num-retries` is used to control the number of commit retries. | +| _optional_ | _optional_ | **`current-snapshot-id`**| `long` ID of the current table snapshot; must be the same as the current ID of the `main` branch in `refs`. | +| _optional_ | _optional_ | **`snapshots`**| A list of valid snapshots. Valid snapshots are snapshots for which all data files exist in the file system. A data file must not be deleted from the file system until the last snapshot in which it was listed is garbage collected. | +| _optional_ | _optional_ | **`snapshot-log`**| A list (optional) of timestamp and snapshot ID pairs that encodes changes to the current snapshot for the table. Each time the current-snapshot-id is changed, a new entry should be added with the last-updated-ms and the new current-snapshot-id. When snapshots are expired from the list of valid snapshots, all entries before a snapshot that has expired should be removed. | +| _optional_ | _optional_ | **`metadata-log`**| A list (optional) of timestamp and metadata file location pairs that encodes changes to the previous metadata files for the table. 
Each time a new metadata file is created, a new entry of the previous metadata file location should be added to the list. Tables can be configured to remove oldest metadata log entries and keep a fixed-size log of the most recent entries after a commit. | +| _optional_ | _required_ | **`sort-orders`**| A list of sort orders, stored as full sort order objects. | +| _optional_ | _required_ | **`default-sort-order-id`**| Default sort order id of the table. Note that this could be used by writers, but is not used when reading because reads use the specs stored in manifest files. | +| | _optional_ | **`refs`** | A map of snapshot references. The map keys are the unique snapshot reference names in the table, and the map values are snapshot reference objects. There is always a `main` branch reference pointing to the `current-snapshot-id` even if the `refs` map is null. | +| _optional_ | _optional_ | **`statistics`** | A list (optional) of [table statistics](#table-statistics). | + +For serialization details, see Appendix C. + +#### Table statistics + +Table statistics files are valid [Puffin files](puffin-spec.md). Statistics are informational. A reader can choose to +ignore statistics information. Statistics support is not required to read the table correctly. A table can contain +many statistics files associated with different table snapshots. + +Statistics files metadata within `statistics` table metadata field is a struct with the following fields: + +| v1 | v2 | Field name | Type | Description | +|----|----|------------|------|-------------| +| _required_ | _required_ | **`snapshot-id`** | `string` | ID of the Iceberg table's snapshot the statistics file is associated with. | +| _required_ | _required_ | **`statistics-path`** | `string` | Path of the statistics file. See [Puffin file format](puffin-spec.md). | +| _required_ | _required_ | **`file-size-in-bytes`** | `long` | Size of the statistics file. 
| +| _required_ | _required_ | **`file-footer-size-in-bytes`** | `long` | Total size of the statistics file's footer (not the footer payload size). See [Puffin file format](puffin-spec.md) for footer definition. | +| _optional_ | _optional_ | **`key-metadata`** | `string` | Base64-encoded implementation-specific key metadata for encryption. | +| _required_ | _required_ | **`blob-metadata`** | `list<blob metadata>` (see below) | A list of the blob metadata for statistics contained in the file with structure described below. | + +Blob metadata is a struct with the following fields: + +| v1 | v2 | Field name | Type | Description | +|----|----|------------|------|-------------| +| _required_ | _required_ | **`type`** | `string` | Type of the blob. Matches Blob type in the Puffin file. | +| _required_ | _required_ | **`snapshot-id`** | `long` | ID of the Iceberg table's snapshot the blob was computed from. | +| _required_ | _required_ | **`sequence-number`** | `long` | Sequence number of the Iceberg table's snapshot the blob was computed from. | +| _required_ | _required_ | **`fields`** | `list<integer>` | Ordered list of fields, given by field ID, on which the statistic was calculated. | +| _optional_ | _optional_ | **`properties`** | `map<string, string>` | Additional properties associated with the statistic. Subset of Blob properties in the Puffin file. | + + +#### Commit Conflict Resolution and Retry + +When two commits happen at the same time and are based on the same version, only one commit will succeed. In most cases, the failed commit can be applied to the new current version of table metadata and retried. Updates verify the conditions under which they can be applied to a new version and retry if those conditions are met. + +* Append operations have no requirements and can always be applied. +* Replace operations must verify that the files that will be deleted are still in the table. 
Examples of replace operations include format changes (replace an Avro file with a Parquet file) and compactions (several files are replaced with a single file that contains the same rows). +* Delete operations must verify that specific files to delete are still in the table. Delete operations based on expressions can always be applied (e.g., where timestamp < X). +* Table schema updates and partition spec changes must validate that the schema has not changed between the base version and the current version. + + +#### File System Tables + +An atomic swap can be implemented using atomic rename in file systems that support it, like HDFS or most local file systems [1]. + +Each version of table metadata is stored in a metadata folder under the table’s base location using a file naming scheme that includes a version number, `V`: `v<V>.metadata.json`. To commit a new metadata version, `V+1`, the writer performs the following steps: + +1. Read the current table metadata version `V`. +2. Create new table metadata based on version `V`. +3. Write the new table metadata to a unique file: `<random-uuid>.metadata.json`. +4. Rename the unique file to the well-known file for version `V+1`: `v<V+1>.metadata.json`. + 1. If the rename succeeds, the commit succeeded and `V+1` is the table’s current version + 2. If the rename fails, go back to step 1. + +Notes: + +1. The file system table scheme is implemented in [HadoopTableOperations](javadoc/latest/index.html?org/apache/iceberg/hadoop/HadoopTableOperations.html). + +#### Metastore Tables + +The atomic swap needed to commit new versions of table metadata can be implemented by storing a pointer in a metastore or database that is updated with a check-and-put operation [1]. The check-and-put validates that the version of the table that a write is based on is still current and then makes the new metadata from the write the current version. 
+ +Each version of table metadata is stored in a metadata folder under the table’s base location using a naming scheme that includes a version and UUID: `<V>-<random-uuid>.metadata.json`. To commit a new metadata version, `V+1`, the writer performs the following steps: + +1. Create a new table metadata file based on the current metadata. +2. Write the new table metadata to a unique file: `<V+1>-<random-uuid>.metadata.json`. +3. Request that the metastore swap the table’s metadata pointer from the location of `V` to the location of `V+1`. + 1. If the swap succeeds, the commit succeeded. `V` was still the latest metadata version and the metadata file for `V+1` is now the current metadata. + 2. If the swap fails, another writer has already created `V+1`. The current writer goes back to step 1. + +Notes: + +1. The metastore table scheme is partly implemented in [BaseMetastoreTableOperations](javadoc/latest/index.html?org/apache/iceberg/BaseMetastoreTableOperations.html). + + +### Delete Formats + +This section details how to encode row-level deletes in Iceberg delete files. Row-level deletes are not supported in v1. + +Row-level delete files are valid Iceberg data files: files must use valid Iceberg formats, schemas, and column projection. It is recommended that delete files are written using the table's default file format. + +Row-level delete files are tracked by manifests, like data files. A separate set of manifests is used for delete files, but the manifest schemas are identical. + +Both position and equality deletes allow encoding deleted row values with a delete. This can be used to reconstruct a stream of changes to a table. + + +#### Position Delete Files + +Position-based delete files identify deleted rows by file and position in one or more data files, and may optionally contain the deleted row. + +A data row is deleted if there is an entry in a position delete file for the row's file and position in the data file, starting at 0. 
+ +Position-based delete files store `file_position_delete`, a struct with the following fields: + +| Field id, name | Type | Description | +|-----------------------------|----------------------------|-------------| +| **`2147483546 file_path`** | `string` | Full URI of a data file with FS scheme. This must match the `file_path` of the target data file in a manifest entry | +| **`2147483545 pos`** | `long` | Ordinal position of a deleted row in the target data file identified by `file_path`, starting at `0` | +| **`2147483544 row`** | `required struct<...>` [1] | Deleted row values. Omit the column when not storing deleted rows. | + +1. When present in the delete file, `row` is required because all delete entries must include the row values. + +When the deleted row column is present, its schema may be any subset of the table schema and must use field ids matching the table. + +To ensure the accuracy of statistics, all delete entries must include row values, or the column must be omitted (this is why the column type is `required`). + +The rows in the delete file must be sorted by `file_path` then `pos` to optimize filtering rows while scanning. + +* Sorting by `file_path` allows filter pushdown by file in columnar storage formats. +* Sorting by `pos` allows filtering rows while scanning, to avoid keeping deletes in memory. + +#### Equality Delete Files + +Equality delete files identify deleted rows in a collection of data files by one or more column values, and may optionally contain additional columns of the deleted row. + +Equality delete files store any subset of a table's columns and use the table's field ids. The _delete columns_ are the columns of the delete file used to match data rows. Delete columns are identified by id in the delete file [metadata column `equality_ids`](#manifests). Float and double columns cannot be used as delete columns in equality delete files. 
+ +A data row is deleted if its values are equal to all delete columns for any row in an equality delete file that applies to the row's data file (see [`Scan Planning`](#scan-planning)). + +Each row of the delete file produces one equality predicate that matches any row where the delete columns are equal. Multiple columns can be thought of as an `AND` of equality predicates. A `null` value in a delete column matches a row if the row's value is `null`, equivalent to `col IS NULL`. + +For example, a table with the following data: + +```text + 1: id | 2: category | 3: name +-------|-------------|--------- + 1 | marsupial | Koala + 2 | toy | Teddy + 3 | NULL | Grizzly + 4 | NULL | Polar +``` + +The delete `id = 3` could be written as either of the following equality delete files: + +```text +equality_ids=[1] + + 1: id +------- + 3 +``` + +```text +equality_ids=[1] + + 1: id | 2: category | 3: name +-------|-------------|--------- + 3 | NULL | Grizzly +``` + +The delete `id = 4 AND category IS NULL` could be written as the following equality delete file: + +```text +equality_ids=[1, 2] + + 1: id | 2: category | 3: name +-------|-------------|--------- + 4 | NULL | Polar +``` + +If a delete column in an equality delete file is later dropped from the table, it must still be used when applying the equality deletes. If a column was added to a table and later used as a delete column in an equality delete file, the column value is read for older data files using normal projection rules (defaults to `null`). + + +#### Delete File Stats + +Manifests hold the same statistics for delete files and data files. For delete files, the metrics describe the values that were deleted. + + +## Appendix A: Format-specific Requirements + + +### Avro + +**Data Type Mappings** + +Values should be stored in Avro using the Avro types and logical type annotations in the table below. + +Optional fields, array elements, and map values must be wrapped in an Avro `union` with `null`. 
This is the only union type allowed in Iceberg data files. + +Optional fields must always set the Avro field default value to null. + +Maps with non-string keys must use an array representation with the `map` logical type. The array representation or Avro’s map type may be used for maps with string keys. + +|Type|Avro type|Notes| +|--- |--- |--- | +|**`boolean`**|`boolean`|| +|**`int`**|`int`|| +|**`long`**|`long`|| +|**`float`**|`float`|| +|**`double`**|`double`|| +|**`decimal(P,S)`**|`{ "type": "fixed",`
  `"size": minBytesRequired(P),`
  `"logicalType": "decimal",`
  `"precision": P,`
  `"scale": S }`|Stored as fixed using the minimum number of bytes for the given precision.| +|**`date`**|`{ "type": "int",`
  `"logicalType": "date" }`|Stores days from the 1970-01-01.| +|**`time`**|`{ "type": "long",`
  `"logicalType": "time-micros" }`|Stores microseconds from midnight.| +|**`timestamp`**|`{ "type": "long",`
  `"logicalType": "timestamp-micros",`
  `"adjust-to-utc": false }`|Stores microseconds from 1970-01-01 00:00:00.000000.| +|**`timestamptz`**|`{ "type": "long",`
  `"logicalType": "timestamp-micros",`
  `"adjust-to-utc": true }`|Stores microseconds from 1970-01-01 00:00:00.000000 UTC.| +|**`string`**|`string`|| +|**`uuid`**|`{ "type": "fixed",`
  `"size": 16,`
  `"logicalType": "uuid" }`|| +|**`fixed(L)`**|`{ "type": "fixed",`
  `"size": L }`|| +|**`binary`**|`bytes`|| +|**`struct`**|`record`|| +|**`list`**|`array`|| +|**`map`**|`array` of key-value records, or `map` when keys are strings (optional).|Array storage must use logical type name `map` and must store elements that are 2-field records. The first field is a non-null key and the second field is the value.| + + +**Field IDs** + +Iceberg struct, list, and map types identify nested types by ID. When writing data to Avro files, these IDs must be stored in the Avro schema to support ID-based column pruning. + +IDs are stored as JSON integers in the following locations: + +|ID|Avro schema location|Property|Example| +|--- |--- |--- |--- | +|**Struct field**|Record field object|`field-id`|`{ "type": "record", ...`
  `"fields": [`
    `{ "name": "l",`
      `"type": ["null", "long"],`
      `"default": null,`
      `"field-id": 8 }`
  `] }`| +|**List element**|Array schema object|`element-id`|`{ "type": "array",`
  `"items": "int",`
  `"element-id": 9 }`| +|**String map key**|Map schema object|`key-id`|`{ "type": "map",`
  `"values": "int",`
  `"key-id": 10,`
  `"value-id": 11 }`| +|**String map value**|Map schema object|`value-id`|| +|**Map key, value**|Key, value fields in the element record.|`field-id`|`{ "type": "array",`
  `"logicalType": "map",`
  `"items": {`
    `"type": "record",`
    `"name": "k12_v13",`
    `"fields": [`
      `{ "name": "key",`
        `"type": "int",`
        `"field-id": 12 },`
      `{ "name": "value",`
        `"type": "string",`
        `"field-id": 13 }`
    `] } }`| + +Note that the string map case is for maps where the key type is a string. Using Avro’s map type in this case is optional. Maps with string keys may be stored as arrays. + + +### Parquet + +**Data Type Mappings** + +Values should be stored in Parquet using the types and logical type annotations in the table below. Column IDs are required. + +Lists must use the [3-level representation](https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#lists). + +| Type | Parquet physical type | Logical type | Notes | +|--------------------|--------------------------------------------------------------------|---------------------------------------------|----------------------------------------------------------------| +| **`boolean`** | `boolean` | | | +| **`int`** | `int` | | | +| **`long`** | `long` | | | +| **`float`** | `float` | | | +| **`double`** | `double` | | | +| **`decimal(P,S)`** | `P <= 9`: `int32`,
`P <= 18`: `int64`,
`fixed` otherwise | `DECIMAL(P,S)` | Fixed must use the minimum number of bytes that can store `P`. | +| **`date`** | `int32` | `DATE` | Stores days from the 1970-01-01. | +| **`time`** | `int64` | `TIME_MICROS` with `adjustToUtc=false` | Stores microseconds from midnight. | +| **`timestamp`** | `int64` | `TIMESTAMP_MICROS` with `adjustToUtc=false` | Stores microseconds from 1970-01-01 00:00:00.000000. | +| **`timestamptz`** | `int64` | `TIMESTAMP_MICROS` with `adjustToUtc=true` | Stores microseconds from 1970-01-01 00:00:00.000000 UTC. | +| **`string`** | `binary` | `UTF8` | Encoding must be UTF-8. | +| **`uuid`** | `fixed_len_byte_array[16]` | `UUID` | | +| **`fixed(L)`** | `fixed_len_byte_array[L]` | | | +| **`binary`** | `binary` | | | +| **`struct`** | `group` | | | +| **`list`** | `3-level list` | `LIST` | See Parquet docs for 3-level representation. | +| **`map`** | `3-level map` | `MAP` | See Parquet docs for 3-level representation. | + + +### ORC + +**Data Type Mappings** + +| Type | ORC type | ORC type attributes | Notes | +|--------------------|---------------------|------------------------------------------------------|-----------------------------------------------------------------------------------------| +| **`boolean`** | `boolean` | | | +| **`int`** | `int` | | ORC `tinyint` and `smallint` would also map to **`int`**. | +| **`long`** | `long` | | | +| **`float`** | `float` | | | +| **`double`** | `double` | | | +| **`decimal(P,S)`** | `decimal` | | | +| **`date`** | `date` | | | +| **`time`** | `long` | `iceberg.long-type`=`TIME` | Stores microseconds from midnight. | +| **`timestamp`** | `timestamp` | | [1] | +| **`timestamptz`** | `timestamp_instant` | | [1] | +| **`string`** | `string` | | ORC `varchar` and `char` would also map to **`string`**. 
| +| **`uuid`** | `binary` | `iceberg.binary-type`=`UUID` | | +| **`fixed(L)`** | `binary` | `iceberg.binary-type`=`FIXED` & `iceberg.length`=`L` | The length would not be checked by the ORC reader and should be checked by the adapter. | +| **`binary`** | `binary` | | | +| **`struct`** | `struct` | | | +| **`list`** | `array` | | | +| **`map`** | `map` | | | + +Notes: + +1. ORC's [TimestampColumnVector](https://orc.apache.org/api/hive-storage-api/org/apache/hadoop/hive/ql/exec/vector/TimestampColumnVector.html) consists of a time field (milliseconds since epoch) and a nanos field (nanoseconds within the second). Hence the milliseconds within the second are reported twice; once in the time field and again in the nanos field. The read adapter should only use milliseconds within the second from one of these fields. The write adapter should also report milliseconds within the second twice; once in the time field and again in the nanos field. ORC writer is expected to correctly consider millis information from one of the fields. More details at https://issues.apache.org/jira/browse/ORC-546 + +One of the interesting challenges with this is how to map Iceberg’s schema evolution (id based) on to ORC’s (name based). In theory, we could use Iceberg’s column ids as the column and field names, but that would be inconvenient. + +The column IDs must be stored in ORC type attributes using the key `iceberg.id`, and `iceberg.required` to store `"true"` if the Iceberg column is required, otherwise it will be optional. + +Iceberg would build the desired reader schema with their schema evolution rules and pass that down to the ORC reader, which would then use its schema evolution to map that to the writer’s schema. Basically, Iceberg would need to change the names of columns and fields to get the desired mapping. 
+ +|Iceberg writer|ORC writer|Iceberg reader|ORC reader| +|--- |--- |--- |--- | +|`struct`|`struct`|`struct`|`struct`| +|`struct>`|`struct>`|`struct>`|`struct>`| + +## Appendix B: 32-bit Hash Requirements + +The 32-bit hash implementation is 32-bit Murmur3 hash, x86 variant, seeded with 0. + +| Primitive type | Hash specification | Test value | +|--------------------|-------------------------------------------|--------------------------------------------| +| **`int`** | `hashLong(long(v))` [1] | `34` → `2017239379` | +| **`long`** | `hashBytes(littleEndianBytes(v))` | `34L` → `2017239379` | +| **`decimal(P,S)`** | `hashBytes(minBigEndian(unscaled(v)))`[2] | `14.20` → `-500754589` | +| **`date`** | `hashInt(daysFromUnixEpoch(v))` | `2017-11-16` → `-653330422` | +| **`time`** | `hashLong(microsecsFromMidnight(v))` | `22:31:08` → `-662762989` | +| **`timestamp`** | `hashLong(microsecsFromUnixEpoch(v))` | `2017-11-16T22:31:08` → `-2047944441` | +| **`timestamptz`** | `hashLong(microsecsFromUnixEpoch(v))` | `2017-11-16T14:31:08-08:00`→ `-2047944441` | +| **`string`** | `hashBytes(utf8Bytes(v))` | `iceberg` → `1210000089` | +| **`uuid`** | `hashBytes(uuidBytes(v))` [3] | `f79c3e09-677c-4bbd-a479-3f349cb785e7` → `1488055340` | +| **`fixed(L)`** | `hashBytes(v)` | `00 01 02 03` → `-188683207` | +| **`binary`** | `hashBytes(v)` | `00 01 02 03` → `-188683207` | + +The types below are not currently valid for bucketing, and so are not hashed. 
However, if that changes and a hash value is needed, the following table shall apply: + +| Primitive type | Hash specification | Test value | +|--------------------|-------------------------------------------|--------------------------------------------| +| **`boolean`** | `false: hashInt(0)`, `true: hashInt(1)` | `true` → `1392991556` | +| **`float`** | `hashLong(doubleToLongBits(double(v)))` [4]| `1.0F` → `-142385009`, `0.0F` → `1669671676`, `-0.0F` → `1669671676` | +| **`double`** | `hashLong(doubleToLongBits(v))` [4]| `1.0D` → `-142385009`, `0.0D` → `1669671676`, `-0.0D` → `1669671676` | + +Notes: + +1. Integer and long hash results must be identical for all integer values. This ensures that schema evolution does not change bucket partition values if integer types are promoted. +2. Decimal values are hashed using the minimum number of bytes required to hold the unscaled value as a two’s complement big-endian; this representation does not include padding bytes required for storage in a fixed-length array. +Hash results are not dependent on decimal scale, which is part of the type, not the data value. +3. UUIDs are encoded using big endian. The test UUID for the example above is: `f79c3e09-677c-4bbd-a479-3f349cb785e7`. This UUID encoded as a byte array is: +`F7 9C 3E 09 67 7C 4B BD A4 79 3F 34 9C B7 85 E7` +4. `doubleToLongBits` must give the IEEE 754 compliant bit representation of the double value. All `NaN` bit patterns must be canonicalized to `0x7ff8000000000000L`. Negative zero (`-0.0`) must be canonicalized to positive zero (`0.0`). Float hash values are the result of hashing the float cast to double to ensure that schema evolution does not change hash values if float types are promoted. 
+ +## Appendix C: JSON serialization + + +### Schemas + +Schemas are serialized as a JSON object with the same fields as a struct in the table below, and the following additional fields: + +| v1 | v2 |Field|JSON representation|Example| +| ---------- | ---------- |--- |--- |--- | +| _optional_ | _required_ |**`schema-id`**|`JSON int`|`0`| +| _optional_ | _optional_ |**`identifier-field-ids`**|`JSON list of ints`|`[1, 2]`| + +Types are serialized according to this table: + +|Type|JSON representation|Example| +|--- |--- |--- | +|**`boolean`**|`JSON string: "boolean"`|`"boolean"`| +|**`int`**|`JSON string: "int"`|`"int"`| +|**`long`**|`JSON string: "long"`|`"long"`| +|**`float`**|`JSON string: "float"`|`"float"`| +|**`double`**|`JSON string: "double"`|`"double"`| +|**`date`**|`JSON string: "date"`|`"date"`| +|**`time`**|`JSON string: "time"`|`"time"`| +|**`timestamp without zone`**|`JSON string: "timestamp"`|`"timestamp"`| +|**`timestamp with zone`**|`JSON string: "timestamptz"`|`"timestamptz"`| +|**`string`**|`JSON string: "string"`|`"string"`| +|**`uuid`**|`JSON string: "uuid"`|`"uuid"`| +|**`fixed(L)`**|`JSON string: "fixed[<L>]"`|`"fixed[16]"`| +|**`binary`**|`JSON string: "binary"`|`"binary"`| +|**`decimal(P, S)`**|`JSON string: "decimal(<P>,<S>)"`|`"decimal(9,2)"`,
`"decimal(9, 2)"`| +|**`struct`**|`JSON object: {`
  `"type": "struct",`
  `"fields": [ {`
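    `"id": <field_id_int>,`
    `"name": <name_string>,`
    `"required": <boolean>,`
    `"type": <type_JSON>,`
    `"doc": <comment_string>,`
    `"initial-default": <JSON encoding of default value>,`
    `"write-default": <JSON encoding of default value>`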
    `}, ...`
  `] }`|`{`
  `"type": "struct",`
  `"fields": [ {`
    `"id": 1,`
    `"name": "id",`
    `"required": true,`
    `"type": "uuid",`
    `"initial-default": "0db3e2a8-9d1d-42b9-aa7b-74ebe558dceb",`
    `"write-default": "ec5911be-b0a7-458c-8438-c9a3e53cffae"`
  `}, {`
    `"id": 2,`
    `"name": "data",`
    `"required": false,`
    `"type": {`
      `"type": "list",`
      `...`
    `}`
  `} ]`
`}`| +|**`list`**|`JSON object: {`
  `"type": "list",`
  `"element-id": <id_int>,`
  `"element-required": <boolean>,`
  `"element": <type_JSON>`
`}`|`{`
  `"type": "list",`
  `"element-id": 3,`
  `"element-required": true,`
  `"element": "string"`
`}`| +|**`map`**|`JSON object: {`
  `"type": "map",`
  `"key-id": <key_id_int>,`
  `"key": <type_JSON>,`
  `"value-id": <val_id_int>,`
  `"value-required": <boolean>,`
  `"value": <type_JSON>`
`}`|`{`
  `"type": "map",`
  `"key-id": 4,`
  `"key": "string",`
  `"value-id": 5,`
  `"value-required": false,`
  `"value": "double"`
`}`| + +Note that default values are serialized using the JSON single-value serialization in [Appendix D](#appendix-d-single-value-serialization). + + +### Partition Specs + +Partition specs are serialized as a JSON object with the following fields: + +|Field|JSON representation|Example| +|--- |--- |--- | +|**`spec-id`**|`JSON int`|`0`| +|**`fields`**|`JSON list: [`
  `<partition field JSON>,`
  `...`
`]`|`[ {`
  `"source-id": 4,`
  `"field-id": 1000,`
  `"name": "ts_day",`
  `"transform": "day"`
`}, {`
  `"source-id": 1,`
  `"field-id": 1001,`
  `"name": "id_bucket",`
  `"transform": "bucket[16]"`
`} ]`| + +Each partition field in the fields list is stored as an object. See the table for more detail: + +|Transform or Field|JSON representation|Example| +|--- |--- |--- | +|**`identity`**|`JSON string: "identity"`|`"identity"`| +|**`bucket[N]`**|`JSON string: "bucket[<N>]"`|`"bucket[16]"`| +|**`truncate[W]`**|`JSON string: "truncate[<W>]"`|`"truncate[20]"`| +|**`year`**|`JSON string: "year"`|`"year"`| +|**`month`**|`JSON string: "month"`|`"month"`| +|**`day`**|`JSON string: "day"`|`"day"`| +|**`hour`**|`JSON string: "hour"`|`"hour"`| +|**`Partition Field`**|`JSON object: {`
  `"source-id": <id int>,`
  `"field-id": <field id int>,`
  `"name": <name string>,`
  `"transform": <transform JSON>`
`}`|`{`
  `"source-id": 1,`
  `"field-id": 1000,`
  `"name": "id_bucket",`
  `"transform": "bucket[16]"`
`}`| + +In some cases partition specs are stored using only the field list instead of the object format that includes the spec ID, like the deprecated `partition-spec` field in table metadata. The object format should be used unless otherwise noted in this spec. + +The `field-id` property was added for each partition field in v2. In v1, the reference implementation assigned field ids sequentially in each spec starting at 1,000. See Partition Evolution for more details. + +### Sort Orders + +Sort orders are serialized as a list of JSON objects, each of which contains the following fields: + +|Field|JSON representation|Example| +|--- |--- |--- | +|**`order-id`**|`JSON int`|`1`| +|**`fields`**|`JSON list: [`
  `<sort field JSON>,`
  `...`
`]`|`[ {`
  ` "transform": "identity",`
  ` "source-id": 2,`
  ` "direction": "asc",`
  ` "null-order": "nulls-first"`
  `}, {`
  ` "transform": "bucket[4]",`
  ` "source-id": 3,`
  ` "direction": "desc",`
  ` "null-order": "nulls-last"`
`} ]`| + +Each sort field in the fields list is stored as an object with the following properties: + +|Field|JSON representation|Example| +|--- |--- |--- | +|**`Sort Field`**|`JSON object: {`
  `"transform": <transform JSON>,`
  `"source-id": <source id int>,`
  `"direction": <direction string>,`
  `"null-order": <null-order string>`
`}`|`{`
  ` "transform": "bucket[4]",`
  ` "source-id": 3,`
  ` "direction": "desc",`
  ` "null-order": "nulls-last"`
`}`| + +The following table describes the possible values for some of the fields within a sort field: + +|Field|JSON representation|Possible values| +|--- |--- |--- | +|**`direction`**|`JSON string`|`"asc", "desc"`| +|**`null-order`**|`JSON string`|`"nulls-first", "nulls-last"`| + + +### Table Metadata and Snapshots + +Table metadata is serialized as a JSON object according to the following table. Snapshots are not serialized separately. Instead, they are stored in the table metadata JSON. + +|Metadata field|JSON representation|Example| +|--- |--- |--- | +|**`format-version`**|`JSON int`|`1`| +|**`table-uuid`**|`JSON string`|`"fb072c92-a02b-11e9-ae9c-1bb7bc9eca94"`| +|**`location`**|`JSON string`|`"s3://b/wh/data.db/table"`| +|**`last-updated-ms`**|`JSON long`|`1515100955770`| +|**`last-column-id`**|`JSON int`|`22`| +|**`schema`**|`JSON schema (object)`|`See above, read schemas instead`| +|**`schemas`**|`JSON schemas (list of objects)`|`See above`| +|**`current-schema-id`**|`JSON int`|`0`| +|**`partition-spec`**|`JSON partition fields (list)`|`See above, read partition-specs instead`| +|**`partition-specs`**|`JSON partition specs (list of objects)`|`See above`| +|**`default-spec-id`**|`JSON int`|`0`| +|**`last-partition-id`**|`JSON int`|`1000`| +|**`properties`**|`JSON object: {`
  `"<key>": "<val>",`
  `...`
`}`|`{`
  `"write.format.default": "avro",`
  `"commit.retry.num-retries": "4"`
`}`| +|**`current-snapshot-id`**|`JSON long`|`3051729675574597004`| +|**`snapshots`**|`JSON list of objects: [ {`
  `"snapshot-id": <id>,`
  `"timestamp-ms": <timestamp-in-ms>,`
  `"summary": {`
    `"operation": <operation>,`
    `... },`
  `"manifest-list": "<location>",`
  `"schema-id": <id>`
  `},`
  `...`
`]`|`[ {`
  `"snapshot-id": 3051729675574597004,`
  `"timestamp-ms": 1515100955770,`
  `"summary": {`
    `"operation": "append"`
  `},`
  `"manifest-list": "s3://b/wh/.../s1.avro",`
  `"schema-id": 0`
`} ]`| +|**`snapshot-log`**|`JSON list of objects: [`
  `{`
  `"snapshot-id": <id>,`
  `"timestamp-ms": <timestamp-in-ms>`
  `},`
  `...`
`]`|`[ {`
  `"snapshot-id": 30517296...,`
  `"timestamp-ms": 1515100...`
`} ]`| +|**`metadata-log`**|`JSON list of objects: [`
  `{`
  `"metadata-file": <metadata-file-location>,`
  `"timestamp-ms": <timestamp-in-ms>`
  `},`
  `...`
`]`|`[ {`
  `"metadata-file": "s3://bucket/.../v1.json",`
  `"timestamp-ms": 1515100...`
`} ]` | +|**`sort-orders`**|`JSON sort orders (list of sort field object)`|`See above`| +|**`default-sort-order-id`**|`JSON int`|`0`| +|**`refs`**|`JSON map with string key and object value:`
`{`
  `"<name>": {`
  `"snapshot-id": <snapshot-id>,`
  `"type": <type>,`
  `"max-ref-age-ms": <max-ref-age-ms>,`
  `...`
  `}`
  `...`
`}`|`{`
  `"test": {`
  `"snapshot-id": 123456789000,`
  `"type": "tag",`
  `"max-ref-age-ms": 10000000`
  `}`
`}`| + +### Name Mapping Serialization + +Name mapping is serialized as a list of field mapping JSON Objects which are serialized as follows: + +|Field mapping field|JSON representation|Example| +|--- |--- |--- | +|**`names`**|`JSON list of strings`|`["latitude", "lat"]`| +|**`field-id`**|`JSON int`|`1`| +|**`fields`**|`JSON field mappings (list of objects)`|`[{ `
  `"field-id": 4,`
  `"names": ["latitude", "lat"]`
`}, {`
  `"field-id": 5,`
  `"names": ["longitude", "long"]`
`}]`| + +Example +```json +[ { "field-id": 1, "names": ["id", "record_id"] }, + { "field-id": 2, "names": ["data"] }, + { "field-id": 3, "names": ["location"], "fields": [ + { "field-id": 4, "names": ["latitude", "lat"] }, + { "field-id": 5, "names": ["longitude", "long"] } + ] } ] +``` + + +## Appendix D: Single-value serialization + +### Binary single-value serialization + +This serialization scheme is for storing single values as individual binary values in the lower and upper bounds maps of manifest files. + +| Type | Binary serialization | +|------------------------------|--------------------------------------------------------------------------------------------------------------| +| **`boolean`** | `0x00` for false, non-zero byte for true | +| **`int`** | Stored as 4-byte little-endian | +| **`long`** | Stored as 8-byte little-endian | +| **`float`** | Stored as 4-byte little-endian | +| **`double`** | Stored as 8-byte little-endian | +| **`date`** | Stores days from 1970-01-01 in a 4-byte little-endian int | +| **`time`** | Stores microseconds from midnight in an 8-byte little-endian long | +| **`timestamp without zone`** | Stores microseconds from 1970-01-01 00:00:00.000000 in an 8-byte little-endian long | +| **`timestamp with zone`** | Stores microseconds from 1970-01-01 00:00:00.000000 UTC in an 8-byte little-endian long | +| **`string`** | UTF-8 bytes (without length) | +| **`uuid`** | 16-byte big-endian value, see example in Appendix B | +| **`fixed(L)`** | Binary value | +| **`binary`** | Binary value (without length) | +| **`decimal(P, S)`** | Stores unscaled value as two’s-complement big-endian binary, using the minimum number of bytes for the value | +| **`struct`** | Not supported | +| **`list`** | Not supported | +| **`map`** | Not supported | + +### JSON single-value serialization + + Single values are serialized as JSON by type according to the following table: + +| Type | JSON representation | Example | Description | +| 
------------------ | ----------------------------------------- | ------------------------------------------ | -- | +| **`boolean`** | **`JSON boolean`** | `true` | | +| **`int`** | **`JSON int`** | `34` | | +| **`long`** | **`JSON long`** | `34` | | +| **`float`** | **`JSON number`** | `1.0` | | +| **`double`** | **`JSON number`** | `1.0` | | +| **`decimal(P,S)`** | **`JSON string`** | `"14.20"`, `"2E+20"` | Stores the string representation of the decimal value, specifically, for values with a positive scale, the number of digits to the right of the decimal point is used to indicate scale, for values with a negative scale, the scientific notation is used and the exponent must equal the negated scale | +| **`date`** | **`JSON string`** | `"2017-11-16"` | Stores ISO-8601 standard date | +| **`time`** | **`JSON string`** | `"22:31:08.123456"` | Stores ISO-8601 standard time with microsecond precision | +| **`timestamp`** | **`JSON string`** | `"2017-11-16T22:31:08.123456"` | Stores ISO-8601 standard timestamp with microsecond precision; must not include a zone offset | +| **`timestamptz`** | **`JSON string`** | `"2017-11-16T22:31:08.123456+00:00"` | Stores ISO-8601 standard timestamp with microsecond precision; must include a zone offset and it must be '+00:00' | +| **`string`** | **`JSON string`** | `"iceberg"` | | +| **`uuid`** | **`JSON string`** | `"f79c3e09-677c-4bbd-a479-3f349cb785e7"` | Stores the lowercase uuid string | +| **`fixed(L)`** | **`JSON string`** | `"000102ff"` | Stored as a hexadecimal string | +| **`binary`** | **`JSON string`** | `"000102ff"` | Stored as a hexadecimal string | +| **`struct`** | **`JSON object by field ID`** | `{"1": 1, "2": "bar"}` | Stores struct fields using the field ID as the JSON field name; field values are stored using this JSON single-value format | +| **`list`** | **`JSON array of values`** | `[1, 2, 3]` | Stores a JSON array of values that are serialized using this JSON single-value format | +| **`map`** | **`JSON 
object of key and value arrays`** | `{ "keys": ["a", "b"], "values": [1, 2] }` | Stores arrays of keys and values; individual keys and values are serialized using this JSON single-value format | + + +## Appendix E: Format version changes + +### Version 3 + +Default values are added to struct fields in v3. +* The `write-default` is a forward-compatible change because it is only used at write time. Old writers will fail because the field is missing. +* Tables with `initial-default` will be read correctly by older readers if `initial-default` is always null for optional fields. Otherwise, old readers will default optional columns with null. Old readers will fail to read required fields which are populated by `initial-default` because that default is not supported. + +### Version 2 + +Writing v1 metadata: + +* Table metadata field `last-sequence-number` should not be written +* Snapshot field `sequence-number` should not be written +* Manifest list field `sequence-number` should not be written +* Manifest list field `min-sequence-number` should not be written +* Manifest list field `content` must be 0 (data) or omitted +* Manifest entry field `sequence_number` should not be written +* Manifest entry field `file_sequence_number` should not be written +* Data file field `content` must be 0 (data) or omitted + +Reading v1 metadata for v2: + +* Table metadata field `last-sequence-number` must default to 0 +* Snapshot field `sequence-number` must default to 0 +* Manifest list field `sequence-number` must default to 0 +* Manifest list field `min-sequence-number` must default to 0 +* Manifest list field `content` must default to 0 (data) +* Manifest entry field `sequence_number` must default to 0 +* Manifest entry field `file_sequence_number` must default to 0 +* Data file field `content` must default to 0 (data) + +Writing v2 metadata: + +* Table metadata JSON: + * `last-sequence-number` was added and is required; default to 0 when reading v1 metadata + * `table-uuid` is now 
required + * `current-schema-id` is now required + * `schemas` is now required + * `partition-specs` is now required + * `default-spec-id` is now required + * `last-partition-id` is now required + * `sort-orders` is now required + * `default-sort-order-id` is now required + * `schema` is no longer required and should be omitted; use `schemas` and `current-schema-id` instead + * `partition-spec` is no longer required and should be omitted; use `partition-specs` and `default-spec-id` instead +* Snapshot JSON: + * `sequence-number` was added and is required; default to 0 when reading v1 metadata + * `manifest-list` is now required + * `manifests` is no longer required and should be omitted; always use `manifest-list` instead +* Manifest list `manifest_file`: + * `content` was added and is required; 0=data, 1=deletes; default to 0 when reading v1 manifest lists + * `sequence_number` was added and is required + * `min_sequence_number` was added and is required + * `added_files_count` is now required + * `existing_files_count` is now required + * `deleted_files_count` is now required + * `added_rows_count` is now required + * `existing_rows_count` is now required + * `deleted_rows_count` is now required +* Manifest key-value metadata: + * `schema-id` is now required + * `partition-spec-id` is now required + * `format-version` is now required + * `content` was added and is required (must be "data" or "deletes") +* Manifest `manifest_entry`: + * `snapshot_id` is now optional to support inheritance + * `sequence_number` was added and is optional, to support inheritance + * `file_sequence_number` was added and is optional, to support inheritance +* Manifest `data_file`: + * `content` was added and is required; 0=data, 1=position deletes, 2=equality deletes; default to 0 when reading v1 manifests + * `equality_ids` was added, to be used for equality deletes only + * `block_size_in_bytes` was removed (breaks v1 reader compatibility) + * `file_ordinal` was removed + * 
`sort_columns` was removed + +Note that these requirements apply when writing data to a v2 table. Tables that are upgraded from v1 may contain metadata that does not follow these requirements. Implementations should remain backward-compatible with v1 metadata requirements. diff --git a/docs-new/home/talks.md b/docs-new/home/talks.md new file mode 100644 index 000000000000..52fcdef089d0 --- /dev/null +++ b/docs-new/home/talks.md @@ -0,0 +1,71 @@ +--- +title: "Talks" +--- + + +## Iceberg Talks + +Here is a list of talks and other videos related to Iceberg. + +### [Eliminating Shuffles in DELETE, UPDATE, MERGE](https://www.youtube.com/watch?v=AIZjy6_K0ws) +**Date**: July 27, 2023, **Authors**: Anton Okolnychyi, Chao Sun + +### [Write Distribution Modes in Apache Iceberg](https://www.youtube.com/watch?v=4bOCDP-rhuM) +**Date**: March 15, 2023, **Author**: Russell Spitzer + +### [Technical Evolution of Apache Iceberg](https://www.youtube.com/watch?v=CHs9_h9VLCs) +**Date**: March 15, 2023, **Author**: Anton Okolnychyi + +### [Iceberg's Best Secret Exploring Metadata Tables](https://www.youtube.com/watch?v=s5eKriX6_EU) +**Date**: January 12, 2023, **Author**: Szehon Ho + +### [Data architecture in 2022](https://www.youtube.com/watch?v=1oXmBbB77ak) +**Date**: May 5, 2022, **Authors**: Ryan Blue + +### [Why You Shouldn’t Care About Iceberg | Tabular](https://www.youtube.com/watch?v=_GW3GYZK66U) +**Date**: March 24, 2022, **Authors**: Ryan Blue + +### [Managing Data Files in Apache Iceberg](https://www.dremio.com/resources/webinars/managing-data-files-in-apache-iceberg/) +**Date**: March 2, 2022, **Author**: Russell Spitzer + +### [Tuning Row-Level Operations in Apache Iceberg](https://www.dremio.com/resources/webinars/tuning-row-level-operations-in-apache-iceberg/) +**Date**: March 2, 2022, **Author**: Anton Okolnychyi + +### [Multi Dimensional Clustering with Z Ordering](https://www.youtube.com/watch?v=YLVkITvF6KU) +**Date**: December 6, 2021, **Author**: Russell Spitzer + 
+### [Expert Roundtable: The Future of Metadata After Hive Metastore](https://www.youtube.com/watch?v=7_Pt1g2x-XE) +**Date**: November 15, 2021, **Authors**: Lior Ebel, Seshu Adunuthula, Ryan Blue & Oz Katz + +### [Presto and Apache Iceberg: Building out Modern Open Data Lakes](https://www.youtube.com/watch?v=OJQHVPChYHw) +**Date**: November 10, 2021, **Authors**: Daniel Weeks, Chunxu Tang + +### [Iceberg Case Studies](https://www.youtube.com/watch?v=Al8feI9QEBc) +**Date**: September 29, 2021, **Authors**: Ryan Blue + +### [Deep Dive into Iceberg SQL Extensions](https://www.dremio.com/resources/webinars/deep-dive-into-iceberg-sql-extensions/) +**Date**: July 13, 2021, **Author**: Anton Okolnychyi + +### [Building efficient and reliable data lakes with Apache Iceberg](https://www.youtube.com/watch?v=QNmSXMQ-gY4) +**Date**: October 21, 2020, **Authors**: Anton Okolnychyi, Vishwa Lakkundi + +### [Spark and Iceberg at Apple's Scale - Leveraging differential files for efficient upserts and deletes](https://www.youtube.com/watch?v=IzkSGKoUxcQ) +**Date**: October 21, 2020, **Authors**: Anton Okolnychyi, Vishwa Lakkundi + +### [Apache Iceberg - A Table Format for Huge Analytic Datasets](https://www.youtube.com/watch?v=mf8Hb0coI6o) +**Date**: October 21, 2020, **Author**: Ryan Blue diff --git a/docs-new/home/terms.md b/docs-new/home/terms.md new file mode 100644 index 000000000000..1ab2e5b30d9d --- /dev/null +++ b/docs-new/home/terms.md @@ -0,0 +1,61 @@ +--- +title: "Terms" +--- + + +# Terms + +### Snapshot + +A **snapshot** is the state of a table at some time. + +Each snapshot lists all of the data files that make up the table's contents at the time of the snapshot. Data files are stored across multiple [manifest](#manifest-file) files, and the manifests for a snapshot are listed in a single [manifest list](#manifest-list) file. + +### Manifest list + +A **manifest list** is a metadata file that lists the [manifests](#manifest-file) that make up a table snapshot. 
+ +Each manifest file in the manifest list is stored with information about its contents, like partition value ranges, used to speed up metadata operations. + +### Manifest file + +A **manifest file** is a metadata file that lists a subset of data files that make up a snapshot. + +Each data file in a manifest is stored with a [partition tuple](#partition-tuple), column-level stats, and summary information used to prune splits during [scan planning](docs/latest/performance.md#scan-planning). + +### Partition spec + +A **partition spec** is a description of how to [partition](docs/latest/partitioning.md) data in a table. + +A spec consists of a list of source columns and transforms. A transform produces a partition value from a source value. For example, `date(ts)` produces the date associated with a timestamp column named `ts`. + +### Partition tuple + +A **partition tuple** is a tuple or struct of partition data stored with each data file. + +All values in a partition tuple are the same for all rows stored in a data file. Partition tuples are produced by transforming values from row data using a partition spec. + +Iceberg stores partition values unmodified, unlike Hive tables that convert values to and from strings in file system paths and keys. + +### Snapshot log (history table) + +The **snapshot log** is a metadata log of how the table's current snapshot has changed over time. + +The log is a list of timestamp and ID pairs: when the current snapshot changed and the snapshot ID the current snapshot was changed to. + +The snapshot log is stored in [table metadata as `snapshot-log`](spec.md#table-metadata-fields). 
diff --git a/docs-new/home/vendors.md b/docs-new/home/vendors.md new file mode 100644 index 000000000000..14554a37b246 --- /dev/null +++ b/docs-new/home/vendors.md @@ -0,0 +1,62 @@ +--- +title: "Vendors" +--- + + +## Vendors Supporting Iceberg Tables + +This page lists some of the vendors that are shipping and supporting Apache Iceberg in their products. + +### [CelerData](https://celerdata.com) + +CelerData provides commercial offerings for [StarRocks](https://www.starrocks.io/), a distributed MPP SQL engine for enterprise analytics on Iceberg. With its fully vectorized technology, local caching, and intelligent materialized view, StarRocks delivers sub-second query latency for both batch and real-time analytics. CelerData offers both an [enterprise deployment](https://celerdata.com/celerdata-enterprise) and a [cloud service](https://celerdata.com/celerdata-cloud) to help customers use StarRocks more smoothly. Learn more about how to query Iceberg with StarRocks [here](https://docs.starrocks.io/en-us/latest/data_source/catalog/iceberg_catalog). 
+ +### [Cloudera](http://cloudera.com) + +Cloudera Data Platform integrates Apache Iceberg to the following components: +* Apache Hive, Apache Impala, and Apache Spark to query Apache Iceberg tables +* Cloudera Data Warehouse service providing access to Apache Iceberg tables through Apache Hive and Apache Impala +* Cloudera Data Engineering service providing access to Apache Iceberg tables through Apache Spark +* The CDP Shared Data Experience (SDX) provides compliance and self-service data access for Apache Iceberg tables +* Hive metastore, which plays a lightweight role in providing the Iceberg Catalog +* Data Visualization to visualize data stored in Apache Iceberg + +https://docs.cloudera.com/cdp-public-cloud/cloud/cdp-iceberg/topics/iceberg-in-cdp.html + +### [Dremio](https://www.dremio.com/) + +With Dremio, an organization can easily build and manage a data lakehouse in which data is stored in open formats like Apache Iceberg and can be processed with Dremio’s interactive SQL query engine and non-Dremio processing engines. [Dremio Cloud](https://www.dremio.com/get-started/) provides these capabilities in a fully managed offering. + +* [Dremio Sonar](https://www.dremio.com/platform/sonar/) is a lakehouse query engine that provides interactive performance and DML on Apache Iceberg, as well as other formats and data sources. +* [Dremio Arctic](https://www.dremio.com/platform/arctic/) is a lakehouse catalog and optimization service for Apache Iceberg. Arctic automatically optimizes tables in the background to ensure high-performance access for any engine. Arctic also simplifies experimentation, data engineering, and data governance by providing Git concepts like branches and tags on Apache Iceberg tables. + +### [IOMETE](https://iomete.com/) + +IOMETE is a fully-managed ready to use, batteries included Data Platform. IOMETE optimizes clustering, compaction, and access control to Apache Iceberg tables. 
Customer data remains on customer's account to prevent vendor lock-in. The core of IOMETE platform is a serverless Lakehouse that leverages Apache Iceberg as its core table format. IOMETE platform also includes Serverless Spark, an SQL Editor, A Data Catalog, and granular data access control. IOMETE supports Hybrid-multi-cloud setups. + +### [Snowflake](http://snowflake.com/) + +[Snowflake](https://www.snowflake.com/data-cloud/) is a single, cross-cloud platform that enables every organization to mobilize their data with Snowflake’s Data Cloud. Snowflake supports Apache Iceberg by offering [native support for Iceberg Tables](https://www.snowflake.com/blog/iceberg-tables-powering-open-standards-with-snowflake-innovations/) for full DML as well as connectors to [External Tables](https://www.snowflake.com/blog/expanding-the-data-cloud-with-apache-iceberg/) for read-only access. + +### [Starburst](http://starburst.io) + +Starburst is a commercial offering for the [Trino query engine](https://trino.io). Trino is a distributed MPP SQL query engine that can query data in Iceberg at interactive speeds. Trino also enables you to join Iceberg tables with an [array of other systems](https://trino.io/docs/current/connector.html). Starburst offers both an [enterprise deployment](https://www.starburst.io/platform/starburst-enterprise/) and a [fully managed service](https://www.starburst.io/platform/starburst-galaxy/) to make managing and scaling Trino a flawless experience. Starburst also provides customer support and houses many of the original contributors to the open-source project that know Trino best. Learn more about [the Starburst Iceberg connector](https://docs.starburst.io/latest/connector/iceberg.html). + +### [Tabular](https://tabular.io) + +[Tabular](https://tabular.io/product/) is a managed warehouse and automation platform. Tabular offers a central store for analytic data that can be used with any query engine or processing framework that supports Iceberg. 
Tabular warehouses add role-based access control and automatic optimization, clustering, and compaction to Iceberg tables. diff --git a/docs-new/home/view-spec.md b/docs-new/home/view-spec.md new file mode 100644 index 000000000000..1a82c1ec9e42 --- /dev/null +++ b/docs-new/home/view-spec.md @@ -0,0 +1,329 @@ +--- +title: "View Spec" +--- + + +# Iceberg View Spec + +## Background and Motivation + +Most compute engines (e.g. Trino and Apache Spark) support views. A view is a logical table that can be referenced by future queries. Views do not contain any data. Instead, the query stored by the view is executed every time the view is referenced by another query. + +Each compute engine stores the metadata of the view in its proprietary format in the metastore of choice. Thus, views created from one engine can not be read or altered easily from another engine even when engines share the metastore as well as the storage system. This document standardizes the view metadata for ease of sharing the views across engines. + +## Goals + +* A common metadata format for view metadata, similar to how Iceberg supports a common table format for tables. + +## Overview + +View metadata storage mirrors how Iceberg table metadata is stored and retrieved. View metadata is maintained in metadata files. All changes to view state create a new view metadata file and completely replace the old metadata using an atomic swap. Like Iceberg tables, this atomic swap is delegated to the metastore that tracks tables and/or views by name. The view metadata file tracks the view schema, custom properties, current and past versions, as well as other metadata. + +Each metadata file is self-sufficient. It contains the history of the last few versions of the view and can be used to roll back the view to a previous version. + +### Metadata Location + +An atomic swap of one view metadata file for another provides the basis for making atomic changes. 
Readers use the version of the view that was current when they loaded the view metadata and are not affected by changes until they refresh and pick up a new metadata location. + +Writers create view metadata files optimistically, assuming that the current metadata location will not be changed before the writer's commit. Once a writer has created an update, it commits by swapping the view's metadata file pointer from the base location to the new location. + +## Specification + +### Terms + +* **Schema** -- Names and types of fields in a view. +* **Version** -- The state of a view at some point in time. + +### View Metadata + +The view version metadata file has the following fields: + +| Requirement | Field name | Description | +|-------------|----------------------|-------------| +| _required_ | `format-version` | An integer version number for the view format; must be 1 | +| _required_ | `location` | The view's base location; used to create metadata file locations | +| _required_ | `current-schema-id` | ID of the current schema of the view, if known | +| _required_ | `schemas` | A list of known schemas | +| _required_ | `current-version-id` | ID of the current version of the view (`version-id`) | +| _required_ | `versions` | A list of known [versions](#versions) of the view [1] | +| _required_ | `version-log` | A list of [version log](#version-log) entries with the timestamp and `version-id` for every change to `current-version-id` | +| _optional_ | `properties` | A string to string map of view properties [2] | + +Notes: +1. The number of versions to retain is controlled by the table property: `version.history.num-entries`. +2. Properties are used for metadata such as `comment` and for settings that affect view maintenance. This is not intended to be used for arbitrary metadata. 
+ +#### Versions + +Each version in `versions` is a struct with the following fields: + +| Requirement | Field name | Description | +|-------------|-------------------|--------------------------------------------------------------------------| +| _required_ | `version-id` | ID for the version | +| _required_ | `schema-id` | ID of the schema for the view version | +| _required_ | `timestamp-ms` | Timestamp when the version was created (ms from epoch) | +| _required_ | `summary` | A string to string map of [summary metadata](#summary) about the version | +| _required_ | `representations` | A list of [representations](#representations) for the view definition | + +#### Summary + +Summary is a string to string map of metadata about a view version. Common metadata keys are documented here. + +| Requirement | Key | Value | +|-------------|------------------|-------| +| _required_ | `operation` | Operation that caused this metadata to be created; must be `create` or `replace` | +| _optional_ | `engine-name` | Name of the engine that created the view version | +| _optional_ | `engine-version` | Version of the engine that created the view version | + +#### Representations + +View definitions can be represented in multiple ways. Representations are documented ways to express a view definition. + +A view version can have more than one representation. All representations for a version must express the same underlying definition. Engines are free to choose the representation to use. + +View versions are immutable. Once a version is created, it cannot be changed. This means that representations for a version cannot be changed. If a view definition changes (or new representations are to be added), a new version must be created. + +Each representation is an object with at least one common field, `type`, that is one of the following: +* `sql`: a SQL SELECT statement that defines the view + +Representations further define metadata for each type. 
+ +##### SQL representation + +The SQL representation stores the view definition as a SQL SELECT, with metadata such as the SQL dialect. + +A view version can have multiple SQL representations of different dialects, but only one SQL representation per dialect. + +| Requirement | Field name | Type | Description | +|-------------|---------------------|----------------|-------------| +| _required_ | `type` | `string` | Must be `sql` | +| _required_ | `sql` | `string` | A SQL SELECT statement | +| _required_ | `dialect` | `string` | The dialect of the `sql` SELECT statement (e.g., "trino" or "spark") | +| _optional_ | `default-catalog` | `string` | Catalog name to use when a reference in the SELECT does not contain a catalog | +| _optional_ | `default-namespace` | `list` | Namespace to use when a reference in the SELECT is a single identifier | +| _optional_ | `field-aliases` | `list` | Column names optionally specified in the create statement | +| _optional_ | `field-comments` | `list` | Column descriptions (COMMENT) optionally specified in the create statement | + +For example: + +```sql +USE prod.default +``` +```sql +CREATE OR REPLACE VIEW event_agg ( + event_count COMMENT 'Count of events', + event_date) AS +SELECT + COUNT(1), CAST(event_ts AS DATE) +FROM events +GROUP BY 2 +``` + +This create statement would produce the following `sql` representation metadata: + +| Field name | Value | +|---------------------|-------| +| `type` | `"sql"` | +| `sql` | `"SELECT\n COUNT(1), CAST(event_ts AS DATE)\nFROM events\nGROUP BY 2"` | +| `dialect` | `"spark"` | +| `default-catalog` | `"prod"` | +| `default-namespace` | `["default"]` | +| `field-aliases` | `["event_count", "event_date"]` | +| `field-comments` | `["Count of events", null]` | + +If a create statement does not include column names or comments before `AS`, the fields should be omitted. + +#### Version log + +The version log tracks changes to the view's current version. 
This is the view's history and allows reconstructing what version of the view would have been used at some point in time. + +Note that this is not the version's creation time, which is stored in each version's metadata. A version can appear multiple times in the version log, indicating that the view definition was rolled back. + +Each entry in `version-log` is a struct with the following fields: + +| Requirement | Field name | Description | +|-------------|----------------|-------------| +| _required_ | `timestamp-ms` | Timestamp when the view's `current-version-id` was updated (ms from epoch) | +| _required_ | `version-id` | ID that `current-version-id` was set to | + +## Appendix A: An Example + +The JSON metadata file format is described using an example below. + +Imagine the following sequence of operations: + +```sql +USE prod.default +``` +```sql +CREATE OR REPLACE VIEW event_agg ( + event_count COMMENT 'Count of events', + event_date) +COMMENT 'Daily event counts' +AS +SELECT + COUNT(1), CAST(event_ts AS DATE) +FROM events +GROUP BY 2 +``` + + +The metadata JSON file created looks as follows. + +The path is intentionally similar to the path for Iceberg tables and uses a `metadata` directory. 
+``` +s3://bucket/warehouse/default.db/event_agg/metadata/00001-(uuid).metadata.json +``` +``` +{ + "format-version" : 1, + "location" : "s3://bucket/warehouse/default.db/event_agg", + "current-version-id" : 1, + "properties" : { + "comment" : "Daily event counts" + }, + "versions" : [ { + "version-id" : 1, + "timestamp-ms" : 1573518431292, + "schema-id" : 1, + "summary" : { + "operation" : "create", + "engine-name" : "Spark", + "engine-version" : "3.3.2" + }, + "representations" : [ { + "type" : "sql", + "sql" : "SELECT\n COUNT(1), CAST(event_ts AS DATE)\nFROM events\nGROUP BY 2", + "dialect" : "spark", + "default-catalog" : "prod", + "default-namespace" : [ "default" ], + "field-aliases" : ["event_count", "event_date"], + "field-comments" : ["Count of events", null] + } ] + } ], + "current-schema-id": 1, + "schemas": [ { + "schema-id": 1, + "type" : "struct", + "fields" : [ { + "id" : 1, + "name" : "col1", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "col2", + "required" : false, + "type" : "date" + } ] + } ], + "version-log" : [ { + "timestamp-ms" : 1573518431292, + "version-id" : 1 + } ] +} +``` + +Each change creates a new metadata JSON file. 
+```sql +USE prod.other_db; +CREATE OR REPLACE VIEW default.event_agg ( + event_count, + event_date) +AS +SELECT + COUNT(1), CAST(event_ts AS DATE) +FROM prod.default.events +GROUP BY 2 +``` + +Updating the view produces a new metadata file that completely replaces the old: + +``` +s3://bucket/warehouse/default.db/event_agg/metadata/00002-(uuid).metadata.json +``` +``` +{ + "format-version" : 1, + "location" : "s3://bucket/warehouse/default.db/event_agg", + "current-version-id" : 2, + "properties" : { + "comment" : "Daily event counts" + }, + "versions" : [ { + "version-id" : 1, + "timestamp-ms" : 1573518431292, + "schema-id" : 1, + "summary" : { + "operation" : "create", + "engine-name" : "Spark", + "engine-version" : "3.3.2" + }, + "representations" : [ { + "type" : "sql", + "sql" : "SELECT\n COUNT(1), CAST(event_ts AS DATE)\nFROM events\nGROUP BY 2", + "dialect" : "spark", + "default-catalog" : "prod", + "default-namespace" : [ "default" ], + "field-aliases" : ["event_count", "event_date"], + "field-comments" : ["Count of events", null] + } ] + }, { + "version-id" : 2, + "timestamp-ms" : 1573518981593, + "schema-id" : 1, + "summary" : { + "operation" : "replace", + "engine-name" : "Spark", + "engine-version" : "3.3.2" + }, + "representations" : [ { + "type" : "sql", + "sql" : "SELECT\n COUNT(1), CAST(event_ts AS DATE)\nFROM prod.default.events\nGROUP BY 2", + "dialect" : "spark", + "default-catalog" : "prod", + "default-namespace" : [ "default" ], + "field-aliases" : ["event_count", "event_date"] + } ] + } ], + "current-schema-id": 1, + "schemas": [ { + "schema-id": 1, + "type" : "struct", + "fields" : [ { + "id" : 1, + "name" : "col1", + "required" : false, + "type" : "int" + }, { + "id" : 2, + "name" : "col2", + "required" : false, + "type" : "date" + } ] + } ], + "version-log" : [ { + "timestamp-ms" : 1573518431292, + "version-id" : 1 + }, { + "timestamp-ms" : 1573518981593, + "version-id" : 2 + } ] +} +``` diff --git a/docs-new/mkdocs.yml b/docs-new/mkdocs.yml new file mode 
100644 index 000000000000..e99a2d62732e --- /dev/null +++ b/docs-new/mkdocs.yml @@ -0,0 +1,96 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +site_name: Apache Iceberg +docs_dir: home + +theme: + name: material + language: en + logo: assets/images/iceberg-logo-icon.png + favicon: assets/images/favicon-96x96.png + features: + - navigation.tabs + - navigation.path + - navigation.top + - navigation.tracking + - toc.follow + - search.suggest + - search.highlight + - content.tabs.link + - content.code.copy + - content.code.annotate + +plugins: + - search + - macros: + include_yaml: + - variables.yml + - monorepo + +nav: + - Quickstart: + - Hive: hive-quickstart.md + - Spark: spark-quickstart.md + - Docs: + - latest: '!include home/docs/latest/mkdocs.yml' + - 1.3.1: '!include home/docs/1.3.1/mkdocs.yml' +# - 1.3.0: '!include home/docs/1.3.0/mkdocs.yml' + - Releases: releases.md + - Roadmap: roadmap.md + - Blogs: blogs.md + - Talks: talks.md + - Vendors: vendors.md + - Project: + - Join: community.md + - Spec: spec.md + - View spec: view-spec.md + - Puffin spec: puffin-spec.md + - Multi-engine support: multi-engine-support.md + - How to release: how-to-release.md + - Terms: terms.md + - ASF: + - ASF: ASF.md + - Sponsorship: 
https://www.apache.org/foundation/sponsorship.html + - Events: https://www.apache.org/events/current-event.html + - License: https://www.apache.org/licenses/ + - Security: https://www.apache.org/security/ + - Sponsors: https://www.apache.org/foundation/thanks.html + +markdown_extensions: + - pymdownx.highlight: + anchor_linenums: true + - pymdownx.inlinehilite + - pymdownx.snippets + - admonition + - pymdownx.arithmatex: + generic: true + - footnotes + - pymdownx.details + - pymdownx.superfences + - pymdownx.tabbed: + alternate_style: true + - pymdownx.mark + - attr_list + - pymdownx.emoji: + emoji_index: !!python/name:materialx.emoji.twemoji + emoji_generator: !!python/name:materialx.emoji.to_svg + - tables + + +copyright: | + Apache Iceberg, Iceberg, Apache, the Apache feather logo, and the Apache Iceberg project logo are
either registered trademarks or trademarks of The Apache Software Foundation. Copyright © 2023
The Apache Software Foundation, Licensed under the
Apache License, Version 2.0.

diff --git a/docs-new/requirements.txt b/docs-new/requirements.txt new file mode 100644 index 000000000000..58e6d4982f00 --- /dev/null +++ b/docs-new/requirements.txt @@ -0,0 +1,23 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +mkdocs-awesome-pages-plugin==2.9.2 +mkdocs-macros-plugin==1.0.4 +mkdocs-material==9.1.21 +mkdocs-material-extensions==1.1.1 +mkdocs-monorepo-plugin==1.0.5 +mkdocs-redirects==1.2.1 diff --git a/docs-new/variables.yml b/docs-new/variables.yml new file mode 100644 index 000000000000..ec52f4e68c84 --- /dev/null +++ b/docs-new/variables.yml @@ -0,0 +1,28 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +site_url: http://localhost:8000 + +extra: + icebergVersion: 1.3.1 + social: + - icon: fontawesome/brands/github-alt + link: https://github.com/apache/iceberg + - icon: fontawesome/brands/youtube + link: https://www.youtube.com/@ApacheIceberg + - icon: fontawesome/brands/slack + link: https://join.slack.com/t/apache-iceberg/shared_invite/zt-1znkcg5zm-7_FE~pcox347XwZE3GNfPg