diff --git a/CHANGELOG.md b/CHANGELOG.md
index 25c4d2a7d30..fa757ea2108 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,306 @@
# Change log
-Generated on 2021-12-07
+Generated on 2022-02-07
+
+## Release 22.02
+
+### Features
+|||
+|:---|:---|
+|[#4305](https://github.com/NVIDIA/spark-rapids/issues/4305)|[FEA] write nvidia tool wrappers to allow old YARN versions to work with MIG|
+|[#4360](https://github.com/NVIDIA/spark-rapids/issues/4360)|[FEA] Add explain api for Spark 2.X|
+|[#3541](https://github.com/NVIDIA/spark-rapids/issues/3541)|[FEA] Support max on single-level struct in aggregation context|
+|[#4238](https://github.com/NVIDIA/spark-rapids/issues/4238)|[FEA] Add a Spark 3.X Explain only mode to the plugin|
+|[#3952](https://github.com/NVIDIA/spark-rapids/issues/3952)|[Audit] [FEA][SPARK-32986][SQL] Add bucketed scan info in query plan of data source v1|
+|[#4412](https://github.com/NVIDIA/spark-rapids/issues/4412)|[FEA] Improve support for \A, \Z, and \z in regular expressions|
+|[#3979](https://github.com/NVIDIA/spark-rapids/issues/3979)|[FEA] Improvements for CPU(Row) based UDF|
+|[#4467](https://github.com/NVIDIA/spark-rapids/issues/4467)|[FEA] Add support for regular expression with repeated digits (`\d+`, `\d*`, `\d?`)|
+|[#4439](https://github.com/NVIDIA/spark-rapids/issues/4439)|[FEA] Enable GPU broadcast exchange reuse for DPP when AQE enabled|
+|[#3512](https://github.com/NVIDIA/spark-rapids/issues/3512)|[FEA] Support org.apache.spark.sql.catalyst.expressions.Sequence|
+|[#3475](https://github.com/NVIDIA/spark-rapids/issues/3475)|[FEA] Spark 3.2.0 reads Parquet unsigned int64(UINT64) as Decimal(20,0) but CUDF does not support it |
+|[#4091](https://github.com/NVIDIA/spark-rapids/issues/4091)|[FEA] regexp_replace: Improve support for ^ and $|
+|[#4104](https://github.com/NVIDIA/spark-rapids/issues/4104)|[FEA] Support org.apache.spark.sql.catalyst.expressions.ReplicateRows|
+|[#4027](https://github.com/NVIDIA/spark-rapids/issues/4027)|[FEA] Support SubqueryBroadcast on GPU to enable exchange reuse during DPP|
+|[#4284](https://github.com/NVIDIA/spark-rapids/issues/4284)|[FEA] Support idx = 0 in GpuRegExpExtract|
+|[#4002](https://github.com/NVIDIA/spark-rapids/issues/4002)|[FEA] Implement regexp_extract on GPU|
+|[#3221](https://github.com/NVIDIA/spark-rapids/issues/3221)|[FEA] Support GpuFirst and GpuLast on nested types under reduction aggregations|
+|[#3944](https://github.com/NVIDIA/spark-rapids/issues/3944)|[FEA] Full support for sum with overflow on Decimal 128|
+|[#4028](https://github.com/NVIDIA/spark-rapids/issues/4028)|[FEA] support GpuCast from non-nested ArrayType to StringType|
+|[#3250](https://github.com/NVIDIA/spark-rapids/issues/3250)|[FEA] Make CreateMap duplicate key handling compatible with Spark and enable CreateMap by default|
+|[#4170](https://github.com/NVIDIA/spark-rapids/issues/4170)|[FEA] Make regular expression behavior with `$` and `\r` consistent with CPU|
+|[#4001](https://github.com/NVIDIA/spark-rapids/issues/4001)|[FEA] Add regexp support to regexp_replace|
+|[#3962](https://github.com/NVIDIA/spark-rapids/issues/3962)|[FEA] Support null characters in regular expressions in RLIKE|
+|[#3797](https://github.com/NVIDIA/spark-rapids/issues/3797)|[FEA] Make RLike support consistent with Apache Spark|
+
+### Performance
+|||
+|:---|:---|
+|[#4392](https://github.com/NVIDIA/spark-rapids/issues/4392)|[FEA] could the parquet scan code avoid acquiring the semaphore for an empty batch?|
+|[#679](https://github.com/NVIDIA/spark-rapids/issues/679)|[FEA] move some deserialization code out of the scope of the gpu-semaphore to increase CPU concurrency|
+|[#4350](https://github.com/NVIDIA/spark-rapids/issues/4350)|[FEA] Optimize the all-true and all-false cases in GPU `If` and `CaseWhen` |
+|[#4309](https://github.com/NVIDIA/spark-rapids/issues/4309)|[FEA] Leverage cudf conditional nested loop join to implement semi/anti hash join with condition|
+|[#4395](https://github.com/NVIDIA/spark-rapids/issues/4395)|[FEA] acquire the semaphore after concatToHost in GpuShuffleCoalesceIterator|
+|[#4134](https://github.com/NVIDIA/spark-rapids/issues/4134)|[FEA] Allow `EliminateJoinToEmptyRelation` in `GpuBroadcastExchangeExec` |
+|[#4189](https://github.com/NVIDIA/spark-rapids/issues/4189)|[FEA] understand why between is so expensive|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#4675](https://github.com/NVIDIA/spark-rapids/issues/4675)|[BUG] Jenkins integration build timed out at 10 hours|
+|[#4665](https://github.com/NVIDIA/spark-rapids/issues/4665)|[BUG] Spark321Shims.getParquetFilters failed with NoSuchMethodError|
+|[#4635](https://github.com/NVIDIA/spark-rapids/issues/4635)|[BUG] nvidia-smi wrapper script ignores ENABLE_NON_MIG_GPUS=1 on a heterogeneous multi-GPU machine|
+|[#4500](https://github.com/NVIDIA/spark-rapids/issues/4500)|[BUG] Build failures against Spark 3.2.1 rc1 and make 3.2.1 non snapshot|
+|[#4631](https://github.com/NVIDIA/spark-rapids/issues/4631)|[BUG] Release build with mvn option `-P source-javadoc` FAILED|
+|[#4625](https://github.com/NVIDIA/spark-rapids/issues/4625)|[BUG] NDS query 5 fails with AdaptiveSparkPlanExec assertion|
+|[#4632](https://github.com/NVIDIA/spark-rapids/issues/4632)|[BUG] Build failing for Spark 3.3.0 due to deprecated method warnings|
+|[#4599](https://github.com/NVIDIA/spark-rapids/issues/4599)|[BUG] test_group_apply_udf and test_group_apply_udf_more_types hangs on Databricks 9.1|
+|[#4600](https://github.com/NVIDIA/spark-rapids/issues/4600)|[BUG] crash if we have a decimal128 in a struct in an array |
+|[#4581](https://github.com/NVIDIA/spark-rapids/issues/4581)|[BUG] Build error "GpuOverrides.scala:924: wrong number of arguments" on DB9.1.x spark-3.1.2 |
+|[#4593](https://github.com/NVIDIA/spark-rapids/issues/4593)|[BUG] dup GpuHashJoin.diff case-folding issue|
+|[#4503](https://github.com/NVIDIA/spark-rapids/issues/4503)|[BUG] regexp_replace with back references produces incorrect results on GPU|
+|[#4567](https://github.com/NVIDIA/spark-rapids/issues/4567)|[BUG] Profile tool hangs in compare mode|
+|[#4315](https://github.com/NVIDIA/spark-rapids/issues/4315)|[BUG] test_hash_reduction_decimal_overflow_sum[30] failed OOM in integration tests|
+|[#4551](https://github.com/NVIDIA/spark-rapids/issues/4551)|[BUG] protobuf-java version changed to 3.x|
+|[#4499](https://github.com/NVIDIA/spark-rapids/issues/4499)|[BUG] GpuSequence blows up when nulls exist in any of the inputs (start, stop, step)|
+|[#4454](https://github.com/NVIDIA/spark-rapids/issues/4454)|[BUG] Shade warnings when building the tools artifact|
+|[#4541](https://github.com/NVIDIA/spark-rapids/issues/4541)|[BUG] Column vector leak in conditionals_test.py|
+|[#4514](https://github.com/NVIDIA/spark-rapids/issues/4514)|[BUG] test_hash_reduction_pivot_without_nans failed|
+|[#4521](https://github.com/NVIDIA/spark-rapids/issues/4521)|[BUG] Inconsistencies in handling of newline characters and string and line anchors|
+|[#4548](https://github.com/NVIDIA/spark-rapids/issues/4548)|[BUG] ai.rapids.cudf.CudaException: an illegal instruction was encountered in databricks 9.1|
+|[#4475](https://github.com/NVIDIA/spark-rapids/issues/4475)|[BUG] `\D` and `\W` match newline in Spark but not in cuDF|
+|[#4524](https://github.com/NVIDIA/spark-rapids/issues/4524)|[BUG] RegExp transpiler fails to detect some choice expressions that cuDF cannot compile|
+|[#3226](https://github.com/NVIDIA/spark-rapids/issues/3226)|[BUG] OOM happened when doing cube operations|
+|[#2504](https://github.com/NVIDIA/spark-rapids/issues/2504)|[BUG] OOM when running NDS queries with UCX and GDS|
+|[#4273](https://github.com/NVIDIA/spark-rapids/issues/4273)|[BUG] Rounding past the size that can be stored in a type produces incorrect results|
+|[#4060](https://github.com/NVIDIA/spark-rapids/issues/4060)|[BUG] test_hash_groupby_approx_percentile_long_repeated_keys failed intermittently|
+|[#4039](https://github.com/NVIDIA/spark-rapids/issues/4039)|[BUG] Spark 3.3.0 IT Array test failures|
+|[#3849](https://github.com/NVIDIA/spark-rapids/issues/3849)|[BUG] In ANSI mode we can fail in cases Spark would not due to conditionals|
+|[#4421](https://github.com/NVIDIA/spark-rapids/issues/4421)|[BUG] the driver is trying to load CUDA with latest 22.02 |
+|[#4455](https://github.com/NVIDIA/spark-rapids/issues/4455)|[BUG] join_test.py::test_struct_self_join[IGNORE_ORDER({'local': True})] failed in spark330|
+|[#4442](https://github.com/NVIDIA/spark-rapids/issues/4442)|[BUG] mvn build FAILED with option `-P noSnapshotsWithDatabricks`|
+|[#4281](https://github.com/NVIDIA/spark-rapids/issues/4281)|[BUG] q9 regression between 21.10 and 21.12|
+|[#4280](https://github.com/NVIDIA/spark-rapids/issues/4280)|[BUG] q88 regression between 21.10 and 21.12|
+|[#4422](https://github.com/NVIDIA/spark-rapids/issues/4422)|[BUG] Host column vectors are being leaked during tests|
+|[#4446](https://github.com/NVIDIA/spark-rapids/issues/4446)|[BUG] GpuCast crashes when casting from Array with unsupportable child type|
+|[#4432](https://github.com/NVIDIA/spark-rapids/issues/4432)|[BUG] nightly build 3.3.0 failed: HashClusteredDistribution is not a member of org.apache.spark.sql.catalyst.plans.physical|
+|[#4443](https://github.com/NVIDIA/spark-rapids/issues/4443)|[BUG] SPARK-37705 breaks parquet filters from Spark 3.3.0 and Spark 3.2.2 onwards|
+|[#4316](https://github.com/NVIDIA/spark-rapids/issues/4316)|[BUG] Exception: Unable to find py4j, your SPARK_HOME may not be configured correctly intermittently|
+|[#4378](https://github.com/NVIDIA/spark-rapids/issues/4378)|[BUG] udf_test udf_cudf_test failed require_minimum_pandas_version check in spark 320+|
+|[#4423](https://github.com/NVIDIA/spark-rapids/issues/4423)|[BUG] Build is failing due to FileScanRDD changes in Spark 3.3.0-SNAPSHOT|
+|[#4401](https://github.com/NVIDIA/spark-rapids/issues/4401)|[BUG] array_test.py::test_array_contains failures|
+|[#4403](https://github.com/NVIDIA/spark-rapids/issues/4403)|[BUG] NDS query 72 logs codegen fallback exception and produces incorrect results|
+|[#4386](https://github.com/NVIDIA/spark-rapids/issues/4386)|[BUG] conditionals_test.py FAILED with side_effects_cast[Integer/Long] on Databricks 9.1 Runtime|
+|[#3934](https://github.com/NVIDIA/spark-rapids/issues/3934)|[BUG] Dependencies of published integration tests jar are missing|
+|[#4341](https://github.com/NVIDIA/spark-rapids/issues/4341)|[BUG] GpuCast.scala:nnn warning: discarding unmoored doc comment|
+|[#4356](https://github.com/NVIDIA/spark-rapids/issues/4356)|[BUG] nightly spark303 deploy pulling spark301 aggregator|
+|[#4347](https://github.com/NVIDIA/spark-rapids/issues/4347)|[BUG] Dist jar pom lists aggregator jar as dependency|
+|[#4176](https://github.com/NVIDIA/spark-rapids/issues/4176)|[BUG] ParseDateTimeSuite UT failed|
+|[#4292](https://github.com/NVIDIA/spark-rapids/issues/4292)|[BUG] no meaningful message is surfaced to maven when binary-dedupe fails|
+|[#4351](https://github.com/NVIDIA/spark-rapids/issues/4351)|[BUG] Tests FAILED On SPARK-3.2.0, com.nvidia.spark.rapids.SerializedTableColumn cannot be cast to com.nvidia.spark.rapids.GpuColumnVector|
+|[#4346](https://github.com/NVIDIA/spark-rapids/issues/4346)|[BUG] q73 decimal was twice as slow in weekly results|
+|[#4334](https://github.com/NVIDIA/spark-rapids/issues/4334)|[BUG] GpuColumnarToRowExec will always be tagged False for exportColumnarRdd after Spark311 |
+|[#4339](https://github.com/NVIDIA/spark-rapids/issues/4339)|The parameter `dataType` is not necessary in `resolveColumnVector` method.|
+|[#4275](https://github.com/NVIDIA/spark-rapids/issues/4275)|[BUG] Row-based Hive UDF will fail if arguments contain a foldable expression.|
+|[#4229](https://github.com/NVIDIA/spark-rapids/issues/4229)|[BUG] regexp_replace `[^a]` has different behavior between CPU and GPU for multiline strings|
+|[#4294](https://github.com/NVIDIA/spark-rapids/issues/4294)|[BUG] parquet_write_test.py::test_ts_write_fails_datetime_exception failed in spark 3.1.1 and 3.1.2|
+|[#4205](https://github.com/NVIDIA/spark-rapids/issues/4205)|[BUG] Get different results when casting from timestamp to string|
+|[#4277](https://github.com/NVIDIA/spark-rapids/issues/4277)|[BUG] cudf_udf nightly cudf import rmm failed|
+|[#4246](https://github.com/NVIDIA/spark-rapids/issues/4246)|[BUG] Regression in CastOpSuite due to cuDF change in parsing NaN|
+|[#4243](https://github.com/NVIDIA/spark-rapids/issues/4243)|[BUG] test_regexp_replace_null_pattern_fallback[ALLOW_NON_GPU(ProjectExec,RegExpReplace)] failed in databricks|
+|[#4244](https://github.com/NVIDIA/spark-rapids/issues/4244)|[BUG] Cast from string to float using hand-picked values failed|
+|[#4227](https://github.com/NVIDIA/spark-rapids/issues/4227)|[BUG] RAPIDS Shuffle Manager doesn't fallback given encryption settings|
+|[#3374](https://github.com/NVIDIA/spark-rapids/issues/3374)|[BUG] minor deprecation warnings in a 3.2 shim build|
+|[#3613](https://github.com/NVIDIA/spark-rapids/issues/3613)|[BUG] release312db profile pulls in 311until320-apache|
+|[#4213](https://github.com/NVIDIA/spark-rapids/issues/4213)|[BUG] unused method with a misleading outdated comment in ShimLoader |
+|[#3609](https://github.com/NVIDIA/spark-rapids/issues/3609)|[BUG] GpuShuffleExchangeExec in v2 shims has inconsistent packaging|
+|[#4127](https://github.com/NVIDIA/spark-rapids/issues/4127)|[BUG] CUDF 22.02 nightly test failure|
+
+### PRs
+|||
+|:---|:---|
+|[#4700](https://github.com/NVIDIA/spark-rapids/pull/4700)|Update cudfjni version to released 22.02.0|
+|[#4701](https://github.com/NVIDIA/spark-rapids/pull/4701)|Decrease nightly tests upper limit to 7 [skip ci]|
+|[#4639](https://github.com/NVIDIA/spark-rapids/pull/4639)|Update changelog for 22.02 and archive info of some older releases [skip ci]|
+|[#4572](https://github.com/NVIDIA/spark-rapids/pull/4572)|Add download page for 22.02 [skip ci]|
+|[#4672](https://github.com/NVIDIA/spark-rapids/pull/4672)|Revert "Disable 311cdh build due to missing dependency (#4659)"|
+|[#4662](https://github.com/NVIDIA/spark-rapids/pull/4662)|Update the deploy script [skip ci]|
+|[#4657](https://github.com/NVIDIA/spark-rapids/pull/4657)|Upmerge spark2 directory to the latest 22.02 changes|
+|[#4659](https://github.com/NVIDIA/spark-rapids/pull/4659)|Disable 311cdh build by default because of a missing dependency|
+|[#4508](https://github.com/NVIDIA/spark-rapids/pull/4508)|Fix Spark 3.2.1 build failures and make it non-snapshot|
+|[#4652](https://github.com/NVIDIA/spark-rapids/pull/4652)|Remove non-deterministic test order in nightly [skip ci]|
+|[#4643](https://github.com/NVIDIA/spark-rapids/pull/4643)|Add profile release301 when mvn help:evaluate|
+|[#4630](https://github.com/NVIDIA/spark-rapids/pull/4630)|Fix the incomplete capture of SubqueryBroadcast |
+|[#4633](https://github.com/NVIDIA/spark-rapids/pull/4633)|Suppress newTaskTempFile method warnings for Spark 3.3.0 build|
+|[#4618](https://github.com/NVIDIA/spark-rapids/pull/4618)|[DB31x] Pick the correct Python runner for flatmap-group Pandas UDF|
+|[#4622](https://github.com/NVIDIA/spark-rapids/pull/4622)|Fallback to CPU when encoding is not supported for JSON reader|
+|[#4470](https://github.com/NVIDIA/spark-rapids/pull/4470)|Add in HashPartitioning support for decimal 128|
+|[#4535](https://github.com/NVIDIA/spark-rapids/pull/4535)|Revert "Disable orc write by default because of https://issues.apache.org/jira/browse/ORC-1075 (#4471)"|
+|[#4583](https://github.com/NVIDIA/spark-rapids/pull/4583)|Avoid unapply on PromotePrecision|
+|[#4573](https://github.com/NVIDIA/spark-rapids/pull/4573)|Correct version from 21.12 to 22.02 [skip ci]|
+|[#4575](https://github.com/NVIDIA/spark-rapids/pull/4575)|Correct and update links in UDF doc [skip ci]|
+|[#4501](https://github.com/NVIDIA/spark-rapids/pull/4501)|Switch and/or to use new cudf binops to improve performance|
+|[#4594](https://github.com/NVIDIA/spark-rapids/pull/4594)|Resolve case-folding issue [skip ci]|
+|[#4585](https://github.com/NVIDIA/spark-rapids/pull/4585)|Spark2 module upmerge, deploy script, and updates for Jenkins|
+|[#4589](https://github.com/NVIDIA/spark-rapids/pull/4589)|Increase premerge databricks IDLE_TIMEOUT to 4 hours [skip ci]|
+|[#4485](https://github.com/NVIDIA/spark-rapids/pull/4485)|Add json reader support|
+|[#4556](https://github.com/NVIDIA/spark-rapids/pull/4556)|regexp_replace with back-references should fall back to CPU|
+|[#4569](https://github.com/NVIDIA/spark-rapids/pull/4569)|Fix infinite loop with Profiling tool compare mode and app with no sql ids|
+|[#4529](https://github.com/NVIDIA/spark-rapids/pull/4529)|Add support for Spark 2.x Explain Api|
+|[#4577](https://github.com/NVIDIA/spark-rapids/pull/4577)|Revert "Fix CVE-2021-22569 (#4545)"|
+|[#4520](https://github.com/NVIDIA/spark-rapids/pull/4520)|GpuSequence refactor|
+|[#4570](https://github.com/NVIDIA/spark-rapids/pull/4570)|A few quick fixes to try to reduce max memory usage in the tests|
+|[#4477](https://github.com/NVIDIA/spark-rapids/pull/4477)|Use libcudf mixed joins for conditional hash joins|
+|[#4566](https://github.com/NVIDIA/spark-rapids/pull/4566)|remove scala-library from combined tools jar|
+|[#4552](https://github.com/NVIDIA/spark-rapids/pull/4552)|Fix resource leak in GpuCaseWhen|
+|[#4553](https://github.com/NVIDIA/spark-rapids/pull/4553)|Reenable test_hash_reduction_pivot_without_nans|
+|[#4530](https://github.com/NVIDIA/spark-rapids/pull/4530)|Fix correctness issues in regexp and add `\r` and `\n` to fuzz tests|
+|[#4549](https://github.com/NVIDIA/spark-rapids/pull/4549)|Fix typos in integration tests README [skip ci]|
+|[#4545](https://github.com/NVIDIA/spark-rapids/pull/4545)|Fix CVE-2021-22569|
+|[#4543](https://github.com/NVIDIA/spark-rapids/pull/4543)|Enable auto-merge from branch-22.02 to branch-22.04 [skip ci]|
+|[#4540](https://github.com/NVIDIA/spark-rapids/pull/4540)|Remove user kuhushukla|
+|[#4434](https://github.com/NVIDIA/spark-rapids/pull/4434)|Support max on single-level struct in aggregation context|
+|[#4534](https://github.com/NVIDIA/spark-rapids/pull/4534)|Temporarily disable integration test - test_hash_reduction_pivot_without_nans|
+|[#4322](https://github.com/NVIDIA/spark-rapids/pull/4322)|Add an explain only mode to the plugin|
+|[#4497](https://github.com/NVIDIA/spark-rapids/pull/4497)|Make better use of pinned memory pool|
+|[#4512](https://github.com/NVIDIA/spark-rapids/pull/4512)|Remove hadoop version requirement [skip ci]|
+|[#4527](https://github.com/NVIDIA/spark-rapids/pull/4527)|Fall back to CPU for regular expressions containing \D or \W|
+|[#4525](https://github.com/NVIDIA/spark-rapids/pull/4525)|Properly close data writer in GpuFileFormatWriter|
+|[#4502](https://github.com/NVIDIA/spark-rapids/pull/4502)|Removed the redundant test for element_at and fixed the failing one|
+|[#4523](https://github.com/NVIDIA/spark-rapids/pull/4523)|Add more integration tests for decimal 128|
+|[#3762](https://github.com/NVIDIA/spark-rapids/pull/3762)|Call the right method to convert table from row major <=> col major|
+|[#4482](https://github.com/NVIDIA/spark-rapids/pull/4482)|Simplified the construction of zero scalar in GpuUnaryMinus|
+|[#4510](https://github.com/NVIDIA/spark-rapids/pull/4510)|Update copyright in NOTICE [skip ci]|
+|[#4484](https://github.com/NVIDIA/spark-rapids/pull/4484)|Update GpuFileFormatWriter to stay in sync with recent Spark changes, while still not supporting writing Hive bucketed tables on GPU|
+|[#4492](https://github.com/NVIDIA/spark-rapids/pull/4492)|Fall back to CPU for regular expressions containing hex digits|
+|[#4495](https://github.com/NVIDIA/spark-rapids/pull/4495)|Enable approx_percentile by default|
+|[#4420](https://github.com/NVIDIA/spark-rapids/pull/4420)|Fix up incorrect results of rounding past the max digits of data type|
+|[#4483](https://github.com/NVIDIA/spark-rapids/pull/4483)|Update test case of reading nested unsigned parquet file|
+|[#4490](https://github.com/NVIDIA/spark-rapids/pull/4490)|Remove warning about RMM default allocator|
+|[#4461](https://github.com/NVIDIA/spark-rapids/pull/4461)|[Audit] Add bucketed scan info in query plan of data source v1|
+|[#4489](https://github.com/NVIDIA/spark-rapids/pull/4489)|Add arrays of decimal128 to join tests|
+|[#4476](https://github.com/NVIDIA/spark-rapids/pull/4476)|Don't acquire the semaphore for empty input while scanning|
+|[#4424](https://github.com/NVIDIA/spark-rapids/pull/4424)|Improve support for regular expression string anchors `\A`, `\Z`, and `\z`|
+|[#4491](https://github.com/NVIDIA/spark-rapids/pull/4491)|Skip the test for spark versions 3.1.1, 3.1.2 and 3.2.0 only|
+|[#4459](https://github.com/NVIDIA/spark-rapids/pull/4459)|Use merge sort for struct types in non-key columns|
+|[#4494](https://github.com/NVIDIA/spark-rapids/pull/4494)|Append new authorized user to blossom-ci whitelist [skip ci]|
+|[#4400](https://github.com/NVIDIA/spark-rapids/pull/4400)|Enable approx percentile tests|
+|[#4471](https://github.com/NVIDIA/spark-rapids/pull/4471)|Disable orc write by default because of https://issues.apache.org/jira/browse/ORC-1075|
+|[#4462](https://github.com/NVIDIA/spark-rapids/pull/4462)|Rename DECIMAL_128_FULL and rework usage of TypeSig.gpuNumeric|
+|[#4479](https://github.com/NVIDIA/spark-rapids/pull/4479)|Change signoff check image to slim-buster [skip ci]|
+|[#4464](https://github.com/NVIDIA/spark-rapids/pull/4464)|Throw SparkArrayIndexOutOfBoundsException for Spark 3.3.0+|
+|[#4469](https://github.com/NVIDIA/spark-rapids/pull/4469)|Support repetition of \d and \D in regexp functions|
+|[#4472](https://github.com/NVIDIA/spark-rapids/pull/4472)|Modify docs for 22.02 to address issue-4319 [skip ci]|
+|[#4440](https://github.com/NVIDIA/spark-rapids/pull/4440)|Enable GPU broadcast exchange reuse for DPP when AQE enabled|
+|[#4376](https://github.com/NVIDIA/spark-rapids/pull/4376)|Add sequence support|
+|[#4460](https://github.com/NVIDIA/spark-rapids/pull/4460)|Abstract the text based PartitionReader|
+|[#4383](https://github.com/NVIDIA/spark-rapids/pull/4383)|Fix correctness issue with CASE WHEN with expressions that have side-effects|
+|[#4465](https://github.com/NVIDIA/spark-rapids/pull/4465)|Refactor for shims 320+|
+|[#4463](https://github.com/NVIDIA/spark-rapids/pull/4463)|Avoid replacing a hash join if build side is unsupported by the join type|
+|[#4456](https://github.com/NVIDIA/spark-rapids/pull/4456)|Fix build issues: 1 clean non-exists target dirs; 2 remove duplicated plugin|
+|[#4416](https://github.com/NVIDIA/spark-rapids/pull/4416)|Unshim join execs|
+|[#4172](https://github.com/NVIDIA/spark-rapids/pull/4172)|Support String to Decimal 128|
+|[#4458](https://github.com/NVIDIA/spark-rapids/pull/4458)|Exclude some metadata operators when checking GPU replacement|
+|[#4451](https://github.com/NVIDIA/spark-rapids/pull/4451)|Some metrics improvements and timeline reporting|
+|[#4435](https://github.com/NVIDIA/spark-rapids/pull/4435)|Disable add profile src execution by default to make the build log clean|
+|[#4436](https://github.com/NVIDIA/spark-rapids/pull/4436)|Print error log to stderr output|
+|[#4155](https://github.com/NVIDIA/spark-rapids/pull/4155)|Add partial support for line begin and end anchors in regexp_replace|
+|[#4428](https://github.com/NVIDIA/spark-rapids/pull/4428)|Exhaustively iterate ColumnarToRow iterator to avoid leaks|
+|[#4430](https://github.com/NVIDIA/spark-rapids/pull/4430)|Update PCA example link in ml-integration.md [skip ci]|
+|[#4452](https://github.com/NVIDIA/spark-rapids/pull/4452)|Limit parallelism of nightly tests [skip ci]|
+|[#4449](https://github.com/NVIDIA/spark-rapids/pull/4449)|Add recursive type checking and fallback tests for casting array with unsupported element types to string|
+|[#4437](https://github.com/NVIDIA/spark-rapids/pull/4437)|Change logInfo to logWarning|
+|[#4447](https://github.com/NVIDIA/spark-rapids/pull/4447)|Fix 330 build error and add 322 shims layer|
+|[#4417](https://github.com/NVIDIA/spark-rapids/pull/4417)|Fix an Intellij debug issue|
+|[#4431](https://github.com/NVIDIA/spark-rapids/pull/4431)|Add DateType support for AST expressions|
+|[#4433](https://github.com/NVIDIA/spark-rapids/pull/4433)|Import the right pandas from conda [skip ci]|
+|[#4419](https://github.com/NVIDIA/spark-rapids/pull/4419)|Import the right pandas from conda|
+|[#4427](https://github.com/NVIDIA/spark-rapids/pull/4427)|Update getFileScanRDD shim for recent changes in Spark 3.3.0|
+|[#4397](https://github.com/NVIDIA/spark-rapids/pull/4397)|Ignore cufile.log|
+|[#4388](https://github.com/NVIDIA/spark-rapids/pull/4388)|Add support for ReplicateRows|
+|[#4399](https://github.com/NVIDIA/spark-rapids/pull/4399)|Update docs for Profiling and Qualification tool to change wording|
+|[#4407](https://github.com/NVIDIA/spark-rapids/pull/4407)|Fix GpuSubqueryBroadcast on multi-fields relation|
+|[#4396](https://github.com/NVIDIA/spark-rapids/pull/4396)|GpuShuffleCoalesceIterator acquire semaphore after host concat|
+|[#4361](https://github.com/NVIDIA/spark-rapids/pull/4361)|Accommodate altered semantics of `cudf::lists::contains()`|
+|[#4394](https://github.com/NVIDIA/spark-rapids/pull/4394)|Use correct column name in GpuIf test|
+|[#4385](https://github.com/NVIDIA/spark-rapids/pull/4385)|Add missing GpuSubqueryBroadcast replacement rule for spark31x |
+|[#4387](https://github.com/NVIDIA/spark-rapids/pull/4387)|Fix auto merge conflict 4384 [skip ci]|
+|[#4374](https://github.com/NVIDIA/spark-rapids/pull/4374)|Fix the IT module depends on the tests module|
+|[#4365](https://github.com/NVIDIA/spark-rapids/pull/4365)|Not publishing integration_tests jar to Maven Central [skip ci]|
+|[#4358](https://github.com/NVIDIA/spark-rapids/pull/4358)|Update GpuIf to support expressions with side effects|
+|[#4382](https://github.com/NVIDIA/spark-rapids/pull/4382)|Remove unused scallop dependency from integration_tests|
+|[#4364](https://github.com/NVIDIA/spark-rapids/pull/4364)|Replace Scala document with Scala comment for inner functions|
+|[#4373](https://github.com/NVIDIA/spark-rapids/pull/4373)|Add pytest tags for nightly test parallel run [skip ci]|
+|[#4150](https://github.com/NVIDIA/spark-rapids/pull/4150)|Support GpuSubqueryBroadcast for DPP|
+|[#4372](https://github.com/NVIDIA/spark-rapids/pull/4372)|Move casting to string tests from array_test.py and struct_test.py to cast_test.py|
+|[#4371](https://github.com/NVIDIA/spark-rapids/pull/4371)|Fix typo in skipTestsFor330 calculation [skip ci]|
+|[#4355](https://github.com/NVIDIA/spark-rapids/pull/4355)|Dedicated deploy-file with reduced pom in nightly build [skip ci]|
+|[#4352](https://github.com/NVIDIA/spark-rapids/pull/4352)|Revert "Ignore failing string to timestamp tests temporarily (#4197)"|
+|[#4359](https://github.com/NVIDIA/spark-rapids/pull/4359)|Audit - SPARK-37268 - Remove unused variable in GpuFileScanRDD [Databricks]|
+|[#4327](https://github.com/NVIDIA/spark-rapids/pull/4327)|Print meaningful message when calling scripts in maven|
+|[#4354](https://github.com/NVIDIA/spark-rapids/pull/4354)|Fix regression in AQE optimizations|
+|[#4343](https://github.com/NVIDIA/spark-rapids/pull/4343)|Fix issue with binding to hash agg columns with computation|
+|[#4285](https://github.com/NVIDIA/spark-rapids/pull/4285)|Add support for regexp_extract on the GPU|
+|[#4349](https://github.com/NVIDIA/spark-rapids/pull/4349)|Fix PYTHONPATH in pre-merge|
+|[#4269](https://github.com/NVIDIA/spark-rapids/pull/4269)|Add an option for the nightly script to skip deploying jars [skip ci]|
+|[#4335](https://github.com/NVIDIA/spark-rapids/pull/4335)|Fix the issue of exporting Column RDD|
+|[#4336](https://github.com/NVIDIA/spark-rapids/pull/4336)|Split expensive pytest files at the test case level [skip ci]|
+|[#4328](https://github.com/NVIDIA/spark-rapids/pull/4328)|Change the explanation of why the operator will not work on GPU|
+|[#4338](https://github.com/NVIDIA/spark-rapids/pull/4338)|Use scala Int.box instead of Integer constructors |
+|[#4340](https://github.com/NVIDIA/spark-rapids/pull/4340)|Remove the unnecessary parameter `dataType` in `resolveColumnVector` method|
+|[#4256](https://github.com/NVIDIA/spark-rapids/pull/4256)|Allow returning an EmptyHashedRelation when a broadcast result is empty|
+|[#4333](https://github.com/NVIDIA/spark-rapids/pull/4333)|Add tests for writing empty tables to ORC/PARQUET|
+|[#4337](https://github.com/NVIDIA/spark-rapids/pull/4337)|Support GpuFirst and GpuLast on nested types under reduction aggregations|
+|[#4331](https://github.com/NVIDIA/spark-rapids/pull/4331)|Fix parquet options builder calls|
+|[#4310](https://github.com/NVIDIA/spark-rapids/pull/4310)|Fix typo in shim class name|
+|[#4326](https://github.com/NVIDIA/spark-rapids/pull/4326)|Fix 4315: decrease concurrentGpuTasks to avoid sum test OOM|
+|[#4266](https://github.com/NVIDIA/spark-rapids/pull/4266)|Check revisions for all shim jars while build all|
+|[#4282](https://github.com/NVIDIA/spark-rapids/pull/4282)|Use data type to create an inspector for a foldable GPU expression.|
+|[#3144](https://github.com/NVIDIA/spark-rapids/pull/3144)|Optimize AQE with Spark 3.2+ to avoid redundant transitions|
+|[#4317](https://github.com/NVIDIA/spark-rapids/pull/4317)|[BUG] Update nightly test script to dynamically set mem_fraction [skip ci]|
+|[#4206](https://github.com/NVIDIA/spark-rapids/pull/4206)|Porting GpuRowToColumnar converters to InternalColumnarRDDConverter|
+|[#4272](https://github.com/NVIDIA/spark-rapids/pull/4272)|Full support for SUM overflow detection on decimal|
+|[#4255](https://github.com/NVIDIA/spark-rapids/pull/4255)|Make regexp pattern `[^a]` consistent with Spark for multiline strings|
+|[#4306](https://github.com/NVIDIA/spark-rapids/pull/4306)|Revert commonizing the int96ParquetRebase* functions |
+|[#4299](https://github.com/NVIDIA/spark-rapids/pull/4299)|Fix auto merge conflict 4298 [skip ci]|
+|[#4159](https://github.com/NVIDIA/spark-rapids/pull/4159)|Optimize sample perf|
+|[#4235](https://github.com/NVIDIA/spark-rapids/pull/4235)|Commonize v2 shim|
+|[#4274](https://github.com/NVIDIA/spark-rapids/pull/4274)|Add tests for timestamps that overflowed before.|
+|[#4271](https://github.com/NVIDIA/spark-rapids/pull/4271)|Skip test_regexp_replace_null_pattern_fallback on Spark 3.1.1 and later|
+|[#4278](https://github.com/NVIDIA/spark-rapids/pull/4278)|Use mamba for cudf conda install [skip ci]|
+|[#4270](https://github.com/NVIDIA/spark-rapids/pull/4270)|Document exponent differences when casting floating point to string [skip ci]|
+|[#4268](https://github.com/NVIDIA/spark-rapids/pull/4268)|Fix merge conflict with branch-21.12|
+|[#4093](https://github.com/NVIDIA/spark-rapids/pull/4093)|Add tests for regexp() and regexp_like()|
+|[#4259](https://github.com/NVIDIA/spark-rapids/pull/4259)|fix regression in cast from string to float that caused signed NaN to be considered valid|
+|[#4241](https://github.com/NVIDIA/spark-rapids/pull/4241)|fix bug in parsing regex character classes that start with `^` and contain an unescaped `]`|
+|[#4224](https://github.com/NVIDIA/spark-rapids/pull/4224)|Support row-based Hive UDFs|
+|[#4221](https://github.com/NVIDIA/spark-rapids/pull/4221)|GpuCast from ArrayType to StringType|
+|[#4007](https://github.com/NVIDIA/spark-rapids/pull/4007)|Implement duplicate key handling for GpuCreateMap|
+|[#4251](https://github.com/NVIDIA/spark-rapids/pull/4251)|Skip test_regexp_replace_null_pattern_fallback on Databricks|
+|[#4247](https://github.com/NVIDIA/spark-rapids/pull/4247)|Disable failing CastOpSuite test|
+|[#4239](https://github.com/NVIDIA/spark-rapids/pull/4239)|Make EOL anchor behavior match CPU for strings ending with newline|
+|[#4153](https://github.com/NVIDIA/spark-rapids/pull/4153)|Regexp: Only transpile once per expression rather than once per batch|
+|[#4230](https://github.com/NVIDIA/spark-rapids/pull/4230)|Change to build tools module with all the versions by default|
+|[#4223](https://github.com/NVIDIA/spark-rapids/pull/4223)|Fixes a minor deprecation warning|
+|[#4215](https://github.com/NVIDIA/spark-rapids/pull/4215)|Rebalance testing load|
+|[#4214](https://github.com/NVIDIA/spark-rapids/pull/4214)|Fix pre_merge ci_2 [skip ci]|
+|[#4212](https://github.com/NVIDIA/spark-rapids/pull/4212)|Remove an unused method with its outdated comment|
+|[#4211](https://github.com/NVIDIA/spark-rapids/pull/4211)|Update test_floor_ceil_overflow to be more lenient on exception type|
+|[#4203](https://github.com/NVIDIA/spark-rapids/pull/4203)|Move all the GpuShuffleExchangeExec shim v2 classes to org.apache.spark|
+|[#4193](https://github.com/NVIDIA/spark-rapids/pull/4193)|Rename 311until320-apache to 311until320-noncdh|
+|[#4197](https://github.com/NVIDIA/spark-rapids/pull/4197)|Ignore failing string to timestamp tests temporarily|
+|[#4160](https://github.com/NVIDIA/spark-rapids/pull/4160)|Fix merge issues for branch 22.02|
+|[#4081](https://github.com/NVIDIA/spark-rapids/pull/4081)|Convert String to DecimalType without casting to FloatType|
+|[#4132](https://github.com/NVIDIA/spark-rapids/pull/4132)|Fix auto merge conflict 4131 [skip ci]|
+|[#4099](https://github.com/NVIDIA/spark-rapids/pull/4099)|[REVIEW] Init version 22.02.0|
+|[#4113](https://github.com/NVIDIA/spark-rapids/pull/4113)|Fix pre-merge CI 2 conditions [skip ci]|
+|[#4064](https://github.com/NVIDIA/spark-rapids/pull/4064)|Regex: transpile `.` to `[^\r\n]` in cuDF|
+|[#4044](https://github.com/NVIDIA/spark-rapids/pull/4044)|RLike: Fall back to CPU for regex that would produce incorrect results|
## Release 21.12
### Features
|||
|:---|:---|
+|[#1571](https://github.com/NVIDIA/spark-rapids/issues/1571)|[FEA] Better precision range for decimal multiply, and possibly others|
|[#3953](https://github.com/NVIDIA/spark-rapids/issues/3953)|[FEA] Audit: Add array support to union by name |
|[#4085](https://github.com/NVIDIA/spark-rapids/issues/4085)|[FEA] Decimal 128 Support: Concat|
|[#4073](https://github.com/NVIDIA/spark-rapids/issues/4073)|[FEA] Decimal 128 Support: MapKeys, MapValues, MapEntries|
@@ -106,6 +401,11 @@ Generated on 2021-12-07
### PRs
|||
|:---|:---|
+|[#4362](https://github.com/NVIDIA/spark-rapids/pull/4362)|Decimal128 support for Parquet|
+|[#4391](https://github.com/NVIDIA/spark-rapids/pull/4391)|Update GCP custom Dataproc image version to avoid log4j issue [skip ci]|
+|[#4379](https://github.com/NVIDIA/spark-rapids/pull/4379)|Update cudf hotfix link to v21.12.2|
+|[#4367](https://github.com/NVIDIA/spark-rapids/pull/4367)|update 21.12 branch for doc [skip ci]|
+|[#4245](https://github.com/NVIDIA/spark-rapids/pull/4245)|Update changelog 21.12 to latest [skip ci]|
|[#4258](https://github.com/NVIDIA/spark-rapids/pull/4258)|Sanitize column names in ParquetCachedBatchSerializer before writing to Parquet|
|[#4308](https://github.com/NVIDIA/spark-rapids/pull/4308)|Bump up GPU reserve memory to 640MB|
|[#4307](https://github.com/NVIDIA/spark-rapids/pull/4307)|Update Download page for 21.12 [skip ci]|
@@ -1228,1322 +1528,5 @@ Generated on 2021-12-07
|[#2402](https://github.com/NVIDIA/spark-rapids/pull/2402)|Add profiling tool|
|[#2313](https://github.com/NVIDIA/spark-rapids/pull/2313)|Supports `GpuLiteral` of array type|
-## Release 0.5
-
-### Features
-|||
-|:---|:---|
-|[#938](https://github.com/NVIDIA/spark-rapids/issues/938)|[FEA] Have hashed shuffle match spark|
-|[#1604](https://github.com/NVIDIA/spark-rapids/issues/1604)|[FEA] Support casting structs to strings |
-|[#1920](https://github.com/NVIDIA/spark-rapids/issues/1920)|[FEA] Support murmur3 hashing of structs|
-|[#2018](https://github.com/NVIDIA/spark-rapids/issues/2018)|[FEA] A way for user to find out the plugin version and cudf version in REPL|
-|[#77](https://github.com/NVIDIA/spark-rapids/issues/77)|[FEA] Support ArrayContains|
-|[#1721](https://github.com/NVIDIA/spark-rapids/issues/1721)|[FEA] build cudf jars with NVTX enabled|
-|[#1782](https://github.com/NVIDIA/spark-rapids/issues/1782)|[FEA] Shim layers to support spark versions|
-|[#1625](https://github.com/NVIDIA/spark-rapids/issues/1625)|[FEA] Support Decimal Casts to String and String to Decimal|
-|[#166](https://github.com/NVIDIA/spark-rapids/issues/166)|[FEA] Support get_json_object|
-|[#1698](https://github.com/NVIDIA/spark-rapids/issues/1698)|[FEA] Support casting structs to string|
-|[#1912](https://github.com/NVIDIA/spark-rapids/issues/1912)|[FEA] Let `Scalar Pandas UDF ` support array of struct type.|
-|[#1136](https://github.com/NVIDIA/spark-rapids/issues/1136)|[FEA] Audit: Script to list commits between different Spark versions/tags|
-|[#1921](https://github.com/NVIDIA/spark-rapids/issues/1921)|[FEA] cudf version check should be lenient on later patch version|
-|[#19](https://github.com/NVIDIA/spark-rapids/issues/19)|[FEA] Out of core sorts|
-
-### Performance
-|||
-|:---|:---|
-|[#2090](https://github.com/NVIDIA/spark-rapids/issues/2090)|[FEA] Make row count estimates available to the cost-based optimizer|
-|[#1341](https://github.com/NVIDIA/spark-rapids/issues/1341)|Optimize unnecessary columnar->row->columnar transitions with AQE|
-|[#1558](https://github.com/NVIDIA/spark-rapids/issues/1558)|[FEA] Initialize UCX early|
-|[#1633](https://github.com/NVIDIA/spark-rapids/issues/1633)|[FEA] Implement a cost-based optimizer|
-|[#1727](https://github.com/NVIDIA/spark-rapids/issues/1727)|[FEA] Put RangePartitioner data path on the GPU|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#2279](https://github.com/NVIDIA/spark-rapids/issues/2279)|[BUG] Hash Partitioning can fail for very small batches|
-|[#2314](https://github.com/NVIDIA/spark-rapids/issues/2314)|[BUG] v0.5.0 pre-release pytests join_test.py::test_hash_join_array FAILED on SPARK-EGX Yarn Cluster|
-|[#2317](https://github.com/NVIDIA/spark-rapids/issues/2317)|[BUG] GpuColumnarToRowIterator can stop after receiving an empty batch|
-|[#2244](https://github.com/NVIDIA/spark-rapids/issues/2244)|[BUG] Executors hanging when running NDS benchmarks|
-|[#2278](https://github.com/NVIDIA/spark-rapids/issues/2278)|[BUG] FullOuter join can produce too many results|
-|[#2220](https://github.com/NVIDIA/spark-rapids/issues/2220)|[BUG] csv_test.py::test_csv_fallback FAILED on the EMR Cluster|
-|[#2225](https://github.com/NVIDIA/spark-rapids/issues/2225)|[BUG] GpuSort fails on tables containing arrays.|
-|[#2232](https://github.com/NVIDIA/spark-rapids/issues/2232)|[BUG] hash_aggregate_test.py::test_hash_grpby_pivot FAILED on the Databricks Cluster|
-|[#2231](https://github.com/NVIDIA/spark-rapids/issues/2231)|[BUG]string_test.py::test_re_replace FAILED on the Dataproc Cluster|
-|[#2042](https://github.com/NVIDIA/spark-rapids/issues/2042)|[BUG] NDS q14a fails with "GpuColumnarToRow does not implement doExecuteBroadcast"|
-|[#2203](https://github.com/NVIDIA/spark-rapids/issues/2203)|[BUG] Spark nightly cache tests fail with -- master flag|
-|[#2230](https://github.com/NVIDIA/spark-rapids/issues/2230)|[BUG] qa_nightly_select_test.py::test_select FAILED on the Dataproc Cluster|
-|[#1711](https://github.com/NVIDIA/spark-rapids/issues/1711)|[BUG] find a way to stop allocating from RMM on the shuffle-client thread|
-|[#2109](https://github.com/NVIDIA/spark-rapids/issues/2109)|[BUG] Fix high priority violations detected by code analysis tools|
-|[#2217](https://github.com/NVIDIA/spark-rapids/issues/2217)|[BUG] qa_nightly_select_test failure in test_select |
-|[#2127](https://github.com/NVIDIA/spark-rapids/issues/2127)|[BUG] Parsing with two-digit year should fall back to CPU|
-|[#2078](https://github.com/NVIDIA/spark-rapids/issues/2078)|[BUG] java.lang.ArithmeticException: divide by zero when spark.sql.ansi.enabled=true|
-|[#2048](https://github.com/NVIDIA/spark-rapids/issues/2048)|[BUG] split function+ repartition result in "ai.rapids.cudf.CudaException: device-side assert triggered"|
-|[#2036](https://github.com/NVIDIA/spark-rapids/issues/2036)|[BUG] Stackoverflow when writing wide parquet files.|
-|[#1973](https://github.com/NVIDIA/spark-rapids/issues/1973)|[BUG] generate_expr_test FAILED on Dataproc Cluster|
-|[#2079](https://github.com/NVIDIA/spark-rapids/issues/2079)|[BUG] koalas.sql fails with java.lang.ArrayIndexOutOfBoundsException|
-|[#217](https://github.com/NVIDIA/spark-rapids/issues/217)|[BUG] CudaUtil should be removed|
-|[#1550](https://github.com/NVIDIA/spark-rapids/issues/1550)|[BUG] The ORC output data of a query is not readable|
-|[#2074](https://github.com/NVIDIA/spark-rapids/issues/2074)|[BUG] Intermittent NPE in RapidsBufferCatalog when running test suite|
-|[#2027](https://github.com/NVIDIA/spark-rapids/issues/2027)|[BUG] udf_cudf_test.py integration tests fail |
-|[#1899](https://github.com/NVIDIA/spark-rapids/issues/1899)|[BUG] Some queries fail when cost-based optimizations are enabled|
-|[#1914](https://github.com/NVIDIA/spark-rapids/issues/1914)|[BUG] Add in float, double, timestamp, and date support to murmur3|
-|[#2014](https://github.com/NVIDIA/spark-rapids/issues/2014)|[BUG] earlyStart option added in 0.5 can cause errors when starting UCX|
-|[#1984](https://github.com/NVIDIA/spark-rapids/issues/1984)|[BUG] NDS q58 Decimal scale (59) cannot be greater than precision (38).|
-|[#2001](https://github.com/NVIDIA/spark-rapids/issues/2001)|[BUG] RapidsShuffleManager didn't pass `dirs` to `getBlockData` from a wrapped `ShuffleBlockResolver`|
-|[#1797](https://github.com/NVIDIA/spark-rapids/issues/1797)|[BUG] occasional crashes in CI|
-|[#1861](https://github.com/NVIDIA/spark-rapids/issues/1861)|Encountered column data outside the range of input buffer|
-|[#1905](https://github.com/NVIDIA/spark-rapids/issues/1905)|[BUG] Large concat task time in GpuShuffleCoalesce with pinned memory pool|
-|[#1638](https://github.com/NVIDIA/spark-rapids/issues/1638)|[BUG] Tests `test_window_aggs_for_rows_collect_list` fails when there are null values in columns.|
-|[#1864](https://github.com/NVIDIA/spark-rapids/issues/1864)|[BUG]HostColumnarToGPU inefficient when only doing count()|
-|[#1862](https://github.com/NVIDIA/spark-rapids/issues/1862)|[BUG] spark 3.2.0-snapshot integration test failed due to conf change|
-|[#1844](https://github.com/NVIDIA/spark-rapids/issues/1844)|[BUG] branch-0.5 nightly IT FAILED on the The mortgage ETL test "Could not read footer for file: file:/xxx/xxx.snappy.parquet"|
-|[#1627](https://github.com/NVIDIA/spark-rapids/issues/1627)|[BUG] GDS exception when restoring spilled buffer|
-|[#1802](https://github.com/NVIDIA/spark-rapids/issues/1802)|[BUG] Many decimal integration test failures for 0.5|
-
-### PRs
-|||
-|:---|:---|
-|[#2326](https://github.com/NVIDIA/spark-rapids/pull/2326)|Update changelog for 0.5.0 release|
-|[#2316](https://github.com/NVIDIA/spark-rapids/pull/2316)|Update doc to note that single quoted json strings are not ok|
-|[#2319](https://github.com/NVIDIA/spark-rapids/pull/2319)|Disable hash partitioning on arrays|
-|[#2318](https://github.com/NVIDIA/spark-rapids/pull/2318)|Fix ColumnarToRowIterator handling of empty batches|
-|[#2304](https://github.com/NVIDIA/spark-rapids/pull/2304)|Update CHANGELOG.md|
-|[#2301](https://github.com/NVIDIA/spark-rapids/pull/2301)|Update doc to reflect nanosleep problem with 460.32.03|
-|[#2298](https://github.com/NVIDIA/spark-rapids/pull/2298)|Update changelog for v0.5.0 release [skip ci]|
-|[#2293](https://github.com/NVIDIA/spark-rapids/pull/2293)|update cudf version to 0.19.2|
-|[#2289](https://github.com/NVIDIA/spark-rapids/pull/2289)|Update docs to warn against 450.80.02 driver with 10.x toolkit|
-|[#2285](https://github.com/NVIDIA/spark-rapids/pull/2285)|Require single batch for full outer join streaming|
-|[#2281](https://github.com/NVIDIA/spark-rapids/pull/2281)|Remove download section for unreleased 0.4.2|
-|[#2264](https://github.com/NVIDIA/spark-rapids/pull/2264)|Add spark312 and spark320 versions of cache serializer|
-|[#2254](https://github.com/NVIDIA/spark-rapids/pull/2254)|updated gcp docs with custom dataproc image instructions|
-|[#2247](https://github.com/NVIDIA/spark-rapids/pull/2247)|Allow specifying a superclass for non-GPU execs|
-|[#2235](https://github.com/NVIDIA/spark-rapids/pull/2235)|Fix distributed cache to read requested schema |
-|[#2261](https://github.com/NVIDIA/spark-rapids/pull/2261)|Make CBO row count test more robust|
-|[#2237](https://github.com/NVIDIA/spark-rapids/pull/2237)|update cudf version to 0.19.1|
-|[#2240](https://github.com/NVIDIA/spark-rapids/pull/2240)|Get the correct 'PIPESTATUS' in bash [skip ci]|
-|[#2242](https://github.com/NVIDIA/spark-rapids/pull/2242)|Add shuffle doc section on the periodicGC configuration|
-|[#2251](https://github.com/NVIDIA/spark-rapids/pull/2251)|Fix issue when out of core sorting nested data types|
-|[#2204](https://github.com/NVIDIA/spark-rapids/pull/2204)|Run nightly tests for ParquetCachedBatchSerializer|
-|[#2245](https://github.com/NVIDIA/spark-rapids/pull/2245)|Fix pivot bug for decimalType|
-|[#2093](https://github.com/NVIDIA/spark-rapids/pull/2093)|Initial implementation of row count estimates in cost-based optimizer|
-|[#2188](https://github.com/NVIDIA/spark-rapids/pull/2188)|Support GPU broadcast exchange reuse to feed CPU BHJ when AQE is enabled|
-|[#2227](https://github.com/NVIDIA/spark-rapids/pull/2227)|ParquetCachedBatchSerializer broadcast AllConfs instead of SQLConf to fix distributed mode|
-|[#2223](https://github.com/NVIDIA/spark-rapids/pull/2223)|Adds subquery aggregate tests from SPARK-31620|
-|[#2222](https://github.com/NVIDIA/spark-rapids/pull/2222)|Remove groupId already specified in parent pom|
-|[#2209](https://github.com/NVIDIA/spark-rapids/pull/2209)|Fixed a few issues with out of core sort|
-|[#2218](https://github.com/NVIDIA/spark-rapids/pull/2218)|Fix incorrect RegExpReplace children handling on Spark 3.1+|
-|[#2207](https://github.com/NVIDIA/spark-rapids/pull/2207)|fix batch size default values in the tuning guide|
-|[#2208](https://github.com/NVIDIA/spark-rapids/pull/2208)|Revert "add nightly cache tests (#2083)"|
-|[#2206](https://github.com/NVIDIA/spark-rapids/pull/2206)|Fix shim301db build|
-|[#2192](https://github.com/NVIDIA/spark-rapids/pull/2192)|Fix index-based access to the head elements|
-|[#2210](https://github.com/NVIDIA/spark-rapids/pull/2210)|Avoid redundant collection conversions|
-|[#2190](https://github.com/NVIDIA/spark-rapids/pull/2190)|JNI fixes for StringWordCount native UDF example|
-|[#2086](https://github.com/NVIDIA/spark-rapids/pull/2086)|Updating documentation for data format support|
-|[#2172](https://github.com/NVIDIA/spark-rapids/pull/2172)|Remove easy unused symbols|
-|[#2089](https://github.com/NVIDIA/spark-rapids/pull/2089)|Update PandasUDF doc|
-|[#2195](https://github.com/NVIDIA/spark-rapids/pull/2195)|fix cudf 0.19.0 download link [skip ci]|
-|[#2175](https://github.com/NVIDIA/spark-rapids/pull/2175)|Branch 0.5 doc update|
-|[#2168](https://github.com/NVIDIA/spark-rapids/pull/2168)|Simplify GpuExpressions w/ withResourceIfAllowed|
-|[#2055](https://github.com/NVIDIA/spark-rapids/pull/2055)|Support PivotFirst|
-|[#2183](https://github.com/NVIDIA/spark-rapids/pull/2183)|GpuParquetScan#readBufferToTable remove dead code|
-|[#2129](https://github.com/NVIDIA/spark-rapids/pull/2129)|Fall back to CPU when parsing two-digit years|
-|[#2083](https://github.com/NVIDIA/spark-rapids/pull/2083)|add nightly cache tests|
-|[#2151](https://github.com/NVIDIA/spark-rapids/pull/2151)|add corresponding close call for HostMemoryOutputStream|
-|[#2169](https://github.com/NVIDIA/spark-rapids/pull/2169)|Work around bug in Spark for integration test|
-|[#2130](https://github.com/NVIDIA/spark-rapids/pull/2130)|Fix divide-by-zero in GpuAverage with ansi mode|
-|[#2149](https://github.com/NVIDIA/spark-rapids/pull/2149)|Auto generate the supported types for the file formats|
-|[#2072](https://github.com/NVIDIA/spark-rapids/pull/2072)|Disable CSV parsing by default and update tests to better show what is left|
-|[#2157](https://github.com/NVIDIA/spark-rapids/pull/2157)|fix merge conflict for 0.4.2 [skip ci]|
-|[#2144](https://github.com/NVIDIA/spark-rapids/pull/2144)|Allow array and struct types to pass thru when doing join|
-|[#2145](https://github.com/NVIDIA/spark-rapids/pull/2145)|Avoid GPU shuffle for round-robin of unsortable types|
-|[#2021](https://github.com/NVIDIA/spark-rapids/pull/2021)|Add in support for murmur3 hashing of structs|
-|[#2128](https://github.com/NVIDIA/spark-rapids/pull/2128)|Add in Partition type check support|
-|[#2116](https://github.com/NVIDIA/spark-rapids/pull/2116)|Add dynamic Spark configuration for Databricks|
-|[#2132](https://github.com/NVIDIA/spark-rapids/pull/2132)|Log plugin and cudf versions on startup|
-|[#2135](https://github.com/NVIDIA/spark-rapids/pull/2135)|Disable Spark 3.2 shim by default|
-|[#2125](https://github.com/NVIDIA/spark-rapids/pull/2125)|enable auto-merge from 0.5 to 0.6 [skip ci]|
-|[#2120](https://github.com/NVIDIA/spark-rapids/pull/2120)|Materialize Stream before serialization|
-|[#2119](https://github.com/NVIDIA/spark-rapids/pull/2119)|Add more comprehensive documentation on supported date formats|
-|[#1717](https://github.com/NVIDIA/spark-rapids/pull/1717)|Decimal32 support|
-|[#2114](https://github.com/NVIDIA/spark-rapids/pull/2114)|Modified the Download page for 0.4.1 and updated doc to point to K8s guide|
-|[#2106](https://github.com/NVIDIA/spark-rapids/pull/2106)|Fix some buffer leaks|
-|[#2097](https://github.com/NVIDIA/spark-rapids/pull/2097)|fix the bound row project empty issue in row frame|
-|[#2099](https://github.com/NVIDIA/spark-rapids/pull/2099)|Remove verbose log prints to make the build/test log clean|
-|[#2105](https://github.com/NVIDIA/spark-rapids/pull/2105)|Cleanup prior Spark sessions in tests consistently|
-|[#2104](https://github.com/NVIDIA/spark-rapids/pull/2104)| Clone apache spark source code to parse the git commit IDs|
-|[#2095](https://github.com/NVIDIA/spark-rapids/pull/2095)|fix refcount when materializing device buffer from GDS|
-|[#2100](https://github.com/NVIDIA/spark-rapids/pull/2100)|[BUG] add wget for fetching conda [skip ci]|
-|[#2096](https://github.com/NVIDIA/spark-rapids/pull/2096)|Adjust images for integration tests|
-|[#2094](https://github.com/NVIDIA/spark-rapids/pull/2094)|Changed name of parquet files for Mortgage ETL Integration test|
-|[#2035](https://github.com/NVIDIA/spark-rapids/pull/2035)|Accelerate data transfer for map Pandas UDF plan|
-|[#2050](https://github.com/NVIDIA/spark-rapids/pull/2050)|stream shuffle buffers from GDS to UCX|
-|[#2084](https://github.com/NVIDIA/spark-rapids/pull/2084)|Enable ORC write by default|
-|[#2088](https://github.com/NVIDIA/spark-rapids/pull/2088)|Upgrade ScalaTest plugin to respect JAVA_HOME|
-|[#1932](https://github.com/NVIDIA/spark-rapids/pull/1932)|Create a getting started on K8s page|
-|[#2080](https://github.com/NVIDIA/spark-rapids/pull/2080)|Improve error message after failed RMM shutdown|
-|[#2064](https://github.com/NVIDIA/spark-rapids/pull/2064)|Optimize unnecessary columnar->row->columnar transitions with AQE|
-|[#2025](https://github.com/NVIDIA/spark-rapids/pull/2025)|Update the doc for pandas udf on databricks|
-|[#2059](https://github.com/NVIDIA/spark-rapids/pull/2059)|Add the flag 'TEST_TYPE' to avoid integration tests silently skipping some test cases|
-|[#2075](https://github.com/NVIDIA/spark-rapids/pull/2075)|Remove debug println from CBO test|
-|[#2046](https://github.com/NVIDIA/spark-rapids/pull/2046)|support casting Decimal to String|
-|[#1812](https://github.com/NVIDIA/spark-rapids/pull/1812)|allow spilled buffers to be unspilled|
-|[#2061](https://github.com/NVIDIA/spark-rapids/pull/2061)|Run the pandas udf using cudf on Databricks|
-|[#1893](https://github.com/NVIDIA/spark-rapids/pull/1893)|Plug-in support for get_json_object|
-|[#2044](https://github.com/NVIDIA/spark-rapids/pull/2044)|Use partition for GPU hash partitioning|
-|[#1954](https://github.com/NVIDIA/spark-rapids/pull/1954)|Fix CBO bug where incompatible plans were produced with AQE on|
-|[#2049](https://github.com/NVIDIA/spark-rapids/pull/2049)|Remove incompatable int overflow checking|
-|[#2056](https://github.com/NVIDIA/spark-rapids/pull/2056)|Remove Spark 3.2 from premerge and nightly CI run|
-|[#1814](https://github.com/NVIDIA/spark-rapids/pull/1814)|Struct to string casting functionality|
-|[#2037](https://github.com/NVIDIA/spark-rapids/pull/2037)|Fix warnings from use of deprecated cudf methods|
-|[#2033](https://github.com/NVIDIA/spark-rapids/pull/2033)|Bump up pre-merge OS from ubuntu 16 to ubuntu 18 [skip ci]|
-|[#1883](https://github.com/NVIDIA/spark-rapids/pull/1883)|Enable sort for single-level nesting struct columns on GPU|
-|[#2016](https://github.com/NVIDIA/spark-rapids/pull/2016)|Refactor logic for parallel testing|
-|[#2022](https://github.com/NVIDIA/spark-rapids/pull/2022)|Update order by to not load native libraries when sorting|
-|[#2017](https://github.com/NVIDIA/spark-rapids/pull/2017)|Add in murmur3 support for float, double, date and timestamp|
-|[#1981](https://github.com/NVIDIA/spark-rapids/pull/1981)|Fix GpuSize|
-|[#1999](https://github.com/NVIDIA/spark-rapids/pull/1999)|support casting string to decimal|
-|[#2006](https://github.com/NVIDIA/spark-rapids/pull/2006)|Enable windowed `collect_list` by default|
-|[#2000](https://github.com/NVIDIA/spark-rapids/pull/2000)|Use Spark's HybridRowQueue to avoid MemoryConsumer API shim|
-|[#2015](https://github.com/NVIDIA/spark-rapids/pull/2015)|Fix bug where rkey buffer is getting advanced after the first handshake|
-|[#2007](https://github.com/NVIDIA/spark-rapids/pull/2007)|Fix unknown column name error when filtering ORC file with no names|
-|[#2005](https://github.com/NVIDIA/spark-rapids/pull/2005)|Update to new is_before_spark_311 function name|
-|[#1944](https://github.com/NVIDIA/spark-rapids/pull/1944)|Support running scalar pandas UDF with array type.|
-|[#1991](https://github.com/NVIDIA/spark-rapids/pull/1991)|Fixes creation of invalid DecimalType in GpuDivide.tagExprForGpu|
-|[#1958](https://github.com/NVIDIA/spark-rapids/pull/1958)|Support legacy behavior of parameterless count |
-|[#1919](https://github.com/NVIDIA/spark-rapids/pull/1919)|Add support for Structs for UnionExec|
-|[#2002](https://github.com/NVIDIA/spark-rapids/pull/2002)|Pass dirs to getBlockData for a wrapped shuffle resolver|
-|[#1983](https://github.com/NVIDIA/spark-rapids/pull/1983)|document building against different CUDA Toolkit versions|
-|[#1994](https://github.com/NVIDIA/spark-rapids/pull/1994)|Merge 0.4 to 0.5 [skip ci]|
-|[#1982](https://github.com/NVIDIA/spark-rapids/pull/1982)|Update ORC pushdown filter building to latest Spark logic|
-|[#1978](https://github.com/NVIDIA/spark-rapids/pull/1978)|Add audit script to list commits from Spark|
-|[#1976](https://github.com/NVIDIA/spark-rapids/pull/1976)|Temp fix for parquet write changes|
-|[#1970](https://github.com/NVIDIA/spark-rapids/pull/1970)|add maven profiles for supported CUDA versions|
-|[#1951](https://github.com/NVIDIA/spark-rapids/pull/1951)|Branch 0.5 doc remove numpartitions|
-|[#1967](https://github.com/NVIDIA/spark-rapids/pull/1967)|Update FAQ for Dataset API and format supported versions|
-|[#1972](https://github.com/NVIDIA/spark-rapids/pull/1972)|support GpuSize|
-|[#1966](https://github.com/NVIDIA/spark-rapids/pull/1966)|add xml report for codecov|
-|[#1955](https://github.com/NVIDIA/spark-rapids/pull/1955)|Fix typo in Arrow optimization config|
-|[#1956](https://github.com/NVIDIA/spark-rapids/pull/1956)|Fix NPE in plugin shutdown|
-|[#1930](https://github.com/NVIDIA/spark-rapids/pull/1930)|Relax cudf version check for patch-level versions|
-|[#1787](https://github.com/NVIDIA/spark-rapids/pull/1787)|support distributed file path in cloud environment|
-|[#1961](https://github.com/NVIDIA/spark-rapids/pull/1961)|change premege GPU_TYPE from secret to global env [skip ci]|
-|[#1957](https://github.com/NVIDIA/spark-rapids/pull/1957)|Update Spark 3.1.2 shim for float upcast behavior|
-|[#1889](https://github.com/NVIDIA/spark-rapids/pull/1889)|Decimal DIV changes |
-|[#1947](https://github.com/NVIDIA/spark-rapids/pull/1947)|Move doc of Pandas UDF to additional-functionality|
-|[#1938](https://github.com/NVIDIA/spark-rapids/pull/1938)|Add spark.executor.resource.gpu.amount=1 to YARN and K8s docs|
-|[#1937](https://github.com/NVIDIA/spark-rapids/pull/1937)|Fix merge conflict with branch-0.4|
-|[#1878](https://github.com/NVIDIA/spark-rapids/pull/1878)|spillable cache for GpuCartesianRDD|
-|[#1843](https://github.com/NVIDIA/spark-rapids/pull/1843)|Refactor GpuGenerateExec and Explode|
-|[#1933](https://github.com/NVIDIA/spark-rapids/pull/1933)|Split DB scripts to make them common for the build and IT pipeline|
-|[#1935](https://github.com/NVIDIA/spark-rapids/pull/1935)|Update Alias SQL quoting and float-to-timestamp casting to match Spark 3.2|
-|[#1926](https://github.com/NVIDIA/spark-rapids/pull/1926)|Consolidate RAT settings in parent pom|
-|[#1918](https://github.com/NVIDIA/spark-rapids/pull/1918)|Minor code cleanup in datetimeExpressions|
-|[#1906](https://github.com/NVIDIA/spark-rapids/pull/1906)|Remove get call on timeZoneId|
-|[#1908](https://github.com/NVIDIA/spark-rapids/pull/1908)|Remove the Scala version of Mortgage ETL tests from nightly test|
-|[#1894](https://github.com/NVIDIA/spark-rapids/pull/1894)|Modified Download Page to re-order the items and change the format of download links|
-|[#1909](https://github.com/NVIDIA/spark-rapids/pull/1909)|Avoid pinned memory for shuffle host buffers|
-|[#1891](https://github.com/NVIDIA/spark-rapids/pull/1891)|Connect UCX endpoints early during app startup|
-|[#1877](https://github.com/NVIDIA/spark-rapids/pull/1877)|remove docker build in pre-merge [skip ci]|
-|[#1830](https://github.com/NVIDIA/spark-rapids/pull/1830)|Enable the tests for collect over window.|
-|[#1882](https://github.com/NVIDIA/spark-rapids/pull/1882)|GpuArrowColumnarBatchBuilder retains references to ArrowBuf until HostToGpuCoalesceIterator puts them onto the device|
-|[#1868](https://github.com/NVIDIA/spark-rapids/pull/1868)|Increase row limit when doing count() for HostColumnarToGpu |
-|[#1855](https://github.com/NVIDIA/spark-rapids/pull/1855)|Expose row count statistics in GpuShuffleExchangeExec|
-|[#1875](https://github.com/NVIDIA/spark-rapids/pull/1875)|Fix merge conflict with branch-0.4|
-|[#1841](https://github.com/NVIDIA/spark-rapids/pull/1841)|Add in support for DateAddInterval|
-|[#1869](https://github.com/NVIDIA/spark-rapids/pull/1869)|Fix tests for Spark 3.2.0 shim|
-|[#1858](https://github.com/NVIDIA/spark-rapids/pull/1858)|fix shuffle manager doc on ucx library path|
-|[#1836](https://github.com/NVIDIA/spark-rapids/pull/1836)|Add shim for Spark 3.1.2|
-|[#1852](https://github.com/NVIDIA/spark-rapids/pull/1852)|Fix Part Suite Tests|
-|[#1616](https://github.com/NVIDIA/spark-rapids/pull/1616)|Cost-based optimizer|
-|[#1834](https://github.com/NVIDIA/spark-rapids/pull/1834)|Add shim for Spark 3.0.3|
-|[#1839](https://github.com/NVIDIA/spark-rapids/pull/1839)|Refactor join code to reduce duplicated code|
-|[#1848](https://github.com/NVIDIA/spark-rapids/pull/1848)|Fix merge conflict with branch-0.4|
-|[#1796](https://github.com/NVIDIA/spark-rapids/pull/1796)|Have most of range partitioning run on the GPU|
-|[#1845](https://github.com/NVIDIA/spark-rapids/pull/1845)|Fix failures in the mortgage ETL test|
-|[#1829](https://github.com/NVIDIA/spark-rapids/pull/1829)|Cleanup unused Jenkins files and scripts|
-|[#1704](https://github.com/NVIDIA/spark-rapids/pull/1704)|Create a shim for Spark 3.2.0 development|
-|[#1838](https://github.com/NVIDIA/spark-rapids/pull/1838)|Make databricks build.sh more convenient for dev|
-|[#1835](https://github.com/NVIDIA/spark-rapids/pull/1835)|Fix merge conflict with branch-0.4|
-|[#1808](https://github.com/NVIDIA/spark-rapids/pull/1808)|Update mortgage tests to support reading multiple dataset formats|
-|[#1822](https://github.com/NVIDIA/spark-rapids/pull/1822)|Fix conflict 0.4 to 0.5|
-|[#1807](https://github.com/NVIDIA/spark-rapids/pull/1807)|Fix merge conflict between branch-0.4 and branch-0.5|
-|[#1788](https://github.com/NVIDIA/spark-rapids/pull/1788)|Spill metrics everywhere|
-|[#1719](https://github.com/NVIDIA/spark-rapids/pull/1719)|Add in out of core sort|
-|[#1728](https://github.com/NVIDIA/spark-rapids/pull/1728)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
-|[#1689](https://github.com/NVIDIA/spark-rapids/pull/1689)|Update docs for plugin 0.5.0-SNAPSHOT and cudf 0.19-SNAPSHOT|
-|[#1682](https://github.com/NVIDIA/spark-rapids/pull/1682)|init CI/CD dependencies branch-0.5|
-
-## Release 0.4.1
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#1985](https://github.com/NVIDIA/spark-rapids/issues/1985)|[BUG] broadcast exchange can fail on 0.4|
-
-### PRs
-|||
-|:---|:---|
-|[#1995](https://github.com/NVIDIA/spark-rapids/pull/1995)|update changelog 0.4.1 [skip ci]|
-|[#1990](https://github.com/NVIDIA/spark-rapids/pull/1990)|Prepare for v0.4.1 release|
-|[#1988](https://github.com/NVIDIA/spark-rapids/pull/1988)|broadcast exchange can fail when job group set|
-
-## Release 0.4
-
-### Features
-|||
-|:---|:---|
-|[#1773](https://github.com/NVIDIA/spark-rapids/issues/1773)|[FEA] Spark 3.0.2 release support|
-|[#80](https://github.com/NVIDIA/spark-rapids/issues/80)|[FEA] Support the struct SQL function|
-|[#76](https://github.com/NVIDIA/spark-rapids/issues/76)|[FEA] Support CreateArray|
-|[#1635](https://github.com/NVIDIA/spark-rapids/issues/1635)|[FEA] RAPIDS accelerated Java UDF|
-|[#1333](https://github.com/NVIDIA/spark-rapids/issues/1333)|[FEA] Support window operations on Decimal|
-|[#1419](https://github.com/NVIDIA/spark-rapids/issues/1419)|[FEA] Support GPU accelerated UDF alternative for higher order function "aggregate" over window|
-|[#1580](https://github.com/NVIDIA/spark-rapids/issues/1580)|[FEA] Support Decimal for ParquetCachedBatchSerializer|
-|[#1600](https://github.com/NVIDIA/spark-rapids/issues/1600)|[FEA] Support ScalarSubquery|
-|[#1072](https://github.com/NVIDIA/spark-rapids/issues/1072)|[FEA] Support for a custom DataSource V2 which supplies Arrow data|
-|[#906](https://github.com/NVIDIA/spark-rapids/issues/906)|[FEA] Clarify query explanation to directly state what will run on GPU|
-|[#1335](https://github.com/NVIDIA/spark-rapids/issues/1335)|[FEA] Support CollectLimitExec for decimal|
-|[#1485](https://github.com/NVIDIA/spark-rapids/issues/1485)|[FEA] Decimal Support for Parquet Write|
-|[#1329](https://github.com/NVIDIA/spark-rapids/issues/1329)|[FEA] Decimal support for multiply, int div, add, subtract and null-safe equals|
-|[#1351](https://github.com/NVIDIA/spark-rapids/issues/1351)|[FEA] Execute UDFs that provide a RAPIDS execution path|
-|[#1330](https://github.com/NVIDIA/spark-rapids/issues/1330)|[FEA] Support Decimal Casts|
-|[#1353](https://github.com/NVIDIA/spark-rapids/issues/1353)|[FEA] Example of RAPIDS UDF using custom GPU code|
-|[#1487](https://github.com/NVIDIA/spark-rapids/issues/1487)|[FEA] Change spark 3.1.0 to 3.1.1|
-|[#1334](https://github.com/NVIDIA/spark-rapids/issues/1334)|[FEA] Add support for count aggregate on decimal|
-|[#1325](https://github.com/NVIDIA/spark-rapids/issues/1325)|[FEA] Add in join support for decimal|
-|[#1326](https://github.com/NVIDIA/spark-rapids/issues/1326)|[FEA] Add in Broadcast support for decimal values|
-|[#37](https://github.com/NVIDIA/spark-rapids/issues/37)|[FEA] round and bround SQL functions|
-|[#78](https://github.com/NVIDIA/spark-rapids/issues/78)|[FEA] Support CreateNamedStruct function|
-|[#1331](https://github.com/NVIDIA/spark-rapids/issues/1331)|[FEA] UnionExec and ExpandExec support for decimal|
-|[#1332](https://github.com/NVIDIA/spark-rapids/issues/1332)|[FEA] Support CaseWhen, Coalesce and IfElse for decimal|
-|[#937](https://github.com/NVIDIA/spark-rapids/issues/937)|[FEA] have murmur3 hash function that matches exactly with spark|
-|[#1324](https://github.com/NVIDIA/spark-rapids/issues/1324)|[FEA] Support Parquet Read of Decimal FIXED_LENGTH_BYTE_ARRAY|
-|[#1428](https://github.com/NVIDIA/spark-rapids/issues/1428)|[FEA] Add support for unary decimal operations abs, floor, ceil, unary - and unary +|
-|[#1375](https://github.com/NVIDIA/spark-rapids/issues/1375)|[FEA] Add log statement for what the concurrentGpuTasks tasks is set to on executor startup|
-|[#1352](https://github.com/NVIDIA/spark-rapids/issues/1352)|[FEA] Example of RAPIDS UDF using cudf Java APIs|
-|[#1328](https://github.com/NVIDIA/spark-rapids/issues/1328)|[FEA] Support sorting and shuffle of decimal|
-|[#1316](https://github.com/NVIDIA/spark-rapids/issues/1316)|[FEA] Support simple DECIMAL aggregates|
-
-### Performance
-|||
-|:---|:---|
-|[#1435](https://github.com/NVIDIA/spark-rapids/issues/1435)|[FEA] Improve file reading by using local file caching|
-|[#1738](https://github.com/NVIDIA/spark-rapids/issues/1738)|[FEA] Reduce regex usage in CAST string to date/timestamp|
-|[#987](https://github.com/NVIDIA/spark-rapids/issues/987)|[FEA] Optimize CAST from string to temporal types by using cuDF is_timestamp function|
-|[#1594](https://github.com/NVIDIA/spark-rapids/issues/1594)|[FEA] RAPIDS accelerated ScalaUDF|
-|[#103](https://github.com/NVIDIA/spark-rapids/issues/103)|[FEA] GPU version of TakeOrderedAndProject|
-|[#1024](https://github.com/NVIDIA/spark-rapids/issues/1024)|Cleanup RAPIDS transport calls to `receive`|
-|[#1366](https://github.com/NVIDIA/spark-rapids/issues/1366)|Seeing performance differences of multi-threaded/coalesce/perfile Parquet reader type for a single file|
-|[#1200](https://github.com/NVIDIA/spark-rapids/issues/1200)|[FEA] Accelerate the scan speed for coalescing parquet reader when reading files from multiple partitioned folders|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#1885](https://github.com/NVIDIA/spark-rapids/issues/1885)|[BUG] natural join on string key results in a data frame with spurious NULLs|
-|[#1785](https://github.com/NVIDIA/spark-rapids/issues/1785)|[BUG] Rapids pytest integration tests FAILED on Yarn cluster with unrecognized arguments: `--std_input_path=src/test/resources/`|
-|[#999](https://github.com/NVIDIA/spark-rapids/issues/999)|[BUG] test_multi_types_window_aggs_for_rows_lead_lag fails against Spark 3.1.0|
-|[#1818](https://github.com/NVIDIA/spark-rapids/issues/1818)|[BUG] unmoored doc comment warnings in GpuCast|
-|[#1817](https://github.com/NVIDIA/spark-rapids/issues/1817)|[BUG] Developer build with local modifications fails during verify phase|
-|[#1644](https://github.com/NVIDIA/spark-rapids/issues/1644)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks|
-|[#1771](https://github.com/NVIDIA/spark-rapids/issues/1771)|[BUG] Databricks AWS CI/CD failing to create cluster|
-|[#1157](https://github.com/NVIDIA/spark-rapids/issues/1157)|[BUG] Fix regression supporting to_date on GPU with Spark 3.1.0|
-|[#716](https://github.com/NVIDIA/spark-rapids/issues/716)|[BUG] Cast String to TimeStamp issues|
-|[#1117](https://github.com/NVIDIA/spark-rapids/issues/1117)|[BUG] CAST string to date returns wrong values for dates with out-of-range values|
-|[#1670](https://github.com/NVIDIA/spark-rapids/issues/1670)|[BUG] Some TPC-DS queries fail with AQE when decimal types enabled|
-|[#1730](https://github.com/NVIDIA/spark-rapids/issues/1730)|[BUG] Range Partitioning can crash when processing is in the order-by|
-|[#1726](https://github.com/NVIDIA/spark-rapids/issues/1726)|[BUG] java url decode test failing on databricks, emr, and dataproc|
-|[#1651](https://github.com/NVIDIA/spark-rapids/issues/1651)|[BUG] GDS exception when writing shuffle file|
-|[#1702](https://github.com/NVIDIA/spark-rapids/issues/1702)|[BUG] check all tests marked xfail for Spark 3.1.1|
-|[#575](https://github.com/NVIDIA/spark-rapids/issues/575)|[BUG] Spark 3.1 join_test.py::test_broadcast_join_mixed[FullOuter][IGNORE_ORDER] failed|
-|[#577](https://github.com/NVIDIA/spark-rapids/issues/577)|[BUG] Spark 3.1 log arithmetic functions fail|
-|[#1541](https://github.com/NVIDIA/spark-rapids/issues/1541)|[BUG] Tests fail in integration in distributed mode after allowing nested types through in sort and shuffle|
-|[#1626](https://github.com/NVIDIA/spark-rapids/issues/1626)|[BUG] TPC-DS-like query 77 at scale=3TB fails with maxResultSize exceeded error|
-|[#1576](https://github.com/NVIDIA/spark-rapids/issues/1576)|[BUG] loading SPARK-32639 example parquet file triggers a JVM crash |
-|[#1643](https://github.com/NVIDIA/spark-rapids/issues/1643)|[BUG] TPC-DS-Like q10, q35, and q69 - slow or hanging at leftSemiJoin|
-|[#1650](https://github.com/NVIDIA/spark-rapids/issues/1650)|[BUG] BenchmarkRunner does not include query name in JSON summary filename when running multiple queries|
-|[#1654](https://github.com/NVIDIA/spark-rapids/issues/1654)|[BUG] TPC-DS-like query 59 at scale=3TB with AQE fails with join mismatch|
-|[#1274](https://github.com/NVIDIA/spark-rapids/issues/1274)|[BUG] OutOfMemoryError - Maximum pool size exceeded while running 24 day criteo ETL Transform stage|
-|[#1497](https://github.com/NVIDIA/spark-rapids/issues/1497)|[BUG] Spark-rapids v0.3.0 pytest integration tests with UCX enabled failed on Yarn cluster|
-|[#1534](https://github.com/NVIDIA/spark-rapids/issues/1534)|[BUG] Spark 3.1.1 test failure in writing due to removal of InMemoryFileIndex.shouldFilterOut|
-|[#1155](https://github.com/NVIDIA/spark-rapids/issues/1155)|[BUG] on shutdown don't print `Socket closed` exception when shutting down UCX.scala|
-|[#1510](https://github.com/NVIDIA/spark-rapids/issues/1510)|[BUG] IllegalArgumentException during shuffle|
-|[#1513](https://github.com/NVIDIA/spark-rapids/issues/1513)|[BUG] executor not fully initialized may get calls from Spark, in the process setting the `catalog` incorrectly|
-|[#1466](https://github.com/NVIDIA/spark-rapids/issues/1466)|[BUG] Databricks build must run before the rapids nightly|
-|[#1456](https://github.com/NVIDIA/spark-rapids/issues/1456)|[BUG] Databricks 0.4 parquet integration tests fail|
-|[#1400](https://github.com/NVIDIA/spark-rapids/issues/1400)|[BUG] Regressions in spark-shell usage of benchmark utilities|
-|[#1119](https://github.com/NVIDIA/spark-rapids/issues/1119)|[BUG] inner join fails with Column size cannot be negative|
-|[#1079](https://github.com/NVIDIA/spark-rapids/issues/1079)|[BUG] The Scala UDF function cannot invoke the UDF compiler when it's passed to "explode"|
-|[#1298](https://github.com/NVIDIA/spark-rapids/issues/1298)|TPCxBB query16 failed at UnsupportedOperationException: org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainIntegerDictionary|
-|[#1271](https://github.com/NVIDIA/spark-rapids/issues/1271)|[BUG] CastOpSuite and AnsiCastOpSuite failing with ArithmeticException on Spark 3.1|
-|[#84](https://github.com/NVIDIA/spark-rapids/issues/84)|[BUG] sort does not match spark for -0.0 and 0.0|
-|[#578](https://github.com/NVIDIA/spark-rapids/issues/578)|[BUG] Spark 3.1 qa_nightly_select_test.py Full join test failures|
-|[#586](https://github.com/NVIDIA/spark-rapids/issues/586)|[BUG] Spark3.1 tpch failures|
-|[#837](https://github.com/NVIDIA/spark-rapids/issues/837)|[BUG] Distinct count of floating point values differs with regular spark|
-|[#953](https://github.com/NVIDIA/spark-rapids/issues/953)|[BUG] 3.1.0 pos_explode tests are failing|
-|[#127](https://github.com/NVIDIA/spark-rapids/issues/127)|[BUG] String CSV parsing does not respect nullValues|
-|[#1203](https://github.com/NVIDIA/spark-rapids/issues/1203)|[BUG] tpcds query 51 fails with join error on Spark 3.1.0|
-|[#750](https://github.com/NVIDIA/spark-rapids/issues/750)|[BUG] udf_cudf_test::test_with_column fails with IPC error |
-|[#1348](https://github.com/NVIDIA/spark-rapids/issues/1348)|[BUG] Host columnar decimal conversions are failing|
-|[#1270](https://github.com/NVIDIA/spark-rapids/issues/1270)|[BUG] Benchmark runner fails to produce report if benchmark fails due to an invalid query plan|
-|[#1179](https://github.com/NVIDIA/spark-rapids/issues/1179)|[BUG] SerializeConcatHostBuffersDeserializeBatch may have thread issues|
-|[#1115](https://github.com/NVIDIA/spark-rapids/issues/1115)|[BUG] Unchecked type warning in SparkQueryCompareTestSuite|
-
-### PRs
-|||
-|:---|:---|
-|[#1963](https://github.com/NVIDIA/spark-rapids/pull/1963)|Update changelog 0.4 [skip ci]|
-|[#1960](https://github.com/NVIDIA/spark-rapids/pull/1960)|Replace sonatype staging link with maven central link|
-|[#1945](https://github.com/NVIDIA/spark-rapids/pull/1945)|Update changelog 0.4 [skip ci]|
-|[#1910](https://github.com/NVIDIA/spark-rapids/pull/1910)|Make hash partitioning match CPU|
-|[#1927](https://github.com/NVIDIA/spark-rapids/pull/1927)|Change cuDF dependency to 0.18.1|
-|[#1934](https://github.com/NVIDIA/spark-rapids/pull/1934)|Update documentation to use cudf version 0.18.1|
-|[#1871](https://github.com/NVIDIA/spark-rapids/pull/1871)|Disable coalesce batch spilling to avoid cudf contiguous_split bug|
-|[#1849](https://github.com/NVIDIA/spark-rapids/pull/1849)|Update changelog for 0.4|
-|[#1744](https://github.com/NVIDIA/spark-rapids/pull/1744)|Fix NullPointerException on null partition insert|
-|[#1842](https://github.com/NVIDIA/spark-rapids/pull/1842)|Update to note support for 3.0.2|
-|[#1832](https://github.com/NVIDIA/spark-rapids/pull/1832)|Spark 3.1.1 shim no longer a snapshot shim|
-|[#1831](https://github.com/NVIDIA/spark-rapids/pull/1831)|Spark 3.0.2 shim no longer a snapshot shim|
-|[#1826](https://github.com/NVIDIA/spark-rapids/pull/1826)|Remove benchmarks|
-|[#1828](https://github.com/NVIDIA/spark-rapids/pull/1828)|Update cudf dependency to 0.18|
-|[#1813](https://github.com/NVIDIA/spark-rapids/pull/1813)|Fix LEAD/LAG failures in Spark 3.1.1|
-|[#1819](https://github.com/NVIDIA/spark-rapids/pull/1819)|Fix scaladoc warning in GpuCast|
-|[#1820](https://github.com/NVIDIA/spark-rapids/pull/1820)|[BUG] make modified check pre-merge only|
-|[#1780](https://github.com/NVIDIA/spark-rapids/pull/1780)|Remove SNAPSHOT from test and integration_test READMEs|
-|[#1809](https://github.com/NVIDIA/spark-rapids/pull/1809)|check if modified files after update_config/supported|
-|[#1804](https://github.com/NVIDIA/spark-rapids/pull/1804)|Update UCX documentation for RX_QUEUE_LEN and Docker|
-|[#1810](https://github.com/NVIDIA/spark-rapids/pull/1810)|Pandas UDF: Sort the data before computing the sum.|
-|[#1751](https://github.com/NVIDIA/spark-rapids/pull/1751)|Exclude foldable expressions from GPU if constant folding is disabled|
-|[#1798](https://github.com/NVIDIA/spark-rapids/pull/1798)|Add documentation about explain not on GPU when AQE is on|
-|[#1766](https://github.com/NVIDIA/spark-rapids/pull/1766)|Branch 0.4 release docs|
-|[#1794](https://github.com/NVIDIA/spark-rapids/pull/1794)|Build python output schema from udf expressions|
-|[#1783](https://github.com/NVIDIA/spark-rapids/pull/1783)|Fix the collect_list over window tests failures on db|
-|[#1781](https://github.com/NVIDIA/spark-rapids/pull/1781)|Better float/double cases for casting tests|
-|[#1790](https://github.com/NVIDIA/spark-rapids/pull/1790)|Record row counts in benchmark runs that call collect|
-|[#1779](https://github.com/NVIDIA/spark-rapids/pull/1779)|Add support of DateType and TimestampType for GetTimestamp expression|
-|[#1768](https://github.com/NVIDIA/spark-rapids/pull/1768)|Updating getting started Databricks docs|
-|[#1742](https://github.com/NVIDIA/spark-rapids/pull/1742)|Fix regression supporting to_date with Spark-3.1|
-|[#1775](https://github.com/NVIDIA/spark-rapids/pull/1775)|Fix ambiguous ordering for some tests|
-|[#1760](https://github.com/NVIDIA/spark-rapids/pull/1760)|Update GpuDataSourceScanExec and GpuBroadcastExchangeExec to fix audit issues|
-|[#1750](https://github.com/NVIDIA/spark-rapids/pull/1750)|Detect task failures in benchmarks|
-|[#1767](https://github.com/NVIDIA/spark-rapids/pull/1767)|Consistent Spark version for test and production|
-|[#1741](https://github.com/NVIDIA/spark-rapids/pull/1741)|Reduce regex use in CAST|
-|[#1756](https://github.com/NVIDIA/spark-rapids/pull/1756)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
-|[#1716](https://github.com/NVIDIA/spark-rapids/pull/1716)|Update RapidsShuffleManager documentation for branch 0.4|
-|[#1740](https://github.com/NVIDIA/spark-rapids/pull/1740)|Disable ORC writes until bug can be fixed|
-|[#1747](https://github.com/NVIDIA/spark-rapids/pull/1747)|Fix resource leaks in unit tests|
-|[#1725](https://github.com/NVIDIA/spark-rapids/pull/1725)|Branch 0.4 FAQ reorg|
-|[#1718](https://github.com/NVIDIA/spark-rapids/pull/1718)|CAST string to temporal type now calls isTimestamp|
-|[#1734](https://github.com/NVIDIA/spark-rapids/pull/1734)|Disable range partitioning if computation is needed|
-|[#1723](https://github.com/NVIDIA/spark-rapids/pull/1723)|Removed StructTypes support for ParquetCachedBatchSerializer as cudf doesn't support it yet|
-|[#1714](https://github.com/NVIDIA/spark-rapids/pull/1714)|Add support for RAPIDS accelerated Java UDFs|
-|[#1713](https://github.com/NVIDIA/spark-rapids/pull/1713)|Call GpuDeviceManager.shutdown when the executor plugin is shutting down|
-|[#1596](https://github.com/NVIDIA/spark-rapids/pull/1596)|Added in Decimal support to ParquetCachedBatchSerializer|
-|[#1706](https://github.com/NVIDIA/spark-rapids/pull/1706)|cleanup unused is_before_spark_310|
-|[#1685](https://github.com/NVIDIA/spark-rapids/pull/1685)|Fix CustomShuffleReader replacement when decimal types enabled|
-|[#1699](https://github.com/NVIDIA/spark-rapids/pull/1699)|Add docs about Spark 3.1 in standalone modes not needing extra class path|
-|[#1701](https://github.com/NVIDIA/spark-rapids/pull/1701)|remove xfail for orc test_input_meta for spark 3.1.0|
-|[#1703](https://github.com/NVIDIA/spark-rapids/pull/1703)|Remove xfail for spark 3.1.0 test_broadcast_join_mixed FullOuter|
-|[#1676](https://github.com/NVIDIA/spark-rapids/pull/1676)|BenchmarkRunner option to generate query plan diagrams in DOT format|
-|[#1695](https://github.com/NVIDIA/spark-rapids/pull/1695)|support alternate jar paths|
-|[#1694](https://github.com/NVIDIA/spark-rapids/pull/1694)|increase mem and limit parallelism for pre-merge|
-|[#1691](https://github.com/NVIDIA/spark-rapids/pull/1691)|add validate_execs_in_gpu_plan to pytest.ini|
-|[#1692](https://github.com/NVIDIA/spark-rapids/pull/1692)|Add the integration test resources to the test tarball|
-|[#1677](https://github.com/NVIDIA/spark-rapids/pull/1677)|When PTDS is enabled, print warning if the allocator is not ARENA|
-|[#1683](https://github.com/NVIDIA/spark-rapids/pull/1683)|update changelog to verify automerge 0.5 setup [skip ci]|
-|[#1673](https://github.com/NVIDIA/spark-rapids/pull/1673)|support auto-merge for branch 0.5 [skip ci]|
-|[#1681](https://github.com/NVIDIA/spark-rapids/pull/1681)|Xfail the collect_list tests for databricks|
-|[#1678](https://github.com/NVIDIA/spark-rapids/pull/1678)|Fix array/struct checks in Sort and HashAggregate and sorting tests in distributed mode|
-|[#1671](https://github.com/NVIDIA/spark-rapids/pull/1671)|Allow metrics to be configurable by level|
-|[#1675](https://github.com/NVIDIA/spark-rapids/pull/1675)|add run_pyspark_from_build.sh to the pytest distribution tarball|
-|[#1548](https://github.com/NVIDIA/spark-rapids/pull/1548)|Support executing collect_list on GPU with windowing.|
-|[#1593](https://github.com/NVIDIA/spark-rapids/pull/1593)|Avoid unnecessary Table instances after contiguous split|
-|[#1592](https://github.com/NVIDIA/spark-rapids/pull/1592)|Add in support for Decimal divide|
-|[#1668](https://github.com/NVIDIA/spark-rapids/pull/1668)|Implement way for python integration tests to validate Exec is in GPU plan|
-|[#1669](https://github.com/NVIDIA/spark-rapids/pull/1669)|Add FAQ entries for executor-per-GPU questions|
-|[#1661](https://github.com/NVIDIA/spark-rapids/pull/1661)|Enable Parquet test for file containing map struct key|
-|[#1664](https://github.com/NVIDIA/spark-rapids/pull/1664)|Filter nulls for left semi and left anti join to work around cudf|
-|[#1665](https://github.com/NVIDIA/spark-rapids/pull/1665)|Add better automated tests for Arrow columnar copy in HostColumnarToGpu|
-|[#1614](https://github.com/NVIDIA/spark-rapids/pull/1614)|add alluxio getting started document|
-|[#1639](https://github.com/NVIDIA/spark-rapids/pull/1639)|support GpuScalarSubquery|
-|[#1656](https://github.com/NVIDIA/spark-rapids/pull/1656)|Move UDF to Catalyst Expressions to its own document|
-|[#1663](https://github.com/NVIDIA/spark-rapids/pull/1663)|BenchmarkRunner - Include query name in JSON summary filename|
-|[#1655](https://github.com/NVIDIA/spark-rapids/pull/1655)|Fix extraneous shuffles added by AQE|
-|[#1652](https://github.com/NVIDIA/spark-rapids/pull/1652)|Fix typo in arrow optimized config name - spark.rapids.arrowCopyOptimizationEnabled|
-|[#1645](https://github.com/NVIDIA/spark-rapids/pull/1645)|Run Databricks IT with python-xdist parallel, includes test fixes and xfail|
-|[#1649](https://github.com/NVIDIA/spark-rapids/pull/1649)|Move building from source docs to contributing guide|
-|[#1637](https://github.com/NVIDIA/spark-rapids/pull/1637)|Fail DivModLike on zero divisor in ANSI mode|
-|[#1646](https://github.com/NVIDIA/spark-rapids/pull/1646)|Update links in rapids-udfs.md after moving to subfolder|
-|[#1641](https://github.com/NVIDIA/spark-rapids/pull/1641)|Xfail struct and array order by tests on Dataproc|
-|[#1565](https://github.com/NVIDIA/spark-rapids/pull/1565)|Add GPU accelerated array_contains operator|
-|[#1617](https://github.com/NVIDIA/spark-rapids/pull/1617)|Enable nightly test checks for Apache Spark|
-|[#1636](https://github.com/NVIDIA/spark-rapids/pull/1636)|RAPIDS accelerated Spark Scala UDF support|
-|[#1634](https://github.com/NVIDIA/spark-rapids/pull/1634)|Fix databricks build since Arrow code added|
-|[#1599](https://github.com/NVIDIA/spark-rapids/pull/1599)|Add division by zero tests for Spark 3.1 behavior|
-|[#1619](https://github.com/NVIDIA/spark-rapids/pull/1619)|Update GpuFileSourceScanExec to be in sync with DataSourceScanExec|
-|[#1631](https://github.com/NVIDIA/spark-rapids/pull/1631)|Explicitly add maven-jar-plugin version to improve incremental build time.|
-|[#1624](https://github.com/NVIDIA/spark-rapids/pull/1624)|Update explain format to show what will and will not run on the GPU|
-|[#1622](https://github.com/NVIDIA/spark-rapids/pull/1622)|Support faster copy for a custom DataSource V2 which supplies Arrow data|
-|[#1621](https://github.com/NVIDIA/spark-rapids/pull/1621)|Additional functionality docs|
-|[#1618](https://github.com/NVIDIA/spark-rapids/pull/1618)|update blossom-ci for security updates [skip ci]|
-|[#1562](https://github.com/NVIDIA/spark-rapids/pull/1562)|add alluxio support|
-|[#1597](https://github.com/NVIDIA/spark-rapids/pull/1597)|Documentation for Parquet serializer|
-|[#1611](https://github.com/NVIDIA/spark-rapids/pull/1611)|Add in flag for integration tests to not skip required tests|
-|[#1609](https://github.com/NVIDIA/spark-rapids/pull/1609)|Disable float round/bround by default|
-|[#1615](https://github.com/NVIDIA/spark-rapids/pull/1615)|Add in window support for average|
-|[#1610](https://github.com/NVIDIA/spark-rapids/pull/1610)|Limit length of spark app name in BenchmarkRunner|
-|[#1579](https://github.com/NVIDIA/spark-rapids/pull/1579)|Support TakeOrderedAndProject|
-|[#1581](https://github.com/NVIDIA/spark-rapids/pull/1581)|Support Decimal type for CollectLimitExec|
-|[#1591](https://github.com/NVIDIA/spark-rapids/pull/1591)|Add support for running multiple queries in BenchmarkRunner|
-|[#1595](https://github.com/NVIDIA/spark-rapids/pull/1595)|Fix Github documentation issue template|
-|[#1577](https://github.com/NVIDIA/spark-rapids/pull/1577)|rename directory from spark310 to spark311|
-|[#1578](https://github.com/NVIDIA/spark-rapids/pull/1578)|Test to track RAPIDS-side issues re SPARK-32639|
-|[#1583](https://github.com/NVIDIA/spark-rapids/pull/1583)|fix request-action issue [skip ci]|
-|[#1555](https://github.com/NVIDIA/spark-rapids/pull/1555)|Enable ANSI mode for CAST string to timestamp|
-|[#1531](https://github.com/NVIDIA/spark-rapids/pull/1531)|Decimal Support for writing Parquet|
-|[#1545](https://github.com/NVIDIA/spark-rapids/pull/1545)|Support comparing ORC data|
-|[#1570](https://github.com/NVIDIA/spark-rapids/pull/1570)|Branch 0.4 doc cleanup|
-|[#1569](https://github.com/NVIDIA/spark-rapids/pull/1569)|Add shim method shouldIgnorePath|
-|[#1564](https://github.com/NVIDIA/spark-rapids/pull/1564)|Add in support for Decimal Multiply and DIV|
-|[#1561](https://github.com/NVIDIA/spark-rapids/pull/1561)|Decimal support for add and subtract|
-|[#1560](https://github.com/NVIDIA/spark-rapids/pull/1560)|support sum in window aggregation for decimal|
-|[#1546](https://github.com/NVIDIA/spark-rapids/pull/1546)|Cleanup shutdown logging for UCX shuffle|
-|[#1551](https://github.com/NVIDIA/spark-rapids/pull/1551)|RAPIDS-accelerated Hive UDFs support all types|
-|[#1543](https://github.com/NVIDIA/spark-rapids/pull/1543)|Shuffle/transport enabled by default|
-|[#1552](https://github.com/NVIDIA/spark-rapids/pull/1552)|Disable blackduck signature check|
-|[#1540](https://github.com/NVIDIA/spark-rapids/pull/1540)|Handle ShuffleManager api calls when plugin is not fully initialized|
-|[#1547](https://github.com/NVIDIA/spark-rapids/pull/1547)|Cleanup shuffle transport receive calls|
-|[#1512](https://github.com/NVIDIA/spark-rapids/pull/1512)|Support window operations on Decimal|
-|[#1532](https://github.com/NVIDIA/spark-rapids/pull/1532)|Support casting from decimal to decimal|
-|[#1542](https://github.com/NVIDIA/spark-rapids/pull/1542)|Change the number of partitions to zero when a range is empty|
-|[#1506](https://github.com/NVIDIA/spark-rapids/pull/1506)|Add --use-decimals flag to TPC-DS ConvertFiles|
-|[#1511](https://github.com/NVIDIA/spark-rapids/pull/1511)|Remove unused Jenkinsfiles [skip ci]|
-|[#1505](https://github.com/NVIDIA/spark-rapids/pull/1505)|Add least, greatest and eqNullSafe support for DecimalType|
-|[#1484](https://github.com/NVIDIA/spark-rapids/pull/1484)|add doc for nsight systems bundled with cuda toolkit|
-|[#1478](https://github.com/NVIDIA/spark-rapids/pull/1478)|Documentation for RAPIDS-accelerated Hive UDFs|
-|[#1477](https://github.com/NVIDIA/spark-rapids/pull/1477)|Allow structs and arrays to pass through for Shuffle and Sort |
-|[#1489](https://github.com/NVIDIA/spark-rapids/pull/1489)|Adds in some support for the array sql function|
-|[#1438](https://github.com/NVIDIA/spark-rapids/pull/1438)|Cast from numeric types to decimal type|
-|[#1493](https://github.com/NVIDIA/spark-rapids/pull/1493)|Moved ParquetRecordMaterializer to the shim package to follow convention|
-|[#1495](https://github.com/NVIDIA/spark-rapids/pull/1495)|Fix merge conflict, merge branch 0.3 to branch 0.4 [skip ci]|
-|[#1472](https://github.com/NVIDIA/spark-rapids/pull/1472)|Add an example RAPIDS-accelerated Hive UDF using native code|
-|[#1488](https://github.com/NVIDIA/spark-rapids/pull/1488)|Rename Spark 3.1.0 shim to Spark 3.1.1 to match community|
-|[#1474](https://github.com/NVIDIA/spark-rapids/pull/1474)|Fix link|
-|[#1476](https://github.com/NVIDIA/spark-rapids/pull/1476)|DecimalType support for Aggregate Count|
-|[#1475](https://github.com/NVIDIA/spark-rapids/pull/1475)|Join support for DecimalType|
-|[#1244](https://github.com/NVIDIA/spark-rapids/pull/1244)|Support round and bround SQL functions |
-|[#1458](https://github.com/NVIDIA/spark-rapids/pull/1458)|Add in support for struct and named_struct|
-|[#1465](https://github.com/NVIDIA/spark-rapids/pull/1465)|DecimalType support for UnionExec and ExpandExec|
-|[#1450](https://github.com/NVIDIA/spark-rapids/pull/1450)|Add dynamic configs for the spark-rapids IT pipelines|
-|[#1207](https://github.com/NVIDIA/spark-rapids/pull/1207)|Spark SQL hash function using murmur3|
-|[#1457](https://github.com/NVIDIA/spark-rapids/pull/1457)|Support reading decimal columns from parquet files on Databricks|
-|[#1455](https://github.com/NVIDIA/spark-rapids/pull/1455)|Upgrade Scala Maven Plugin to 4.3.0|
-|[#1453](https://github.com/NVIDIA/spark-rapids/pull/1453)|DecimalType support for IfElse and Coalesce|
-|[#1452](https://github.com/NVIDIA/spark-rapids/pull/1452)|Support DecimalType for CaseWhen|
-|[#1444](https://github.com/NVIDIA/spark-rapids/pull/1444)|Improve UX when running benchmarks from Spark shell|
-|[#1294](https://github.com/NVIDIA/spark-rapids/pull/1294)|Support reading decimal columns from parquet files|
-|[#1153](https://github.com/NVIDIA/spark-rapids/pull/1153)|Scala UDF will compile children expressions in Project|
-|[#1416](https://github.com/NVIDIA/spark-rapids/pull/1416)|Optimize mvn dependency download scripts|
-|[#1430](https://github.com/NVIDIA/spark-rapids/pull/1430)|Add project for testing code that requires Spark 3.1.0 or later|
-|[#1425](https://github.com/NVIDIA/spark-rapids/pull/1425)|Add in Decimal support for abs, floor, ceil, unary - and unary +|
-|[#1427](https://github.com/NVIDIA/spark-rapids/pull/1427)|Revert "Make the multi-threaded parquet reader the default"|
-|[#1420](https://github.com/NVIDIA/spark-rapids/pull/1420)|Add udf jar to nightly integration tests|
-|[#1422](https://github.com/NVIDIA/spark-rapids/pull/1422)|Log the number of concurrent gpu tasks allowed on Executor startup|
-|[#1401](https://github.com/NVIDIA/spark-rapids/pull/1401)|Accelerate the coalescing parquet reader when reading files from multiple partitioned folders|
-|[#1413](https://github.com/NVIDIA/spark-rapids/pull/1413)|Add config for cast float to integral types|
-|[#1313](https://github.com/NVIDIA/spark-rapids/pull/1313)|Support spilling to disk directly via cuFile/GDS|
-|[#1411](https://github.com/NVIDIA/spark-rapids/pull/1411)|Add udf-examples jar to databricks build|
-|[#1412](https://github.com/NVIDIA/spark-rapids/pull/1412)|Fix a lot of tests marked with xfail for Spark 3.1.0 that no longer fail|
-|[#1414](https://github.com/NVIDIA/spark-rapids/pull/1414)|Build merged code of HEAD and BASE branch for pre-merge [skip ci]|
-|[#1409](https://github.com/NVIDIA/spark-rapids/pull/1409)|Add option to use decimals in tpc-ds csv to parquet conversion|
-|[#1410](https://github.com/NVIDIA/spark-rapids/pull/1410)|Add Decimal support for In, InSet, AtLeastNNonNulls, GetArrayItem, GetStructField, and GenerateExec|
-|[#1408](https://github.com/NVIDIA/spark-rapids/pull/1408)|Support RAPIDS-accelerated HiveGenericUDF|
-|[#1407](https://github.com/NVIDIA/spark-rapids/pull/1407)|Update docs and tests for null CSV support|
-|[#1393](https://github.com/NVIDIA/spark-rapids/pull/1393)|Support RAPIDS-accelerated HiveSimpleUDF|
-|[#1392](https://github.com/NVIDIA/spark-rapids/pull/1392)|Turn on hash partitioning for decimal support|
-|[#1402](https://github.com/NVIDIA/spark-rapids/pull/1402)|Better GPU Cast type checks|
-|[#1404](https://github.com/NVIDIA/spark-rapids/pull/1404)|Fix branch 0.4 merge conflict|
-|[#1323](https://github.com/NVIDIA/spark-rapids/pull/1323)|More advanced type checking and documentation|
-|[#1391](https://github.com/NVIDIA/spark-rapids/pull/1391)|Remove extra null join filtering because cudf is fast for this now.|
-|[#1395](https://github.com/NVIDIA/spark-rapids/pull/1395)|Fix branch-0.3 -> branch-0.4 automerge|
-|[#1382](https://github.com/NVIDIA/spark-rapids/pull/1382)|Handle "MM[/-]dd" and "dd[/-]MM" datetime formats in UnixTimeExprMeta|
-|[#1390](https://github.com/NVIDIA/spark-rapids/pull/1390)|Accelerated columnar to row/row to columnar for decimal|
-|[#1380](https://github.com/NVIDIA/spark-rapids/pull/1380)|Adds in basic support for decimal sort, sum, and some shuffle|
-|[#1367](https://github.com/NVIDIA/spark-rapids/pull/1367)|Reuse gpu expression conversion rules when checking sort order|
-|[#1349](https://github.com/NVIDIA/spark-rapids/pull/1349)|Add canonicalization tests|
-|[#1368](https://github.com/NVIDIA/spark-rapids/pull/1368)|Move to cudf 0.18-SNAPSHOT|
-|[#1361](https://github.com/NVIDIA/spark-rapids/pull/1361)|Use the correct precision when reading spark columnar data.|
-|[#1273](https://github.com/NVIDIA/spark-rapids/pull/1273)|Update docs and scripts to 0.4.0-SNAPSHOT|
-|[#1321](https://github.com/NVIDIA/spark-rapids/pull/1321)|Refactor to stop inheriting from HashJoin|
-|[#1311](https://github.com/NVIDIA/spark-rapids/pull/1311)|ParquetCachedBatchSerializer code cleanup|
-|[#1303](https://github.com/NVIDIA/spark-rapids/pull/1303)|Add explicit outputOrdering for BHJ and SHJ in spark310 shim|
-|[#1299](https://github.com/NVIDIA/spark-rapids/pull/1299)|Benchmark runner improved error handling|
-
-## Release 0.3
-
-### Features
-|||
-|:---|:---|
-|[#1002](https://github.com/NVIDIA/spark-rapids/issues/1002)|[FEA] RapidsHostColumnVectorCore should verify cudf data with respect to the expected spark type |
-|[#444](https://github.com/NVIDIA/spark-rapids/issues/444)|[FEA] Pluggable Cache|
-|[#1158](https://github.com/NVIDIA/spark-rapids/issues/1158)|[FEA] Better documentation on type support|
-|[#57](https://github.com/NVIDIA/spark-rapids/issues/57)|[FEA] Support INT96 for parquet reads and writes|
-|[#1003](https://github.com/NVIDIA/spark-rapids/issues/1003)|[FEA] Reduce overlap between RapidsHostColumnVector and RapidsHostColumnVectorCore|
-|[#913](https://github.com/NVIDIA/spark-rapids/issues/913)|[FEA] In Pluggable Cache Support CalendarInterval while creating CachedBatches|
-|[#1092](https://github.com/NVIDIA/spark-rapids/issues/1092)|[FEA] In Pluggable Cache handle nested types having CalendarIntervalType and NullType|
-|[#670](https://github.com/NVIDIA/spark-rapids/issues/670)|[FEA] Support NullType|
-|[#50](https://github.com/NVIDIA/spark-rapids/issues/50)|[FEA] support `spark.sql.legacy.timeParserPolicy`|
-|[#1144](https://github.com/NVIDIA/spark-rapids/issues/1144)|[FEA] Remove Databricks 3.0.0 shim layer|
-|[#1096](https://github.com/NVIDIA/spark-rapids/issues/1096)|[FEA] Implement parquet CreateDataSourceTableAsSelectCommand|
-|[#688](https://github.com/NVIDIA/spark-rapids/issues/688)|[FEA] udf compiler should be auto-appended to `spark.sql.extensions`|
-|[#502](https://github.com/NVIDIA/spark-rapids/issues/502)|[FEA] Support Databricks 7.3 LTS Runtime|
-|[#764](https://github.com/NVIDIA/spark-rapids/issues/764)|[FEA] Sanity checks for cudf jar mismatch|
-|[#1018](https://github.com/NVIDIA/spark-rapids/issues/1018)|[FEA] Log details related to GPU memory fragmentation on GPU OOM|
-|[#619](https://github.com/NVIDIA/spark-rapids/issues/619)|[FEA] log whether libcudf and libcudfjni were built for PTDS|
-|[#905](https://github.com/NVIDIA/spark-rapids/issues/905)|[FEA] create AWS EMR 3.0.1 shim|
-|[#838](https://github.com/NVIDIA/spark-rapids/issues/838)|[FEA] Support window count for a column|
-|[#864](https://github.com/NVIDIA/spark-rapids/issues/864)|[FEA] config option to enable RMM arena memory resource|
-|[#430](https://github.com/NVIDIA/spark-rapids/issues/430)|[FEA] Audit: Parquet Writer support for TIMESTAMP_MILLIS|
-|[#818](https://github.com/NVIDIA/spark-rapids/issues/818)|[FEA] Create shim layer for AWS EMR |
-|[#608](https://github.com/NVIDIA/spark-rapids/issues/608)|[FEA] Parquet small file optimization: improve merge schema handling|
-
-### Performance
-|||
-|:---|:---|
-|[#446](https://github.com/NVIDIA/spark-rapids/issues/446)|[FEA] Test jucx in 1.9.x branch|
-|[#1038](https://github.com/NVIDIA/spark-rapids/issues/1038)|[FEA] Accelerate the data transfer for plan `WindowInPandasExec`|
-|[#533](https://github.com/NVIDIA/spark-rapids/issues/533)|[FEA] Improve PTDS performance|
-|[#849](https://github.com/NVIDIA/spark-rapids/issues/849)|[FEA] Have GpuColumnarBatchSerializer return GpuColumnVectorFromBuffer instances|
-|[#784](https://github.com/NVIDIA/spark-rapids/issues/784)|[FEA] Allow Host Spilling to be more dynamic|
-|[#627](https://github.com/NVIDIA/spark-rapids/issues/627)|[FEA] Further parquet reading small file improvements|
-|[#5](https://github.com/NVIDIA/spark-rapids/issues/5)|[FEA] Support Adaptive Execution|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#1423](https://github.com/NVIDIA/spark-rapids/issues/1423)|[BUG] Mortgage ETL sample failed with spark.sql.adaptive enabled on AWS EMR 6.2 |
-|[#1369](https://github.com/NVIDIA/spark-rapids/issues/1369)|[BUG] TPC-DS Query Failing on EMR 6.2 with AQE|
-|[#1344](https://github.com/NVIDIA/spark-rapids/issues/1344)|[BUG] Spark-rapids pytests failed on Databricks cluster in Spark standalone mode|
-|[#1279](https://github.com/NVIDIA/spark-rapids/issues/1279)|[BUG] TPC-DS query 2 failing with NPE|
-|[#1280](https://github.com/NVIDIA/spark-rapids/issues/1280)|[BUG] TPC-DS query 93 failing with UnsupportedOperationException|
-|[#1308](https://github.com/NVIDIA/spark-rapids/issues/1308)|[BUG] TPC-DS query 14a runs much slower on 0.3|
-|[#1284](https://github.com/NVIDIA/spark-rapids/issues/1284)|[BUG] TPC-DS query 77 at scale=1TB fails with maxResultSize exceeded error|
-|[#1061](https://github.com/NVIDIA/spark-rapids/issues/1061)|[BUG] orc_test.py is failing|
-|[#1197](https://github.com/NVIDIA/spark-rapids/issues/1197)|[BUG] java.lang.NullPointerException when exporting delta table|
-|[#685](https://github.com/NVIDIA/spark-rapids/issues/685)|[BUG] In ParquetCachedBatchSerializer, serializing parquet buffers might blow up in certain cases|
-|[#1269](https://github.com/NVIDIA/spark-rapids/issues/1269)|[BUG] GpuSubstring is not expected to be a part of a SortOrder|
-|[#1246](https://github.com/NVIDIA/spark-rapids/issues/1246)|[BUG] Many TPC-DS benchmarks fail when writing to Parquet|
-|[#961](https://github.com/NVIDIA/spark-rapids/issues/961)|[BUG] ORC predicate pushdown should work with case-insensitive analysis|
-|[#962](https://github.com/NVIDIA/spark-rapids/issues/962)|[BUG] Loading columns from an ORC file without column names returns no data|
-|[#1245](https://github.com/NVIDIA/spark-rapids/issues/1245)|[BUG] Code adding buffers to the spillable store should synchronize|
-|[#570](https://github.com/NVIDIA/spark-rapids/issues/570)|[BUG] Continue debugging OOM after ensuring device store is empty|
-|[#972](https://github.com/NVIDIA/spark-rapids/issues/972)|[BUG] total time metric is redundant with scan time|
-|[#1039](https://github.com/NVIDIA/spark-rapids/issues/1039)|[BUG] UNBOUNDED window ranges on null timestamp columns produce incorrect results.|
-|[#1195](https://github.com/NVIDIA/spark-rapids/issues/1195)|[BUG] AcceleratedColumnarToRowIterator queue empty|
-|[#1177](https://github.com/NVIDIA/spark-rapids/issues/1177)|[BUG] leaks possible in the rapids shuffle if batches are received after the task completes|
-|[#1216](https://github.com/NVIDIA/spark-rapids/issues/1216)|[BUG] Failure to recognize ORC file format when loaded via Hive|
-|[#898](https://github.com/NVIDIA/spark-rapids/issues/898)|[BUG] count reductions are failing on databricks because of lack of Complete support|
-|[#1184](https://github.com/NVIDIA/spark-rapids/issues/1184)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks 3.0.1|
-|[#1151](https://github.com/NVIDIA/spark-rapids/issues/1151)|[BUG] Add databricks 3.0.1 shim layer for GpuWindowInPandasExec.|
-|[#1199](https://github.com/NVIDIA/spark-rapids/issues/1199)|[BUG] No data size in Input column in Stages page from Spark UI when using Parquet as file source|
-|[#1031](https://github.com/NVIDIA/spark-rapids/issues/1031)|[BUG] dependency info properties file contains error messages|
-|[#1149](https://github.com/NVIDIA/spark-rapids/issues/1149)|[BUG] Scaladoc warnings in GpuDataSource|
-|[#1185](https://github.com/NVIDIA/spark-rapids/issues/1185)|[BUG] test_hash_multiple_mode_query failing|
-|[#724](https://github.com/NVIDIA/spark-rapids/issues/724)|[BUG] PySpark test_broadcast_nested_loop_join_special_case intermittent failure|
-|[#1164](https://github.com/NVIDIA/spark-rapids/issues/1164)|[BUG] ansi_cast tests are failing in 3.1.0|
-|[#1110](https://github.com/NVIDIA/spark-rapids/issues/1110)|[BUG] Special date "now" has wrong value on GPU|
-|[#1139](https://github.com/NVIDIA/spark-rapids/issues/1139)|[BUG] Host columnar to GPU can be very slow|
-|[#1094](https://github.com/NVIDIA/spark-rapids/issues/1094)|[BUG] unix_timestamp on GPU returns invalid data for special dates|
-|[#1098](https://github.com/NVIDIA/spark-rapids/issues/1098)|[BUG] unix_timestamp on GPU returns invalid data for bad input|
-|[#1082](https://github.com/NVIDIA/spark-rapids/issues/1082)|[BUG] string to timestamp conversion fails with split|
-|[#1140](https://github.com/NVIDIA/spark-rapids/issues/1140)|[BUG] ConcurrentModificationException error after scala test suite completes|
-|[#1073](https://github.com/NVIDIA/spark-rapids/issues/1073)|[BUG] java.lang.RuntimeException: BinaryExpressions must override either eval or nullSafeEval|
-|[#975](https://github.com/NVIDIA/spark-rapids/issues/975)|[BUG] BroadcastExchangeExec fails to fall back to CPU on driver node on GCP Dataproc|
-|[#773](https://github.com/NVIDIA/spark-rapids/issues/773)|[BUG] Investigate high task deserialization|
-|[#1035](https://github.com/NVIDIA/spark-rapids/issues/1035)|[BUG] TPC-DS query 90 with AQE enabled fails with doExecuteBroadcast exception|
-|[#825](https://github.com/NVIDIA/spark-rapids/issues/825)|[BUG] test_window_aggs_for_ranges intermittently fails|
-|[#1008](https://github.com/NVIDIA/spark-rapids/issues/1008)|[BUG] limit function is producing inconsistent result when type is Byte, Long, Boolean and Timestamp|
-|[#996](https://github.com/NVIDIA/spark-rapids/issues/996)|[BUG] TPC-DS benchmark via spark-submit does not provide option to disable appending .dat to path|
-|[#1006](https://github.com/NVIDIA/spark-rapids/issues/1006)|[BUG] Spark 3.1.0 change to BasicWriteTaskStats breaks BasicColumnarWriteTaskStatsTracker|
-|[#985](https://github.com/NVIDIA/spark-rapids/issues/985)|[BUG] missing metric `dataSize`|
-|[#881](https://github.com/NVIDIA/spark-rapids/issues/881)|[BUG] cannot disable Sort by itself|
-|[#812](https://github.com/NVIDIA/spark-rapids/issues/812)|[BUG] Test failures for 0.2 when run with multiple executors|
-|[#925](https://github.com/NVIDIA/spark-rapids/issues/925)|[BUG] Range window-functions with non-timestamp order-by expressions not falling back to CPU|
-|[#852](https://github.com/NVIDIA/spark-rapids/issues/852)|[BUG] BenchUtils.compareResults cannot compare partitioned files when ignoreOrdering=false|
-|[#868](https://github.com/NVIDIA/spark-rapids/issues/868)|[BUG] Rounding error when casting timestamp to string for timestamps before 1970|
-|[#880](https://github.com/NVIDIA/spark-rapids/issues/880)|[BUG] doing a window operation with an orderby for a single constant crashes|
-|[#776](https://github.com/NVIDIA/spark-rapids/issues/776)|[BUG] Integration test fails on spark 3.1.0-SNAPSHOT|
-|[#874](https://github.com/NVIDIA/spark-rapids/issues/874)|[BUG] `RapidsConf.scala` has some inconsistency for `spark.rapids.sql.format.parquet.multiThreadedRead`|
-|[#860](https://github.com/NVIDIA/spark-rapids/issues/860)|[BUG] we need to mark columns from received shuffle buffers as `GpuColumnVectorFromBuffer`|
-|[#122](https://github.com/NVIDIA/spark-rapids/issues/122)|[BUG] CSV Timestamp parsing is broken for TS < 1902 and TS > 2038|
-|[#810](https://github.com/NVIDIA/spark-rapids/issues/810)|[BUG] UDF Integration tests fail if pandas is not installed|
-|[#746](https://github.com/NVIDIA/spark-rapids/issues/746)|[BUG] cudf_udf_test.py is flaky|
-|[#811](https://github.com/NVIDIA/spark-rapids/issues/811)|[BUG] 0.3 nightly is timing out |
-|[#574](https://github.com/NVIDIA/spark-rapids/issues/574)|[BUG] Fix GpuTimeSub for Spark 3.1.0|
-
-### PRs
-|||
-|:---|:---|
-|[#1496](https://github.com/NVIDIA/spark-rapids/pull/1496)|Update changelog for v0.3.0 release [skip ci]|
-|[#1473](https://github.com/NVIDIA/spark-rapids/pull/1473)|Update documentation for 0.3 release|
-|[#1371](https://github.com/NVIDIA/spark-rapids/pull/1371)|Start Guide for RAPIDS on AWS EMR 6.2|
-|[#1446](https://github.com/NVIDIA/spark-rapids/pull/1446)|Update changelog for 0.3.0 release [skip ci]|
-|[#1439](https://github.com/NVIDIA/spark-rapids/pull/1439)|when AQE is enabled we fail to fix up exchanges properly on EMR|
-|[#1433](https://github.com/NVIDIA/spark-rapids/pull/1433)|fix pandas 1.2 compatibility issue|
-|[#1424](https://github.com/NVIDIA/spark-rapids/pull/1424)|Make the multi-threaded parquet reader the default since coalescing doesn't handle partitioned files well|
-|[#1389](https://github.com/NVIDIA/spark-rapids/pull/1389)|Update project version to 0.3.0|
-|[#1387](https://github.com/NVIDIA/spark-rapids/pull/1387)|Update cudf version to 0.17|
-|[#1370](https://github.com/NVIDIA/spark-rapids/pull/1370)|[REVIEW] init changelog 0.3 [skip ci]|
-|[#1376](https://github.com/NVIDIA/spark-rapids/pull/1376)|MetaUtils.getBatchFromMeta should return batches with GpuColumnVectorFromBuffer|
-|[#1358](https://github.com/NVIDIA/spark-rapids/pull/1358)|auto-merge: instant merge after creation [skip ci]|
-|[#1359](https://github.com/NVIDIA/spark-rapids/pull/1359)|Use SortOrder from shims.|
-|[#1343](https://github.com/NVIDIA/spark-rapids/pull/1343)|Do not run UDFs when the partition is empty.|
-|[#1342](https://github.com/NVIDIA/spark-rapids/pull/1342)|Fix and edit docs for standalone mode|
-|[#1350](https://github.com/NVIDIA/spark-rapids/pull/1350)|fix GpuRangePartitioning canonicalization|
-|[#1281](https://github.com/NVIDIA/spark-rapids/pull/1281)|Documentation added for testing|
-|[#1336](https://github.com/NVIDIA/spark-rapids/pull/1336)|Fix missing post-shuffle coalesce with AQE|
-|[#1318](https://github.com/NVIDIA/spark-rapids/pull/1318)|Fix copying GpuFileSourceScanExec node|
-|[#1337](https://github.com/NVIDIA/spark-rapids/pull/1337)|Use UTC instead of GMT|
-|[#1307](https://github.com/NVIDIA/spark-rapids/pull/1307)|Fallback to cpu when reading Delta log files for stats|
-|[#1310](https://github.com/NVIDIA/spark-rapids/pull/1310)|Fix canonicalization of GpuFileSourceScanExec, GpuShuffleCoalesceExec|
-|[#1302](https://github.com/NVIDIA/spark-rapids/pull/1302)|Add GpuSubstring handling to SortOrder canonicalization|
-|[#1265](https://github.com/NVIDIA/spark-rapids/pull/1265)|Chunking input before writing a ParquetCachedBatch|
-|[#1278](https://github.com/NVIDIA/spark-rapids/pull/1278)|Add a config to disable decimal types by default|
-|[#1272](https://github.com/NVIDIA/spark-rapids/pull/1272)|Add Alias to shims|
-|[#1268](https://github.com/NVIDIA/spark-rapids/pull/1268)|Adds in support docs for 0.3 release|
-|[#1235](https://github.com/NVIDIA/spark-rapids/pull/1235)|Trigger reading and handling control data.|
-|[#1266](https://github.com/NVIDIA/spark-rapids/pull/1266)|Updating Databricks getting started for 0.3 release|
-|[#1291](https://github.com/NVIDIA/spark-rapids/pull/1291)|Increase pre-merge resource requests [skip ci]|
-|[#1275](https://github.com/NVIDIA/spark-rapids/pull/1275)|Temporarily disable more CAST tests for Spark 3.1.0|
-|[#1264](https://github.com/NVIDIA/spark-rapids/pull/1264)|Fix race condition in batch creation|
-|[#1260](https://github.com/NVIDIA/spark-rapids/pull/1260)|Update UCX license info in NOTIFY-binary for 1.9 and RAPIDS plugin copyright dates|
-|[#1247](https://github.com/NVIDIA/spark-rapids/pull/1247)|Ensure column names are valid when writing benchmark query results to file|
-|[#1240](https://github.com/NVIDIA/spark-rapids/pull/1240)|Fix loading from ORC file with no column names|
-|[#1242](https://github.com/NVIDIA/spark-rapids/pull/1242)|Remove compatibility documentation about unsupported INT96|
-|[#1192](https://github.com/NVIDIA/spark-rapids/pull/1192)|[REVIEW] Support GpuFilter and GpuCoalesceBatches for decimal data|
-|[#1170](https://github.com/NVIDIA/spark-rapids/pull/1170)|Add nested type support to MetaUtils|
-|[#1194](https://github.com/NVIDIA/spark-rapids/pull/1194)|Drop redundant total time metric from scan|
-|[#1248](https://github.com/NVIDIA/spark-rapids/pull/1248)|At BatchedTableCompressor.finish synchronize to allow for "right-size…|
-|[#1169](https://github.com/NVIDIA/spark-rapids/pull/1169)|Use CUDF's "UNBOUNDED" window boundaries for time-range queries.|
-|[#1204](https://github.com/NVIDIA/spark-rapids/pull/1204)|Avoid empty batches on columnar to row conversion|
-|[#1133](https://github.com/NVIDIA/spark-rapids/pull/1133)|Refactor batch coalesce to be based solely on batch data size|
-|[#1237](https://github.com/NVIDIA/spark-rapids/pull/1237)|In transport, limit pending transfer requests to fit within a bounce|
-|[#1232](https://github.com/NVIDIA/spark-rapids/pull/1232)|Move SortOrder creation to shims|
-|[#1068](https://github.com/NVIDIA/spark-rapids/pull/1068)|Write int96 to parquet|
-|[#1193](https://github.com/NVIDIA/spark-rapids/pull/1193)|Verify shuffle of decimal columns|
-|[#1180](https://github.com/NVIDIA/spark-rapids/pull/1180)|Remove batches if they are received after the iterator detects that t…|
-|[#1173](https://github.com/NVIDIA/spark-rapids/pull/1173)|Support relational operators for decimal type|
-|[#1220](https://github.com/NVIDIA/spark-rapids/pull/1220)|Support replacing ORC format when Hive is configured|
-|[#1219](https://github.com/NVIDIA/spark-rapids/pull/1219)|Upgrade to jucx 1.9.0|
-|[#1081](https://github.com/NVIDIA/spark-rapids/pull/1081)|Add option to upload benchmark summary JSON file|
-|[#1217](https://github.com/NVIDIA/spark-rapids/pull/1217)|Aggregate reductions in Complete mode should use updateExpressions|
-|[#1218](https://github.com/NVIDIA/spark-rapids/pull/1218)|Remove obsolete HiveStringType usage|
-|[#1214](https://github.com/NVIDIA/spark-rapids/pull/1214)|changelog update 2020-11-30. Trigger automerge check [skip ci]|
-|[#1210](https://github.com/NVIDIA/spark-rapids/pull/1210)|Support auto-merge for branch-0.4 [skip ci]|
-|[#1202](https://github.com/NVIDIA/spark-rapids/pull/1202)|Fix a bug with the support for java.lang.StringBuilder.append.|
-|[#1213](https://github.com/NVIDIA/spark-rapids/pull/1213)|Skip casting StringType to TimestampType for Spark 310|
-|[#1201](https://github.com/NVIDIA/spark-rapids/pull/1201)|Replace only window expressions on databricks.|
-|[#1208](https://github.com/NVIDIA/spark-rapids/pull/1208)|[BUG] Fix GHSL2020-239 [skip ci]|
-|[#1205](https://github.com/NVIDIA/spark-rapids/pull/1205)|Fix missing input bytes read metric for Parquet|
-|[#1206](https://github.com/NVIDIA/spark-rapids/pull/1206)|Update Spark 3.1 shim for ShuffleOrigin shuffle parameter|
-|[#1196](https://github.com/NVIDIA/spark-rapids/pull/1196)|Rename ShuffleCoalesceExec to GpuShuffleCoalesceExec|
-|[#1191](https://github.com/NVIDIA/spark-rapids/pull/1191)|Skip window array tests for databricks.|
-|[#1183](https://github.com/NVIDIA/spark-rapids/pull/1183)|Support for CalendarIntervalType and NullType|
-|[#1150](https://github.com/NVIDIA/spark-rapids/pull/1150)|udf spec|
-|[#1188](https://github.com/NVIDIA/spark-rapids/pull/1188)|Add in tests for parquet nested pruning support|
-|[#1189](https://github.com/NVIDIA/spark-rapids/pull/1189)|Enable NullType for First and Last in 3.0.1+|
-|[#1181](https://github.com/NVIDIA/spark-rapids/pull/1181)|Fix resource leaks in unit tests|
-|[#1186](https://github.com/NVIDIA/spark-rapids/pull/1186)|Fix compilation and scaladoc warnings|
-|[#1187](https://github.com/NVIDIA/spark-rapids/pull/1187)|Updated documentation for distinct count compatibility|
-|[#1182](https://github.com/NVIDIA/spark-rapids/pull/1182)|Close buffer catalog on device manager shutdown|
-|[#1137](https://github.com/NVIDIA/spark-rapids/pull/1137)|Let GpuWindowInPandas declare ArrayType supported.|
-|[#1176](https://github.com/NVIDIA/spark-rapids/pull/1176)|Add in support for null type|
-|[#1174](https://github.com/NVIDIA/spark-rapids/pull/1174)|Fix race condition in SerializeConcatHostBuffersDeserializeBatch|
-|[#1175](https://github.com/NVIDIA/spark-rapids/pull/1175)|Fix leaks seen in shuffle tests|
-|[#1138](https://github.com/NVIDIA/spark-rapids/pull/1138)|[REVIEW] Support decimal type for GpuProjectExec|
-|[#1162](https://github.com/NVIDIA/spark-rapids/pull/1162)|Set job descriptions in benchmark runner|
-|[#1172](https://github.com/NVIDIA/spark-rapids/pull/1172)|Revert "Fix race condition (#1165)"|
-|[#1060](https://github.com/NVIDIA/spark-rapids/pull/1060)|Show partition metrics for custom shuffle reader|
-|[#1152](https://github.com/NVIDIA/spark-rapids/pull/1152)|Add spark301db shim layer for WindowInPandas.|
-|[#1167](https://github.com/NVIDIA/spark-rapids/pull/1167)|Nulls out the dataframe if --gc-between-runs is set|
-|[#1165](https://github.com/NVIDIA/spark-rapids/pull/1165)|Fix race condition in SerializeConcatHostBuffersDeserializeBatch|
-|[#1163](https://github.com/NVIDIA/spark-rapids/pull/1163)|Add in support for GetStructField|
-|[#1166](https://github.com/NVIDIA/spark-rapids/pull/1166)|Fix the cast tests for 3.1.0+|
-|[#1159](https://github.com/NVIDIA/spark-rapids/pull/1159)|fix bug where 'now' had same value as 'today' for timestamps|
-|[#1161](https://github.com/NVIDIA/spark-rapids/pull/1161)|Fix nightly build pipeline failure.|
-|[#1160](https://github.com/NVIDIA/spark-rapids/pull/1160)|Fix some performance problems with columnar to columnar conversion|
-|[#1105](https://github.com/NVIDIA/spark-rapids/pull/1105)|[REVIEW] Change ColumnViewAccess usage to work with ColumnView|
-|[#1148](https://github.com/NVIDIA/spark-rapids/pull/1148)|Add in tests for Maps and extend map support where possible|
-|[#1154](https://github.com/NVIDIA/spark-rapids/pull/1154)|Mark test as xfail until we can get a fix in|
-|[#1113](https://github.com/NVIDIA/spark-rapids/pull/1113)|Support unix_timestamp on GPU for subset of formats|
-|[#1156](https://github.com/NVIDIA/spark-rapids/pull/1156)|Fix warning introduced in iterator suite|
-|[#1095](https://github.com/NVIDIA/spark-rapids/pull/1095)|Dependency info|
-|[#1145](https://github.com/NVIDIA/spark-rapids/pull/1145)|Remove support for databricks 7.0 runtime - shim spark300db|
-|[#1147](https://github.com/NVIDIA/spark-rapids/pull/1147)|Change the assert to require for handling TIMESTAMP_MILLIS in isDateTimeRebaseNeeded |
-|[#1132](https://github.com/NVIDIA/spark-rapids/pull/1132)|Add in basic support to read structs from parquet|
-|[#1121](https://github.com/NVIDIA/spark-rapids/pull/1121)|Shuffle/better error handling|
-|[#1134](https://github.com/NVIDIA/spark-rapids/pull/1134)|Support saveAsTable for writing orc and parquet|
-|[#1124](https://github.com/NVIDIA/spark-rapids/pull/1124)|Add shim layers for GpuWindowInPandasExec.|
-|[#1131](https://github.com/NVIDIA/spark-rapids/pull/1131)|Add in some basic support for Structs|
-|[#1127](https://github.com/NVIDIA/spark-rapids/pull/1127)|Add in basic support for reading lists from parquet|
-|[#1129](https://github.com/NVIDIA/spark-rapids/pull/1129)|Fix resource leaks with new shuffle optimization|
-|[#1116](https://github.com/NVIDIA/spark-rapids/pull/1116)|Optimize normal shuffle by coalescing smaller batches on host|
-|[#1102](https://github.com/NVIDIA/spark-rapids/pull/1102)|Auto-register UDF extension when main plugin is set|
-|[#1108](https://github.com/NVIDIA/spark-rapids/pull/1108)|Remove integration test pipelines on NGCC|
-|[#1123](https://github.com/NVIDIA/spark-rapids/pull/1123)|Mark Pandas udf over window tests as xfail on databricks until they can be fixed|
-|[#1120](https://github.com/NVIDIA/spark-rapids/pull/1120)|Add in support for filtering ArrayType|
-|[#1080](https://github.com/NVIDIA/spark-rapids/pull/1080)|Support for CalendarIntervalType and NullType for ParquetCachedSerializer|
-|[#994](https://github.com/NVIDIA/spark-rapids/pull/994)|Packs bounce buffers for highly partitioned shuffles|
-|[#1112](https://github.com/NVIDIA/spark-rapids/pull/1112)|Remove bad config from pytest setup|
-|[#1107](https://github.com/NVIDIA/spark-rapids/pull/1107)|closeOnExcept -> withResources in MetaUtils|
-|[#1104](https://github.com/NVIDIA/spark-rapids/pull/1104)|Support lists to/from the GPU|
-|[#1106](https://github.com/NVIDIA/spark-rapids/pull/1106)|Improve mechanism for expected exceptions in tests|
-|[#1069](https://github.com/NVIDIA/spark-rapids/pull/1069)|Accelerate the data transfer between JVM and Python for the plan 'GpuWindowInPandasExec'|
-|[#1099](https://github.com/NVIDIA/spark-rapids/pull/1099)|Update how we deal with type checking|
-|[#1077](https://github.com/NVIDIA/spark-rapids/pull/1077)|Improve AQE transitions for shuffle and coalesce batches|
-|[#1097](https://github.com/NVIDIA/spark-rapids/pull/1097)|Cleanup some instances of excess closure serialization|
-|[#1090](https://github.com/NVIDIA/spark-rapids/pull/1090)|Fix the integration build|
-|[#1086](https://github.com/NVIDIA/spark-rapids/pull/1086)|Speed up test performance using pytest-xdist|
-|[#1084](https://github.com/NVIDIA/spark-rapids/pull/1084)|Avoid issues where more scalars than expected show up in an expression|
-|[#1076](https://github.com/NVIDIA/spark-rapids/pull/1076)|[FEA] Support Databricks 7.3 LTS Runtime|
-|[#1083](https://github.com/NVIDIA/spark-rapids/pull/1083)|Revert "Get cudf/spark dependency from the correct .m2 dir"|
-|[#1062](https://github.com/NVIDIA/spark-rapids/pull/1062)|Get cudf/spark dependency from the correct .m2 dir|
-|[#1078](https://github.com/NVIDIA/spark-rapids/pull/1078)|Another round of fixes for mapping of DataType to DType|
-|[#1066](https://github.com/NVIDIA/spark-rapids/pull/1066)|More fixes for conversion to ColumnarBatch|
-|[#1029](https://github.com/NVIDIA/spark-rapids/pull/1029)|BenchmarkRunner should produce JSON summary file even when queries fail|
-|[#1055](https://github.com/NVIDIA/spark-rapids/pull/1055)|Fix build warnings|
-|[#1064](https://github.com/NVIDIA/spark-rapids/pull/1064)|Use array instead of List for from(Table, DataType)|
-|[#1057](https://github.com/NVIDIA/spark-rapids/pull/1057)|Fix empty table broadcast requiring a GPU on driver node|
-|[#1047](https://github.com/NVIDIA/spark-rapids/pull/1047)|Sanity checks for cudf jar mismatch|
-|[#1044](https://github.com/NVIDIA/spark-rapids/pull/1044)|Accelerated row to columnar and columnar to row transitions|
-|[#1056](https://github.com/NVIDIA/spark-rapids/pull/1056)|Add query number to Spark app name when running benchmarks|
-|[#1054](https://github.com/NVIDIA/spark-rapids/pull/1054)|Log total RMM allocated on GPU OOM|
-|[#1053](https://github.com/NVIDIA/spark-rapids/pull/1053)|Remove isGpuBroadcastNestedLoopJoin from shims|
-|[#1052](https://github.com/NVIDIA/spark-rapids/pull/1052)|Allow for GPUCoalesceBatch to deal with Map|
-|[#1051](https://github.com/NVIDIA/spark-rapids/pull/1051)|Add simple retry for URM dependencies [skip ci]|
-|[#1046](https://github.com/NVIDIA/spark-rapids/pull/1046)|Fix broken links|
-|[#1017](https://github.com/NVIDIA/spark-rapids/pull/1017)|Log whether PTDS is enabled|
-|[#1040](https://github.com/NVIDIA/spark-rapids/pull/1040)|Update to cudf 0.17-SNAPSHOT and fix tests|
-|[#1042](https://github.com/NVIDIA/spark-rapids/pull/1042)|Fix inconsistencies in AQE support for broadcast joins|
-|[#1037](https://github.com/NVIDIA/spark-rapids/pull/1037)|Add in support for the SQL functions Least and Greatest|
-|[#1036](https://github.com/NVIDIA/spark-rapids/pull/1036)|Increase number of retries when waiting for databricks cluster|
-|[#1034](https://github.com/NVIDIA/spark-rapids/pull/1034)|[BUG] To honor spark.rapids.memory.gpu.pool=NONE|
-|[#854](https://github.com/NVIDIA/spark-rapids/pull/854)|Arbitrary function call in UDF|
-|[#1028](https://github.com/NVIDIA/spark-rapids/pull/1028)|Update to cudf-0.16|
-|[#1023](https://github.com/NVIDIA/spark-rapids/pull/1023)|Add --gc-between-run flag for TPC* benchmarks.|
-|[#1001](https://github.com/NVIDIA/spark-rapids/pull/1001)|ColumnarBatch to CachedBatch and back|
-|[#990](https://github.com/NVIDIA/spark-rapids/pull/990)|Parquet coalesce file reader for local filesystems|
-|[#1014](https://github.com/NVIDIA/spark-rapids/pull/1014)|Add --append-dat flag for TPC-DS benchmark|
-|[#991](https://github.com/NVIDIA/spark-rapids/pull/991)|Updated GCP Dataproc Mortgage-ETL-GPU.ipynb|
-|[#886](https://github.com/NVIDIA/spark-rapids/pull/886)|Spark BinaryType and cast to BinaryType|
-|[#1016](https://github.com/NVIDIA/spark-rapids/pull/1016)|Change Hash Aggregate to allow pass-through on MapType|
-|[#984](https://github.com/NVIDIA/spark-rapids/pull/984)|Add support for MapType in selected operators |
-|[#1012](https://github.com/NVIDIA/spark-rapids/pull/1012)|Update for new position parameter in Spark 3.1.0 RegExpReplace|
-|[#995](https://github.com/NVIDIA/spark-rapids/pull/995)|Add shim for EMR 3.0.1 and EMR 3.0.1-SNAPSHOT|
-|[#998](https://github.com/NVIDIA/spark-rapids/pull/998)|Update benchmark automation script|
-|[#1000](https://github.com/NVIDIA/spark-rapids/pull/1000)|Always use RAPIDS shuffle when running TPCH and Mortgage tests|
-|[#981](https://github.com/NVIDIA/spark-rapids/pull/981)|Change databricks build to dynamically create a cluster|
-|[#986](https://github.com/NVIDIA/spark-rapids/pull/986)|Fix missing dataSize metric when using RAPIDS shuffle|
-|[#914](https://github.com/NVIDIA/spark-rapids/pull/914)|Write InternalRow to CachedBatch|
-|[#934](https://github.com/NVIDIA/spark-rapids/pull/934)|Iterator to make it easier to work with a window of blocks in the RAPIDS shuffle|
-|[#992](https://github.com/NVIDIA/spark-rapids/pull/992)|Skip post-clean if aborted before the image build stage in pre-merge [skip ci]|
-|[#988](https://github.com/NVIDIA/spark-rapids/pull/988)|Change in Spark caused the 3.1.0 CI to fail|
-|[#983](https://github.com/NVIDIA/spark-rapids/pull/983)|clean jenkins file for premerge on NGCC|
-|[#964](https://github.com/NVIDIA/spark-rapids/pull/964)|Refactor TPC benchmarks to reduce duplicate code|
-|[#978](https://github.com/NVIDIA/spark-rapids/pull/978)|Enable scalastyle checks for udf-compiler module|
-|[#949](https://github.com/NVIDIA/spark-rapids/pull/949)|Fix GpuWindowExec to work with a CPU SortExec|
-|[#973](https://github.com/NVIDIA/spark-rapids/pull/973)|Stop reporting totalTime metric for GpuShuffleExchangeExec|
-|[#968](https://github.com/NVIDIA/spark-rapids/pull/968)|XFail pos_explode tests until final fix can be put in|
-|[#970](https://github.com/NVIDIA/spark-rapids/pull/970)|Add legacy config to clear active Spark 3.1.0 session in tests|
-|[#918](https://github.com/NVIDIA/spark-rapids/pull/918)|Benchmark runner script|
-|[#915](https://github.com/NVIDIA/spark-rapids/pull/915)|Add option to control number of partitions when converting from CSV to Parquet|
-|[#944](https://github.com/NVIDIA/spark-rapids/pull/944)|Fix some issues with non-determinism|
-|[#935](https://github.com/NVIDIA/spark-rapids/pull/935)|Add in support/tests for a window count on a column|
-|[#940](https://github.com/NVIDIA/spark-rapids/pull/940)|Fix closeOnExcept suppressed exception handling|
-|[#942](https://github.com/NVIDIA/spark-rapids/pull/942)|fix github action env setup [skip ci]|
-|[#933](https://github.com/NVIDIA/spark-rapids/pull/933)|Update first/last tests to avoid non-determinism and ordering differences|
-|[#931](https://github.com/NVIDIA/spark-rapids/pull/931)|Fix checking for nullable columns in window range query|
-|[#924](https://github.com/NVIDIA/spark-rapids/pull/924)|Benchmark guide update for command-line interface / spark-submit|
-|[#926](https://github.com/NVIDIA/spark-rapids/pull/926)|Move pandas_udf functions into the tests functions|
-|[#929](https://github.com/NVIDIA/spark-rapids/pull/929)|Pick a default tableId to use that is non 0 so that flatbuffers allow…|
-|[#928](https://github.com/NVIDIA/spark-rapids/pull/928)|Fix RapidsBufferStore NPE when no spillable buffers are available|
-|[#820](https://github.com/NVIDIA/spark-rapids/pull/820)|Benchmarking guide|
-|[#859](https://github.com/NVIDIA/spark-rapids/pull/859)|Compare partitioned files in order|
-|[#916](https://github.com/NVIDIA/spark-rapids/pull/916)|create new sparkContext explicitly in CPU notebook|
-|[#917](https://github.com/NVIDIA/spark-rapids/pull/917)|create new SparkContext in GPU notebook explicitly.|
-|[#919](https://github.com/NVIDIA/spark-rapids/pull/919)|Add label benchmark to performance subsection in changelog|
-|[#850](https://github.com/NVIDIA/spark-rapids/pull/850)| Add in basic support for lead/lag|
-|[#843](https://github.com/NVIDIA/spark-rapids/pull/843)|[REVIEW] Cache plugin to handle reading CachedBatch to an InternalRow|
-|[#904](https://github.com/NVIDIA/spark-rapids/pull/904)|Add command-line argument for benchmark result filename|
-|[#909](https://github.com/NVIDIA/spark-rapids/pull/909)|GCP preview version image name update|
-|[#903](https://github.com/NVIDIA/spark-rapids/pull/903)|update getting-started-gcp.md with new component list|
-|[#900](https://github.com/NVIDIA/spark-rapids/pull/900)|Turn off CollectLimitExec replacement by default|
-|[#907](https://github.com/NVIDIA/spark-rapids/pull/907)|remove configs from databricks that shouldn't be used by default|
-|[#893](https://github.com/NVIDIA/spark-rapids/pull/893)|Fix rounding error when casting timestamp to string for timestamps before 1970|
-|[#899](https://github.com/NVIDIA/spark-rapids/pull/899)|Mark reduction corner case tests as xfail on databricks until they can be fixed|
-|[#894](https://github.com/NVIDIA/spark-rapids/pull/894)|Replace whole-buffer slicing with direct refcounting|
-|[#891](https://github.com/NVIDIA/spark-rapids/pull/891)|Add config to dump heap on GPU OOM|
-|[#890](https://github.com/NVIDIA/spark-rapids/pull/890)|Clean up CoalesceBatch to use withResource|
-|[#892](https://github.com/NVIDIA/spark-rapids/pull/892)|Only manifest the current batch in cached block shuffle read iterator|
-|[#871](https://github.com/NVIDIA/spark-rapids/pull/871)|Add support for using the arena allocator|
-|[#889](https://github.com/NVIDIA/spark-rapids/pull/889)|Fix crash on scalar only orderby|
-|[#879](https://github.com/NVIDIA/spark-rapids/pull/879)|Update SpillableColumnarBatch to remove buffer from catalog on close|
-|[#888](https://github.com/NVIDIA/spark-rapids/pull/888)|Shrink detect scope to compile only [skip ci]|
-|[#885](https://github.com/NVIDIA/spark-rapids/pull/885)|[BUG] fix IT dockerfile arguments [skip ci]|
-|[#883](https://github.com/NVIDIA/spark-rapids/pull/883)|[BUG] fix IT dockerfile args ordering [skip ci]|
-|[#875](https://github.com/NVIDIA/spark-rapids/pull/875)|fix the inconsistency for `spark.rapids.sql.format.parquet.multiThreadedRead` in RapidsConf.scala|
-|[#862](https://github.com/NVIDIA/spark-rapids/pull/862)|Migrate nightly&integration pipelines to blossom [skip ci]|
-|[#872](https://github.com/NVIDIA/spark-rapids/pull/872)|Ensure that receive-side batches use GpuColumnVectorFromBuffer to avoid|
-|[#833](https://github.com/NVIDIA/spark-rapids/pull/833)|Add nvcomp LZ4 codec support|
-|[#870](https://github.com/NVIDIA/spark-rapids/pull/870)|Cleaned up tests and documentation for csv timestamp parsing|
-|[#823](https://github.com/NVIDIA/spark-rapids/pull/823)|Add command-line interface for TPC-* for use with spark-submit|
-|[#856](https://github.com/NVIDIA/spark-rapids/pull/856)|Move GpuWindowInPandasExec in shims layers|
-|[#756](https://github.com/NVIDIA/spark-rapids/pull/756)|Add stream-time metric|
-|[#832](https://github.com/NVIDIA/spark-rapids/pull/832)|Skip pandas tests if pandas cannot be found|
-|[#841](https://github.com/NVIDIA/spark-rapids/pull/841)|Fix a hanging issue when processing empty data.|
-|[#840](https://github.com/NVIDIA/spark-rapids/pull/840)|[REVIEW] Fixed failing cache tests|
-|[#848](https://github.com/NVIDIA/spark-rapids/pull/848)|Update task memory and disk spill metrics when buffer store spills|
-|[#851](https://github.com/NVIDIA/spark-rapids/pull/851)|Use contiguous table when deserializing columnar batch|
-|[#857](https://github.com/NVIDIA/spark-rapids/pull/857)|fix pvc scheduling issue|
-|[#853](https://github.com/NVIDIA/spark-rapids/pull/853)|Remove nodeAffinity from premerge pipeline|
-|[#796](https://github.com/NVIDIA/spark-rapids/pull/796)|Record spark plan SQL metrics to JSON when running benchmarks|
-|[#781](https://github.com/NVIDIA/spark-rapids/pull/781)|Add AQE unit tests|
-|[#824](https://github.com/NVIDIA/spark-rapids/pull/824)|Skip cudf_udf test by default|
-|[#839](https://github.com/NVIDIA/spark-rapids/pull/839)|First/Last reduction and cleanup of agg APIs|
-|[#827](https://github.com/NVIDIA/spark-rapids/pull/827)|Add Spark 3.0 EMR Shim layer |
-|[#816](https://github.com/NVIDIA/spark-rapids/pull/816)|[BUG] fix nightly is timing out|
-|[#782](https://github.com/NVIDIA/spark-rapids/pull/782)|Benchmark utility to perform diff of output from benchmark runs, allowing for precision differences|
-|[#813](https://github.com/NVIDIA/spark-rapids/pull/813)|Revert "Enable tests in udf_cudf_test.py"|
-|[#788](https://github.com/NVIDIA/spark-rapids/pull/788)|[FEA] Persist workspace data on PVC for premerge|
-|[#805](https://github.com/NVIDIA/spark-rapids/pull/805)|[FEA] nightly build trigger both IT on spark 300 and 301|
-|[#797](https://github.com/NVIDIA/spark-rapids/pull/797)|Allow host spill store to fit a buffer larger than configured max size|
-|[#807](https://github.com/NVIDIA/spark-rapids/pull/807)|Deploy integration-tests javadoc and sources|
-|[#777](https://github.com/NVIDIA/spark-rapids/pull/777)|Enable tests in udf_cudf_test.py|
-|[#790](https://github.com/NVIDIA/spark-rapids/pull/790)|CI: Update cudf python to 0.16 nightly|
-|[#772](https://github.com/NVIDIA/spark-rapids/pull/772)|Add support for empty array construction.|
-|[#783](https://github.com/NVIDIA/spark-rapids/pull/783)|Improved GpuArrowEvalPythonExec|
-|[#771](https://github.com/NVIDIA/spark-rapids/pull/771)|Various improvements to benchmarks|
-|[#763](https://github.com/NVIDIA/spark-rapids/pull/763)|[REVIEW] Allow CoalesceBatch to spill data that is not in active use|
-|[#727](https://github.com/NVIDIA/spark-rapids/pull/727)|Update cudf dependency to 0.16-SNAPSHOT|
-|[#726](https://github.com/NVIDIA/spark-rapids/pull/726)|parquet writer support for TIMESTAMP_MILLIS|
-|[#674](https://github.com/NVIDIA/spark-rapids/pull/674)|Unit test for GPU exchange re-use with AQE|
-|[#723](https://github.com/NVIDIA/spark-rapids/pull/723)|Update code coverage to find source files in new places|
-|[#766](https://github.com/NVIDIA/spark-rapids/pull/766)|Update the integration Dockerfile to reduce the image size|
-|[#762](https://github.com/NVIDIA/spark-rapids/pull/762)|Fixing conflicts in branch-0.3|
-|[#738](https://github.com/NVIDIA/spark-rapids/pull/738)|[auto-merge] branch-0.2 to branch-0.3 - resolve conflict|
-|[#722](https://github.com/NVIDIA/spark-rapids/pull/722)|Initial code changes to support spilling outside of shuffle|
-|[#693](https://github.com/NVIDIA/spark-rapids/pull/693)|Update jenkins files for 0.3|
-|[#692](https://github.com/NVIDIA/spark-rapids/pull/692)|Merge shims dependency to spark-3.0.1 into branch-0.3|
-|[#690](https://github.com/NVIDIA/spark-rapids/pull/690)|Update the version to 0.3.0-SNAPSHOT|
-
-## Release 0.2
-
-### Features
-|||
-|:---|:---|
-|[#696](https://github.com/NVIDIA/spark-rapids/issues/696)|[FEA] run integration tests against SPARK-3.0.1|
-|[#455](https://github.com/NVIDIA/spark-rapids/issues/455)|[FEA] Support UCX shuffle with optimized AQE|
-|[#510](https://github.com/NVIDIA/spark-rapids/issues/510)|[FEA] Investigate libcudf features needed to support struct schema pruning during loads|
-|[#541](https://github.com/NVIDIA/spark-rapids/issues/541)|[FEA] Scala UDF:Support for null Value operands|
-|[#542](https://github.com/NVIDIA/spark-rapids/issues/542)|[FEA] Scala UDF: Support for Date and Time |
-|[#499](https://github.com/NVIDIA/spark-rapids/issues/499)|[FEA] disable any kind of warnings about ExecutedCommandExec not being on the GPU|
-|[#540](https://github.com/NVIDIA/spark-rapids/issues/540)|[FEA] Scala UDF: Support for String replaceFirst()|
-|[#340](https://github.com/NVIDIA/spark-rapids/issues/340)|[FEA] widen the rendered Jekyll pages|
-|[#602](https://github.com/NVIDIA/spark-rapids/issues/602)|[FEA] don't release with any -SNAPSHOT dependencies|
-|[#579](https://github.com/NVIDIA/spark-rapids/issues/579)|[FEA] Auto-merge between branches|
-|[#515](https://github.com/NVIDIA/spark-rapids/issues/515)|[FEA] Write tests for AQE skewed join optimization|
-|[#452](https://github.com/NVIDIA/spark-rapids/issues/452)|[FEA] Update HashSortOptimizerSuite to work with AQE|
-|[#454](https://github.com/NVIDIA/spark-rapids/issues/454)|[FEA] Update GpuCoalesceBatchesSuite to work with AQE enabled|
-|[#354](https://github.com/NVIDIA/spark-rapids/issues/354)|[FEA]Spark 3.1 FileSourceScanExec adds parameter optionalNumCoalescedBuckets|
-|[#566](https://github.com/NVIDIA/spark-rapids/issues/566)|[FEA] Add support for StringSplit with an array index.|
-|[#524](https://github.com/NVIDIA/spark-rapids/issues/524)|[FEA] Add GPU specific metrics to GpuFileSourceScanExec|
-|[#494](https://github.com/NVIDIA/spark-rapids/issues/494)|[FEA] Add some AQE-specific tests to the PySpark test suite|
-|[#146](https://github.com/NVIDIA/spark-rapids/issues/146)|[FEA] Python tests should support running with Adaptive Query Execution enabled|
-|[#465](https://github.com/NVIDIA/spark-rapids/issues/465)|[FEA] Audit: Update script to audit multiple versions of Spark |
-|[#488](https://github.com/NVIDIA/spark-rapids/issues/488)|[FEA] Ability to limit total GPU memory used|
-|[#70](https://github.com/NVIDIA/spark-rapids/issues/70)|[FEA] Support StringSplit|
-|[#403](https://github.com/NVIDIA/spark-rapids/issues/403)|[FEA] Add in support for GetArrayItem|
-|[#493](https://github.com/NVIDIA/spark-rapids/issues/493)|[FEA] Implement shuffle optimization when AQE is enabled|
-|[#500](https://github.com/NVIDIA/spark-rapids/issues/500)|[FEA] Add maven profiles for testing with AQE on or off|
-|[#471](https://github.com/NVIDIA/spark-rapids/issues/471)|[FEA] create a formal process for updating the github-pages branch|
-|[#233](https://github.com/NVIDIA/spark-rapids/issues/233)|[FEA] Audit DataWritingCommandExec |
-|[#240](https://github.com/NVIDIA/spark-rapids/issues/240)|[FEA] Audit Api validation script follow on - Optimize StringToTypeTag |
-|[#388](https://github.com/NVIDIA/spark-rapids/issues/388)|[FEA] Audit WindowExec|
-|[#425](https://github.com/NVIDIA/spark-rapids/issues/425)|[FEA] Add tests for configs in BatchScan Readers|
-|[#453](https://github.com/NVIDIA/spark-rapids/issues/453)|[FEA] Update HashAggregatesSuite to work with AQE|
-|[#184](https://github.com/NVIDIA/spark-rapids/issues/184)|[FEA] Enable NoScalaDoc scalastyle rule|
-|[#438](https://github.com/NVIDIA/spark-rapids/issues/438)|[FEA] Enable StringLPad|
-|[#232](https://github.com/NVIDIA/spark-rapids/issues/232)|[FEA] Audit SortExec |
-|[#236](https://github.com/NVIDIA/spark-rapids/issues/236)|[FEA] Audit ShuffleExchangeExec |
-|[#355](https://github.com/NVIDIA/spark-rapids/issues/355)|[FEA] Support Multiple Spark versions in the same jar|
-|[#385](https://github.com/NVIDIA/spark-rapids/issues/385)|[FEA] Support RangeExec on the GPU|
-|[#317](https://github.com/NVIDIA/spark-rapids/issues/317)|[FEA] Write test wrapper to run SQL queries via pyspark|
-|[#235](https://github.com/NVIDIA/spark-rapids/issues/235)|[FEA] Audit BroadcastExchangeExec|
-|[#234](https://github.com/NVIDIA/spark-rapids/issues/234)|[FEA] Audit BatchScanExec|
-|[#238](https://github.com/NVIDIA/spark-rapids/issues/238)|[FEA] Audit ShuffledHashJoinExec |
-|[#237](https://github.com/NVIDIA/spark-rapids/issues/237)|[FEA] Audit BroadcastHashJoinExec |
-|[#316](https://github.com/NVIDIA/spark-rapids/issues/316)|[FEA] Add some basic Dataframe tests for CoalesceExec|
-|[#145](https://github.com/NVIDIA/spark-rapids/issues/145)|[FEA] Scala tests should support running with Adaptive Query Execution enabled|
-|[#231](https://github.com/NVIDIA/spark-rapids/issues/231)|[FEA] Audit ProjectExec |
-|[#229](https://github.com/NVIDIA/spark-rapids/issues/229)|[FEA] Audit FileSourceScanExec |
-
-### Performance
-|||
-|:---|:---|
-|[#326](https://github.com/NVIDIA/spark-rapids/issues/326)|[DISCUSS] Shuffle read-side error handling|
-|[#601](https://github.com/NVIDIA/spark-rapids/issues/601)|[FEA] Optimize unnecessary sorts when replacing SortAggregate|
-|[#333](https://github.com/NVIDIA/spark-rapids/issues/333)|[FEA] Better handling of reading lots of small Parquet files|
-|[#511](https://github.com/NVIDIA/spark-rapids/issues/511)|[FEA] Connect shuffle table compression to shuffle exec metrics|
-|[#15](https://github.com/NVIDIA/spark-rapids/issues/15)|[FEA] Multiple threads sharing the same GPU|
-|[#272](https://github.com/NVIDIA/spark-rapids/issues/272)|[DOC] Getting started guide for UCX shuffle|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#780](https://github.com/NVIDIA/spark-rapids/issues/780)|[BUG] Inner Join dropping data with bucketed Table input|
-|[#569](https://github.com/NVIDIA/spark-rapids/issues/569)|[BUG] left_semi_join operation is abnormal and seriously time-consuming|
-|[#744](https://github.com/NVIDIA/spark-rapids/issues/744)|[BUG] TPC-DS query 6 now produces incorrect results.|
-|[#718](https://github.com/NVIDIA/spark-rapids/issues/718)|[BUG] GpuBroadcastHashJoinExec ArrayIndexOutOfBoundsException|
-|[#698](https://github.com/NVIDIA/spark-rapids/issues/698)|[BUG] batch coalesce can fail to appear between columnar shuffle and subsequent columnar operation|
-|[#658](https://github.com/NVIDIA/spark-rapids/issues/658)|[BUG] GpuCoalesceBatches collectTime metric can be underreported|
-|[#59](https://github.com/NVIDIA/spark-rapids/issues/59)|[BUG] enable tests for string literals in a select|
-|[#486](https://github.com/NVIDIA/spark-rapids/issues/486)|[BUG] GpuWindowExec does not implement requiredChildOrdering|
-|[#631](https://github.com/NVIDIA/spark-rapids/issues/631)|[BUG] Rows are dropped when AQE is enabled in some cases|
-|[#671](https://github.com/NVIDIA/spark-rapids/issues/671)|[BUG] Databricks hash_aggregate_test fails trying to canonicalize a WrappedAggFunction|
-|[#218](https://github.com/NVIDIA/spark-rapids/issues/218)|[BUG] Window function COUNT(x) includes null-values, when it shouldn't|
-|[#153](https://github.com/NVIDIA/spark-rapids/issues/153)|[BUG] Incorrect output from partial-only hash aggregates with multiple distincts and non-distinct functions|
-|[#656](https://github.com/NVIDIA/spark-rapids/issues/656)|[BUG] integration tests produce hive metadata files|
-|[#607](https://github.com/NVIDIA/spark-rapids/issues/607)|[BUG] Fix misleading "cannot run on GPU" warnings when AQE is enabled|
-|[#630](https://github.com/NVIDIA/spark-rapids/issues/630)|[BUG] GpuCustomShuffleReader metrics always show zero rows/batches output|
-|[#643](https://github.com/NVIDIA/spark-rapids/issues/643)|[BUG] race condition while registering a buffer and spilling at the same time|
-|[#606](https://github.com/NVIDIA/spark-rapids/issues/606)|[BUG] Multiple scans for same data source with TPC-DS query59 with delta format|
-|[#626](https://github.com/NVIDIA/spark-rapids/issues/626)|[BUG] parquet_test showing leaked memory buffer|
-|[#155](https://github.com/NVIDIA/spark-rapids/issues/155)|[BUG] Incorrect output from averages with filters in partial only mode|
-|[#277](https://github.com/NVIDIA/spark-rapids/issues/277)|[BUG] HashAggregateSuite failure when AQE is enabled|
-|[#276](https://github.com/NVIDIA/spark-rapids/issues/276)|[BUG] GpuCoalesceBatchSuite failure when AQE is enabled|
-|[#598](https://github.com/NVIDIA/spark-rapids/issues/598)|[BUG] Non-deterministic output from MapOutputTracker.getStatistics() with AQE on GPU|
-|[#192](https://github.com/NVIDIA/spark-rapids/issues/192)|[BUG] test_read_merge_schema fails on Databricks|
-|[#341](https://github.com/NVIDIA/spark-rapids/issues/341)|[BUG] Document compression formats for readers/writers|
-|[#587](https://github.com/NVIDIA/spark-rapids/issues/587)|[BUG] Spark3.1 changed FileScan which means our GpuScans need to be added to shim layer|
-|[#362](https://github.com/NVIDIA/spark-rapids/issues/362)|[BUG] Implement getReaderForRange in the RapidsShuffleManager|
-|[#528](https://github.com/NVIDIA/spark-rapids/issues/528)|[BUG] HashAggregateSuite "Avg Distinct with filter" no longer valid when testing against Spark 3.1.0|
-|[#416](https://github.com/NVIDIA/spark-rapids/issues/416)|[BUG] Fix Spark 3.1.0 integration tests|
-|[#556](https://github.com/NVIDIA/spark-rapids/issues/556)|[BUG] NPE when removing shuffle|
-|[#553](https://github.com/NVIDIA/spark-rapids/issues/553)|[BUG] GpuColumnVector build warnings from raw type access|
-|[#492](https://github.com/NVIDIA/spark-rapids/issues/492)|[BUG] Re-enable AQE integration tests|
-|[#275](https://github.com/NVIDIA/spark-rapids/issues/275)|[BUG] TpchLike query 2 fails when AQE is enabled|
-|[#508](https://github.com/NVIDIA/spark-rapids/issues/508)|[BUG] GpuUnion publishes metrics on the UI that are all 0|
-|[#269](https://github.com/NVIDIA/spark-rapids/issues/269)|Needed to add `--conf spark.driver.extraClassPath=` |
-|[#473](https://github.com/NVIDIA/spark-rapids/issues/473)|[BUG] PartMerge:countDistinct:sum fails sporadically|
-|[#531](https://github.com/NVIDIA/spark-rapids/issues/531)|[BUG] Temporary RMM workaround needs to be removed|
-|[#532](https://github.com/NVIDIA/spark-rapids/issues/532)|[BUG] NPE when enabling shuffle manager|
-|[#525](https://github.com/NVIDIA/spark-rapids/issues/525)|[BUG] GpuFilterExec reports incorrect nullability of output in some cases|
-|[#483](https://github.com/NVIDIA/spark-rapids/issues/483)|[BUG] Multiple scans for the same parquet data source|
-|[#382](https://github.com/NVIDIA/spark-rapids/issues/382)|[BUG] Spark3.1 StringFallbackSuite regexp_replace null cpu fall back test fails.|
-|[#489](https://github.com/NVIDIA/spark-rapids/issues/489)|[FEA] Fix Spark 3.1 GpuHashJoin since it now requires CodegenSupport|
-|[#441](https://github.com/NVIDIA/spark-rapids/issues/441)|[BUG] test_broadcast_nested_loop_join_special_case fails on databricks|
-|[#347](https://github.com/NVIDIA/spark-rapids/issues/347)|[BUG] Failed to read Parquet file generated by GPU-enabled Spark.|
-|[#433](https://github.com/NVIDIA/spark-rapids/issues/433)|`InSet` operator produces an error for Strings|
-|[#144](https://github.com/NVIDIA/spark-rapids/issues/144)|[BUG] spark.sql.legacy.parquet.datetimeRebaseModeInWrite is ignored|
-|[#323](https://github.com/NVIDIA/spark-rapids/issues/323)|[BUG] GpuBroadcastNestedLoopJoinExec can fail if there are no columns|
-|[#356](https://github.com/NVIDIA/spark-rapids/issues/356)|[BUG] Integration cache test for BroadcastNestedLoopJoin failure|
-|[#280](https://github.com/NVIDIA/spark-rapids/issues/280)|[BUG] Full Outer Join does not work on nullable keys|
-|[#149](https://github.com/NVIDIA/spark-rapids/issues/149)|[BUG] Spark driver fails to load native libs when running on node without CUDA|
-
-### PRs
-|||
-|:---|:---|
-|[#826](https://github.com/NVIDIA/spark-rapids/pull/826)|Fix link to cudf-0.15-cuda11.jar|
-|[#815](https://github.com/NVIDIA/spark-rapids/pull/815)|Update documentation for Scala UDFs in 0.2 since you need two things|
-|[#802](https://github.com/NVIDIA/spark-rapids/pull/802)|Update 0.2 CHANGELOG|
-|[#793](https://github.com/NVIDIA/spark-rapids/pull/793)|Update Jenkins scripts for release|
-|[#798](https://github.com/NVIDIA/spark-rapids/pull/798)|Fix shims provider override config not being seen by executors|
-|[#785](https://github.com/NVIDIA/spark-rapids/pull/785)|Make shuffle run on CPU if we do a join where we read from bucketed table|
-|[#765](https://github.com/NVIDIA/spark-rapids/pull/765)|Add config to override shims provider class|
-|[#759](https://github.com/NVIDIA/spark-rapids/pull/759)|Add CHANGELOG for release 0.2|
-|[#758](https://github.com/NVIDIA/spark-rapids/pull/758)|Skip the udf test that fails periodically.|
-|[#752](https://github.com/NVIDIA/spark-rapids/pull/752)|Fix snapshot plugin jar version in docs|
-|[#751](https://github.com/NVIDIA/spark-rapids/pull/751)|Correct the channel for cudf installation|
-|[#754](https://github.com/NVIDIA/spark-rapids/pull/754)|Filter nulls from joins where possible to improve performance|
-|[#732](https://github.com/NVIDIA/spark-rapids/pull/732)|Add a timeout for RapidsShuffleIterator to prevent jobs to hang infin…|
-|[#637](https://github.com/NVIDIA/spark-rapids/pull/637)|Documentation changes for 0.2 release |
-|[#747](https://github.com/NVIDIA/spark-rapids/pull/747)|Disable udf tests that fail periodically|
-|[#745](https://github.com/NVIDIA/spark-rapids/pull/745)|Revert Null Join Filter|
-|[#741](https://github.com/NVIDIA/spark-rapids/pull/741)|Fix issue with parquet partitioned reads|
-|[#733](https://github.com/NVIDIA/spark-rapids/pull/733)|Remove GPU Types from github|
-|[#720](https://github.com/NVIDIA/spark-rapids/pull/720)|Stop removing GpuCoalesceBatches from non-AQE queries when AQE is enabled|
-|[#729](https://github.com/NVIDIA/spark-rapids/pull/729)|Fix collect time metric in CoalesceBatches|
-|[#640](https://github.com/NVIDIA/spark-rapids/pull/640)|Support running Pandas UDFs on GPUs in Python processes.|
-|[#721](https://github.com/NVIDIA/spark-rapids/pull/721)|Add some more checks to databricks build scripts|
-|[#714](https://github.com/NVIDIA/spark-rapids/pull/714)|Move spark 3.0.1-shims out of snapshot-shims|
-|[#711](https://github.com/NVIDIA/spark-rapids/pull/711)|fix blossom checkout repo|
-|[#709](https://github.com/NVIDIA/spark-rapids/pull/709)|[BUG] fix unexpected indentation issue in blossom yml|
-|[#642](https://github.com/NVIDIA/spark-rapids/pull/642)|Init workflow for blossom-ci|
-|[#705](https://github.com/NVIDIA/spark-rapids/pull/705)|Enable configuration check for cast string to timestamp|
-|[#702](https://github.com/NVIDIA/spark-rapids/pull/702)|Update slack channel for Jenkins builds|
-|[#701](https://github.com/NVIDIA/spark-rapids/pull/701)|fix checkout-ref for automerge|
-|[#695](https://github.com/NVIDIA/spark-rapids/pull/695)|Fix spark-3.0.1 shim to be released|
-|[#668](https://github.com/NVIDIA/spark-rapids/pull/668)|refactor automerge to support merge for protected branch|
-|[#687](https://github.com/NVIDIA/spark-rapids/pull/687)|Include the UDF compiler in the dist jar|
-|[#689](https://github.com/NVIDIA/spark-rapids/pull/689)|Change shims dependency to spark-3.0.1|
-|[#677](https://github.com/NVIDIA/spark-rapids/pull/677)|Use multi-threaded parquet read with small files|
-|[#638](https://github.com/NVIDIA/spark-rapids/pull/638)|Add Parquet-based cache serializer|
-|[#613](https://github.com/NVIDIA/spark-rapids/pull/613)|Enable UCX + AQE|
-|[#684](https://github.com/NVIDIA/spark-rapids/pull/684)|Enable test for literal string values in a select|
-|[#686](https://github.com/NVIDIA/spark-rapids/pull/686)|Remove sorts when replacing sort aggregate if possible|
-|[#675](https://github.com/NVIDIA/spark-rapids/pull/675)|Added TimeAdd|
-|[#645](https://github.com/NVIDIA/spark-rapids/pull/645)|[window] Add GpuWindowExec requiredChildOrdering|
-|[#676](https://github.com/NVIDIA/spark-rapids/pull/676)|fixUpJoinConsistency rule now works when AQE is enabled|
-|[#683](https://github.com/NVIDIA/spark-rapids/pull/683)|Fix issues with canonicalization of WrappedAggFunction|
-|[#682](https://github.com/NVIDIA/spark-rapids/pull/682)|Fix path to start-slave.sh script in docs|
-|[#673](https://github.com/NVIDIA/spark-rapids/pull/673)|Increase build timeouts on nightly and premerge builds|
-|[#648](https://github.com/NVIDIA/spark-rapids/pull/648)|add signoff-check use github actions|
-|[#593](https://github.com/NVIDIA/spark-rapids/pull/593)|Add support for isNaN and datetime related instructions in UDF compiler|
-|[#666](https://github.com/NVIDIA/spark-rapids/pull/666)|[window] Disable GPU for COUNT(exp) queries|
-|[#655](https://github.com/NVIDIA/spark-rapids/pull/655)|Implement AQE unit test for InsertAdaptiveSparkPlan|
-|[#614](https://github.com/NVIDIA/spark-rapids/pull/614)|Fix for aggregation with multiple distinct and non distinct functions|
-|[#657](https://github.com/NVIDIA/spark-rapids/pull/657)|Fix verify build after integration tests are run|
-|[#660](https://github.com/NVIDIA/spark-rapids/pull/660)|Add in neverReplaceExec and several rules for it|
-|[#639](https://github.com/NVIDIA/spark-rapids/pull/639)|BooleanType test shouldn't xfail|
-|[#652](https://github.com/NVIDIA/spark-rapids/pull/652)|Mark UVM config as internal until supported|
-|[#653](https://github.com/NVIDIA/spark-rapids/pull/653)|Move to the cudf-0.15 release|
-|[#647](https://github.com/NVIDIA/spark-rapids/pull/647)|Improve warnings about AQE nodes not supported on GPU|
-|[#646](https://github.com/NVIDIA/spark-rapids/pull/646)|Stop reporting zero metrics for GpuCustomShuffleReader|
-|[#644](https://github.com/NVIDIA/spark-rapids/pull/644)|Small fix for race in catalog where a buffer could get spilled while …|
-|[#623](https://github.com/NVIDIA/spark-rapids/pull/623)|Fix issues with canonicalization|
-|[#599](https://github.com/NVIDIA/spark-rapids/pull/599)|[FEA] changelog generator|
-|[#563](https://github.com/NVIDIA/spark-rapids/pull/563)|cudf and spark version info in artifacts|
-|[#633](https://github.com/NVIDIA/spark-rapids/pull/633)|Fix leak if RebaseHelper throws during Parquet read|
-|[#632](https://github.com/NVIDIA/spark-rapids/pull/632)|Copy function isSearchableType from Spark because signature changed in 3.0.1|
-|[#583](https://github.com/NVIDIA/spark-rapids/pull/583)|Add udf compiler unit tests|
-|[#617](https://github.com/NVIDIA/spark-rapids/pull/617)|Documentation updates for branch 0.2|
-|[#616](https://github.com/NVIDIA/spark-rapids/pull/616)|Add config to reserve GPU memory|
-|[#612](https://github.com/NVIDIA/spark-rapids/pull/612)|[REVIEW] Fix incorrect output from averages with filters in partial only mode|
-|[#609](https://github.com/NVIDIA/spark-rapids/pull/609)|fix minor issues with instructions for building ucx|
-|[#611](https://github.com/NVIDIA/spark-rapids/pull/611)|Added in profile to enable shims for SNAPSHOT releases|
-|[#595](https://github.com/NVIDIA/spark-rapids/pull/595)|Parquet small file reading optimization|
-|[#582](https://github.com/NVIDIA/spark-rapids/pull/582)|fix #579 Auto-merge between branches|
-|[#536](https://github.com/NVIDIA/spark-rapids/pull/536)|Add test for skewed join optimization when AQE is enabled|
-|[#603](https://github.com/NVIDIA/spark-rapids/pull/603)|Fix data size metric always 0 when using RAPIDS shuffle|
-|[#600](https://github.com/NVIDIA/spark-rapids/pull/600)|Fix calculation of string data for compressed batches|
-|[#597](https://github.com/NVIDIA/spark-rapids/pull/597)|Remove the xfail for parquet test_read_merge_schema on Databricks|
-|[#591](https://github.com/NVIDIA/spark-rapids/pull/591)|Add ucx license in NOTICE-binary|
-|[#596](https://github.com/NVIDIA/spark-rapids/pull/596)|Add Spark 3.0.2 to Shim layer|
-|[#594](https://github.com/NVIDIA/spark-rapids/pull/594)|Filter nulls from joins where possible to improve performance.|
-|[#590](https://github.com/NVIDIA/spark-rapids/pull/590)|Move GpuParquetScan/GpuOrcScan into Shim|
-|[#588](https://github.com/NVIDIA/spark-rapids/pull/588)|xfail the tpch spark 3.1.0 tests that fail|
-|[#572](https://github.com/NVIDIA/spark-rapids/pull/572)|Update buffer store to return compressed batches directly, add compression NVTX ranges|
-|[#558](https://github.com/NVIDIA/spark-rapids/pull/558)|Fix unit tests when AQE is enabled|
-|[#580](https://github.com/NVIDIA/spark-rapids/pull/580)|xfail the Spark 3.1.0 integration tests that fail |
-|[#565](https://github.com/NVIDIA/spark-rapids/pull/565)|Minor improvements to TPC-DS benchmarking code|
-|[#567](https://github.com/NVIDIA/spark-rapids/pull/567)|Explicitly disable AQE in one test|
-|[#571](https://github.com/NVIDIA/spark-rapids/pull/571)|Fix Databricks shim layer for GpuFileSourceScanExec and GpuBroadcastExchangeExec|
-|[#564](https://github.com/NVIDIA/spark-rapids/pull/564)|Add GPU decode time metric to scans|
-|[#562](https://github.com/NVIDIA/spark-rapids/pull/562)|getCatalog can be called from the driver, and can return null|
-|[#555](https://github.com/NVIDIA/spark-rapids/pull/555)|Fix build warnings for ColumnViewAccess|
-|[#560](https://github.com/NVIDIA/spark-rapids/pull/560)|Fix databricks build for AQE support|
-|[#557](https://github.com/NVIDIA/spark-rapids/pull/557)|Fix tests failing on Spark 3.1|
-|[#547](https://github.com/NVIDIA/spark-rapids/pull/547)|Add GPU metrics to GpuFileSourceScanExec|
-|[#462](https://github.com/NVIDIA/spark-rapids/pull/462)|Implement optimized AQE support so that exchanges run on GPU where possible|
-|[#550](https://github.com/NVIDIA/spark-rapids/pull/550)|Document Parquet and ORC compression support|
-|[#539](https://github.com/NVIDIA/spark-rapids/pull/539)|Update script to audit multiple Spark versions|
-|[#543](https://github.com/NVIDIA/spark-rapids/pull/543)|Add metrics to GpuUnion operator|
-|[#549](https://github.com/NVIDIA/spark-rapids/pull/549)|Move spark shim properties to top level pom|
-|[#497](https://github.com/NVIDIA/spark-rapids/pull/497)|Add UDF compiler implementations|
-|[#487](https://github.com/NVIDIA/spark-rapids/pull/487)|Add framework for batch compression of shuffle partitions|
-|[#544](https://github.com/NVIDIA/spark-rapids/pull/544)|Add in driverExtraClassPath for standalone mode docs|
-|[#546](https://github.com/NVIDIA/spark-rapids/pull/546)|Fix Spark 3.1.0 shim build error in GpuHashJoin|
-|[#537](https://github.com/NVIDIA/spark-rapids/pull/537)|Use fresh SparkSession when capturing to avoid late capture of previous query|
-|[#538](https://github.com/NVIDIA/spark-rapids/pull/538)|Revert "Temporary workaround for RMM initial pool size bug (#530)"|
-|[#517](https://github.com/NVIDIA/spark-rapids/pull/517)|Add config to limit maximum RMM pool size|
-|[#527](https://github.com/NVIDIA/spark-rapids/pull/527)|Add support for split and getArrayIndex|
-|[#534](https://github.com/NVIDIA/spark-rapids/pull/534)|Fixes bugs around GpuShuffleEnv initialization|
-|[#529](https://github.com/NVIDIA/spark-rapids/pull/529)|[BUG] Degenerate table metas were not getting copied to the heap|
-|[#530](https://github.com/NVIDIA/spark-rapids/pull/530)|Temporary workaround for RMM initial pool size bug|
-|[#526](https://github.com/NVIDIA/spark-rapids/pull/526)|Fix bug with nullability reporting in GpuFilterExec|
-|[#521](https://github.com/NVIDIA/spark-rapids/pull/521)|Fix typo with databricks shim classname SparkShimServiceProvider|
-|[#522](https://github.com/NVIDIA/spark-rapids/pull/522)|Use SQLConf instead of SparkConf when looking up SQL configs|
-|[#518](https://github.com/NVIDIA/spark-rapids/pull/518)|Fix init order issue in GpuShuffleEnv when RAPIDS shuffle configured|
-|[#514](https://github.com/NVIDIA/spark-rapids/pull/514)|Added clarification of RegExpReplace, DateDiff, made descriptive text consistent|
-|[#506](https://github.com/NVIDIA/spark-rapids/pull/506)|Add in basic support for running tpcds like queries|
-|[#504](https://github.com/NVIDIA/spark-rapids/pull/504)|Add ability to ignore tests depending on spark shim version|
-|[#503](https://github.com/NVIDIA/spark-rapids/pull/503)|Remove unused async buffer spill support|
-|[#501](https://github.com/NVIDIA/spark-rapids/pull/501)|disable codegen in 3.1 shim for hash join|
-|[#466](https://github.com/NVIDIA/spark-rapids/pull/466)|Optimize and fix Api validation script|
-|[#481](https://github.com/NVIDIA/spark-rapids/pull/481)|Codeowners|
-|[#439](https://github.com/NVIDIA/spark-rapids/pull/439)|Check a PR has been committed using git signoff|
-|[#319](https://github.com/NVIDIA/spark-rapids/pull/319)|Update partitioning logic in ShuffledBatchRDD|
-|[#491](https://github.com/NVIDIA/spark-rapids/pull/491)|Temporarily ignore AQE integration tests|
-|[#490](https://github.com/NVIDIA/spark-rapids/pull/490)|Fix Spark 3.1.0 build for HashJoin changes|
-|[#482](https://github.com/NVIDIA/spark-rapids/pull/482)|Prevent bad practice in python tests|
-|[#485](https://github.com/NVIDIA/spark-rapids/pull/485)|Show plan in assertion message if test fails|
-|[#480](https://github.com/NVIDIA/spark-rapids/pull/480)|Fix link from README to getting-started.md|
-|[#448](https://github.com/NVIDIA/spark-rapids/pull/448)|Preliminary support for keeping broadcast exchanges on GPU when AQE is enabled|
-|[#478](https://github.com/NVIDIA/spark-rapids/pull/478)|Fall back to CPU for binary as string in parquet|
-|[#477](https://github.com/NVIDIA/spark-rapids/pull/477)|Fix special case joins in broadcast nested loop join|
-|[#469](https://github.com/NVIDIA/spark-rapids/pull/469)|Update HashAggregateSuite to work with AQE|
-|[#475](https://github.com/NVIDIA/spark-rapids/pull/475)|Udf compiler pom followup|
-|[#434](https://github.com/NVIDIA/spark-rapids/pull/434)|Add UDF compiler skeleton|
-|[#474](https://github.com/NVIDIA/spark-rapids/pull/474)|Re-enable noscaladoc check|
-|[#461](https://github.com/NVIDIA/spark-rapids/pull/461)|Fix comments style to pass scala style check|
-|[#468](https://github.com/NVIDIA/spark-rapids/pull/468)|fix broken link|
-|[#456](https://github.com/NVIDIA/spark-rapids/pull/456)|Add closeOnExcept to clean up code that closes resources only on exceptions|
-|[#464](https://github.com/NVIDIA/spark-rapids/pull/464)|Turn off noscaladoc rule until codebase is fixed|
-|[#449](https://github.com/NVIDIA/spark-rapids/pull/449)|Enforce NoScalaDoc rule in scalastyle checks|
-|[#450](https://github.com/NVIDIA/spark-rapids/pull/450)|Enable scalastyle for shuffle plugin|
-|[#451](https://github.com/NVIDIA/spark-rapids/pull/451)|Databricks remove unneeded files and fix build to not fail on rm when file missing|
-|[#442](https://github.com/NVIDIA/spark-rapids/pull/442)|Shim layer support for Spark 3.0.0 Databricks|
-|[#447](https://github.com/NVIDIA/spark-rapids/pull/447)|Add scalastyle plugin to shim module|
-|[#426](https://github.com/NVIDIA/spark-rapids/pull/426)|Update BufferMeta to support multiple codec buffers per table|
-|[#440](https://github.com/NVIDIA/spark-rapids/pull/440)|Run mortgage test both with AQE on and off|
-|[#445](https://github.com/NVIDIA/spark-rapids/pull/445)|Added in StringRPad and StringLPad|
-|[#422](https://github.com/NVIDIA/spark-rapids/pull/422)|Documentation updates|
-|[#437](https://github.com/NVIDIA/spark-rapids/pull/437)|Fix bug with InSet and Strings|
-|[#435](https://github.com/NVIDIA/spark-rapids/pull/435)|Add in checks for Parquet LEGACY date/time rebase|
-|[#432](https://github.com/NVIDIA/spark-rapids/pull/432)|Fix batch use-after-close in partitioning, shuffle env init|
-|[#423](https://github.com/NVIDIA/spark-rapids/pull/423)|Fix duplicates includes in assembly jar|
-|[#418](https://github.com/NVIDIA/spark-rapids/pull/418)|CI Add unit tests running for Spark 3.0.1|
-|[#421](https://github.com/NVIDIA/spark-rapids/pull/421)|Make it easier to run TPCxBB benchmarks from spark shell|
-|[#413](https://github.com/NVIDIA/spark-rapids/pull/413)|Fix download link|
-|[#414](https://github.com/NVIDIA/spark-rapids/pull/414)|Shim Layer to support multiple Spark versions |
-|[#406](https://github.com/NVIDIA/spark-rapids/pull/406)|Update cast handling to deal with new libcudf casting limitations|
-|[#405](https://github.com/NVIDIA/spark-rapids/pull/405)|Change slave->worker|
-|[#395](https://github.com/NVIDIA/spark-rapids/pull/395)|Databricks doc updates|
-|[#401](https://github.com/NVIDIA/spark-rapids/pull/401)|Extended the FAQ|
-|[#398](https://github.com/NVIDIA/spark-rapids/pull/398)|Add tests for GpuPartition|
-|[#352](https://github.com/NVIDIA/spark-rapids/pull/352)|Change spark tgz package name|
-|[#397](https://github.com/NVIDIA/spark-rapids/pull/397)|Fix small bug in ShuffleBufferCatalog.hasActiveShuffle|
-|[#286](https://github.com/NVIDIA/spark-rapids/pull/286)|[REVIEW] Updated join tests for cache|
-|[#393](https://github.com/NVIDIA/spark-rapids/pull/393)|Contributor license agreement|
-|[#389](https://github.com/NVIDIA/spark-rapids/pull/389)|Added in support for RangeExec|
-|[#390](https://github.com/NVIDIA/spark-rapids/pull/390)|Ucx getting started|
-|[#391](https://github.com/NVIDIA/spark-rapids/pull/391)|Hide slack channel in Jenkins scripts|
-|[#387](https://github.com/NVIDIA/spark-rapids/pull/387)|Remove the term whitelist|
-|[#365](https://github.com/NVIDIA/spark-rapids/pull/365)|[REVIEW] Timesub tests|
-|[#383](https://github.com/NVIDIA/spark-rapids/pull/383)|Test utility to compare SQL query results between CPU and GPU|
-|[#380](https://github.com/NVIDIA/spark-rapids/pull/380)|Fix databricks notebook link|
-|[#378](https://github.com/NVIDIA/spark-rapids/pull/378)|Added in FAQ and fixed spelling|
-|[#377](https://github.com/NVIDIA/spark-rapids/pull/377)|Update heading in configs.md|
-|[#373](https://github.com/NVIDIA/spark-rapids/pull/373)|Modifying branch name to conform with rapidsai branch name change|
-|[#376](https://github.com/NVIDIA/spark-rapids/pull/376)|Add our session extension correctly if there are other extensions configured|
-|[#374](https://github.com/NVIDIA/spark-rapids/pull/374)|Fix rat issue for notebooks|
-|[#364](https://github.com/NVIDIA/spark-rapids/pull/364)|Update Databricks patch for changes to GpuSortMergeJoin|
-|[#371](https://github.com/NVIDIA/spark-rapids/pull/371)|fix typo and use regional bucket per GCP's update|
-|[#359](https://github.com/NVIDIA/spark-rapids/pull/359)|Karthik changes|
-|[#353](https://github.com/NVIDIA/spark-rapids/pull/353)|Fix broadcast nested loop join for the no column case|
-|[#313](https://github.com/NVIDIA/spark-rapids/pull/313)|Additional tests for broadcast hash join|
-|[#342](https://github.com/NVIDIA/spark-rapids/pull/342)|Implement build-side rules for shuffle hash join|
-|[#349](https://github.com/NVIDIA/spark-rapids/pull/349)|Updated join code to treat null equality properly|
-|[#335](https://github.com/NVIDIA/spark-rapids/pull/335)|Integration tests on spark 3.0.1-SNAPSHOT & 3.1.0-SNAPSHOT|
-|[#346](https://github.com/NVIDIA/spark-rapids/pull/346)|Update the Title Header for Fine Tuning|
-|[#344](https://github.com/NVIDIA/spark-rapids/pull/344)|Fix small typo in readme|
-|[#331](https://github.com/NVIDIA/spark-rapids/pull/331)|Adds iterator and client unit tests, and prepares for more fetch failure handling|
-|[#337](https://github.com/NVIDIA/spark-rapids/pull/337)|Fix Scala compile phase to allow Java classes referencing Scala classes|
-|[#332](https://github.com/NVIDIA/spark-rapids/pull/332)|Match GPU overwritten functions with SQL functions from FunctionRegistry|
-|[#339](https://github.com/NVIDIA/spark-rapids/pull/339)|Fix databricks build|
-|[#338](https://github.com/NVIDIA/spark-rapids/pull/338)|Move GpuPartitioning to a separate file|
-|[#310](https://github.com/NVIDIA/spark-rapids/pull/310)|Update release Jenkinsfile for Databricks|
-|[#330](https://github.com/NVIDIA/spark-rapids/pull/330)|Hide private info in Jenkins scripts|
-|[#324](https://github.com/NVIDIA/spark-rapids/pull/324)|Add in basic support for GpuCartesianProductExec|
-|[#328](https://github.com/NVIDIA/spark-rapids/pull/328)|Enable slack notification for Databricks build|
-|[#321](https://github.com/NVIDIA/spark-rapids/pull/321)|update databricks patch for GpuBroadcastNestedLoopJoinExec|
-|[#322](https://github.com/NVIDIA/spark-rapids/pull/322)|Add oss.sonatype.org to download the cudf jar|
-|[#320](https://github.com/NVIDIA/spark-rapids/pull/320)|Don't mount passwd/group to the container|
-|[#258](https://github.com/NVIDIA/spark-rapids/pull/258)|Enable running TPCH tests with AQE enabled|
-|[#318](https://github.com/NVIDIA/spark-rapids/pull/318)|Build docker image with Dockerfile|
-|[#309](https://github.com/NVIDIA/spark-rapids/pull/309)|Update databricks patch to latest changes|
-|[#312](https://github.com/NVIDIA/spark-rapids/pull/312)|Trigger branch-0.2 integration test|
-|[#307](https://github.com/NVIDIA/spark-rapids/pull/307)|[Jenkins] Update the release script and Jenkinsfile|
-|[#304](https://github.com/NVIDIA/spark-rapids/pull/304)|[DOC][Minor] Fix typo in spark config name.|
-|[#303](https://github.com/NVIDIA/spark-rapids/pull/303)|Update compatibility doc for -0.0 issues|
-|[#301](https://github.com/NVIDIA/spark-rapids/pull/301)|Add info about branches in README.md|
-|[#296](https://github.com/NVIDIA/spark-rapids/pull/296)|Added in basic support for broadcast nested loop join|
-|[#297](https://github.com/NVIDIA/spark-rapids/pull/297)|Databricks CI improvements and support runtime env parameter to xfail certain tests|
-|[#292](https://github.com/NVIDIA/spark-rapids/pull/292)|Move artifacts version in version-def.sh|
-|[#254](https://github.com/NVIDIA/spark-rapids/pull/254)|Cleanup QA tests|
-|[#289](https://github.com/NVIDIA/spark-rapids/pull/289)|Clean up GpuCollectLimitMeta and add in metrics|
-|[#287](https://github.com/NVIDIA/spark-rapids/pull/287)|Add in support for right join and fix issues build right|
-|[#273](https://github.com/NVIDIA/spark-rapids/pull/273)|Added releases to the README.md|
-|[#285](https://github.com/NVIDIA/spark-rapids/pull/285)|modify run_pyspark_from_build.sh to be bash 3 friendly|
-|[#281](https://github.com/NVIDIA/spark-rapids/pull/281)|Add in support for Full Outer Join on non-null keys|
-|[#274](https://github.com/NVIDIA/spark-rapids/pull/274)|Add RapidsDiskStore tests|
-|[#259](https://github.com/NVIDIA/spark-rapids/pull/259)|Add RapidsHostMemoryStore tests|
-|[#282](https://github.com/NVIDIA/spark-rapids/pull/282)|Update Databricks patch for 0.2 branch|
-|[#261](https://github.com/NVIDIA/spark-rapids/pull/261)|Add conditional xfail test for DISTINCT aggregates with NaN|
-|[#263](https://github.com/NVIDIA/spark-rapids/pull/263)|More time ops|
-|[#256](https://github.com/NVIDIA/spark-rapids/pull/256)|Remove special cases for contains, startsWith, and endsWith|
-|[#253](https://github.com/NVIDIA/spark-rapids/pull/253)|Remove GpuAttributeReference and GpuSortOrder|
-|[#271](https://github.com/NVIDIA/spark-rapids/pull/271)|Update the versions for 0.2.0 properly for the databricks build|
-|[#162](https://github.com/NVIDIA/spark-rapids/pull/162)|Integration tests for corner cases in window functions.|
-|[#264](https://github.com/NVIDIA/spark-rapids/pull/264)|Add a local mvn repo for nightly pipeline|
-|[#262](https://github.com/NVIDIA/spark-rapids/pull/262)|Refer to branch-0.2|
-|[#255](https://github.com/NVIDIA/spark-rapids/pull/255)|Revert change to make dependencies of shaded jar optional|
-|[#257](https://github.com/NVIDIA/spark-rapids/pull/257)|Fix link to RAPIDS cudf in index.md|
-|[#252](https://github.com/NVIDIA/spark-rapids/pull/252)|Update to 0.2.0-SNAPSHOT and cudf-0.15-SNAPSHOT|
-
-## Release 0.1
-
-### Features
-|||
-|:---|:---|
-|[#74](https://github.com/NVIDIA/spark-rapids/issues/74)|[FEA] Support ToUnixTimestamp|
-|[#21](https://github.com/NVIDIA/spark-rapids/issues/21)|[FEA] NormalizeNansAndZeros|
-|[#105](https://github.com/NVIDIA/spark-rapids/issues/105)|[FEA] integration tests for equi-joins|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#116](https://github.com/NVIDIA/spark-rapids/issues/116)|[BUG] calling replace with a NULL throws an exception|
-|[#168](https://github.com/NVIDIA/spark-rapids/issues/168)|[BUG] GpuUnitTests Date tests leak column vectors|
-|[#209](https://github.com/NVIDIA/spark-rapids/issues/209)|[BUG] Developers section in pom need to be updated|
-|[#204](https://github.com/NVIDIA/spark-rapids/issues/204)|[BUG] Code coverage docs are out of date|
-|[#154](https://github.com/NVIDIA/spark-rapids/issues/154)|[BUG] Incorrect output from partial-only averages with nulls|
-|[#61](https://github.com/NVIDIA/spark-rapids/issues/61)|[BUG] Cannot disable Parquet, ORC, CSV reading when using FileSourceScanExec|
-
-### PRs
-|||
-|:---|:---|
-|[#249](https://github.com/NVIDIA/spark-rapids/pull/249)|Compatability -> Compatibility|
-|[#247](https://github.com/NVIDIA/spark-rapids/pull/247)|Add index.md for default doc page, fix table formatting for configs|
-|[#241](https://github.com/NVIDIA/spark-rapids/pull/241)|Let default branch to master per the release rule|
-|[#177](https://github.com/NVIDIA/spark-rapids/pull/177)|Fixed leaks in unit test and use ColumnarBatch for testing|
-|[#243](https://github.com/NVIDIA/spark-rapids/pull/243)|Jenkins file for Databricks release|
-|[#225](https://github.com/NVIDIA/spark-rapids/pull/225)|Make internal project dependencies optional for shaded artifact|
-|[#242](https://github.com/NVIDIA/spark-rapids/pull/242)|Add site pages|
-|[#221](https://github.com/NVIDIA/spark-rapids/pull/221)|Databricks Build Support|
-|[#215](https://github.com/NVIDIA/spark-rapids/pull/215)|Remove CudfColumnVector|
-|[#213](https://github.com/NVIDIA/spark-rapids/pull/213)|Add RapidsDeviceMemoryStore tests|
-|[#214](https://github.com/NVIDIA/spark-rapids/pull/214)|[REVIEW] Test failure to pass Attribute as GpuAttribute|
-|[#211](https://github.com/NVIDIA/spark-rapids/pull/211)|Add project leads to pom developer list|
-|[#210](https://github.com/NVIDIA/spark-rapids/pull/210)|Updated coverage docs|
-|[#195](https://github.com/NVIDIA/spark-rapids/pull/195)|Support public release for plugin jar|
-|[#208](https://github.com/NVIDIA/spark-rapids/pull/208)|Remove unneeded comment from pom.xml|
-|[#191](https://github.com/NVIDIA/spark-rapids/pull/191)|WindowExec handle different spark distributions|
-|[#181](https://github.com/NVIDIA/spark-rapids/pull/181)|Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized|
-|[#196](https://github.com/NVIDIA/spark-rapids/pull/196)|Update Spark dependency to the released 3.0.0 artifacts|
-|[#206](https://github.com/NVIDIA/spark-rapids/pull/206)|Change groupID to 'com.nvidia' in IT scripts|
-|[#202](https://github.com/NVIDIA/spark-rapids/pull/202)|Fixed issue for contains when searching for an empty string|
-|[#201](https://github.com/NVIDIA/spark-rapids/pull/201)|Fix name of scan|
-|[#200](https://github.com/NVIDIA/spark-rapids/pull/200)|Fix issue with GpuAttributeReference not overriding references|
-|[#197](https://github.com/NVIDIA/spark-rapids/pull/197)|Fix metrics for writes|
-|[#186](https://github.com/NVIDIA/spark-rapids/pull/186)|Fixed issue with nullability on concat|
-|[#193](https://github.com/NVIDIA/spark-rapids/pull/193)|Add RapidsBufferCatalog tests|
-|[#188](https://github.com/NVIDIA/spark-rapids/pull/188)|rebrand to com.nvidia instead of ai.rapids|
-|[#189](https://github.com/NVIDIA/spark-rapids/pull/189)|Handle AggregateExpression having resultIds parameter instead of a single resultId|
-|[#190](https://github.com/NVIDIA/spark-rapids/pull/190)|FileSourceScanExec can have logicalRelation parameter on some distributions|
-|[#185](https://github.com/NVIDIA/spark-rapids/pull/185)|Update type of parameter of GpuExpandExec to make it consistent|
-|[#172](https://github.com/NVIDIA/spark-rapids/pull/172)|Merge qa test to integration test|
-|[#180](https://github.com/NVIDIA/spark-rapids/pull/180)|Add MetaUtils unit tests|
-|[#171](https://github.com/NVIDIA/spark-rapids/pull/171)|Cleanup scaladoc warnings about missing links|
-|[#176](https://github.com/NVIDIA/spark-rapids/pull/176)|Updated join tests to cover more data.|
-|[#169](https://github.com/NVIDIA/spark-rapids/pull/169)|Remove dependency on shaded Spark artifact|
-|[#174](https://github.com/NVIDIA/spark-rapids/pull/174)|Added in fallback tests|
-|[#165](https://github.com/NVIDIA/spark-rapids/pull/165)|Move input metadata tests to pyspark|
-|[#173](https://github.com/NVIDIA/spark-rapids/pull/173)|Fix setting local mode for tests|
-|[#160](https://github.com/NVIDIA/spark-rapids/pull/160)|Integration tests for normalizing NaN/zeroes.|
-|[#163](https://github.com/NVIDIA/spark-rapids/pull/163)|Ignore the order locally for repartition tests|
-|[#157](https://github.com/NVIDIA/spark-rapids/pull/157)|Add partial and final only hash aggregate tests and fix nulls corner case for Average|
-|[#159](https://github.com/NVIDIA/spark-rapids/pull/159)|Add integration tests for joins|
-|[#158](https://github.com/NVIDIA/spark-rapids/pull/158)|Orc merge schema fallback and FileScan format configs|
-|[#164](https://github.com/NVIDIA/spark-rapids/pull/164)|Fix compiler warnings|
-|[#152](https://github.com/NVIDIA/spark-rapids/pull/152)|Moved cudf to 0.14 for CI|
-|[#151](https://github.com/NVIDIA/spark-rapids/pull/151)|Switch CICD pipelines to Github|
+## Older Releases
+Changelog of older releases can be found at [docs/archives](/docs/archives)
diff --git a/NOTICE-binary b/NOTICE-binary
index 8066d8545c3..0c0021116d0 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -12,17 +12,6 @@ Copyright 2014 and onwards The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
----------------------------------------------------------------------
-
-Apache ORC
-Copyright 2013-2019 The Apache Software Foundation
-
-This product includes software developed by The Apache Software
-Foundation (http://www.apache.org/).
-
-This product includes software developed by Hewlett-Packard:
-(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P
-
---------------------------------------------------------------------
UCF Consortium - Unified Communication X (UCX)
diff --git a/aggregator/pom.xml b/aggregator/pom.xml
index 4ea80017800..b4663bafa2c 100644
--- a/aggregator/pom.xml
+++ b/aggregator/pom.xml
@@ -88,53 +88,8 @@
-                <relocation>
-                  <pattern>org.apache.orc.</pattern>
-                  <shadedPattern>${rapids.shade.package}.orc.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>org.apache.hadoop.hive.</pattern>
-                  <shadedPattern>${rapids.shade.package}.hadoop.hive.</shadedPattern>
-                  <excludes>
-                    <exclude>org.apache.hadoop.hive.conf.HiveConf</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.exec.FunctionRegistry</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.exec.UDF</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.exec.UDFMethodResolver</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.udf.UDFType</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF$DeferredObject</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils$ConversionHelper</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory$ObjectInspectorOptions</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.objectinspector.StructField</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.typeinfo.TypeInfo</exclude>
-                  </excludes>
-                </relocation>
-                <relocation>
-                  <pattern>org.apache.hive.</pattern>
-                  <shadedPattern>${rapids.shade.package}.hive.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>io.airlift.compress.</pattern>
-                  <shadedPattern>${rapids.shade.package}.io.airlift.compress.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>org.apache.commons.codec.</pattern>
-                  <shadedPattern>${rapids.shade.package}.org.apache.commons.codec.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>org.apache.commons.lang.</pattern>
-                  <shadedPattern>${rapids.shade.package}.org.apache.commons.lang.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>com.google</pattern>
-                  <shadedPattern>${rapids.shade.package}.com.google</shadedPattern>
+                  <pattern>com.google.flatbuffers</pattern>
+                  <shadedPattern>${rapids.shade.package}.com.google.flatbuffers</shadedPattern>
diff --git a/common/pom.xml b/common/pom.xml
new file mode 100644
index 00000000000..3f46ea8459f
--- /dev/null
+++ b/common/pom.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Copyright (c) 2022, NVIDIA CORPORATION.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>com.nvidia</groupId>
+        <artifactId>rapids-4-spark-parent</artifactId>
+        <version>22.04.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>rapids-4-spark-common_2.12</artifactId>
+    <name>RAPIDS Accelerator for Apache Spark Common</name>
+    <description>Utility code that is common across the RAPIDS Accelerator projects</description>
+    <version>22.04.0-SNAPSHOT</version>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-library</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.scalatest</groupId>
+            <artifactId>scalatest_${scala.binary.version}</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <resources>
+            <resource>
+                <directory>${project.build.directory}/extra-resources</directory>
+                <filtering>true</filtering>
+            </resource>
+            <resource>
+                <directory>${project.basedir}/..</directory>
+                <targetPath>META-INF</targetPath>
+                <includes>
+                    <include>LICENSE</include>
+                </includes>
+            </resource>
+        </resources>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <id>default-test-jar</id>
+                        <phase>none</phase>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <configuration>
+                    <skipTests>true</skipTests>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.rat</groupId>
+                <artifactId>apache-rat-plugin</artifactId>
+            </plugin>
+            <plugin>
+                <groupId>org.scalatest</groupId>
+                <artifactId>scalatest-maven-plugin</artifactId>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/common/src/main/scala/com/nvidia/spark/rapids/CheckUtils.scala b/common/src/main/scala/com/nvidia/spark/rapids/CheckUtils.scala
new file mode 100644
index 00000000000..65ab724cc50
--- /dev/null
+++ b/common/src/main/scala/com/nvidia/spark/rapids/CheckUtils.scala
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+object CheckUtils {
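+ /**
+  * Throws IllegalArgumentException with `msg` when `expression` is false,
+  * a Guava-free stand-in for Preconditions.checkArgument. For example (with
+  * `batchSize` being a hypothetical caller-side value):
+  * CheckUtils.checkArgument(batchSize > 0, s"batchSize must be positive: $batchSize")
+  */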
+ def checkArgument(expression: Boolean, msg: String): Unit = {
+ if (!expression) throw new IllegalArgumentException(msg)
+ }
+}
diff --git a/common/src/main/scala/com/nvidia/spark/rapids/ThreadFactoryBuilder.scala b/common/src/main/scala/com/nvidia/spark/rapids/ThreadFactoryBuilder.scala
new file mode 100644
index 00000000000..d61dd5a9c90
--- /dev/null
+++ b/common/src/main/scala/com/nvidia/spark/rapids/ThreadFactoryBuilder.scala
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import java.util.concurrent.{Executors, ThreadFactory}
+import java.util.concurrent.atomic.AtomicLong
+
+// This is similar to Guava's ThreadFactoryBuilder.
+// We avoid using Guava because it is a messy dependency in practice.
+class ThreadFactoryBuilder {
+ private var nameFormat = Option.empty[String]
+ private var daemon = Option.empty[Boolean]
+
+ def setNameFormat(nameFormat: String): ThreadFactoryBuilder = {
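+ // Eagerly apply the format to a dummy value so that an invalid format string
+ // fails here rather than later when threads are created.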
+ nameFormat.format(0)
+ this.nameFormat = Some(nameFormat)
+ this
+ }
+
+ def setDaemon(daemon: Boolean): ThreadFactoryBuilder = {
+ this.daemon = Some(daemon)
+ this
+ }
+
+ def build(): ThreadFactory = {
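+ // Only create the name counter when a name format was supplied; otherwise the
+ // default thread factory's names are left unchanged.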
+ val count = nameFormat.map(_ => new AtomicLong(0))
+ new ThreadFactory() {
+ private val defaultThreadFactory = Executors.defaultThreadFactory
+
+ override def newThread(r: Runnable): Thread = {
+ val thread = defaultThreadFactory.newThread(r)
+ nameFormat.foreach(f => thread.setName(f.format(count.get.getAndIncrement())))
+ daemon.foreach(b => thread.setDaemon(b))
+ thread
+ }
+ }
+ }
+}
diff --git a/common/src/test/scala/com/nvidia/spark/rapids/ThreadFactoryBuilderTest.scala b/common/src/test/scala/com/nvidia/spark/rapids/ThreadFactoryBuilderTest.scala
new file mode 100644
index 00000000000..d71915f51d0
--- /dev/null
+++ b/common/src/test/scala/com/nvidia/spark/rapids/ThreadFactoryBuilderTest.scala
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import java.util.concurrent.{Callable, Executors}
+
+import org.scalatest.FunSuite
+
+class ThreadFactoryBuilderTest extends FunSuite {
+
+ test("test thread factory builder") {
+ val pool1 = Executors.newFixedThreadPool(2,
+ new ThreadFactoryBuilder().setNameFormat("thread-pool1-1 %s").setDaemon(true).build())
+ try {
+ var ret = pool1.submit(new Callable[String] {
+ override def call(): String = {
+ assert(Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "thread-pool1-1 0")
+ ""
+ }
+ })
+ // Wait for and retrieve the result; if the asserts above failed, this throws an ExecutionException.
+ ret.get()
+ ret = pool1.submit(() => {
+ assert(Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "thread-pool1-1 1")
+ ""
+ })
+ ret.get()
+ } finally {
+ pool1.shutdown()
+ }
+
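+ // pool2 uses a %d name format and no explicit daemon flag: Executors.defaultThreadFactory
+ // creates non-daemon threads, so these should not be daemons.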
+ val pool2 = Executors.newFixedThreadPool(2,
+ new ThreadFactoryBuilder().setNameFormat("pool2-%d").build())
+ try {
+ var ret = pool2.submit(new Callable[String] {
+ override def call(): String = {
+ assert(!Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "pool2-0")
+ ""
+ }
+ })
+ ret.get()
+ ret = pool2.submit(() => {
+ assert(!Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "pool2-1")
+ ""
+ })
+ ret.get()
+ } finally {
+ pool2.shutdown()
+ }
+
+ val pool3 = Executors.newFixedThreadPool(2,
+ new ThreadFactoryBuilder().setNameFormat("pool3-%d").setDaemon(false).build())
+ try {
+ pool3.submit(new Callable[String] {
+ override def call(): String = {
+ assert(!Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "pool3-0")
+ ""
+ }
+ }).get()
+ } finally {
+ pool3.shutdown()
+ }
+ }
+}
diff --git a/docs/archives/CHANGELOG_0.1_to_0.5.md b/docs/archives/CHANGELOG_0.1_to_0.5.md
new file mode 100644
index 00000000000..fa5412f7d8a
--- /dev/null
+++ b/docs/archives/CHANGELOG_0.1_to_0.5.md
@@ -0,0 +1,1325 @@
+# Change log
+Generated on 2022-01-28
+
+## Release 0.5
+
+### Features
+|||
+|:---|:---|
+|[#938](https://github.com/NVIDIA/spark-rapids/issues/938)|[FEA] Have hashed shuffle match spark|
+|[#1604](https://github.com/NVIDIA/spark-rapids/issues/1604)|[FEA] Support casting structs to strings |
+|[#1920](https://github.com/NVIDIA/spark-rapids/issues/1920)|[FEA] Support murmur3 hashing of structs|
+|[#2018](https://github.com/NVIDIA/spark-rapids/issues/2018)|[FEA] A way for user to find out the plugin version and cudf version in REPL|
+|[#77](https://github.com/NVIDIA/spark-rapids/issues/77)|[FEA] Support ArrayContains|
+|[#1721](https://github.com/NVIDIA/spark-rapids/issues/1721)|[FEA] build cudf jars with NVTX enabled|
+|[#1782](https://github.com/NVIDIA/spark-rapids/issues/1782)|[FEA] Shim layers to support spark versions|
+|[#1625](https://github.com/NVIDIA/spark-rapids/issues/1625)|[FEA] Support Decimal Casts to String and String to Decimal|
+|[#166](https://github.com/NVIDIA/spark-rapids/issues/166)|[FEA] Support get_json_object|
+|[#1698](https://github.com/NVIDIA/spark-rapids/issues/1698)|[FEA] Support casting structs to string|
+|[#1912](https://github.com/NVIDIA/spark-rapids/issues/1912)|[FEA] Let `Scalar Pandas UDF ` support array of struct type.|
+|[#1136](https://github.com/NVIDIA/spark-rapids/issues/1136)|[FEA] Audit: Script to list commits between different Spark versions/tags|
+|[#1921](https://github.com/NVIDIA/spark-rapids/issues/1921)|[FEA] cudf version check should be lenient on later patch version|
+|[#19](https://github.com/NVIDIA/spark-rapids/issues/19)|[FEA] Out of core sorts|
+
+### Performance
+|||
+|:---|:---|
+|[#2090](https://github.com/NVIDIA/spark-rapids/issues/2090)|[FEA] Make row count estimates available to the cost-based optimizer|
+|[#1341](https://github.com/NVIDIA/spark-rapids/issues/1341)|Optimize unnecessary columnar->row->columnar transitions with AQE|
+|[#1558](https://github.com/NVIDIA/spark-rapids/issues/1558)|[FEA] Initialize UCX early|
+|[#1633](https://github.com/NVIDIA/spark-rapids/issues/1633)|[FEA] Implement a cost-based optimizer|
+|[#1727](https://github.com/NVIDIA/spark-rapids/issues/1727)|[FEA] Put RangePartitioner data path on the GPU|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#2279](https://github.com/NVIDIA/spark-rapids/issues/2279)|[BUG] Hash Partitioning can fail for very small batches|
+|[#2314](https://github.com/NVIDIA/spark-rapids/issues/2314)|[BUG] v0.5.0 pre-release pytests join_test.py::test_hash_join_array FAILED on SPARK-EGX Yarn Cluster|
+|[#2317](https://github.com/NVIDIA/spark-rapids/issues/2317)|[BUG] GpuColumnarToRowIterator can stop after receiving an empty batch|
+|[#2244](https://github.com/NVIDIA/spark-rapids/issues/2244)|[BUG] Executors hanging when running NDS benchmarks|
+|[#2278](https://github.com/NVIDIA/spark-rapids/issues/2278)|[BUG] FullOuter join can produce too many results|
+|[#2220](https://github.com/NVIDIA/spark-rapids/issues/2220)|[BUG] csv_test.py::test_csv_fallback FAILED on the EMR Cluster|
+|[#2225](https://github.com/NVIDIA/spark-rapids/issues/2225)|[BUG] GpuSort fails on tables containing arrays.|
+|[#2232](https://github.com/NVIDIA/spark-rapids/issues/2232)|[BUG] hash_aggregate_test.py::test_hash_grpby_pivot FAILED on the Databricks Cluster|
+|[#2231](https://github.com/NVIDIA/spark-rapids/issues/2231)|[BUG]string_test.py::test_re_replace FAILED on the Dataproc Cluster|
+|[#2042](https://github.com/NVIDIA/spark-rapids/issues/2042)|[BUG] NDS q14a fails with "GpuColumnarToRow does not implement doExecuteBroadcast"|
+|[#2203](https://github.com/NVIDIA/spark-rapids/issues/2203)|[BUG] Spark nightly cache tests fail with -- master flag|
+|[#2230](https://github.com/NVIDIA/spark-rapids/issues/2230)|[BUG] qa_nightly_select_test.py::test_select FAILED on the Dataproc Cluster|
+|[#1711](https://github.com/NVIDIA/spark-rapids/issues/1711)|[BUG] find a way to stop allocating from RMM on the shuffle-client thread|
+|[#2109](https://github.com/NVIDIA/spark-rapids/issues/2109)|[BUG] Fix high priority violations detected by code analysis tools|
+|[#2217](https://github.com/NVIDIA/spark-rapids/issues/2217)|[BUG] qa_nightly_select_test failure in test_select |
+|[#2127](https://github.com/NVIDIA/spark-rapids/issues/2127)|[BUG] Parsing with two-digit year should fall back to CPU|
+|[#2078](https://github.com/NVIDIA/spark-rapids/issues/2078)|[BUG] java.lang.ArithmeticException: divide by zero when spark.sql.ansi.enabled=true|
+|[#2048](https://github.com/NVIDIA/spark-rapids/issues/2048)|[BUG] split function+ repartition result in "ai.rapids.cudf.CudaException: device-side assert triggered"|
+|[#2036](https://github.com/NVIDIA/spark-rapids/issues/2036)|[BUG] Stackoverflow when writing wide parquet files.|
+|[#1973](https://github.com/NVIDIA/spark-rapids/issues/1973)|[BUG] generate_expr_test FAILED on Dataproc Cluster|
+|[#2079](https://github.com/NVIDIA/spark-rapids/issues/2079)|[BUG] koalas.sql fails with java.lang.ArrayIndexOutOfBoundsException|
+|[#217](https://github.com/NVIDIA/spark-rapids/issues/217)|[BUG] CudaUtil should be removed|
+|[#1550](https://github.com/NVIDIA/spark-rapids/issues/1550)|[BUG] The ORC output data of a query is not readable|
+|[#2074](https://github.com/NVIDIA/spark-rapids/issues/2074)|[BUG] Intermittent NPE in RapidsBufferCatalog when running test suite|
+|[#2027](https://github.com/NVIDIA/spark-rapids/issues/2027)|[BUG] udf_cudf_test.py integration tests fail |
+|[#1899](https://github.com/NVIDIA/spark-rapids/issues/1899)|[BUG] Some queries fail when cost-based optimizations are enabled|
+|[#1914](https://github.com/NVIDIA/spark-rapids/issues/1914)|[BUG] Add in float, double, timestamp, and date support to murmur3|
+|[#2014](https://github.com/NVIDIA/spark-rapids/issues/2014)|[BUG] earlyStart option added in 0.5 can cause errors when starting UCX|
+|[#1984](https://github.com/NVIDIA/spark-rapids/issues/1984)|[BUG] NDS q58 Decimal scale (59) cannot be greater than precision (38).|
+|[#2001](https://github.com/NVIDIA/spark-rapids/issues/2001)|[BUG] RapidsShuffleManager didn't pass `dirs` to `getBlockData` from a wrapped `ShuffleBlockResolver`|
+|[#1797](https://github.com/NVIDIA/spark-rapids/issues/1797)|[BUG] occasional crashes in CI|
+|[#1861](https://github.com/NVIDIA/spark-rapids/issues/1861)|Encountered column data outside the range of input buffer|
+|[#1905](https://github.com/NVIDIA/spark-rapids/issues/1905)|[BUG] Large concat task time in GpuShuffleCoalesce with pinned memory pool|
+|[#1638](https://github.com/NVIDIA/spark-rapids/issues/1638)|[BUG] Tests `test_window_aggs_for_rows_collect_list` fails when there are null values in columns.|
+|[#1864](https://github.com/NVIDIA/spark-rapids/issues/1864)|[BUG]HostColumnarToGPU inefficient when only doing count()|
+|[#1862](https://github.com/NVIDIA/spark-rapids/issues/1862)|[BUG] spark 3.2.0-snapshot integration test failed due to conf change|
+|[#1844](https://github.com/NVIDIA/spark-rapids/issues/1844)|[BUG] branch-0.5 nightly IT FAILED on the The mortgage ETL test "Could not read footer for file: file:/xxx/xxx.snappy.parquet"|
+|[#1627](https://github.com/NVIDIA/spark-rapids/issues/1627)|[BUG] GDS exception when restoring spilled buffer|
+|[#1802](https://github.com/NVIDIA/spark-rapids/issues/1802)|[BUG] Many decimal integration test failures for 0.5|
+
+### PRs
+|||
+|:---|:---|
+|[#2326](https://github.com/NVIDIA/spark-rapids/pull/2326)|Update changelog for 0.5.0 release|
+|[#2316](https://github.com/NVIDIA/spark-rapids/pull/2316)|Update doc to note that single quoted json strings are not ok|
+|[#2319](https://github.com/NVIDIA/spark-rapids/pull/2319)|Disable hash partitioning on arrays|
+|[#2318](https://github.com/NVIDIA/spark-rapids/pull/2318)|Fix ColumnarToRowIterator handling of empty batches|
+|[#2304](https://github.com/NVIDIA/spark-rapids/pull/2304)|Update CHANGELOG.md|
+|[#2301](https://github.com/NVIDIA/spark-rapids/pull/2301)|Update doc to reflect nanosleep problem with 460.32.03|
+|[#2298](https://github.com/NVIDIA/spark-rapids/pull/2298)|Update changelog for v0.5.0 release [skip ci]|
+|[#2293](https://github.com/NVIDIA/spark-rapids/pull/2293)|update cudf version to 0.19.2|
+|[#2289](https://github.com/NVIDIA/spark-rapids/pull/2289)|Update docs to warn against 450.80.02 driver with 10.x toolkit|
+|[#2285](https://github.com/NVIDIA/spark-rapids/pull/2285)|Require single batch for full outer join streaming|
+|[#2281](https://github.com/NVIDIA/spark-rapids/pull/2281)|Remove download section for unreleased 0.4.2|
+|[#2264](https://github.com/NVIDIA/spark-rapids/pull/2264)|Add spark312 and spark320 versions of cache serializer|
+|[#2254](https://github.com/NVIDIA/spark-rapids/pull/2254)|updated gcp docs with custom dataproc image instructions|
+|[#2247](https://github.com/NVIDIA/spark-rapids/pull/2247)|Allow specifying a superclass for non-GPU execs|
+|[#2235](https://github.com/NVIDIA/spark-rapids/pull/2235)|Fix distributed cache to read requested schema |
+|[#2261](https://github.com/NVIDIA/spark-rapids/pull/2261)|Make CBO row count test more robust|
+|[#2237](https://github.com/NVIDIA/spark-rapids/pull/2237)|update cudf version to 0.19.1|
+|[#2240](https://github.com/NVIDIA/spark-rapids/pull/2240)|Get the correct 'PIPESTATUS' in bash [skip ci]|
+|[#2242](https://github.com/NVIDIA/spark-rapids/pull/2242)|Add shuffle doc section on the periodicGC configuration|
+|[#2251](https://github.com/NVIDIA/spark-rapids/pull/2251)|Fix issue when out of core sorting nested data types|
+|[#2204](https://github.com/NVIDIA/spark-rapids/pull/2204)|Run nightly tests for ParquetCachedBatchSerializer|
+|[#2245](https://github.com/NVIDIA/spark-rapids/pull/2245)|Fix pivot bug for decimalType|
+|[#2093](https://github.com/NVIDIA/spark-rapids/pull/2093)|Initial implementation of row count estimates in cost-based optimizer|
+|[#2188](https://github.com/NVIDIA/spark-rapids/pull/2188)|Support GPU broadcast exchange reuse to feed CPU BHJ when AQE is enabled|
+|[#2227](https://github.com/NVIDIA/spark-rapids/pull/2227)|ParquetCachedBatchSerializer broadcast AllConfs instead of SQLConf to fix distributed mode|
+|[#2223](https://github.com/NVIDIA/spark-rapids/pull/2223)|Adds subquery aggregate tests from SPARK-31620|
+|[#2222](https://github.com/NVIDIA/spark-rapids/pull/2222)|Remove groupId already specified in parent pom|
+|[#2209](https://github.com/NVIDIA/spark-rapids/pull/2209)|Fixed a few issues with out of core sort|
+|[#2218](https://github.com/NVIDIA/spark-rapids/pull/2218)|Fix incorrect RegExpReplace children handling on Spark 3.1+|
+|[#2207](https://github.com/NVIDIA/spark-rapids/pull/2207)|fix batch size default values in the tuning guide|
+|[#2208](https://github.com/NVIDIA/spark-rapids/pull/2208)|Revert "add nightly cache tests (#2083)"|
+|[#2206](https://github.com/NVIDIA/spark-rapids/pull/2206)|Fix shim301db build|
+|[#2192](https://github.com/NVIDIA/spark-rapids/pull/2192)|Fix index-based access to the head elements|
+|[#2210](https://github.com/NVIDIA/spark-rapids/pull/2210)|Avoid redundant collection conversions|
+|[#2190](https://github.com/NVIDIA/spark-rapids/pull/2190)|JNI fixes for StringWordCount native UDF example|
+|[#2086](https://github.com/NVIDIA/spark-rapids/pull/2086)|Updating documentation for data format support|
+|[#2172](https://github.com/NVIDIA/spark-rapids/pull/2172)|Remove easy unused symbols|
+|[#2089](https://github.com/NVIDIA/spark-rapids/pull/2089)|Update PandasUDF doc|
+|[#2195](https://github.com/NVIDIA/spark-rapids/pull/2195)|fix cudf 0.19.0 download link [skip ci]|
+|[#2175](https://github.com/NVIDIA/spark-rapids/pull/2175)|Branch 0.5 doc update|
+|[#2168](https://github.com/NVIDIA/spark-rapids/pull/2168)|Simplify GpuExpressions w/ withResourceIfAllowed|
+|[#2055](https://github.com/NVIDIA/spark-rapids/pull/2055)|Support PivotFirst|
+|[#2183](https://github.com/NVIDIA/spark-rapids/pull/2183)|GpuParquetScan#readBufferToTable remove dead code|
+|[#2129](https://github.com/NVIDIA/spark-rapids/pull/2129)|Fall back to CPU when parsing two-digit years|
+|[#2083](https://github.com/NVIDIA/spark-rapids/pull/2083)|add nightly cache tests|
+|[#2151](https://github.com/NVIDIA/spark-rapids/pull/2151)|add corresponding close call for HostMemoryOutputStream|
+|[#2169](https://github.com/NVIDIA/spark-rapids/pull/2169)|Work around bug in Spark for integration test|
+|[#2130](https://github.com/NVIDIA/spark-rapids/pull/2130)|Fix divide-by-zero in GpuAverage with ansi mode|
+|[#2149](https://github.com/NVIDIA/spark-rapids/pull/2149)|Auto generate the supported types for the file formats|
+|[#2072](https://github.com/NVIDIA/spark-rapids/pull/2072)|Disable CSV parsing by default and update tests to better show what is left|
+|[#2157](https://github.com/NVIDIA/spark-rapids/pull/2157)|fix merge conflict for 0.4.2 [skip ci]|
+|[#2144](https://github.com/NVIDIA/spark-rapids/pull/2144)|Allow array and struct types to pass thru when doing join|
+|[#2145](https://github.com/NVIDIA/spark-rapids/pull/2145)|Avoid GPU shuffle for round-robin of unsortable types|
+|[#2021](https://github.com/NVIDIA/spark-rapids/pull/2021)|Add in support for murmur3 hashing of structs|
+|[#2128](https://github.com/NVIDIA/spark-rapids/pull/2128)|Add in Partition type check support|
+|[#2116](https://github.com/NVIDIA/spark-rapids/pull/2116)|Add dynamic Spark configuration for Databricks|
+|[#2132](https://github.com/NVIDIA/spark-rapids/pull/2132)|Log plugin and cudf versions on startup|
+|[#2135](https://github.com/NVIDIA/spark-rapids/pull/2135)|Disable Spark 3.2 shim by default|
+|[#2125](https://github.com/NVIDIA/spark-rapids/pull/2125)|enable auto-merge from 0.5 to 0.6 [skip ci]|
+|[#2120](https://github.com/NVIDIA/spark-rapids/pull/2120)|Materialize Stream before serialization|
+|[#2119](https://github.com/NVIDIA/spark-rapids/pull/2119)|Add more comprehensive documentation on supported date formats|
+|[#1717](https://github.com/NVIDIA/spark-rapids/pull/1717)|Decimal32 support|
+|[#2114](https://github.com/NVIDIA/spark-rapids/pull/2114)|Modified the Download page for 0.4.1 and updated doc to point to K8s guide|
+|[#2106](https://github.com/NVIDIA/spark-rapids/pull/2106)|Fix some buffer leaks|
+|[#2097](https://github.com/NVIDIA/spark-rapids/pull/2097)|fix the bound row project empty issue in row frame|
+|[#2099](https://github.com/NVIDIA/spark-rapids/pull/2099)|Remove verbose log prints to make the build/test log clean|
+|[#2105](https://github.com/NVIDIA/spark-rapids/pull/2105)|Cleanup prior Spark sessions in tests consistently|
+|[#2104](https://github.com/NVIDIA/spark-rapids/pull/2104)| Clone apache spark source code to parse the git commit IDs|
+|[#2095](https://github.com/NVIDIA/spark-rapids/pull/2095)|fix refcount when materializing device buffer from GDS|
+|[#2100](https://github.com/NVIDIA/spark-rapids/pull/2100)|[BUG] add wget for fetching conda [skip ci]|
+|[#2096](https://github.com/NVIDIA/spark-rapids/pull/2096)|Adjust images for integration tests|
+|[#2094](https://github.com/NVIDIA/spark-rapids/pull/2094)|Changed name of parquet files for Mortgage ETL Integration test|
+|[#2035](https://github.com/NVIDIA/spark-rapids/pull/2035)|Accelerate data transfer for map Pandas UDF plan|
+|[#2050](https://github.com/NVIDIA/spark-rapids/pull/2050)|stream shuffle buffers from GDS to UCX|
+|[#2084](https://github.com/NVIDIA/spark-rapids/pull/2084)|Enable ORC write by default|
+|[#2088](https://github.com/NVIDIA/spark-rapids/pull/2088)|Upgrade ScalaTest plugin to respect JAVA_HOME|
+|[#1932](https://github.com/NVIDIA/spark-rapids/pull/1932)|Create a getting started on K8s page|
+|[#2080](https://github.com/NVIDIA/spark-rapids/pull/2080)|Improve error message after failed RMM shutdown|
+|[#2064](https://github.com/NVIDIA/spark-rapids/pull/2064)|Optimize unnecessary columnar->row->columnar transitions with AQE|
+|[#2025](https://github.com/NVIDIA/spark-rapids/pull/2025)|Update the doc for pandas udf on databricks|
+|[#2059](https://github.com/NVIDIA/spark-rapids/pull/2059)|Add the flag 'TEST_TYPE' to avoid integration tests silently skipping some test cases|
+|[#2075](https://github.com/NVIDIA/spark-rapids/pull/2075)|Remove debug println from CBO test|
+|[#2046](https://github.com/NVIDIA/spark-rapids/pull/2046)|support casting Decimal to String|
+|[#1812](https://github.com/NVIDIA/spark-rapids/pull/1812)|allow spilled buffers to be unspilled|
+|[#2061](https://github.com/NVIDIA/spark-rapids/pull/2061)|Run the pandas udf using cudf on Databricks|
+|[#1893](https://github.com/NVIDIA/spark-rapids/pull/1893)|Plug-in support for get_json_object|
+|[#2044](https://github.com/NVIDIA/spark-rapids/pull/2044)|Use partition for GPU hash partitioning|
+|[#1954](https://github.com/NVIDIA/spark-rapids/pull/1954)|Fix CBO bug where incompatible plans were produced with AQE on|
+|[#2049](https://github.com/NVIDIA/spark-rapids/pull/2049)|Remove incompatable int overflow checking|
+|[#2056](https://github.com/NVIDIA/spark-rapids/pull/2056)|Remove Spark 3.2 from premerge and nightly CI run|
+|[#1814](https://github.com/NVIDIA/spark-rapids/pull/1814)|Struct to string casting functionality|
+|[#2037](https://github.com/NVIDIA/spark-rapids/pull/2037)|Fix warnings from use of deprecated cudf methods|
+|[#2033](https://github.com/NVIDIA/spark-rapids/pull/2033)|Bump up pre-merge OS from ubuntu 16 to ubuntu 18 [skip ci]|
+|[#1883](https://github.com/NVIDIA/spark-rapids/pull/1883)|Enable sort for single-level nesting struct columns on GPU|
+|[#2016](https://github.com/NVIDIA/spark-rapids/pull/2016)|Refactor logic for parallel testing|
+|[#2022](https://github.com/NVIDIA/spark-rapids/pull/2022)|Update order by to not load native libraries when sorting|
+|[#2017](https://github.com/NVIDIA/spark-rapids/pull/2017)|Add in murmur3 support for float, double, date and timestamp|
+|[#1981](https://github.com/NVIDIA/spark-rapids/pull/1981)|Fix GpuSize|
+|[#1999](https://github.com/NVIDIA/spark-rapids/pull/1999)|support casting string to decimal|
+|[#2006](https://github.com/NVIDIA/spark-rapids/pull/2006)|Enable windowed `collect_list` by default|
+|[#2000](https://github.com/NVIDIA/spark-rapids/pull/2000)|Use Spark's HybridRowQueue to avoid MemoryConsumer API shim|
+|[#2015](https://github.com/NVIDIA/spark-rapids/pull/2015)|Fix bug where rkey buffer is getting advanced after the first handshake|
+|[#2007](https://github.com/NVIDIA/spark-rapids/pull/2007)|Fix unknown column name error when filtering ORC file with no names|
+|[#2005](https://github.com/NVIDIA/spark-rapids/pull/2005)|Update to new is_before_spark_311 function name|
+|[#1944](https://github.com/NVIDIA/spark-rapids/pull/1944)|Support running scalar pandas UDF with array type.|
+|[#1991](https://github.com/NVIDIA/spark-rapids/pull/1991)|Fixes creation of invalid DecimalType in GpuDivide.tagExprForGpu|
+|[#1958](https://github.com/NVIDIA/spark-rapids/pull/1958)|Support legacy behavior of parameterless count |
+|[#1919](https://github.com/NVIDIA/spark-rapids/pull/1919)|Add support for Structs for UnionExec|
+|[#2002](https://github.com/NVIDIA/spark-rapids/pull/2002)|Pass dirs to getBlockData for a wrapped shuffle resolver|
+|[#1983](https://github.com/NVIDIA/spark-rapids/pull/1983)|document building against different CUDA Toolkit versions|
+|[#1994](https://github.com/NVIDIA/spark-rapids/pull/1994)|Merge 0.4 to 0.5 [skip ci]|
+|[#1982](https://github.com/NVIDIA/spark-rapids/pull/1982)|Update ORC pushdown filter building to latest Spark logic|
+|[#1978](https://github.com/NVIDIA/spark-rapids/pull/1978)|Add audit script to list commits from Spark|
+|[#1976](https://github.com/NVIDIA/spark-rapids/pull/1976)|Temp fix for parquet write changes|
+|[#1970](https://github.com/NVIDIA/spark-rapids/pull/1970)|add maven profiles for supported CUDA versions|
+|[#1951](https://github.com/NVIDIA/spark-rapids/pull/1951)|Branch 0.5 doc remove numpartitions|
+|[#1967](https://github.com/NVIDIA/spark-rapids/pull/1967)|Update FAQ for Dataset API and format supported versions|
+|[#1972](https://github.com/NVIDIA/spark-rapids/pull/1972)|support GpuSize|
+|[#1966](https://github.com/NVIDIA/spark-rapids/pull/1966)|add xml report for codecov|
+|[#1955](https://github.com/NVIDIA/spark-rapids/pull/1955)|Fix typo in Arrow optimization config|
+|[#1956](https://github.com/NVIDIA/spark-rapids/pull/1956)|Fix NPE in plugin shutdown|
+|[#1930](https://github.com/NVIDIA/spark-rapids/pull/1930)|Relax cudf version check for patch-level versions|
+|[#1787](https://github.com/NVIDIA/spark-rapids/pull/1787)|support distributed file path in cloud environment|
+|[#1961](https://github.com/NVIDIA/spark-rapids/pull/1961)|change premege GPU_TYPE from secret to global env [skip ci]|
+|[#1957](https://github.com/NVIDIA/spark-rapids/pull/1957)|Update Spark 3.1.2 shim for float upcast behavior|
+|[#1889](https://github.com/NVIDIA/spark-rapids/pull/1889)|Decimal DIV changes |
+|[#1947](https://github.com/NVIDIA/spark-rapids/pull/1947)|Move doc of Pandas UDF to additional-functionality|
+|[#1938](https://github.com/NVIDIA/spark-rapids/pull/1938)|Add spark.executor.resource.gpu.amount=1 to YARN and K8s docs|
+|[#1937](https://github.com/NVIDIA/spark-rapids/pull/1937)|Fix merge conflict with branch-0.4|
+|[#1878](https://github.com/NVIDIA/spark-rapids/pull/1878)|spillable cache for GpuCartesianRDD|
+|[#1843](https://github.com/NVIDIA/spark-rapids/pull/1843)|Refactor GpuGenerateExec and Explode|
+|[#1933](https://github.com/NVIDIA/spark-rapids/pull/1933)|Split DB scripts to make them common for the build and IT pipeline|
+|[#1935](https://github.com/NVIDIA/spark-rapids/pull/1935)|Update Alias SQL quoting and float-to-timestamp casting to match Spark 3.2|
+|[#1926](https://github.com/NVIDIA/spark-rapids/pull/1926)|Consolidate RAT settings in parent pom|
+|[#1918](https://github.com/NVIDIA/spark-rapids/pull/1918)|Minor code cleanup in dateTImeExpressions|
+|[#1906](https://github.com/NVIDIA/spark-rapids/pull/1906)|Remove get call on timeZoneId|
+|[#1908](https://github.com/NVIDIA/spark-rapids/pull/1908)|Remove the Scala version of Mortgage ETL tests from nightly test|
+|[#1894](https://github.com/NVIDIA/spark-rapids/pull/1894)|Modified Download Page to re-order the items and change the format of download links|
+|[#1909](https://github.com/NVIDIA/spark-rapids/pull/1909)|Avoid pinned memory for shuffle host buffers|
+|[#1891](https://github.com/NVIDIA/spark-rapids/pull/1891)|Connect UCX endpoints early during app startup|
+|[#1877](https://github.com/NVIDIA/spark-rapids/pull/1877)|remove docker build in pre-merge [skip ci]|
+|[#1830](https://github.com/NVIDIA/spark-rapids/pull/1830)|Enable the tests for collect over window.|
+|[#1882](https://github.com/NVIDIA/spark-rapids/pull/1882)|GpuArrowColumnarBatchBuilder retains the references of ArrowBuf until HostToGpuCoalesceIterator put them into device|
+|[#1868](https://github.com/NVIDIA/spark-rapids/pull/1868)|Increase row limit when doing count() for HostColumnarToGpu |
+|[#1855](https://github.com/NVIDIA/spark-rapids/pull/1855)|Expose row count statistics in GpuShuffleExchangeExec|
+|[#1875](https://github.com/NVIDIA/spark-rapids/pull/1875)|Fix merge conflict with branch-0.4|
+|[#1841](https://github.com/NVIDIA/spark-rapids/pull/1841)|Add in support for DateAddInterval|
+|[#1869](https://github.com/NVIDIA/spark-rapids/pull/1869)|Fix tests for Spark 3.2.0 shim|
+|[#1858](https://github.com/NVIDIA/spark-rapids/pull/1858)|fix shuffle manager doc on ucx library path|
+|[#1836](https://github.com/NVIDIA/spark-rapids/pull/1836)|Add shim for Spark 3.1.2|
+|[#1852](https://github.com/NVIDIA/spark-rapids/pull/1852)|Fix Part Suite Tests|
+|[#1616](https://github.com/NVIDIA/spark-rapids/pull/1616)|Cost-based optimizer|
+|[#1834](https://github.com/NVIDIA/spark-rapids/pull/1834)|Add shim for Spark 3.0.3|
+|[#1839](https://github.com/NVIDIA/spark-rapids/pull/1839)|Refactor join code to reduce duplicated code|
+|[#1848](https://github.com/NVIDIA/spark-rapids/pull/1848)|Fix merge conflict with branch-0.4|
+|[#1796](https://github.com/NVIDIA/spark-rapids/pull/1796)|Have most of range partitioning run on the GPU|
+|[#1845](https://github.com/NVIDIA/spark-rapids/pull/1845)|Fix fails on the mortgage ETL test|
+|[#1829](https://github.com/NVIDIA/spark-rapids/pull/1829)|Cleanup unused Jenkins files and scripts|
+|[#1704](https://github.com/NVIDIA/spark-rapids/pull/1704)|Create a shim for Spark 3.2.0 development|
+|[#1838](https://github.com/NVIDIA/spark-rapids/pull/1838)|Make databricks build.sh more convenient for dev|
+|[#1835](https://github.com/NVIDIA/spark-rapids/pull/1835)|Fix merge conflict with branch-0.4|
+|[#1808](https://github.com/NVIDIA/spark-rapids/pull/1808)|Update mortgage tests to support reading multiple dataset formats|
+|[#1822](https://github.com/NVIDIA/spark-rapids/pull/1822)|Fix conflict 0.4 to 0.5|
+|[#1807](https://github.com/NVIDIA/spark-rapids/pull/1807)|Fix merge conflict between branch-0.4 and branch-0.5|
+|[#1788](https://github.com/NVIDIA/spark-rapids/pull/1788)|Spill metrics everywhere|
+|[#1719](https://github.com/NVIDIA/spark-rapids/pull/1719)|Add in out of core sort|
+|[#1728](https://github.com/NVIDIA/spark-rapids/pull/1728)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
+|[#1689](https://github.com/NVIDIA/spark-rapids/pull/1689)|Update docs for plugin 0.5.0-SNAPSHOT and cudf 0.19-SNAPSHOT|
+|[#1682](https://github.com/NVIDIA/spark-rapids/pull/1682)|init CI/CD dependencies branch-0.5|
+
+## Release 0.4.1
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#1985](https://github.com/NVIDIA/spark-rapids/issues/1985)|[BUG] broadcast exchange can fail on 0.4|
+
+### PRs
+|||
+|:---|:---|
+|[#1995](https://github.com/NVIDIA/spark-rapids/pull/1995)|update changelog 0.4.1 [skip ci]|
+|[#1990](https://github.com/NVIDIA/spark-rapids/pull/1990)|Prepare for v0.4.1 release|
+|[#1988](https://github.com/NVIDIA/spark-rapids/pull/1988)|broadcast exchange can fail when job group set|
+
+## Release 0.4
+
+### Features
+|||
+|:---|:---|
+|[#1773](https://github.com/NVIDIA/spark-rapids/issues/1773)|[FEA] Spark 3.0.2 release support|
+|[#80](https://github.com/NVIDIA/spark-rapids/issues/80)|[FEA] Support the struct SQL function|
+|[#76](https://github.com/NVIDIA/spark-rapids/issues/76)|[FEA] Support CreateArray|
+|[#1635](https://github.com/NVIDIA/spark-rapids/issues/1635)|[FEA] RAPIDS accelerated Java UDF|
+|[#1333](https://github.com/NVIDIA/spark-rapids/issues/1333)|[FEA] Support window operations on Decimal|
+|[#1419](https://github.com/NVIDIA/spark-rapids/issues/1419)|[FEA] Support GPU accelerated UDF alternative for higher order function "aggregate" over window|
+|[#1580](https://github.com/NVIDIA/spark-rapids/issues/1580)|[FEA] Support Decimal for ParquetCachedBatchSerializer|
+|[#1600](https://github.com/NVIDIA/spark-rapids/issues/1600)|[FEA] Support ScalarSubquery|
+|[#1072](https://github.com/NVIDIA/spark-rapids/issues/1072)|[FEA] Support for a custom DataSource V2 which supplies Arrow data|
+|[#906](https://github.com/NVIDIA/spark-rapids/issues/906)|[FEA] Clarify query explanation to directly state what will run on GPU|
+|[#1335](https://github.com/NVIDIA/spark-rapids/issues/1335)|[FEA] Support CollectLimitExec for decimal|
+|[#1485](https://github.com/NVIDIA/spark-rapids/issues/1485)|[FEA] Decimal Support for Parquet Write|
+|[#1329](https://github.com/NVIDIA/spark-rapids/issues/1329)|[FEA] Decimal support for multiply int div, add, subtract and null safe equals|
+|[#1351](https://github.com/NVIDIA/spark-rapids/issues/1351)|[FEA] Execute UDFs that provide a RAPIDS execution path|
+|[#1330](https://github.com/NVIDIA/spark-rapids/issues/1330)|[FEA] Support Decimal Casts|
+|[#1353](https://github.com/NVIDIA/spark-rapids/issues/1353)|[FEA] Example of RAPIDS UDF using custom GPU code|
+|[#1487](https://github.com/NVIDIA/spark-rapids/issues/1487)|[FEA] Change spark 3.1.0 to 3.1.1|
+|[#1334](https://github.com/NVIDIA/spark-rapids/issues/1334)|[FEA] Add support for count aggregate on decimal|
+|[#1325](https://github.com/NVIDIA/spark-rapids/issues/1325)|[FEA] Add in join support for decimal|
+|[#1326](https://github.com/NVIDIA/spark-rapids/issues/1326)|[FEA] Add in Broadcast support for decimal values|
+|[#37](https://github.com/NVIDIA/spark-rapids/issues/37)|[FEA] round and bround SQL functions|
+|[#78](https://github.com/NVIDIA/spark-rapids/issues/78)|[FEA] Support CreateNamedStruct function|
+|[#1331](https://github.com/NVIDIA/spark-rapids/issues/1331)|[FEA] UnionExec and ExpandExec support for decimal|
+|[#1332](https://github.com/NVIDIA/spark-rapids/issues/1332)|[FEA] Support CaseWhen, Coalesce and IfElse for decimal|
+|[#937](https://github.com/NVIDIA/spark-rapids/issues/937)|[FEA] have murmur3 hash function that matches exactly with spark|
+|[#1324](https://github.com/NVIDIA/spark-rapids/issues/1324)|[FEA] Support Parquet Read of Decimal FIXED_LENGTH_BYTE_ARRAY|
+|[#1428](https://github.com/NVIDIA/spark-rapids/issues/1428)|[FEA] Add support for unary decimal operations abs, floor, ceil, unary - and unary +|
+|[#1375](https://github.com/NVIDIA/spark-rapids/issues/1375)|[FEA] Add log statement for what the concurrentGpuTasks tasks is set to on executor startup|
+|[#1352](https://github.com/NVIDIA/spark-rapids/issues/1352)|[FEA] Example of RAPIDS UDF using cudf Java APIs|
+|[#1328](https://github.com/NVIDIA/spark-rapids/issues/1328)|[FEA] Support sorting and shuffle of decimal|
+|[#1316](https://github.com/NVIDIA/spark-rapids/issues/1316)|[FEA] Support simple DECIMAL aggregates|
+
+### Performance
+|||
+|:---|:---|
+|[#1435](https://github.com/NVIDIA/spark-rapids/issues/1435)|[FEA]Improve the file reading by using local file caching|
+|[#1738](https://github.com/NVIDIA/spark-rapids/issues/1738)|[FEA] Reduce regex usage in CAST string to date/timestamp|
+|[#987](https://github.com/NVIDIA/spark-rapids/issues/987)|[FEA] Optimize CAST from string to temporal types by using cuDF is_timestamp function|
+|[#1594](https://github.com/NVIDIA/spark-rapids/issues/1594)|[FEA] RAPIDS accelerated ScalaUDF|
+|[#103](https://github.com/NVIDIA/spark-rapids/issues/103)|[FEA] GPU version of TakeOrderedAndProject|
+|[#1024](https://github.com/NVIDIA/spark-rapids/issues/1024)|Cleanup RAPIDS transport calls to `receive`|
+|[#1366](https://github.com/NVIDIA/spark-rapids/issues/1366)|Seeing performance differences of multi-threaded/coalesce/perfile Parquet reader type for a single file|
+|[#1200](https://github.com/NVIDIA/spark-rapids/issues/1200)|[FEA] Accelerate the scan speed for coalescing parquet reader when reading files from multiple partitioned folders|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#1885](https://github.com/NVIDIA/spark-rapids/issues/1885)|[BUG] natural join on string key results in a data frame with spurious NULLs|
+|[#1785](https://github.com/NVIDIA/spark-rapids/issues/1785)|[BUG] Rapids pytest integration tests FAILED on Yarn cluster with unrecognized arguments: `--std_input_path=src/test/resources/`|
+|[#999](https://github.com/NVIDIA/spark-rapids/issues/999)|[BUG] test_multi_types_window_aggs_for_rows_lead_lag fails against Spark 3.1.0|
+|[#1818](https://github.com/NVIDIA/spark-rapids/issues/1818)|[BUG] unmoored doc comment warnings in GpuCast|
+|[#1817](https://github.com/NVIDIA/spark-rapids/issues/1817)|[BUG] Developer build with local modifications fails during verify phase|
+|[#1644](https://github.com/NVIDIA/spark-rapids/issues/1644)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks|
+|[#1771](https://github.com/NVIDIA/spark-rapids/issues/1771)|[BUG] Databricks AWS CI/CD failing to create cluster|
+|[#1157](https://github.com/NVIDIA/spark-rapids/issues/1157)|[BUG] Fix regression supporting to_date on GPU with Spark 3.1.0|
+|[#716](https://github.com/NVIDIA/spark-rapids/issues/716)|[BUG] Cast String to TimeStamp issues|
+|[#1117](https://github.com/NVIDIA/spark-rapids/issues/1117)|[BUG] CAST string to date returns wrong values for dates with out-of-range values|
+|[#1670](https://github.com/NVIDIA/spark-rapids/issues/1670)|[BUG] Some TPC-DS queries fail with AQE when decimal types enabled|
+|[#1730](https://github.com/NVIDIA/spark-rapids/issues/1730)|[BUG] Range Partitioning can crash when processing is in the order-by|
+|[#1726](https://github.com/NVIDIA/spark-rapids/issues/1726)|[BUG] java url decode test failing on databricks, emr, and dataproc|
+|[#1651](https://github.com/NVIDIA/spark-rapids/issues/1651)|[BUG] GDS exception when writing shuffle file|
+|[#1702](https://github.com/NVIDIA/spark-rapids/issues/1702)|[BUG] check all tests marked xfail for Spark 3.1.1|
+|[#575](https://github.com/NVIDIA/spark-rapids/issues/575)|[BUG] Spark 3.1 FAILED join_test.py::test_broadcast_join_mixed[FullOuter][IGNORE_ORDER] failed|
+|[#577](https://github.com/NVIDIA/spark-rapids/issues/577)|[BUG] Spark 3.1 log arithmetic functions fail|
+|[#1541](https://github.com/NVIDIA/spark-rapids/issues/1541)|[BUG] Tests fail in integration in distributed mode after allowing nested types through in sort and shuffle|
+|[#1626](https://github.com/NVIDIA/spark-rapids/issues/1626)|[BUG] TPC-DS-like query 77 at scale=3TB fails with maxResultSize exceeded error|
+|[#1576](https://github.com/NVIDIA/spark-rapids/issues/1576)|[BUG] loading SPARK-32639 example parquet file triggers a JVM crash |
+|[#1643](https://github.com/NVIDIA/spark-rapids/issues/1643)|[BUG] TPC-DS-Like q10, q35, and q69 - slow or hanging at leftSemiJoin|
+|[#1650](https://github.com/NVIDIA/spark-rapids/issues/1650)|[BUG] BenchmarkRunner does not include query name in JSON summary filename when running multiple queries|
+|[#1654](https://github.com/NVIDIA/spark-rapids/issues/1654)|[BUG] TPC-DS-like query 59 at scale=3TB with AQE fails with join mismatch|
+|[#1274](https://github.com/NVIDIA/spark-rapids/issues/1274)|[BUG] OutOfMemoryError - Maximum pool size exceeded while running 24 day criteo ETL Transform stage|
+|[#1497](https://github.com/NVIDIA/spark-rapids/issues/1497)|[BUG] Spark-rapids v0.3.0 pytest integration tests with UCX on FAILED on Yarn cluster|
+|[#1534](https://github.com/NVIDIA/spark-rapids/issues/1534)|[BUG] Spark 3.1.1 test failure in writing due to removal of InMemoryFileIndex.shouldFilterOut|
+|[#1155](https://github.com/NVIDIA/spark-rapids/issues/1155)|[BUG] on shutdown don't print `Socket closed` exception when shutting down UCX.scala|
+|[#1510](https://github.com/NVIDIA/spark-rapids/issues/1510)|[BUG] IllegalArgumentException during shuffle|
+|[#1513](https://github.com/NVIDIA/spark-rapids/issues/1513)|[BUG] executor not fully initialized may get calls from Spark, in the process setting the `catalog` incorrectly|
+|[#1466](https://github.com/NVIDIA/spark-rapids/issues/1466)|[BUG] Databricks build must run before the rapids nightly|
+|[#1456](https://github.com/NVIDIA/spark-rapids/issues/1456)|[BUG] Databricks 0.4 parquet integration tests fail|
+|[#1400](https://github.com/NVIDIA/spark-rapids/issues/1400)|[BUG] Regressions in spark-shell usage of benchmark utilities|
+|[#1119](https://github.com/NVIDIA/spark-rapids/issues/1119)|[BUG] inner join fails with Column size cannot be negative|
+|[#1079](https://github.com/NVIDIA/spark-rapids/issues/1079)|[BUG]The Scala UDF function cannot invoke the UDF compiler when it's passed to "explode"|
+|[#1298](https://github.com/NVIDIA/spark-rapids/issues/1298)|TPCxBB query16 failed at UnsupportedOperationException: org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainIntegerDictionary|
+|[#1271](https://github.com/NVIDIA/spark-rapids/issues/1271)|[BUG] CastOpSuite and AnsiCastOpSuite failing with ArithmeticException on Spark 3.1|
+|[#84](https://github.com/NVIDIA/spark-rapids/issues/84)|[BUG] sort does not match spark for -0.0 and 0.0|
+|[#578](https://github.com/NVIDIA/spark-rapids/issues/578)|[BUG] Spark 3.1 qa_nightly_select_test.py Full join test failures|
+|[#586](https://github.com/NVIDIA/spark-rapids/issues/586)|[BUG] Spark3.1 tpch failures|
+|[#837](https://github.com/NVIDIA/spark-rapids/issues/837)|[BUG] Distinct count of floating point values differs with regular spark|
+|[#953](https://github.com/NVIDIA/spark-rapids/issues/953)|[BUG] 3.1.0 pos_explode tests are failing|
+|[#127](https://github.com/NVIDIA/spark-rapids/issues/127)|[BUG] String CSV parsing does not respect nullValues|
+|[#1203](https://github.com/NVIDIA/spark-rapids/issues/1203)|[BUG] tpcds query 51 fails with join error on Spark 3.1.0|
+|[#750](https://github.com/NVIDIA/spark-rapids/issues/750)|[BUG] udf_cudf_test::test_with_column fails with IPC error |
+|[#1348](https://github.com/NVIDIA/spark-rapids/issues/1348)|[BUG] Host columnar decimal conversions are failing|
+|[#1270](https://github.com/NVIDIA/spark-rapids/issues/1270)|[BUG] Benchmark runner fails to produce report if benchmark fails due to an invalid query plan|
+|[#1179](https://github.com/NVIDIA/spark-rapids/issues/1179)|[BUG] SerializeConcatHostBuffersDeserializeBatch may have thread issues|
+|[#1115](https://github.com/NVIDIA/spark-rapids/issues/1115)|[BUG] Unchecked type warning in SparkQueryCompareTestSuite|
+
+### PRs
+|||
+|:---|:---|
+|[#1963](https://github.com/NVIDIA/spark-rapids/pull/1963)|Update changelog 0.4 [skip ci]|
+|[#1960](https://github.com/NVIDIA/spark-rapids/pull/1960)|Replace sonatype staging link with maven central link|
+|[#1945](https://github.com/NVIDIA/spark-rapids/pull/1945)|Update changelog 0.4 [skip ci]|
+|[#1910](https://github.com/NVIDIA/spark-rapids/pull/1910)|Make hash partitioning match CPU|
+|[#1927](https://github.com/NVIDIA/spark-rapids/pull/1927)|Change cuDF dependency to 0.18.1|
+|[#1934](https://github.com/NVIDIA/spark-rapids/pull/1934)|Update documentation to use cudf version 0.18.1|
+|[#1871](https://github.com/NVIDIA/spark-rapids/pull/1871)|Disable coalesce batch spilling to avoid cudf contiguous_split bug|
+|[#1849](https://github.com/NVIDIA/spark-rapids/pull/1849)|Update changelog for 0.4|
+|[#1744](https://github.com/NVIDIA/spark-rapids/pull/1744)|Fix NullPointerException on null partition insert|
+|[#1842](https://github.com/NVIDIA/spark-rapids/pull/1842)|Update to note support for 3.0.2|
+|[#1832](https://github.com/NVIDIA/spark-rapids/pull/1832)|Spark 3.1.1 shim no longer a snapshot shim|
+|[#1831](https://github.com/NVIDIA/spark-rapids/pull/1831)|Spark 3.0.2 shim no longer a snapshot shim|
+|[#1826](https://github.com/NVIDIA/spark-rapids/pull/1826)|Remove benchmarks|
+|[#1828](https://github.com/NVIDIA/spark-rapids/pull/1828)|Update cudf dependency to 0.18|
+|[#1813](https://github.com/NVIDIA/spark-rapids/pull/1813)|Fix LEAD/LAG failures in Spark 3.1.1|
+|[#1819](https://github.com/NVIDIA/spark-rapids/pull/1819)|Fix scaladoc warning in GpuCast|
+|[#1820](https://github.com/NVIDIA/spark-rapids/pull/1820)|[BUG] make modified check pre-merge only|
+|[#1780](https://github.com/NVIDIA/spark-rapids/pull/1780)|Remove SNAPSHOT from test and integration_test READMEs|
+|[#1809](https://github.com/NVIDIA/spark-rapids/pull/1809)|check if modified files after update_config/supported|
+|[#1804](https://github.com/NVIDIA/spark-rapids/pull/1804)|Update UCX documentation for RX_QUEUE_LEN and Docker|
+|[#1810](https://github.com/NVIDIA/spark-rapids/pull/1810)|Pandas UDF: Sort the data before computing the sum.|
+|[#1751](https://github.com/NVIDIA/spark-rapids/pull/1751)|Exclude foldable expressions from GPU if constant folding is disabled|
+|[#1798](https://github.com/NVIDIA/spark-rapids/pull/1798)|Add documentation about explain not on GPU when AQE is on|
+|[#1766](https://github.com/NVIDIA/spark-rapids/pull/1766)|Branch 0.4 release docs|
+|[#1794](https://github.com/NVIDIA/spark-rapids/pull/1794)|Build python output schema from udf expressions|
+|[#1783](https://github.com/NVIDIA/spark-rapids/pull/1783)|Fix the collect_list over window tests failures on db|
+|[#1781](https://github.com/NVIDIA/spark-rapids/pull/1781)|Better float/double cases for casting tests|
+|[#1790](https://github.com/NVIDIA/spark-rapids/pull/1790)|Record row counts in benchmark runs that call collect|
+|[#1779](https://github.com/NVIDIA/spark-rapids/pull/1779)|Add support of DateType and TimestampType for GetTimestamp expression|
+|[#1768](https://github.com/NVIDIA/spark-rapids/pull/1768)|Updating getting started Databricks docs|
+|[#1742](https://github.com/NVIDIA/spark-rapids/pull/1742)|Fix regression supporting to_date with Spark-3.1|
+|[#1775](https://github.com/NVIDIA/spark-rapids/pull/1775)|Fix ambiguous ordering for some tests|
+|[#1760](https://github.com/NVIDIA/spark-rapids/pull/1760)|Update GpuDataSourceScanExec and GpuBroadcastExchangeExec to fix audit issues|
+|[#1750](https://github.com/NVIDIA/spark-rapids/pull/1750)|Detect task failures in benchmarks|
+|[#1767](https://github.com/NVIDIA/spark-rapids/pull/1767)|Consistent Spark version for test and production|
+|[#1741](https://github.com/NVIDIA/spark-rapids/pull/1741)|Reduce regex use in CAST|
+|[#1756](https://github.com/NVIDIA/spark-rapids/pull/1756)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
+|[#1716](https://github.com/NVIDIA/spark-rapids/pull/1716)|Update RapidsShuffleManager documentation for branch 0.4|
+|[#1740](https://github.com/NVIDIA/spark-rapids/pull/1740)|Disable ORC writes until bug can be fixed|
+|[#1747](https://github.com/NVIDIA/spark-rapids/pull/1747)|Fix resource leaks in unit tests|
+|[#1725](https://github.com/NVIDIA/spark-rapids/pull/1725)|Branch 0.4 FAQ reorg|
+|[#1718](https://github.com/NVIDIA/spark-rapids/pull/1718)|CAST string to temporal type now calls isTimestamp|
+|[#1734](https://github.com/NVIDIA/spark-rapids/pull/1734)|Disable range partitioning if computation is needed|
+|[#1723](https://github.com/NVIDIA/spark-rapids/pull/1723)|Removed StructTypes support for ParquetCachedBatchSerializer as cudf doesn't support it yet|
+|[#1714](https://github.com/NVIDIA/spark-rapids/pull/1714)|Add support for RAPIDS accelerated Java UDFs|
+|[#1713](https://github.com/NVIDIA/spark-rapids/pull/1713)|Call GpuDeviceManager.shutdown when the executor plugin is shutting down|
+|[#1596](https://github.com/NVIDIA/spark-rapids/pull/1596)|Added in Decimal support to ParquetCachedBatchSerializer|
+|[#1706](https://github.com/NVIDIA/spark-rapids/pull/1706)|cleanup unused is_before_spark_310|
+|[#1685](https://github.com/NVIDIA/spark-rapids/pull/1685)|Fix CustomShuffleReader replacement when decimal types enabled|
+|[#1699](https://github.com/NVIDIA/spark-rapids/pull/1699)|Add docs about Spark 3.1 in standalone modes not needing extra class path|
+|[#1701](https://github.com/NVIDIA/spark-rapids/pull/1701)|remove xfail for orc test_input_meta for spark 3.1.0|
+|[#1703](https://github.com/NVIDIA/spark-rapids/pull/1703)|Remove xfail for spark 3.1.0 test_broadcast_join_mixed FullOuter|
+|[#1676](https://github.com/NVIDIA/spark-rapids/pull/1676)|BenchmarkRunner option to generate query plan diagrams in DOT format|
+|[#1695](https://github.com/NVIDIA/spark-rapids/pull/1695)|support alternate jar paths|
+|[#1694](https://github.com/NVIDIA/spark-rapids/pull/1694)|increase mem and limit parallelism for pre-merge|
+|[#1691](https://github.com/NVIDIA/spark-rapids/pull/1691)|add validate_execs_in_gpu_plan to pytest.ini|
+|[#1692](https://github.com/NVIDIA/spark-rapids/pull/1692)|Add the integration test resources to the test tarball|
+|[#1677](https://github.com/NVIDIA/spark-rapids/pull/1677)|When PTDS is enabled, print warning if the allocator is not ARENA|
+|[#1683](https://github.com/NVIDIA/spark-rapids/pull/1683)|update changelog to verify autotmerge 0.5 setup [skip ci]|
+|[#1673](https://github.com/NVIDIA/spark-rapids/pull/1673)|support auto-merge for branch 0.5 [skip ci]|
+|[#1681](https://github.com/NVIDIA/spark-rapids/pull/1681)|Xfail the collect_list tests for databricks|
+|[#1678](https://github.com/NVIDIA/spark-rapids/pull/1678)|Fix array/struct checks in Sort and HashAggregate and sorting tests in distributed mode|
+|[#1671](https://github.com/NVIDIA/spark-rapids/pull/1671)|Allow metrics to be configurable by level|
+|[#1675](https://github.com/NVIDIA/spark-rapids/pull/1675)|add run_pyspark_from_build.sh to the pytest distribution tarball|
+|[#1548](https://github.com/NVIDIA/spark-rapids/pull/1548)|Support executing collect_list on GPU with windowing.|
+|[#1593](https://github.com/NVIDIA/spark-rapids/pull/1593)|Avoid unnecessary Table instances after contiguous split|
+|[#1592](https://github.com/NVIDIA/spark-rapids/pull/1592)|Add in support for Decimal divide|
+|[#1668](https://github.com/NVIDIA/spark-rapids/pull/1668)|Implement way for python integration tests to validate Exec is in GPU plan|
+|[#1669](https://github.com/NVIDIA/spark-rapids/pull/1669)|Add FAQ entries for executor-per-GPU questions|
+|[#1661](https://github.com/NVIDIA/spark-rapids/pull/1661)|Enable Parquet test for file containing map struct key|
+|[#1664](https://github.com/NVIDIA/spark-rapids/pull/1664)|Filter nulls for left semi and left anti join to work around cudf|
+|[#1665](https://github.com/NVIDIA/spark-rapids/pull/1665)|Add better automated tests for Arrow columnar copy in HostColumnarToGpu|
+|[#1614](https://github.com/NVIDIA/spark-rapids/pull/1614)|add alluxio getting start document|
+|[#1639](https://github.com/NVIDIA/spark-rapids/pull/1639)|support GpuScalarSubquery|
+|[#1656](https://github.com/NVIDIA/spark-rapids/pull/1656)|Move UDF to Catalyst Expressions to its own document|
+|[#1663](https://github.com/NVIDIA/spark-rapids/pull/1663)|BenchmarkRunner - Include query name in JSON summary filename|
+|[#1655](https://github.com/NVIDIA/spark-rapids/pull/1655)|Fix extraneous shuffles added by AQE|
+|[#1652](https://github.com/NVIDIA/spark-rapids/pull/1652)|Fix typo in arrow optimized config name - spark.rapids.arrowCopyOptimizationEnabled|
+|[#1645](https://github.com/NVIDIA/spark-rapids/pull/1645)|Run Databricks IT with python-xdist parallel, includes test fixes and xfail|
+|[#1649](https://github.com/NVIDIA/spark-rapids/pull/1649)|Move building from source docs to contributing guide|
+|[#1637](https://github.com/NVIDIA/spark-rapids/pull/1637)|Fail DivModLike on zero divisor in ANSI mode|
+|[#1646](https://github.com/NVIDIA/spark-rapids/pull/1646)|Update links in rapids-udfs.md after moving to subfolder|
+|[#1641](https://github.com/NVIDIA/spark-rapids/pull/1641)|Xfail struct and array order by tests on Dataproc|
+|[#1565](https://github.com/NVIDIA/spark-rapids/pull/1565)|Add GPU accelerated array_contains operator|
+|[#1617](https://github.com/NVIDIA/spark-rapids/pull/1617)|Enable nightly test checks for Apache Spark|
+|[#1636](https://github.com/NVIDIA/spark-rapids/pull/1636)|RAPIDS accelerated Spark Scala UDF support|
+|[#1634](https://github.com/NVIDIA/spark-rapids/pull/1634)|Fix databricks build since Arrow code added|
+|[#1599](https://github.com/NVIDIA/spark-rapids/pull/1599)|Add division by zero tests for Spark 3.1 behavior|
+|[#1619](https://github.com/NVIDIA/spark-rapids/pull/1619)|Update GpuFileSourceScanExec to be in sync with DataSourceScanExec|
+|[#1631](https://github.com/NVIDIA/spark-rapids/pull/1631)|Explicitly add maven-jar-plugin version to improve incremental build time.|
+|[#1624](https://github.com/NVIDIA/spark-rapids/pull/1624)|Update explain format to show what will and will not run on the GPU|
+|[#1622](https://github.com/NVIDIA/spark-rapids/pull/1622)|Support faster copy for a custom DataSource V2 which supplies Arrow data|
+|[#1621](https://github.com/NVIDIA/spark-rapids/pull/1621)|Additional functionality docs|
+|[#1618](https://github.com/NVIDIA/spark-rapids/pull/1618)|update blossom-ci for security updates [skip ci]|
+|[#1562](https://github.com/NVIDIA/spark-rapids/pull/1562)|add alluxio support|
+|[#1597](https://github.com/NVIDIA/spark-rapids/pull/1597)|Documentation for Parquet serializer|
+|[#1611](https://github.com/NVIDIA/spark-rapids/pull/1611)|Add in flag for integration tests to not skip required tests|
+|[#1609](https://github.com/NVIDIA/spark-rapids/pull/1609)|Disable float round/bround by default|
+|[#1615](https://github.com/NVIDIA/spark-rapids/pull/1615)|Add in window support for average|
+|[#1610](https://github.com/NVIDIA/spark-rapids/pull/1610)|Limit length of spark app name in BenchmarkRunner|
+|[#1579](https://github.com/NVIDIA/spark-rapids/pull/1579)|Support TakeOrderedAndProject|
+|[#1581](https://github.com/NVIDIA/spark-rapids/pull/1581)|Support Decimal type for CollectLimitExec|
+|[#1591](https://github.com/NVIDIA/spark-rapids/pull/1591)|Add support for running multiple queries in BenchmarkRunner|
+|[#1595](https://github.com/NVIDIA/spark-rapids/pull/1595)|Fix Github documentation issue template|
+|[#1577](https://github.com/NVIDIA/spark-rapids/pull/1577)|rename directory from spark310 to spark311|
+|[#1578](https://github.com/NVIDIA/spark-rapids/pull/1578)|Test to track RAPIDS-side issues re SPARK-32639|
+|[#1583](https://github.com/NVIDIA/spark-rapids/pull/1583)|fix request-action issue [skip ci]|
+|[#1555](https://github.com/NVIDIA/spark-rapids/pull/1555)|Enable ANSI mode for CAST string to timestamp|
+|[#1531](https://github.com/NVIDIA/spark-rapids/pull/1531)|Decimal Support for writing Parquet|
+|[#1545](https://github.com/NVIDIA/spark-rapids/pull/1545)|Support comparing ORC data|
+|[#1570](https://github.com/NVIDIA/spark-rapids/pull/1570)|Branch 0.4 doc cleanup|
+|[#1569](https://github.com/NVIDIA/spark-rapids/pull/1569)|Add shim method shouldIgnorePath|
+|[#1564](https://github.com/NVIDIA/spark-rapids/pull/1564)|Add in support for Decimal Multiply and DIV|
+|[#1561](https://github.com/NVIDIA/spark-rapids/pull/1561)|Decimal support for add and subtract|
+|[#1560](https://github.com/NVIDIA/spark-rapids/pull/1560)|support sum in window aggregation for decimal|
+|[#1546](https://github.com/NVIDIA/spark-rapids/pull/1546)|Cleanup shutdown logging for UCX shuffle|
+|[#1551](https://github.com/NVIDIA/spark-rapids/pull/1551)|RAPIDS-accelerated Hive UDFs support all types|
+|[#1543](https://github.com/NVIDIA/spark-rapids/pull/1543)|Shuffle/transport enabled by default|
+|[#1552](https://github.com/NVIDIA/spark-rapids/pull/1552)|Disable blackduck signature check|
+|[#1540](https://github.com/NVIDIA/spark-rapids/pull/1540)|Handle ShuffleManager api calls when plugin is not fully initialized|
+|[#1547](https://github.com/NVIDIA/spark-rapids/pull/1547)|Cleanup shuffle transport receive calls|
+|[#1512](https://github.com/NVIDIA/spark-rapids/pull/1512)|Support window operations on Decimal|
+|[#1532](https://github.com/NVIDIA/spark-rapids/pull/1532)|Support casting from decimal to decimal|
+|[#1542](https://github.com/NVIDIA/spark-rapids/pull/1542)|Change the number of partitions to zero when a range is empty|
+|[#1506](https://github.com/NVIDIA/spark-rapids/pull/1506)|Add --use-decimals flag to TPC-DS ConvertFiles|
+|[#1511](https://github.com/NVIDIA/spark-rapids/pull/1511)|Remove unused Jenkinsfiles [skip ci]|
+|[#1505](https://github.com/NVIDIA/spark-rapids/pull/1505)|Add least, greatest and eqNullSafe support for DecimalType|
+|[#1484](https://github.com/NVIDIA/spark-rapids/pull/1484)|add doc for nsight systems bundled with cuda toolkit|
+|[#1478](https://github.com/NVIDIA/spark-rapids/pull/1478)|Documentation for RAPIDS-accelerated Hive UDFs|
+|[#1477](https://github.com/NVIDIA/spark-rapids/pull/1477)|Allow structs and arrays to pass through for Shuffle and Sort |
+|[#1489](https://github.com/NVIDIA/spark-rapids/pull/1489)|Adds in some support for the array sql function|
+|[#1438](https://github.com/NVIDIA/spark-rapids/pull/1438)|Cast from numeric types to decimal type|
+|[#1493](https://github.com/NVIDIA/spark-rapids/pull/1493)|Moved ParquetRecordMaterializer to the shim package to follow convention|
+|[#1495](https://github.com/NVIDIA/spark-rapids/pull/1495)|Fix merge conflict, merge branch 0.3 to branch 0.4 [skip ci]|
+|[#1472](https://github.com/NVIDIA/spark-rapids/pull/1472)|Add an example RAPIDS-accelerated Hive UDF using native code|
+|[#1488](https://github.com/NVIDIA/spark-rapids/pull/1488)|Rename Spark 3.1.0 shim to Spark 3.1.1 to match community|
+|[#1474](https://github.com/NVIDIA/spark-rapids/pull/1474)|Fix link|
+|[#1476](https://github.com/NVIDIA/spark-rapids/pull/1476)|DecimalType support for Aggregate Count|
+|[#1475](https://github.com/NVIDIA/spark-rapids/pull/1475)| Join support for DecimalType|
+|[#1244](https://github.com/NVIDIA/spark-rapids/pull/1244)|Support round and bround SQL functions |
+|[#1458](https://github.com/NVIDIA/spark-rapids/pull/1458)|Add in support for struct and named_struct|
+|[#1465](https://github.com/NVIDIA/spark-rapids/pull/1465)|DecimalType support for UnionExec and ExpandExec|
+|[#1450](https://github.com/NVIDIA/spark-rapids/pull/1450)|Add dynamic configs for the spark-rapids IT pipelines|
+|[#1207](https://github.com/NVIDIA/spark-rapids/pull/1207)|Spark SQL hash function using murmur3|
+|[#1457](https://github.com/NVIDIA/spark-rapids/pull/1457)|Support reading decimal columns from parquet files on Databricks|
+|[#1455](https://github.com/NVIDIA/spark-rapids/pull/1455)|Upgrade Scala Maven Plugin to 4.3.0|
+|[#1453](https://github.com/NVIDIA/spark-rapids/pull/1453)|DecimalType support for IfElse and Coalesce|
+|[#1452](https://github.com/NVIDIA/spark-rapids/pull/1452)|Support DecimalType for CaseWhen|
+|[#1444](https://github.com/NVIDIA/spark-rapids/pull/1444)|Improve UX when running benchmarks from Spark shell|
+|[#1294](https://github.com/NVIDIA/spark-rapids/pull/1294)|Support reading decimal columns from parquet files|
+|[#1153](https://github.com/NVIDIA/spark-rapids/pull/1153)|Scala UDF will compile children expressions in Project|
+|[#1416](https://github.com/NVIDIA/spark-rapids/pull/1416)|Optimize mvn dependency download scripts|
+|[#1430](https://github.com/NVIDIA/spark-rapids/pull/1430)|Add project for testing code that requires Spark 3.1.0 or later|
+|[#1425](https://github.com/NVIDIA/spark-rapids/pull/1425)|Add in Decimal support for abs, floor, ceil, unary - and unary +|
+|[#1427](https://github.com/NVIDIA/spark-rapids/pull/1427)|Revert "Make the multi-threaded parquet reader the default"|
+|[#1420](https://github.com/NVIDIA/spark-rapids/pull/1420)|Add udf jar to nightly integration tests|
+|[#1422](https://github.com/NVIDIA/spark-rapids/pull/1422)|Log the number of concurrent gpu tasks allowed on Executor startup|
+|[#1401](https://github.com/NVIDIA/spark-rapids/pull/1401)|Accelerate the coalescing parquet reader when reading files from multiple partitioned folders|
+|[#1413](https://github.com/NVIDIA/spark-rapids/pull/1413)|Add config for cast float to integral types|
+|[#1313](https://github.com/NVIDIA/spark-rapids/pull/1313)|Support spilling to disk directly via cuFile/GDS|
+|[#1411](https://github.com/NVIDIA/spark-rapids/pull/1411)|Add udf-examples jar to databricks build|
+|[#1412](https://github.com/NVIDIA/spark-rapids/pull/1412)|Fix a lot of tests marked with xfail for Spark 3.1.0 that no longer fail|
+|[#1414](https://github.com/NVIDIA/spark-rapids/pull/1414)|Build merged code of HEAD and BASE branch for pre-merge [skip ci]|
+|[#1409](https://github.com/NVIDIA/spark-rapids/pull/1409)|Add option to use decimals in tpc-ds csv to parquet conversion|
+|[#1410](https://github.com/NVIDIA/spark-rapids/pull/1410)|Add Decimal support for In, InSet, AtLeastNNonNulls, GetArrayItem, GetStructField, and GenerateExec|
+|[#1408](https://github.com/NVIDIA/spark-rapids/pull/1408)|Support RAPIDS-accelerated HiveGenericUDF|
+|[#1407](https://github.com/NVIDIA/spark-rapids/pull/1407)|Update docs and tests for null CSV support|
+|[#1393](https://github.com/NVIDIA/spark-rapids/pull/1393)|Support RAPIDS-accelerated HiveSimpleUDF|
+|[#1392](https://github.com/NVIDIA/spark-rapids/pull/1392)|Turn on hash partitioning for decimal support|
+|[#1402](https://github.com/NVIDIA/spark-rapids/pull/1402)|Better GPU Cast type checks|
+|[#1404](https://github.com/NVIDIA/spark-rapids/pull/1404)|Fix branch 0.4 merge conflict|
+|[#1323](https://github.com/NVIDIA/spark-rapids/pull/1323)|More advanced type checking and documentation|
+|[#1391](https://github.com/NVIDIA/spark-rapids/pull/1391)|Remove extra null join filtering because cudf is fast for this now.|
+|[#1395](https://github.com/NVIDIA/spark-rapids/pull/1395)|Fix branch-0.3 -> branch-0.4 automerge|
+|[#1382](https://github.com/NVIDIA/spark-rapids/pull/1382)|Handle "MM[/-]dd" and "dd[/-]MM" datetime formats in UnixTimeExprMeta|
+|[#1390](https://github.com/NVIDIA/spark-rapids/pull/1390)|Accelerated columnar to row/row to columnar for decimal|
+|[#1380](https://github.com/NVIDIA/spark-rapids/pull/1380)|Adds in basic support for decimal sort, sum, and some shuffle|
+|[#1367](https://github.com/NVIDIA/spark-rapids/pull/1367)|Reuse gpu expression conversion rules when checking sort order|
+|[#1349](https://github.com/NVIDIA/spark-rapids/pull/1349)|Add canonicalization tests|
+|[#1368](https://github.com/NVIDIA/spark-rapids/pull/1368)|Move to cudf 0.18-SNAPSHOT|
+|[#1361](https://github.com/NVIDIA/spark-rapids/pull/1361)|Use the correct precision when reading spark columnar data.|
+|[#1273](https://github.com/NVIDIA/spark-rapids/pull/1273)|Update docs and scripts to 0.4.0-SNAPSHOT|
+|[#1321](https://github.com/NVIDIA/spark-rapids/pull/1321)|Refactor to stop inheriting from HashJoin|
+|[#1311](https://github.com/NVIDIA/spark-rapids/pull/1311)|ParquetCachedBatchSerializer code cleanup|
+|[#1303](https://github.com/NVIDIA/spark-rapids/pull/1303)|Add explicit outputOrdering for BHJ and SHJ in spark310 shim|
+|[#1299](https://github.com/NVIDIA/spark-rapids/pull/1299)|Benchmark runner improved error handling|
+
+## Release 0.3
+
+### Features
+|||
+|:---|:---|
+|[#1002](https://github.com/NVIDIA/spark-rapids/issues/1002)|[FEA] RapidsHostColumnVectorCore should verify cudf data with respect to the expected spark type |
+|[#444](https://github.com/NVIDIA/spark-rapids/issues/444)|[FEA] Plugable Cache|
+|[#1158](https://github.com/NVIDIA/spark-rapids/issues/1158)|[FEA] Better documentation on type support|
+|[#57](https://github.com/NVIDIA/spark-rapids/issues/57)|[FEA] Support INT96 for parquet reads and writes|
+|[#1003](https://github.com/NVIDIA/spark-rapids/issues/1003)|[FEA] Reduce overlap between RapidsHostColumnVector and RapidsHostColumnVectorCore|
+|[#913](https://github.com/NVIDIA/spark-rapids/issues/913)|[FEA] In Pluggable Cache Support CalendarInterval while creating CachedBatches|
+|[#1092](https://github.com/NVIDIA/spark-rapids/issues/1092)|[FEA] In Pluggable Cache handle nested types having CalendarIntervalType and NullType|
+|[#670](https://github.com/NVIDIA/spark-rapids/issues/670)|[FEA] Support NullType|
+|[#50](https://github.com/NVIDIA/spark-rapids/issues/50)|[FEA] support `spark.sql.legacy.timeParserPolicy`|
+|[#1144](https://github.com/NVIDIA/spark-rapids/issues/1144)|[FEA] Remove Databricks 3.0.0 shim layer|
+|[#1096](https://github.com/NVIDIA/spark-rapids/issues/1096)|[FEA] Implement parquet CreateDataSourceTableAsSelectCommand|
+|[#688](https://github.com/NVIDIA/spark-rapids/issues/688)|[FEA] udf compiler should be auto-appended to `spark.sql.extensions`|
+|[#502](https://github.com/NVIDIA/spark-rapids/issues/502)|[FEA] Support Databricks 7.3 LTS Runtime|
+|[#764](https://github.com/NVIDIA/spark-rapids/issues/764)|[FEA] Sanity checks for cudf jar mismatch|
+|[#1018](https://github.com/NVIDIA/spark-rapids/issues/1018)|[FEA] Log details related to GPU memory fragmentation on GPU OOM|
+|[#619](https://github.com/NVIDIA/spark-rapids/issues/619)|[FEA] log whether libcudf and libcudfjni were built for PTDS|
+|[#905](https://github.com/NVIDIA/spark-rapids/issues/905)|[FEA] create AWS EMR 3.0.1 shim|
+|[#838](https://github.com/NVIDIA/spark-rapids/issues/838)|[FEA] Support window count for a column|
+|[#864](https://github.com/NVIDIA/spark-rapids/issues/864)|[FEA] config option to enable RMM arena memory resource|
+|[#430](https://github.com/NVIDIA/spark-rapids/issues/430)|[FEA] Audit: Parquet Writer support for TIMESTAMP_MILLIS|
+|[#818](https://github.com/NVIDIA/spark-rapids/issues/818)|[FEA] Create shim layer for AWS EMR |
+|[#608](https://github.com/NVIDIA/spark-rapids/issues/608)|[FEA] Parquet small file optimization improve handle merge schema|
+
+### Performance
+|||
+|:---|:---|
+|[#446](https://github.com/NVIDIA/spark-rapids/issues/446)|[FEA] Test jucx in 1.9.x branch|
+|[#1038](https://github.com/NVIDIA/spark-rapids/issues/1038)|[FEA] Accelerate the data transfer for plan `WindowInPandasExec`|
+|[#533](https://github.com/NVIDIA/spark-rapids/issues/533)|[FEA] Improve PTDS performance|
+|[#849](https://github.com/NVIDIA/spark-rapids/issues/849)|[FEA] Have GpuColumnarBatchSerializer return GpuColumnVectorFromBuffer instances|
+|[#784](https://github.com/NVIDIA/spark-rapids/issues/784)|[FEA] Allow Host Spilling to be more dynamic|
+|[#627](https://github.com/NVIDIA/spark-rapids/issues/627)|[FEA] Further parquet reading small file improvements|
+|[#5](https://github.com/NVIDIA/spark-rapids/issues/5)|[FEA] Support Adaptive Execution|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#1423](https://github.com/NVIDIA/spark-rapids/issues/1423)|[BUG] Mortgage ETL sample failed with spark.sql.adaptive enabled on AWS EMR 6.2 |
+|[#1369](https://github.com/NVIDIA/spark-rapids/issues/1369)|[BUG] TPC-DS Query Failing on EMR 6.2 with AQE|
+|[#1344](https://github.com/NVIDIA/spark-rapids/issues/1344)|[BUG] Spark-rapids Pytests failed on On Databricks cluster spark standalone mode|
+|[#1279](https://github.com/NVIDIA/spark-rapids/issues/1279)|[BUG] TPC-DS query 2 failing with NPE|
+|[#1280](https://github.com/NVIDIA/spark-rapids/issues/1280)|[BUG] TPC-DS query 93 failing with UnsupportedOperationException|
+|[#1308](https://github.com/NVIDIA/spark-rapids/issues/1308)|[BUG] TPC-DS query 14a runs much slower on 0.3|
+|[#1284](https://github.com/NVIDIA/spark-rapids/issues/1284)|[BUG] TPC-DS query 77 at scale=1TB fails with maxResultSize exceeded error|
+|[#1061](https://github.com/NVIDIA/spark-rapids/issues/1061)|[BUG] orc_test.py is failing|
+|[#1197](https://github.com/NVIDIA/spark-rapids/issues/1197)|[BUG] java.lang.NullPointerException when exporting delta table|
+|[#685](https://github.com/NVIDIA/spark-rapids/issues/685)|[BUG] In ParquetCachedBatchSerializer, serializing parquet buffers might blow up in certain cases|
+|[#1269](https://github.com/NVIDIA/spark-rapids/issues/1269)|[BUG] GpuSubstring is not expected to be a part of a SortOrder|
+|[#1246](https://github.com/NVIDIA/spark-rapids/issues/1246)|[BUG] Many TPC-DS benchmarks fail when writing to Parquet|
+|[#961](https://github.com/NVIDIA/spark-rapids/issues/961)|[BUG] ORC predicate pushdown should work with case-insensitive analysis|
+|[#962](https://github.com/NVIDIA/spark-rapids/issues/962)|[BUG] Loading columns from an ORC file without column names returns no data|
+|[#1245](https://github.com/NVIDIA/spark-rapids/issues/1245)|[BUG] Code adding buffers to the spillable store should synchronize|
+|[#570](https://github.com/NVIDIA/spark-rapids/issues/570)|[BUG] Continue debugging OOM after ensuring device store is empty|
+|[#972](https://github.com/NVIDIA/spark-rapids/issues/972)|[BUG] total time metric is redundant with scan time|
+|[#1039](https://github.com/NVIDIA/spark-rapids/issues/1039)|[BUG] UNBOUNDED window ranges on null timestamp columns produce incorrect results.|
+|[#1195](https://github.com/NVIDIA/spark-rapids/issues/1195)|[BUG] AcceleratedColumnarToRowIterator queue empty|
+|[#1177](https://github.com/NVIDIA/spark-rapids/issues/1177)|[BUG] leaks possible in the rapids shuffle if batches are received after the task completes|
+|[#1216](https://github.com/NVIDIA/spark-rapids/issues/1216)|[BUG] Failure to recognize ORC file format when loaded via Hive|
+|[#898](https://github.com/NVIDIA/spark-rapids/issues/898)|[BUG] count reductions are failing on databricks due to lack of Complete support|
+|[#1184](https://github.com/NVIDIA/spark-rapids/issues/1184)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks 3.0.1|
+|[#1151](https://github.com/NVIDIA/spark-rapids/issues/1151)|[BUG] Add databricks 3.0.1 shim layer for GpuWindowInPandasExec.|
+|[#1199](https://github.com/NVIDIA/spark-rapids/issues/1199)|[BUG] No data size in Input column in Stages page from Spark UI when using Parquet as file source|
+|[#1031](https://github.com/NVIDIA/spark-rapids/issues/1031)|[BUG] dependency info properties file contains error messages|
+|[#1149](https://github.com/NVIDIA/spark-rapids/issues/1149)|[BUG] Scaladoc warnings in GpuDataSource|
+|[#1185](https://github.com/NVIDIA/spark-rapids/issues/1185)|[BUG] test_hash_multiple_mode_query failing|
+|[#724](https://github.com/NVIDIA/spark-rapids/issues/724)|[BUG] PySpark test_broadcast_nested_loop_join_special_case intermittent failure|
+|[#1164](https://github.com/NVIDIA/spark-rapids/issues/1164)|[BUG] ansi_cast tests are failing in 3.1.0|
+|[#1110](https://github.com/NVIDIA/spark-rapids/issues/1110)|[BUG] Special date "now" has wrong value on GPU|
+|[#1139](https://github.com/NVIDIA/spark-rapids/issues/1139)|[BUG] Host columnar to GPU can be very slow|
+|[#1094](https://github.com/NVIDIA/spark-rapids/issues/1094)|[BUG] unix_timestamp on GPU returns invalid data for special dates|
+|[#1098](https://github.com/NVIDIA/spark-rapids/issues/1098)|[BUG] unix_timestamp on GPU returns invalid data for bad input|
+|[#1082](https://github.com/NVIDIA/spark-rapids/issues/1082)|[BUG] string to timestamp conversion fails with split|
+|[#1140](https://github.com/NVIDIA/spark-rapids/issues/1140)|[BUG] ConcurrentModificationException error after scala test suite completes|
+|[#1073](https://github.com/NVIDIA/spark-rapids/issues/1073)|[BUG] java.lang.RuntimeException: BinaryExpressions must override either eval or nullSafeEval|
+|[#975](https://github.com/NVIDIA/spark-rapids/issues/975)|[BUG] BroadcastExchangeExec fails to fall back to CPU on driver node on GCP Dataproc|
+|[#773](https://github.com/NVIDIA/spark-rapids/issues/773)|[BUG] Investigate high task deserialization|
+|[#1035](https://github.com/NVIDIA/spark-rapids/issues/1035)|[BUG] TPC-DS query 90 with AQE enabled fails with doExecuteBroadcast exception|
+|[#825](https://github.com/NVIDIA/spark-rapids/issues/825)|[BUG] test_window_aggs_for_ranges intermittently fails|
+|[#1008](https://github.com/NVIDIA/spark-rapids/issues/1008)|[BUG] limit function is producing inconsistent result when type is Byte, Long, Boolean and Timestamp|
+|[#996](https://github.com/NVIDIA/spark-rapids/issues/996)|[BUG] TPC-DS benchmark via spark-submit does not provide option to disable appending .dat to path|
+|[#1006](https://github.com/NVIDIA/spark-rapids/issues/1006)|[BUG] Spark 3.1.0 changed BasicWriteTaskStats, breaking BasicColumnarWriteTaskStatsTracker|
+|[#985](https://github.com/NVIDIA/spark-rapids/issues/985)|[BUG] missing metric `dataSize`|
+|[#881](https://github.com/NVIDIA/spark-rapids/issues/881)|[BUG] cannot disable Sort by itself|
+|[#812](https://github.com/NVIDIA/spark-rapids/issues/812)|[BUG] Test failures for 0.2 when run with multiple executors|
+|[#925](https://github.com/NVIDIA/spark-rapids/issues/925)|[BUG] Range window-functions with non-timestamp order-by expressions not falling back to CPU|
+|[#852](https://github.com/NVIDIA/spark-rapids/issues/852)|[BUG] BenchUtils.compareResults cannot compare partitioned files when ignoreOrdering=false|
+|[#868](https://github.com/NVIDIA/spark-rapids/issues/868)|[BUG] Rounding error when casting timestamp to string for timestamps before 1970|
+|[#880](https://github.com/NVIDIA/spark-rapids/issues/880)|[BUG] doing a window operation with an orderby for a single constant crashes|
+|[#776](https://github.com/NVIDIA/spark-rapids/issues/776)|[BUG] Integration test fails on spark 3.1.0-SNAPSHOT|
+|[#874](https://github.com/NVIDIA/spark-rapids/issues/874)|[BUG] `RapidsConf.scala` has an inconsistency for `spark.rapids.sql.format.parquet.multiThreadedRead`|
+|[#860](https://github.com/NVIDIA/spark-rapids/issues/860)|[BUG] we need to mark columns from received shuffle buffers as `GpuColumnVectorFromBuffer`|
+|[#122](https://github.com/NVIDIA/spark-rapids/issues/122)|[BUG] CSV timestamp parsing is broken for TS < 1902 and TS > 2038|
+|[#810](https://github.com/NVIDIA/spark-rapids/issues/810)|[BUG] UDF Integration tests fail if pandas is not installed|
+|[#746](https://github.com/NVIDIA/spark-rapids/issues/746)|[BUG] cudf_udf_test.py is flakey|
+|[#811](https://github.com/NVIDIA/spark-rapids/issues/811)|[BUG] 0.3 nightly is timing out |
+|[#574](https://github.com/NVIDIA/spark-rapids/issues/574)|[BUG] Fix GpuTimeSub for Spark 3.1.0|
+
+### PRs
+|||
+|:---|:---|
+|[#1496](https://github.com/NVIDIA/spark-rapids/pull/1496)|Update changelog for v0.3.0 release [skip ci]|
+|[#1473](https://github.com/NVIDIA/spark-rapids/pull/1473)|Update documentation for 0.3 release|
+|[#1371](https://github.com/NVIDIA/spark-rapids/pull/1371)|Start Guide for RAPIDS on AWS EMR 6.2|
+|[#1446](https://github.com/NVIDIA/spark-rapids/pull/1446)|Update changelog for 0.3.0 release [skip ci]|
+|[#1439](https://github.com/NVIDIA/spark-rapids/pull/1439)|When AQE is enabled we fail to fix up exchanges properly on EMR|
+|[#1433](https://github.com/NVIDIA/spark-rapids/pull/1433)|fix pandas 1.2 compatible issue|
+|[#1424](https://github.com/NVIDIA/spark-rapids/pull/1424)|Make the multi-threaded parquet reader the default since coalescing doesn't handle partitioned files well|
+|[#1389](https://github.com/NVIDIA/spark-rapids/pull/1389)|Update project version to 0.3.0|
+|[#1387](https://github.com/NVIDIA/spark-rapids/pull/1387)|Update cudf version to 0.17|
+|[#1370](https://github.com/NVIDIA/spark-rapids/pull/1370)|[REVIEW] init changelog 0.3 [skip ci]|
+|[#1376](https://github.com/NVIDIA/spark-rapids/pull/1376)|MetaUtils.getBatchFromMeta should return batches with GpuColumnVectorFromBuffer|
+|[#1358](https://github.com/NVIDIA/spark-rapids/pull/1358)|auto-merge: instant merge after creation [skip ci]|
+|[#1359](https://github.com/NVIDIA/spark-rapids/pull/1359)|Use SortOrder from shims.|
+|[#1343](https://github.com/NVIDIA/spark-rapids/pull/1343)|Do not run UDFs when the partition is empty.|
+|[#1342](https://github.com/NVIDIA/spark-rapids/pull/1342)|Fix and edit docs for standalone mode|
+|[#1350](https://github.com/NVIDIA/spark-rapids/pull/1350)|fix GpuRangePartitioning canonicalization|
+|[#1281](https://github.com/NVIDIA/spark-rapids/pull/1281)|Documentation added for testing|
+|[#1336](https://github.com/NVIDIA/spark-rapids/pull/1336)|Fix missing post-shuffle coalesce with AQE|
+|[#1318](https://github.com/NVIDIA/spark-rapids/pull/1318)|Fix copying GpuFileSourceScanExec node|
+|[#1337](https://github.com/NVIDIA/spark-rapids/pull/1337)|Use UTC instead of GMT|
+|[#1307](https://github.com/NVIDIA/spark-rapids/pull/1307)|Fallback to cpu when reading Delta log files for stats|
+|[#1310](https://github.com/NVIDIA/spark-rapids/pull/1310)|Fix canonicalization of GpuFileSourceScanExec, GpuShuffleCoalesceExec|
+|[#1302](https://github.com/NVIDIA/spark-rapids/pull/1302)|Add GpuSubstring handling to SortOrder canonicalization|
+|[#1265](https://github.com/NVIDIA/spark-rapids/pull/1265)|Chunking input before writing a ParquetCachedBatch|
+|[#1278](https://github.com/NVIDIA/spark-rapids/pull/1278)|Add a config to disable decimal types by default|
+|[#1272](https://github.com/NVIDIA/spark-rapids/pull/1272)|Add Alias to shims|
+|[#1268](https://github.com/NVIDIA/spark-rapids/pull/1268)|Adds in support docs for 0.3 release|
+|[#1235](https://github.com/NVIDIA/spark-rapids/pull/1235)|Trigger reading and handling control data.|
+|[#1266](https://github.com/NVIDIA/spark-rapids/pull/1266)|Updating Databricks getting started for 0.3 release|
+|[#1291](https://github.com/NVIDIA/spark-rapids/pull/1291)|Increase pre-merge resource requests [skip ci]|
+|[#1275](https://github.com/NVIDIA/spark-rapids/pull/1275)|Temporarily disable more CAST tests for Spark 3.1.0|
+|[#1264](https://github.com/NVIDIA/spark-rapids/pull/1264)|Fix race condition in batch creation|
+|[#1260](https://github.com/NVIDIA/spark-rapids/pull/1260)|Update UCX license info in NOTIFY-binary for 1.9 and RAPIDS plugin copyright dates|
+|[#1247](https://github.com/NVIDIA/spark-rapids/pull/1247)|Ensure column names are valid when writing benchmark query results to file|
+|[#1240](https://github.com/NVIDIA/spark-rapids/pull/1240)|Fix loading from ORC file with no column names|
+|[#1242](https://github.com/NVIDIA/spark-rapids/pull/1242)|Remove compatibility documentation about unsupported INT96|
+|[#1192](https://github.com/NVIDIA/spark-rapids/pull/1192)|[REVIEW] Support GpuFilter and GpuCoalesceBatches for decimal data|
+|[#1170](https://github.com/NVIDIA/spark-rapids/pull/1170)|Add nested type support to MetaUtils|
+|[#1194](https://github.com/NVIDIA/spark-rapids/pull/1194)|Drop redundant total time metric from scan|
+|[#1248](https://github.com/NVIDIA/spark-rapids/pull/1248)|At BatchedTableCompressor.finish synchronize to allow for "right-size…|
+|[#1169](https://github.com/NVIDIA/spark-rapids/pull/1169)|Use CUDF's "UNBOUNDED" window boundaries for time-range queries.|
+|[#1204](https://github.com/NVIDIA/spark-rapids/pull/1204)|Avoid empty batches on columnar to row conversion|
+|[#1133](https://github.com/NVIDIA/spark-rapids/pull/1133)|Refactor batch coalesce to be based solely on batch data size|
+|[#1237](https://github.com/NVIDIA/spark-rapids/pull/1237)|In transport, limit pending transfer requests to fit within a bounce|
+|[#1232](https://github.com/NVIDIA/spark-rapids/pull/1232)|Move SortOrder creation to shims|
+|[#1068](https://github.com/NVIDIA/spark-rapids/pull/1068)|Write int96 to parquet|
+|[#1193](https://github.com/NVIDIA/spark-rapids/pull/1193)|Verify shuffle of decimal columns|
+|[#1180](https://github.com/NVIDIA/spark-rapids/pull/1180)|Remove batches if they are received after the iterator detects that t…|
+|[#1173](https://github.com/NVIDIA/spark-rapids/pull/1173)|Support relational operators for decimal type|
+|[#1220](https://github.com/NVIDIA/spark-rapids/pull/1220)|Support replacing ORC format when Hive is configured|
+|[#1219](https://github.com/NVIDIA/spark-rapids/pull/1219)|Upgrade to jucx 1.9.0|
+|[#1081](https://github.com/NVIDIA/spark-rapids/pull/1081)|Add option to upload benchmark summary JSON file|
+|[#1217](https://github.com/NVIDIA/spark-rapids/pull/1217)|Aggregate reductions in Complete mode should use updateExpressions|
+|[#1218](https://github.com/NVIDIA/spark-rapids/pull/1218)|Remove obsolete HiveStringType usage|
+|[#1214](https://github.com/NVIDIA/spark-rapids/pull/1214)|changelog update 2020-11-30. Trigger automerge check [skip ci]|
+|[#1210](https://github.com/NVIDIA/spark-rapids/pull/1210)|Support auto-merge for branch-0.4 [skip ci]|
+|[#1202](https://github.com/NVIDIA/spark-rapids/pull/1202)|Fix a bug with the support for java.lang.StringBuilder.append.|
+|[#1213](https://github.com/NVIDIA/spark-rapids/pull/1213)|Skip casting StringType to TimestampType for Spark 310|
+|[#1201](https://github.com/NVIDIA/spark-rapids/pull/1201)|Replace only window expressions on databricks.|
+|[#1208](https://github.com/NVIDIA/spark-rapids/pull/1208)|[BUG] Fix GHSL2020-239 [skip ci]|
+|[#1205](https://github.com/NVIDIA/spark-rapids/pull/1205)|Fix missing input bytes read metric for Parquet|
+|[#1206](https://github.com/NVIDIA/spark-rapids/pull/1206)|Update Spark 3.1 shim for ShuffleOrigin shuffle parameter|
+|[#1196](https://github.com/NVIDIA/spark-rapids/pull/1196)|Rename ShuffleCoalesceExec to GpuShuffleCoalesceExec|
+|[#1191](https://github.com/NVIDIA/spark-rapids/pull/1191)|Skip window array tests for databricks.|
+|[#1183](https://github.com/NVIDIA/spark-rapids/pull/1183)|Support for CalendarIntervalType and NullType|
+|[#1150](https://github.com/NVIDIA/spark-rapids/pull/1150)|udf spec|
+|[#1188](https://github.com/NVIDIA/spark-rapids/pull/1188)|Add in tests for parquet nested pruning support|
+|[#1189](https://github.com/NVIDIA/spark-rapids/pull/1189)|Enable NullType for First and Last in 3.0.1+|
+|[#1181](https://github.com/NVIDIA/spark-rapids/pull/1181)|Fix resource leaks in unit tests|
+|[#1186](https://github.com/NVIDIA/spark-rapids/pull/1186)|Fix compilation and scaladoc warnings|
+|[#1187](https://github.com/NVIDIA/spark-rapids/pull/1187)|Updated documentation for distinct count compatibility|
+|[#1182](https://github.com/NVIDIA/spark-rapids/pull/1182)|Close buffer catalog on device manager shutdown|
+|[#1137](https://github.com/NVIDIA/spark-rapids/pull/1137)|Let GpuWindowInPandas declare ArrayType supported.|
+|[#1176](https://github.com/NVIDIA/spark-rapids/pull/1176)|Add in support for null type|
+|[#1174](https://github.com/NVIDIA/spark-rapids/pull/1174)|Fix race condition in SerializeConcatHostBuffersDeserializeBatch|
+|[#1175](https://github.com/NVIDIA/spark-rapids/pull/1175)|Fix leaks seen in shuffle tests|
+|[#1138](https://github.com/NVIDIA/spark-rapids/pull/1138)|[REVIEW] Support decimal type for GpuProjectExec|
+|[#1162](https://github.com/NVIDIA/spark-rapids/pull/1162)|Set job descriptions in benchmark runner|
+|[#1172](https://github.com/NVIDIA/spark-rapids/pull/1172)|Revert "Fix race condition (#1165)"|
+|[#1060](https://github.com/NVIDIA/spark-rapids/pull/1060)|Show partition metrics for custom shuffler reader|
+|[#1152](https://github.com/NVIDIA/spark-rapids/pull/1152)|Add spark301db shim layer for WindowInPandas.|
+|[#1167](https://github.com/NVIDIA/spark-rapids/pull/1167)|Nulls out the dataframe if --gc-between-runs is set|
+|[#1165](https://github.com/NVIDIA/spark-rapids/pull/1165)|Fix race condition in SerializeConcatHostBuffersDeserializeBatch|
+|[#1163](https://github.com/NVIDIA/spark-rapids/pull/1163)|Add in support for GetStructField|
+|[#1166](https://github.com/NVIDIA/spark-rapids/pull/1166)|Fix the cast tests for 3.1.0+|
+|[#1159](https://github.com/NVIDIA/spark-rapids/pull/1159)|fix bug where 'now' had same value as 'today' for timestamps|
+|[#1161](https://github.com/NVIDIA/spark-rapids/pull/1161)|Fix nightly build pipeline failure.|
+|[#1160](https://github.com/NVIDIA/spark-rapids/pull/1160)|Fix some performance problems with columnar to columnar conversion|
+|[#1105](https://github.com/NVIDIA/spark-rapids/pull/1105)|[REVIEW] Change ColumnViewAccess usage to work with ColumnView|
+|[#1148](https://github.com/NVIDIA/spark-rapids/pull/1148)|Add in tests for Maps and extend map support where possible|
+|[#1154](https://github.com/NVIDIA/spark-rapids/pull/1154)|Mark test as xfail until we can get a fix in|
+|[#1113](https://github.com/NVIDIA/spark-rapids/pull/1113)|Support unix_timestamp on GPU for subset of formats|
+|[#1156](https://github.com/NVIDIA/spark-rapids/pull/1156)|Fix warning introduced in iterator suite|
+|[#1095](https://github.com/NVIDIA/spark-rapids/pull/1095)|Dependency info|
+|[#1145](https://github.com/NVIDIA/spark-rapids/pull/1145)|Remove support for databricks 7.0 runtime - shim spark300db|
+|[#1147](https://github.com/NVIDIA/spark-rapids/pull/1147)|Change the assert to require for handling TIMESTAMP_MILLIS in isDateTimeRebaseNeeded |
+|[#1132](https://github.com/NVIDIA/spark-rapids/pull/1132)|Add in basic support to read structs from parquet|
+|[#1121](https://github.com/NVIDIA/spark-rapids/pull/1121)|Shuffle/better error handling|
+|[#1134](https://github.com/NVIDIA/spark-rapids/pull/1134)|Support saveAsTable for writing orc and parquet|
+|[#1124](https://github.com/NVIDIA/spark-rapids/pull/1124)|Add shim layers for GpuWindowInPandasExec.|
+|[#1131](https://github.com/NVIDIA/spark-rapids/pull/1131)|Add in some basic support for Structs|
+|[#1127](https://github.com/NVIDIA/spark-rapids/pull/1127)|Add in basic support for reading lists from parquet|
+|[#1129](https://github.com/NVIDIA/spark-rapids/pull/1129)|Fix resource leaks with new shuffle optimization|
+|[#1116](https://github.com/NVIDIA/spark-rapids/pull/1116)|Optimize normal shuffle by coalescing smaller batches on host|
+|[#1102](https://github.com/NVIDIA/spark-rapids/pull/1102)|Auto-register UDF extension when main plugin is set|
+|[#1108](https://github.com/NVIDIA/spark-rapids/pull/1108)|Remove integration test pipelines on NGCC|
+|[#1123](https://github.com/NVIDIA/spark-rapids/pull/1123)|Mark Pandas udf over window tests as xfail on databricks until they can be fixed|
+|[#1120](https://github.com/NVIDIA/spark-rapids/pull/1120)|Add in support for filtering ArrayType|
+|[#1080](https://github.com/NVIDIA/spark-rapids/pull/1080)|Support for CalendarIntervalType and NullType for ParquetCachedSerializer|
+|[#994](https://github.com/NVIDIA/spark-rapids/pull/994)|Packs bounce buffers for highly partitioned shuffles|
+|[#1112](https://github.com/NVIDIA/spark-rapids/pull/1112)|Remove bad config from pytest setup|
+|[#1107](https://github.com/NVIDIA/spark-rapids/pull/1107)|closeOnExcept -> withResources in MetaUtils|
+|[#1104](https://github.com/NVIDIA/spark-rapids/pull/1104)|Support lists to/from the GPU|
+|[#1106](https://github.com/NVIDIA/spark-rapids/pull/1106)|Improve mechanism for expected exceptions in tests|
+|[#1069](https://github.com/NVIDIA/spark-rapids/pull/1069)|Accelerate the data transfer between JVM and Python for the plan 'GpuWindowInPandasExec'|
+|[#1099](https://github.com/NVIDIA/spark-rapids/pull/1099)|Update how we deal with type checking|
+|[#1077](https://github.com/NVIDIA/spark-rapids/pull/1077)|Improve AQE transitions for shuffle and coalesce batches|
+|[#1097](https://github.com/NVIDIA/spark-rapids/pull/1097)|Cleanup some instances of excess closure serialization|
+|[#1090](https://github.com/NVIDIA/spark-rapids/pull/1090)|Fix the integration build|
+|[#1086](https://github.com/NVIDIA/spark-rapids/pull/1086)|Speed up test performance using pytest-xdist|
+|[#1084](https://github.com/NVIDIA/spark-rapids/pull/1084)|Avoid issues where more scalars that expected show up in an expression|
+|[#1076](https://github.com/NVIDIA/spark-rapids/pull/1076)|[FEA] Support Databricks 7.3 LTS Runtime|
+|[#1083](https://github.com/NVIDIA/spark-rapids/pull/1083)|Revert "Get cudf/spark dependency from the correct .m2 dir"|
+|[#1062](https://github.com/NVIDIA/spark-rapids/pull/1062)|Get cudf/spark dependency from the correct .m2 dir|
+|[#1078](https://github.com/NVIDIA/spark-rapids/pull/1078)|Another round of fixes for mapping of DataType to DType|
+|[#1066](https://github.com/NVIDIA/spark-rapids/pull/1066)|More fixes for conversion to ColumnarBatch|
+|[#1029](https://github.com/NVIDIA/spark-rapids/pull/1029)|BenchmarkRunner should produce JSON summary file even when queries fail|
+|[#1055](https://github.com/NVIDIA/spark-rapids/pull/1055)|Fix build warnings|
+|[#1064](https://github.com/NVIDIA/spark-rapids/pull/1064)|Use array instead of List for from(Table, DataType)|
+|[#1057](https://github.com/NVIDIA/spark-rapids/pull/1057)|Fix empty table broadcast requiring a GPU on driver node|
+|[#1047](https://github.com/NVIDIA/spark-rapids/pull/1047)|Sanity checks for cudf jar mismatch|
+|[#1044](https://github.com/NVIDIA/spark-rapids/pull/1044)|Accelerated row to columnar and columnar to row transitions|
+|[#1056](https://github.com/NVIDIA/spark-rapids/pull/1056)|Add query number to Spark app name when running benchmarks|
+|[#1054](https://github.com/NVIDIA/spark-rapids/pull/1054)|Log total RMM allocated on GPU OOM|
+|[#1053](https://github.com/NVIDIA/spark-rapids/pull/1053)|Remove isGpuBroadcastNestedLoopJoin from shims|
+|[#1052](https://github.com/NVIDIA/spark-rapids/pull/1052)|Allow for GPUCoalesceBatch to deal with Map|
+|[#1051](https://github.com/NVIDIA/spark-rapids/pull/1051)|Add simple retry for URM dependencies [skip ci]|
+|[#1046](https://github.com/NVIDIA/spark-rapids/pull/1046)|Fix broken links|
+|[#1017](https://github.com/NVIDIA/spark-rapids/pull/1017)|Log whether PTDS is enabled|
+|[#1040](https://github.com/NVIDIA/spark-rapids/pull/1040)|Update to cudf 0.17-SNAPSHOT and fix tests|
+|[#1042](https://github.com/NVIDIA/spark-rapids/pull/1042)|Fix inconsistencies in AQE support for broadcast joins|
+|[#1037](https://github.com/NVIDIA/spark-rapids/pull/1037)|Add in support for the SQL functions Least and Greatest|
+|[#1036](https://github.com/NVIDIA/spark-rapids/pull/1036)|Increase number of retries when waiting for databricks cluster|
+|[#1034](https://github.com/NVIDIA/spark-rapids/pull/1034)|[BUG] To honor spark.rapids.memory.gpu.pool=NONE|
+|[#854](https://github.com/NVIDIA/spark-rapids/pull/854)|Arbitrary function call in UDF|
+|[#1028](https://github.com/NVIDIA/spark-rapids/pull/1028)|Update to cudf-0.16|
+|[#1023](https://github.com/NVIDIA/spark-rapids/pull/1023)|Add --gc-between-run flag for TPC* benchmarks.|
+|[#1001](https://github.com/NVIDIA/spark-rapids/pull/1001)|ColumnarBatch to CachedBatch and back|
+|[#990](https://github.com/NVIDIA/spark-rapids/pull/990)|Parquet coalesce file reader for local filesystems|
+|[#1014](https://github.com/NVIDIA/spark-rapids/pull/1014)|Add --append-dat flag for TPC-DS benchmark|
+|[#991](https://github.com/NVIDIA/spark-rapids/pull/991)|Updated GCP Dataproc Mortgage-ETL-GPU.ipynb|
+|[#886](https://github.com/NVIDIA/spark-rapids/pull/886)|Spark BinaryType and cast to BinaryType|
+|[#1016](https://github.com/NVIDIA/spark-rapids/pull/1016)|Change Hash Aggregate to allow pass-through on MapType|
+|[#984](https://github.com/NVIDIA/spark-rapids/pull/984)|Add support for MapType in selected operators |
+|[#1012](https://github.com/NVIDIA/spark-rapids/pull/1012)|Update for new position parameter in Spark 3.1.0 RegExpReplace|
+|[#995](https://github.com/NVIDIA/spark-rapids/pull/995)|Add shim for EMR 3.0.1 and EMR 3.0.1-SNAPSHOT|
+|[#998](https://github.com/NVIDIA/spark-rapids/pull/998)|Update benchmark automation script|
+|[#1000](https://github.com/NVIDIA/spark-rapids/pull/1000)|Always use RAPIDS shuffle when running TPCH and Mortgage tests|
+|[#981](https://github.com/NVIDIA/spark-rapids/pull/981)|Change databricks build to dynamically create a cluster|
+|[#986](https://github.com/NVIDIA/spark-rapids/pull/986)|Fix missing dataSize metric when using RAPIDS shuffle|
+|[#914](https://github.com/NVIDIA/spark-rapids/pull/914)|Write InternalRow to CachedBatch|
+|[#934](https://github.com/NVIDIA/spark-rapids/pull/934)|Iterator to make it easier to work with a window of blocks in the RAPIDS shuffle|
+|[#992](https://github.com/NVIDIA/spark-rapids/pull/992)|Skip post-clean if aborted before the image build stage in pre-merge [skip ci]|
+|[#988](https://github.com/NVIDIA/spark-rapids/pull/988)|Change in Spark caused the 3.1.0 CI to fail|
+|[#983](https://github.com/NVIDIA/spark-rapids/pull/983)|clean jenkins file for premerge on NGCC|
+|[#964](https://github.com/NVIDIA/spark-rapids/pull/964)|Refactor TPC benchmarks to reduce duplicate code|
+|[#978](https://github.com/NVIDIA/spark-rapids/pull/978)|Enable scalastyle checks for udf-compiler module|
+|[#949](https://github.com/NVIDIA/spark-rapids/pull/949)|Fix GpuWindowExec to work with a CPU SortExec|
+|[#973](https://github.com/NVIDIA/spark-rapids/pull/973)|Stop reporting totalTime metric for GpuShuffleExchangeExec|
+|[#968](https://github.com/NVIDIA/spark-rapids/pull/968)|XFail pos_explode tests until final fix can be put in|
+|[#970](https://github.com/NVIDIA/spark-rapids/pull/970)|Add legacy config to clear active Spark 3.1.0 session in tests|
+|[#918](https://github.com/NVIDIA/spark-rapids/pull/918)|Benchmark runner script|
+|[#915](https://github.com/NVIDIA/spark-rapids/pull/915)|Add option to control number of partitions when converting from CSV to Parquet|
+|[#944](https://github.com/NVIDIA/spark-rapids/pull/944)|Fix some issues with non-determinism|
+|[#935](https://github.com/NVIDIA/spark-rapids/pull/935)|Add in support/tests for a window count on a column|
+|[#940](https://github.com/NVIDIA/spark-rapids/pull/940)|Fix closeOnExcept suppressed exception handling|
+|[#942](https://github.com/NVIDIA/spark-rapids/pull/942)|fix github action env setup [skip ci]|
+|[#933](https://github.com/NVIDIA/spark-rapids/pull/933)|Update first/last tests to avoid non-determinisim and ordering differences|
+|[#931](https://github.com/NVIDIA/spark-rapids/pull/931)|Fix checking for nullable columns in window range query|
+|[#924](https://github.com/NVIDIA/spark-rapids/pull/924)|Benchmark guide update for command-line interface / spark-submit|
+|[#926](https://github.com/NVIDIA/spark-rapids/pull/926)|Move pandas_udf functions into the tests functions|
+|[#929](https://github.com/NVIDIA/spark-rapids/pull/929)|Pick a default tableId to use that is non 0 so that flatbuffers allow…|
+|[#928](https://github.com/NVIDIA/spark-rapids/pull/928)|Fix RapidsBufferStore NPE when no spillable buffers are available|
+|[#820](https://github.com/NVIDIA/spark-rapids/pull/820)|Benchmarking guide|
+|[#859](https://github.com/NVIDIA/spark-rapids/pull/859)|Compare partitioned files in order|
+|[#916](https://github.com/NVIDIA/spark-rapids/pull/916)|create new sparkContext explicitly in CPU notebook|
+|[#917](https://github.com/NVIDIA/spark-rapids/pull/917)|create new SparkContext in GPU notebook explicitly.|
+|[#919](https://github.com/NVIDIA/spark-rapids/pull/919)|Add label benchmark to performance subsection in changelog|
+|[#850](https://github.com/NVIDIA/spark-rapids/pull/850)| Add in basic support for lead/lag|
+|[#843](https://github.com/NVIDIA/spark-rapids/pull/843)|[REVIEW] Cache plugin to handle reading CachedBatch to an InternalRow|
+|[#904](https://github.com/NVIDIA/spark-rapids/pull/904)|Add command-line argument for benchmark result filename|
+|[#909](https://github.com/NVIDIA/spark-rapids/pull/909)|GCP preview version image name update|
+|[#903](https://github.com/NVIDIA/spark-rapids/pull/903)|update getting-started-gcp.md with new component list|
+|[#900](https://github.com/NVIDIA/spark-rapids/pull/900)|Turn off CollectLimitExec replacement by default|
+|[#907](https://github.com/NVIDIA/spark-rapids/pull/907)|remove configs from databricks that shouldn't be used by default|
+|[#893](https://github.com/NVIDIA/spark-rapids/pull/893)|Fix rounding error when casting timestamp to string for timestamps before 1970|
+|[#899](https://github.com/NVIDIA/spark-rapids/pull/899)|Mark reduction corner case tests as xfail on databricks until they can be fixed|
+|[#894](https://github.com/NVIDIA/spark-rapids/pull/894)|Replace whole-buffer slicing with direct refcounting|
+|[#891](https://github.com/NVIDIA/spark-rapids/pull/891)|Add config to dump heap on GPU OOM|
+|[#890](https://github.com/NVIDIA/spark-rapids/pull/890)|Clean up CoalesceBatch to use withResource|
+|[#892](https://github.com/NVIDIA/spark-rapids/pull/892)|Only manifest the current batch in cached block shuffle read iterator|
+|[#871](https://github.com/NVIDIA/spark-rapids/pull/871)|Add support for using the arena allocator|
+|[#889](https://github.com/NVIDIA/spark-rapids/pull/889)|Fix crash on scalar only orderby|
+|[#879](https://github.com/NVIDIA/spark-rapids/pull/879)|Update SpillableColumnarBatch to remove buffer from catalog on close|
+|[#888](https://github.com/NVIDIA/spark-rapids/pull/888)|Shrink detect scope to compile only [skip ci]|
+|[#885](https://github.com/NVIDIA/spark-rapids/pull/885)|[BUG] fix IT dockerfile arguments [skip ci]|
+|[#883](https://github.com/NVIDIA/spark-rapids/pull/883)|[BUG] fix IT dockerfile args ordering [skip ci]|
+|[#875](https://github.com/NVIDIA/spark-rapids/pull/875)|fix the inconsistency for `spark.rapids.sql.format.parquet.multiThreadedRead` in RapidsConf.scala|
+|[#862](https://github.com/NVIDIA/spark-rapids/pull/862)|Migrate nightly&integration pipelines to blossom [skip ci]|
+|[#872](https://github.com/NVIDIA/spark-rapids/pull/872)|Ensure that receive-side batches use GpuColumnVectorFromBuffer to avoid|
+|[#833](https://github.com/NVIDIA/spark-rapids/pull/833)|Add nvcomp LZ4 codec support|
+|[#870](https://github.com/NVIDIA/spark-rapids/pull/870)|Cleaned up tests and documentation for csv timestamp parsing|
+|[#823](https://github.com/NVIDIA/spark-rapids/pull/823)|Add command-line interface for TPC-* for use with spark-submit|
+|[#856](https://github.com/NVIDIA/spark-rapids/pull/856)|Move GpuWindowInPandasExec in shims layers|
+|[#756](https://github.com/NVIDIA/spark-rapids/pull/756)|Add stream-time metric|
+|[#832](https://github.com/NVIDIA/spark-rapids/pull/832)|Skip pandas tests if pandas cannot be found|
+|[#841](https://github.com/NVIDIA/spark-rapids/pull/841)|Fix a hanging issue when processing empty data.|
+|[#840](https://github.com/NVIDIA/spark-rapids/pull/840)|[REVIEW] Fixed failing cache tests|
+|[#848](https://github.com/NVIDIA/spark-rapids/pull/848)|Update task memory and disk spill metrics when buffer store spills|
+|[#851](https://github.com/NVIDIA/spark-rapids/pull/851)|Use contiguous table when deserializing columnar batch|
+|[#857](https://github.com/NVIDIA/spark-rapids/pull/857)|fix pvc scheduling issue|
+|[#853](https://github.com/NVIDIA/spark-rapids/pull/853)|Remove nodeAffinity from premerge pipeline|
+|[#796](https://github.com/NVIDIA/spark-rapids/pull/796)|Record spark plan SQL metrics to JSON when running benchmarks|
+|[#781](https://github.com/NVIDIA/spark-rapids/pull/781)|Add AQE unit tests|
+|[#824](https://github.com/NVIDIA/spark-rapids/pull/824)|Skip cudf_udf test by default|
+|[#839](https://github.com/NVIDIA/spark-rapids/pull/839)|First/Last reduction and cleanup of agg APIs|
+|[#827](https://github.com/NVIDIA/spark-rapids/pull/827)|Add Spark 3.0 EMR Shim layer |
+|[#816](https://github.com/NVIDIA/spark-rapids/pull/816)|[BUG] fix nightly is timing out|
+|[#782](https://github.com/NVIDIA/spark-rapids/pull/782)|Benchmark utility to perform diff of output from benchmark runs, allowing for precision differences|
+|[#813](https://github.com/NVIDIA/spark-rapids/pull/813)|Revert "Enable tests in udf_cudf_test.py"|
+|[#788](https://github.com/NVIDIA/spark-rapids/pull/788)|[FEA] Persist workspace data on PVC for premerge|
+|[#805](https://github.com/NVIDIA/spark-rapids/pull/805)|[FEA] nightly build trigger both IT on spark 300 and 301|
+|[#797](https://github.com/NVIDIA/spark-rapids/pull/797)|Allow host spill store to fit a buffer larger than configured max size|
+|[#807](https://github.com/NVIDIA/spark-rapids/pull/807)|Deploy integration-tests javadoc and sources|
+|[#777](https://github.com/NVIDIA/spark-rapids/pull/777)|Enable tests in udf_cudf_test.py|
+|[#790](https://github.com/NVIDIA/spark-rapids/pull/790)|CI: Update cudf python to 0.16 nightly|
+|[#772](https://github.com/NVIDIA/spark-rapids/pull/772)|Add support for empty array construction.|
+|[#783](https://github.com/NVIDIA/spark-rapids/pull/783)|Improved GpuArrowEvalPythonExec|
+|[#771](https://github.com/NVIDIA/spark-rapids/pull/771)|Various improvements to benchmarks|
+|[#763](https://github.com/NVIDIA/spark-rapids/pull/763)|[REVIEW] Allow CoalesceBatch to spill data that is not in active use|
+|[#727](https://github.com/NVIDIA/spark-rapids/pull/727)|Update cudf dependency to 0.16-SNAPSHOT|
+|[#726](https://github.com/NVIDIA/spark-rapids/pull/726)|parquet writer support for TIMESTAMP_MILLIS|
+|[#674](https://github.com/NVIDIA/spark-rapids/pull/674)|Unit test for GPU exchange re-use with AQE|
+|[#723](https://github.com/NVIDIA/spark-rapids/pull/723)|Update code coverage to find source files in new places|
+|[#766](https://github.com/NVIDIA/spark-rapids/pull/766)|Update the integration Dockerfile to reduce the image size|
+|[#762](https://github.com/NVIDIA/spark-rapids/pull/762)|Fixing conflicts in branch-0.3|
+|[#738](https://github.com/NVIDIA/spark-rapids/pull/738)|[auto-merge] branch-0.2 to branch-0.3 - resolve conflict|
+|[#722](https://github.com/NVIDIA/spark-rapids/pull/722)|Initial code changes to support spilling outside of shuffle|
+|[#693](https://github.com/NVIDIA/spark-rapids/pull/693)|Update jenkins files for 0.3|
+|[#692](https://github.com/NVIDIA/spark-rapids/pull/692)|Merge shims dependency to spark-3.0.1 into branch-0.3|
+|[#690](https://github.com/NVIDIA/spark-rapids/pull/690)|Update the version to 0.3.0-SNAPSHOT|
+
+## Release 0.2
+
+### Features
+|||
+|:---|:---|
+|[#696](https://github.com/NVIDIA/spark-rapids/issues/696)|[FEA] run integration tests against SPARK-3.0.1|
+|[#455](https://github.com/NVIDIA/spark-rapids/issues/455)|[FEA] Support UCX shuffle with optimized AQE|
+|[#510](https://github.com/NVIDIA/spark-rapids/issues/510)|[FEA] Investigate libcudf features needed to support struct schema pruning during loads|
+|[#541](https://github.com/NVIDIA/spark-rapids/issues/541)|[FEA] Scala UDF: Support for null value operands|
+|[#542](https://github.com/NVIDIA/spark-rapids/issues/542)|[FEA] Scala UDF: Support for Date and Time |
+|[#499](https://github.com/NVIDIA/spark-rapids/issues/499)|[FEA] disable any kind of warnings about ExecutedCommandExec not being on the GPU|
+|[#540](https://github.com/NVIDIA/spark-rapids/issues/540)|[FEA] Scala UDF: Support for String replaceFirst()|
+|[#340](https://github.com/NVIDIA/spark-rapids/issues/340)|[FEA] widen the rendered Jekyll pages|
+|[#602](https://github.com/NVIDIA/spark-rapids/issues/602)|[FEA] don't release with any -SNAPSHOT dependencies|
+|[#579](https://github.com/NVIDIA/spark-rapids/issues/579)|[FEA] Auto-merge between branches|
+|[#515](https://github.com/NVIDIA/spark-rapids/issues/515)|[FEA] Write tests for AQE skewed join optimization|
+|[#452](https://github.com/NVIDIA/spark-rapids/issues/452)|[FEA] Update HashSortOptimizerSuite to work with AQE|
+|[#454](https://github.com/NVIDIA/spark-rapids/issues/454)|[FEA] Update GpuCoalesceBatchesSuite to work with AQE enabled|
+|[#354](https://github.com/NVIDIA/spark-rapids/issues/354)|[FEA] Spark 3.1 FileSourceScanExec adds parameter optionalNumCoalescedBuckets|
+|[#566](https://github.com/NVIDIA/spark-rapids/issues/566)|[FEA] Add support for StringSplit with an array index.|
+|[#524](https://github.com/NVIDIA/spark-rapids/issues/524)|[FEA] Add GPU specific metrics to GpuFileSourceScanExec|
+|[#494](https://github.com/NVIDIA/spark-rapids/issues/494)|[FEA] Add some AQE-specific tests to the PySpark test suite|
+|[#146](https://github.com/NVIDIA/spark-rapids/issues/146)|[FEA] Python tests should support running with Adaptive Query Execution enabled|
+|[#465](https://github.com/NVIDIA/spark-rapids/issues/465)|[FEA] Audit: Update script to audit multiple versions of Spark |
+|[#488](https://github.com/NVIDIA/spark-rapids/issues/488)|[FEA] Ability to limit total GPU memory used|
+|[#70](https://github.com/NVIDIA/spark-rapids/issues/70)|[FEA] Support StringSplit|
+|[#403](https://github.com/NVIDIA/spark-rapids/issues/403)|[FEA] Add in support for GetArrayItem|
+|[#493](https://github.com/NVIDIA/spark-rapids/issues/493)|[FEA] Implement shuffle optimization when AQE is enabled|
+|[#500](https://github.com/NVIDIA/spark-rapids/issues/500)|[FEA] Add maven profiles for testing with AQE on or off|
+|[#471](https://github.com/NVIDIA/spark-rapids/issues/471)|[FEA] create a formal process for updating the github-pages branch|
+|[#233](https://github.com/NVIDIA/spark-rapids/issues/233)|[FEA] Audit DataWritingCommandExec |
+|[#240](https://github.com/NVIDIA/spark-rapids/issues/240)|[FEA] Audit Api validation script follow on - Optimize StringToTypeTag |
+|[#388](https://github.com/NVIDIA/spark-rapids/issues/388)|[FEA] Audit WindowExec|
+|[#425](https://github.com/NVIDIA/spark-rapids/issues/425)|[FEA] Add tests for configs in BatchScan Readers|
+|[#453](https://github.com/NVIDIA/spark-rapids/issues/453)|[FEA] Update HashAggregatesSuite to work with AQE|
+|[#184](https://github.com/NVIDIA/spark-rapids/issues/184)|[FEA] Enable NoScalaDoc scalastyle rule|
+|[#438](https://github.com/NVIDIA/spark-rapids/issues/438)|[FEA] Enable StringLPad|
+|[#232](https://github.com/NVIDIA/spark-rapids/issues/232)|[FEA] Audit SortExec |
+|[#236](https://github.com/NVIDIA/spark-rapids/issues/236)|[FEA] Audit ShuffleExchangeExec |
+|[#355](https://github.com/NVIDIA/spark-rapids/issues/355)|[FEA] Support Multiple Spark versions in the same jar|
+|[#385](https://github.com/NVIDIA/spark-rapids/issues/385)|[FEA] Support RangeExec on the GPU|
+|[#317](https://github.com/NVIDIA/spark-rapids/issues/317)|[FEA] Write test wrapper to run SQL queries via pyspark|
+|[#235](https://github.com/NVIDIA/spark-rapids/issues/235)|[FEA] Audit BroadcastExchangeExec|
+|[#234](https://github.com/NVIDIA/spark-rapids/issues/234)|[FEA] Audit BatchScanExec|
+|[#238](https://github.com/NVIDIA/spark-rapids/issues/238)|[FEA] Audit ShuffledHashJoinExec |
+|[#237](https://github.com/NVIDIA/spark-rapids/issues/237)|[FEA] Audit BroadcastHashJoinExec |
+|[#316](https://github.com/NVIDIA/spark-rapids/issues/316)|[FEA] Add some basic Dataframe tests for CoalesceExec|
+|[#145](https://github.com/NVIDIA/spark-rapids/issues/145)|[FEA] Scala tests should support running with Adaptive Query Execution enabled|
+|[#231](https://github.com/NVIDIA/spark-rapids/issues/231)|[FEA] Audit ProjectExec |
+|[#229](https://github.com/NVIDIA/spark-rapids/issues/229)|[FEA] Audit FileSourceScanExec |
+
+### Performance
+|||
+|:---|:---|
+|[#326](https://github.com/NVIDIA/spark-rapids/issues/326)|[DISCUSS] Shuffle read-side error handling|
+|[#601](https://github.com/NVIDIA/spark-rapids/issues/601)|[FEA] Optimize unnecessary sorts when replacing SortAggregate|
+|[#333](https://github.com/NVIDIA/spark-rapids/issues/333)|[FEA] Better handling of reading lots of small Parquet files|
+|[#511](https://github.com/NVIDIA/spark-rapids/issues/511)|[FEA] Connect shuffle table compression to shuffle exec metrics|
+|[#15](https://github.com/NVIDIA/spark-rapids/issues/15)|[FEA] Multiple threads sharing the same GPU|
+|[#272](https://github.com/NVIDIA/spark-rapids/issues/272)|[DOC] Getting started guide for UCX shuffle|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#780](https://github.com/NVIDIA/spark-rapids/issues/780)|[BUG] Inner Join dropping data with bucketed Table input|
+|[#569](https://github.com/NVIDIA/spark-rapids/issues/569)|[BUG] left_semi_join operation is abnormal and seriously time-consuming|
+|[#744](https://github.com/NVIDIA/spark-rapids/issues/744)|[BUG] TPC-DS query 6 now produces incorrect results.|
+|[#718](https://github.com/NVIDIA/spark-rapids/issues/718)|[BUG] GpuBroadcastHashJoinExec ArrayIndexOutOfBoundsException|
+|[#698](https://github.com/NVIDIA/spark-rapids/issues/698)|[BUG] batch coalesce can fail to appear between columnar shuffle and subsequent columnar operation|
+|[#658](https://github.com/NVIDIA/spark-rapids/issues/658)|[BUG] GpuCoalesceBatches collectTime metric can be underreported|
+|[#59](https://github.com/NVIDIA/spark-rapids/issues/59)|[BUG] enable tests for string literals in a select|
+|[#486](https://github.com/NVIDIA/spark-rapids/issues/486)|[BUG] GpuWindowExec does not implement requiredChildOrdering|
+|[#631](https://github.com/NVIDIA/spark-rapids/issues/631)|[BUG] Rows are dropped when AQE is enabled in some cases|
+|[#671](https://github.com/NVIDIA/spark-rapids/issues/671)|[BUG] Databricks hash_aggregate_test fails trying to canonicalize a WrappedAggFunction|
+|[#218](https://github.com/NVIDIA/spark-rapids/issues/218)|[BUG] Window function COUNT(x) includes null-values, when it shouldn't|
+|[#153](https://github.com/NVIDIA/spark-rapids/issues/153)|[BUG] Incorrect output from partial-only hash aggregates with multiple distincts and non-distinct functions|
+|[#656](https://github.com/NVIDIA/spark-rapids/issues/656)|[BUG] integration tests produce hive metadata files|
+|[#607](https://github.com/NVIDIA/spark-rapids/issues/607)|[BUG] Fix misleading "cannot run on GPU" warnings when AQE is enabled|
+|[#630](https://github.com/NVIDIA/spark-rapids/issues/630)|[BUG] GpuCustomShuffleReader metrics always show zero rows/batches output|
+|[#643](https://github.com/NVIDIA/spark-rapids/issues/643)|[BUG] race condition while registering a buffer and spilling at the same time|
+|[#606](https://github.com/NVIDIA/spark-rapids/issues/606)|[BUG] Multiple scans for same data source with TPC-DS query59 with delta format|
+|[#626](https://github.com/NVIDIA/spark-rapids/issues/626)|[BUG] parquet_test showing leaked memory buffer|
+|[#155](https://github.com/NVIDIA/spark-rapids/issues/155)|[BUG] Incorrect output from averages with filters in partial only mode|
+|[#277](https://github.com/NVIDIA/spark-rapids/issues/277)|[BUG] HashAggregateSuite failure when AQE is enabled|
+|[#276](https://github.com/NVIDIA/spark-rapids/issues/276)|[BUG] GpuCoalesceBatchSuite failure when AQE is enabled|
+|[#598](https://github.com/NVIDIA/spark-rapids/issues/598)|[BUG] Non-deterministic output from MapOutputTracker.getStatistics() with AQE on GPU|
+|[#192](https://github.com/NVIDIA/spark-rapids/issues/192)|[BUG] test_read_merge_schema fails on Databricks|
+|[#341](https://github.com/NVIDIA/spark-rapids/issues/341)|[BUG] Document compression formats for readers/writers|
+|[#587](https://github.com/NVIDIA/spark-rapids/issues/587)|[BUG] Spark3.1 changed FileScan which means our GpuScans need to be added to shim layer|
+|[#362](https://github.com/NVIDIA/spark-rapids/issues/362)|[BUG] Implement getReaderForRange in the RapidsShuffleManager|
+|[#528](https://github.com/NVIDIA/spark-rapids/issues/528)|[BUG] HashAggregateSuite "Avg Distinct with filter" no longer valid when testing against Spark 3.1.0|
+|[#416](https://github.com/NVIDIA/spark-rapids/issues/416)|[BUG] Fix Spark 3.1.0 integration tests|
+|[#556](https://github.com/NVIDIA/spark-rapids/issues/556)|[BUG] NPE when removing shuffle|
+|[#553](https://github.com/NVIDIA/spark-rapids/issues/553)|[BUG] GpuColumnVector build warnings from raw type access|
+|[#492](https://github.com/NVIDIA/spark-rapids/issues/492)|[BUG] Re-enable AQE integration tests|
+|[#275](https://github.com/NVIDIA/spark-rapids/issues/275)|[BUG] TpchLike query 2 fails when AQE is enabled|
+|[#508](https://github.com/NVIDIA/spark-rapids/issues/508)|[BUG] GpuUnion publishes metrics on the UI that are all 0|
+|[#269](https://github.com/NVIDIA/spark-rapids/issues/269)|Needed to add `--conf spark.driver.extraClassPath=` |
+|[#473](https://github.com/NVIDIA/spark-rapids/issues/473)|[BUG] PartMerge:countDistinct:sum fails sporadically|
+|[#531](https://github.com/NVIDIA/spark-rapids/issues/531)|[BUG] Temporary RMM workaround needs to be removed|
+|[#532](https://github.com/NVIDIA/spark-rapids/issues/532)|[BUG] NPE when enabling shuffle manager|
+|[#525](https://github.com/NVIDIA/spark-rapids/issues/525)|[BUG] GpuFilterExec reports incorrect nullability of output in some cases|
+|[#483](https://github.com/NVIDIA/spark-rapids/issues/483)|[BUG] Multiple scans for the same parquet data source|
+|[#382](https://github.com/NVIDIA/spark-rapids/issues/382)|[BUG] Spark3.1 StringFallbackSuite regexp_replace null cpu fall back test fails.|
+|[#489](https://github.com/NVIDIA/spark-rapids/issues/489)|[FEA] Fix Spark 3.1 GpuHashJoin since it now requires CodegenSupport|
+|[#441](https://github.com/NVIDIA/spark-rapids/issues/441)|[BUG] test_broadcast_nested_loop_join_special_case fails on databricks|
+|[#347](https://github.com/NVIDIA/spark-rapids/issues/347)|[BUG] Failed to read Parquet file generated by GPU-enabled Spark.|
+|[#433](https://github.com/NVIDIA/spark-rapids/issues/433)|`InSet` operator produces an error for Strings|
+|[#144](https://github.com/NVIDIA/spark-rapids/issues/144)|[BUG] spark.sql.legacy.parquet.datetimeRebaseModeInWrite is ignored|
+|[#323](https://github.com/NVIDIA/spark-rapids/issues/323)|[BUG] GpuBroadcastNestedLoopJoinExec can fail if there are no columns|
+|[#356](https://github.com/NVIDIA/spark-rapids/issues/356)|[BUG] Integration cache test for BroadcastNestedLoopJoin failure|
+|[#280](https://github.com/NVIDIA/spark-rapids/issues/280)|[BUG] Full Outer Join does not work on nullable keys|
+|[#149](https://github.com/NVIDIA/spark-rapids/issues/149)|[BUG] Spark driver fails to load native libs when running on node without CUDA|
+
+### PRs
+|||
+|:---|:---|
+|[#826](https://github.com/NVIDIA/spark-rapids/pull/826)|Fix link to cudf-0.15-cuda11.jar|
+|[#815](https://github.com/NVIDIA/spark-rapids/pull/815)|Update documentation for Scala UDFs in 0.2 since you need two things|
+|[#802](https://github.com/NVIDIA/spark-rapids/pull/802)|Update 0.2 CHANGELOG|
+|[#793](https://github.com/NVIDIA/spark-rapids/pull/793)|Update Jenkins scripts for release|
+|[#798](https://github.com/NVIDIA/spark-rapids/pull/798)|Fix shims provider override config not being seen by executors|
+|[#785](https://github.com/NVIDIA/spark-rapids/pull/785)|Make shuffle run on CPU if we do a join where we read from bucketed table|
+|[#765](https://github.com/NVIDIA/spark-rapids/pull/765)|Add config to override shims provider class|
+|[#759](https://github.com/NVIDIA/spark-rapids/pull/759)|Add CHANGELOG for release 0.2|
+|[#758](https://github.com/NVIDIA/spark-rapids/pull/758)|Skip the udf test that fails periodically.|
+|[#752](https://github.com/NVIDIA/spark-rapids/pull/752)|Fix snapshot plugin jar version in docs|
+|[#751](https://github.com/NVIDIA/spark-rapids/pull/751)|Correct the channel for cudf installation|
+|[#754](https://github.com/NVIDIA/spark-rapids/pull/754)|Filter nulls from joins where possible to improve performance|
+|[#732](https://github.com/NVIDIA/spark-rapids/pull/732)|Add a timeout for RapidsShuffleIterator to prevent jobs to hang infin…|
+|[#637](https://github.com/NVIDIA/spark-rapids/pull/637)|Documentation changes for 0.2 release |
+|[#747](https://github.com/NVIDIA/spark-rapids/pull/747)|Disable udf tests that fail periodically|
+|[#745](https://github.com/NVIDIA/spark-rapids/pull/745)|Revert Null Join Filter|
+|[#741](https://github.com/NVIDIA/spark-rapids/pull/741)|Fix issue with parquet partitioned reads|
+|[#733](https://github.com/NVIDIA/spark-rapids/pull/733)|Remove GPU Types from github|
+|[#720](https://github.com/NVIDIA/spark-rapids/pull/720)|Stop removing GpuCoalesceBatches from non-AQE queries when AQE is enabled|
+|[#729](https://github.com/NVIDIA/spark-rapids/pull/729)|Fix collect time metric in CoalesceBatches|
+|[#640](https://github.com/NVIDIA/spark-rapids/pull/640)|Support running Pandas UDFs on GPUs in Python processes.|
+|[#721](https://github.com/NVIDIA/spark-rapids/pull/721)|Add some more checks to databricks build scripts|
+|[#714](https://github.com/NVIDIA/spark-rapids/pull/714)|Move spark 3.0.1-shims out of snapshot-shims|
+|[#711](https://github.com/NVIDIA/spark-rapids/pull/711)|fix blossom checkout repo|
+|[#709](https://github.com/NVIDIA/spark-rapids/pull/709)|[BUG] fix unexpected indentation issue in blossom yml|
+|[#642](https://github.com/NVIDIA/spark-rapids/pull/642)|Init workflow for blossom-ci|
+|[#705](https://github.com/NVIDIA/spark-rapids/pull/705)|Enable configuration check for cast string to timestamp|
+|[#702](https://github.com/NVIDIA/spark-rapids/pull/702)|Update slack channel for Jenkins builds|
+|[#701](https://github.com/NVIDIA/spark-rapids/pull/701)|fix checkout-ref for automerge|
+|[#695](https://github.com/NVIDIA/spark-rapids/pull/695)|Fix spark-3.0.1 shim to be released|
+|[#668](https://github.com/NVIDIA/spark-rapids/pull/668)|refactor automerge to support merge for protected branch|
+|[#687](https://github.com/NVIDIA/spark-rapids/pull/687)|Include the UDF compiler in the dist jar|
+|[#689](https://github.com/NVIDIA/spark-rapids/pull/689)|Change shims dependency to spark-3.0.1|
+|[#677](https://github.com/NVIDIA/spark-rapids/pull/677)|Use multi-threaded parquet read with small files|
+|[#638](https://github.com/NVIDIA/spark-rapids/pull/638)|Add Parquet-based cache serializer|
+|[#613](https://github.com/NVIDIA/spark-rapids/pull/613)|Enable UCX + AQE|
+|[#684](https://github.com/NVIDIA/spark-rapids/pull/684)|Enable test for literal string values in a select|
+|[#686](https://github.com/NVIDIA/spark-rapids/pull/686)|Remove sorts when replacing sort aggregate if possible|
+|[#675](https://github.com/NVIDIA/spark-rapids/pull/675)|Added TimeAdd|
+|[#645](https://github.com/NVIDIA/spark-rapids/pull/645)|[window] Add GpuWindowExec requiredChildOrdering|
+|[#676](https://github.com/NVIDIA/spark-rapids/pull/676)|fixUpJoinConsistency rule now works when AQE is enabled|
+|[#683](https://github.com/NVIDIA/spark-rapids/pull/683)|Fix issues with canonicalization of WrappedAggFunction|
+|[#682](https://github.com/NVIDIA/spark-rapids/pull/682)|Fix path to start-slave.sh script in docs|
+|[#673](https://github.com/NVIDIA/spark-rapids/pull/673)|Increase build timeouts on nightly and premerge builds|
+|[#648](https://github.com/NVIDIA/spark-rapids/pull/648)|add signoff-check use github actions|
+|[#593](https://github.com/NVIDIA/spark-rapids/pull/593)|Add support for isNaN and datetime related instructions in UDF compiler|
+|[#666](https://github.com/NVIDIA/spark-rapids/pull/666)|[window] Disable GPU for COUNT(exp) queries|
+|[#655](https://github.com/NVIDIA/spark-rapids/pull/655)|Implement AQE unit test for InsertAdaptiveSparkPlan|
+|[#614](https://github.com/NVIDIA/spark-rapids/pull/614)|Fix for aggregation with multiple distinct and non distinct functions|
+|[#657](https://github.com/NVIDIA/spark-rapids/pull/657)|Fix verify build after integration tests are run|
+|[#660](https://github.com/NVIDIA/spark-rapids/pull/660)|Add in neverReplaceExec and several rules for it|
+|[#639](https://github.com/NVIDIA/spark-rapids/pull/639)|BooleanType test shouldn't xfail|
+|[#652](https://github.com/NVIDIA/spark-rapids/pull/652)|Mark UVM config as internal until supported|
+|[#653](https://github.com/NVIDIA/spark-rapids/pull/653)|Move to the cudf-0.15 release|
+|[#647](https://github.com/NVIDIA/spark-rapids/pull/647)|Improve warnings about AQE nodes not supported on GPU|
+|[#646](https://github.com/NVIDIA/spark-rapids/pull/646)|Stop reporting zero metrics for GpuCustomShuffleReader|
+|[#644](https://github.com/NVIDIA/spark-rapids/pull/644)|Small fix for race in catalog where a buffer could get spilled while …|
+|[#623](https://github.com/NVIDIA/spark-rapids/pull/623)|Fix issues with canonicalization|
+|[#599](https://github.com/NVIDIA/spark-rapids/pull/599)|[FEA] changelog generator|
+|[#563](https://github.com/NVIDIA/spark-rapids/pull/563)|cudf and spark version info in artifacts|
+|[#633](https://github.com/NVIDIA/spark-rapids/pull/633)|Fix leak if RebaseHelper throws during Parquet read|
+|[#632](https://github.com/NVIDIA/spark-rapids/pull/632)|Copy function isSearchableType from Spark because signature changed in 3.0.1|
+|[#583](https://github.com/NVIDIA/spark-rapids/pull/583)|Add udf compiler unit tests|
+|[#617](https://github.com/NVIDIA/spark-rapids/pull/617)|Documentation updates for branch 0.2|
+|[#616](https://github.com/NVIDIA/spark-rapids/pull/616)|Add config to reserve GPU memory|
+|[#612](https://github.com/NVIDIA/spark-rapids/pull/612)|[REVIEW] Fix incorrect output from averages with filters in partial only mode|
+|[#609](https://github.com/NVIDIA/spark-rapids/pull/609)|fix minor issues with instructions for building ucx|
+|[#611](https://github.com/NVIDIA/spark-rapids/pull/611)|Added in profile to enable shims for SNAPSHOT releases|
+|[#595](https://github.com/NVIDIA/spark-rapids/pull/595)|Parquet small file reading optimization|
+|[#582](https://github.com/NVIDIA/spark-rapids/pull/582)|fix #579 Auto-merge between branches|
+|[#536](https://github.com/NVIDIA/spark-rapids/pull/536)|Add test for skewed join optimization when AQE is enabled|
+|[#603](https://github.com/NVIDIA/spark-rapids/pull/603)|Fix data size metric always 0 when using RAPIDS shuffle|
+|[#600](https://github.com/NVIDIA/spark-rapids/pull/600)|Fix calculation of string data for compressed batches|
+|[#597](https://github.com/NVIDIA/spark-rapids/pull/597)|Remove the xfail for parquet test_read_merge_schema on Databricks|
+|[#591](https://github.com/NVIDIA/spark-rapids/pull/591)|Add ucx license in NOTICE-binary|
+|[#596](https://github.com/NVIDIA/spark-rapids/pull/596)|Add Spark 3.0.2 to Shim layer|
+|[#594](https://github.com/NVIDIA/spark-rapids/pull/594)|Filter nulls from joins where possible to improve performance.|
+|[#590](https://github.com/NVIDIA/spark-rapids/pull/590)|Move GpuParquetScan/GpuOrcScan into Shim|
+|[#588](https://github.com/NVIDIA/spark-rapids/pull/588)|xfail the tpch spark 3.1.0 tests that fail|
+|[#572](https://github.com/NVIDIA/spark-rapids/pull/572)|Update buffer store to return compressed batches directly, add compression NVTX ranges|
+|[#558](https://github.com/NVIDIA/spark-rapids/pull/558)|Fix unit tests when AQE is enabled|
+|[#580](https://github.com/NVIDIA/spark-rapids/pull/580)|xfail the Spark 3.1.0 integration tests that fail |
+|[#565](https://github.com/NVIDIA/spark-rapids/pull/565)|Minor improvements to TPC-DS benchmarking code|
+|[#567](https://github.com/NVIDIA/spark-rapids/pull/567)|Explicitly disable AQE in one test|
+|[#571](https://github.com/NVIDIA/spark-rapids/pull/571)|Fix Databricks shim layer for GpuFileSourceScanExec and GpuBroadcastExchangeExec|
+|[#564](https://github.com/NVIDIA/spark-rapids/pull/564)|Add GPU decode time metric to scans|
+|[#562](https://github.com/NVIDIA/spark-rapids/pull/562)|getCatalog can be called from the driver, and can return null|
+|[#555](https://github.com/NVIDIA/spark-rapids/pull/555)|Fix build warnings for ColumnViewAccess|
+|[#560](https://github.com/NVIDIA/spark-rapids/pull/560)|Fix databricks build for AQE support|
+|[#557](https://github.com/NVIDIA/spark-rapids/pull/557)|Fix tests failing on Spark 3.1|
+|[#547](https://github.com/NVIDIA/spark-rapids/pull/547)|Add GPU metrics to GpuFileSourceScanExec|
+|[#462](https://github.com/NVIDIA/spark-rapids/pull/462)|Implement optimized AQE support so that exchanges run on GPU where possible|
+|[#550](https://github.com/NVIDIA/spark-rapids/pull/550)|Document Parquet and ORC compression support|
+|[#539](https://github.com/NVIDIA/spark-rapids/pull/539)|Update script to audit multiple Spark versions|
+|[#543](https://github.com/NVIDIA/spark-rapids/pull/543)|Add metrics to GpuUnion operator|
+|[#549](https://github.com/NVIDIA/spark-rapids/pull/549)|Move spark shim properties to top level pom|
+|[#497](https://github.com/NVIDIA/spark-rapids/pull/497)|Add UDF compiler implementations|
+|[#487](https://github.com/NVIDIA/spark-rapids/pull/487)|Add framework for batch compression of shuffle partitions|
+|[#544](https://github.com/NVIDIA/spark-rapids/pull/544)|Add in driverExtraClassPath for standalone mode docs|
+|[#546](https://github.com/NVIDIA/spark-rapids/pull/546)|Fix Spark 3.1.0 shim build error in GpuHashJoin|
+|[#537](https://github.com/NVIDIA/spark-rapids/pull/537)|Use fresh SparkSession when capturing to avoid late capture of previous query|
+|[#538](https://github.com/NVIDIA/spark-rapids/pull/538)|Revert "Temporary workaround for RMM initial pool size bug (#530)"|
+|[#517](https://github.com/NVIDIA/spark-rapids/pull/517)|Add config to limit maximum RMM pool size|
+|[#527](https://github.com/NVIDIA/spark-rapids/pull/527)|Add support for split and getArrayIndex|
+|[#534](https://github.com/NVIDIA/spark-rapids/pull/534)|Fixes bugs around GpuShuffleEnv initialization|
+|[#529](https://github.com/NVIDIA/spark-rapids/pull/529)|[BUG] Degenerate table metas were not getting copied to the heap|
+|[#530](https://github.com/NVIDIA/spark-rapids/pull/530)|Temporary workaround for RMM initial pool size bug|
+|[#526](https://github.com/NVIDIA/spark-rapids/pull/526)|Fix bug with nullability reporting in GpuFilterExec|
+|[#521](https://github.com/NVIDIA/spark-rapids/pull/521)|Fix typo with databricks shim classname SparkShimServiceProvider|
+|[#522](https://github.com/NVIDIA/spark-rapids/pull/522)|Use SQLConf instead of SparkConf when looking up SQL configs|
+|[#518](https://github.com/NVIDIA/spark-rapids/pull/518)|Fix init order issue in GpuShuffleEnv when RAPIDS shuffle configured|
+|[#514](https://github.com/NVIDIA/spark-rapids/pull/514)|Added clarification of RegExpReplace, DateDiff, made descriptive text consistent|
+|[#506](https://github.com/NVIDIA/spark-rapids/pull/506)|Add in basic support for running tpcds like queries|
+|[#504](https://github.com/NVIDIA/spark-rapids/pull/504)|Add ability to ignore tests depending on spark shim version|
+|[#503](https://github.com/NVIDIA/spark-rapids/pull/503)|Remove unused async buffer spill support|
+|[#501](https://github.com/NVIDIA/spark-rapids/pull/501)|disable codegen in 3.1 shim for hash join|
+|[#466](https://github.com/NVIDIA/spark-rapids/pull/466)|Optimize and fix Api validation script|
+|[#481](https://github.com/NVIDIA/spark-rapids/pull/481)|Codeowners|
+|[#439](https://github.com/NVIDIA/spark-rapids/pull/439)|Check a PR has been committed using git signoff|
+|[#319](https://github.com/NVIDIA/spark-rapids/pull/319)|Update partitioning logic in ShuffledBatchRDD|
+|[#491](https://github.com/NVIDIA/spark-rapids/pull/491)|Temporarily ignore AQE integration tests|
+|[#490](https://github.com/NVIDIA/spark-rapids/pull/490)|Fix Spark 3.1.0 build for HashJoin changes|
+|[#482](https://github.com/NVIDIA/spark-rapids/pull/482)|Prevent bad practice in python tests|
+|[#485](https://github.com/NVIDIA/spark-rapids/pull/485)|Show plan in assertion message if test fails|
+|[#480](https://github.com/NVIDIA/spark-rapids/pull/480)|Fix link from README to getting-started.md|
+|[#448](https://github.com/NVIDIA/spark-rapids/pull/448)|Preliminary support for keeping broadcast exchanges on GPU when AQE is enabled|
+|[#478](https://github.com/NVIDIA/spark-rapids/pull/478)|Fall back to CPU for binary as string in parquet|
+|[#477](https://github.com/NVIDIA/spark-rapids/pull/477)|Fix special case joins in broadcast nested loop join|
+|[#469](https://github.com/NVIDIA/spark-rapids/pull/469)|Update HashAggregateSuite to work with AQE|
+|[#475](https://github.com/NVIDIA/spark-rapids/pull/475)|Udf compiler pom followup|
+|[#434](https://github.com/NVIDIA/spark-rapids/pull/434)|Add UDF compiler skeleton|
+|[#474](https://github.com/NVIDIA/spark-rapids/pull/474)|Re-enable noscaladoc check|
+|[#461](https://github.com/NVIDIA/spark-rapids/pull/461)|Fix comments style to pass scala style check|
+|[#468](https://github.com/NVIDIA/spark-rapids/pull/468)|fix broken link|
+|[#456](https://github.com/NVIDIA/spark-rapids/pull/456)|Add closeOnExcept to clean up code that closes resources only on exceptions|
+|[#464](https://github.com/NVIDIA/spark-rapids/pull/464)|Turn off noscaladoc rule until codebase is fixed|
+|[#449](https://github.com/NVIDIA/spark-rapids/pull/449)|Enforce NoScalaDoc rule in scalastyle checks|
+|[#450](https://github.com/NVIDIA/spark-rapids/pull/450)|Enable scalastyle for shuffle plugin|
+|[#451](https://github.com/NVIDIA/spark-rapids/pull/451)|Databricks remove unneeded files and fix build to not fail on rm when file missing|
+|[#442](https://github.com/NVIDIA/spark-rapids/pull/442)|Shim layer support for Spark 3.0.0 Databricks|
+|[#447](https://github.com/NVIDIA/spark-rapids/pull/447)|Add scalastyle plugin to shim module|
+|[#426](https://github.com/NVIDIA/spark-rapids/pull/426)|Update BufferMeta to support multiple codec buffers per table|
+|[#440](https://github.com/NVIDIA/spark-rapids/pull/440)|Run mortgage test both with AQE on and off|
+|[#445](https://github.com/NVIDIA/spark-rapids/pull/445)|Added in StringRPad and StringLPad|
+|[#422](https://github.com/NVIDIA/spark-rapids/pull/422)|Documentation updates|
+|[#437](https://github.com/NVIDIA/spark-rapids/pull/437)|Fix bug with InSet and Strings|
+|[#435](https://github.com/NVIDIA/spark-rapids/pull/435)|Add in checks for Parquet LEGACY date/time rebase|
+|[#432](https://github.com/NVIDIA/spark-rapids/pull/432)|Fix batch use-after-close in partitioning, shuffle env init|
+|[#423](https://github.com/NVIDIA/spark-rapids/pull/423)|Fix duplicates includes in assembly jar|
+|[#418](https://github.com/NVIDIA/spark-rapids/pull/418)|CI Add unit tests running for Spark 3.0.1|
+|[#421](https://github.com/NVIDIA/spark-rapids/pull/421)|Make it easier to run TPCxBB benchmarks from spark shell|
+|[#413](https://github.com/NVIDIA/spark-rapids/pull/413)|Fix download link|
+|[#414](https://github.com/NVIDIA/spark-rapids/pull/414)|Shim Layer to support multiple Spark versions |
+|[#406](https://github.com/NVIDIA/spark-rapids/pull/406)|Update cast handling to deal with new libcudf casting limitations|
+|[#405](https://github.com/NVIDIA/spark-rapids/pull/405)|Change slave->worker|
+|[#395](https://github.com/NVIDIA/spark-rapids/pull/395)|Databricks doc updates|
+|[#401](https://github.com/NVIDIA/spark-rapids/pull/401)|Extended the FAQ|
+|[#398](https://github.com/NVIDIA/spark-rapids/pull/398)|Add tests for GpuPartition|
+|[#352](https://github.com/NVIDIA/spark-rapids/pull/352)|Change spark tgz package name|
+|[#397](https://github.com/NVIDIA/spark-rapids/pull/397)|Fix small bug in ShuffleBufferCatalog.hasActiveShuffle|
+|[#286](https://github.com/NVIDIA/spark-rapids/pull/286)|[REVIEW] Updated join tests for cache|
+|[#393](https://github.com/NVIDIA/spark-rapids/pull/393)|Contributor license agreement|
+|[#389](https://github.com/NVIDIA/spark-rapids/pull/389)|Added in support for RangeExec|
+|[#390](https://github.com/NVIDIA/spark-rapids/pull/390)|Ucx getting started|
+|[#391](https://github.com/NVIDIA/spark-rapids/pull/391)|Hide slack channel in Jenkins scripts|
+|[#387](https://github.com/NVIDIA/spark-rapids/pull/387)|Remove the term whitelist|
+|[#365](https://github.com/NVIDIA/spark-rapids/pull/365)|[REVIEW] Timesub tests|
+|[#383](https://github.com/NVIDIA/spark-rapids/pull/383)|Test utility to compare SQL query results between CPU and GPU|
+|[#380](https://github.com/NVIDIA/spark-rapids/pull/380)|Fix databricks notebook link|
+|[#378](https://github.com/NVIDIA/spark-rapids/pull/378)|Added in FAQ and fixed spelling|
+|[#377](https://github.com/NVIDIA/spark-rapids/pull/377)|Update heading in configs.md|
+|[#373](https://github.com/NVIDIA/spark-rapids/pull/373)|Modifying branch name to conform with rapidsai branch name change|
+|[#376](https://github.com/NVIDIA/spark-rapids/pull/376)|Add our session extension correctly if there are other extensions configured|
+|[#374](https://github.com/NVIDIA/spark-rapids/pull/374)|Fix rat issue for notebooks|
+|[#364](https://github.com/NVIDIA/spark-rapids/pull/364)|Update Databricks patch for changes to GpuSortMergeJoin|
+|[#371](https://github.com/NVIDIA/spark-rapids/pull/371)|fix typo and use regional bucket per GCP's update|
+|[#359](https://github.com/NVIDIA/spark-rapids/pull/359)|Karthik changes|
+|[#353](https://github.com/NVIDIA/spark-rapids/pull/353)|Fix broadcast nested loop join for the no column case|
+|[#313](https://github.com/NVIDIA/spark-rapids/pull/313)|Additional tests for broadcast hash join|
+|[#342](https://github.com/NVIDIA/spark-rapids/pull/342)|Implement build-side rules for shuffle hash join|
+|[#349](https://github.com/NVIDIA/spark-rapids/pull/349)|Updated join code to treat null equality properly|
+|[#335](https://github.com/NVIDIA/spark-rapids/pull/335)|Integration tests on spark 3.0.1-SNAPSHOT & 3.1.0-SNAPSHOT|
+|[#346](https://github.com/NVIDIA/spark-rapids/pull/346)|Update the Title Header for Fine Tuning|
+|[#344](https://github.com/NVIDIA/spark-rapids/pull/344)|Fix small typo in readme|
+|[#331](https://github.com/NVIDIA/spark-rapids/pull/331)|Adds iterator and client unit tests, and prepares for more fetch failure handling|
+|[#337](https://github.com/NVIDIA/spark-rapids/pull/337)|Fix Scala compile phase to allow Java classes referencing Scala classes|
+|[#332](https://github.com/NVIDIA/spark-rapids/pull/332)|Match GPU overwritten functions with SQL functions from FunctionRegistry|
+|[#339](https://github.com/NVIDIA/spark-rapids/pull/339)|Fix databricks build|
+|[#338](https://github.com/NVIDIA/spark-rapids/pull/338)|Move GpuPartitioning to a separate file|
+|[#310](https://github.com/NVIDIA/spark-rapids/pull/310)|Update release Jenkinsfile for Databricks|
+|[#330](https://github.com/NVIDIA/spark-rapids/pull/330)|Hide private info in Jenkins scripts|
+|[#324](https://github.com/NVIDIA/spark-rapids/pull/324)|Add in basic support for GpuCartesianProductExec|
+|[#328](https://github.com/NVIDIA/spark-rapids/pull/328)|Enable slack notification for Databricks build|
+|[#321](https://github.com/NVIDIA/spark-rapids/pull/321)|update databricks patch for GpuBroadcastNestedLoopJoinExec|
+|[#322](https://github.com/NVIDIA/spark-rapids/pull/322)|Add oss.sonatype.org to download the cudf jar|
+|[#320](https://github.com/NVIDIA/spark-rapids/pull/320)|Don't mount passwd/group to the container|
+|[#258](https://github.com/NVIDIA/spark-rapids/pull/258)|Enable running TPCH tests with AQE enabled|
+|[#318](https://github.com/NVIDIA/spark-rapids/pull/318)|Build docker image with Dockerfile|
+|[#309](https://github.com/NVIDIA/spark-rapids/pull/309)|Update databricks patch to latest changes|
+|[#312](https://github.com/NVIDIA/spark-rapids/pull/312)|Trigger branch-0.2 integration test|
+|[#307](https://github.com/NVIDIA/spark-rapids/pull/307)|[Jenkins] Update the release script and Jenkinsfile|
+|[#304](https://github.com/NVIDIA/spark-rapids/pull/304)|[DOC][Minor] Fix typo in spark config name.|
+|[#303](https://github.com/NVIDIA/spark-rapids/pull/303)|Update compatibility doc for -0.0 issues|
+|[#301](https://github.com/NVIDIA/spark-rapids/pull/301)|Add info about branches in README.md|
+|[#296](https://github.com/NVIDIA/spark-rapids/pull/296)|Added in basic support for broadcast nested loop join|
+|[#297](https://github.com/NVIDIA/spark-rapids/pull/297)|Databricks CI improvements and support runtime env parameter to xfail certain tests|
+|[#292](https://github.com/NVIDIA/spark-rapids/pull/292)|Move artifacts version in version-def.sh|
+|[#254](https://github.com/NVIDIA/spark-rapids/pull/254)|Cleanup QA tests|
+|[#289](https://github.com/NVIDIA/spark-rapids/pull/289)|Clean up GpuCollectLimitMeta and add in metrics|
+|[#287](https://github.com/NVIDIA/spark-rapids/pull/287)|Add in support for right join and fix issues build right|
+|[#273](https://github.com/NVIDIA/spark-rapids/pull/273)|Added releases to the README.md|
+|[#285](https://github.com/NVIDIA/spark-rapids/pull/285)|modify run_pyspark_from_build.sh to be bash 3 friendly|
+|[#281](https://github.com/NVIDIA/spark-rapids/pull/281)|Add in support for Full Outer Join on non-null keys|
+|[#274](https://github.com/NVIDIA/spark-rapids/pull/274)|Add RapidsDiskStore tests|
+|[#259](https://github.com/NVIDIA/spark-rapids/pull/259)|Add RapidsHostMemoryStore tests|
+|[#282](https://github.com/NVIDIA/spark-rapids/pull/282)|Update Databricks patch for 0.2 branch|
+|[#261](https://github.com/NVIDIA/spark-rapids/pull/261)|Add conditional xfail test for DISTINCT aggregates with NaN|
+|[#263](https://github.com/NVIDIA/spark-rapids/pull/263)|More time ops|
+|[#256](https://github.com/NVIDIA/spark-rapids/pull/256)|Remove special cases for contains, startsWith, and endWith|
+|[#253](https://github.com/NVIDIA/spark-rapids/pull/253)|Remove GpuAttributeReference and GpuSortOrder|
+|[#271](https://github.com/NVIDIA/spark-rapids/pull/271)|Update the versions for 0.2.0 properly for the databricks build|
+|[#162](https://github.com/NVIDIA/spark-rapids/pull/162)|Integration tests for corner cases in window functions.|
+|[#264](https://github.com/NVIDIA/spark-rapids/pull/264)|Add a local mvn repo for nightly pipeline|
+|[#262](https://github.com/NVIDIA/spark-rapids/pull/262)|Refer to branch-0.2|
+|[#255](https://github.com/NVIDIA/spark-rapids/pull/255)|Revert change to make dependencies of shaded jar optional|
+|[#257](https://github.com/NVIDIA/spark-rapids/pull/257)|Fix link to RAPIDS cudf in index.md|
+|[#252](https://github.com/NVIDIA/spark-rapids/pull/252)|Update to 0.2.0-SNAPSHOT and cudf-0.15-SNAPSHOT|
+
+## Release 0.1
+
+### Features
+|||
+|:---|:---|
+|[#74](https://github.com/NVIDIA/spark-rapids/issues/74)|[FEA] Support ToUnixTimestamp|
+|[#21](https://github.com/NVIDIA/spark-rapids/issues/21)|[FEA] NormalizeNansAndZeros|
+|[#105](https://github.com/NVIDIA/spark-rapids/issues/105)|[FEA] integration tests for equi-joins|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#116](https://github.com/NVIDIA/spark-rapids/issues/116)|[BUG] calling replace with a NULL throws an exception|
+|[#168](https://github.com/NVIDIA/spark-rapids/issues/168)|[BUG] GpuUnitTests Date tests leak column vectors|
+|[#209](https://github.com/NVIDIA/spark-rapids/issues/209)|[BUG] Developers section in pom need to be updated|
+|[#204](https://github.com/NVIDIA/spark-rapids/issues/204)|[BUG] Code coverage docs are out of date|
+|[#154](https://github.com/NVIDIA/spark-rapids/issues/154)|[BUG] Incorrect output from partial-only averages with nulls|
+|[#61](https://github.com/NVIDIA/spark-rapids/issues/61)|[BUG] Cannot disable Parquet, ORC, CSV reading when using FileSourceScanExec|
+
+### PRs
+|||
+|:---|:---|
+|[#249](https://github.com/NVIDIA/spark-rapids/pull/249)|Compatability -> Compatibility|
+|[#247](https://github.com/NVIDIA/spark-rapids/pull/247)|Add index.md for default doc page, fix table formatting for configs|
+|[#241](https://github.com/NVIDIA/spark-rapids/pull/241)|Let default branch to master per the release rule|
+|[#177](https://github.com/NVIDIA/spark-rapids/pull/177)|Fixed leaks in unit test and use ColumnarBatch for testing|
+|[#243](https://github.com/NVIDIA/spark-rapids/pull/243)|Jenkins file for Databricks release|
+|[#225](https://github.com/NVIDIA/spark-rapids/pull/225)|Make internal project dependencies optional for shaded artifact|
+|[#242](https://github.com/NVIDIA/spark-rapids/pull/242)|Add site pages|
+|[#221](https://github.com/NVIDIA/spark-rapids/pull/221)|Databricks Build Support|
+|[#215](https://github.com/NVIDIA/spark-rapids/pull/215)|Remove CudfColumnVector|
+|[#213](https://github.com/NVIDIA/spark-rapids/pull/213)|Add RapidsDeviceMemoryStore tests|
+|[#214](https://github.com/NVIDIA/spark-rapids/pull/214)|[REVIEW] Test failure to pass Attribute as GpuAttribute|
+|[#211](https://github.com/NVIDIA/spark-rapids/pull/211)|Add project leads to pom developer list|
+|[#210](https://github.com/NVIDIA/spark-rapids/pull/210)|Updated coverage docs|
+|[#195](https://github.com/NVIDIA/spark-rapids/pull/195)|Support public release for plugin jar|
+|[#208](https://github.com/NVIDIA/spark-rapids/pull/208)|Remove unneeded comment from pom.xml|
+|[#191](https://github.com/NVIDIA/spark-rapids/pull/191)|WindowExec handle different spark distributions|
+|[#181](https://github.com/NVIDIA/spark-rapids/pull/181)|Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized|
+|[#196](https://github.com/NVIDIA/spark-rapids/pull/196)|Update Spark dependency to the released 3.0.0 artifacts|
+|[#206](https://github.com/NVIDIA/spark-rapids/pull/206)|Change groupID to 'com.nvidia' in IT scripts|
+|[#202](https://github.com/NVIDIA/spark-rapids/pull/202)|Fixed issue for contains when searching for an empty string|
+|[#201](https://github.com/NVIDIA/spark-rapids/pull/201)|Fix name of scan|
+|[#200](https://github.com/NVIDIA/spark-rapids/pull/200)|Fix issue with GpuAttributeReference not overriding references|
+|[#197](https://github.com/NVIDIA/spark-rapids/pull/197)|Fix metrics for writes|
+|[#186](https://github.com/NVIDIA/spark-rapids/pull/186)|Fixed issue with nullability on concat|
+|[#193](https://github.com/NVIDIA/spark-rapids/pull/193)|Add RapidsBufferCatalog tests|
+|[#188](https://github.com/NVIDIA/spark-rapids/pull/188)|rebrand to com.nvidia instead of ai.rapids|
+|[#189](https://github.com/NVIDIA/spark-rapids/pull/189)|Handle AggregateExpression having resultIds parameter instead of a single resultId|
+|[#190](https://github.com/NVIDIA/spark-rapids/pull/190)|FileSourceScanExec can have logicalRelation parameter on some distributions|
+|[#185](https://github.com/NVIDIA/spark-rapids/pull/185)|Update type of parameter of GpuExpandExec to make it consistent|
+|[#172](https://github.com/NVIDIA/spark-rapids/pull/172)|Merge qa test to integration test|
+|[#180](https://github.com/NVIDIA/spark-rapids/pull/180)|Add MetaUtils unit tests|
+|[#171](https://github.com/NVIDIA/spark-rapids/pull/171)|Cleanup scaladoc warnings about missing links|
+|[#176](https://github.com/NVIDIA/spark-rapids/pull/176)|Updated join tests to cover more data.|
+|[#169](https://github.com/NVIDIA/spark-rapids/pull/169)|Remove dependency on shaded Spark artifact|
+|[#174](https://github.com/NVIDIA/spark-rapids/pull/174)|Added in fallback tests|
+|[#165](https://github.com/NVIDIA/spark-rapids/pull/165)|Move input metadata tests to pyspark|
+|[#173](https://github.com/NVIDIA/spark-rapids/pull/173)|Fix setting local mode for tests|
+|[#160](https://github.com/NVIDIA/spark-rapids/pull/160)|Integration tests for normalizing NaN/zeroes.|
+|[#163](https://github.com/NVIDIA/spark-rapids/pull/163)|Ignore the order locally for repartition tests|
+|[#157](https://github.com/NVIDIA/spark-rapids/pull/157)|Add partial and final only hash aggregate tests and fix nulls corner case for Average|
+|[#159](https://github.com/NVIDIA/spark-rapids/pull/159)|Add integration tests for joins|
+|[#158](https://github.com/NVIDIA/spark-rapids/pull/158)|Orc merge schema fallback and FileScan format configs|
+|[#164](https://github.com/NVIDIA/spark-rapids/pull/164)|Fix compiler warnings|
+|[#152](https://github.com/NVIDIA/spark-rapids/pull/152)|Moved cudf to 0.14 for CI|
+|[#151](https://github.com/NVIDIA/spark-rapids/pull/151)|Switch CICD pipelines to Github|
+
+## Older Releases
+Changelog of older releases can be found at [docs/archives](/docs/archives)
diff --git a/docs/configs.md b/docs/configs.md
index d79514badf6..6fd15ac25b7 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -164,6 +164,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
spark.rapids.sql.expression.Atanh|`atanh`|Inverse hyperbolic tangent|true|None|
spark.rapids.sql.expression.AttributeReference| |References an input column|true|None|
spark.rapids.sql.expression.BRound|`bround`|Round an expression to d decimal places using HALF_EVEN rounding mode|true|None|
+spark.rapids.sql.expression.BitLength|`bit_length`|The bit length of string data|true|None|
spark.rapids.sql.expression.BitwiseAnd|`&`|Returns the bitwise AND of the operands|true|None|
spark.rapids.sql.expression.BitwiseNot|`~`|Returns the bitwise NOT of the operands|true|None|
spark.rapids.sql.expression.BitwiseOr|`\|`|Returns the bitwise OR of the operands|true|None|
@@ -255,6 +256,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
spark.rapids.sql.expression.NaNvl|`nanvl`|Evaluates to `left` iff left is not NaN, `right` otherwise|true|None|
spark.rapids.sql.expression.NamedLambdaVariable| |A parameter to a higher order SQL function|true|None|
spark.rapids.sql.expression.Not|`!`, `not`|Boolean not operator|true|None|
+spark.rapids.sql.expression.OctetLength|`octet_length`|The byte length of string data|true|None|
spark.rapids.sql.expression.Or|`or`|Logical OR|true|None|
spark.rapids.sql.expression.Pmod|`pmod`|Pmod|true|None|
spark.rapids.sql.expression.PosExplode|`posexplode_outer`, `posexplode`|Given an input array produces a sequence of rows for each value in the array|true|None|
diff --git a/docs/download.md b/docs/download.md
index a16a774d18e..7cb561a5ae6 100644
--- a/docs/download.md
+++ b/docs/download.md
@@ -18,6 +18,69 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub
that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started
guide](https://nvidia.github.io/spark-rapids/Getting-Started/) for more details.
+## Release v22.02.0
+Hardware Requirements:
+
+The plugin is tested on the following architectures:
+
+    GPU Models: NVIDIA V100, T4 and A2/A10/A30/A100 GPUs
+
+Software Requirements:
+
+    OS: Ubuntu 18.04, Ubuntu 20.04 or CentOS 7, CentOS 8
+
+    CUDA & NVIDIA Drivers*: 11.x & v450.80.02+
+
+    Apache Spark 3.0.1, 3.0.2, 3.0.3, 3.1.1, 3.1.2, 3.2.0, 3.2.1, Cloudera CDP 7.1.6, 7.1.7, Databricks 7.3 ML LTS or 9.1 ML LTS Runtime and GCP Dataproc 2.0
+
+    Python 3.6+, Scala 2.12, Java 8
+
+*Some hardware may have a minimum driver version greater than v450.80.02+. Check the GPU spec sheet
+for your hardware's minimum driver version.
+
+### Download v22.02.0
+* Download the [RAPIDS
+ Accelerator for Apache Spark 22.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.02.0/rapids-4-spark_2.12-22.02.0.jar)
+* Download the [RAPIDS cuDF 22.02.0 jar](https://repo1.maven.org/maven2/ai/rapids/cudf/22.02.0/cudf-22.02.0-cuda11.jar)
+
+This package is built against CUDA 11.5 and has [CUDA forward
+compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html) enabled. It is tested
+on V100, T4, A2, A10, A30 and A100 GPUs with CUDA 11.0-11.5. For those using other types of GPUs which
+do not have CUDA forward compatibility (for example, GeForce), CUDA 11.5 is required. Users will
+need to ensure the minimum driver (450.80.02) and CUDA toolkit are installed on each Spark node.
+
+### Verify signature
+* Download the [RAPIDS Accelerator for Apache Spark 22.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.02.0/rapids-4-spark_2.12-22.02.0.jar)
+ and [RAPIDS Accelerator for Apache Spark 22.02.0 jars.asc](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.02.0/rapids-4-spark_2.12-22.02.0.jar.asc)
+* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
+* Import the public key: `gpg --import PUB_KEY`
+* Verify the signature: `gpg --verify rapids-4-spark_2.12-22.02.0.jar.asc rapids-4-spark_2.12-22.02.0.jar`
+
+The output of a successful signature verification is:
+
+    gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) <sw-spark@nvidia.com>"
+
+### Release Notes
+New functionality and performance improvements for this release include:
+* Parquet reader and writer support for decimal precision up to 38 digits (128-bits)
+* Decimal 128-bits casting
+ * Casting of decimal 128-bits values in nested types
+ * Casting to String from decimal 128-bits
+ * Casting from String to decimal 128-bits
+* MIG on YARN support
+* GPU explain-only mode for Spark 3.x and 2.x (see the sketch after this list)
+* JSON reader support
+* Sequence function support
+* regexp_extract function support
+* Min and max on single-level struct
+* CreateMap updates and enable CreateMap by default
+* Cast from array to string
+* Add regular expression support to regexp_replace function
+* Support for conditional joins using libcudf's mixed join feature
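+
+As a quick illustration (not part of the shipped docs), below is a minimal PySpark sketch of the new explain-only mode together with the new `regexp_extract` support. The config name `spark.rapids.sql.mode=explainOnly` is assumed here (see the configs documentation for the authoritative name), and the RAPIDS Accelerator and cuDF jars must already be on the classpath:
+
+```python
+from pyspark.sql import SparkSession
+
+# Explain-only mode: the query still executes on the CPU, but the plugin reports
+# which operators would have been placed on the GPU.
+spark = (SparkSession.builder
+         .appName("rapids-explain-only-sketch")
+         .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
+         .config("spark.rapids.sql.mode", "explainOnly")  # assumed config name
+         .getOrCreate())
+
+# regexp_extract is newly supported on the GPU in this release.
+df = spark.createDataFrame([("a1b22c333",)], ["s"])
+df.selectExpr("regexp_extract(s, '([0-9]+)', 1) AS first_digits").explain()
+```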
+
+For a detailed list of changes, please refer to the
+[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md).
+
## Release v21.12.0
Hardware Requirements:
diff --git a/docs/supported_ops.md b/docs/supported_ops.md
index 52d7fc28468..2a6535befa0 100644
--- a/docs/supported_ops.md
+++ b/docs/supported_ops.md
[docs/supported_ops.md hunks: the HTML table markup was lost during extraction; only the cell text survived. The recoverable changes are the addition of support-matrix rows for `BitLength` (`bit_length`, "The bit length of string data") and `OctetLength` (`octet_length`, "The byte length of string data") in the project context (STRING input supported, BINARY not supported, INT result supported), plus repositioning of the repeated column-header rows (Expression, SQL Functions(s), Description, Notes, Context, Param/Output, and the per-type columns BOOLEAN through UDT) to account for the new entries.]
diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py
index 0aa1cb50b4d..9101ef56760 100644
--- a/integration_tests/src/main/python/csv_test.py
+++ b/integration_tests/src/main/python/csv_test.py
@@ -14,13 +14,13 @@
import pytest
-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_fallback_write
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_fallback_write, assert_cpu_and_gpu_are_equal_collect_with_capture
from conftest import get_non_gpu_allowed
from datetime import datetime, timezone
from data_gen import *
from marks import *
from pyspark.sql.types import *
-from spark_session import with_cpu_session
+from spark_session import with_cpu_session, is_before_spark_330
_acq_schema = StructType([
StructField('loan_id', LongType()),
@@ -405,3 +405,23 @@ def test_csv_save_as_table_fallback(spark_tmp_path, spark_tmp_table_factory):
lambda spark, path: spark.read.csv(path),
data_path,
'DataWritingCommandExec')
+
+@pytest.mark.skipif(is_before_spark_330(), reason='Hidden file metadata columns are a new feature of Spark 330')
+@allow_non_gpu(any = True)
+@pytest.mark.parametrize('metadata_column', ["file_path", "file_name", "file_size", "file_modification_time"])
+def test_csv_scan_with_hidden_metadata_fallback(spark_tmp_path, metadata_column):
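+    # Hidden _metadata columns are new in Spark 3.3.0 and are not yet supported on the GPU, so
+    # this scan is expected to fall back to the CPU FileSourceScanExec instead of GpuBatchScanExec.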
+ data_path = spark_tmp_path + "/hidden_metadata.csv"
+ with_cpu_session(lambda spark : spark.range(10) \
+ .selectExpr("id") \
+ .write \
+ .mode("overwrite") \
+ .csv(data_path))
+
+ def do_csv_scan(spark):
+ df = spark.read.csv(data_path).selectExpr("_c0", "_metadata.{}".format(metadata_column))
+ return df
+
+ assert_cpu_and_gpu_are_equal_collect_with_capture(
+ do_csv_scan,
+ exist_classes= "FileSourceScanExec",
+ non_exist_classes= "GpuBatchScanExec")
\ No newline at end of file
diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py
index 94289193485..e8414230b1b 100644
--- a/integration_tests/src/main/python/join_test.py
+++ b/integration_tests/src/main/python/join_test.py
@@ -15,7 +15,7 @@
import pytest
from pyspark.sql.functions import broadcast
from pyspark.sql.types import *
-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture
from conftest import is_databricks_runtime, is_emr_runtime
from data_gen import *
from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan
@@ -361,7 +361,7 @@ def do_join(spark):
def test_right_broadcast_nested_loop_join_with_ast_condition(data_gen, join_type, batch_size):
def do_join(spark):
left, right = create_df(spark, data_gen, 50, 25)
- # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294
+ # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294
# if the sizes are large enough to have both 0.0 and -0.0 show up 500 and 250
# but these take a long time to verify so we run with smaller numbers by default
# that do not expose the error
@@ -651,7 +651,7 @@ def do_join(spark):
if (cache_side == 'cache_left'):
# Try to force the shuffle to be split between CPU and GPU for the join
- # by default if the operation after the shuffle is not on the GPU then
+ # by default if the operation after the shuffle is not on the GPU then
# don't do a GPU shuffle, so do something simple after the repartition
# to make sure that the GPU shuffle is used.
left = left.repartition('a').selectExpr('b + 1 as b', 'a').cache()
@@ -659,7 +659,7 @@ def do_join(spark):
else:
#cache_right
# Try to force the shuffle to be split between CPU and GPU for the join
- # by default if the operation after the shuffle is not on the GPU then
+ # by default if the operation after the shuffle is not on the GPU then
# don't do a GPU shuffle, so do something simple after the repartition
# to make sure that the GPU shuffle is used.
right = right.repartition('r_a').selectExpr('c + 1 as c', 'r_a').cache()
@@ -785,3 +785,37 @@ def do_join(spark):
return spark.sql("select a.* from {} a, {} b where a.name=b.name".format(
resultdf_name, resultdf_name))
assert_gpu_and_cpu_are_equal_collect(do_join)
+
+# ExistenceJoin occurs in the context of existential subqueries (which are normally rewritten to a
+# SemiJoin) when there is an additional condition that may qualify left records even though they
+# have no join partner records on the right.
+#
+# In that case the query is rewritten roughly as a LeftOuter join with an additional Boolean column
+# "exists" added, which feeds into a filter "exists OR someOtherPredicate".
+# If the condition is something like an AND, the result is a subset of a SemiJoin, and
+# the optimizer won't use ExistenceJoin.
+@ignore_order(local=True)
+@pytest.mark.parametrize(
+ "allowFallback", [
+ pytest.param('true',
+ marks=pytest.mark.allow_non_gpu('SortMergeJoinExec')),
+ pytest.param('false',
+ marks=pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/589"))
+ ], ids=idfn
+)
+def test_existence_join(allowFallback, spark_tmp_table_factory):
+ leftTable = spark_tmp_table_factory.get()
+ rightTable = spark_tmp_table_factory.get()
+ def do_join(spark):
+ # create non-overlapping ranges to have a mix of exists=true and exists=false
+ spark.createDataFrame([v] for v in range(2, 10)).createOrReplaceTempView(leftTable)
+ spark.createDataFrame([v] for v in range(0, 8)).createOrReplaceTempView(rightTable)
+ res = spark.sql((
+ "select * "
+ "from {} as l "
+ "where l._1 < 0 "
+ " OR l._1 in (select * from {} as r)"
+ ).format(leftTable, rightTable))
+ return res
+ assert_cpu_and_gpu_are_equal_collect_with_capture(do_join, r".+Join ExistenceJoin\(exists#[0-9]+\).+")
+
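To make the plan shape that `test_existence_join` asserts on more concrete, here is a minimal Scala sketch of the same rewrite (illustrative only: the view names and value ranges are assumptions, not part of the patch):

```scala
import org.apache.spark.sql.SparkSession

object ExistenceJoinSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("existence-join-sketch").getOrCreate()
  import spark.implicits._

  // Non-overlapping ranges give a mix of exists=true and exists=false rows.
  (2 until 10).toDF("v").createOrReplaceTempView("l")
  (0 until 8).toDF("v").createOrReplaceTempView("r")

  // The extra OR branch keeps Catalyst from rewriting the IN-subquery to a plain
  // left semi join, so it plans an ExistenceJoin that appends a Boolean `exists`
  // column which the surviving filter consumes.
  val df = spark.sql("SELECT * FROM l WHERE l.v < 0 OR l.v IN (SELECT v FROM r)")
  df.explain() // expect something like `... Join ExistenceJoin(exists#...), ...` in the plan
  spark.stop()
}
```

The `l.v < 0 OR ...` disjunction is the piece that prevents a plain semi-join rewrite, matching the regular expression the test captures against.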
diff --git a/integration_tests/src/main/python/map_test.py b/integration_tests/src/main/python/map_test.py
index 72aea6fbd2e..dae5c408cdd 100644
--- a/integration_tests/src/main/python/map_test.py
+++ b/integration_tests/src/main/python/map_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect
from data_gen import *
from marks import incompat, allow_non_gpu
-from spark_session import is_before_spark_311
+from spark_session import is_before_spark_311, is_before_spark_330
from pyspark.sql.types import *
from pyspark.sql.types import IntegralType
import pyspark.sql.functions as f
@@ -145,12 +145,13 @@ def test_map_scalar_project():
@pytest.mark.skipif(is_before_spark_311(), reason="Only in Spark 3.1.1 + ANSI mode, map key throws on no such element")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
def test_simple_get_map_value_ansi_fail(data_gen):
+ message = "org.apache.spark.SparkNoSuchElementException" if not is_before_spark_330() else "java.util.NoSuchElementException"
assert_gpu_and_cpu_error(
lambda spark: unary_op_df(spark, data_gen).selectExpr(
'a["NOT_FOUND"]').collect(),
conf={'spark.sql.ansi.enabled':True,
'spark.sql.legacy.allowNegativeScaleOfDecimal': True},
- error_message='java.util.NoSuchElementException')
+ error_message=message)
@pytest.mark.skipif(not is_before_spark_311(), reason="For Spark before 3.1.1 + ANSI mode, null will be returned instead of an exception if key is not found")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
@@ -176,12 +177,13 @@ def test_simple_element_at_map(data_gen):
@pytest.mark.skipif(is_before_spark_311(), reason="Only in Spark 3.1.1 + ANSI mode, map key throws on no such element")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
def test_map_element_at_ansi_fail(data_gen):
+ message = "org.apache.spark.SparkNoSuchElementException" if not is_before_spark_330() else "java.util.NoSuchElementException"
assert_gpu_and_cpu_error(
lambda spark: unary_op_df(spark, data_gen).selectExpr(
'element_at(a, "NOT_FOUND")').collect(),
conf={'spark.sql.ansi.enabled':True,
'spark.sql.legacy.allowNegativeScaleOfDecimal': True},
- error_message='java.util.NoSuchElementException')
+ error_message=message)
@pytest.mark.skipif(not is_before_spark_311(), reason="For Spark before 3.1.1 + ANSI mode, null will be returned instead of an exception if key is not found")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py
index e644e32a180..ba5fca0711c 100644
--- a/integration_tests/src/main/python/orc_test.py
+++ b/integration_tests/src/main/python/orc_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@
import pytest
-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture
from data_gen import *
from marks import *
from pyspark.sql.types import *
-from spark_session import with_cpu_session
+from spark_session import with_cpu_session, is_before_spark_330
from parquet_test import _nested_pruning_schemas
pytestmark = pytest.mark.nightly_resource_consuming_test
@@ -444,3 +444,24 @@ def test_read_with_more_columns(spark_tmp_path, orc_gen, reader_confs, v1_enable
assert_gpu_and_cpu_are_equal_collect(
lambda spark : spark.read.schema(rs).orc(data_path),
conf=all_confs)
+
+@pytest.mark.skipif(is_before_spark_330(), reason='Hidden file metadata columns are a new feature of Spark 330')
+@allow_non_gpu(any = True)
+@pytest.mark.parametrize('metadata_column', ["file_path", "file_name", "file_size", "file_modification_time"])
+def test_orc_scan_with_hidden_metadata_fallback(spark_tmp_path, metadata_column):
+ data_path = spark_tmp_path + "/hidden_metadata.orc"
+ with_cpu_session(lambda spark : spark.range(10) \
+ .selectExpr("id", "id % 3 as p") \
+ .write \
+ .partitionBy("p") \
+ .mode("overwrite") \
+ .orc(data_path))
+
+ def do_orc_scan(spark):
+ df = spark.read.orc(data_path).selectExpr("id", "_metadata.{}".format(metadata_column))
+ return df
+
+ assert_cpu_and_gpu_are_equal_collect_with_capture(
+ do_orc_scan,
+ exist_classes= "FileSourceScanExec",
+ non_exist_classes= "GpuBatchScanExec")
\ No newline at end of file
diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py
index 516cef3e7a5..7d1d9e1a6b3 100644
--- a/integration_tests/src/main/python/parquet_test.py
+++ b/integration_tests/src/main/python/parquet_test.py
@@ -18,6 +18,7 @@
from data_gen import *
from marks import *
from pyspark.sql.types import *
+from pyspark.sql.functions import *
from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330
def read_parquet_df(data_path):
@@ -728,3 +729,24 @@ def do_parquet_scan(spark):
exist_classes= "BatchScanExec",
non_exist_classes= "GpuBatchScanExec",
conf = conf_for_parquet_aggregate_pushdown)
+
+@pytest.mark.skipif(is_before_spark_330(), reason='Hidden file metadata columns are a new feature of Spark 330')
+@allow_non_gpu(any = True)
+@pytest.mark.parametrize('metadata_column', ["file_path", "file_name", "file_size", "file_modification_time"])
+def test_parquet_scan_with_hidden_metadata_fallback(spark_tmp_path, metadata_column):
+ data_path = spark_tmp_path + "/hidden_metadata.parquet"
+ with_cpu_session(lambda spark : spark.range(10) \
+ .selectExpr("id", "id % 3 as p") \
+ .write \
+ .partitionBy("p") \
+ .mode("overwrite") \
+ .parquet(data_path))
+
+ def do_parquet_scan(spark):
+ df = spark.read.parquet(data_path).selectExpr("id", "_metadata.{}".format(metadata_column))
+ return df
+
+ assert_cpu_and_gpu_are_equal_collect_with_capture(
+ do_parquet_scan,
+ exist_classes= "FileSourceScanExec",
+ non_exist_classes= "GpuBatchScanExec")
\ No newline at end of file
diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index 296a1bad26c..857ee4295c2 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -401,6 +401,12 @@ def test_length():
'CHAR_LENGTH(a)',
'CHARACTER_LENGTH(a)'))
+def test_byte_length():
+ gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}')
+ assert_gpu_and_cpu_are_equal_collect(
+ lambda spark: unary_op_df(spark, gen).selectExpr(
+ 'BIT_LENGTH(a)', 'OCTET_LENGTH(a)'))
+
@incompat
def test_initcap():
# Because we don't use the same unicode version we need to limit
@@ -712,4 +718,4 @@ def test_rlike_fallback_possessive_quantifier():
lambda spark: unary_op_df(spark, gen).selectExpr(
'a rlike "a*+"'),
'RLike',
- conf={'spark.rapids.sql.expression.RLike': 'true'})
\ No newline at end of file
+ conf={'spark.rapids.sql.expression.RLike': 'true'})
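For reference, the `BIT_LENGTH`/`OCTET_LENGTH` semantics that `test_byte_length` above exercises are Spark's usual byte-oriented ones; a small Scala sketch with assumed example values:

```scala
import org.apache.spark.sql.SparkSession

object ByteLengthSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("byte-length-sketch").getOrCreate()
  import spark.implicits._

  // BIT_LENGTH and OCTET_LENGTH count bits/bytes of the UTF-8 encoding, so
  // multi-byte characters diverge from CHAR_LENGTH.
  val df = Seq("abc", "é").toDF("a")
  df.selectExpr("a", "BIT_LENGTH(a)", "OCTET_LENGTH(a)", "CHAR_LENGTH(a)").show()
  // "abc" -> 24 bits, 3 bytes, 3 chars; "é" -> 16 bits, 2 bytes, 1 char
  spark.stop()
}
```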
diff --git a/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala b/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala
index a9263c2ed36..145905df727 100644
--- a/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala
+++ b/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,16 +17,21 @@
package com.nvidia.spark.rapids.tests.mortgage
import com.nvidia.spark.rapids.ShimLoader
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.FunSuite
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
-class MortgageSparkSuite extends FunSuite with BeforeAndAfterAll {
+class MortgageSparkSuite extends FunSuite {
/**
* This is intentionally a def rather than a val so that scalatest uses the correct value (from
* this class or the derived class) when registering tests.
+ *
+ * @note You are likely to see device/host leaks from this test when using the
+   *   RAPIDS Shuffle Manager. The reason is a race between cuDF's MemoryCleaner
+   *   and the SparkContext shutdown. Because of this, cached shuffle buffers may not get
+   *   cleaned (on shuffle unregister) when the MemoryCleaner exits.
*/
def adaptiveQueryEnabled = false
@@ -63,11 +68,6 @@ class MortgageSparkSuite extends FunSuite with BeforeAndAfterAll {
builder.getOrCreate()
}
- // Close the session to avoid hanging after all cases are completed
- override def afterAll() = {
- session.close()
- }
-
test("extract mortgage data") {
val df = Run.csv(
session,
diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh
index 261dd6bec0a..ef8521aea95 100755
--- a/jenkins/databricks/build.sh
+++ b/jenkins/databricks/build.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -87,11 +87,19 @@ then
PARQUETHADOOPJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-hadoop--org.apache.parquet__parquet-hadoop__1.10.1-databricks9.jar
PARQUETCOMMONJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-common--org.apache.parquet__parquet-common__1.10.1-databricks9.jar
PARQUETCOLUMNJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-column--org.apache.parquet__parquet-column__1.10.1-databricks9.jar
+ ORC_CORE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-core--org.apache.orc__orc-core__1.5.12.jar
+ ORC_SHIM_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-shims--org.apache.orc__orc-shims__1.5.12.jar
+ ORC_MAPREDUCE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-mapreduce--org.apache.orc__orc-mapreduce__1.5.12.jar
else
PARQUETHADOOPJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-hadoop--org.apache.parquet__parquet-hadoop__1.10.1-databricks6.jar
PARQUETCOMMONJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-common--org.apache.parquet__parquet-common__1.10.1-databricks6.jar
PARQUETCOLUMNJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-column--org.apache.parquet__parquet-column__1.10.1-databricks6.jar
+ ORC_CORE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-core--org.apache.orc__orc-core__1.5.10.jar
+ ORC_SHIM_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-shims--org.apache.orc__orc-shims__1.5.10.jar
+ ORC_MAPREDUCE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-mapreduce--org.apache.orc__orc-mapreduce__1.5.10.jar
fi
+
+PROTOBUF_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--com.google.protobuf--protobuf-java--com.google.protobuf__protobuf-java__2.6.1.jar
PARQUETFORMATJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-format--org.apache.parquet__parquet-format__2.4.0.jar
NETWORKCOMMON=----workspace_${SPARK_MAJOR_VERSION_STRING}--common--network-common--network-common-hive-2.3__hadoop-2.7_2.12_deploy.jar
@@ -363,6 +371,38 @@ mvn -B install:install-file \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar
+mvn -B install:install-file \
+ -Dmaven.repo.local=$M2DIR \
+ -Dfile=$JARDIR/$ORC_CORE_JAR \
+ -DgroupId=org.apache.orc \
+ -DartifactId=orc-core \
+ -Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
+ -Dpackaging=jar
+
+mvn -B install:install-file \
+ -Dmaven.repo.local=$M2DIR \
+ -Dfile=$JARDIR/$ORC_SHIM_JAR \
+ -DgroupId=org.apache.orc \
+ -DartifactId=orc-shims \
+ -Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
+ -Dpackaging=jar
+
+mvn -B install:install-file \
+ -Dmaven.repo.local=$M2DIR \
+ -Dfile=$JARDIR/$ORC_MAPREDUCE_JAR \
+ -DgroupId=org.apache.orc \
+ -DartifactId=orc-mapreduce \
+ -Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
+ -Dpackaging=jar
+
+mvn -B install:install-file \
+ -Dmaven.repo.local=$M2DIR \
+ -Dfile=$JARDIR/$PROTOBUF_JAR \
+ -DgroupId=com.google.protobuf \
+ -DartifactId=protobuf-java \
+ -Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
+ -Dpackaging=jar
+
mvn -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests
cd /home/ubuntu
diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh
index 10611c8e3f3..a78838edcde 100755
--- a/jenkins/spark-tests.sh
+++ b/jenkins/spark-tests.sh
@@ -258,8 +258,8 @@ if [[ $TEST_MODE == "ALL" || $TEST_MODE == "IT_ONLY" ]]; then
PARALLELISM=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader | \
awk '{if (MAX < $1){ MAX = $1}} END {print int(MAX / (2 * 1024))}')
fi
- # parallelism > 8 could slow down the whole process, so we have a limitation for it
- [[ ${PARALLELISM} -gt 8 ]] && PARALLELISM=8
+    # parallelism > 7 could slow down the whole process, so cap it at 7
+ [[ ${PARALLELISM} -gt 7 ]] && PARALLELISM=7
MEMORY_FRACTION=$(python -c "print(1/($PARALLELISM + 0.1))")
export MEMORY_FRACTION_CONF="--conf spark.rapids.memory.gpu.allocFraction=${MEMORY_FRACTION} \
--conf spark.rapids.memory.gpu.maxAllocFraction=${MEMORY_FRACTION}"
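As a quick sanity check of the arithmetic in this hunk (a sketch; the 16 GiB free-memory figure is an assumed example, not something the script reports):

```scala
object ParallelismSketch extends App {
  val freeMiB = 16384                                   // assumed `nvidia-smi --query-gpu=memory.free` value
  val parallelism = math.min(freeMiB / (2 * 1024), 7)   // one test worker per ~2 GiB, capped at 7
  val memoryFraction = 1.0 / (parallelism + 0.1)        // per-worker GPU alloc fraction with a little headroom
  println(s"PARALLELISM=$parallelism MEMORY_FRACTION=$memoryFraction") // 7 and ~0.141
}
```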
diff --git a/pom.xml b/pom.xml
index bc79b7eb526..23dbfa761bb 100644
--- a/pom.xml
+++ b/pom.xml
@@ (multiple hunks; the pom.xml markup was lost when this patch was captured)
 [Recoverable content: each per-Spark-version build profile (the spark302 profile and
  the spark301db/spark312db Databricks profiles are visible in the surviving context)
  adds the new common module ahead of the existing dist, integration_tests and
  shuffle-plugin modules, plus one further added line per profile whose content did
  not survive extraction. The two Databricks profiles also drop two property lines
  whose values were ${spark301db.version} and ${spark312db.version}, and the global
  properties block (22.04.0-SNAPSHOT / 2.12 / 2.12.15) drops a property whose value
  was 1.5.10, the ORC version that sql-plugin now takes from Spark instead.]
diff --git a/scripts/generate-changelog b/scripts/generate-changelog
index 0c313f6ec5a..d7459684a52 100755
--- a/scripts/generate-changelog
+++ b/scripts/generate-changelog
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -46,11 +46,11 @@ Usage:
# generate changelog for releases 0.1 to 21.12
scripts/generate-changelog --token= \
- --releases=0.1,0.2,0.3,0.4,0.4.1,0.5,21.06,21.06.1,21.06.2,21.08,21.08.1,21.10,21.12
+ --releases=21.06,21.06.1,21.06.2,21.08,21.08.1,21.10,21.12,22.02
# generate changelog for releases 0.1 to 21.12 to /tmp/CHANGELOG.md
GITHUB_TOKEN= scripts/generate-changelog \
- --releases=0.1,0.2,0.3,0.4,0.4.1,0.5,21.06,21.06.1,21.06.2,21.08,21.08.1,21.10,21.12 \
+ --releases=21.06,21.06.1,21.06.2,21.08,21.08.1,21.10,21.12,22.02 \
--path=/tmp/CHANGELOG.md
"""
import os
@@ -273,6 +273,8 @@ def form_changelog(path: str, changelog: dict):
subsections += form_subsection(issues, PRS)
markdown = f"""# Change log
Generated on {date.today()}{subsections}
+\n## Older Releases
+Changelog of older releases can be found at [docs/archives](/docs/archives)
"""
with open(path, "w") as file:
file.write(markdown)
diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml
index ed1654161ba..410106f07ac 100644
--- a/shuffle-plugin/pom.xml
+++ b/shuffle-plugin/pom.xml
@@ -47,6 +47,11 @@
             <version>1.11</version>
             <scope>compile</scope>
         </dependency>
+        <dependency>
+            <groupId>com.nvidia</groupId>
+            <artifactId>rapids-4-spark-common_${scala.binary.version}</artifactId>
+            <version>${project.version}</version>
+        </dependency>
         <dependency>
             <groupId>com.nvidia</groupId>
             <artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
index 177eeef1a5c..b9557fb5f63 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,9 +25,9 @@ import java.util.concurrent.atomic.AtomicLong
import scala.collection.mutable.ArrayBuffer
import ai.rapids.cudf.{BaseDeviceMemoryBuffer, MemoryBuffer, NvtxColor, NvtxRange}
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import com.nvidia.spark.rapids.{Arm, GpuDeviceManager, RapidsConf}
import com.nvidia.spark.rapids.RapidsPluginImplicits._
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.shuffle.{ClientConnection, MemoryRegistrationCallback, MessageType, MetadataTransportBuffer, TransportBuffer, TransportUtils}
import org.openucx.jucx._
import org.openucx.jucx.ucp._
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
index 942a519b5cf..d7b22affda7 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,8 +23,8 @@ import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import ai.rapids.cudf.{BaseDeviceMemoryBuffer, CudaMemoryBuffer, DeviceMemoryBuffer, HostMemoryBuffer, MemoryBuffer}
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import com.nvidia.spark.rapids.{GpuDeviceManager, HashedPriorityQueue, RapidsConf}
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.shuffle._
import com.nvidia.spark.rapids.shuffle.{BounceBufferManager, BufferReceiveState, ClientConnection, PendingTransferRequest, RapidsShuffleClient, RapidsShuffleRequestHandler, RapidsShuffleServer, RapidsShuffleTransport, RefCountedDirectByteBuffer}
diff --git a/sql-plugin/pom.xml b/sql-plugin/pom.xml
index 33417cc2c81..35766938d06 100644
--- a/sql-plugin/pom.xml
+++ b/sql-plugin/pom.xml
@@ (dependency hunks; the pom.xml markup was lost when this patch was captured)
 [Recoverable content: the compile-scoped com.google.flatbuffers:flatbuffers-java
  entry is replaced by com.nvidia:rapids-4-spark-common_${scala.binary.version} at
  ${project.version}; the bundled org.apache.orc:orc-core and orc-mapreduce
  dependencies (classifier ${orc.classifier}, with slf4j-api and jsr305 exclusions),
  org.apache.hive:hive-storage-api (slf4j-api exclusion) and
  com.google.protobuf:protobuf-java are removed, with flatbuffers-java re-added as a
  plain dependency; and orc-core, orc-shims, orc-mapreduce, hive-storage-api and
  protobuf-java are added back as provided dependencies at ${spark.version}.]
diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/v2/OrcShims301until320Base.scala b/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/v2/OrcShims301until320Base.scala
new file mode 100644
index 00000000000..543cb7c9afd
--- /dev/null
+++ b/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/v2/OrcShims301until320Base.scala
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.shims.v2
+
+import scala.collection.mutable.ArrayBuffer
+
+import com.nvidia.spark.rapids.OrcOutputStripe
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.hive.common.io.DiskRangeList
+import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcFile, OrcProto, PhysicalWriter, Reader, StripeInformation}
+import org.apache.orc.impl.{DataReaderProperties, OutStream, SchemaEvolution}
+import org.apache.orc.impl.RecordReaderImpl.SargApplier
+
+trait OrcShims301until320Base {
+
+ // read data to buffer
+ def readFileData(dataReader: DataReader, inputDataRanges: DiskRangeList): DiskRangeList = {
+ dataReader.readFileData(inputDataRanges, 0, false)
+ }
+
+ // create reader properties builder
+ def newDataReaderPropertiesBuilder(compressionSize: Int,
+ compressionKind: CompressionKind, typeCount: Int): DataReaderProperties.Builder = {
+ DataReaderProperties.builder()
+ .withBufferSize(compressionSize)
+ .withCompression(compressionKind)
+ .withTypeCount(typeCount)
+ }
+
+ // create ORC out stream
+ def newOrcOutStream(name: String, bufferSize: Int, codec: CompressionCodec,
+ receiver: PhysicalWriter.OutputReceiver): OutStream = {
+ new OutStream(name, bufferSize, codec, receiver)
+ }
+
+ // filter stripes by pushing down filter
+ def filterStripes(
+ stripes: Seq[StripeInformation],
+ conf: Configuration,
+ orcReader: Reader,
+ dataReader: DataReader,
+ gen: (StripeInformation, OrcProto.StripeFooter, Array[Int], Array[Int]) => OrcOutputStripe,
+ evolution: SchemaEvolution,
+ sargApp: SargApplier,
+ sargColumns: Array[Boolean],
+ ignoreNonUtf8BloomFilter: Boolean,
+ writerVersion: OrcFile.WriterVersion,
+ fileIncluded: Array[Boolean],
+ columnMapping: Array[Int],
+ idMapping: Array[Int]): ArrayBuffer[OrcOutputStripe] = {
+ val result = new ArrayBuffer[OrcOutputStripe](stripes.length)
+ stripes.foreach { stripe =>
+ val stripeFooter = dataReader.readStripeFooter(stripe)
+ val needStripe = if (sargApp != null) {
+ // An ORC schema is a single struct type describing the schema fields
+ val orcFileSchema = evolution.getFileType(0)
+ val orcIndex = dataReader.readRowIndex(stripe, orcFileSchema, stripeFooter,
+ ignoreNonUtf8BloomFilter, fileIncluded, null, sargColumns,
+ writerVersion, null, null)
+ val rowGroups = sargApp.pickRowGroups(stripe, orcIndex.getRowGroupIndex,
+ orcIndex.getBloomFilterKinds, stripeFooter.getColumnsList, orcIndex.getBloomFilterIndex,
+ true)
+ rowGroups != SargApplier.READ_NO_RGS
+ } else {
+ true
+ }
+
+ if (needStripe) {
+ result.append(gen(stripe, stripeFooter, columnMapping, idMapping))
+ }
+ }
+ result
+ }
+}
diff --git a/sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala b/sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
new file mode 100644
index 00000000000..dcac01eefe9
--- /dev/null
+++ b/sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.shims.v2
+
+import com.nvidia.spark.rapids.RapidsPluginImplicits._
+import org.apache.orc.Reader
+
+object OrcShims extends OrcShims301until320Base {
+
+  // the ORC Reader in non-CDH Spark is closeable
+ def withReader[T <: AutoCloseable, V](r: T)(block: T => V): V = {
+ try {
+ block(r)
+ } finally {
+ r.safeClose()
+ }
+ }
+
+  // the ORC Reader in non-CDH Spark is closeable
+ def closeReader(reader: Reader): Unit = {
+ if (reader != null) {
+ reader.close()
+ }
+ }
+}
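A hedged sketch of how a caller might use these shim helpers (the path and Hadoop configuration are assumptions; the plugin's real call sites are in `GpuOrcScanBase.scala`, patched further below):

```scala
import com.nvidia.spark.rapids.shims.v2.OrcShims
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.orc.{OrcFile, TypeDescription}

object OrcShimsUsageSketch {
  // Open an ORC reader, read its schema, and let the shim decide whether the reader
  // has to be closed afterwards (it does on every version except 311cdh).
  def readOrcSchema(path: Path, conf: Configuration): TypeDescription = {
    val opts = OrcFile.readerOptions(conf)
    OrcShims.withReader(OrcFile.createReader(path, opts)) { reader =>
      reader.getSchema
    }
  }
}
```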
diff --git a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala b/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
index 8105b2349df..650758258f6 100644
--- a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
+++ b/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
@@ -23,4 +23,11 @@ object RapidsErrorUtils {
throw new ArrayIndexOutOfBoundsException(s"index $index is beyond the max index allowed " +
s"${numElements - 1}")
}
+
+ def throwInvalidElementAtIndexError(
+ elementKey: String, isElementAtFunction: Boolean = false): ColumnVector = {
+ // For now, the default argument is false. The caller sets the correct value accordingly.
+ throw new NoSuchElementException(s"Key: ${elementKey} " +
+ s"does not exist in any one of the rows in the map column")
+ }
}
diff --git a/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala b/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
new file mode 100644
index 00000000000..ddc4534cb39
--- /dev/null
+++ b/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.shims.v2
+
+import org.apache.orc.Reader
+
+object OrcShims extends OrcShims301until320Base {
+
+  // The ORC Reader in 311cdh Spark has no close method;
+  // the resource is closed internally.
+ def withReader[V](r: Reader)(block: Reader => V): V = {
+ block(r)
+ }
+
+ // empty
+ def closeReader(reader: Reader): Unit = {
+ }
+
+}
diff --git a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
new file mode 100644
index 00000000000..7a0fed4abc9
--- /dev/null
+++ b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.shims.v2
+
+import scala.collection.mutable.ArrayBuffer
+
+import com.nvidia.spark.rapids.OrcOutputStripe
+import com.nvidia.spark.rapids.RapidsPluginImplicits._
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.hive.common.io.DiskRangeList
+import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcConf, OrcFile, OrcProto, PhysicalWriter, Reader, StripeInformation}
+import org.apache.orc.impl.{BufferChunk, BufferChunkList, DataReaderProperties, InStream, OrcCodecPool, OutStream, ReaderImpl, SchemaEvolution}
+import org.apache.orc.impl.RecordReaderImpl.SargApplier
+import org.apache.orc.impl.reader.StripePlanner
+import org.apache.orc.impl.writer.StreamOptions
+
+// 320+ ORC shims
+object OrcShims {
+
+ // the ORC Reader in non-CDH Spark is closeable
+ def withReader[T <: Reader, V](r: T)(block: T => V): V = {
+ try {
+ block(r)
+ } finally {
+ r.safeClose()
+ }
+ }
+
+ // the ORC Reader in non-CDH Spark is closeable
+ def closeReader(reader: Reader): Unit = {
+ if(reader != null) {
+ reader.close()
+ }
+ }
+
+ // read data to buffer
+ def readFileData(dataReader: DataReader, inputDataRanges: DiskRangeList): DiskRangeList = {
+
+    // convert DiskRangeList to BufferChunkList
+    val chunkList = new BufferChunkList
+    var curr = inputDataRanges
+    while (curr != null) {
+      chunkList.add(new BufferChunk(curr.getOffset, curr.getLength))
+      curr = curr.next
+    }
+
+    // BufferChunk is a subclass of DiskRangeList
+    dataReader.readFileData(chunkList, false).get()
+ }
+
+ // create reader properties builder
+ def newDataReaderPropertiesBuilder(compressionSize: Int,
+ compressionKind: CompressionKind, typeCount: Int): DataReaderProperties.Builder = {
+ val compression = new InStream.StreamOptions()
+ .withBufferSize(compressionSize).withCodec(OrcCodecPool.getCodec(compressionKind))
+ DataReaderProperties.builder().withCompression(compression)
+ }
+
+ // create ORC out stream
+ def newOrcOutStream(name: String, bufferSize: Int, codec: CompressionCodec,
+ receiver: PhysicalWriter.OutputReceiver): OutStream = {
+ val options = new StreamOptions(bufferSize).withCodec(codec, codec.getDefaultOptions)
+ new OutStream(name, options, receiver)
+ }
+
+ // filter stripes by pushing down filter
+ def filterStripes(
+ stripes: Seq[StripeInformation],
+ conf: Configuration,
+ orcReader: Reader,
+ dataReader: DataReader,
+ gen: (StripeInformation, OrcProto.StripeFooter, Array[Int], Array[Int]) => OrcOutputStripe,
+ evolution: SchemaEvolution,
+ sargApp: SargApplier,
+ sargColumns: Array[Boolean],
+ ignoreNonUtf8BloomFilter: Boolean,
+ writerVersion: OrcFile.WriterVersion,
+ fileIncluded: Array[Boolean],
+ columnMapping: Array[Int],
+ idMapping: Array[Int]): ArrayBuffer[OrcOutputStripe] = {
+
+ val orcReaderImpl = orcReader.asInstanceOf[ReaderImpl]
+ val maxDiskRangeChunkLimit = OrcConf.ORC_MAX_DISK_RANGE_CHUNK_LIMIT.getInt(conf)
+ val planner = new StripePlanner(evolution.getFileSchema, orcReaderImpl.getEncryption(),
+ dataReader, writerVersion, ignoreNonUtf8BloomFilter, maxDiskRangeChunkLimit)
+
+ val result = new ArrayBuffer[OrcOutputStripe](stripes.length)
+ stripes.foreach { stripe =>
+ val stripeFooter = dataReader.readStripeFooter(stripe)
+ val needStripe = if (sargApp != null) {
+ val orcIndex = planner.parseStripe(stripe, fileIncluded).readRowIndex(sargColumns, null)
+ val rowGroups = sargApp.pickRowGroups(stripe, orcIndex.getRowGroupIndex,
+ orcIndex.getBloomFilterKinds, stripeFooter.getColumnsList, orcIndex.getBloomFilterIndex,
+ true)
+ rowGroups != SargApplier.READ_NO_RGS
+ } else {
+ true
+ }
+
+ if (needStripe) {
+ result.append(gen(stripe, stripeFooter, columnMapping, idMapping))
+ }
+ }
+ result
+
+ }
+}
diff --git a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/Spark320PlusShims.scala b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/Spark320PlusShims.scala
index 606b2f8be00..662008e9fab 100644
--- a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/Spark320PlusShims.scala
+++ b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/Spark320PlusShims.scala
@@ -558,7 +558,7 @@ trait Spark320PlusShims extends SparkShims with RebaseShims with Logging {
// partition filters and data filters are not run on the GPU
override val childExprs: Seq[ExprMeta[_]] = Seq.empty
- override def tagPlanForGpu(): Unit = GpuFileSourceScanExec.tagSupport(this)
+ override def tagPlanForGpu(): Unit = tagFileSourceScanExec(this)
override def convertToCpu(): SparkPlan = {
wrapped.copy(partitionFilters = partitionFilters)
@@ -1051,4 +1051,8 @@ trait Spark320PlusShims extends SparkShims with RebaseShims with Logging {
}
override def supportsColumnarAdaptivePlans: Boolean = true
+
+ def tagFileSourceScanExec(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {
+ GpuFileSourceScanExec.tagSupport(meta)
+ }
}
diff --git a/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala b/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
index de67b5e5cf7..99b943a0e71 100644
--- a/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
+++ b/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
@@ -24,4 +24,10 @@ object RapidsErrorUtils {
def throwArrayIndexOutOfBoundsException(index: Int, numElements: Int): ColumnVector = {
throw QueryExecutionErrors.invalidArrayIndexError(index, numElements)
}
+
+ def throwInvalidElementAtIndexError(
+ elementKey: String, isElementAtFunction: Boolean = false): ColumnVector = {
+ // For now, the default argument is false. The caller sets the correct value accordingly.
+ throw QueryExecutionErrors.mapKeyNotExistError(elementKey, isElementAtFunction)
+ }
}
diff --git a/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/Spark33XShims.scala b/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/Spark33XShims.scala
index bad7c3d8e4d..0294f602a7d 100644
--- a/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/Spark33XShims.scala
+++ b/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/Spark33XShims.scala
@@ -22,10 +22,10 @@ import org.apache.parquet.schema.MessageType
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, MetadataAttribute}
import org.apache.spark.sql.catalyst.json.rapids.shims.v2.Spark33XFileOptionsShims
import org.apache.spark.sql.connector.read.{Scan, SupportsRuntimeFiltering}
-import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan}
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, FilePartition, FileScanRDD, PartitionedFile}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters
import org.apache.spark.sql.execution.datasources.v2.csv.CSVScan
@@ -143,6 +143,16 @@ trait Spark33XShims extends Spark33XFileOptionsShims {
conf.maxReadBatchSizeBytes)
})
).map(r => (r.getClassFor.asSubclass(classOf[Scan]), r)).toMap
+
+ override def tagFileSourceScanExec(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {
+ if (meta.wrapped.expressions.exists(expr => expr match {
+ case MetadataAttribute(expr) => true
+ case _ => false
+ })) {
+ meta.willNotWorkOnGpu("hidden metadata columns are not supported on GPU")
+ }
+ super.tagFileSourceScanExec(meta)
+ }
}
// Fallback to the default definition of `deterministic`
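Seen from the query side, the guard above means that projecting a hidden metadata column keeps the scan on the CPU; a sketch (the input path is a placeholder, and this only applies on Spark 3.3.0+ with the plugin enabled):

```scala
import org.apache.spark.sql.SparkSession

object MetadataColumnFallbackSketch extends App {
  val spark = SparkSession.builder().appName("metadata-column-fallback").getOrCreate()

  // Projecting a hidden `_metadata` field marks the whole scan as unsupported on the
  // GPU, so the physical plan keeps a CPU FileSourceScanExec rather than a
  // GpuBatchScanExec, which is exactly what the new Python tests assert.
  val df = spark.read.parquet("/tmp/hidden_metadata.parquet")
    .selectExpr("id", "_metadata.file_name")
  df.explain()
  spark.stop()
}
```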
diff --git a/sql-plugin/src/main/scala/ai/rapids/cudf/HostConcatResultUtil.scala b/sql-plugin/src/main/scala/ai/rapids/cudf/HostConcatResultUtil.scala
new file mode 100644
index 00000000000..30d7289c902
--- /dev/null
+++ b/sql-plugin/src/main/scala/ai/rapids/cudf/HostConcatResultUtil.scala
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf
+
+import ai.rapids.cudf.JCudfSerialization.HostConcatResult
+import com.nvidia.spark.rapids.{Arm, GpuColumnVectorFromBuffer}
+
+import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+object HostConcatResultUtil extends Arm {
+ /**
+ * Create a rows-only `HostConcatResult`.
+ */
+ def rowsOnlyHostConcatResult(numRows: Int): HostConcatResult = {
+ new HostConcatResult(
+ new JCudfSerialization.SerializedTableHeader(
+ Array.empty, numRows, 0L),
+ HostMemoryBuffer.allocate(0, false))
+ }
+
+ /**
+ * Given a `HostConcatResult` and a SparkSchema produce a `ColumnarBatch`,
+ * handling the rows-only case.
+ *
+ * @note This function does not consume the `HostConcatResult`, and
+ * callers are responsible for closing the resulting `ColumnarBatch`
+ */
+ def getColumnarBatch(
+ hostConcatResult: HostConcatResult,
+ sparkSchema: Array[DataType]): ColumnarBatch = {
+ if (hostConcatResult.getTableHeader.getNumColumns == 0) {
+ // We expect the caller to have acquired the GPU unconditionally before calling
+ // `getColumnarBatch`, as a downstream exec may need the GPU, and the assumption is
+ // that it is acquired in the coalesce code.
+ new ColumnarBatch(Array.empty, hostConcatResult.getTableHeader.getNumRows)
+ } else {
+ withResource(hostConcatResult.toContiguousTable) { ct =>
+ GpuColumnVectorFromBuffer.from(ct, sparkSchema)
+ }
+ }
+ }
+}
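A small usage sketch for the helper above (the argument names are assumptions; in the plugin the `HostConcatResult` and the Spark types come from the shuffle coalesce path patched below):

```scala
import ai.rapids.cudf.HostConcatResultUtil
import ai.rapids.cudf.JCudfSerialization.HostConcatResult
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.vectorized.ColumnarBatch

object HostConcatResultUsageSketch {
  // Turn a concatenated host-side table into a ColumnarBatch. Per the scaladoc above,
  // the helper does not consume `result`: the caller keeps ownership of it and of the
  // returned batch, and is expected to already hold the GPU semaphore.
  def toBatch(result: HostConcatResult, sparkTypes: Array[DataType]): ColumnarBatch =
    HostConcatResultUtil.getColumnarBatch(result, sparkTypes)

  // Degenerate rows-only case: no columns, just a row count
  // (cleanup of the intermediate result is elided for brevity).
  def rowsOnlyBatch(numRows: Int): ColumnarBatch =
    HostConcatResultUtil.getColumnarBatch(
      HostConcatResultUtil.rowsOnlyHostConcatResult(numRows), Array.empty[DataType])
}
```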
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
index d4333f7bc15..26b1c0c4116 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -46,7 +46,20 @@ object ConcatAndConsumeAll {
* @return a single batch with all of them concated together.
*/
def buildNonEmptyBatch(arrayOfBatches: Array[ColumnarBatch],
- schema: StructType): ColumnarBatch = {
+ schema: StructType): ColumnarBatch =
+ buildNonEmptyBatchFromTypes(
+ arrayOfBatches, GpuColumnVector.extractTypes(schema))
+
+ /**
+ * Build a single batch from the batches collected so far. If array is empty this will likely
+ * blow up.
+ * @param arrayOfBatches the batches to concat. This will be consumed and you do not need to
+ * close any of the batches after this is called.
+ * @param dataTypes the output types.
+ * @return a single batch with all of them concated together.
+ */
+ def buildNonEmptyBatchFromTypes(arrayOfBatches: Array[ColumnarBatch],
+ dataTypes: Array[DataType]): ColumnarBatch = {
if (arrayOfBatches.length == 1) {
arrayOfBatches(0)
} else {
@@ -54,7 +67,7 @@ object ConcatAndConsumeAll {
try {
val combined = Table.concatenate(tables: _*)
try {
- GpuColumnVector.from(combined, GpuColumnVector.extractTypes(schema))
+ GpuColumnVector.from(combined, dataTypes)
} finally {
combined.close()
}
@@ -410,9 +423,8 @@ abstract class AbstractGpuCoalesceIterator(
}
class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
- schema: StructType,
+ sparkTypes: Array[DataType],
goal: CoalesceSizeGoal,
- maxDecompressBatchMemory: Long,
numInputRows: GpuMetric,
numInputBatches: GpuMetric,
numOutputRows: GpuMetric,
@@ -422,8 +434,7 @@ class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
opTime: GpuMetric,
peakDevMemory: GpuMetric,
spillCallback: SpillCallback,
- opName: String,
- codecConfigs: TableCompressionCodecConfig)
+ opName: String)
extends AbstractGpuCoalesceIterator(iter,
goal,
numInputRows,
@@ -435,8 +446,7 @@ class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
opTime,
opName) with Arm {
- private val sparkTypes: Array[DataType] = GpuColumnVector.extractTypes(schema)
- private val batches: ArrayBuffer[SpillableColumnarBatch] = ArrayBuffer.empty
+ protected val batches: ArrayBuffer[SpillableColumnarBatch] = ArrayBuffer.empty
private var maxDeviceMemory: Long = 0
override def initNewBatch(batch: ColumnarBatch): Unit = {
@@ -448,10 +458,85 @@ class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
batches.append(SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_BATCHING_PRIORITY,
spillCallback))
+ protected def popAll(): Array[ColumnarBatch] = {
+ closeOnExcept(batches.toArray.safeMap(_.getColumnarBatch())) { wip =>
+ batches.safeClose()
+ batches.clear()
+ wip
+ }
+ }
+
+ override def concatAllAndPutOnGPU(): ColumnarBatch = {
+ val ret = ConcatAndConsumeAll.buildNonEmptyBatchFromTypes(popAll(), sparkTypes)
+    // sum of current batches and the concatenated batch; approximately 2 * sizeof(ret).
+ maxDeviceMemory = GpuColumnVector.getTotalDeviceMemoryUsed(ret) * 2
+ ret
+ }
+
+ override def cleanupConcatIsDone(): Unit = {
+ peakDevMemory.set(maxDeviceMemory)
+ batches.clear()
+ }
+
+ private var onDeck: Option[SpillableColumnarBatch] = None
+
+ override protected def hasOnDeck: Boolean = onDeck.isDefined
+
+ override protected def saveOnDeck(batch: ColumnarBatch): Unit = {
+ assert(onDeck.isEmpty)
+ onDeck = Some(SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY,
+ spillCallback))
+ }
+
+ override protected def clearOnDeck(): Unit = {
+ onDeck.foreach(_.close())
+ onDeck = None
+ }
+
+ override protected def popOnDeck(): ColumnarBatch = {
+ val ret = onDeck.get.getColumnarBatch()
+ clearOnDeck()
+ ret
+ }
+}
+
+/**
+ * Compression codec-aware `GpuCoalesceIterator` subclass which should be used in cases
+ * where the RAPIDS Shuffle Manager could be configured, as batches to be coalesced
+ * may be compressed.
+ */
+class GpuCompressionAwareCoalesceIterator(
+ iter: Iterator[ColumnarBatch],
+ sparkTypes: Array[DataType],
+ goal: CoalesceSizeGoal,
+ maxDecompressBatchMemory: Long,
+ numInputRows: GpuMetric,
+ numInputBatches: GpuMetric,
+ numOutputRows: GpuMetric,
+ numOutputBatches: GpuMetric,
+ collectTime: GpuMetric,
+ concatTime: GpuMetric,
+ opTime: GpuMetric,
+ peakDevMemory: GpuMetric,
+ spillCallback: SpillCallback,
+ opName: String,
+ codecConfigs: TableCompressionCodecConfig)
+ extends GpuCoalesceIterator(
+ iter, sparkTypes, goal,
+ numInputRows = numInputRows,
+ numInputBatches = numInputBatches,
+ numOutputRows = numOutputRows,
+ numOutputBatches = numOutputBatches,
+ collectTime = collectTime,
+ concatTime = concatTime,
+ opTime = opTime,
+ peakDevMemory = peakDevMemory,
+ spillCallback, opName) {
+
private[this] var codec: TableCompressionCodec = _
- private[this] def popAllDecompressed(): Array[ColumnarBatch] = {
- closeOnExcept(batches.map(_.getColumnarBatch())) { wip =>
+ override protected def popAll(): Array[ColumnarBatch] = {
+ closeOnExcept(batches.toArray.safeMap(_.getColumnarBatch())) { wip =>
batches.safeClose()
batches.clear()
@@ -487,42 +572,9 @@ class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
}
}
}
- wip.toArray
+ wip
}
}
-
- override def concatAllAndPutOnGPU(): ColumnarBatch = {
- val ret = ConcatAndConsumeAll.buildNonEmptyBatch(popAllDecompressed(), schema)
- // sum of current batches and concatenating batches. Approximately sizeof(ret * 2).
- maxDeviceMemory = GpuColumnVector.getTotalDeviceMemoryUsed(ret) * 2
- ret
- }
-
- override def cleanupConcatIsDone(): Unit = {
- peakDevMemory.set(maxDeviceMemory)
- batches.clear()
- }
-
- private var onDeck: Option[SpillableColumnarBatch] = None
-
- override protected def hasOnDeck: Boolean = onDeck.isDefined
-
- override protected def saveOnDeck(batch: ColumnarBatch): Unit = {
- assert(onDeck.isEmpty)
- onDeck = Some(SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY,
- spillCallback))
- }
-
- override protected def clearOnDeck(): Unit = {
- onDeck.foreach(_.close())
- onDeck = None
- }
-
- override protected def popOnDeck(): ColumnarBatch = {
- val ret = onDeck.get.getColumnarBatch()
- clearOnDeck()
- ret
- }
}
case class GpuCoalesceBatches(child: SparkPlan, goal: CoalesceGoal)
@@ -579,6 +631,7 @@ case class GpuCoalesceBatches(child: SparkPlan, goal: CoalesceGoal)
// cache in local vars to avoid serializing the plan
val outputSchema = schema
+ val dataTypes = GpuColumnVector.extractTypes(outputSchema)
val decompressMemoryTarget = maxDecompressBatchMemory
val batches = child.executeColumnar()
@@ -593,7 +646,8 @@ case class GpuCoalesceBatches(child: SparkPlan, goal: CoalesceGoal)
goal match {
case sizeGoal: CoalesceSizeGoal =>
batches.mapPartitions { iter =>
- new GpuCoalesceIterator(iter, outputSchema, sizeGoal, decompressMemoryTarget,
+ new GpuCompressionAwareCoalesceIterator(
+ iter, dataTypes, sizeGoal, decompressMemoryTarget,
numInputRows, numInputBatches, numOutputRows, numOutputBatches, NoopMetric,
concatTime, opTime, peakDevMemory, callback, "GpuCoalesceBatches",
codecConfigs)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
index ffba671dc2d..0bffd6cd3cb 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
@@ -26,7 +26,6 @@ import scala.collection.mutable.{ArrayBuffer, LinkedHashMap, Queue}
import scala.math.max
import ai.rapids.cudf.{ColumnVector, HostMemoryBuffer, NvtxColor, NvtxRange, Table}
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import com.nvidia.spark.rapids.GpuMetric.{NUM_OUTPUT_BATCHES, PEAK_DEVICE_MEMORY, SEMAPHORE_WAIT_TIME}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScanBase.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScanBase.scala
index e6063853eb2..6493d2cd3a7 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScanBase.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScanBase.scala
@@ -36,6 +36,7 @@ import com.google.protobuf.CodedOutputStream
import com.nvidia.spark.rapids.GpuMetric._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.SchemaUtils._
+import com.nvidia.spark.rapids.shims.v2.OrcShims
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.common.io.DiskRangeList
@@ -55,10 +56,10 @@ import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, Par
import org.apache.spark.sql.execution.QueryExecutionException
import org.apache.spark.sql.execution.datasources.PartitionedFile
import org.apache.spark.sql.execution.datasources.orc.OrcUtils
+import org.apache.spark.sql.execution.datasources.rapids.OrcFiltersWrapper
import org.apache.spark.sql.execution.datasources.v2.{EmptyPartitionReader, FilePartitionReaderFactory}
import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan
import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.rapids.OrcFilters
import org.apache.spark.sql.rapids.execution.TrampolineUtil
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.{ArrayType, DataType, DecimalType, MapType, StructType}
@@ -319,13 +320,13 @@ trait OrcCommonFunctions extends OrcCodecWritingHelper {
withResource(OrcTools.buildDataReader(ctx)) { dataReader =>
val start = System.nanoTime()
- val bufferChunks = dataReader.readFileData(inputDataRanges, 0, false)
+ val bufferChunks = OrcShims.readFileData(dataReader, inputDataRanges)
val mid = System.nanoTime()
var current = bufferChunks
while (current != null) {
out.write(current.getData)
if (dataReader.isTrackingDiskRanges && current.isInstanceOf[BufferChunk]) {
- dataReader.releaseBuffer(current.asInstanceOf[BufferChunk].getChunk)
+ dataReader.releaseBuffer(current.getData)
}
current = current.next
}
@@ -740,17 +741,18 @@ private object OrcTools extends Arm {
}
val maxDiskRangeChunkLimit = OrcConf.ORC_MAX_DISK_RANGE_CHUNK_LIMIT.getInt(conf)
val file = filePath.getFileSystem(conf).open(filePath)
+
+ val typeCount = org.apache.orc.OrcUtils.getOrcTypes(fileSchema).size
//noinspection ScalaDeprecation
- RecordReaderUtils.createDefaultDataReader(DataReaderProperties.builder()
- .withBufferSize(compressionSize)
- .withCompression(compressionKind)
- .withFileSystem(fs)
- .withPath(filePath)
- .withFile(file) // explicitly specify the FSDataInputStream
- .withTypeCount(org.apache.orc.OrcUtils.getOrcTypes(fileSchema).size)
- .withZeroCopy(zeroCopy)
- .withMaxDiskRangeChunkLimit(maxDiskRangeChunkLimit)
- .build())
+ val reader = RecordReaderUtils.createDefaultDataReader(
+ OrcShims.newDataReaderPropertiesBuilder(compressionSize, compressionKind, typeCount)
+ .withFileSystem(fs)
+ .withPath(filePath)
+ .withZeroCopy(zeroCopy)
+ .withMaxDiskRangeChunkLimit(maxDiskRangeChunkLimit)
+ .build())
+      reader.open() // 311cdh needs this call to initialize the internal FSDataInputStream file variable.
+ reader
}
}
@@ -783,8 +785,8 @@ private case class GpuOrcFileFilterHandler(
val orcFileReaderOpts = OrcFile.readerOptions(conf).filesystem(fs)
// After getting the necessary information from ORC reader, we must close the ORC reader
- withResource(OrcFile.createReader(filePath, orcFileReaderOpts)) { orcReader =>
- val resultedColPruneInfo = requestedColumnIds(isCaseSensitive, dataSchema,
+ OrcShims.withReader(OrcFile.createReader(filePath, orcFileReaderOpts)) { orcReader =>
+ val resultedColPruneInfo = requestedColumnIds(isCaseSensitive, dataSchema,
readDataSchema, orcReader)
if (resultedColPruneInfo.isEmpty) {
// Be careful when the OrcPartitionReaderContext is null, we should change
@@ -822,7 +824,7 @@ private case class GpuOrcFileFilterHandler(
val readerOpts = OrcInputFormat.buildOptions(
conf, orcReader, partFile.start, partFile.length)
// create the search argument if we have pushed filters
- OrcFilters.createFilter(fullSchema, pushedFilters).foreach { f =>
+ OrcFiltersWrapper.createFilter(fullSchema, pushedFilters).foreach { f =>
readerOpts.searchArgument(f, fullSchema.fieldNames)
}
readerOpts
@@ -882,7 +884,7 @@ private case class GpuOrcFileFilterHandler(
if (matchedOrcFields.size > 1) {
// Need to fail if there is ambiguity, i.e. more than one field is matched.
val matchedOrcFieldsString = matchedOrcFields.mkString("[", ", ", "]")
- reader.close()
+ OrcShims.closeReader(reader)
throw new RuntimeException(s"""Found duplicate field(s) "$requiredFieldName": """
+ s"$matchedOrcFieldsString in case-insensitive mode")
} else {
@@ -1088,29 +1090,10 @@ private case class GpuOrcFileFilterHandler(
val fileIncluded = calcOrcFileIncluded(evolution)
val (columnMapping, idMapping) = columnRemap(fileIncluded, evolution.getFileSchema,
updatedReadSchema, isCaseSensitive)
- val result = new ArrayBuffer[OrcOutputStripe](stripes.length)
- stripes.foreach { stripe =>
- val stripeFooter = dataReader.readStripeFooter(stripe)
- val needStripe = if (sargApp != null) {
- // An ORC schema is a single struct type describing the schema fields
- val orcFileSchema = evolution.getFileType(0)
- val orcIndex = dataReader.readRowIndex(stripe, orcFileSchema, stripeFooter,
- ignoreNonUtf8BloomFilter, fileIncluded, null, sargColumns,
- writerVersion, null, null)
- val rowGroups = sargApp.pickRowGroups(stripe, orcIndex.getRowGroupIndex,
- orcIndex.getBloomFilterKinds, stripeFooter.getColumnsList, orcIndex.getBloomFilterIndex,
- true)
- rowGroups != SargApplier.READ_NO_RGS
- } else {
- true
- }
-
- if (needStripe) {
- result.append(buildOutputStripe(stripe, stripeFooter, columnMapping, idMapping))
- }
- }
-
- result
+ OrcShims.filterStripes(stripes, conf, orcReader, dataReader,
+ buildOutputStripe, evolution,
+ sargApp, sargColumns, ignoreNonUtf8BloomFilter,
+ writerVersion, fileIncluded, columnMapping, idMapping)
}
/**
@@ -1552,8 +1535,8 @@ trait OrcCodecWritingHelper extends Arm {
// note that this buffer is just for writing meta-data
OrcConf.BUFFER_SIZE.getDefaultValue.asInstanceOf[Int]
}
- withResource(new OutStream(getClass.getSimpleName, orcBufferSize, codec,
- outReceiver)) { codecStream =>
+ withResource(OrcShims.newOrcOutStream(
+ getClass.getSimpleName, orcBufferSize, codec, outReceiver)) { codecStream =>
val protoWriter = CodedOutputStream.newInstance(codecStream)
block(outChannel, protoWriter, codecStream)
}
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
index 16280a47179..82333fa3721 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -492,11 +492,6 @@ object GpuOverrides extends Logging {
listeners.clear()
}
- def canRegexpBeTreatedLikeARegularString(strLit: UTF8String): Boolean = {
- val s = strLit.toString
- !regexList.exists(pattern => s.contains(pattern))
- }
-
private def convertPartToGpuIfPossible(part: Partitioning, conf: RapidsConf): Partitioning = {
part match {
case _: GpuPartitioning => part
@@ -3352,7 +3347,23 @@ object GpuOverrides extends Logging {
TypeSig.DATE)),
Some(RepeatingParamCheck("step", TypeSig.integral, TypeSig.integral + TypeSig.CALENDAR))),
(a, conf, p, r) => new GpuSequenceMeta(a, conf, p, r)
- )
+ ),
+ expr[BitLength](
+ "The bit length of string data",
+ ExprChecks.unaryProject(
+ TypeSig.INT, TypeSig.INT,
+ TypeSig.STRING, TypeSig.STRING + TypeSig.BINARY),
+ (a, conf, p, r) => new UnaryExprMeta[BitLength](a, conf, p, r) {
+ override def convertToGpu(child: Expression): GpuExpression = GpuBitLength(child)
+ }),
+ expr[OctetLength](
+ "The byte length of string data",
+ ExprChecks.unaryProject(
+ TypeSig.INT, TypeSig.INT,
+ TypeSig.STRING, TypeSig.STRING + TypeSig.BINARY),
+ (a, conf, p, r) => new UnaryExprMeta[OctetLength](a, conf, p, r) {
+ override def convertToGpu(child: Expression): GpuExpression = GpuOctetLength(child)
+ })
).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap
// Shim expressions should be last to allow overrides with shim-specific versions
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffleCoalesceExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffleCoalesceExec.scala
index ba42c2d8f3e..c85d6fe1a60 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffleCoalesceExec.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffleCoalesceExec.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,8 +18,8 @@ package com.nvidia.spark.rapids
import java.util
-import ai.rapids.cudf.{HostMemoryBuffer, JCudfSerialization, NvtxColor, NvtxRange}
-import ai.rapids.cudf.JCudfSerialization.SerializedTableHeader
+import ai.rapids.cudf.{HostConcatResultUtil, HostMemoryBuffer, JCudfSerialization, NvtxColor, NvtxRange}
+import ai.rapids.cudf.JCudfSerialization.{HostConcatResult, SerializedTableHeader}
import com.nvidia.spark.rapids.shims.v2.ShimUnaryExecNode
import org.apache.spark.TaskContext
@@ -61,10 +61,12 @@ case class GpuShuffleCoalesceExec(child: SparkPlan, targetBatchByteSize: Long)
override def doExecuteColumnar(): RDD[ColumnarBatch] = {
val metricsMap = allMetrics
val targetSize = targetBatchByteSize
- val sparkSchema = GpuColumnVector.extractTypes(schema)
+ val dataTypes = GpuColumnVector.extractTypes(schema)
child.executeColumnar().mapPartitions { iter =>
- new GpuShuffleCoalesceIterator(iter, targetSize, sparkSchema, metricsMap)
+ new GpuShuffleCoalesceIterator(
+ new HostShuffleCoalesceIterator(iter, targetSize, dataTypes, metricsMap),
+ dataTypes, metricsMap)
}
}
}
@@ -72,22 +74,18 @@ case class GpuShuffleCoalesceExec(child: SparkPlan, targetBatchByteSize: Long)
/**
* Iterator that coalesces columnar batches that are expected to only contain
* [[SerializedTableColumn]]. The serialized tables within are collected up
- * to the target batch size and then concatenated on the host before the data
- * is transferred to the GPU.
+ * to the target batch size and then concatenated on the host before the result
+ * is handed to the caller on `.next()`.
*/
-class GpuShuffleCoalesceIterator(
+class HostShuffleCoalesceIterator(
iter: Iterator[ColumnarBatch],
targetBatchByteSize: Long,
- sparkSchema: Array[DataType],
+ dataTypes: Array[DataType],
metricsMap: Map[String, GpuMetric])
- extends Iterator[ColumnarBatch] with Arm with AutoCloseable {
- private[this] val opTimeMetric = metricsMap(GpuMetric.OP_TIME)
+ extends Iterator[HostConcatResult] with Arm with AutoCloseable {
+ private[this] val concatTimeMetric = metricsMap(GpuMetric.CONCAT_TIME)
private[this] val inputBatchesMetric = metricsMap(GpuMetric.NUM_INPUT_BATCHES)
private[this] val inputRowsMetric = metricsMap(GpuMetric.NUM_INPUT_ROWS)
- private[this] val outputBatchesMetric = metricsMap(GpuMetric.NUM_OUTPUT_BATCHES)
- private[this] val outputRowsMetric = metricsMap(GpuMetric.NUM_OUTPUT_ROWS)
- private[this] val concatTimeMetric = metricsMap(GpuMetric.CONCAT_TIME)
- private[this] val semWaitTime = metricsMap(GpuMetric.SEMAPHORE_WAIT_TIME)
private[this] val serializedTables = new util.ArrayDeque[SerializedTableColumn]
private[this] var numTablesInBatch: Int = 0
private[this] var numRowsInBatch: Int = 0
@@ -95,21 +93,44 @@ class GpuShuffleCoalesceIterator(
Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => close()))
- override def hasNext: Boolean = {
- bufferNextBatch()
- numTablesInBatch > 0
+ override def close(): Unit = {
+ serializedTables.forEach(_.close())
+ serializedTables.clear()
}
- override def next(): ColumnarBatch = {
- if (!hasNext) {
- throw new NoSuchElementException("No more columnar batches")
+ def concatenateTablesInHost(): HostConcatResult = {
+ val result = withResource(new MetricRange(concatTimeMetric)) { _ =>
+ val firstHeader = serializedTables.peekFirst().header
+ if (firstHeader.getNumColumns == 0) {
+ (0 until numTablesInBatch).foreach(_ => serializedTables.removeFirst())
+ HostConcatResultUtil.rowsOnlyHostConcatResult(numRowsInBatch)
+ } else {
+ val headers = new Array[SerializedTableHeader](numTablesInBatch)
+ withResource(new Array[HostMemoryBuffer](numTablesInBatch)) { buffers =>
+ headers.indices.foreach { i =>
+ val serializedTable = serializedTables.removeFirst()
+ headers(i) = serializedTable.header
+ buffers(i) = serializedTable.hostBuffer
+ }
+ JCudfSerialization.concatToHostBuffer(headers, buffers)
+ }
+ }
}
- concatenateBatch()
- }
- override def close(): Unit = {
- serializedTables.forEach(_.close())
- serializedTables.clear()
+ // update the stats for the next batch in progress
+ numTablesInBatch = serializedTables.size
+
+ batchByteSize = 0
+ numRowsInBatch = 0
+ if (numTablesInBatch > 0) {
+ require(numTablesInBatch == 1,
+ "should only track at most one buffer that is not in a batch")
+ val header = serializedTables.peekFirst().header
+ batchByteSize = header.getDataLen
+ numRowsInBatch = header.getNumRows
+ }
+
+ result
}
private def bufferNextBatch(): Unit = {
@@ -120,7 +141,7 @@ class GpuShuffleCoalesceIterator(
inputBatchesMetric += 1
// don't bother tracking empty tables
if (batch.numRows > 0) {
- inputRowsMetric += batch.numRows
+ inputRowsMetric += batch.numRows()
val tableColumn = batch.column(0).asInstanceOf[SerializedTableColumn]
batchCanGrow = canAddToBatch(tableColumn.header)
serializedTables.addLast(tableColumn)
@@ -138,6 +159,18 @@ class GpuShuffleCoalesceIterator(
}
}
+ override def hasNext(): Boolean = {
+ bufferNextBatch()
+ numTablesInBatch > 0
+ }
+
+ override def next(): HostConcatResult = {
+ if (!hasNext()) {
+ throw new NoSuchElementException("No more host batches to concatenate")
+ }
+ concatenateTablesInHost()
+ }
+
private def canAddToBatch(nextTable: SerializedTableHeader): Boolean = {
if (batchByteSize + nextTable.getDataLen > targetBatchByteSize) {
return false
@@ -147,60 +180,41 @@ class GpuShuffleCoalesceIterator(
}
true
}
+}
- private def concatenateBatch(): ColumnarBatch = {
- val firstHeader = serializedTables.peekFirst().header
- val batch = withResource(new MetricRange(concatTimeMetric)) { _ =>
- if (firstHeader.getNumColumns == 0) {
- // acquire the GPU unconditionally for now in this case, as a downstream exec
- // may need the GPU, and the assumption is that it is acquired in the coalesce
- // code.
- GpuSemaphore.acquireIfNecessary(TaskContext.get(), semWaitTime)
- (0 until numTablesInBatch).foreach(_ => serializedTables.removeFirst())
- new ColumnarBatch(Array.empty, numRowsInBatch)
- } else {
- concatenateTablesBatch()
- }
- }
+/**
+ * Iterator that takes the host-concatenated shuffle results produced by
+ * `HostShuffleCoalesceIterator`, acquires the GPU semaphore, and transfers
+ * each result to the GPU as a `ColumnarBatch`.
+ */
+class GpuShuffleCoalesceIterator(iter: Iterator[HostConcatResult],
+ dataTypes: Array[DataType],
+ metricsMap: Map[String, GpuMetric])
+ extends Iterator[ColumnarBatch] with Arm {
+ private[this] val semWaitTime = metricsMap(GpuMetric.SEMAPHORE_WAIT_TIME)
+ private[this] val opTimeMetric = metricsMap(GpuMetric.OP_TIME)
+ private[this] val outputBatchesMetric = metricsMap(GpuMetric.NUM_OUTPUT_BATCHES)
+ private[this] val outputRowsMetric = metricsMap(GpuMetric.NUM_OUTPUT_ROWS)
- withResource(new MetricRange(opTimeMetric)) { _ =>
- outputBatchesMetric += 1
- outputRowsMetric += batch.numRows
-
- // update the stats for the next batch in progress
- numTablesInBatch = serializedTables.size
- batchByteSize = 0
- numRowsInBatch = 0
- if (numTablesInBatch > 0) {
- require(numTablesInBatch == 1,
- "should only track at most one buffer that is not in a batch")
- val header = serializedTables.peekFirst().header
- batchByteSize = header.getDataLen
- numRowsInBatch = header.getNumRows
- }
+ override def hasNext: Boolean = iter.hasNext
- batch
+ override def next(): ColumnarBatch = {
+ if (!hasNext) {
+ throw new NoSuchElementException("No more columnar batches")
}
- }
-
- private def concatenateTablesBatch(): ColumnarBatch = {
- val headers = new Array[SerializedTableHeader](numTablesInBatch)
- withResource(new Array[HostMemoryBuffer](numTablesInBatch)) { buffers =>
- headers.indices.foreach { i =>
- val serializedTable = serializedTables.removeFirst()
- headers(i) = serializedTable.header
- buffers(i) = serializedTable.hostBuffer
- }
-
- withResource(new NvtxRange("Concat+Load Batch", NvtxColor.YELLOW)) { _ =>
- withResource(JCudfSerialization.concatToHostBuffer(headers, buffers)) { hostConcatResult =>
- // about to start using the GPU in this task
- GpuSemaphore.acquireIfNecessary(TaskContext.get(), semWaitTime)
- withResource(new MetricRange(opTimeMetric)) { _ =>
- withResource(hostConcatResult.toContiguousTable) { contigTable =>
- GpuColumnVectorFromBuffer.from(contigTable, sparkSchema)
- }
- }
+ withResource(new NvtxRange("Concat+Load Batch", NvtxColor.YELLOW)) { _ =>
+ withResource(iter.next()) { hostConcatResult =>
+ // We acquire the GPU regardless of whether `hostConcatResult`
+ // is an empty batch or not, because the downstream tasks expect
+ // the `GpuShuffleCoalesceIterator` to acquire the semaphore and may
+ // generate GPU data from batches that are empty.
+ GpuSemaphore.acquireIfNecessary(TaskContext.get(), semWaitTime)
+ withResource(new MetricRange(opTimeMetric)) { _ =>
+ val batch = HostConcatResultUtil.getColumnarBatch(hostConcatResult, dataTypes)
+ outputBatchesMetric += 1
+ outputRowsMetric += batch.numRows()
+ batch
}
}
}
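
The GpuShuffleCoalesceExec change above splits the old single iterator into a host stage (`HostShuffleCoalesceIterator`, which buffers and concatenates serialized tables without touching the GPU) and a device stage (`GpuShuffleCoalesceIterator`, which acquires the semaphore only when a concatenated result is about to be loaded). Below is a minimal, self-contained sketch of that two-stage pattern; the types and the semaphore call are stand-ins, not the plugin's APIs.

    import scala.collection.mutable.ArrayBuffer

    object TwoStageCoalesceSketch {
      final case class HostChunk(bytes: Array[Byte])

      // Stage 1: buffer and concatenate serialized chunks on the host; no GPU involvement.
      class HostCoalesceIterator(input: Iterator[HostChunk], targetBytes: Long)
          extends Iterator[HostChunk] {
        private val pending = ArrayBuffer.empty[HostChunk]

        private def buffer(): Unit = {
          var size = pending.iterator.map(_.bytes.length.toLong).sum
          while (input.hasNext && size < targetBytes) {
            val c = input.next()
            pending += c
            size += c.bytes.length
          }
        }

        override def hasNext: Boolean = { buffer(); pending.nonEmpty }

        override def next(): HostChunk = {
          if (!hasNext) throw new NoSuchElementException("no more host chunks")
          val out = HostChunk(pending.iterator.flatMap(_.bytes).toArray) // host-side concat
          pending.clear()
          out
        }
      }

      // Stage 2: the "GPU" is only touched here, once per already-concatenated result.
      class DeviceLoadIterator(host: Iterator[HostChunk]) extends Iterator[Array[Byte]] {
        override def hasNext: Boolean = host.hasNext
        override def next(): Array[Byte] = {
          val h = host.next()              // all host-side work is already done
          println("acquire GPU semaphore") // stand-in for GpuSemaphore.acquireIfNecessary
          h.bytes                          // stand-in for copying the buffer to the device
        }
      }

      def main(args: Array[String]): Unit = {
        val input = Iterator(HostChunk(Array[Byte](1, 2)), HostChunk(Array[Byte](3, 4, 5)))
        val batches = new DeviceLoadIterator(new HostCoalesceIterator(input, 1024L))
        batches.foreach(b => println(b.toSeq))
      }
    }
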
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
index 675ec01758e..8c33ede98bc 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
@@ -16,8 +16,11 @@
package com.nvidia.spark.rapids
+import ai.rapids.cudf.{HostConcatResultUtil, NvtxColor, NvtxRange}
+import ai.rapids.cudf.JCudfSerialization.HostConcatResult
import com.nvidia.spark.rapids.shims.v2.{GpuHashPartitioning, GpuJoinUtils, ShimBinaryExecNode}
+import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
@@ -97,7 +100,9 @@ case class GpuShuffledHashJoinExec(
override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL
override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
OP_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_OP_TIME),
+ CONCAT_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_CONCAT_TIME),
BUILD_DATA_SIZE -> createSizeMetric(ESSENTIAL_LEVEL, DESCRIPTION_BUILD_DATA_SIZE),
+ PEAK_DEVICE_MEMORY -> createSizeMetric(MODERATE_LEVEL, DESCRIPTION_PEAK_DEVICE_MEMORY),
BUILD_TIME -> createNanoTimingMetric(ESSENTIAL_LEVEL, DESCRIPTION_BUILD_TIME),
STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME),
JOIN_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_JOIN_TIME),
@@ -123,28 +128,39 @@ case class GpuShuffledHashJoinExec(
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
val opTime = gpuLongMetric(OP_TIME)
- val buildTime = gpuLongMetric(BUILD_TIME)
val streamTime = gpuLongMetric(STREAM_TIME)
val joinTime = gpuLongMetric(JOIN_TIME)
val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS)
- val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)
+ val batchSizeBytes = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)
val spillCallback = GpuMetric.makeSpillCallback(allMetrics)
- val localBuildOutput: Seq[Attribute] = buildPlan.output
+ val localBuildOutput = buildPlan.output
+
+ // Create a map of metrics that can be handed down to the shuffle and coalesce
+ // iterators, replacing with no-ops those metrics that the coalesce iterators
+ // normally update but that would produce the wrong statistics here, because the
+ // join updates the same metrics itself.
+ val coalesceMetricsMap = allMetrics +
+ (GpuMetric.NUM_INPUT_ROWS -> NoopMetric,
+ GpuMetric.NUM_INPUT_BATCHES -> NoopMetric,
+ GpuMetric.NUM_OUTPUT_BATCHES -> NoopMetric,
+ GpuMetric.NUM_OUTPUT_ROWS -> NoopMetric)
streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) {
(streamIter, buildIter) => {
- val stIt = new CollectTimeIterator("shuffled join stream", streamIter, streamTime)
- val startTime = System.nanoTime()
+ val (builtBatch, maybeBufferedStreamIter) =
+ GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ batchSizeBytes,
+ localBuildOutput,
+ buildIter,
+ new CollectTimeIterator("shuffled join stream", streamIter, streamTime),
+ spillCallback,
+ coalesceMetricsMap)
- withResource(ConcatAndConsumeAll.getSingleBatchWithVerification(buildIter,
- localBuildOutput)) { builtBatch =>
+ withResource(builtBatch) { _ =>
// doJoin will increment the reference counts as needed for the builtBatch
- val delta = System.nanoTime() - startTime
- buildTime += delta
buildDataSize += GpuColumnVector.getTotalDeviceMemoryUsed(builtBatch)
-
- doJoin(builtBatch, stIt, targetSize, spillCallback,
- numOutputRows, joinOutputRows, numOutputBatches,
+ doJoin(builtBatch, maybeBufferedStreamIter,
+ batchSizeBytes, spillCallback, numOutputRows, joinOutputRows, numOutputBatches,
opTime, joinTime)
}
}
@@ -155,3 +171,183 @@ case class GpuShuffledHashJoinExec(
if (isSkewJoin) super.nodeName + "(skew=true)" else super.nodeName
}
}
+
+object GpuShuffledHashJoinExec extends Arm {
+ /**
+ * Helper iterator that wraps a BufferedIterator of AutoCloseable subclasses.
+ * This iterator also implements AutoCloseable, so it can be closed in case
+ * of exceptions.
+ *
+ * @param wrapped the buffered iterator
+ * @tparam T an AutoCloseable subclass
+ */
+ class CloseableBufferedIterator[T <: AutoCloseable](wrapped: BufferedIterator[T])
+ extends BufferedIterator[T] with AutoCloseable {
+ // register against task completion to close any leaked buffered items
+ Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => close()))
+
+ private[this] var isClosed = false
+ override def head: T = wrapped.head
+ override def headOption: Option[T] = wrapped.headOption
+ override def next: T = wrapped.next
+ override def hasNext: Boolean = wrapped.hasNext
+ override def close(): Unit = {
+ if (!isClosed) {
+ headOption.foreach(_.close())
+ isClosed = true
+ }
+ }
+ }
+
+ /**
+ * Gets a `ColumnarBatch` and stream Iterator[ColumnarBatch] pair by acquiring
+ * the GPU semaphore optimally in the scenario where the build side is relatively
+ * small (less than `hostTargetBatchSize`).
+ *
+ * In the optimal case, this function will load the build side on the host up to the
+ * configured goal and, if it fits entirely, allow the stream iterator to pull its first
+ * batch to the host as well. After the first stream batch is on the host, the
+ * stream iterator acquires the semaphore and then the build side is copied to the GPU.
+ *
+ * Prior to this we would get a build batch on the GPU, acquiring
+ * the semaphore in the process, and then begin pulling from the stream iterator,
+ * which could include IO (while holding onto the semaphore).
+ *
+ * The function handles the case where the build side goes above the configured batch
+ * goal, in which case it will concatenate on the host, grab the semaphore, and continue
+ * pulling from the build iterator to build a bigger batch on the GPU. This path is not
+ * optimized, because we hold onto the semaphore for the entire time after realizing the
+ * goal has been hit.
+ *
+ * @param hostTargetBatchSize target batch size goal on the host
+ * @param buildOutput output attributes of the build plan
+ * @param buildIter build iterator
+ * @param streamIter stream iterator
+ * @param spillCallback metric updater in case downstream iterators spill
+ * @param coalesceMetricsMap metrics map with metrics to be used in downstream
+ * iterators
+ * @return a pair of `ColumnarBatch` and streamed iterator that can be
+ * used for the join
+ */
+ def getBuiltBatchAndStreamIter(
+ hostTargetBatchSize: Long,
+ buildOutput: Seq[Attribute],
+ buildIter: Iterator[ColumnarBatch],
+ streamIter: Iterator[ColumnarBatch],
+ spillCallback: SpillCallback,
+ coalesceMetricsMap: Map[String, GpuMetric]): (ColumnarBatch, Iterator[ColumnarBatch]) = {
+ val semWait = coalesceMetricsMap(GpuMetric.SEMAPHORE_WAIT_TIME)
+ val buildTime = coalesceMetricsMap(GpuMetric.BUILD_TIME)
+ var bufferedBuildIterator: CloseableBufferedIterator[ColumnarBatch] = null
+ closeOnExcept(bufferedBuildIterator) { _ =>
+ val startTime = System.nanoTime()
+ // Determine whether the build side is non-empty and whether its first batch is
+ // a serialized batch. If either condition is not met, we fall back to the
+ // `getSingleBatchWithVerification` method.
+ val firstBatchIsSerialized = {
+ if (!buildIter.hasNext) {
+ false
+ } else {
+ bufferedBuildIterator = new CloseableBufferedIterator(buildIter.buffered)
+ val firstBatch = bufferedBuildIterator.head
+ if (firstBatch.numCols() != 1) {
+ false
+ } else {
+ firstBatch.column(0).isInstanceOf[SerializedTableColumn]
+ }
+ }
+ }
+
+ if (!firstBatchIsSerialized) {
+ // In this scenario the build side batches are not serialized host-side batches;
+ // given the plan rules we expect this to be a single batch
+ val builtBatch =
+ ConcatAndConsumeAll.getSingleBatchWithVerification(
+ Option(bufferedBuildIterator).getOrElse(buildIter), buildOutput)
+ val delta = System.nanoTime() - startTime
+ buildTime += delta
+ (builtBatch, streamIter)
+ } else {
+ val dataTypes = buildOutput.map(_.dataType).toArray
+ val hostConcatIter = new HostShuffleCoalesceIterator(bufferedBuildIterator,
+ hostTargetBatchSize, dataTypes, coalesceMetricsMap)
+ withResource(hostConcatIter) { _ =>
+ closeOnExcept(hostConcatIter.next()) { hostConcatResult =>
+ if (!hostConcatIter.hasNext()) {
+ // add the time it took to fetch that first host-side build batch
+ buildTime += System.nanoTime() - startTime
+ // Optimal case: we drained the build iterator with a single host-side concatenation,
+ // so the whole build side is one batch that currently resides on the host.
+ // We peek at the stream iterator with `hasNext` on the buffered
+ // iterator, which will grab the semaphore when putting the first stream
+ // batch on the GPU, and then we bring the build batch to the GPU and return.
+ val bufferedStreamIter = new CloseableBufferedIterator(streamIter.buffered)
+ closeOnExcept(bufferedStreamIter) { _ =>
+ withResource(new NvtxRange("first stream batch", NvtxColor.RED)) { _ =>
+ if (bufferedStreamIter.hasNext) {
+ bufferedStreamIter.head
+ } else {
+ GpuSemaphore.acquireIfNecessary(TaskContext.get(), semWait)
+ }
+ }
+ val buildBatch = getBuildBatchOptimized(hostConcatResult, buildOutput, buildTime)
+ (buildBatch, bufferedStreamIter)
+ }
+ } else {
+ val buildBatch = getBuildBatchFromUnfinished(
+ Seq(hostConcatResult).iterator ++ hostConcatIter,
+ buildOutput, spillCallback, coalesceMetricsMap)
+ buildTime += System.nanoTime() - startTime
+ (buildBatch, streamIter)
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private def getBuildBatchFromUnfinished(
+ iterWithPrior: Iterator[HostConcatResult],
+ buildOutput: Seq[Attribute],
+ spillCallback: SpillCallback,
+ coalesceMetricsMap: Map[String, GpuMetric]): ColumnarBatch = {
+ // In the fallback case we build the same iterator chain that the Spark plan
+ // would have produced:
+ // GpuCoalesceIterator(GpuShuffleCoalesceIterator(shuffled build side))
+ // This allows us to make the shuffle batches spillable in case we have a large
+ // build-side table, since `RequireSingleBatch` imposes virtually no limit and we
+ // know we are already above `hostTargetBatchSize` (which is 2GB by default).
+ val dataTypes = buildOutput.map(_.dataType).toArray
+ val shuffleCoalesce = new GpuShuffleCoalesceIterator(
+ iterWithPrior,
+ dataTypes,
+ coalesceMetricsMap)
+ val res = ConcatAndConsumeAll.getSingleBatchWithVerification(
+ new GpuCoalesceIterator(shuffleCoalesce,
+ dataTypes,
+ RequireSingleBatch,
+ NoopMetric, NoopMetric, NoopMetric, NoopMetric, NoopMetric,
+ coalesceMetricsMap(GpuMetric.CONCAT_TIME),
+ coalesceMetricsMap(GpuMetric.OP_TIME),
+ coalesceMetricsMap(GpuMetric.PEAK_DEVICE_MEMORY),
+ spillCallback,
+ "build batch"),
+ buildOutput)
+ res
+ }
+
+ private def getBuildBatchOptimized(
+ hostConcatResult: HostConcatResult,
+ buildOutput: Seq[Attribute],
+ buildTime: GpuMetric): ColumnarBatch = {
+ val dataTypes = buildOutput.map(_.dataType).toArray
+ // The task already holds the GPU semaphore and the build batch is within
+ // `targetSizeBytes`, so we can bring the build batch to the GPU now.
+ withResource(hostConcatResult) { _ =>
+ buildTime.ns {
+ HostConcatResultUtil.getColumnarBatch(hostConcatResult, dataTypes)
+ }
+ }
+ }
+}
+
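
The `coalesceMetricsMap` built in GpuShuffledHashJoinExec above silences selected metrics by overriding map entries with `NoopMetric` before handing the map to the nested coalesce iterators. A standalone sketch of that pattern, using trivial stand-in metric types rather than the plugin's `GpuMetric` hierarchy:

    object MetricOverrideSketch {
      trait Metric { def +=(v: Long): Unit }
      object NoopMetric extends Metric { override def +=(v: Long): Unit = () }
      class CountMetric extends Metric {
        var value = 0L
        override def +=(v: Long): Unit = value += v
      }

      def main(args: Array[String]): Unit = {
        val allMetrics: Map[String, Metric] =
          Map("numOutputRows" -> new CountMetric, "opTime" -> new CountMetric)
        // `Map + (key -> value)` replaces existing entries, so the join can silence the
        // counters its nested coalesce iterators would otherwise double-count.
        val coalesceMetrics = allMetrics + ("numOutputRows" -> NoopMetric)
        coalesceMetrics("numOutputRows") += 10 // discarded by the no-op
        coalesceMetrics("opTime") += 5         // still recorded in the shared metric
        println(allMetrics("opTime").asInstanceOf[CountMetric].value) // prints 5
      }
    }
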
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala
index 9e9c496e8b4..54bf57e94fa 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala
@@ -237,6 +237,28 @@ class GpuTransitionOverrides extends Rule[SparkPlan] {
p.withNewChildren(p.children.map(optimizeCoalesce))
}
+ /**
+ * Removes `GpuCoalesceBatches(GpuShuffleCoalesceExec(build side))` from the build side
+ * of a shuffled hash join. The coalesce logic for that side is instead handled inside
+ * `GpuShuffledHashJoinExec`, in a way that avoids holding onto the GPU semaphore
+ * while waiting for stream-side IO.
+ */
+ def shuffledHashJoinOptimizeShuffle(plan: SparkPlan): SparkPlan = plan match {
+ case x@GpuShuffledHashJoinExec(
+ _, _, _, buildSide, _,
+ left: GpuShuffleCoalesceExec,
+ GpuCoalesceBatches(GpuShuffleCoalesceExec(rc, _), _),_) if buildSide == GpuBuildRight =>
+ x.withNewChildren(
+ Seq(shuffledHashJoinOptimizeShuffle(left), shuffledHashJoinOptimizeShuffle(rc)))
+ case x@GpuShuffledHashJoinExec(
+ _, _, _, buildSide, _,
+ GpuCoalesceBatches(GpuShuffleCoalesceExec(lc, _), _),
+ right: GpuShuffleCoalesceExec, _) if buildSide == GpuBuildLeft =>
+ x.withNewChildren(
+ Seq(shuffledHashJoinOptimizeShuffle(lc), shuffledHashJoinOptimizeShuffle(right)))
+ case p => p.withNewChildren(p.children.map(shuffledHashJoinOptimizeShuffle))
+ }
+
private def insertCoalesce(plans: Seq[SparkPlan], goals: Seq[CoalesceGoal],
disableUntilInput: Boolean): Seq[SparkPlan] = {
plans.zip(goals).map {
@@ -550,6 +572,9 @@ class GpuTransitionOverrides extends Rule[SparkPlan] {
}
updatedPlan = fixupHostColumnarTransitions(updatedPlan)
updatedPlan = optimizeCoalesce(updatedPlan)
+ if (rapidsConf.shuffledHashJoinOptimizeShuffle) {
+ updatedPlan = shuffledHashJoinOptimizeShuffle(updatedPlan)
+ }
if (rapidsConf.exportColumnarRdd) {
updatedPlan = detectAndTagFinalColumnarOutput(updatedPlan)
}
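
`shuffledHashJoinOptimizeShuffle` above is a standard bottom-up plan rewrite: match a node shape, splice out an intermediate wrapper on one child, and recurse everywhere else via `withNewChildren`. A toy illustration of that shape-matching rewrite with made-up node types (not Spark's `SparkPlan` API):

    object PlanRewriteSketch {
      sealed trait Node {
        def children: Seq[Node]
        def withChildren(c: Seq[Node]): Node
      }
      case class Leaf(name: String) extends Node {
        def children: Seq[Node] = Nil
        def withChildren(c: Seq[Node]): Node = this
      }
      case class Coalesce(child: Node) extends Node {
        def children: Seq[Node] = Seq(child)
        def withChildren(c: Seq[Node]): Node = Coalesce(c.head)
      }
      case class Join(left: Node, right: Node) extends Node {
        def children: Seq[Node] = Seq(left, right)
        def withChildren(c: Seq[Node]): Node = Join(c(0), c(1))
      }

      // If the (build) right child is wrapped in a Coalesce, splice it out; otherwise recurse.
      def dropBuildSideCoalesce(plan: Node): Node = plan match {
        case Join(l, Coalesce(r)) => Join(dropBuildSideCoalesce(l), dropBuildSideCoalesce(r))
        case p => p.withChildren(p.children.map(dropBuildSideCoalesce))
      }

      def main(args: Array[String]): Unit =
        // prints Join(Leaf(stream),Leaf(build))
        println(dropBuildSideCoalesce(Join(Leaf("stream"), Coalesce(Leaf("build")))))
    }
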
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
index 93ce85ce3e1..16ff8eb02f2 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
@@ -21,7 +21,9 @@ import java.util.Properties
import java.util.concurrent.atomic.{AtomicBoolean, AtomicReference}
import scala.collection.JavaConverters._
+import scala.collection.mutable.{Map => MutableMap}
import scala.util.Try
+import scala.util.matching.Regex
import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
@@ -391,25 +393,33 @@ object ExecutionPlanCaptureCallback {
executedPlan.expressions.exists(didFallBack(_, fallbackCpuClass))
}
- private def containsExpression(exp: Expression, className: String): Boolean = exp.find {
+ private def containsExpression(exp: Expression, className: String,
+ regexMap: MutableMap[String, Regex] // regex memoization
+ ): Boolean = exp.find {
case e if PlanUtils.getBaseNameFromClass(e.getClass.getName) == className => true
- case e: ExecSubqueryExpression => containsPlan(e.plan, className)
+ case e: ExecSubqueryExpression => containsPlan(e.plan, className, regexMap)
case _ => false
}.nonEmpty
- private def containsPlan(plan: SparkPlan, className: String): Boolean = plan.find {
+ private def containsPlan(plan: SparkPlan, className: String,
+ regexMap: MutableMap[String, Regex] = MutableMap.empty // regex memoization
+ ): Boolean = plan.find {
case p if PlanUtils.sameClass(p, className) =>
true
case p: AdaptiveSparkPlanExec =>
- containsPlan(p.executedPlan, className)
+ containsPlan(p.executedPlan, className, regexMap)
case p: QueryStageExec =>
- containsPlan(p.plan, className)
+ containsPlan(p.plan, className, regexMap)
case p: ReusedSubqueryExec =>
- containsPlan(p.child, className)
+ containsPlan(p.child, className, regexMap)
case p: ReusedExchangeExec =>
- containsPlan(p.child, className)
- case p =>
- p.expressions.exists(containsExpression(_, className))
+ containsPlan(p.child, className, regexMap)
+ case p if p.expressions.exists(containsExpression(_, className, regexMap)) =>
+ true
+ case p: SparkPlan =>
+ regexMap.getOrElseUpdate(className, className.r)
+ .findFirstIn(p.simpleStringWithNodeId())
+ .nonEmpty
}.nonEmpty
}
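
The `regexMap` parameter threaded through `containsPlan` and `containsExpression` above memoizes compiled patterns with `getOrElseUpdate`, so a class-name pattern is compiled at most once per lookup rather than on every plan node visited. A self-contained sketch of that memoization (the node strings are made up):

    import scala.collection.mutable.{Map => MutableMap}
    import scala.util.matching.Regex

    object RegexMemoSketch {
      def main(args: Array[String]): Unit = {
        val regexMap: MutableMap[String, Regex] = MutableMap.empty
        val planNodes = Seq("GpuFilter (1)", "GpuProject (2)", "GpuFilter (3)")
        val matches = planNodes.count { node =>
          // compile "GpuFilter" once, reuse the cached Regex on later visits
          regexMap.getOrElseUpdate("GpuFilter", "GpuFilter".r).findFirstIn(node).nonEmpty
        }
        println(s"matched $matches nodes using ${regexMap.size} compiled pattern(s)")
      }
    }
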
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala
index 222207c1151..f7730e84f99 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala
@@ -299,4 +299,4 @@ object RapidsBufferCatalog extends Logging with Arm {
/** Remove a buffer ID from the catalog and release the resources of the registered buffer. */
def removeBuffer(id: RapidsBufferId): Unit = singleton.removeBuffer(id)
-}
\ No newline at end of file
+}
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala
index 92847350df6..e9ac9c1235e 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -70,11 +70,17 @@ abstract class RapidsBufferStore(
}
}
- def freeAll(): Unit = synchronized {
- val values = buffers.values().toArray(new Array[RapidsBufferBase](0))
+ def freeAll(): Unit = {
+ val values = synchronized {
+ val buffs = buffers.values().toArray(new Array[RapidsBufferBase](0))
+ buffers.clear()
+ spillable.clear()
+ buffs
+ }
+ // Free the buffers without holding the `RapidsBufferStore` lock to prevent a
+ // lock-order-inversion deadlock: `RapidsBufferBase.free` takes the buffer lock and then
+ // calls `RapidsBufferStore.remove` (store lock), while `freeAll` would otherwise hold
+ // the store lock while calling `RapidsBufferBase.free` (buffer lock).
values.foreach(_.free())
- buffers.clear()
- spillable.clear()
}
def nextSpillableBuffer(): RapidsBufferBase = synchronized {
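
The `freeAll` change above avoids a lock-order-inversion deadlock by snapshotting and clearing the tracking collections while holding the store lock, then calling `free()` on the snapshot after the lock is released. A minimal sketch of the same pattern with hypothetical `Store`/`Item` classes:

    object FreeAllSketch {
      class Item(store: Store) {
        // item lock -> store lock
        def free(): Unit = synchronized { store.remove(this) }
      }

      class Store {
        private val items = scala.collection.mutable.Set.empty[Item]
        def add(i: Item): Unit = synchronized { items += i }
        def remove(i: Item): Unit = synchronized { items -= i }

        def freeAll(): Unit = {
          // Snapshot and clear while holding only the store lock...
          val snapshot = synchronized {
            val s = items.toList
            items.clear()
            s
          }
          // ...then free outside the store lock, so a concurrent item.free()
          // (item lock -> store lock) cannot deadlock against freeAll
          // (store lock -> item lock).
          snapshot.foreach(_.free())
        }
      }

      def main(args: Array[String]): Unit = {
        val store = new Store
        store.add(new Item(store))
        store.freeAll()
        println("freed without deadlock")
      }
    }
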
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index aec84a1f4a6..4042a466cbc 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -471,6 +471,16 @@ object RapidsConf {
.booleanConf
.createWithDefault(false)
+ val SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE =
+ conf("spark.rapids.sql.shuffledHashJoin.optimizeShuffle")
+ .doc("Enable or disable an optimization where shuffled build side batches are kept " +
+ "on the host while the first stream batch is loaded onto the GPU. The optimization " +
+ "increases off-heap host memory usage to avoid holding onto the GPU semaphore while " +
+ "waiting for stream side IO.")
+ .internal()
+ .booleanConf
+ .createWithDefault(true)
+
val STABLE_SORT = conf("spark.rapids.sql.stableSort.enabled")
.doc("Enable or disable stable sorting. Apache Spark's sorting is typically a stable " +
"sort, but sort stability cannot be guaranteed in distributed work loads because the " +
@@ -1484,6 +1494,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {
lazy val exportColumnarRdd: Boolean = get(EXPORT_COLUMNAR_RDD)
+ lazy val shuffledHashJoinOptimizeShuffle: Boolean = get(SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE)
+
lazy val stableSort: Boolean = get(STABLE_SORT)
lazy val isIncompatEnabled: Boolean = get(INCOMPATIBLE_OPS)
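
The new `spark.rapids.sql.shuffledHashJoin.optimizeShuffle` conf above is marked `internal()` and defaults to true; the test changes later in this patch toggle it explicitly. A hypothetical way to flip it from a SparkSession, shown only to illustrate the key name (the local session setup is a stand-in, and a GPU plus the plugin are still required for the setting to matter):

    import org.apache.spark.sql.SparkSession

    object ToggleShuffleOptSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").getOrCreate()
        // Disable the build-side host buffering optimization for this session.
        spark.conf.set("spark.rapids.sql.shuffledHashJoin.optimizeShuffle", "false")
        println(spark.conf.get("spark.rapids.sql.shuffledHashJoin.optimizeShuffle"))
        spark.stop()
      }
    }
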
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
index ac3705f7004..09dfdfc1869 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,7 +20,6 @@ import java.util.concurrent.{Executors, ScheduledExecutorService, TimeUnit}
import scala.collection.mutable.ArrayBuffer
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import org.apache.commons.lang3.mutable.MutableLong
import org.apache.spark.SparkEnv
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index 79db0286d8e..002c8b3f04b 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -404,6 +404,30 @@ class RegexParser(pattern: String) {
}
+object RegexParser {
+ private val regexpChars = Set('\u0000', '\\', '.', '^', '$', '\f')
+
+ def isRegExpString(s: String): Boolean = {
+
+ def isRegExpString(ast: RegexAST): Boolean = ast match {
+ case RegexChar(ch) => regexpChars.contains(ch)
+ case RegexEscaped(_) => true
+ case RegexSequence(parts) => parts.exists(isRegExpString)
+ case _ => true
+ }
+
+ try {
+ val parser = new RegexParser(s)
+ val ast = parser.parse()
+ isRegExpString(ast)
+ } catch {
+ case _: RegexUnsupportedException =>
+ // if we cannot parse the pattern, conservatively assume it might be a valid regexp
+ true
+ }
+ }
+}
+
/**
* Transpile Java/Spark regular expression to a format that cuDF supports, or throw an exception
* if this is not possible.
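
`RegexParser.isRegExpString` above parses the pattern into an AST and reports whether any part of it needs real regex semantics, falling back to `true` when the pattern cannot be parsed. The standalone sketch below is a simpler character-level approximation of the same question, using the metacharacter set from the diff; it is not the plugin's implementation.

    object LiteralSplitSketch {
      private val regexpMetaChars = Set('\u0000', '\\', '.', '^', '$', '\f')

      // True if the pattern needs regex handling; false if it can be treated as a literal.
      def looksLikeRegex(pattern: String): Boolean = pattern.exists(regexpMetaChars.contains)

      def main(args: Array[String]): Unit = {
        println(looksLikeRegex(","))   // false: plain delimiter, safe to split literally
        println(looksLikeRegex("a.b")) // true: '.' is a metacharacter
      }
    }
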
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/execution/datasources/rapids/OrcFiltersWrapper.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/execution/datasources/rapids/OrcFiltersWrapper.scala
new file mode 100644
index 00000000000..65792c76c82
--- /dev/null
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/execution/datasources/rapids/OrcFiltersWrapper.scala
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.rapids
+
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument
+
+import org.apache.spark.sql.execution.datasources.orc.OrcFilters
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+
+// Wrapper for Spark's OrcFilters, which lives in a Spark-private package
+object OrcFiltersWrapper {
+ def createFilter(schema: StructType, filters: Seq[Filter]): Option[SearchArgument] = {
+ OrcFilters.createFilter(schema, filters)
+ }
+}
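
`OrcFiltersWrapper` above works because it is compiled into `org.apache.spark.sql.execution.datasources.rapids`, inside the Spark package tree where the package-private `OrcFilters` is visible. A generic sketch of that package-placement trick with made-up packages (not Spark's), compiled as one source file:

    package org.example.spark {
      // Only visible from the org.example.spark package tree.
      private[spark] object Hidden {
        def createThing(n: Int): String = s"thing-$n"
      }

      // Public re-export compiled inside the owning package, mirroring OrcFiltersWrapper.
      object HiddenWrapper {
        def createThing(n: Int): String = Hidden.createThing(n)
      }
    }

    package org.example.app {
      object Main {
        def main(args: Array[String]): Unit =
          println(org.example.spark.HiddenWrapper.createThing(1)) // prints thing-1
      }
    }
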
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFilters.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFilters.scala
deleted file mode 100644
index 2dd9973cafd..00000000000
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFilters.scala
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.rapids
-
-import java.time.{Instant, LocalDate}
-
-import org.apache.hadoop.hive.common.`type`.HiveDecimal
-import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument}
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory.newBuilder
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable
-
-import org.apache.spark.SparkException
-import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateToDays, toJavaDate, toJavaTimestamp}
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.sources.Filter
-import org.apache.spark.sql.types._
-
-// This is derived from Apache Spark's OrcFilters code to avoid calling the
-// Spark version. Spark's version can potentially create a search argument
-// applier object that is incompatible with the orc:nohive jar that has been
-// shaded as part of this project.
-
-/**
- * Helper object for building ORC `SearchArgument`s, which are used for ORC predicate push-down.
- *
- * Due to limitation of ORC `SearchArgument` builder, we had to implement separate checking and
- * conversion passes through the Filter to make sure we only convert predicates that are known
- * to be convertible.
- *
- * An ORC `SearchArgument` must be built in one pass using a single builder. For example, you can't
- * build `a = 1` and `b = 2` first, and then combine them into `a = 1 AND b = 2`. This is quite
- * different from the cases in Spark SQL or Parquet, where complex filters can be easily built using
- * existing simpler ones.
- *
- * The annoying part is that, `SearchArgument` builder methods like `startAnd()`, `startOr()`, and
- * `startNot()` mutate internal state of the builder instance. This forces us to translate all
- * convertible filters with a single builder instance. However, if we try to translate a filter
- * before checking whether it can be converted or not, we may end up with a builder whose internal
- * state is inconsistent in the case of an inconvertible filter.
- *
- * For example, to convert an `And` filter with builder `b`, we call `b.startAnd()` first, and then
- * try to convert its children. Say we convert `left` child successfully, but find that `right`
- * child is inconvertible. Alas, `b.startAnd()` call can't be rolled back, and `b` is inconsistent
- * now.
- *
- * The workaround employed here is to trim the Spark filters before trying to convert them. This
- * way, we can only do the actual conversion on the part of the Filter that is known to be
- * convertible.
- *
- * P.S.: Hive seems to use `SearchArgument` together with `ExprNodeGenericFuncDesc` only. Usage of
- * builder methods mentioned above can only be found in test code, where all tested filters are
- * known to be convertible.
- */
-object OrcFilters extends OrcFiltersBase {
-
- /**
- * Create ORC filter as a SearchArgument instance.
- */
- def createFilter(schema: StructType, filters: Seq[Filter]): Option[SearchArgument] = {
- val dataTypeMap = OrcFilters.getSearchableTypeMap(schema, SQLConf.get.caseSensitiveAnalysis)
- // Combines all convertible filters using `And` to produce a single conjunction
- val conjunctionOptional = buildTree(convertibleFilters(dataTypeMap, filters))
- conjunctionOptional.map { conjunction =>
- // Then tries to build a single ORC `SearchArgument` for the conjunction predicate.
- // The input predicate is fully convertible. There should not be any empty result in the
- // following recursive method call `buildSearchArgument`.
- buildSearchArgument(dataTypeMap, conjunction, newBuilder).build()
- }
- }
-
- def convertibleFilters(
- dataTypeMap: Map[String, OrcPrimitiveField],
- filters: Seq[Filter]): Seq[Filter] = {
- import org.apache.spark.sql.sources._
-
- def convertibleFiltersHelper(
- filter: Filter,
- canPartialPushDown: Boolean): Option[Filter] = filter match {
- // At here, it is not safe to just convert one side and remove the other side
- // if we do not understand what the parent filters are.
- //
- // Here is an example used to explain the reason.
- // Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to
- // convert b in ('1'). If we only convert a = 2, we will end up with a filter
- // NOT(a = 2), which will generate wrong results.
- //
- // Pushing one side of AND down is only safe to do at the top level or in the child
- // AND before hitting NOT or OR conditions, and in this case, the unsupported predicate
- // can be safely removed.
- case And(left, right) =>
- val leftResultOptional = convertibleFiltersHelper(left, canPartialPushDown)
- val rightResultOptional = convertibleFiltersHelper(right, canPartialPushDown)
- (leftResultOptional, rightResultOptional) match {
- case (Some(leftResult), Some(rightResult)) => Some(And(leftResult, rightResult))
- case (Some(leftResult), None) if canPartialPushDown => Some(leftResult)
- case (None, Some(rightResult)) if canPartialPushDown => Some(rightResult)
- case _ => None
- }
-
- // The Or predicate is convertible when both of its children can be pushed down.
- // That is to say, if one/both of the children can be partially pushed down, the Or
- // predicate can be partially pushed down as well.
- //
- // Here is an example used to explain the reason.
- // Let's say we have
- // (a1 AND a2) OR (b1 AND b2),
- // a1 and b1 is convertible, while a2 and b2 is not.
- // The predicate can be converted as
- // (a1 OR b1) AND (a1 OR b2) AND (a2 OR b1) AND (a2 OR b2)
- // As per the logical in And predicate, we can push down (a1 OR b1).
- case Or(left, right) =>
- for {
- lhs <- convertibleFiltersHelper(left, canPartialPushDown)
- rhs <- convertibleFiltersHelper(right, canPartialPushDown)
- } yield Or(lhs, rhs)
- case Not(pred) =>
- val childResultOptional = convertibleFiltersHelper(pred, canPartialPushDown = false)
- childResultOptional.map(Not)
- case other =>
- for (_ <- buildLeafSearchArgument(dataTypeMap, other, newBuilder())) yield other
- }
- filters.flatMap { filter =>
- convertibleFiltersHelper(filter, true)
- }
- }
-
- /**
- * Get PredicateLeafType which is corresponding to the given DataType.
- */
- def getPredicateLeafType(dataType: DataType): PredicateLeaf.Type = dataType match {
- case BooleanType => PredicateLeaf.Type.BOOLEAN
- case ByteType | ShortType | IntegerType | LongType => PredicateLeaf.Type.LONG
- case FloatType | DoubleType => PredicateLeaf.Type.FLOAT
- case StringType => PredicateLeaf.Type.STRING
- case DateType => PredicateLeaf.Type.DATE
- case TimestampType => PredicateLeaf.Type.TIMESTAMP
- case _: DecimalType => PredicateLeaf.Type.DECIMAL
- case _ => throw new UnsupportedOperationException(s"DataType: ${dataType.catalogString}")
- }
-
- /**
- * Cast literal values for filters.
- *
- * We need to cast to long because ORC raises exceptions
- * at 'checkLiteralType' of SearchArgumentImpl.java.
- */
- private def castLiteralValue(value: Any, dataType: DataType): Any = dataType match {
- case ByteType | ShortType | IntegerType | LongType =>
- value.asInstanceOf[Number].longValue
- case FloatType | DoubleType =>
- value.asInstanceOf[Number].doubleValue()
- case _: DecimalType =>
- new HiveDecimalWritable(HiveDecimal.create(value.asInstanceOf[java.math.BigDecimal]))
- case _: DateType if value.isInstanceOf[LocalDate] =>
- toJavaDate(localDateToDays(value.asInstanceOf[LocalDate]))
- case _: TimestampType if value.isInstanceOf[Instant] =>
- toJavaTimestamp(instantToMicros(value.asInstanceOf[Instant]))
- case _ => value
- }
-
- /**
- * Build a SearchArgument and return the builder so far.
- *
- * @param dataTypeMap a map from the attribute name to its data type.
- * @param expression the input predicates, which should be fully convertible to SearchArgument.
- * @param builder the input SearchArgument.Builder.
- * @return the builder so far.
- */
- private def buildSearchArgument(
- dataTypeMap: Map[String, OrcPrimitiveField],
- expression: Filter,
- builder: Builder): Builder = {
- import org.apache.spark.sql.sources._
-
- expression match {
- case And(left, right) =>
- val lhs = buildSearchArgument(dataTypeMap, left, builder.startAnd())
- val rhs = buildSearchArgument(dataTypeMap, right, lhs)
- rhs.end()
-
- case Or(left, right) =>
- val lhs = buildSearchArgument(dataTypeMap, left, builder.startOr())
- val rhs = buildSearchArgument(dataTypeMap, right, lhs)
- rhs.end()
-
- case Not(child) =>
- buildSearchArgument(dataTypeMap, child, builder.startNot()).end()
-
- case other =>
- buildLeafSearchArgument(dataTypeMap, other, builder).getOrElse {
- throw new SparkException(
- "The input filter of OrcFilters.buildSearchArgument should be fully convertible.")
- }
- }
- }
-
- /**
- * Build a SearchArgument for a leaf predicate and return the builder so far.
- *
- * @param dataTypeMap a map from the attribute name to its data type.
- * @param expression the input filter predicates.
- * @param builder the input SearchArgument.Builder.
- * @return the builder so far.
- */
- private def buildLeafSearchArgument(
- dataTypeMap: Map[String, OrcPrimitiveField],
- expression: Filter,
- builder: Builder): Option[Builder] = {
- def getType(attribute: String): PredicateLeaf.Type =
- getPredicateLeafType(dataTypeMap(attribute).fieldType)
-
- import org.apache.spark.sql.sources._
-
- // NOTE: For all case branches dealing with leaf predicates below, the additional `startAnd()`
- // call is mandatory. ORC `SearchArgument` builder requires that all leaf predicates must be
- // wrapped by a "parent" predicate (`And`, `Or`, or `Not`).
- expression match {
- case EqualTo(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startAnd()
- .equals(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case EqualNullSafe(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startAnd()
- .nullSafeEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case LessThan(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startAnd()
- .lessThan(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case LessThanOrEqual(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startAnd()
- .lessThanEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case GreaterThan(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startNot()
- .lessThanEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case GreaterThanOrEqual(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startNot()
- .lessThan(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case IsNull(name) if dataTypeMap.contains(name) =>
- Some(builder.startAnd()
- .isNull(dataTypeMap(name).fieldName, getType(name)).end())
-
- case IsNotNull(name) if dataTypeMap.contains(name) =>
- Some(builder.startNot()
- .isNull(dataTypeMap(name).fieldName, getType(name)).end())
-
- case In(name, values) if dataTypeMap.contains(name) =>
- val castedValues = values.map(v => castLiteralValue(v, dataTypeMap(name).fieldType))
- Some(builder.startAnd().in(dataTypeMap(name).fieldName, getType(name),
- castedValues.map(_.asInstanceOf[AnyRef]): _*).end())
-
- case _ => None
- }
- }
-}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFiltersBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFiltersBase.scala
deleted file mode 100644
index d4fb2f260d6..00000000000
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFiltersBase.scala
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.rapids
-
-import java.util.Locale
-
-import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
-import org.apache.spark.sql.sources.{And, Filter}
-import org.apache.spark.sql.types.{AtomicType, BinaryType, DataType, StructField, StructType}
-
-/**
- * Methods that can be shared when upgrading the built-in Hive.
- *
- * Derived from Apache Spark to avoid depending upon it directly,
- * since its API has changed between Spark versions.
- */
-trait OrcFiltersBase {
-
- private[sql] def buildTree(filters: Seq[Filter]): Option[Filter] = {
- filters match {
- case Seq() => None
- case Seq(filter) => Some(filter)
- case Seq(filter1, filter2) => Some(And(filter1, filter2))
- case _ => // length > 2
- val (left, right) = filters.splitAt(filters.length / 2)
- Some(And(buildTree(left).get, buildTree(right).get))
- }
- }
-
- case class OrcPrimitiveField(fieldName: String, fieldType: DataType)
-
- /**
- * This method returns a map which contains ORC field name and data type. Each key
- * represents a column; `dots` are used as separators for nested columns. If any part
- * of the names contains `dots`, it is quoted to avoid confusion. See
- * `org.apache.spark.sql.connector.catalog.quoted` for implementation details.
- *
- * BinaryType, UserDefinedType, ArrayType and MapType are ignored.
- */
- protected[sql] def getSearchableTypeMap(
- schema: StructType,
- caseSensitive: Boolean): Map[String, OrcPrimitiveField] = {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper
-
- def getPrimitiveFields(
- fields: Seq[StructField],
- parentFieldNames: Seq[String] = Seq.empty): Seq[(String, OrcPrimitiveField)] = {
- fields.flatMap { f =>
- f.dataType match {
- case st: StructType =>
- getPrimitiveFields(st.fields, parentFieldNames :+ f.name)
- case BinaryType => None
- case _: AtomicType =>
- val fieldName = (parentFieldNames :+ f.name).quoted
- val orcField = OrcPrimitiveField(fieldName, f.dataType)
- Some((fieldName, orcField))
- case _ => None
- }
- }
- }
-
- val primitiveFields = getPrimitiveFields(schema.fields)
- if (caseSensitive) {
- primitiveFields.toMap
- } else {
- // Don't consider ambiguity here, i.e. more than one field are matched in case insensitive
- // mode, just skip pushdown for these fields, they will trigger Exception when reading,
- // See: SPARK-25175.
- val dedupPrimitiveFields = primitiveFields
- .groupBy(_._1.toLowerCase(Locale.ROOT))
- .filter(_._2.size == 1)
- .mapValues(_.head._2)
- CaseInsensitiveMap(dedupPrimitiveFields)
- }
- }
-}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala
index 3bf981d2c63..76ab58f140b 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala
@@ -171,9 +171,8 @@ case class GpuElementAt(left: Expression, right: Expression, failOnError: Boolea
if (!exist.isValid || exist.getBoolean) {
lhs.getBase.getMapValue(rhs.getBase)
} else {
- throw new NoSuchElementException(
- s"Key: ${rhs.getValue.asInstanceOf[UTF8String].toString} " +
- s"does not exist in one of the rows in the map column")
+ RapidsErrorUtils.throwInvalidElementAtIndexError(
+ rhs.getValue.asInstanceOf[UTF8String].toString, true)
}
}
}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeExtractors.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeExtractors.scala
index 8dd0635c988..14d152139eb 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeExtractors.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeExtractors.scala
@@ -183,9 +183,8 @@ case class GpuGetMapValue(child: Expression, key: Expression, failOnError: Boole
withResource(lhs.getBase.getMapKeyExistence(rhs.getBase)) { keyExistenceColumn =>
withResource(keyExistenceColumn.all) { exist =>
if (exist.isValid && !exist.getBoolean) {
- throw new NoSuchElementException(
- s"Key: ${rhs.getValue.asInstanceOf[UTF8String].toString} " +
- s"does not exist in any one of the rows in the map column")
+ RapidsErrorUtils.throwInvalidElementAtIndexError(
+ rhs.getValue.asInstanceOf[UTF8String].toString)
}
}
}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastExchangeExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastExchangeExec.scala
index a3ba5724c66..f3e5737c0ca 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastExchangeExec.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastExchangeExec.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -26,7 +26,6 @@ import scala.util.control.NonFatal
import ai.rapids.cudf.{HostMemoryBuffer, JCudfSerialization, NvtxColor, NvtxRange}
import ai.rapids.cudf.JCudfSerialization.SerializedTableHeader
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.GpuMetric._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
index 5bcd9826028..827e1d75634 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.rapids
import scala.collection.mutable.ArrayBuffer
-import ai.rapids.cudf.{ColumnVector, ColumnView, DType, PadSide, Scalar, Table}
+import ai.rapids.cudf.{BinaryOp, ColumnVector, ColumnView, DType, PadSide, Scalar, Table}
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.shims.v2.ShimExpression
@@ -60,6 +60,32 @@ case class GpuLength(child: Expression) extends GpuUnaryExpression with ExpectsI
input.getBase.getCharLengths()
}
+case class GpuBitLength(child: Expression) extends GpuUnaryExpression with ExpectsInputTypes {
+
+ override def dataType: DataType = IntegerType
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+ override def toString: String = s"bit_length($child)"
+
+ override def doColumnar(input: GpuColumnVector): ColumnVector = {
+ withResource(input.getBase.getByteCount) { byteCnt =>
+ // bit count = byte count * 8
+ withResource(GpuScalar.from(3, IntegerType)) { factor =>
+ byteCnt.binaryOp(BinaryOp.SHIFT_LEFT, factor, DType.INT32)
+ }
+ }
+ }
+}
+
+case class GpuOctetLength(child: Expression) extends GpuUnaryExpression with ExpectsInputTypes {
+
+ override def dataType: DataType = IntegerType
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+ override def toString: String = s"octet_length($child)"
+
+ override def doColumnar(input: GpuColumnVector): ColumnVector =
+ input.getBase.getByteCount
+}
+
case class GpuStringLocate(substr: Expression, col: Expression, start: Expression)
extends GpuTernaryExpression with ImplicitCastInputTypes {
@@ -1296,7 +1322,7 @@ class GpuStringSplitMeta(
} else {
val str = regexp.get.value.asInstanceOf[UTF8String]
if (str != null) {
- if (!canRegexpBeTreatedLikeARegularString(str)) {
+ if (RegexParser.isRegExpString(str.toString)) {
willNotWorkOnGpu("regular expressions are not supported yet")
}
if (str.numChars() == 0) {
@@ -1320,7 +1346,7 @@ class GpuStringSplitMeta(
case class GpuStringSplit(str: Expression, regex: Expression, limit: Expression)
extends GpuTernaryExpression with ImplicitCastInputTypes {
- override def dataType: DataType = ArrayType(StringType)
+ override def dataType: DataType = ArrayType(StringType, containsNull = false)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
override def first: Expression = str
override def second: Expression = regex
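
`GpuBitLength` above computes the bit length by shifting the per-row UTF-8 byte count left by 3 (a multiply by 8) with a cuDF `SHIFT_LEFT` binary op, while `GpuOctetLength` returns the byte count directly. A plain-JVM sanity sketch of the same arithmetic on single strings:

    object BitLengthSketch {
      def octetLength(s: String): Int = s.getBytes("UTF-8").length
      def bitLength(s: String): Int = octetLength(s) << 3 // byte count * 8

      def main(args: Array[String]): Unit = {
        println(octetLength("abc")) // 3
        println(bitLength("abc"))   // 24
        println(bitLength("é"))     // 16: two UTF-8 bytes
      }
    }
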
diff --git a/tests/pom.xml b/tests/pom.xml
index 356d2b18156..3329f0dee7c 100644
--- a/tests/pom.xml
+++ b/tests/pom.xml
@@ -54,12 +54,6 @@
             <classifier>${cuda.version}</classifier>
             <scope>provided</scope>
         </dependency>
-        <dependency>
-            <groupId>com.nvidia</groupId>
-            <artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
-            <version>${project.version}</version>
-            <scope>provided</scope>
-        </dependency>
         <dependency>
             <groupId>com.nvidia</groupId>
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
index 5db550ba083..78913447dae 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
@@ -177,34 +177,54 @@ class AdaptiveQueryExecSuite
spark.sql("INSERT INTO TABLE t1 SELECT a, b FROM testData").collect()
spark.sql("INSERT INTO TABLE t2 SELECT a, b FROM testData").collect()
- val df = spark.sql(
- "SELECT t1.a, t2.b " +
+ // This test checks that inputs to the SHJ are coalesced. We need to check both sides
+ // if we are not optimizing the build-side coalescing logic, and only the stream side
+ // if the optimization is enabled (default).
+ // See `RapidsConf.SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE` for more information.
+ Seq(true, false).foreach { shouldOptimizeHashJoinShuffle =>
+ spark.conf.set(
+ RapidsConf.SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE.key,
+ shouldOptimizeHashJoinShuffle.toString)
+ val df = spark.sql(
+ "SELECT t1.a, t2.b " +
"FROM t1 " +
"JOIN t2 " +
"ON t1.a = t2.a " +
"WHERE t2.a = 5" // filter on partition key to force dynamic partition pruning
- )
- df.collect()
+ )
+ df.collect()
- val isAdaptiveQuery = df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]
- if (cmpSparkVersion(3, 2, 0) < 0) {
- // assert that DPP did cause this to run as a non-AQE plan prior to Spark 3.2.0
- assert(!isAdaptiveQuery)
- } else {
- // In 3.2.0 AQE works with DPP
- assert(isAdaptiveQuery)
+ val isAdaptiveQuery = df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]
+ if (cmpSparkVersion(3, 2, 0) < 0) {
+ // assert that DPP did cause this to run as a non-AQE plan prior to Spark 3.2.0
+ assert(!isAdaptiveQuery)
+ } else {
+ // In 3.2.0 AQE works with DPP
+ assert(isAdaptiveQuery)
+ }
+
+ val shj = TestUtils.findOperator(df.queryExecution.executedPlan,
+ _.isInstanceOf[GpuShuffledHashJoinExec]).get
+ .asInstanceOf[GpuShuffledHashJoinExec]
+ assert(shj.children.length == 2)
+ val childrenToCheck = if (shouldOptimizeHashJoinShuffle) {
+ // assert that the stream side of SHJ is coalesced
+ shj.buildSide match {
+ case GpuBuildLeft => Seq(shj.right)
+ case GpuBuildRight => Seq(shj.left)
+ }
+ } else {
+ // assert that both the build and stream side of SHJ are coalesced
+ // if we are not optimizing the build side shuffle
+ shj.children
+ }
+ assert(childrenToCheck.forall {
+ case GpuShuffleCoalesceExec(_, _) => true
+ case GpuCoalesceBatches(GpuShuffleCoalesceExec(_, _), _) => true
+ case _ => false
+ })
}
- // assert that both inputs to the SHJ are coalesced
- val shj = TestUtils.findOperator(df.queryExecution.executedPlan,
- _.isInstanceOf[GpuShuffledHashJoinExec]).get
- assert(shj.children.length == 2)
- assert(shj.children.forall {
- case GpuShuffleCoalesceExec(_, _) => true
- case GpuCoalesceBatches(GpuShuffleCoalesceExec(_, _), _) => true
- case _ => false
- })
-
}, conf)
}
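The reworked DPP test above asserts coalescing on one or both inputs of the shuffled hash join, depending on `RapidsConf.SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE`. A hedged helper sketch of the child-selection logic the test encodes, assuming the same imports the suite already uses for `GpuBuildLeft`/`GpuBuildRight`:

```scala
import org.apache.spark.sql.execution.SparkPlan

// Pick the children that should be wrapped in a shuffle-coalesce node:
// only the stream side when the optimization is enabled, both sides otherwise.
def childrenExpectedToCoalesce(
    shj: GpuShuffledHashJoinExec,
    optimizeShuffle: Boolean): Seq[SparkPlan] = {
  if (optimizeShuffle) {
    shj.buildSide match {
      case GpuBuildLeft => Seq(shj.right)  // build is left, so the stream side is right
      case GpuBuildRight => Seq(shj.left)
    }
  } else {
    shj.children
  }
}
```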
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesSuite.scala
index bf29a411989..d5332628c5b 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -475,9 +475,9 @@ class GpuCoalesceBatchesSuite extends SparkQueryCompareTestSuite {
val schema = new StructType().add("i", LongType)
.add("j", DecimalType(ai.rapids.cudf.DType.DECIMAL64_MAX_PRECISION, 3))
val dummyMetric = WrappedGpuMetric(new SQLMetric("ignored"))
- val coalesceIter = new GpuCoalesceIterator(
+ val coalesceIter = new GpuCompressionAwareCoalesceIterator(
batchIter,
- schema,
+ GpuColumnVector.extractTypes(schema),
TargetSize(coalesceTargetBytes),
maxCompressedBatchMemoryLimit,
dummyMetric,
@@ -559,9 +559,9 @@ class GpuCoalesceBatchesSuite extends SparkQueryCompareTestSuite {
val schema = new StructType().add("i", LongType)
.add("j", DecimalType(ai.rapids.cudf.DType.DECIMAL64_MAX_PRECISION, 3))
val dummyMetric = WrappedGpuMetric(new SQLMetric("ignored"))
- val coalesceIter = new GpuCoalesceIterator(
+ val coalesceIter = new GpuCompressionAwareCoalesceIterator(
batchIter,
- schema,
+ GpuColumnVector.extractTypes(schema),
TargetSize(coalesceTargetBytes),
maxCompressedBatchMemoryLimit,
dummyMetric,
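The coalesce-iterator tests now construct a `GpuCompressionAwareCoalesceIterator` from an `Array[DataType]` (via `GpuColumnVector.extractTypes`) rather than the `StructType` itself. For the schema used in these tests, that extraction is presumably equivalent to the plain Spark operation below (18 stands in for `DType.DECIMAL64_MAX_PRECISION`):

```scala
import org.apache.spark.sql.types.{DataType, DecimalType, LongType, StructType}

val schema = new StructType()
  .add("i", LongType)
  .add("j", DecimalType(18, 3))  // 18 assumed equal to DECIMAL64_MAX_PRECISION

// per-field data types, in schema order
val dataTypes: Array[DataType] = schema.fields.map(_.dataType)
// dataTypes: Array(LongType, DecimalType(18,3))
```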
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala
new file mode 100644
index 00000000000..ee11387c9ac
--- /dev/null
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream}
+
+import ai.rapids.cudf.{ColumnVector, HostMemoryBuffer, JCudfSerialization, Table}
+import org.mockito.ArgumentMatchers._
+import org.mockito.Mockito._
+import org.scalatest.FunSuite
+import org.scalatest.mockito.MockitoSugar
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.types.IntegerType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+class GpuShuffledHashJoinExecSuite extends FunSuite with Arm with MockitoSugar {
+ val metricMap = mock[Map[String, GpuMetric]]
+ when(metricMap(any())).thenReturn(NoopMetric)
+
+ test("fallback with empty build iterator") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ val mockBuildIter = mock[Iterator[ColumnarBatch]]
+ when(mockBuildIter.hasNext).thenReturn(false)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 0,
+ Seq.empty,
+ mockBuildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ // we get an empty batch with no columns or rows
+ assertResult(builtBatch.numCols())(0)
+ assertResult(builtBatch.numRows())(0)
+ // 2 invocations, once in the `getBuiltBatchAndStreamIter`
+ // method, and a second one in `getSingleBatchWithVerification`
+ verify(mockBuildIter, times(2)).hasNext
+ verify(mockBuildIter, times(0)).next
+ verify(mockStreamIter, times(0)).hasNext
+ }
+ }
+ }
+
+ test("fallback with 0 column build batches") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ withResource(GpuColumnVector.emptyBatchFromTypes(Array.empty)) {
+ emptyBatch =>
+ val buildIter = mock[Iterator[ColumnarBatch]]
+ when(buildIter.hasNext).thenReturn(true, false)
+ val buildBufferedIter = mock[BufferedIterator[ColumnarBatch]]
+ when(buildBufferedIter.hasNext).thenReturn(true, false)
+ when(buildBufferedIter.head).thenReturn(emptyBatch)
+ when(buildBufferedIter.next).thenReturn(emptyBatch)
+ when(buildIter.buffered).thenReturn(buildBufferedIter)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 0,
+ Seq.empty,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ assertResult(builtBatch.numCols())(0)
+ assertResult(builtBatch.numRows())(0)
+ // 1 invocation in the `getBuiltBatchAndStreamIter`
+ // after which a buffered iterator is obtained and used for the fallback case
+ verify(buildIter, times(1)).hasNext
+ verify(buildIter, times(1)).buffered
+ // we ask the buffered iterator for `head` to inspect the number of columns
+ verify(buildBufferedIter, times(1)).head
+ // the buffered iterator is passed to `getSingleBatchWithVerification`,
+ // and that code calls hasNext twice
+ verify(buildBufferedIter, times(2)).hasNext
+ // and calls next to get that batch we buffered
+ verify(buildBufferedIter, times(1)).next
+ verify(mockStreamIter, times(0)).hasNext
+ }
+ }
+ }
+ }
+
+ test("fallback with a non-SerializedTableColumn 1 col and 0 rows") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ val emptyBatch = GpuColumnVector.emptyBatchFromTypes(Seq(IntegerType).toArray)
+ val buildIter = Seq(emptyBatch).iterator
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 0,
+ Seq.empty,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(0)
+ // 2 invocations, once in the `getBuiltBatchAndStreamIter`
+ // method, and one in `getSingleBatchWithVerification`
+ verify(mockStreamIter, times(0)).hasNext
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+
+ test("fallback with a non-SerialiedTableColumn") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol =>
+ val cv = GpuColumnVector.from(cudfCol, IntegerType)
+ val batch = new ColumnarBatch(Seq(cv).toArray, 5)
+ val buildIter = Seq(batch).iterator
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 0,
+ Seq.empty,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(5)
+ // 2 invocations, once in the `getBuiltBatchAndStreamIter`
+ // method, and one in `getSingleBatchWithVerification`
+ verify(mockStreamIter, times(0)).hasNext
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+
+ def getSerializedBatch(tbl: Table): ColumnarBatch = {
+ val outStream = new ByteArrayOutputStream()
+ JCudfSerialization.writeToStream(tbl, outStream, 0, tbl.getRowCount)
+ val dIn = new DataInputStream(new ByteArrayInputStream(outStream.toByteArray))
+ val header = new JCudfSerialization.SerializedTableHeader(dIn)
+ closeOnExcept(HostMemoryBuffer.allocate(header.getDataLen, false)) { hostBuffer =>
+ JCudfSerialization.readTableIntoBuffer(dIn, header, hostBuffer)
+ SerializedTableColumn.from(header, hostBuffer)
+ }
+ }
+
+ def getSerializedBatch(numRows: Int): ColumnarBatch = {
+ val outStream = new ByteArrayOutputStream()
+ JCudfSerialization.writeRowsToStream(outStream, numRows)
+ val dIn = new DataInputStream(new ByteArrayInputStream(outStream.toByteArray))
+ val header = new JCudfSerialization.SerializedTableHeader(dIn)
+ closeOnExcept(HostMemoryBuffer.allocate(header.getDataLen, false)) { hostBuffer =>
+ JCudfSerialization.readTableIntoBuffer(dIn, header, hostBuffer)
+ SerializedTableColumn.from(header, hostBuffer)
+ }
+ }
+
+ test("test a 0-column SerializedTableColumn") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ val serializedBatch = getSerializedBatch(5)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val mockBufferedStreamIterator = mock[BufferedIterator[ColumnarBatch]]
+ when(mockStreamIter.hasNext).thenReturn(true)
+ when(mockStreamIter.buffered).thenReturn(mockBufferedStreamIterator)
+ when(mockBufferedStreamIterator.hasNext).thenReturn(true)
+ closeOnExcept(serializedBatch) { _ =>
+ val buildIter = Seq(serializedBatch).iterator
+ val attrs = AttributeReference("a", IntegerType, false)() :: Nil
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 1024,
+ attrs,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ verify(mockBufferedStreamIterator, times(1)).hasNext
+ assertResult(builtBatch.numCols())(0)
+ assertResult(builtBatch.numRows())(5)
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+
+ test("test a SerializedTableColumn") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol =>
+ val cv = GpuColumnVector.from(cudfCol, IntegerType)
+ val batch = new ColumnarBatch(Seq(cv).toArray, 5)
+ withResource(GpuColumnVector.from(batch)) { tbl =>
+ val serializedBatch = getSerializedBatch(tbl)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val mockBufferedStreamIterator = mock[BufferedIterator[ColumnarBatch]]
+ when(mockStreamIter.hasNext).thenReturn(true)
+ when(mockStreamIter.buffered).thenReturn(mockBufferedStreamIterator)
+ when(mockBufferedStreamIterator.hasNext).thenReturn(true)
+ closeOnExcept(serializedBatch) { _ =>
+ val buildIter = Seq(serializedBatch).iterator
+ val attrs = AttributeReference("a", IntegerType, false)() :: Nil
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 1024,
+ attrs,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ verify(mockBufferedStreamIterator, times(1)).hasNext
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(5)
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+ }
+ }
+
+ test("test two batches, going over the limit") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol =>
+ val cv = GpuColumnVector.from(cudfCol, IntegerType)
+ val batch = new ColumnarBatch(Seq(cv).toArray, 5)
+ withResource(GpuColumnVector.from(batch)) { tbl =>
+ val serializedBatch = getSerializedBatch(tbl)
+ val serializedBatch2 = getSerializedBatch(tbl)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val mockBufferedStreamIterator = mock[BufferedIterator[ColumnarBatch]]
+ when(mockStreamIter.hasNext).thenReturn(true)
+ when(mockStreamIter.buffered).thenReturn(mockBufferedStreamIterator)
+ when(mockBufferedStreamIterator.hasNext).thenReturn(true)
+ closeOnExcept(serializedBatch) { _ =>
+ closeOnExcept(serializedBatch2) { _ =>
+ val buildIter = Seq(serializedBatch, serializedBatch2).iterator
+ val attrs = AttributeReference("a", IntegerType, false)() :: Nil
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 1,
+ attrs,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ verify(mockBufferedStreamIterator, times(0)).hasNext
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(10)
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ test("test two batches, stating within the limit") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol =>
+ val cv = GpuColumnVector.from(cudfCol, IntegerType)
+ val batch = new ColumnarBatch(Seq(cv).toArray, 5)
+ withResource(GpuColumnVector.from(batch)) { tbl =>
+ val serializedBatch = getSerializedBatch(tbl)
+ val serializedBatch2 = getSerializedBatch(tbl)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val mockBufferedStreamIterator = mock[BufferedIterator[ColumnarBatch]]
+ when(mockStreamIter.hasNext).thenReturn(true)
+ when(mockStreamIter.buffered).thenReturn(mockBufferedStreamIterator)
+ when(mockBufferedStreamIterator.hasNext).thenReturn(true)
+ closeOnExcept(serializedBatch) { _ =>
+ closeOnExcept(serializedBatch2) { _ =>
+ val buildIter = Seq(serializedBatch, serializedBatch2).iterator
+ val attrs = AttributeReference("a", IntegerType, false)() :: Nil
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 1024,
+ attrs,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ verify(mockBufferedStreamIterator, times(1)).hasNext
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(10)
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
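The new suite above leans on Mockito's consecutive stubbing, `thenReturn(true, false)`, to make a mocked iterator report one batch and then run dry. A small standalone illustration of that idiom:

```scala
import org.mockito.Mockito._

// thenReturn(a, b) stubs consecutive calls: the first call returns a,
// every later call returns b
val iter = mock(classOf[Iterator[Int]])
when(iter.hasNext).thenReturn(true, false)

assert(iter.hasNext)   // first call: true
assert(!iter.hasNext)  // second and later calls: false
```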
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala
index 997409412fb..a94affbf08d 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -106,6 +106,9 @@ class OrcScanSuite extends SparkQueryCompareTestSuite {
* is actually 1582-09-23 in proleptic Gregorian calendar.
*/
test("test hybrid Julian Gregorian calendar vs proleptic Gregorian calendar") {
+ // Since Spark 3.1.1, ORC fails to prune when converting the hybrid calendar to the proleptic calendar
+ // Orc bug: https://issues.apache.org/jira/browse/ORC-1083
+ assumePriorToSpark311
withCpuSparkSession(spark => {
val df = frameFromOrcWithSchema("hybrid-Julian-calendar.orc",
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala
index 2d31835cf55..389fe7800af 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,6 +21,24 @@ import org.scalatest.FunSuite
class RegularExpressionParserSuite extends FunSuite {
+ test("detect regexp strings") {
+ // Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
+ val strings: Seq[String] = Seq("\\", "\u0000", "\\x00", "\\.",
+ "\f", "\\a", "\\e", "\\cx", "[abc]", "^", "[a-z&&[def]]", ".", "*", "\\d", "\\D",
+ "\\h", "\\H", "\\s", "\\S", "\\v", "\\V", "\\w", "\\w", "\\p", "$", "\\b", "\\B",
+ "\\A", "\\G", "\\Z", "\\z", "\\R", "?", "|", "(abc)", "a{1,}", "\\k", "\\Q", "\\E")
+ for (string <- strings) {
+ assert(RegexParser.isRegExpString(string))
+ }
+ }
+
+ test("detect non-regexp strings") {
+ val strings = Seq("A", ",", "\t", ":", "")
+ for (string <- strings) {
+ assert(!RegexParser.isRegExpString(string))
+ }
+ }
+
test("empty pattern") {
assert(parse("") === RegexSequence(ListBuffer()))
}
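`RegexParser.isRegExpString` is exercised above with patterns drawn from the Java `Pattern` documentation. Purely as an illustration of the idea (the plugin's real check is more thorough, e.g. it also flags control characters such as `\u0000` and `\f`), a naive metacharacter scan could look like:

```scala
// Naive illustration only, not the plugin's implementation: treat a string as
// "regexp-like" if it contains any common regular-expression metacharacter.
def looksLikeRegExp(s: String): Boolean =
  s.exists(c => "\\^$.|?*+()[]{}".indexOf(c) >= 0)

assert(looksLikeRegExp("[abc]"))
assert(looksLikeRegExp("a{1,}"))
assert(!looksLikeRegExp("A"))
assert(!looksLikeRegExp(","))
```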
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
index 1f84b04ad77..b8357c9db15 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1835,6 +1835,9 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
def assumeSpark320orLater: Assertion =
assume(VersionUtils.isSpark320OrLater, "Spark version not 3.2.0+")
+ def assumePriorToSpark311: Assertion =
+ assume(!VersionUtils.isSpark311OrLater, "Spark version not before 3.1.1")
+
def cmpSparkVersion(major: Int, minor: Int, bugfix: Int): Int = {
val sparkShimVersion = ShimLoader.getSparkShims.getSparkShimVersion
val (sparkMajor, sparkMinor, sparkBugfix) = sparkShimVersion match {
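The new `assumePriorToSpark311` helper wraps ScalaTest's `assume`, which cancels (rather than fails) a test when its condition does not hold. A hedged standalone sketch of the same pattern; `sparkIs311OrLater` is a hypothetical stand-in for `VersionUtils.isSpark311OrLater`:

```scala
import org.scalatest.FunSuite

class HybridCalendarSuite extends FunSuite {
  // hypothetical stand-in for VersionUtils.isSpark311OrLater
  val sparkIs311OrLater: Boolean = false

  test("hybrid Julian Gregorian calendar vs proleptic Gregorian calendar") {
    // cancels the test on Spark 3.1.1+ where ORC fails to prune during
    // calendar conversion (ORC-1083)
    assume(!sparkIs311OrLater, "Spark version not before 3.1.1")
    // ... exercise the ORC read as in OrcScanSuite ...
  }
}
```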
diff --git a/tools/pom.xml b/tools/pom.xml
index d8fffb17ccb..25fe91d7f5f 100644
--- a/tools/pom.xml
+++ b/tools/pom.xml
@@ -40,6 +40,11 @@
+        <dependency>
+            <groupId>com.nvidia</groupId>
+            <artifactId>rapids-4-spark-common_${scala.binary.version}</artifactId>
+            <version>${project.version}</version>
+        </dependency>
         <dependency>
             <groupId>org.scala-lang</groupId>
             <artifactId>scala-library</artifactId>
@@ -100,18 +105,14 @@
                                     <include>org.rogach:scallop_${scala.binary.version}</include>
+                                    <include>com.nvidia:rapids-4-spark-common_${scala.binary.version}</include>
                                 </includes>
                             </artifactSet>
                             <filters>
-                                <filter>
-                                    <artifact>org.rogach:scallop_${scala.binary.version}:*</artifact>
-                                    <excludes>
-                                        <exclude>META-INF/*.MF</exclude>
-                                    </excludes>
-                                </filter>
                                 <filter>
                                     <artifact>*:*</artifact>
                                     <excludes>
+                                        <exclude>META-INF/*.MF</exclude>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
                                     </excludes>
diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
index c8ef1f7cd9d..137e35a8f7b 100644
--- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
+++ b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@ import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, HashMap}
import scala.util.control.NonFatal
-import com.google.common.util.concurrent.ThreadFactoryBuilder
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.tool.{EventLogInfo, EventLogPathProcessor}
import org.apache.hadoop.conf.Configuration
diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
index 33579a98b3d..a895d28fe69 100644
--- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
+++ b/tools/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@ import java.util.concurrent.{ConcurrentLinkedQueue, Executors, ThreadPoolExecuto
import scala.collection.JavaConverters._
-import com.google.common.util.concurrent.ThreadFactoryBuilder
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.tool.EventLogInfo
import org.apache.hadoop.conf.Configuration
diff --git a/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppFilterImpl.scala b/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppFilterImpl.scala
index 2b95e7639cd..5ebae2a075b 100644
--- a/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppFilterImpl.scala
+++ b/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppFilterImpl.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@ import java.util.regex.PatternSyntaxException
import scala.collection.JavaConverters._
-import com.google.common.util.concurrent.ThreadFactoryBuilder
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.tool.EventLogInfo
import com.nvidia.spark.rapids.tool.qualification.QualificationArgs
import org.apache.hadoop.conf.Configuration
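The tools module now uses an in-repo `ThreadFactoryBuilder` instead of Guava's, removing the need to pull the Guava class into the shaded jar. Purely as a sketch of the idea (the actual `com.nvidia.spark.rapids.ThreadFactoryBuilder` may differ), a minimal builder that names its threads and marks them daemon could look like this:

```scala
import java.util.concurrent.{Executors, ThreadFactory}
import java.util.concurrent.atomic.AtomicLong

// Hypothetical minimal builder, for illustration only
class SimpleThreadFactoryBuilder {
  private var nameFormat = "worker-%d"
  private var daemon = true

  def setNameFormat(format: String): this.type = { nameFormat = format; this }
  def setDaemon(d: Boolean): this.type = { daemon = d; this }

  def build(): ThreadFactory = new ThreadFactory {
    private val counter = new AtomicLong(0)
    override def newThread(r: Runnable): Thread = {
      // name each thread from the format string plus an incrementing counter
      val t = new Thread(r, nameFormat.format(counter.getAndIncrement()))
      t.setDaemon(daemon)
      t
    }
  }
}

// usage, mirroring how the profiling/qualification thread pools are built
val factory = new SimpleThreadFactoryBuilder()
  .setNameFormat("profileTool-%d")
  .setDaemon(true)
  .build()
val pool = Executors.newFixedThreadPool(4, factory)
```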
diff --git a/udf-examples/pom.xml b/udf-examples/pom.xml
index e5bc938f9f7..f44cce94e81 100644
--- a/udf-examples/pom.xml
+++ b/udf-examples/pom.xml
@@ -142,6 +142,12 @@
             <version>${spark.version}</version>
             <scope>provided</scope>
         </dependency>
+        <dependency>
+            <groupId>org.apache.hive</groupId>
+            <artifactId>hive-storage-api</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>