diff --git a/CHANGELOG.md b/CHANGELOG.md
index 25c4d2a7d30..fa757ea2108 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,11 +1,306 @@
# Change log
-Generated on 2021-12-07
+Generated on 2022-02-07
+
+## Release 22.02
+
+### Features
+|||
+|:---|:---|
+|[#4305](https://github.com/NVIDIA/spark-rapids/issues/4305)|[FEA] write nvidia tool wrappers to allow old YARN versions to work with MIG|
+|[#4360](https://github.com/NVIDIA/spark-rapids/issues/4360)|[FEA] Add explain api for Spark 2.X|
+|[#3541](https://github.com/NVIDIA/spark-rapids/issues/3541)|[FEA] Support max on single-level struct in aggregation context|
+|[#4238](https://github.com/NVIDIA/spark-rapids/issues/4238)|[FEA] Add a Spark 3.X Explain only mode to the plugin|
+|[#3952](https://github.com/NVIDIA/spark-rapids/issues/3952)|[Audit] [FEA][SPARK-32986][SQL] Add bucketed scan info in query plan of data source v1|
+|[#4412](https://github.com/NVIDIA/spark-rapids/issues/4412)|[FEA] Improve support for \A, \Z, and \z in regular expressions|
+|[#3979](https://github.com/NVIDIA/spark-rapids/issues/3979)|[FEA] Improvements for CPU(Row) based UDF|
+|[#4467](https://github.com/NVIDIA/spark-rapids/issues/4467)|[FEA] Add support for regular expression with repeated digits (`\d+`, `\d*`, `\d?`)|
+|[#4439](https://github.com/NVIDIA/spark-rapids/issues/4439)|[FEA] Enable GPU broadcast exchange reuse for DPP when AQE enabled|
+|[#3512](https://github.com/NVIDIA/spark-rapids/issues/3512)|[FEA] Support org.apache.spark.sql.catalyst.expressions.Sequence|
+|[#3475](https://github.com/NVIDIA/spark-rapids/issues/3475)|[FEA] Spark 3.2.0 reads Parquet unsigned int64(UINT64) as Decimal(20,0) but CUDF does not support it |
+|[#4091](https://github.com/NVIDIA/spark-rapids/issues/4091)|[FEA] regexp_replace: Improve support for ^ and $|
+|[#4104](https://github.com/NVIDIA/spark-rapids/issues/4104)|[FEA] Support org.apache.spark.sql.catalyst.expressions.ReplicateRows|
+|[#4027](https://github.com/NVIDIA/spark-rapids/issues/4027)|[FEA] Support SubqueryBroadcast on GPU to enable exchange reuse during DPP|
+|[#4284](https://github.com/NVIDIA/spark-rapids/issues/4284)|[FEA] Support idx = 0 in GpuRegExpExtract|
+|[#4002](https://github.com/NVIDIA/spark-rapids/issues/4002)|[FEA] Implement regexp_extract on GPU|
+|[#3221](https://github.com/NVIDIA/spark-rapids/issues/3221)|[FEA] Support GpuFirst and GpuLast on nested types under reduction aggregations|
+|[#3944](https://github.com/NVIDIA/spark-rapids/issues/3944)|[FEA] Full support for sum with overflow on Decimal 128|
+|[#4028](https://github.com/NVIDIA/spark-rapids/issues/4028)|[FEA] support GpuCast from non-nested ArrayType to StringType|
+|[#3250](https://github.com/NVIDIA/spark-rapids/issues/3250)|[FEA] Make CreateMap duplicate key handling compatible with Spark and enable CreateMap by default|
+|[#4170](https://github.com/NVIDIA/spark-rapids/issues/4170)|[FEA] Make regular expression behavior with `$` and `\r` consistent with CPU|
+|[#4001](https://github.com/NVIDIA/spark-rapids/issues/4001)|[FEA] Add regexp support to regexp_replace|
+|[#3962](https://github.com/NVIDIA/spark-rapids/issues/3962)|[FEA] Support null characters in regular expressions in RLIKE|
+|[#3797](https://github.com/NVIDIA/spark-rapids/issues/3797)|[FEA] Make RLike support consistent with Apache Spark|
+
+### Performance
+|||
+|:---|:---|
+|[#4392](https://github.com/NVIDIA/spark-rapids/issues/4392)|[FEA] could the parquet scan code avoid acquiring the semaphore for an empty batch?|
+|[#679](https://github.com/NVIDIA/spark-rapids/issues/679)|[FEA] move some deserialization code out of the scope of the gpu-semaphore to increase CPU concurrency|
+|[#4350](https://github.com/NVIDIA/spark-rapids/issues/4350)|[FEA] Optimize the all-true and all-false cases in GPU `If` and `CaseWhen` |
+|[#4309](https://github.com/NVIDIA/spark-rapids/issues/4309)|[FEA] Leverage cudf conditional nested loop join to implement semi/anti hash join with condition|
+|[#4395](https://github.com/NVIDIA/spark-rapids/issues/4395)|[FEA] acquire the semaphore after concatToHost in GpuShuffleCoalesceIterator|
+|[#4134](https://github.com/NVIDIA/spark-rapids/issues/4134)|[FEA] Allow `EliminateJoinToEmptyRelation` in `GpuBroadcastExchangeExec` |
+|[#4189](https://github.com/NVIDIA/spark-rapids/issues/4189)|[FEA] understand why between is so expensive|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#4675](https://github.com/NVIDIA/spark-rapids/issues/4675)|[BUG] Jenkins integration build timed out at 10 hours|
+|[#4665](https://github.com/NVIDIA/spark-rapids/issues/4665)|[BUG] Spark321Shims.getParquetFilters failed with NoSuchMethodError|
+|[#4635](https://github.com/NVIDIA/spark-rapids/issues/4635)|[BUG] nvidia-smi wrapper script ignores ENABLE_NON_MIG_GPUS=1 on a heterogeneous multi-GPU machine|
+|[#4500](https://github.com/NVIDIA/spark-rapids/issues/4500)|[BUG] Build failures against Spark 3.2.1 rc1 and make 3.2.1 non snapshot|
+|[#4631](https://github.com/NVIDIA/spark-rapids/issues/4631)|[BUG] Release build with mvn option `-P source-javadoc` FAILED|
+|[#4625](https://github.com/NVIDIA/spark-rapids/issues/4625)|[BUG] NDS query 5 fails with AdaptiveSparkPlanExec assertion|
+|[#4632](https://github.com/NVIDIA/spark-rapids/issues/4632)|[BUG] Build failing for Spark 3.3.0 due to deprecated method warnings|
+|[#4599](https://github.com/NVIDIA/spark-rapids/issues/4599)|[BUG] test_group_apply_udf and test_group_apply_udf_more_types hangs on Databricks 9.1|
+|[#4600](https://github.com/NVIDIA/spark-rapids/issues/4600)|[BUG] crash if we have a decimal128 in a struct in an array |
+|[#4581](https://github.com/NVIDIA/spark-rapids/issues/4581)|[BUG] Build error "GpuOverrides.scala:924: wrong number of arguments" on DB9.1.x spark-3.1.2 |
+|[#4593](https://github.com/NVIDIA/spark-rapids/issues/4593)|[BUG] dup GpuHashJoin.diff case-folding issue|
+|[#4503](https://github.com/NVIDIA/spark-rapids/issues/4503)|[BUG] regexp_replace with back references produces incorrect results on GPU|
+|[#4567](https://github.com/NVIDIA/spark-rapids/issues/4567)|[BUG] Profile tool hangs in compare mode|
+|[#4315](https://github.com/NVIDIA/spark-rapids/issues/4315)|[BUG] test_hash_reduction_decimal_overflow_sum[30] failed OOM in integration tests|
+|[#4551](https://github.com/NVIDIA/spark-rapids/issues/4551)|[BUG] protobuf-java version changed to 3.x|
+|[#4499](https://github.com/NVIDIA/spark-rapids/issues/4499)|[BUG] GpuSequence blows up when nulls exist in any of the inputs (start, stop, step)|
+|[#4454](https://github.com/NVIDIA/spark-rapids/issues/4454)|[BUG] Shade warnings when building the tools artifact|
+|[#4541](https://github.com/NVIDIA/spark-rapids/issues/4541)|[BUG] Column vector leak in conditionals_test.py|
+|[#4514](https://github.com/NVIDIA/spark-rapids/issues/4514)|[BUG] test_hash_reduction_pivot_without_nans failed|
+|[#4521](https://github.com/NVIDIA/spark-rapids/issues/4521)|[BUG] Inconsistencies in handling of newline characters and string and line anchors|
+|[#4548](https://github.com/NVIDIA/spark-rapids/issues/4548)|[BUG] ai.rapids.cudf.CudaException: an illegal instruction was encountered in databricks 9.1|
+|[#4475](https://github.com/NVIDIA/spark-rapids/issues/4475)|[BUG] `\D` and `\W` match newline in Spark but not in cuDF|
+|[#4524](https://github.com/NVIDIA/spark-rapids/issues/4524)|[BUG] RegExp transpiler fails to detect some choice expressions that cuDF cannot compile|
+|[#3226](https://github.com/NVIDIA/spark-rapids/issues/3226)|[BUG] OOM happened when doing cube operations|
+|[#2504](https://github.com/NVIDIA/spark-rapids/issues/2504)|[BUG] OOM when running NDS queries with UCX and GDS|
+|[#4273](https://github.com/NVIDIA/spark-rapids/issues/4273)|[BUG] Rounding past the size that can be stored in a type produces incorrect results|
+|[#4060](https://github.com/NVIDIA/spark-rapids/issues/4060)|[BUG] test_hash_groupby_approx_percentile_long_repeated_keys failed intermittently|
+|[#4039](https://github.com/NVIDIA/spark-rapids/issues/4039)|[BUG] Spark 3.3.0 IT Array test failures|
+|[#3849](https://github.com/NVIDIA/spark-rapids/issues/3849)|[BUG] In ANSI mode we can fail in cases Spark would not due to conditionals|
+|[#4421](https://github.com/NVIDIA/spark-rapids/issues/4421)|[BUG] the driver is trying to load CUDA with latest 22.02 |
+|[#4455](https://github.com/NVIDIA/spark-rapids/issues/4455)|[BUG] join_test.py::test_struct_self_join[IGNORE_ORDER({'local': True})] failed in spark330|
+|[#4442](https://github.com/NVIDIA/spark-rapids/issues/4442)|[BUG] mvn build FAILED with option `-P noSnapshotsWithDatabricks`|
+|[#4281](https://github.com/NVIDIA/spark-rapids/issues/4281)|[BUG] q9 regression between 21.10 and 21.12|
+|[#4280](https://github.com/NVIDIA/spark-rapids/issues/4280)|[BUG] q88 regression between 21.10 and 21.12|
+|[#4422](https://github.com/NVIDIA/spark-rapids/issues/4422)|[BUG] Host column vectors are being leaked during tests|
+|[#4446](https://github.com/NVIDIA/spark-rapids/issues/4446)|[BUG] GpuCast crashes when casting from Array with unsupportable child type|
+|[#4432](https://github.com/NVIDIA/spark-rapids/issues/4432)|[BUG] nightly build 3.3.0 failed: HashClusteredDistribution is not a member of org.apache.spark.sql.catalyst.plans.physical|
+|[#4443](https://github.com/NVIDIA/spark-rapids/issues/4443)|[BUG] SPARK-37705 breaks parquet filters from Spark 3.3.0 and Spark 3.2.2 onwards|
+|[#4316](https://github.com/NVIDIA/spark-rapids/issues/4316)|[BUG] Exception: Unable to find py4j, your SPARK_HOME may not be configured correctly intermittently|
+|[#4378](https://github.com/NVIDIA/spark-rapids/issues/4378)|[BUG] udf_test udf_cudf_test failed require_minimum_pandas_version check in spark 320+|
+|[#4423](https://github.com/NVIDIA/spark-rapids/issues/4423)|[BUG] Build is failing due to FileScanRDD changes in Spark 3.3.0-SNAPSHOT|
+|[#4401](https://github.com/NVIDIA/spark-rapids/issues/4401)|[BUG] array_test.py::test_array_contains failures|
+|[#4403](https://github.com/NVIDIA/spark-rapids/issues/4403)|[BUG] NDS query 72 logs codegen fallback exception and produces incorrect results|
+|[#4386](https://github.com/NVIDIA/spark-rapids/issues/4386)|[BUG] conditionals_test.py FAILED with side_effects_cast[Integer/Long] on Databricks 9.1 Runtime|
+|[#3934](https://github.com/NVIDIA/spark-rapids/issues/3934)|[BUG] Dependencies of published integration tests jar are missing|
+|[#4341](https://github.com/NVIDIA/spark-rapids/issues/4341)|[BUG] GpuCast.scala:nnn warning: discarding unmoored doc comment|
+|[#4356](https://github.com/NVIDIA/spark-rapids/issues/4356)|[BUG] nightly spark303 deploy pulling spark301 aggregator|
+|[#4347](https://github.com/NVIDIA/spark-rapids/issues/4347)|[BUG] Dist jar pom lists aggregator jar as dependency|
+|[#4176](https://github.com/NVIDIA/spark-rapids/issues/4176)|[BUG] ParseDateTimeSuite UT failed|
+|[#4292](https://github.com/NVIDIA/spark-rapids/issues/4292)|[BUG] no meaningful message is surfaced to maven when binary-dedupe fails|
+|[#4351](https://github.com/NVIDIA/spark-rapids/issues/4351)|[BUG] Tests FAILED On SPARK-3.2.0, com.nvidia.spark.rapids.SerializedTableColumn cannot be cast to com.nvidia.spark.rapids.GpuColumnVector|
+|[#4346](https://github.com/NVIDIA/spark-rapids/issues/4346)|[BUG] q73 decimal was twice as slow in weekly results|
+|[#4334](https://github.com/NVIDIA/spark-rapids/issues/4334)|[BUG] GpuColumnarToRowExec will always be tagged False for exportColumnarRdd after Spark311 |
+|[#4339](https://github.com/NVIDIA/spark-rapids/issues/4339)|The parameter `dataType` is not necessary in `resolveColumnVector` method.|
+|[#4275](https://github.com/NVIDIA/spark-rapids/issues/4275)|[BUG] Row-based Hive UDF will fail if arguments contain a foldable expression.|
+|[#4229](https://github.com/NVIDIA/spark-rapids/issues/4229)|[BUG] regexp_replace `[^a]` has different behavior between CPU and GPU for multiline strings|
+|[#4294](https://github.com/NVIDIA/spark-rapids/issues/4294)|[BUG] parquet_write_test.py::test_ts_write_fails_datetime_exception failed in spark 3.1.1 and 3.1.2|
+|[#4205](https://github.com/NVIDIA/spark-rapids/issues/4205)|[BUG] Get different results when casting from timestamp to string|
+|[#4277](https://github.com/NVIDIA/spark-rapids/issues/4277)|[BUG] cudf_udf nightly cudf import rmm failed|
+|[#4246](https://github.com/NVIDIA/spark-rapids/issues/4246)|[BUG] Regression in CastOpSuite due to cuDF change in parsing NaN|
+|[#4243](https://github.com/NVIDIA/spark-rapids/issues/4243)|[BUG] test_regexp_replace_null_pattern_fallback[ALLOW_NON_GPU(ProjectExec,RegExpReplace)] failed in databricks|
+|[#4244](https://github.com/NVIDIA/spark-rapids/issues/4244)|[BUG] Cast from string to float using hand-picked values failed|
+|[#4227](https://github.com/NVIDIA/spark-rapids/issues/4227)|[BUG] RAPIDS Shuffle Manager doesn't fallback given encryption settings|
+|[#3374](https://github.com/NVIDIA/spark-rapids/issues/3374)|[BUG] minor deprecation warnings in a 3.2 shim build|
+|[#3613](https://github.com/NVIDIA/spark-rapids/issues/3613)|[BUG] release312db profile pulls in 311until320-apache|
+|[#4213](https://github.com/NVIDIA/spark-rapids/issues/4213)|[BUG] unused method with a misleading outdated comment in ShimLoader |
+|[#3609](https://github.com/NVIDIA/spark-rapids/issues/3609)|[BUG] GpuShuffleExchangeExec in v2 shims has inconsistent packaging|
+|[#4127](https://github.com/NVIDIA/spark-rapids/issues/4127)|[BUG] CUDF 22.02 nightly test failure|
+
+### PRs
+|||
+|:---|:---|
+|[#4700](https://github.com/NVIDIA/spark-rapids/pull/4700)|Update cudfjni version to released 22.02.0|
+|[#4701](https://github.com/NVIDIA/spark-rapids/pull/4701)|Decrease nightly tests upper limit to 7 [skip ci]|
+|[#4639](https://github.com/NVIDIA/spark-rapids/pull/4639)|Update changelog for 22.02 and archive info of some older releases [skip ci]|
+|[#4572](https://github.com/NVIDIA/spark-rapids/pull/4572)|Add download page for 22.02 [skip ci]|
+|[#4672](https://github.com/NVIDIA/spark-rapids/pull/4672)|Revert "Disable 311cdh build due to missing dependency (#4659)"|
+|[#4662](https://github.com/NVIDIA/spark-rapids/pull/4662)|Update the deploy script [skip ci]|
+|[#4657](https://github.com/NVIDIA/spark-rapids/pull/4657)|Upmerge spark2 directory to the latest 22.02 changes|
+|[#4659](https://github.com/NVIDIA/spark-rapids/pull/4659)|Disable 311cdh build by default because of a missing dependency|
+|[#4508](https://github.com/NVIDIA/spark-rapids/pull/4508)|Fix Spark 3.2.1 build failures and make it non-snapshot|
+|[#4652](https://github.com/NVIDIA/spark-rapids/pull/4652)|Remove non-deterministic test order in nightly [skip ci]|
+|[#4643](https://github.com/NVIDIA/spark-rapids/pull/4643)|Add profile release301 when mvn help:evaluate|
+|[#4630](https://github.com/NVIDIA/spark-rapids/pull/4630)|Fix the incomplete capture of SubqueryBroadcast |
+|[#4633](https://github.com/NVIDIA/spark-rapids/pull/4633)|Suppress newTaskTempFile method warnings for Spark 3.3.0 build|
+|[#4618](https://github.com/NVIDIA/spark-rapids/pull/4618)|[DB31x] Pick the correct Python runner for flatmap-group Pandas UDF|
+|[#4622](https://github.com/NVIDIA/spark-rapids/pull/4622)|Fallback to CPU when encoding is not supported for JSON reader|
+|[#4470](https://github.com/NVIDIA/spark-rapids/pull/4470)|Add in HashPartitioning support for decimal 128|
+|[#4535](https://github.com/NVIDIA/spark-rapids/pull/4535)|Revert "Disable orc write by default because of https://issues.apache.org/jira/browse/ORC-1075 (#4471)"|
+|[#4583](https://github.com/NVIDIA/spark-rapids/pull/4583)|Avoid unapply on PromotePrecision|
+|[#4573](https://github.com/NVIDIA/spark-rapids/pull/4573)|Correct version from 21.12 to 22.02 [skip ci]|
+|[#4575](https://github.com/NVIDIA/spark-rapids/pull/4575)|Correct and update links in UDF doc [skip ci]|
+|[#4501](https://github.com/NVIDIA/spark-rapids/pull/4501)|Switch and/or to use new cudf binops to improve performance|
+|[#4594](https://github.com/NVIDIA/spark-rapids/pull/4594)|Resolve case-folding issue [skip ci]|
+|[#4585](https://github.com/NVIDIA/spark-rapids/pull/4585)|Spark2 module upmerge, deploy script, and updates for Jenkins|
+|[#4589](https://github.com/NVIDIA/spark-rapids/pull/4589)|Increase premerge databricks IDLE_TIMEOUT to 4 hours [skip ci]|
+|[#4485](https://github.com/NVIDIA/spark-rapids/pull/4485)|Add json reader support|
+|[#4556](https://github.com/NVIDIA/spark-rapids/pull/4556)|regexp_replace with back-references should fall back to CPU|
+|[#4569](https://github.com/NVIDIA/spark-rapids/pull/4569)|Fix infinite loop with Profiling tool compare mode and app with no sql ids|
+|[#4529](https://github.com/NVIDIA/spark-rapids/pull/4529)|Add support for Spark 2.x Explain Api|
+|[#4577](https://github.com/NVIDIA/spark-rapids/pull/4577)|Revert "Fix CVE-2021-22569 (#4545)"|
+|[#4520](https://github.com/NVIDIA/spark-rapids/pull/4520)|GpuSequence refactor|
+|[#4570](https://github.com/NVIDIA/spark-rapids/pull/4570)|A few quick fixes to try to reduce max memory usage in the tests|
+|[#4477](https://github.com/NVIDIA/spark-rapids/pull/4477)|Use libcudf mixed joins for conditional hash joins|
+|[#4566](https://github.com/NVIDIA/spark-rapids/pull/4566)|remove scala-library from combined tools jar|
+|[#4552](https://github.com/NVIDIA/spark-rapids/pull/4552)|Fix resource leak in GpuCaseWhen|
+|[#4553](https://github.com/NVIDIA/spark-rapids/pull/4553)|Reenable test_hash_reduction_pivot_without_nans|
+|[#4530](https://github.com/NVIDIA/spark-rapids/pull/4530)|Fix correctness issues in regexp and add `\r` and `\n` to fuzz tests|
+|[#4549](https://github.com/NVIDIA/spark-rapids/pull/4549)|Fix typos in integration tests README [skip ci]|
+|[#4545](https://github.com/NVIDIA/spark-rapids/pull/4545)|Fix CVE-2021-22569|
+|[#4543](https://github.com/NVIDIA/spark-rapids/pull/4543)|Enable auto-merge from branch-22.02 to branch-22.04 [skip ci]|
+|[#4540](https://github.com/NVIDIA/spark-rapids/pull/4540)|Remove user kuhushukla|
+|[#4434](https://github.com/NVIDIA/spark-rapids/pull/4434)|Support max on single-level struct in aggregation context|
+|[#4534](https://github.com/NVIDIA/spark-rapids/pull/4534)|Temporarily disable integration test - test_hash_reduction_pivot_without_nans|
+|[#4322](https://github.com/NVIDIA/spark-rapids/pull/4322)|Add an explain only mode to the plugin|
+|[#4497](https://github.com/NVIDIA/spark-rapids/pull/4497)|Make better use of pinned memory pool|
+|[#4512](https://github.com/NVIDIA/spark-rapids/pull/4512)|Remove hadoop version requirement [skip ci]|
+|[#4527](https://github.com/NVIDIA/spark-rapids/pull/4527)|Fall back to CPU for regular expressions containing \D or \W|
+|[#4525](https://github.com/NVIDIA/spark-rapids/pull/4525)|Properly close data writer in GpuFileFormatWriter|
+|[#4502](https://github.com/NVIDIA/spark-rapids/pull/4502)|Removed the redundant test for element_at and fixed the failing one|
+|[#4523](https://github.com/NVIDIA/spark-rapids/pull/4523)|Add more integration tests for decimal 128|
+|[#3762](https://github.com/NVIDIA/spark-rapids/pull/3762)|Call the right method to convert table from row major <=> col major|
+|[#4482](https://github.com/NVIDIA/spark-rapids/pull/4482)|Simplified the construction of zero scalar in GpuUnaryMinus|
+|[#4510](https://github.com/NVIDIA/spark-rapids/pull/4510)|Update copyright in NOTICE [skip ci]|
+|[#4484](https://github.com/NVIDIA/spark-rapids/pull/4484)|Update GpuFileFormatWriter to stay in sync with recent Spark changes, while still not supporting writing Hive bucketed tables on GPU|
+|[#4492](https://github.com/NVIDIA/spark-rapids/pull/4492)|Fall back to CPU for regular expressions containing hex digits|
+|[#4495](https://github.com/NVIDIA/spark-rapids/pull/4495)|Enable approx_percentile by default|
+|[#4420](https://github.com/NVIDIA/spark-rapids/pull/4420)|Fix up incorrect results of rounding past the max digits of data type|
+|[#4483](https://github.com/NVIDIA/spark-rapids/pull/4483)|Update test case of reading nested unsigned parquet file|
+|[#4490](https://github.com/NVIDIA/spark-rapids/pull/4490)|Remove warning about RMM default allocator|
+|[#4461](https://github.com/NVIDIA/spark-rapids/pull/4461)|[Audit] Add bucketed scan info in query plan of data source v1|
+|[#4489](https://github.com/NVIDIA/spark-rapids/pull/4489)|Add arrays of decimal128 to join tests|
+|[#4476](https://github.com/NVIDIA/spark-rapids/pull/4476)|Don't acquire the semaphore for empty input while scanning|
+|[#4424](https://github.com/NVIDIA/spark-rapids/pull/4424)|Improve support for regular expression string anchors `\A`, `\Z`, and `\z`|
+|[#4491](https://github.com/NVIDIA/spark-rapids/pull/4491)|Skip the test for spark versions 3.1.1, 3.1.2 and 3.2.0 only|
+|[#4459](https://github.com/NVIDIA/spark-rapids/pull/4459)|Use merge sort for struct types in non-key columns|
+|[#4494](https://github.com/NVIDIA/spark-rapids/pull/4494)|Append new authorized user to blossom-ci whitelist [skip ci]|
+|[#4400](https://github.com/NVIDIA/spark-rapids/pull/4400)|Enable approx percentile tests|
+|[#4471](https://github.com/NVIDIA/spark-rapids/pull/4471)|Disable orc write by default because of https://issues.apache.org/jira/browse/ORC-1075|
+|[#4462](https://github.com/NVIDIA/spark-rapids/pull/4462)|Rename DECIMAL_128_FULL and rework usage of TypeSig.gpuNumeric|
+|[#4479](https://github.com/NVIDIA/spark-rapids/pull/4479)|Change signoff check image to slim-buster [skip ci]|
+|[#4464](https://github.com/NVIDIA/spark-rapids/pull/4464)|Throw SparkArrayIndexOutOfBoundsException for Spark 3.3.0+|
+|[#4469](https://github.com/NVIDIA/spark-rapids/pull/4469)|Support repetition of \d and \D in regexp functions|
+|[#4472](https://github.com/NVIDIA/spark-rapids/pull/4472)|Modify docs for 22.02 to address issue-4319 [skip ci]|
+|[#4440](https://github.com/NVIDIA/spark-rapids/pull/4440)|Enable GPU broadcast exchange reuse for DPP when AQE enabled|
+|[#4376](https://github.com/NVIDIA/spark-rapids/pull/4376)|Add sequence support|
+|[#4460](https://github.com/NVIDIA/spark-rapids/pull/4460)|Abstract the text based PartitionReader|
+|[#4383](https://github.com/NVIDIA/spark-rapids/pull/4383)|Fix correctness issue with CASE WHEN with expressions that have side-effects|
+|[#4465](https://github.com/NVIDIA/spark-rapids/pull/4465)|Refactor for shims 320+|
+|[#4463](https://github.com/NVIDIA/spark-rapids/pull/4463)|Avoid replacing a hash join if build side is unsupported by the join type|
+|[#4456](https://github.com/NVIDIA/spark-rapids/pull/4456)|Fix build issues: 1 clean non-exists target dirs; 2 remove duplicated plugin|
+|[#4416](https://github.com/NVIDIA/spark-rapids/pull/4416)|Unshim join execs|
+|[#4172](https://github.com/NVIDIA/spark-rapids/pull/4172)|Support String to Decimal 128|
+|[#4458](https://github.com/NVIDIA/spark-rapids/pull/4458)|Exclude some metadata operators when checking GPU replacement|
+|[#4451](https://github.com/NVIDIA/spark-rapids/pull/4451)|Some metrics improvements and timeline reporting|
+|[#4435](https://github.com/NVIDIA/spark-rapids/pull/4435)|Disable add profile src execution by default to make the build log clean|
+|[#4436](https://github.com/NVIDIA/spark-rapids/pull/4436)|Print error log to stderr output|
+|[#4155](https://github.com/NVIDIA/spark-rapids/pull/4155)|Add partial support for line begin and end anchors in regexp_replace|
+|[#4428](https://github.com/NVIDIA/spark-rapids/pull/4428)|Exhaustively iterate ColumnarToRow iterator to avoid leaks|
+|[#4430](https://github.com/NVIDIA/spark-rapids/pull/4430)|Update PCA example link in ml-integration.md [skip ci]|
+|[#4452](https://github.com/NVIDIA/spark-rapids/pull/4452)|Limit parallelism of nightly tests [skip ci]|
+|[#4449](https://github.com/NVIDIA/spark-rapids/pull/4449)|Add recursive type checking and fallback tests for casting array with unsupported element types to string|
+|[#4437](https://github.com/NVIDIA/spark-rapids/pull/4437)|Change logInfo to logWarning|
+|[#4447](https://github.com/NVIDIA/spark-rapids/pull/4447)|Fix 330 build error and add 322 shims layer|
+|[#4417](https://github.com/NVIDIA/spark-rapids/pull/4417)|Fix an Intellij debug issue|
+|[#4431](https://github.com/NVIDIA/spark-rapids/pull/4431)|Add DateType support for AST expressions|
+|[#4433](https://github.com/NVIDIA/spark-rapids/pull/4433)|Import the right pandas from conda [skip ci]|
+|[#4419](https://github.com/NVIDIA/spark-rapids/pull/4419)|Import the right pandas from conda|
+|[#4427](https://github.com/NVIDIA/spark-rapids/pull/4427)|Update getFileScanRDD shim for recent changes in Spark 3.3.0|
+|[#4397](https://github.com/NVIDIA/spark-rapids/pull/4397)|Ignore cufile.log|
+|[#4388](https://github.com/NVIDIA/spark-rapids/pull/4388)|Add support for ReplicateRows|
+|[#4399](https://github.com/NVIDIA/spark-rapids/pull/4399)|Update docs for Profiling and Qualification tool to change wording|
+|[#4407](https://github.com/NVIDIA/spark-rapids/pull/4407)|Fix GpuSubqueryBroadcast on multi-fields relation|
+|[#4396](https://github.com/NVIDIA/spark-rapids/pull/4396)|GpuShuffleCoalesceIterator acquire semaphore after host concat|
+|[#4361](https://github.com/NVIDIA/spark-rapids/pull/4361)|Accommodate altered semantics of `cudf::lists::contains()`|
+|[#4394](https://github.com/NVIDIA/spark-rapids/pull/4394)|Use correct column name in GpuIf test|
+|[#4385](https://github.com/NVIDIA/spark-rapids/pull/4385)|Add missing GpuSubqueryBroadcast replacement rule for spark31x |
+|[#4387](https://github.com/NVIDIA/spark-rapids/pull/4387)|Fix auto merge conflict 4384 [skip ci]|
+|[#4374](https://github.com/NVIDIA/spark-rapids/pull/4374)|Fix the IT module depends on the tests module|
+|[#4365](https://github.com/NVIDIA/spark-rapids/pull/4365)|Not publishing integration_tests jar to Maven Central [skip ci]|
+|[#4358](https://github.com/NVIDIA/spark-rapids/pull/4358)|Update GpuIf to support expressions with side effects|
+|[#4382](https://github.com/NVIDIA/spark-rapids/pull/4382)|Remove unused scallop dependency from integration_tests|
+|[#4364](https://github.com/NVIDIA/spark-rapids/pull/4364)|Replace Scala document with Scala comment for inner functions|
+|[#4373](https://github.com/NVIDIA/spark-rapids/pull/4373)|Add pytest tags for nightly test parallel run [skip ci]|
+|[#4150](https://github.com/NVIDIA/spark-rapids/pull/4150)|Support GpuSubqueryBroadcast for DPP|
+|[#4372](https://github.com/NVIDIA/spark-rapids/pull/4372)|Move casting to string tests from array_test.py and struct_test.py to cast_test.py|
+|[#4371](https://github.com/NVIDIA/spark-rapids/pull/4371)|Fix typo in skipTestsFor330 calculation [skip ci]|
+|[#4355](https://github.com/NVIDIA/spark-rapids/pull/4355)|Dedicated deploy-file with reduced pom in nightly build [skip ci]|
+|[#4352](https://github.com/NVIDIA/spark-rapids/pull/4352)|Revert "Ignore failing string to timestamp tests temporarily (#4197)"|
+|[#4359](https://github.com/NVIDIA/spark-rapids/pull/4359)|Audit - SPARK-37268 - Remove unused variable in GpuFileScanRDD [Databricks]|
+|[#4327](https://github.com/NVIDIA/spark-rapids/pull/4327)|Print meaningful message when calling scripts in maven|
+|[#4354](https://github.com/NVIDIA/spark-rapids/pull/4354)|Fix regression in AQE optimizations|
+|[#4343](https://github.com/NVIDIA/spark-rapids/pull/4343)|Fix issue with binding to hash agg columns with computation|
+|[#4285](https://github.com/NVIDIA/spark-rapids/pull/4285)|Add support for regexp_extract on the GPU|
+|[#4349](https://github.com/NVIDIA/spark-rapids/pull/4349)|Fix PYTHONPATH in pre-merge|
+|[#4269](https://github.com/NVIDIA/spark-rapids/pull/4269)|Add an option for the nightly script to skip deploying jars [skip ci]|
+|[#4335](https://github.com/NVIDIA/spark-rapids/pull/4335)|Fix the issue of exporting Column RDD|
+|[#4336](https://github.com/NVIDIA/spark-rapids/pull/4336)|Split expensive pytest files at the test case level [skip ci]|
+|[#4328](https://github.com/NVIDIA/spark-rapids/pull/4328)|Change the explanation of why the operator will not work on GPU|
+|[#4338](https://github.com/NVIDIA/spark-rapids/pull/4338)|Use scala Int.box instead of Integer constructors |
+|[#4340](https://github.com/NVIDIA/spark-rapids/pull/4340)|Remove the unnecessary parameter `dataType` in `resolveColumnVector` method|
+|[#4256](https://github.com/NVIDIA/spark-rapids/pull/4256)|Allow returning an EmptyHashedRelation when a broadcast result is empty|
+|[#4333](https://github.com/NVIDIA/spark-rapids/pull/4333)|Add tests for writing empty tables to ORC/PARQUET|
+|[#4337](https://github.com/NVIDIA/spark-rapids/pull/4337)|Support GpuFirst and GpuLast on nested types under reduction aggregations|
+|[#4331](https://github.com/NVIDIA/spark-rapids/pull/4331)|Fix parquet options builder calls|
+|[#4310](https://github.com/NVIDIA/spark-rapids/pull/4310)|Fix typo in shim class name|
+|[#4326](https://github.com/NVIDIA/spark-rapids/pull/4326)|Fix 4315: decrease concurrentGpuTasks to avoid sum test OOM|
+|[#4266](https://github.com/NVIDIA/spark-rapids/pull/4266)|Check revisions for all shim jars while build all|
+|[#4282](https://github.com/NVIDIA/spark-rapids/pull/4282)|Use data type to create an inspector for a foldable GPU expression.|
+|[#3144](https://github.com/NVIDIA/spark-rapids/pull/3144)|Optimize AQE with Spark 3.2+ to avoid redundant transitions|
+|[#4317](https://github.com/NVIDIA/spark-rapids/pull/4317)|[BUG] Update nightly test script to dynamically set mem_fraction [skip ci]|
+|[#4206](https://github.com/NVIDIA/spark-rapids/pull/4206)|Porting GpuRowToColumnar converters to InternalColumnarRDDConverter|
+|[#4272](https://github.com/NVIDIA/spark-rapids/pull/4272)|Full support for SUM overflow detection on decimal|
+|[#4255](https://github.com/NVIDIA/spark-rapids/pull/4255)|Make regexp pattern `[^a]` consistent with Spark for multiline strings|
+|[#4306](https://github.com/NVIDIA/spark-rapids/pull/4306)|Revert commonizing the int96ParquetRebase* functions |
+|[#4299](https://github.com/NVIDIA/spark-rapids/pull/4299)|Fix auto merge conflict 4298 [skip ci]|
+|[#4159](https://github.com/NVIDIA/spark-rapids/pull/4159)|Optimize sample perf|
+|[#4235](https://github.com/NVIDIA/spark-rapids/pull/4235)|Commonize v2 shim|
+|[#4274](https://github.com/NVIDIA/spark-rapids/pull/4274)|Add tests for timestamps that overflowed before.|
+|[#4271](https://github.com/NVIDIA/spark-rapids/pull/4271)|Skip test_regexp_replace_null_pattern_fallback on Spark 3.1.1 and later|
+|[#4278](https://github.com/NVIDIA/spark-rapids/pull/4278)|Use mamba for cudf conda install [skip ci]|
+|[#4270](https://github.com/NVIDIA/spark-rapids/pull/4270)|Document exponent differences when casting floating point to string [skip ci]|
+|[#4268](https://github.com/NVIDIA/spark-rapids/pull/4268)|Fix merge conflict with branch-21.12|
+|[#4093](https://github.com/NVIDIA/spark-rapids/pull/4093)|Add tests for regexp() and regexp_like()|
+|[#4259](https://github.com/NVIDIA/spark-rapids/pull/4259)|fix regression in cast from string to float that caused signed NaN to be considered valid|
+|[#4241](https://github.com/NVIDIA/spark-rapids/pull/4241)|fix bug in parsing regex character classes that start with `^` and contain an unescaped `]`|
+|[#4224](https://github.com/NVIDIA/spark-rapids/pull/4224)|Support row-based Hive UDFs|
+|[#4221](https://github.com/NVIDIA/spark-rapids/pull/4221)|GpuCast from ArrayType to StringType|
+|[#4007](https://github.com/NVIDIA/spark-rapids/pull/4007)|Implement duplicate key handling for GpuCreateMap|
+|[#4251](https://github.com/NVIDIA/spark-rapids/pull/4251)|Skip test_regexp_replace_null_pattern_fallback on Databricks|
+|[#4247](https://github.com/NVIDIA/spark-rapids/pull/4247)|Disable failing CastOpSuite test|
+|[#4239](https://github.com/NVIDIA/spark-rapids/pull/4239)|Make EOL anchor behavior match CPU for strings ending with newline|
+|[#4153](https://github.com/NVIDIA/spark-rapids/pull/4153)|Regexp: Only transpile once per expression rather than once per batch|
+|[#4230](https://github.com/NVIDIA/spark-rapids/pull/4230)|Change to build tools module with all the versions by default|
+|[#4223](https://github.com/NVIDIA/spark-rapids/pull/4223)|Fixes a minor deprecation warning|
+|[#4215](https://github.com/NVIDIA/spark-rapids/pull/4215)|Rebalance testing load|
+|[#4214](https://github.com/NVIDIA/spark-rapids/pull/4214)|Fix pre_merge ci_2 [skip ci]|
+|[#4212](https://github.com/NVIDIA/spark-rapids/pull/4212)|Remove an unused method with its outdated comment|
+|[#4211](https://github.com/NVIDIA/spark-rapids/pull/4211)|Update test_floor_ceil_overflow to be more lenient on exception type|
+|[#4203](https://github.com/NVIDIA/spark-rapids/pull/4203)|Move all the GpuShuffleExchangeExec shim v2 classes to org.apache.spark|
+|[#4193](https://github.com/NVIDIA/spark-rapids/pull/4193)|Rename 311until320-apache to 311until320-noncdh|
+|[#4197](https://github.com/NVIDIA/spark-rapids/pull/4197)|Ignore failing string to timestamp tests temporarily|
+|[#4160](https://github.com/NVIDIA/spark-rapids/pull/4160)|Fix merge issues for branch 22.02|
+|[#4081](https://github.com/NVIDIA/spark-rapids/pull/4081)|Convert String to DecimalType without casting to FloatType|
+|[#4132](https://github.com/NVIDIA/spark-rapids/pull/4132)|Fix auto merge conflict 4131 [skip ci]|
+|[#4099](https://github.com/NVIDIA/spark-rapids/pull/4099)|[REVIEW] Init version 22.02.0|
+|[#4113](https://github.com/NVIDIA/spark-rapids/pull/4113)|Fix pre-merge CI 2 conditions [skip ci]|
+|[#4064](https://github.com/NVIDIA/spark-rapids/pull/4064)|Regex: transpile `.` to `[^\r\n]` in cuDF|
+|[#4044](https://github.com/NVIDIA/spark-rapids/pull/4044)|RLike: Fall back to CPU for regex that would produce incorrect results|
## Release 21.12
### Features
|||
|:---|:---|
+|[#1571](https://github.com/NVIDIA/spark-rapids/issues/1571)|[FEA] Better precision range for decimal multiply, and possibly others|
|[#3953](https://github.com/NVIDIA/spark-rapids/issues/3953)|[FEA] Audit: Add array support to union by name |
|[#4085](https://github.com/NVIDIA/spark-rapids/issues/4085)|[FEA] Decimal 128 Support: Concat|
|[#4073](https://github.com/NVIDIA/spark-rapids/issues/4073)|[FEA] Decimal 128 Support: MapKeys, MapValues, MapEntries|
@@ -106,6 +401,11 @@ Generated on 2021-12-07
### PRs
|||
|:---|:---|
+|[#4362](https://github.com/NVIDIA/spark-rapids/pull/4362)|Decimal128 support for Parquet|
+|[#4391](https://github.com/NVIDIA/spark-rapids/pull/4391)|Update GCP custom Dataproc image version to avoid log4j issue [skip ci]|
+|[#4379](https://github.com/NVIDIA/spark-rapids/pull/4379)|Update cudf hotfix link to v21.12.2|
+|[#4367](https://github.com/NVIDIA/spark-rapids/pull/4367)|update 21.12 branch for doc [skip ci]|
+|[#4245](https://github.com/NVIDIA/spark-rapids/pull/4245)|Update changelog 21.12 to latest [skip ci]|
|[#4258](https://github.com/NVIDIA/spark-rapids/pull/4258)|Sanitize column names in ParquetCachedBatchSerializer before writing to Parquet|
|[#4308](https://github.com/NVIDIA/spark-rapids/pull/4308)|Bump up GPU reserve memory to 640MB|
|[#4307](https://github.com/NVIDIA/spark-rapids/pull/4307)|Update Download page for 21.12 [skip ci]|
@@ -1228,1322 +1528,5 @@ Generated on 2021-12-07
|[#2402](https://github.com/NVIDIA/spark-rapids/pull/2402)|Add profiling tool|
|[#2313](https://github.com/NVIDIA/spark-rapids/pull/2313)|Supports `GpuLiteral` of array type|
-## Release 0.5
-
-### Features
-|||
-|:---|:---|
-|[#938](https://github.com/NVIDIA/spark-rapids/issues/938)|[FEA] Have hashed shuffle match spark|
-|[#1604](https://github.com/NVIDIA/spark-rapids/issues/1604)|[FEA] Support casting structs to strings |
-|[#1920](https://github.com/NVIDIA/spark-rapids/issues/1920)|[FEA] Support murmur3 hashing of structs|
-|[#2018](https://github.com/NVIDIA/spark-rapids/issues/2018)|[FEA] A way for user to find out the plugin version and cudf version in REPL|
-|[#77](https://github.com/NVIDIA/spark-rapids/issues/77)|[FEA] Support ArrayContains|
-|[#1721](https://github.com/NVIDIA/spark-rapids/issues/1721)|[FEA] build cudf jars with NVTX enabled|
-|[#1782](https://github.com/NVIDIA/spark-rapids/issues/1782)|[FEA] Shim layers to support spark versions|
-|[#1625](https://github.com/NVIDIA/spark-rapids/issues/1625)|[FEA] Support Decimal Casts to String and String to Decimal|
-|[#166](https://github.com/NVIDIA/spark-rapids/issues/166)|[FEA] Support get_json_object|
-|[#1698](https://github.com/NVIDIA/spark-rapids/issues/1698)|[FEA] Support casting structs to string|
-|[#1912](https://github.com/NVIDIA/spark-rapids/issues/1912)|[FEA] Let `Scalar Pandas UDF ` support array of struct type.|
-|[#1136](https://github.com/NVIDIA/spark-rapids/issues/1136)|[FEA] Audit: Script to list commits between different Spark versions/tags|
-|[#1921](https://github.com/NVIDIA/spark-rapids/issues/1921)|[FEA] cudf version check should be lenient on later patch version|
-|[#19](https://github.com/NVIDIA/spark-rapids/issues/19)|[FEA] Out of core sorts|
-
-### Performance
-|||
-|:---|:---|
-|[#2090](https://github.com/NVIDIA/spark-rapids/issues/2090)|[FEA] Make row count estimates available to the cost-based optimizer|
-|[#1341](https://github.com/NVIDIA/spark-rapids/issues/1341)|Optimize unnecessary columnar->row->columnar transitions with AQE|
-|[#1558](https://github.com/NVIDIA/spark-rapids/issues/1558)|[FEA] Initialize UCX early|
-|[#1633](https://github.com/NVIDIA/spark-rapids/issues/1633)|[FEA] Implement a cost-based optimizer|
-|[#1727](https://github.com/NVIDIA/spark-rapids/issues/1727)|[FEA] Put RangePartitioner data path on the GPU|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#2279](https://github.com/NVIDIA/spark-rapids/issues/2279)|[BUG] Hash Partitioning can fail for very small batches|
-|[#2314](https://github.com/NVIDIA/spark-rapids/issues/2314)|[BUG] v0.5.0 pre-release pytests join_test.py::test_hash_join_array FAILED on SPARK-EGX Yarn Cluster|
-|[#2317](https://github.com/NVIDIA/spark-rapids/issues/2317)|[BUG] GpuColumnarToRowIterator can stop after receiving an empty batch|
-|[#2244](https://github.com/NVIDIA/spark-rapids/issues/2244)|[BUG] Executors hanging when running NDS benchmarks|
-|[#2278](https://github.com/NVIDIA/spark-rapids/issues/2278)|[BUG] FullOuter join can produce too many results|
-|[#2220](https://github.com/NVIDIA/spark-rapids/issues/2220)|[BUG] csv_test.py::test_csv_fallback FAILED on the EMR Cluster|
-|[#2225](https://github.com/NVIDIA/spark-rapids/issues/2225)|[BUG] GpuSort fails on tables containing arrays.|
-|[#2232](https://github.com/NVIDIA/spark-rapids/issues/2232)|[BUG] hash_aggregate_test.py::test_hash_grpby_pivot FAILED on the Databricks Cluster|
-|[#2231](https://github.com/NVIDIA/spark-rapids/issues/2231)|[BUG]string_test.py::test_re_replace FAILED on the Dataproc Cluster|
-|[#2042](https://github.com/NVIDIA/spark-rapids/issues/2042)|[BUG] NDS q14a fails with "GpuColumnarToRow does not implement doExecuteBroadcast"|
-|[#2203](https://github.com/NVIDIA/spark-rapids/issues/2203)|[BUG] Spark nightly cache tests fail with -- master flag|
-|[#2230](https://github.com/NVIDIA/spark-rapids/issues/2230)|[BUG] qa_nightly_select_test.py::test_select FAILED on the Dataproc Cluster|
-|[#1711](https://github.com/NVIDIA/spark-rapids/issues/1711)|[BUG] find a way to stop allocating from RMM on the shuffle-client thread|
-|[#2109](https://github.com/NVIDIA/spark-rapids/issues/2109)|[BUG] Fix high priority violations detected by code analysis tools|
-|[#2217](https://github.com/NVIDIA/spark-rapids/issues/2217)|[BUG] qa_nightly_select_test failure in test_select |
-|[#2127](https://github.com/NVIDIA/spark-rapids/issues/2127)|[BUG] Parsing with two-digit year should fall back to CPU|
-|[#2078](https://github.com/NVIDIA/spark-rapids/issues/2078)|[BUG] java.lang.ArithmeticException: divide by zero when spark.sql.ansi.enabled=true|
-|[#2048](https://github.com/NVIDIA/spark-rapids/issues/2048)|[BUG] split function+ repartition result in "ai.rapids.cudf.CudaException: device-side assert triggered"|
-|[#2036](https://github.com/NVIDIA/spark-rapids/issues/2036)|[BUG] Stackoverflow when writing wide parquet files.|
-|[#1973](https://github.com/NVIDIA/spark-rapids/issues/1973)|[BUG] generate_expr_test FAILED on Dataproc Cluster|
-|[#2079](https://github.com/NVIDIA/spark-rapids/issues/2079)|[BUG] koalas.sql fails with java.lang.ArrayIndexOutOfBoundsException|
-|[#217](https://github.com/NVIDIA/spark-rapids/issues/217)|[BUG] CudaUtil should be removed|
-|[#1550](https://github.com/NVIDIA/spark-rapids/issues/1550)|[BUG] The ORC output data of a query is not readable|
-|[#2074](https://github.com/NVIDIA/spark-rapids/issues/2074)|[BUG] Intermittent NPE in RapidsBufferCatalog when running test suite|
-|[#2027](https://github.com/NVIDIA/spark-rapids/issues/2027)|[BUG] udf_cudf_test.py integration tests fail |
-|[#1899](https://github.com/NVIDIA/spark-rapids/issues/1899)|[BUG] Some queries fail when cost-based optimizations are enabled|
-|[#1914](https://github.com/NVIDIA/spark-rapids/issues/1914)|[BUG] Add in float, double, timestamp, and date support to murmur3|
-|[#2014](https://github.com/NVIDIA/spark-rapids/issues/2014)|[BUG] earlyStart option added in 0.5 can cause errors when starting UCX|
-|[#1984](https://github.com/NVIDIA/spark-rapids/issues/1984)|[BUG] NDS q58 Decimal scale (59) cannot be greater than precision (38).|
-|[#2001](https://github.com/NVIDIA/spark-rapids/issues/2001)|[BUG] RapidsShuffleManager didn't pass `dirs` to `getBlockData` from a wrapped `ShuffleBlockResolver`|
-|[#1797](https://github.com/NVIDIA/spark-rapids/issues/1797)|[BUG] occasional crashes in CI|
-|[#1861](https://github.com/NVIDIA/spark-rapids/issues/1861)|Encountered column data outside the range of input buffer|
-|[#1905](https://github.com/NVIDIA/spark-rapids/issues/1905)|[BUG] Large concat task time in GpuShuffleCoalesce with pinned memory pool|
-|[#1638](https://github.com/NVIDIA/spark-rapids/issues/1638)|[BUG] Tests `test_window_aggs_for_rows_collect_list` fails when there are null values in columns.|
-|[#1864](https://github.com/NVIDIA/spark-rapids/issues/1864)|[BUG]HostColumnarToGPU inefficient when only doing count()|
-|[#1862](https://github.com/NVIDIA/spark-rapids/issues/1862)|[BUG] spark 3.2.0-snapshot integration test failed due to conf change|
-|[#1844](https://github.com/NVIDIA/spark-rapids/issues/1844)|[BUG] branch-0.5 nightly IT FAILED on the The mortgage ETL test "Could not read footer for file: file:/xxx/xxx.snappy.parquet"|
-|[#1627](https://github.com/NVIDIA/spark-rapids/issues/1627)|[BUG] GDS exception when restoring spilled buffer|
-|[#1802](https://github.com/NVIDIA/spark-rapids/issues/1802)|[BUG] Many decimal integration test failures for 0.5|
-
-### PRs
-|||
-|:---|:---|
-|[#2326](https://github.com/NVIDIA/spark-rapids/pull/2326)|Update changelog for 0.5.0 release|
-|[#2316](https://github.com/NVIDIA/spark-rapids/pull/2316)|Update doc to note that single quoted json strings are not ok|
-|[#2319](https://github.com/NVIDIA/spark-rapids/pull/2319)|Disable hash partitioning on arrays|
-|[#2318](https://github.com/NVIDIA/spark-rapids/pull/2318)|Fix ColumnarToRowIterator handling of empty batches|
-|[#2304](https://github.com/NVIDIA/spark-rapids/pull/2304)|Update CHANGELOG.md|
-|[#2301](https://github.com/NVIDIA/spark-rapids/pull/2301)|Update doc to reflect nanosleep problem with 460.32.03|
-|[#2298](https://github.com/NVIDIA/spark-rapids/pull/2298)|Update changelog for v0.5.0 release [skip ci]|
-|[#2293](https://github.com/NVIDIA/spark-rapids/pull/2293)|update cudf version to 0.19.2|
-|[#2289](https://github.com/NVIDIA/spark-rapids/pull/2289)|Update docs to warn against 450.80.02 driver with 10.x toolkit|
-|[#2285](https://github.com/NVIDIA/spark-rapids/pull/2285)|Require single batch for full outer join streaming|
-|[#2281](https://github.com/NVIDIA/spark-rapids/pull/2281)|Remove download section for unreleased 0.4.2|
-|[#2264](https://github.com/NVIDIA/spark-rapids/pull/2264)|Add spark312 and spark320 versions of cache serializer|
-|[#2254](https://github.com/NVIDIA/spark-rapids/pull/2254)|updated gcp docs with custom dataproc image instructions|
-|[#2247](https://github.com/NVIDIA/spark-rapids/pull/2247)|Allow specifying a superclass for non-GPU execs|
-|[#2235](https://github.com/NVIDIA/spark-rapids/pull/2235)|Fix distributed cache to read requested schema |
-|[#2261](https://github.com/NVIDIA/spark-rapids/pull/2261)|Make CBO row count test more robust|
-|[#2237](https://github.com/NVIDIA/spark-rapids/pull/2237)|update cudf version to 0.19.1|
-|[#2240](https://github.com/NVIDIA/spark-rapids/pull/2240)|Get the correct 'PIPESTATUS' in bash [skip ci]|
-|[#2242](https://github.com/NVIDIA/spark-rapids/pull/2242)|Add shuffle doc section on the periodicGC configuration|
-|[#2251](https://github.com/NVIDIA/spark-rapids/pull/2251)|Fix issue when out of core sorting nested data types|
-|[#2204](https://github.com/NVIDIA/spark-rapids/pull/2204)|Run nightly tests for ParquetCachedBatchSerializer|
-|[#2245](https://github.com/NVIDIA/spark-rapids/pull/2245)|Fix pivot bug for decimalType|
-|[#2093](https://github.com/NVIDIA/spark-rapids/pull/2093)|Initial implementation of row count estimates in cost-based optimizer|
-|[#2188](https://github.com/NVIDIA/spark-rapids/pull/2188)|Support GPU broadcast exchange reuse to feed CPU BHJ when AQE is enabled|
-|[#2227](https://github.com/NVIDIA/spark-rapids/pull/2227)|ParquetCachedBatchSerializer broadcast AllConfs instead of SQLConf to fix distributed mode|
-|[#2223](https://github.com/NVIDIA/spark-rapids/pull/2223)|Adds subquery aggregate tests from SPARK-31620|
-|[#2222](https://github.com/NVIDIA/spark-rapids/pull/2222)|Remove groupId already specified in parent pom|
-|[#2209](https://github.com/NVIDIA/spark-rapids/pull/2209)|Fixed a few issues with out of core sort|
-|[#2218](https://github.com/NVIDIA/spark-rapids/pull/2218)|Fix incorrect RegExpReplace children handling on Spark 3.1+|
-|[#2207](https://github.com/NVIDIA/spark-rapids/pull/2207)|fix batch size default values in the tuning guide|
-|[#2208](https://github.com/NVIDIA/spark-rapids/pull/2208)|Revert "add nightly cache tests (#2083)"|
-|[#2206](https://github.com/NVIDIA/spark-rapids/pull/2206)|Fix shim301db build|
-|[#2192](https://github.com/NVIDIA/spark-rapids/pull/2192)|Fix index-based access to the head elements|
-|[#2210](https://github.com/NVIDIA/spark-rapids/pull/2210)|Avoid redundant collection conversions|
-|[#2190](https://github.com/NVIDIA/spark-rapids/pull/2190)|JNI fixes for StringWordCount native UDF example|
-|[#2086](https://github.com/NVIDIA/spark-rapids/pull/2086)|Updating documentation for data format support|
-|[#2172](https://github.com/NVIDIA/spark-rapids/pull/2172)|Remove easy unused symbols|
-|[#2089](https://github.com/NVIDIA/spark-rapids/pull/2089)|Update PandasUDF doc|
-|[#2195](https://github.com/NVIDIA/spark-rapids/pull/2195)|fix cudf 0.19.0 download link [skip ci]|
-|[#2175](https://github.com/NVIDIA/spark-rapids/pull/2175)|Branch 0.5 doc update|
-|[#2168](https://github.com/NVIDIA/spark-rapids/pull/2168)|Simplify GpuExpressions w/ withResourceIfAllowed|
-|[#2055](https://github.com/NVIDIA/spark-rapids/pull/2055)|Support PivotFirst|
-|[#2183](https://github.com/NVIDIA/spark-rapids/pull/2183)|GpuParquetScan#readBufferToTable remove dead code|
-|[#2129](https://github.com/NVIDIA/spark-rapids/pull/2129)|Fall back to CPU when parsing two-digit years|
-|[#2083](https://github.com/NVIDIA/spark-rapids/pull/2083)|add nightly cache tests|
-|[#2151](https://github.com/NVIDIA/spark-rapids/pull/2151)|add corresponding close call for HostMemoryOutputStream|
-|[#2169](https://github.com/NVIDIA/spark-rapids/pull/2169)|Work around bug in Spark for integration test|
-|[#2130](https://github.com/NVIDIA/spark-rapids/pull/2130)|Fix divide-by-zero in GpuAverage with ansi mode|
-|[#2149](https://github.com/NVIDIA/spark-rapids/pull/2149)|Auto generate the supported types for the file formats|
-|[#2072](https://github.com/NVIDIA/spark-rapids/pull/2072)|Disable CSV parsing by default and update tests to better show what is left|
-|[#2157](https://github.com/NVIDIA/spark-rapids/pull/2157)|fix merge conflict for 0.4.2 [skip ci]|
-|[#2144](https://github.com/NVIDIA/spark-rapids/pull/2144)|Allow array and struct types to pass thru when doing join|
-|[#2145](https://github.com/NVIDIA/spark-rapids/pull/2145)|Avoid GPU shuffle for round-robin of unsortable types|
-|[#2021](https://github.com/NVIDIA/spark-rapids/pull/2021)|Add in support for murmur3 hashing of structs|
-|[#2128](https://github.com/NVIDIA/spark-rapids/pull/2128)|Add in Partition type check support|
-|[#2116](https://github.com/NVIDIA/spark-rapids/pull/2116)|Add dynamic Spark configuration for Databricks|
-|[#2132](https://github.com/NVIDIA/spark-rapids/pull/2132)|Log plugin and cudf versions on startup|
-|[#2135](https://github.com/NVIDIA/spark-rapids/pull/2135)|Disable Spark 3.2 shim by default|
-|[#2125](https://github.com/NVIDIA/spark-rapids/pull/2125)|enable auto-merge from 0.5 to 0.6 [skip ci]|
-|[#2120](https://github.com/NVIDIA/spark-rapids/pull/2120)|Materialize Stream before serialization|
-|[#2119](https://github.com/NVIDIA/spark-rapids/pull/2119)|Add more comprehensive documentation on supported date formats|
-|[#1717](https://github.com/NVIDIA/spark-rapids/pull/1717)|Decimal32 support|
-|[#2114](https://github.com/NVIDIA/spark-rapids/pull/2114)|Modified the Download page for 0.4.1 and updated doc to point to K8s guide|
-|[#2106](https://github.com/NVIDIA/spark-rapids/pull/2106)|Fix some buffer leaks|
-|[#2097](https://github.com/NVIDIA/spark-rapids/pull/2097)|fix the bound row project empty issue in row frame|
-|[#2099](https://github.com/NVIDIA/spark-rapids/pull/2099)|Remove verbose log prints to make the build/test log clean|
-|[#2105](https://github.com/NVIDIA/spark-rapids/pull/2105)|Cleanup prior Spark sessions in tests consistently|
-|[#2104](https://github.com/NVIDIA/spark-rapids/pull/2104)| Clone apache spark source code to parse the git commit IDs|
-|[#2095](https://github.com/NVIDIA/spark-rapids/pull/2095)|fix refcount when materializing device buffer from GDS|
-|[#2100](https://github.com/NVIDIA/spark-rapids/pull/2100)|[BUG] add wget for fetching conda [skip ci]|
-|[#2096](https://github.com/NVIDIA/spark-rapids/pull/2096)|Adjust images for integration tests|
-|[#2094](https://github.com/NVIDIA/spark-rapids/pull/2094)|Changed name of parquet files for Mortgage ETL Integration test|
-|[#2035](https://github.com/NVIDIA/spark-rapids/pull/2035)|Accelerate data transfer for map Pandas UDF plan|
-|[#2050](https://github.com/NVIDIA/spark-rapids/pull/2050)|stream shuffle buffers from GDS to UCX|
-|[#2084](https://github.com/NVIDIA/spark-rapids/pull/2084)|Enable ORC write by default|
-|[#2088](https://github.com/NVIDIA/spark-rapids/pull/2088)|Upgrade ScalaTest plugin to respect JAVA_HOME|
-|[#1932](https://github.com/NVIDIA/spark-rapids/pull/1932)|Create a getting started on K8s page|
-|[#2080](https://github.com/NVIDIA/spark-rapids/pull/2080)|Improve error message after failed RMM shutdown|
-|[#2064](https://github.com/NVIDIA/spark-rapids/pull/2064)|Optimize unnecessary columnar->row->columnar transitions with AQE|
-|[#2025](https://github.com/NVIDIA/spark-rapids/pull/2025)|Update the doc for pandas udf on databricks|
-|[#2059](https://github.com/NVIDIA/spark-rapids/pull/2059)|Add the flag 'TEST_TYPE' to avoid integration tests silently skipping some test cases|
-|[#2075](https://github.com/NVIDIA/spark-rapids/pull/2075)|Remove debug println from CBO test|
-|[#2046](https://github.com/NVIDIA/spark-rapids/pull/2046)|support casting Decimal to String|
-|[#1812](https://github.com/NVIDIA/spark-rapids/pull/1812)|allow spilled buffers to be unspilled|
-|[#2061](https://github.com/NVIDIA/spark-rapids/pull/2061)|Run the pandas udf using cudf on Databricks|
-|[#1893](https://github.com/NVIDIA/spark-rapids/pull/1893)|Plug-in support for get_json_object|
-|[#2044](https://github.com/NVIDIA/spark-rapids/pull/2044)|Use partition for GPU hash partitioning|
-|[#1954](https://github.com/NVIDIA/spark-rapids/pull/1954)|Fix CBO bug where incompatible plans were produced with AQE on|
-|[#2049](https://github.com/NVIDIA/spark-rapids/pull/2049)|Remove incompatable int overflow checking|
-|[#2056](https://github.com/NVIDIA/spark-rapids/pull/2056)|Remove Spark 3.2 from premerge and nightly CI run|
-|[#1814](https://github.com/NVIDIA/spark-rapids/pull/1814)|Struct to string casting functionality|
-|[#2037](https://github.com/NVIDIA/spark-rapids/pull/2037)|Fix warnings from use of deprecated cudf methods|
-|[#2033](https://github.com/NVIDIA/spark-rapids/pull/2033)|Bump up pre-merge OS from ubuntu 16 to ubuntu 18 [skip ci]|
-|[#1883](https://github.com/NVIDIA/spark-rapids/pull/1883)|Enable sort for single-level nesting struct columns on GPU|
-|[#2016](https://github.com/NVIDIA/spark-rapids/pull/2016)|Refactor logic for parallel testing|
-|[#2022](https://github.com/NVIDIA/spark-rapids/pull/2022)|Update order by to not load native libraries when sorting|
-|[#2017](https://github.com/NVIDIA/spark-rapids/pull/2017)|Add in murmur3 support for float, double, date and timestamp|
-|[#1981](https://github.com/NVIDIA/spark-rapids/pull/1981)|Fix GpuSize|
-|[#1999](https://github.com/NVIDIA/spark-rapids/pull/1999)|support casting string to decimal|
-|[#2006](https://github.com/NVIDIA/spark-rapids/pull/2006)|Enable windowed `collect_list` by default|
-|[#2000](https://github.com/NVIDIA/spark-rapids/pull/2000)|Use Spark's HybridRowQueue to avoid MemoryConsumer API shim|
-|[#2015](https://github.com/NVIDIA/spark-rapids/pull/2015)|Fix bug where rkey buffer is getting advanced after the first handshake|
-|[#2007](https://github.com/NVIDIA/spark-rapids/pull/2007)|Fix unknown column name error when filtering ORC file with no names|
-|[#2005](https://github.com/NVIDIA/spark-rapids/pull/2005)|Update to new is_before_spark_311 function name|
-|[#1944](https://github.com/NVIDIA/spark-rapids/pull/1944)|Support running scalar pandas UDF with array type.|
-|[#1991](https://github.com/NVIDIA/spark-rapids/pull/1991)|Fixes creation of invalid DecimalType in GpuDivide.tagExprForGpu|
-|[#1958](https://github.com/NVIDIA/spark-rapids/pull/1958)|Support legacy behavior of parameterless count |
-|[#1919](https://github.com/NVIDIA/spark-rapids/pull/1919)|Add support for Structs for UnionExec|
-|[#2002](https://github.com/NVIDIA/spark-rapids/pull/2002)|Pass dirs to getBlockData for a wrapped shuffle resolver|
-|[#1983](https://github.com/NVIDIA/spark-rapids/pull/1983)|document building against different CUDA Toolkit versions|
-|[#1994](https://github.com/NVIDIA/spark-rapids/pull/1994)|Merge 0.4 to 0.5 [skip ci]|
-|[#1982](https://github.com/NVIDIA/spark-rapids/pull/1982)|Update ORC pushdown filter building to latest Spark logic|
-|[#1978](https://github.com/NVIDIA/spark-rapids/pull/1978)|Add audit script to list commits from Spark|
-|[#1976](https://github.com/NVIDIA/spark-rapids/pull/1976)|Temp fix for parquet write changes|
-|[#1970](https://github.com/NVIDIA/spark-rapids/pull/1970)|add maven profiles for supported CUDA versions|
-|[#1951](https://github.com/NVIDIA/spark-rapids/pull/1951)|Branch 0.5 doc remove numpartitions|
-|[#1967](https://github.com/NVIDIA/spark-rapids/pull/1967)|Update FAQ for Dataset API and format supported versions|
-|[#1972](https://github.com/NVIDIA/spark-rapids/pull/1972)|support GpuSize|
-|[#1966](https://github.com/NVIDIA/spark-rapids/pull/1966)|add xml report for codecov|
-|[#1955](https://github.com/NVIDIA/spark-rapids/pull/1955)|Fix typo in Arrow optimization config|
-|[#1956](https://github.com/NVIDIA/spark-rapids/pull/1956)|Fix NPE in plugin shutdown|
-|[#1930](https://github.com/NVIDIA/spark-rapids/pull/1930)|Relax cudf version check for patch-level versions|
-|[#1787](https://github.com/NVIDIA/spark-rapids/pull/1787)|support distributed file path in cloud environment|
-|[#1961](https://github.com/NVIDIA/spark-rapids/pull/1961)|change premege GPU_TYPE from secret to global env [skip ci]|
-|[#1957](https://github.com/NVIDIA/spark-rapids/pull/1957)|Update Spark 3.1.2 shim for float upcast behavior|
-|[#1889](https://github.com/NVIDIA/spark-rapids/pull/1889)|Decimal DIV changes |
-|[#1947](https://github.com/NVIDIA/spark-rapids/pull/1947)|Move doc of Pandas UDF to additional-functionality|
-|[#1938](https://github.com/NVIDIA/spark-rapids/pull/1938)|Add spark.executor.resource.gpu.amount=1 to YARN and K8s docs|
-|[#1937](https://github.com/NVIDIA/spark-rapids/pull/1937)|Fix merge conflict with branch-0.4|
-|[#1878](https://github.com/NVIDIA/spark-rapids/pull/1878)|spillable cache for GpuCartesianRDD|
-|[#1843](https://github.com/NVIDIA/spark-rapids/pull/1843)|Refactor GpuGenerateExec and Explode|
-|[#1933](https://github.com/NVIDIA/spark-rapids/pull/1933)|Split DB scripts to make them common for the build and IT pipeline|
-|[#1935](https://github.com/NVIDIA/spark-rapids/pull/1935)|Update Alias SQL quoting and float-to-timestamp casting to match Spark 3.2|
-|[#1926](https://github.com/NVIDIA/spark-rapids/pull/1926)|Consolidate RAT settings in parent pom|
-|[#1918](https://github.com/NVIDIA/spark-rapids/pull/1918)|Minor code cleanup in datetimeExpressions|
-|[#1906](https://github.com/NVIDIA/spark-rapids/pull/1906)|Remove get call on timeZoneId|
-|[#1908](https://github.com/NVIDIA/spark-rapids/pull/1908)|Remove the Scala version of Mortgage ETL tests from nightly test|
-|[#1894](https://github.com/NVIDIA/spark-rapids/pull/1894)|Modified Download Page to re-order the items and change the format of download links|
-|[#1909](https://github.com/NVIDIA/spark-rapids/pull/1909)|Avoid pinned memory for shuffle host buffers|
-|[#1891](https://github.com/NVIDIA/spark-rapids/pull/1891)|Connect UCX endpoints early during app startup|
-|[#1877](https://github.com/NVIDIA/spark-rapids/pull/1877)|remove docker build in pre-merge [skip ci]|
-|[#1830](https://github.com/NVIDIA/spark-rapids/pull/1830)|Enable the tests for collect over window.|
-|[#1882](https://github.com/NVIDIA/spark-rapids/pull/1882)|GpuArrowColumnarBatchBuilder retains references to ArrowBuf until HostToGpuCoalesceIterator puts them onto the device|
-|[#1868](https://github.com/NVIDIA/spark-rapids/pull/1868)|Increase row limit when doing count() for HostColumnarToGpu |
-|[#1855](https://github.com/NVIDIA/spark-rapids/pull/1855)|Expose row count statistics in GpuShuffleExchangeExec|
-|[#1875](https://github.com/NVIDIA/spark-rapids/pull/1875)|Fix merge conflict with branch-0.4|
-|[#1841](https://github.com/NVIDIA/spark-rapids/pull/1841)|Add in support for DateAddInterval|
-|[#1869](https://github.com/NVIDIA/spark-rapids/pull/1869)|Fix tests for Spark 3.2.0 shim|
-|[#1858](https://github.com/NVIDIA/spark-rapids/pull/1858)|fix shuffle manager doc on ucx library path|
-|[#1836](https://github.com/NVIDIA/spark-rapids/pull/1836)|Add shim for Spark 3.1.2|
-|[#1852](https://github.com/NVIDIA/spark-rapids/pull/1852)|Fix Part Suite Tests|
-|[#1616](https://github.com/NVIDIA/spark-rapids/pull/1616)|Cost-based optimizer|
-|[#1834](https://github.com/NVIDIA/spark-rapids/pull/1834)|Add shim for Spark 3.0.3|
-|[#1839](https://github.com/NVIDIA/spark-rapids/pull/1839)|Refactor join code to reduce duplicated code|
-|[#1848](https://github.com/NVIDIA/spark-rapids/pull/1848)|Fix merge conflict with branch-0.4|
-|[#1796](https://github.com/NVIDIA/spark-rapids/pull/1796)|Have most of range partitioning run on the GPU|
-|[#1845](https://github.com/NVIDIA/spark-rapids/pull/1845)|Fix failures in the mortgage ETL test|
-|[#1829](https://github.com/NVIDIA/spark-rapids/pull/1829)|Cleanup unused Jenkins files and scripts|
-|[#1704](https://github.com/NVIDIA/spark-rapids/pull/1704)|Create a shim for Spark 3.2.0 development|
-|[#1838](https://github.com/NVIDIA/spark-rapids/pull/1838)|Make databricks build.sh more convenient for dev|
-|[#1835](https://github.com/NVIDIA/spark-rapids/pull/1835)|Fix merge conflict with branch-0.4|
-|[#1808](https://github.com/NVIDIA/spark-rapids/pull/1808)|Update mortgage tests to support reading multiple dataset formats|
-|[#1822](https://github.com/NVIDIA/spark-rapids/pull/1822)|Fix conflict 0.4 to 0.5|
-|[#1807](https://github.com/NVIDIA/spark-rapids/pull/1807)|Fix merge conflict between branch-0.4 and branch-0.5|
-|[#1788](https://github.com/NVIDIA/spark-rapids/pull/1788)|Spill metrics everywhere|
-|[#1719](https://github.com/NVIDIA/spark-rapids/pull/1719)|Add in out of core sort|
-|[#1728](https://github.com/NVIDIA/spark-rapids/pull/1728)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
-|[#1689](https://github.com/NVIDIA/spark-rapids/pull/1689)|Update docs for plugin 0.5.0-SNAPSHOT and cudf 0.19-SNAPSHOT|
-|[#1682](https://github.com/NVIDIA/spark-rapids/pull/1682)|init CI/CD dependencies branch-0.5|
-
-## Release 0.4.1
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#1985](https://github.com/NVIDIA/spark-rapids/issues/1985)|[BUG] broadcast exchange can fail on 0.4|
-
-### PRs
-|||
-|:---|:---|
-|[#1995](https://github.com/NVIDIA/spark-rapids/pull/1995)|update changelog 0.4.1 [skip ci]|
-|[#1990](https://github.com/NVIDIA/spark-rapids/pull/1990)|Prepare for v0.4.1 release|
-|[#1988](https://github.com/NVIDIA/spark-rapids/pull/1988)|broadcast exchange can fail when job group set|
-
-## Release 0.4
-
-### Features
-|||
-|:---|:---|
-|[#1773](https://github.com/NVIDIA/spark-rapids/issues/1773)|[FEA] Spark 3.0.2 release support|
-|[#80](https://github.com/NVIDIA/spark-rapids/issues/80)|[FEA] Support the struct SQL function|
-|[#76](https://github.com/NVIDIA/spark-rapids/issues/76)|[FEA] Support CreateArray|
-|[#1635](https://github.com/NVIDIA/spark-rapids/issues/1635)|[FEA] RAPIDS accelerated Java UDF|
-|[#1333](https://github.com/NVIDIA/spark-rapids/issues/1333)|[FEA] Support window operations on Decimal|
-|[#1419](https://github.com/NVIDIA/spark-rapids/issues/1419)|[FEA] Support GPU accelerated UDF alternative for higher order function "aggregate" over window|
-|[#1580](https://github.com/NVIDIA/spark-rapids/issues/1580)|[FEA] Support Decimal for ParquetCachedBatchSerializer|
-|[#1600](https://github.com/NVIDIA/spark-rapids/issues/1600)|[FEA] Support ScalarSubquery|
-|[#1072](https://github.com/NVIDIA/spark-rapids/issues/1072)|[FEA] Support for a custom DataSource V2 which supplies Arrow data|
-|[#906](https://github.com/NVIDIA/spark-rapids/issues/906)|[FEA] Clarify query explanation to directly state what will run on GPU|
-|[#1335](https://github.com/NVIDIA/spark-rapids/issues/1335)|[FEA] Support CollectLimitExec for decimal|
-|[#1485](https://github.com/NVIDIA/spark-rapids/issues/1485)|[FEA] Decimal Support for Parquet Write|
-|[#1329](https://github.com/NVIDIA/spark-rapids/issues/1329)|[FEA] Decimal support for multiply, int div, add, subtract and null-safe equals|
-|[#1351](https://github.com/NVIDIA/spark-rapids/issues/1351)|[FEA] Execute UDFs that provide a RAPIDS execution path|
-|[#1330](https://github.com/NVIDIA/spark-rapids/issues/1330)|[FEA] Support Decimal Casts|
-|[#1353](https://github.com/NVIDIA/spark-rapids/issues/1353)|[FEA] Example of RAPIDS UDF using custom GPU code|
-|[#1487](https://github.com/NVIDIA/spark-rapids/issues/1487)|[FEA] Change spark 3.1.0 to 3.1.1|
-|[#1334](https://github.com/NVIDIA/spark-rapids/issues/1334)|[FEA] Add support for count aggregate on decimal|
-|[#1325](https://github.com/NVIDIA/spark-rapids/issues/1325)|[FEA] Add in join support for decimal|
-|[#1326](https://github.com/NVIDIA/spark-rapids/issues/1326)|[FEA] Add in Broadcast support for decimal values|
-|[#37](https://github.com/NVIDIA/spark-rapids/issues/37)|[FEA] round and bround SQL functions|
-|[#78](https://github.com/NVIDIA/spark-rapids/issues/78)|[FEA] Support CreateNamedStruct function|
-|[#1331](https://github.com/NVIDIA/spark-rapids/issues/1331)|[FEA] UnionExec and ExpandExec support for decimal|
-|[#1332](https://github.com/NVIDIA/spark-rapids/issues/1332)|[FEA] Support CaseWhen, Coalesce and IfElse for decimal|
-|[#937](https://github.com/NVIDIA/spark-rapids/issues/937)|[FEA] have murmur3 hash function that matches exactly with spark|
-|[#1324](https://github.com/NVIDIA/spark-rapids/issues/1324)|[FEA] Support Parquet Read of Decimal FIXED_LENGTH_BYTE_ARRAY|
-|[#1428](https://github.com/NVIDIA/spark-rapids/issues/1428)|[FEA] Add support for unary decimal operations abs, floor, ceil, unary - and unary +|
-|[#1375](https://github.com/NVIDIA/spark-rapids/issues/1375)|[FEA] Add log statement for what the concurrentGpuTasks tasks is set to on executor startup|
-|[#1352](https://github.com/NVIDIA/spark-rapids/issues/1352)|[FEA] Example of RAPIDS UDF using cudf Java APIs|
-|[#1328](https://github.com/NVIDIA/spark-rapids/issues/1328)|[FEA] Support sorting and shuffle of decimal|
-|[#1316](https://github.com/NVIDIA/spark-rapids/issues/1316)|[FEA] Support simple DECIMAL aggregates|
-
-### Performance
-|||
-|:---|:---|
-|[#1435](https://github.com/NVIDIA/spark-rapids/issues/1435)|[FEA] Improve file reading by using local file caching|
-|[#1738](https://github.com/NVIDIA/spark-rapids/issues/1738)|[FEA] Reduce regex usage in CAST string to date/timestamp|
-|[#987](https://github.com/NVIDIA/spark-rapids/issues/987)|[FEA] Optimize CAST from string to temporal types by using cuDF is_timestamp function|
-|[#1594](https://github.com/NVIDIA/spark-rapids/issues/1594)|[FEA] RAPIDS accelerated ScalaUDF|
-|[#103](https://github.com/NVIDIA/spark-rapids/issues/103)|[FEA] GPU version of TakeOrderedAndProject|
-|[#1024](https://github.com/NVIDIA/spark-rapids/issues/1024)|Cleanup RAPIDS transport calls to `receive`|
-|[#1366](https://github.com/NVIDIA/spark-rapids/issues/1366)|Seeing performance differences of multi-threaded/coalesce/perfile Parquet reader type for a single file|
-|[#1200](https://github.com/NVIDIA/spark-rapids/issues/1200)|[FEA] Accelerate the scan speed for coalescing parquet reader when reading files from multiple partitioned folders|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#1885](https://github.com/NVIDIA/spark-rapids/issues/1885)|[BUG] natural join on string key results in a data frame with spurious NULLs|
-|[#1785](https://github.com/NVIDIA/spark-rapids/issues/1785)|[BUG] Rapids pytest integration tests FAILED on Yarn cluster with unrecognized arguments: `--std_input_path=src/test/resources/`|
-|[#999](https://github.com/NVIDIA/spark-rapids/issues/999)|[BUG] test_multi_types_window_aggs_for_rows_lead_lag fails against Spark 3.1.0|
-|[#1818](https://github.com/NVIDIA/spark-rapids/issues/1818)|[BUG] unmoored doc comment warnings in GpuCast|
-|[#1817](https://github.com/NVIDIA/spark-rapids/issues/1817)|[BUG] Developer build with local modifications fails during verify phase|
-|[#1644](https://github.com/NVIDIA/spark-rapids/issues/1644)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks|
-|[#1771](https://github.com/NVIDIA/spark-rapids/issues/1771)|[BUG] Databricks AWS CI/CD failing to create cluster|
-|[#1157](https://github.com/NVIDIA/spark-rapids/issues/1157)|[BUG] Fix regression supporting to_date on GPU with Spark 3.1.0|
-|[#716](https://github.com/NVIDIA/spark-rapids/issues/716)|[BUG] Cast String to TimeStamp issues|
-|[#1117](https://github.com/NVIDIA/spark-rapids/issues/1117)|[BUG] CAST string to date returns wrong values for dates with out-of-range values|
-|[#1670](https://github.com/NVIDIA/spark-rapids/issues/1670)|[BUG] Some TPC-DS queries fail with AQE when decimal types enabled|
-|[#1730](https://github.com/NVIDIA/spark-rapids/issues/1730)|[BUG] Range Partitioning can crash when processing is in the order-by|
-|[#1726](https://github.com/NVIDIA/spark-rapids/issues/1726)|[BUG] java url decode test failing on databricks, emr, and dataproc|
-|[#1651](https://github.com/NVIDIA/spark-rapids/issues/1651)|[BUG] GDS exception when writing shuffle file|
-|[#1702](https://github.com/NVIDIA/spark-rapids/issues/1702)|[BUG] check all tests marked xfail for Spark 3.1.1|
-|[#575](https://github.com/NVIDIA/spark-rapids/issues/575)|[BUG] Spark 3.1 join_test.py::test_broadcast_join_mixed[FullOuter][IGNORE_ORDER] failed|
-|[#577](https://github.com/NVIDIA/spark-rapids/issues/577)|[BUG] Spark 3.1 log arithmetic functions fail|
-|[#1541](https://github.com/NVIDIA/spark-rapids/issues/1541)|[BUG] Tests fail in integration in distributed mode after allowing nested types through in sort and shuffle|
-|[#1626](https://github.com/NVIDIA/spark-rapids/issues/1626)|[BUG] TPC-DS-like query 77 at scale=3TB fails with maxResultSize exceeded error|
-|[#1576](https://github.com/NVIDIA/spark-rapids/issues/1576)|[BUG] loading SPARK-32639 example parquet file triggers a JVM crash |
-|[#1643](https://github.com/NVIDIA/spark-rapids/issues/1643)|[BUG] TPC-DS-Like q10, q35, and q69 - slow or hanging at leftSemiJoin|
-|[#1650](https://github.com/NVIDIA/spark-rapids/issues/1650)|[BUG] BenchmarkRunner does not include query name in JSON summary filename when running multiple queries|
-|[#1654](https://github.com/NVIDIA/spark-rapids/issues/1654)|[BUG] TPC-DS-like query 59 at scale=3TB with AQE fails with join mismatch|
-|[#1274](https://github.com/NVIDIA/spark-rapids/issues/1274)|[BUG] OutOfMemoryError - Maximum pool size exceeded while running 24 day criteo ETL Transform stage|
-|[#1497](https://github.com/NVIDIA/spark-rapids/issues/1497)|[BUG] Spark-rapids v0.3.0 pytest integration tests with UCX enabled failed on Yarn cluster|
-|[#1534](https://github.com/NVIDIA/spark-rapids/issues/1534)|[BUG] Spark 3.1.1 test failure in writing due to removal of InMemoryFileIndex.shouldFilterOut|
-|[#1155](https://github.com/NVIDIA/spark-rapids/issues/1155)|[BUG] on shutdown don't print `Socket closed` exception when shutting down UCX.scala|
-|[#1510](https://github.com/NVIDIA/spark-rapids/issues/1510)|[BUG] IllegalArgumentException during shuffle|
-|[#1513](https://github.com/NVIDIA/spark-rapids/issues/1513)|[BUG] executor not fully initialized may get calls from Spark, in the process setting the `catalog` incorrectly|
-|[#1466](https://github.com/NVIDIA/spark-rapids/issues/1466)|[BUG] Databricks build must run before the rapids nightly|
-|[#1456](https://github.com/NVIDIA/spark-rapids/issues/1456)|[BUG] Databricks 0.4 parquet integration tests fail|
-|[#1400](https://github.com/NVIDIA/spark-rapids/issues/1400)|[BUG] Regressions in spark-shell usage of benchmark utilities|
-|[#1119](https://github.com/NVIDIA/spark-rapids/issues/1119)|[BUG] inner join fails with Column size cannot be negative|
-|[#1079](https://github.com/NVIDIA/spark-rapids/issues/1079)|[BUG] The Scala UDF function cannot invoke the UDF compiler when it's passed to "explode"|
-|[#1298](https://github.com/NVIDIA/spark-rapids/issues/1298)|TPCxBB query16 failed at UnsupportedOperationException: org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainIntegerDictionary|
-|[#1271](https://github.com/NVIDIA/spark-rapids/issues/1271)|[BUG] CastOpSuite and AnsiCastOpSuite failing with ArithmeticException on Spark 3.1|
-|[#84](https://github.com/NVIDIA/spark-rapids/issues/84)|[BUG] sort does not match spark for -0.0 and 0.0|
-|[#578](https://github.com/NVIDIA/spark-rapids/issues/578)|[BUG] Spark 3.1 qa_nightly_select_test.py Full join test failures|
-|[#586](https://github.com/NVIDIA/spark-rapids/issues/586)|[BUG] Spark3.1 tpch failures|
-|[#837](https://github.com/NVIDIA/spark-rapids/issues/837)|[BUG] Distinct count of floating point values differs with regular spark|
-|[#953](https://github.com/NVIDIA/spark-rapids/issues/953)|[BUG] 3.1.0 pos_explode tests are failing|
-|[#127](https://github.com/NVIDIA/spark-rapids/issues/127)|[BUG] String CSV parsing does not respect nullValues|
-|[#1203](https://github.com/NVIDIA/spark-rapids/issues/1203)|[BUG] tpcds query 51 fails with join error on Spark 3.1.0|
-|[#750](https://github.com/NVIDIA/spark-rapids/issues/750)|[BUG] udf_cudf_test::test_with_column fails with IPC error |
-|[#1348](https://github.com/NVIDIA/spark-rapids/issues/1348)|[BUG] Host columnar decimal conversions are failing|
-|[#1270](https://github.com/NVIDIA/spark-rapids/issues/1270)|[BUG] Benchmark runner fails to produce report if benchmark fails due to an invalid query plan|
-|[#1179](https://github.com/NVIDIA/spark-rapids/issues/1179)|[BUG] SerializeConcatHostBuffersDeserializeBatch may have thread issues|
-|[#1115](https://github.com/NVIDIA/spark-rapids/issues/1115)|[BUG] Unchecked type warning in SparkQueryCompareTestSuite|
-
-### PRs
-|||
-|:---|:---|
-|[#1963](https://github.com/NVIDIA/spark-rapids/pull/1963)|Update changelog 0.4 [skip ci]|
-|[#1960](https://github.com/NVIDIA/spark-rapids/pull/1960)|Replace sonatype staging link with maven central link|
-|[#1945](https://github.com/NVIDIA/spark-rapids/pull/1945)|Update changelog 0.4 [skip ci]|
-|[#1910](https://github.com/NVIDIA/spark-rapids/pull/1910)|Make hash partitioning match CPU|
-|[#1927](https://github.com/NVIDIA/spark-rapids/pull/1927)|Change cuDF dependency to 0.18.1|
-|[#1934](https://github.com/NVIDIA/spark-rapids/pull/1934)|Update documentation to use cudf version 0.18.1|
-|[#1871](https://github.com/NVIDIA/spark-rapids/pull/1871)|Disable coalesce batch spilling to avoid cudf contiguous_split bug|
-|[#1849](https://github.com/NVIDIA/spark-rapids/pull/1849)|Update changelog for 0.4|
-|[#1744](https://github.com/NVIDIA/spark-rapids/pull/1744)|Fix NullPointerException on null partition insert|
-|[#1842](https://github.com/NVIDIA/spark-rapids/pull/1842)|Update to note support for 3.0.2|
-|[#1832](https://github.com/NVIDIA/spark-rapids/pull/1832)|Spark 3.1.1 shim no longer a snapshot shim|
-|[#1831](https://github.com/NVIDIA/spark-rapids/pull/1831)|Spark 3.0.2 shim no longer a snapshot shim|
-|[#1826](https://github.com/NVIDIA/spark-rapids/pull/1826)|Remove benchmarks|
-|[#1828](https://github.com/NVIDIA/spark-rapids/pull/1828)|Update cudf dependency to 0.18|
-|[#1813](https://github.com/NVIDIA/spark-rapids/pull/1813)|Fix LEAD/LAG failures in Spark 3.1.1|
-|[#1819](https://github.com/NVIDIA/spark-rapids/pull/1819)|Fix scaladoc warning in GpuCast|
-|[#1820](https://github.com/NVIDIA/spark-rapids/pull/1820)|[BUG] make modified check pre-merge only|
-|[#1780](https://github.com/NVIDIA/spark-rapids/pull/1780)|Remove SNAPSHOT from test and integration_test READMEs|
-|[#1809](https://github.com/NVIDIA/spark-rapids/pull/1809)|check if modified files after update_config/supported|
-|[#1804](https://github.com/NVIDIA/spark-rapids/pull/1804)|Update UCX documentation for RX_QUEUE_LEN and Docker|
-|[#1810](https://github.com/NVIDIA/spark-rapids/pull/1810)|Pandas UDF: Sort the data before computing the sum.|
-|[#1751](https://github.com/NVIDIA/spark-rapids/pull/1751)|Exclude foldable expressions from GPU if constant folding is disabled|
-|[#1798](https://github.com/NVIDIA/spark-rapids/pull/1798)|Add documentation about explain not on GPU when AQE is on|
-|[#1766](https://github.com/NVIDIA/spark-rapids/pull/1766)|Branch 0.4 release docs|
-|[#1794](https://github.com/NVIDIA/spark-rapids/pull/1794)|Build python output schema from udf expressions|
-|[#1783](https://github.com/NVIDIA/spark-rapids/pull/1783)|Fix the collect_list over window tests failures on db|
-|[#1781](https://github.com/NVIDIA/spark-rapids/pull/1781)|Better float/double cases for casting tests|
-|[#1790](https://github.com/NVIDIA/spark-rapids/pull/1790)|Record row counts in benchmark runs that call collect|
-|[#1779](https://github.com/NVIDIA/spark-rapids/pull/1779)|Add support of DateType and TimestampType for GetTimestamp expression|
-|[#1768](https://github.com/NVIDIA/spark-rapids/pull/1768)|Updating getting started Databricks docs|
-|[#1742](https://github.com/NVIDIA/spark-rapids/pull/1742)|Fix regression supporting to_date with Spark-3.1|
-|[#1775](https://github.com/NVIDIA/spark-rapids/pull/1775)|Fix ambiguous ordering for some tests|
-|[#1760](https://github.com/NVIDIA/spark-rapids/pull/1760)|Update GpuDataSourceScanExec and GpuBroadcastExchangeExec to fix audit issues|
-|[#1750](https://github.com/NVIDIA/spark-rapids/pull/1750)|Detect task failures in benchmarks|
-|[#1767](https://github.com/NVIDIA/spark-rapids/pull/1767)|Consistent Spark version for test and production|
-|[#1741](https://github.com/NVIDIA/spark-rapids/pull/1741)|Reduce regex use in CAST|
-|[#1756](https://github.com/NVIDIA/spark-rapids/pull/1756)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
-|[#1716](https://github.com/NVIDIA/spark-rapids/pull/1716)|Update RapidsShuffleManager documentation for branch 0.4|
-|[#1740](https://github.com/NVIDIA/spark-rapids/pull/1740)|Disable ORC writes until bug can be fixed|
-|[#1747](https://github.com/NVIDIA/spark-rapids/pull/1747)|Fix resource leaks in unit tests|
-|[#1725](https://github.com/NVIDIA/spark-rapids/pull/1725)|Branch 0.4 FAQ reorg|
-|[#1718](https://github.com/NVIDIA/spark-rapids/pull/1718)|CAST string to temporal type now calls isTimestamp|
-|[#1734](https://github.com/NVIDIA/spark-rapids/pull/1734)|Disable range partitioning if computation is needed|
-|[#1723](https://github.com/NVIDIA/spark-rapids/pull/1723)|Removed StructTypes support for ParquetCachedBatchSerializer as cudf doesn't support it yet|
-|[#1714](https://github.com/NVIDIA/spark-rapids/pull/1714)|Add support for RAPIDS accelerated Java UDFs|
-|[#1713](https://github.com/NVIDIA/spark-rapids/pull/1713)|Call GpuDeviceManager.shutdown when the executor plugin is shutting down|
-|[#1596](https://github.com/NVIDIA/spark-rapids/pull/1596)|Added in Decimal support to ParquetCachedBatchSerializer|
-|[#1706](https://github.com/NVIDIA/spark-rapids/pull/1706)|cleanup unused is_before_spark_310|
-|[#1685](https://github.com/NVIDIA/spark-rapids/pull/1685)|Fix CustomShuffleReader replacement when decimal types enabled|
-|[#1699](https://github.com/NVIDIA/spark-rapids/pull/1699)|Add docs about Spark 3.1 in standalone modes not needing extra class path|
-|[#1701](https://github.com/NVIDIA/spark-rapids/pull/1701)|remove xfail for orc test_input_meta for spark 3.1.0|
-|[#1703](https://github.com/NVIDIA/spark-rapids/pull/1703)|Remove xfail for spark 3.1.0 test_broadcast_join_mixed FullOuter|
-|[#1676](https://github.com/NVIDIA/spark-rapids/pull/1676)|BenchmarkRunner option to generate query plan diagrams in DOT format|
-|[#1695](https://github.com/NVIDIA/spark-rapids/pull/1695)|support alternate jar paths|
-|[#1694](https://github.com/NVIDIA/spark-rapids/pull/1694)|increase mem and limit parallelism for pre-merge|
-|[#1691](https://github.com/NVIDIA/spark-rapids/pull/1691)|add validate_execs_in_gpu_plan to pytest.ini|
-|[#1692](https://github.com/NVIDIA/spark-rapids/pull/1692)|Add the integration test resources to the test tarball|
-|[#1677](https://github.com/NVIDIA/spark-rapids/pull/1677)|When PTDS is enabled, print warning if the allocator is not ARENA|
-|[#1683](https://github.com/NVIDIA/spark-rapids/pull/1683)|update changelog to verify automerge 0.5 setup [skip ci]|
-|[#1673](https://github.com/NVIDIA/spark-rapids/pull/1673)|support auto-merge for branch 0.5 [skip ci]|
-|[#1681](https://github.com/NVIDIA/spark-rapids/pull/1681)|Xfail the collect_list tests for databricks|
-|[#1678](https://github.com/NVIDIA/spark-rapids/pull/1678)|Fix array/struct checks in Sort and HashAggregate and sorting tests in distributed mode|
-|[#1671](https://github.com/NVIDIA/spark-rapids/pull/1671)|Allow metrics to be configurable by level|
-|[#1675](https://github.com/NVIDIA/spark-rapids/pull/1675)|add run_pyspark_from_build.sh to the pytest distribution tarball|
-|[#1548](https://github.com/NVIDIA/spark-rapids/pull/1548)|Support executing collect_list on GPU with windowing.|
-|[#1593](https://github.com/NVIDIA/spark-rapids/pull/1593)|Avoid unnecessary Table instances after contiguous split|
-|[#1592](https://github.com/NVIDIA/spark-rapids/pull/1592)|Add in support for Decimal divide|
-|[#1668](https://github.com/NVIDIA/spark-rapids/pull/1668)|Implement way for python integration tests to validate Exec is in GPU plan|
-|[#1669](https://github.com/NVIDIA/spark-rapids/pull/1669)|Add FAQ entries for executor-per-GPU questions|
-|[#1661](https://github.com/NVIDIA/spark-rapids/pull/1661)|Enable Parquet test for file containing map struct key|
-|[#1664](https://github.com/NVIDIA/spark-rapids/pull/1664)|Filter nulls for left semi and left anti join to work around cudf|
-|[#1665](https://github.com/NVIDIA/spark-rapids/pull/1665)|Add better automated tests for Arrow columnar copy in HostColumnarToGpu|
-|[#1614](https://github.com/NVIDIA/spark-rapids/pull/1614)|add alluxio getting started document|
-|[#1639](https://github.com/NVIDIA/spark-rapids/pull/1639)|support GpuScalarSubquery|
-|[#1656](https://github.com/NVIDIA/spark-rapids/pull/1656)|Move UDF to Catalyst Expressions to its own document|
-|[#1663](https://github.com/NVIDIA/spark-rapids/pull/1663)|BenchmarkRunner - Include query name in JSON summary filename|
-|[#1655](https://github.com/NVIDIA/spark-rapids/pull/1655)|Fix extraneous shuffles added by AQE|
-|[#1652](https://github.com/NVIDIA/spark-rapids/pull/1652)|Fix typo in arrow optimized config name - spark.rapids.arrowCopyOptimizationEnabled|
-|[#1645](https://github.com/NVIDIA/spark-rapids/pull/1645)|Run Databricks IT with python-xdist parallel, includes test fixes and xfail|
-|[#1649](https://github.com/NVIDIA/spark-rapids/pull/1649)|Move building from source docs to contributing guide|
-|[#1637](https://github.com/NVIDIA/spark-rapids/pull/1637)|Fail DivModLike on zero divisor in ANSI mode|
-|[#1646](https://github.com/NVIDIA/spark-rapids/pull/1646)|Update links in rapids-udfs.md after moving to subfolder|
-|[#1641](https://github.com/NVIDIA/spark-rapids/pull/1641)|Xfail struct and array order by tests on Dataproc|
-|[#1565](https://github.com/NVIDIA/spark-rapids/pull/1565)|Add GPU accelerated array_contains operator|
-|[#1617](https://github.com/NVIDIA/spark-rapids/pull/1617)|Enable nightly test checks for Apache Spark|
-|[#1636](https://github.com/NVIDIA/spark-rapids/pull/1636)|RAPIDS accelerated Spark Scala UDF support|
-|[#1634](https://github.com/NVIDIA/spark-rapids/pull/1634)|Fix databricks build since Arrow code added|
-|[#1599](https://github.com/NVIDIA/spark-rapids/pull/1599)|Add division by zero tests for Spark 3.1 behavior|
-|[#1619](https://github.com/NVIDIA/spark-rapids/pull/1619)|Update GpuFileSourceScanExec to be in sync with DataSourceScanExec|
-|[#1631](https://github.com/NVIDIA/spark-rapids/pull/1631)|Explicitly add maven-jar-plugin version to improve incremental build time.|
-|[#1624](https://github.com/NVIDIA/spark-rapids/pull/1624)|Update explain format to show what will and will not run on the GPU|
-|[#1622](https://github.com/NVIDIA/spark-rapids/pull/1622)|Support faster copy for a custom DataSource V2 which supplies Arrow data|
-|[#1621](https://github.com/NVIDIA/spark-rapids/pull/1621)|Additional functionality docs|
-|[#1618](https://github.com/NVIDIA/spark-rapids/pull/1618)|update blossom-ci for security updates [skip ci]|
-|[#1562](https://github.com/NVIDIA/spark-rapids/pull/1562)|add alluxio support|
-|[#1597](https://github.com/NVIDIA/spark-rapids/pull/1597)|Documentation for Parquet serializer|
-|[#1611](https://github.com/NVIDIA/spark-rapids/pull/1611)|Add in flag for integration tests to not skip required tests|
-|[#1609](https://github.com/NVIDIA/spark-rapids/pull/1609)|Disable float round/bround by default|
-|[#1615](https://github.com/NVIDIA/spark-rapids/pull/1615)|Add in window support for average|
-|[#1610](https://github.com/NVIDIA/spark-rapids/pull/1610)|Limit length of spark app name in BenchmarkRunner|
-|[#1579](https://github.com/NVIDIA/spark-rapids/pull/1579)|Support TakeOrderedAndProject|
-|[#1581](https://github.com/NVIDIA/spark-rapids/pull/1581)|Support Decimal type for CollectLimitExec|
-|[#1591](https://github.com/NVIDIA/spark-rapids/pull/1591)|Add support for running multiple queries in BenchmarkRunner|
-|[#1595](https://github.com/NVIDIA/spark-rapids/pull/1595)|Fix Github documentation issue template|
-|[#1577](https://github.com/NVIDIA/spark-rapids/pull/1577)|rename directory from spark310 to spark311|
-|[#1578](https://github.com/NVIDIA/spark-rapids/pull/1578)|Test to track RAPIDS-side issues re SPARK-32639|
-|[#1583](https://github.com/NVIDIA/spark-rapids/pull/1583)|fix request-action issue [skip ci]|
-|[#1555](https://github.com/NVIDIA/spark-rapids/pull/1555)|Enable ANSI mode for CAST string to timestamp|
-|[#1531](https://github.com/NVIDIA/spark-rapids/pull/1531)|Decimal Support for writing Parquet|
-|[#1545](https://github.com/NVIDIA/spark-rapids/pull/1545)|Support comparing ORC data|
-|[#1570](https://github.com/NVIDIA/spark-rapids/pull/1570)|Branch 0.4 doc cleanup|
-|[#1569](https://github.com/NVIDIA/spark-rapids/pull/1569)|Add shim method shouldIgnorePath|
-|[#1564](https://github.com/NVIDIA/spark-rapids/pull/1564)|Add in support for Decimal Multiply and DIV|
-|[#1561](https://github.com/NVIDIA/spark-rapids/pull/1561)|Decimal support for add and subtract|
-|[#1560](https://github.com/NVIDIA/spark-rapids/pull/1560)|support sum in window aggregation for decimal|
-|[#1546](https://github.com/NVIDIA/spark-rapids/pull/1546)|Cleanup shutdown logging for UCX shuffle|
-|[#1551](https://github.com/NVIDIA/spark-rapids/pull/1551)|RAPIDS-accelerated Hive UDFs support all types|
-|[#1543](https://github.com/NVIDIA/spark-rapids/pull/1543)|Shuffle/transport enabled by default|
-|[#1552](https://github.com/NVIDIA/spark-rapids/pull/1552)|Disable blackduck signature check|
-|[#1540](https://github.com/NVIDIA/spark-rapids/pull/1540)|Handle ShuffleManager api calls when plugin is not fully initialized|
-|[#1547](https://github.com/NVIDIA/spark-rapids/pull/1547)|Cleanup shuffle transport receive calls|
-|[#1512](https://github.com/NVIDIA/spark-rapids/pull/1512)|Support window operations on Decimal|
-|[#1532](https://github.com/NVIDIA/spark-rapids/pull/1532)|Support casting from decimal to decimal|
-|[#1542](https://github.com/NVIDIA/spark-rapids/pull/1542)|Change the number of partitions to zero when a range is empty|
-|[#1506](https://github.com/NVIDIA/spark-rapids/pull/1506)|Add --use-decimals flag to TPC-DS ConvertFiles|
-|[#1511](https://github.com/NVIDIA/spark-rapids/pull/1511)|Remove unused Jenkinsfiles [skip ci]|
-|[#1505](https://github.com/NVIDIA/spark-rapids/pull/1505)|Add least, greatest and eqNullSafe support for DecimalType|
-|[#1484](https://github.com/NVIDIA/spark-rapids/pull/1484)|add doc for nsight systems bundled with cuda toolkit|
-|[#1478](https://github.com/NVIDIA/spark-rapids/pull/1478)|Documentation for RAPIDS-accelerated Hive UDFs|
-|[#1477](https://github.com/NVIDIA/spark-rapids/pull/1477)|Allow structs and arrays to pass through for Shuffle and Sort |
-|[#1489](https://github.com/NVIDIA/spark-rapids/pull/1489)|Adds in some support for the array sql function|
-|[#1438](https://github.com/NVIDIA/spark-rapids/pull/1438)|Cast from numeric types to decimal type|
-|[#1493](https://github.com/NVIDIA/spark-rapids/pull/1493)|Moved ParquetRecordMaterializer to the shim package to follow convention|
-|[#1495](https://github.com/NVIDIA/spark-rapids/pull/1495)|Fix merge conflict, merge branch 0.3 to branch 0.4 [skip ci]|
-|[#1472](https://github.com/NVIDIA/spark-rapids/pull/1472)|Add an example RAPIDS-accelerated Hive UDF using native code|
-|[#1488](https://github.com/NVIDIA/spark-rapids/pull/1488)|Rename Spark 3.1.0 shim to Spark 3.1.1 to match community|
-|[#1474](https://github.com/NVIDIA/spark-rapids/pull/1474)|Fix link|
-|[#1476](https://github.com/NVIDIA/spark-rapids/pull/1476)|DecimalType support for Aggregate Count|
-|[#1475](https://github.com/NVIDIA/spark-rapids/pull/1475)|Join support for DecimalType|
-|[#1244](https://github.com/NVIDIA/spark-rapids/pull/1244)|Support round and bround SQL functions |
-|[#1458](https://github.com/NVIDIA/spark-rapids/pull/1458)|Add in support for struct and named_struct|
-|[#1465](https://github.com/NVIDIA/spark-rapids/pull/1465)|DecimalType support for UnionExec and ExpandExec|
-|[#1450](https://github.com/NVIDIA/spark-rapids/pull/1450)|Add dynamic configs for the spark-rapids IT pipelines|
-|[#1207](https://github.com/NVIDIA/spark-rapids/pull/1207)|Spark SQL hash function using murmur3|
-|[#1457](https://github.com/NVIDIA/spark-rapids/pull/1457)|Support reading decimal columns from parquet files on Databricks|
-|[#1455](https://github.com/NVIDIA/spark-rapids/pull/1455)|Upgrade Scala Maven Plugin to 4.3.0|
-|[#1453](https://github.com/NVIDIA/spark-rapids/pull/1453)|DecimalType support for IfElse and Coalesce|
-|[#1452](https://github.com/NVIDIA/spark-rapids/pull/1452)|Support DecimalType for CaseWhen|
-|[#1444](https://github.com/NVIDIA/spark-rapids/pull/1444)|Improve UX when running benchmarks from Spark shell|
-|[#1294](https://github.com/NVIDIA/spark-rapids/pull/1294)|Support reading decimal columns from parquet files|
-|[#1153](https://github.com/NVIDIA/spark-rapids/pull/1153)|Scala UDF will compile children expressions in Project|
-|[#1416](https://github.com/NVIDIA/spark-rapids/pull/1416)|Optimize mvn dependency download scripts|
-|[#1430](https://github.com/NVIDIA/spark-rapids/pull/1430)|Add project for testing code that requires Spark 3.1.0 or later|
-|[#1425](https://github.com/NVIDIA/spark-rapids/pull/1425)|Add in Decimal support for abs, floor, ceil, unary - and unary +|
-|[#1427](https://github.com/NVIDIA/spark-rapids/pull/1427)|Revert "Make the multi-threaded parquet reader the default"|
-|[#1420](https://github.com/NVIDIA/spark-rapids/pull/1420)|Add udf jar to nightly integration tests|
-|[#1422](https://github.com/NVIDIA/spark-rapids/pull/1422)|Log the number of concurrent gpu tasks allowed on Executor startup|
-|[#1401](https://github.com/NVIDIA/spark-rapids/pull/1401)|Accelerate the coalescing parquet reader when reading files from multiple partitioned folders|
-|[#1413](https://github.com/NVIDIA/spark-rapids/pull/1413)|Add config for cast float to integral types|
-|[#1313](https://github.com/NVIDIA/spark-rapids/pull/1313)|Support spilling to disk directly via cuFile/GDS|
-|[#1411](https://github.com/NVIDIA/spark-rapids/pull/1411)|Add udf-examples jar to databricks build|
-|[#1412](https://github.com/NVIDIA/spark-rapids/pull/1412)|Fix a lot of tests marked with xfail for Spark 3.1.0 that no longer fail|
-|[#1414](https://github.com/NVIDIA/spark-rapids/pull/1414)|Build merged code of HEAD and BASE branch for pre-merge [skip ci]|
-|[#1409](https://github.com/NVIDIA/spark-rapids/pull/1409)|Add option to use decimals in tpc-ds csv to parquet conversion|
-|[#1410](https://github.com/NVIDIA/spark-rapids/pull/1410)|Add Decimal support for In, InSet, AtLeastNNonNulls, GetArrayItem, GetStructField, and GenerateExec|
-|[#1408](https://github.com/NVIDIA/spark-rapids/pull/1408)|Support RAPIDS-accelerated HiveGenericUDF|
-|[#1407](https://github.com/NVIDIA/spark-rapids/pull/1407)|Update docs and tests for null CSV support|
-|[#1393](https://github.com/NVIDIA/spark-rapids/pull/1393)|Support RAPIDS-accelerated HiveSimpleUDF|
-|[#1392](https://github.com/NVIDIA/spark-rapids/pull/1392)|Turn on hash partitioning for decimal support|
-|[#1402](https://github.com/NVIDIA/spark-rapids/pull/1402)|Better GPU Cast type checks|
-|[#1404](https://github.com/NVIDIA/spark-rapids/pull/1404)|Fix branch 0.4 merge conflict|
-|[#1323](https://github.com/NVIDIA/spark-rapids/pull/1323)|More advanced type checking and documentation|
-|[#1391](https://github.com/NVIDIA/spark-rapids/pull/1391)|Remove extra null join filtering because cudf is fast for this now.|
-|[#1395](https://github.com/NVIDIA/spark-rapids/pull/1395)|Fix branch-0.3 -> branch-0.4 automerge|
-|[#1382](https://github.com/NVIDIA/spark-rapids/pull/1382)|Handle "MM[/-]dd" and "dd[/-]MM" datetime formats in UnixTimeExprMeta|
-|[#1390](https://github.com/NVIDIA/spark-rapids/pull/1390)|Accelerated columnar to row/row to columnar for decimal|
-|[#1380](https://github.com/NVIDIA/spark-rapids/pull/1380)|Adds in basic support for decimal sort, sum, and some shuffle|
-|[#1367](https://github.com/NVIDIA/spark-rapids/pull/1367)|Reuse gpu expression conversion rules when checking sort order|
-|[#1349](https://github.com/NVIDIA/spark-rapids/pull/1349)|Add canonicalization tests|
-|[#1368](https://github.com/NVIDIA/spark-rapids/pull/1368)|Move to cudf 0.18-SNAPSHOT|
-|[#1361](https://github.com/NVIDIA/spark-rapids/pull/1361)|Use the correct precision when reading spark columnar data.|
-|[#1273](https://github.com/NVIDIA/spark-rapids/pull/1273)|Update docs and scripts to 0.4.0-SNAPSHOT|
-|[#1321](https://github.com/NVIDIA/spark-rapids/pull/1321)|Refactor to stop inheriting from HashJoin|
-|[#1311](https://github.com/NVIDIA/spark-rapids/pull/1311)|ParquetCachedBatchSerializer code cleanup|
-|[#1303](https://github.com/NVIDIA/spark-rapids/pull/1303)|Add explicit outputOrdering for BHJ and SHJ in spark310 shim|
-|[#1299](https://github.com/NVIDIA/spark-rapids/pull/1299)|Benchmark runner improved error handling|
-
-## Release 0.3
-
-### Features
-|||
-|:---|:---|
-|[#1002](https://github.com/NVIDIA/spark-rapids/issues/1002)|[FEA] RapidsHostColumnVectorCore should verify cudf data with respect to the expected spark type |
-|[#444](https://github.com/NVIDIA/spark-rapids/issues/444)|[FEA] Pluggable Cache|
-|[#1158](https://github.com/NVIDIA/spark-rapids/issues/1158)|[FEA] Better documentation on type support|
-|[#57](https://github.com/NVIDIA/spark-rapids/issues/57)|[FEA] Support INT96 for parquet reads and writes|
-|[#1003](https://github.com/NVIDIA/spark-rapids/issues/1003)|[FEA] Reduce overlap between RapidsHostColumnVector and RapidsHostColumnVectorCore|
-|[#913](https://github.com/NVIDIA/spark-rapids/issues/913)|[FEA] In Pluggable Cache Support CalendarInterval while creating CachedBatches|
-|[#1092](https://github.com/NVIDIA/spark-rapids/issues/1092)|[FEA] In Pluggable Cache handle nested types having CalendarIntervalType and NullType|
-|[#670](https://github.com/NVIDIA/spark-rapids/issues/670)|[FEA] Support NullType|
-|[#50](https://github.com/NVIDIA/spark-rapids/issues/50)|[FEA] support `spark.sql.legacy.timeParserPolicy`|
-|[#1144](https://github.com/NVIDIA/spark-rapids/issues/1144)|[FEA] Remove Databricks 3.0.0 shim layer|
-|[#1096](https://github.com/NVIDIA/spark-rapids/issues/1096)|[FEA] Implement parquet CreateDataSourceTableAsSelectCommand|
-|[#688](https://github.com/NVIDIA/spark-rapids/issues/688)|[FEA] udf compiler should be auto-appended to `spark.sql.extensions`|
-|[#502](https://github.com/NVIDIA/spark-rapids/issues/502)|[FEA] Support Databricks 7.3 LTS Runtime|
-|[#764](https://github.com/NVIDIA/spark-rapids/issues/764)|[FEA] Sanity checks for cudf jar mismatch|
-|[#1018](https://github.com/NVIDIA/spark-rapids/issues/1018)|[FEA] Log details related to GPU memory fragmentation on GPU OOM|
-|[#619](https://github.com/NVIDIA/spark-rapids/issues/619)|[FEA] log whether libcudf and libcudfjni were built for PTDS|
-|[#905](https://github.com/NVIDIA/spark-rapids/issues/905)|[FEA] create AWS EMR 3.0.1 shim|
-|[#838](https://github.com/NVIDIA/spark-rapids/issues/838)|[FEA] Support window count for a column|
-|[#864](https://github.com/NVIDIA/spark-rapids/issues/864)|[FEA] config option to enable RMM arena memory resource|
-|[#430](https://github.com/NVIDIA/spark-rapids/issues/430)|[FEA] Audit: Parquet Writer support for TIMESTAMP_MILLIS|
-|[#818](https://github.com/NVIDIA/spark-rapids/issues/818)|[FEA] Create shim layer for AWS EMR |
-|[#608](https://github.com/NVIDIA/spark-rapids/issues/608)|[FEA] Parquet small file optimization: improve merge schema handling|
-
-### Performance
-|||
-|:---|:---|
-|[#446](https://github.com/NVIDIA/spark-rapids/issues/446)|[FEA] Test jucx in 1.9.x branch|
-|[#1038](https://github.com/NVIDIA/spark-rapids/issues/1038)|[FEA] Accelerate the data transfer for plan `WindowInPandasExec`|
-|[#533](https://github.com/NVIDIA/spark-rapids/issues/533)|[FEA] Improve PTDS performance|
-|[#849](https://github.com/NVIDIA/spark-rapids/issues/849)|[FEA] Have GpuColumnarBatchSerializer return GpuColumnVectorFromBuffer instances|
-|[#784](https://github.com/NVIDIA/spark-rapids/issues/784)|[FEA] Allow Host Spilling to be more dynamic|
-|[#627](https://github.com/NVIDIA/spark-rapids/issues/627)|[FEA] Further parquet reading small file improvements|
-|[#5](https://github.com/NVIDIA/spark-rapids/issues/5)|[FEA] Support Adaptive Execution|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#1423](https://github.com/NVIDIA/spark-rapids/issues/1423)|[BUG] Mortgage ETL sample failed with spark.sql.adaptive enabled on AWS EMR 6.2 |
-|[#1369](https://github.com/NVIDIA/spark-rapids/issues/1369)|[BUG] TPC-DS Query Failing on EMR 6.2 with AQE|
-|[#1344](https://github.com/NVIDIA/spark-rapids/issues/1344)|[BUG] Spark-rapids pytests failed on Databricks cluster in Spark standalone mode|
-|[#1279](https://github.com/NVIDIA/spark-rapids/issues/1279)|[BUG] TPC-DS query 2 failing with NPE|
-|[#1280](https://github.com/NVIDIA/spark-rapids/issues/1280)|[BUG] TPC-DS query 93 failing with UnsupportedOperationException|
-|[#1308](https://github.com/NVIDIA/spark-rapids/issues/1308)|[BUG] TPC-DS query 14a runs much slower on 0.3|
-|[#1284](https://github.com/NVIDIA/spark-rapids/issues/1284)|[BUG] TPC-DS query 77 at scale=1TB fails with maxResultSize exceeded error|
-|[#1061](https://github.com/NVIDIA/spark-rapids/issues/1061)|[BUG] orc_test.py is failing|
-|[#1197](https://github.com/NVIDIA/spark-rapids/issues/1197)|[BUG] java.lang.NullPointerException when exporting delta table|
-|[#685](https://github.com/NVIDIA/spark-rapids/issues/685)|[BUG] In ParquetCachedBatchSerializer, serializing parquet buffers might blow up in certain cases|
-|[#1269](https://github.com/NVIDIA/spark-rapids/issues/1269)|[BUG] GpuSubstring is not expected to be a part of a SortOrder|
-|[#1246](https://github.com/NVIDIA/spark-rapids/issues/1246)|[BUG] Many TPC-DS benchmarks fail when writing to Parquet|
-|[#961](https://github.com/NVIDIA/spark-rapids/issues/961)|[BUG] ORC predicate pushdown should work with case-insensitive analysis|
-|[#962](https://github.com/NVIDIA/spark-rapids/issues/962)|[BUG] Loading columns from an ORC file without column names returns no data|
-|[#1245](https://github.com/NVIDIA/spark-rapids/issues/1245)|[BUG] Code adding buffers to the spillable store should synchronize|
-|[#570](https://github.com/NVIDIA/spark-rapids/issues/570)|[BUG] Continue debugging OOM after ensuring device store is empty|
-|[#972](https://github.com/NVIDIA/spark-rapids/issues/972)|[BUG] total time metric is redundant with scan time|
-|[#1039](https://github.com/NVIDIA/spark-rapids/issues/1039)|[BUG] UNBOUNDED window ranges on null timestamp columns produce incorrect results.|
-|[#1195](https://github.com/NVIDIA/spark-rapids/issues/1195)|[BUG] AcceleratedColumnarToRowIterator queue empty|
-|[#1177](https://github.com/NVIDIA/spark-rapids/issues/1177)|[BUG] leaks possible in the rapids shuffle if batches are received after the task completes|
-|[#1216](https://github.com/NVIDIA/spark-rapids/issues/1216)|[BUG] Failure to recognize ORC file format when loaded via Hive|
-|[#898](https://github.com/NVIDIA/spark-rapids/issues/898)|[BUG] count reductions are failing on databricks because of lack of Complete support|
-|[#1184](https://github.com/NVIDIA/spark-rapids/issues/1184)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks 3.0.1|
-|[#1151](https://github.com/NVIDIA/spark-rapids/issues/1151)|[BUG] Add databricks 3.0.1 shim layer for GpuWindowInPandasExec.|
-|[#1199](https://github.com/NVIDIA/spark-rapids/issues/1199)|[BUG] No data size in Input column in Stages page from Spark UI when using Parquet as file source|
-|[#1031](https://github.com/NVIDIA/spark-rapids/issues/1031)|[BUG] dependency info properties file contains error messages|
-|[#1149](https://github.com/NVIDIA/spark-rapids/issues/1149)|[BUG] Scaladoc warnings in GpuDataSource|
-|[#1185](https://github.com/NVIDIA/spark-rapids/issues/1185)|[BUG] test_hash_multiple_mode_query failing|
-|[#724](https://github.com/NVIDIA/spark-rapids/issues/724)|[BUG] PySpark test_broadcast_nested_loop_join_special_case intermittent failure|
-|[#1164](https://github.com/NVIDIA/spark-rapids/issues/1164)|[BUG] ansi_cast tests are failing in 3.1.0|
-|[#1110](https://github.com/NVIDIA/spark-rapids/issues/1110)|[BUG] Special date "now" has wrong value on GPU|
-|[#1139](https://github.com/NVIDIA/spark-rapids/issues/1139)|[BUG] Host columnar to GPU can be very slow|
-|[#1094](https://github.com/NVIDIA/spark-rapids/issues/1094)|[BUG] unix_timestamp on GPU returns invalid data for special dates|
-|[#1098](https://github.com/NVIDIA/spark-rapids/issues/1098)|[BUG] unix_timestamp on GPU returns invalid data for bad input|
-|[#1082](https://github.com/NVIDIA/spark-rapids/issues/1082)|[BUG] string to timestamp conversion fails with split|
-|[#1140](https://github.com/NVIDIA/spark-rapids/issues/1140)|[BUG] ConcurrentModificationException error after scala test suite completes|
-|[#1073](https://github.com/NVIDIA/spark-rapids/issues/1073)|[BUG] java.lang.RuntimeException: BinaryExpressions must override either eval or nullSafeEval|
-|[#975](https://github.com/NVIDIA/spark-rapids/issues/975)|[BUG] BroadcastExchangeExec fails to fall back to CPU on driver node on GCP Dataproc|
-|[#773](https://github.com/NVIDIA/spark-rapids/issues/773)|[BUG] Investigate high task deserialization|
-|[#1035](https://github.com/NVIDIA/spark-rapids/issues/1035)|[BUG] TPC-DS query 90 with AQE enabled fails with doExecuteBroadcast exception|
-|[#825](https://github.com/NVIDIA/spark-rapids/issues/825)|[BUG] test_window_aggs_for_ranges intermittently fails|
-|[#1008](https://github.com/NVIDIA/spark-rapids/issues/1008)|[BUG] limit function is producing inconsistent result when type is Byte, Long, Boolean and Timestamp|
-|[#996](https://github.com/NVIDIA/spark-rapids/issues/996)|[BUG] TPC-DS benchmark via spark-submit does not provide option to disable appending .dat to path|
-|[#1006](https://github.com/NVIDIA/spark-rapids/issues/1006)|[BUG] Spark 3.1.0 change to BasicWriteTaskStats breaks BasicColumnarWriteTaskStatsTracker|
-|[#985](https://github.com/NVIDIA/spark-rapids/issues/985)|[BUG] missing metric `dataSize`|
-|[#881](https://github.com/NVIDIA/spark-rapids/issues/881)|[BUG] cannot disable Sort by itself|
-|[#812](https://github.com/NVIDIA/spark-rapids/issues/812)|[BUG] Test failures for 0.2 when run with multiple executors|
-|[#925](https://github.com/NVIDIA/spark-rapids/issues/925)|[BUG] Range window-functions with non-timestamp order-by expressions not falling back to CPU|
-|[#852](https://github.com/NVIDIA/spark-rapids/issues/852)|[BUG] BenchUtils.compareResults cannot compare partitioned files when ignoreOrdering=false|
-|[#868](https://github.com/NVIDIA/spark-rapids/issues/868)|[BUG] Rounding error when casting timestamp to string for timestamps before 1970|
-|[#880](https://github.com/NVIDIA/spark-rapids/issues/880)|[BUG] doing a window operation with an orderby for a single constant crashes|
-|[#776](https://github.com/NVIDIA/spark-rapids/issues/776)|[BUG] Integration test fails on spark 3.1.0-SNAPSHOT|
-|[#874](https://github.com/NVIDIA/spark-rapids/issues/874)|[BUG] `RapidsConf.scala` has some inconsistency for `spark.rapids.sql.format.parquet.multiThreadedRead`|
-|[#860](https://github.com/NVIDIA/spark-rapids/issues/860)|[BUG] we need to mark columns from received shuffle buffers as `GpuColumnVectorFromBuffer`|
-|[#122](https://github.com/NVIDIA/spark-rapids/issues/122)|[BUG] CSV Timestamp parsing is broken for TS < 1902 and TS > 2038|
-|[#810](https://github.com/NVIDIA/spark-rapids/issues/810)|[BUG] UDF Integration tests fail if pandas is not installed|
-|[#746](https://github.com/NVIDIA/spark-rapids/issues/746)|[BUG] cudf_udf_test.py is flaky|
-|[#811](https://github.com/NVIDIA/spark-rapids/issues/811)|[BUG] 0.3 nightly is timing out |
-|[#574](https://github.com/NVIDIA/spark-rapids/issues/574)|[BUG] Fix GpuTimeSub for Spark 3.1.0|
-
-### PRs
-|||
-|:---|:---|
-|[#1496](https://github.com/NVIDIA/spark-rapids/pull/1496)|Update changelog for v0.3.0 release [skip ci]|
-|[#1473](https://github.com/NVIDIA/spark-rapids/pull/1473)|Update documentation for 0.3 release|
-|[#1371](https://github.com/NVIDIA/spark-rapids/pull/1371)|Start Guide for RAPIDS on AWS EMR 6.2|
-|[#1446](https://github.com/NVIDIA/spark-rapids/pull/1446)|Update changelog for 0.3.0 release [skip ci]|
-|[#1439](https://github.com/NVIDIA/spark-rapids/pull/1439)|when AQE is enabled we fail to fix up exchanges properly on EMR|
-|[#1433](https://github.com/NVIDIA/spark-rapids/pull/1433)|fix pandas 1.2 compatibility issue|
-|[#1424](https://github.com/NVIDIA/spark-rapids/pull/1424)|Make the multi-threaded parquet reader the default since coalescing doesn't handle partitioned files well|
-|[#1389](https://github.com/NVIDIA/spark-rapids/pull/1389)|Update project version to 0.3.0|
-|[#1387](https://github.com/NVIDIA/spark-rapids/pull/1387)|Update cudf version to 0.17|
-|[#1370](https://github.com/NVIDIA/spark-rapids/pull/1370)|[REVIEW] init changelog 0.3 [skip ci]|
-|[#1376](https://github.com/NVIDIA/spark-rapids/pull/1376)|MetaUtils.getBatchFromMeta should return batches with GpuColumnVectorFromBuffer|
-|[#1358](https://github.com/NVIDIA/spark-rapids/pull/1358)|auto-merge: instant merge after creation [skip ci]|
-|[#1359](https://github.com/NVIDIA/spark-rapids/pull/1359)|Use SortOrder from shims.|
-|[#1343](https://github.com/NVIDIA/spark-rapids/pull/1343)|Do not run UDFs when the partition is empty.|
-|[#1342](https://github.com/NVIDIA/spark-rapids/pull/1342)|Fix and edit docs for standalone mode|
-|[#1350](https://github.com/NVIDIA/spark-rapids/pull/1350)|fix GpuRangePartitioning canonicalization|
-|[#1281](https://github.com/NVIDIA/spark-rapids/pull/1281)|Documentation added for testing|
-|[#1336](https://github.com/NVIDIA/spark-rapids/pull/1336)|Fix missing post-shuffle coalesce with AQE|
-|[#1318](https://github.com/NVIDIA/spark-rapids/pull/1318)|Fix copying GpuFileSourceScanExec node|
-|[#1337](https://github.com/NVIDIA/spark-rapids/pull/1337)|Use UTC instead of GMT|
-|[#1307](https://github.com/NVIDIA/spark-rapids/pull/1307)|Fallback to cpu when reading Delta log files for stats|
-|[#1310](https://github.com/NVIDIA/spark-rapids/pull/1310)|Fix canonicalization of GpuFileSourceScanExec, GpuShuffleCoalesceExec|
-|[#1302](https://github.com/NVIDIA/spark-rapids/pull/1302)|Add GpuSubstring handling to SortOrder canonicalization|
-|[#1265](https://github.com/NVIDIA/spark-rapids/pull/1265)|Chunking input before writing a ParquetCachedBatch|
-|[#1278](https://github.com/NVIDIA/spark-rapids/pull/1278)|Add a config to disable decimal types by default|
-|[#1272](https://github.com/NVIDIA/spark-rapids/pull/1272)|Add Alias to shims|
-|[#1268](https://github.com/NVIDIA/spark-rapids/pull/1268)|Adds in support docs for 0.3 release|
-|[#1235](https://github.com/NVIDIA/spark-rapids/pull/1235)|Trigger reading and handling control data.|
-|[#1266](https://github.com/NVIDIA/spark-rapids/pull/1266)|Updating Databricks getting started for 0.3 release|
-|[#1291](https://github.com/NVIDIA/spark-rapids/pull/1291)|Increase pre-merge resource requests [skip ci]|
-|[#1275](https://github.com/NVIDIA/spark-rapids/pull/1275)|Temporarily disable more CAST tests for Spark 3.1.0|
-|[#1264](https://github.com/NVIDIA/spark-rapids/pull/1264)|Fix race condition in batch creation|
-|[#1260](https://github.com/NVIDIA/spark-rapids/pull/1260)|Update UCX license info in NOTIFY-binary for 1.9 and RAPIDS plugin copyright dates|
-|[#1247](https://github.com/NVIDIA/spark-rapids/pull/1247)|Ensure column names are valid when writing benchmark query results to file|
-|[#1240](https://github.com/NVIDIA/spark-rapids/pull/1240)|Fix loading from ORC file with no column names|
-|[#1242](https://github.com/NVIDIA/spark-rapids/pull/1242)|Remove compatibility documentation about unsupported INT96|
-|[#1192](https://github.com/NVIDIA/spark-rapids/pull/1192)|[REVIEW] Support GpuFilter and GpuCoalesceBatches for decimal data|
-|[#1170](https://github.com/NVIDIA/spark-rapids/pull/1170)|Add nested type support to MetaUtils|
-|[#1194](https://github.com/NVIDIA/spark-rapids/pull/1194)|Drop redundant total time metric from scan|
-|[#1248](https://github.com/NVIDIA/spark-rapids/pull/1248)|At BatchedTableCompressor.finish synchronize to allow for "right-size…|
-|[#1169](https://github.com/NVIDIA/spark-rapids/pull/1169)|Use CUDF's "UNBOUNDED" window boundaries for time-range queries.|
-|[#1204](https://github.com/NVIDIA/spark-rapids/pull/1204)|Avoid empty batches on columnar to row conversion|
-|[#1133](https://github.com/NVIDIA/spark-rapids/pull/1133)|Refactor batch coalesce to be based solely on batch data size|
-|[#1237](https://github.com/NVIDIA/spark-rapids/pull/1237)|In transport, limit pending transfer requests to fit within a bounce|
-|[#1232](https://github.com/NVIDIA/spark-rapids/pull/1232)|Move SortOrder creation to shims|
-|[#1068](https://github.com/NVIDIA/spark-rapids/pull/1068)|Write int96 to parquet|
-|[#1193](https://github.com/NVIDIA/spark-rapids/pull/1193)|Verify shuffle of decimal columns|
-|[#1180](https://github.com/NVIDIA/spark-rapids/pull/1180)|Remove batches if they are received after the iterator detects that t…|
-|[#1173](https://github.com/NVIDIA/spark-rapids/pull/1173)|Support relational operators for decimal type|
-|[#1220](https://github.com/NVIDIA/spark-rapids/pull/1220)|Support replacing ORC format when Hive is configured|
-|[#1219](https://github.com/NVIDIA/spark-rapids/pull/1219)|Upgrade to jucx 1.9.0|
-|[#1081](https://github.com/NVIDIA/spark-rapids/pull/1081)|Add option to upload benchmark summary JSON file|
-|[#1217](https://github.com/NVIDIA/spark-rapids/pull/1217)|Aggregate reductions in Complete mode should use updateExpressions|
-|[#1218](https://github.com/NVIDIA/spark-rapids/pull/1218)|Remove obsolete HiveStringType usage|
-|[#1214](https://github.com/NVIDIA/spark-rapids/pull/1214)|changelog update 2020-11-30. Trigger automerge check [skip ci]|
-|[#1210](https://github.com/NVIDIA/spark-rapids/pull/1210)|Support auto-merge for branch-0.4 [skip ci]|
-|[#1202](https://github.com/NVIDIA/spark-rapids/pull/1202)|Fix a bug with the support for java.lang.StringBuilder.append.|
-|[#1213](https://github.com/NVIDIA/spark-rapids/pull/1213)|Skip casting StringType to TimestampType for Spark 310|
-|[#1201](https://github.com/NVIDIA/spark-rapids/pull/1201)|Replace only window expressions on databricks.|
-|[#1208](https://github.com/NVIDIA/spark-rapids/pull/1208)|[BUG] Fix GHSL2020-239 [skip ci]|
-|[#1205](https://github.com/NVIDIA/spark-rapids/pull/1205)|Fix missing input bytes read metric for Parquet|
-|[#1206](https://github.com/NVIDIA/spark-rapids/pull/1206)|Update Spark 3.1 shim for ShuffleOrigin shuffle parameter|
-|[#1196](https://github.com/NVIDIA/spark-rapids/pull/1196)|Rename ShuffleCoalesceExec to GpuShuffleCoalesceExec|
-|[#1191](https://github.com/NVIDIA/spark-rapids/pull/1191)|Skip window array tests for databricks.|
-|[#1183](https://github.com/NVIDIA/spark-rapids/pull/1183)|Support for CalendarIntervalType and NullType|
-|[#1150](https://github.com/NVIDIA/spark-rapids/pull/1150)|udf spec|
-|[#1188](https://github.com/NVIDIA/spark-rapids/pull/1188)|Add in tests for parquet nested pruning support|
-|[#1189](https://github.com/NVIDIA/spark-rapids/pull/1189)|Enable NullType for First and Last in 3.0.1+|
-|[#1181](https://github.com/NVIDIA/spark-rapids/pull/1181)|Fix resource leaks in unit tests|
-|[#1186](https://github.com/NVIDIA/spark-rapids/pull/1186)|Fix compilation and scaladoc warnings|
-|[#1187](https://github.com/NVIDIA/spark-rapids/pull/1187)|Updated documentation for distinct count compatibility|
-|[#1182](https://github.com/NVIDIA/spark-rapids/pull/1182)|Close buffer catalog on device manager shutdown|
-|[#1137](https://github.com/NVIDIA/spark-rapids/pull/1137)|Let GpuWindowInPandas declare ArrayType supported.|
-|[#1176](https://github.com/NVIDIA/spark-rapids/pull/1176)|Add in support for null type|
-|[#1174](https://github.com/NVIDIA/spark-rapids/pull/1174)|Fix race condition in SerializeConcatHostBuffersDeserializeBatch|
-|[#1175](https://github.com/NVIDIA/spark-rapids/pull/1175)|Fix leaks seen in shuffle tests|
-|[#1138](https://github.com/NVIDIA/spark-rapids/pull/1138)|[REVIEW] Support decimal type for GpuProjectExec|
-|[#1162](https://github.com/NVIDIA/spark-rapids/pull/1162)|Set job descriptions in benchmark runner|
-|[#1172](https://github.com/NVIDIA/spark-rapids/pull/1172)|Revert "Fix race condition (#1165)"|
-|[#1060](https://github.com/NVIDIA/spark-rapids/pull/1060)|Show partition metrics for custom shuffle reader|
-|[#1152](https://github.com/NVIDIA/spark-rapids/pull/1152)|Add spark301db shim layer for WindowInPandas.|
-|[#1167](https://github.com/NVIDIA/spark-rapids/pull/1167)|Nulls out the dataframe if --gc-between-runs is set|
-|[#1165](https://github.com/NVIDIA/spark-rapids/pull/1165)|Fix race condition in SerializeConcatHostBuffersDeserializeBatch|
-|[#1163](https://github.com/NVIDIA/spark-rapids/pull/1163)|Add in support for GetStructField|
-|[#1166](https://github.com/NVIDIA/spark-rapids/pull/1166)|Fix the cast tests for 3.1.0+|
-|[#1159](https://github.com/NVIDIA/spark-rapids/pull/1159)|fix bug where 'now' had same value as 'today' for timestamps|
-|[#1161](https://github.com/NVIDIA/spark-rapids/pull/1161)|Fix nightly build pipeline failure.|
-|[#1160](https://github.com/NVIDIA/spark-rapids/pull/1160)|Fix some performance problems with columnar to columnar conversion|
-|[#1105](https://github.com/NVIDIA/spark-rapids/pull/1105)|[REVIEW] Change ColumnViewAccess usage to work with ColumnView|
-|[#1148](https://github.com/NVIDIA/spark-rapids/pull/1148)|Add in tests for Maps and extend map support where possible|
-|[#1154](https://github.com/NVIDIA/spark-rapids/pull/1154)|Mark test as xfail until we can get a fix in|
-|[#1113](https://github.com/NVIDIA/spark-rapids/pull/1113)|Support unix_timestamp on GPU for subset of formats|
-|[#1156](https://github.com/NVIDIA/spark-rapids/pull/1156)|Fix warning introduced in iterator suite|
-|[#1095](https://github.com/NVIDIA/spark-rapids/pull/1095)|Dependency info|
-|[#1145](https://github.com/NVIDIA/spark-rapids/pull/1145)|Remove support for databricks 7.0 runtime - shim spark300db|
-|[#1147](https://github.com/NVIDIA/spark-rapids/pull/1147)|Change the assert to require for handling TIMESTAMP_MILLIS in isDateTimeRebaseNeeded |
-|[#1132](https://github.com/NVIDIA/spark-rapids/pull/1132)|Add in basic support to read structs from parquet|
-|[#1121](https://github.com/NVIDIA/spark-rapids/pull/1121)|Shuffle/better error handling|
-|[#1134](https://github.com/NVIDIA/spark-rapids/pull/1134)|Support saveAsTable for writing orc and parquet|
-|[#1124](https://github.com/NVIDIA/spark-rapids/pull/1124)|Add shim layers for GpuWindowInPandasExec.|
-|[#1131](https://github.com/NVIDIA/spark-rapids/pull/1131)|Add in some basic support for Structs|
-|[#1127](https://github.com/NVIDIA/spark-rapids/pull/1127)|Add in basic support for reading lists from parquet|
-|[#1129](https://github.com/NVIDIA/spark-rapids/pull/1129)|Fix resource leaks with new shuffle optimization|
-|[#1116](https://github.com/NVIDIA/spark-rapids/pull/1116)|Optimize normal shuffle by coalescing smaller batches on host|
-|[#1102](https://github.com/NVIDIA/spark-rapids/pull/1102)|Auto-register UDF extension when main plugin is set|
-|[#1108](https://github.com/NVIDIA/spark-rapids/pull/1108)|Remove integration test pipelines on NGCC|
-|[#1123](https://github.com/NVIDIA/spark-rapids/pull/1123)|Mark Pandas udf over window tests as xfail on databricks until they can be fixed|
-|[#1120](https://github.com/NVIDIA/spark-rapids/pull/1120)|Add in support for filtering ArrayType|
-|[#1080](https://github.com/NVIDIA/spark-rapids/pull/1080)|Support for CalendarIntervalType and NullType for ParquetCachedSerializer|
-|[#994](https://github.com/NVIDIA/spark-rapids/pull/994)|Packs bounce buffers for highly partitioned shuffles|
-|[#1112](https://github.com/NVIDIA/spark-rapids/pull/1112)|Remove bad config from pytest setup|
-|[#1107](https://github.com/NVIDIA/spark-rapids/pull/1107)|closeOnExcept -> withResources in MetaUtils|
-|[#1104](https://github.com/NVIDIA/spark-rapids/pull/1104)|Support lists to/from the GPU|
-|[#1106](https://github.com/NVIDIA/spark-rapids/pull/1106)|Improve mechanism for expected exceptions in tests|
-|[#1069](https://github.com/NVIDIA/spark-rapids/pull/1069)|Accelerate the data transfer between JVM and Python for the plan 'GpuWindowInPandasExec'|
-|[#1099](https://github.com/NVIDIA/spark-rapids/pull/1099)|Update how we deal with type checking|
-|[#1077](https://github.com/NVIDIA/spark-rapids/pull/1077)|Improve AQE transitions for shuffle and coalesce batches|
-|[#1097](https://github.com/NVIDIA/spark-rapids/pull/1097)|Cleanup some instances of excess closure serialization|
-|[#1090](https://github.com/NVIDIA/spark-rapids/pull/1090)|Fix the integration build|
-|[#1086](https://github.com/NVIDIA/spark-rapids/pull/1086)|Speed up test performance using pytest-xdist|
-|[#1084](https://github.com/NVIDIA/spark-rapids/pull/1084)|Avoid issues where more scalars than expected show up in an expression|
-|[#1076](https://github.com/NVIDIA/spark-rapids/pull/1076)|[FEA] Support Databricks 7.3 LTS Runtime|
-|[#1083](https://github.com/NVIDIA/spark-rapids/pull/1083)|Revert "Get cudf/spark dependency from the correct .m2 dir"|
-|[#1062](https://github.com/NVIDIA/spark-rapids/pull/1062)|Get cudf/spark dependency from the correct .m2 dir|
-|[#1078](https://github.com/NVIDIA/spark-rapids/pull/1078)|Another round of fixes for mapping of DataType to DType|
-|[#1066](https://github.com/NVIDIA/spark-rapids/pull/1066)|More fixes for conversion to ColumnarBatch|
-|[#1029](https://github.com/NVIDIA/spark-rapids/pull/1029)|BenchmarkRunner should produce JSON summary file even when queries fail|
-|[#1055](https://github.com/NVIDIA/spark-rapids/pull/1055)|Fix build warnings|
-|[#1064](https://github.com/NVIDIA/spark-rapids/pull/1064)|Use array instead of List for from(Table, DataType)|
-|[#1057](https://github.com/NVIDIA/spark-rapids/pull/1057)|Fix empty table broadcast requiring a GPU on driver node|
-|[#1047](https://github.com/NVIDIA/spark-rapids/pull/1047)|Sanity checks for cudf jar mismatch|
-|[#1044](https://github.com/NVIDIA/spark-rapids/pull/1044)|Accelerated row to columnar and columnar to row transitions|
-|[#1056](https://github.com/NVIDIA/spark-rapids/pull/1056)|Add query number to Spark app name when running benchmarks|
-|[#1054](https://github.com/NVIDIA/spark-rapids/pull/1054)|Log total RMM allocated on GPU OOM|
-|[#1053](https://github.com/NVIDIA/spark-rapids/pull/1053)|Remove isGpuBroadcastNestedLoopJoin from shims|
-|[#1052](https://github.com/NVIDIA/spark-rapids/pull/1052)|Allow for GPUCoalesceBatch to deal with Map|
-|[#1051](https://github.com/NVIDIA/spark-rapids/pull/1051)|Add simple retry for URM dependencies [skip ci]|
-|[#1046](https://github.com/NVIDIA/spark-rapids/pull/1046)|Fix broken links|
-|[#1017](https://github.com/NVIDIA/spark-rapids/pull/1017)|Log whether PTDS is enabled|
-|[#1040](https://github.com/NVIDIA/spark-rapids/pull/1040)|Update to cudf 0.17-SNAPSHOT and fix tests|
-|[#1042](https://github.com/NVIDIA/spark-rapids/pull/1042)|Fix inconsistencies in AQE support for broadcast joins|
-|[#1037](https://github.com/NVIDIA/spark-rapids/pull/1037)|Add in support for the SQL functions Least and Greatest|
-|[#1036](https://github.com/NVIDIA/spark-rapids/pull/1036)|Increase number of retries when waiting for databricks cluster|
-|[#1034](https://github.com/NVIDIA/spark-rapids/pull/1034)|[BUG] To honor spark.rapids.memory.gpu.pool=NONE|
-|[#854](https://github.com/NVIDIA/spark-rapids/pull/854)|Arbitrary function call in UDF|
-|[#1028](https://github.com/NVIDIA/spark-rapids/pull/1028)|Update to cudf-0.16|
-|[#1023](https://github.com/NVIDIA/spark-rapids/pull/1023)|Add --gc-between-run flag for TPC* benchmarks.|
-|[#1001](https://github.com/NVIDIA/spark-rapids/pull/1001)|ColumnarBatch to CachedBatch and back|
-|[#990](https://github.com/NVIDIA/spark-rapids/pull/990)|Parquet coalesce file reader for local filesystems|
-|[#1014](https://github.com/NVIDIA/spark-rapids/pull/1014)|Add --append-dat flag for TPC-DS benchmark|
-|[#991](https://github.com/NVIDIA/spark-rapids/pull/991)|Updated GCP Dataproc Mortgage-ETL-GPU.ipynb|
-|[#886](https://github.com/NVIDIA/spark-rapids/pull/886)|Spark BinaryType and cast to BinaryType|
-|[#1016](https://github.com/NVIDIA/spark-rapids/pull/1016)|Change Hash Aggregate to allow pass-through on MapType|
-|[#984](https://github.com/NVIDIA/spark-rapids/pull/984)|Add support for MapType in selected operators |
-|[#1012](https://github.com/NVIDIA/spark-rapids/pull/1012)|Update for new position parameter in Spark 3.1.0 RegExpReplace|
-|[#995](https://github.com/NVIDIA/spark-rapids/pull/995)|Add shim for EMR 3.0.1 and EMR 3.0.1-SNAPSHOT|
-|[#998](https://github.com/NVIDIA/spark-rapids/pull/998)|Update benchmark automation script|
-|[#1000](https://github.com/NVIDIA/spark-rapids/pull/1000)|Always use RAPIDS shuffle when running TPCH and Mortgage tests|
-|[#981](https://github.com/NVIDIA/spark-rapids/pull/981)|Change databricks build to dynamically create a cluster|
-|[#986](https://github.com/NVIDIA/spark-rapids/pull/986)|Fix missing dataSize metric when using RAPIDS shuffle|
-|[#914](https://github.com/NVIDIA/spark-rapids/pull/914)|Write InternalRow to CachedBatch|
-|[#934](https://github.com/NVIDIA/spark-rapids/pull/934)|Iterator to make it easier to work with a window of blocks in the RAPIDS shuffle|
-|[#992](https://github.com/NVIDIA/spark-rapids/pull/992)|Skip post-clean if aborted before the image build stage in pre-merge [skip ci]|
-|[#988](https://github.com/NVIDIA/spark-rapids/pull/988)|Change in Spark caused the 3.1.0 CI to fail|
-|[#983](https://github.com/NVIDIA/spark-rapids/pull/983)|clean jenkins file for premerge on NGCC|
-|[#964](https://github.com/NVIDIA/spark-rapids/pull/964)|Refactor TPC benchmarks to reduce duplicate code|
-|[#978](https://github.com/NVIDIA/spark-rapids/pull/978)|Enable scalastyle checks for udf-compiler module|
-|[#949](https://github.com/NVIDIA/spark-rapids/pull/949)|Fix GpuWindowExec to work with a CPU SortExec|
-|[#973](https://github.com/NVIDIA/spark-rapids/pull/973)|Stop reporting totalTime metric for GpuShuffleExchangeExec|
-|[#968](https://github.com/NVIDIA/spark-rapids/pull/968)|XFail pos_explode tests until final fix can be put in|
-|[#970](https://github.com/NVIDIA/spark-rapids/pull/970)|Add legacy config to clear active Spark 3.1.0 session in tests|
-|[#918](https://github.com/NVIDIA/spark-rapids/pull/918)|Benchmark runner script|
-|[#915](https://github.com/NVIDIA/spark-rapids/pull/915)|Add option to control number of partitions when converting from CSV to Parquet|
-|[#944](https://github.com/NVIDIA/spark-rapids/pull/944)|Fix some issues with non-determinism|
-|[#935](https://github.com/NVIDIA/spark-rapids/pull/935)|Add in support/tests for a window count on a column|
-|[#940](https://github.com/NVIDIA/spark-rapids/pull/940)|Fix closeOnExcept suppressed exception handling|
-|[#942](https://github.com/NVIDIA/spark-rapids/pull/942)|fix github action env setup [skip ci]|
-|[#933](https://github.com/NVIDIA/spark-rapids/pull/933)|Update first/last tests to avoid non-determinism and ordering differences|
-|[#931](https://github.com/NVIDIA/spark-rapids/pull/931)|Fix checking for nullable columns in window range query|
-|[#924](https://github.com/NVIDIA/spark-rapids/pull/924)|Benchmark guide update for command-line interface / spark-submit|
-|[#926](https://github.com/NVIDIA/spark-rapids/pull/926)|Move pandas_udf functions into the tests functions|
-|[#929](https://github.com/NVIDIA/spark-rapids/pull/929)|Pick a default tableId to use that is non 0 so that flatbuffers allow…|
-|[#928](https://github.com/NVIDIA/spark-rapids/pull/928)|Fix RapidsBufferStore NPE when no spillable buffers are available|
-|[#820](https://github.com/NVIDIA/spark-rapids/pull/820)|Benchmarking guide|
-|[#859](https://github.com/NVIDIA/spark-rapids/pull/859)|Compare partitioned files in order|
-|[#916](https://github.com/NVIDIA/spark-rapids/pull/916)|create new sparkContext explicitly in CPU notebook|
-|[#917](https://github.com/NVIDIA/spark-rapids/pull/917)|create new SparkContext in GPU notebook explicitly.|
-|[#919](https://github.com/NVIDIA/spark-rapids/pull/919)|Add label benchmark to performance subsection in changelog|
-|[#850](https://github.com/NVIDIA/spark-rapids/pull/850)| Add in basic support for lead/lag|
-|[#843](https://github.com/NVIDIA/spark-rapids/pull/843)|[REVIEW] Cache plugin to handle reading CachedBatch to an InternalRow|
-|[#904](https://github.com/NVIDIA/spark-rapids/pull/904)|Add command-line argument for benchmark result filename|
-|[#909](https://github.com/NVIDIA/spark-rapids/pull/909)|GCP preview version image name update|
-|[#903](https://github.com/NVIDIA/spark-rapids/pull/903)|update getting-started-gcp.md with new component list|
-|[#900](https://github.com/NVIDIA/spark-rapids/pull/900)|Turn off CollectLimitExec replacement by default|
-|[#907](https://github.com/NVIDIA/spark-rapids/pull/907)|remove configs from databricks that shouldn't be used by default|
-|[#893](https://github.com/NVIDIA/spark-rapids/pull/893)|Fix rounding error when casting timestamp to string for timestamps before 1970|
-|[#899](https://github.com/NVIDIA/spark-rapids/pull/899)|Mark reduction corner case tests as xfail on databricks until they can be fixed|
-|[#894](https://github.com/NVIDIA/spark-rapids/pull/894)|Replace whole-buffer slicing with direct refcounting|
-|[#891](https://github.com/NVIDIA/spark-rapids/pull/891)|Add config to dump heap on GPU OOM|
-|[#890](https://github.com/NVIDIA/spark-rapids/pull/890)|Clean up CoalesceBatch to use withResource|
-|[#892](https://github.com/NVIDIA/spark-rapids/pull/892)|Only manifest the current batch in cached block shuffle read iterator|
-|[#871](https://github.com/NVIDIA/spark-rapids/pull/871)|Add support for using the arena allocator|
-|[#889](https://github.com/NVIDIA/spark-rapids/pull/889)|Fix crash on scalar only orderby|
-|[#879](https://github.com/NVIDIA/spark-rapids/pull/879)|Update SpillableColumnarBatch to remove buffer from catalog on close|
-|[#888](https://github.com/NVIDIA/spark-rapids/pull/888)|Shrink detect scope to compile only [skip ci]|
-|[#885](https://github.com/NVIDIA/spark-rapids/pull/885)|[BUG] fix IT dockerfile arguments [skip ci]|
-|[#883](https://github.com/NVIDIA/spark-rapids/pull/883)|[BUG] fix IT dockerfile args ordering [skip ci]|
-|[#875](https://github.com/NVIDIA/spark-rapids/pull/875)|fix the inconsistency for `spark.rapids.sql.format.parquet.multiThreadedRead` in RapidsConf.scala|
-|[#862](https://github.com/NVIDIA/spark-rapids/pull/862)|Migrate nightly&integration pipelines to blossom [skip ci]|
-|[#872](https://github.com/NVIDIA/spark-rapids/pull/872)|Ensure that receive-side batches use GpuColumnVectorFromBuffer to avoid|
-|[#833](https://github.com/NVIDIA/spark-rapids/pull/833)|Add nvcomp LZ4 codec support|
-|[#870](https://github.com/NVIDIA/spark-rapids/pull/870)|Cleaned up tests and documentation for csv timestamp parsing|
-|[#823](https://github.com/NVIDIA/spark-rapids/pull/823)|Add command-line interface for TPC-* for use with spark-submit|
-|[#856](https://github.com/NVIDIA/spark-rapids/pull/856)|Move GpuWindowInPandasExec in shims layers|
-|[#756](https://github.com/NVIDIA/spark-rapids/pull/756)|Add stream-time metric|
-|[#832](https://github.com/NVIDIA/spark-rapids/pull/832)|Skip pandas tests if pandas cannot be found|
-|[#841](https://github.com/NVIDIA/spark-rapids/pull/841)|Fix a hanging issue when processing empty data.|
-|[#840](https://github.com/NVIDIA/spark-rapids/pull/840)|[REVIEW] Fixed failing cache tests|
-|[#848](https://github.com/NVIDIA/spark-rapids/pull/848)|Update task memory and disk spill metrics when buffer store spills|
-|[#851](https://github.com/NVIDIA/spark-rapids/pull/851)|Use contiguous table when deserializing columnar batch|
-|[#857](https://github.com/NVIDIA/spark-rapids/pull/857)|fix pvc scheduling issue|
-|[#853](https://github.com/NVIDIA/spark-rapids/pull/853)|Remove nodeAffinity from premerge pipeline|
-|[#796](https://github.com/NVIDIA/spark-rapids/pull/796)|Record spark plan SQL metrics to JSON when running benchmarks|
-|[#781](https://github.com/NVIDIA/spark-rapids/pull/781)|Add AQE unit tests|
-|[#824](https://github.com/NVIDIA/spark-rapids/pull/824)|Skip cudf_udf test by default|
-|[#839](https://github.com/NVIDIA/spark-rapids/pull/839)|First/Last reduction and cleanup of agg APIs|
-|[#827](https://github.com/NVIDIA/spark-rapids/pull/827)|Add Spark 3.0 EMR Shim layer |
-|[#816](https://github.com/NVIDIA/spark-rapids/pull/816)|[BUG] fix nightly is timing out|
-|[#782](https://github.com/NVIDIA/spark-rapids/pull/782)|Benchmark utility to perform diff of output from benchmark runs, allowing for precision differences|
-|[#813](https://github.com/NVIDIA/spark-rapids/pull/813)|Revert "Enable tests in udf_cudf_test.py"|
-|[#788](https://github.com/NVIDIA/spark-rapids/pull/788)|[FEA] Persist workspace data on PVC for premerge|
-|[#805](https://github.com/NVIDIA/spark-rapids/pull/805)|[FEA] nightly build trigger both IT on spark 300 and 301|
-|[#797](https://github.com/NVIDIA/spark-rapids/pull/797)|Allow host spill store to fit a buffer larger than configured max size|
-|[#807](https://github.com/NVIDIA/spark-rapids/pull/807)|Deploy integration-tests javadoc and sources|
-|[#777](https://github.com/NVIDIA/spark-rapids/pull/777)|Enable tests in udf_cudf_test.py|
-|[#790](https://github.com/NVIDIA/spark-rapids/pull/790)|CI: Update cudf python to 0.16 nightly|
-|[#772](https://github.com/NVIDIA/spark-rapids/pull/772)|Add support for empty array construction.|
-|[#783](https://github.com/NVIDIA/spark-rapids/pull/783)|Improved GpuArrowEvalPythonExec|
-|[#771](https://github.com/NVIDIA/spark-rapids/pull/771)|Various improvements to benchmarks|
-|[#763](https://github.com/NVIDIA/spark-rapids/pull/763)|[REVIEW] Allow CoalesceBatch to spill data that is not in active use|
-|[#727](https://github.com/NVIDIA/spark-rapids/pull/727)|Update cudf dependency to 0.16-SNAPSHOT|
-|[#726](https://github.com/NVIDIA/spark-rapids/pull/726)|parquet writer support for TIMESTAMP_MILLIS|
-|[#674](https://github.com/NVIDIA/spark-rapids/pull/674)|Unit test for GPU exchange re-use with AQE|
-|[#723](https://github.com/NVIDIA/spark-rapids/pull/723)|Update code coverage to find source files in new places|
-|[#766](https://github.com/NVIDIA/spark-rapids/pull/766)|Update the integration Dockerfile to reduce the image size|
-|[#762](https://github.com/NVIDIA/spark-rapids/pull/762)|Fixing conflicts in branch-0.3|
-|[#738](https://github.com/NVIDIA/spark-rapids/pull/738)|[auto-merge] branch-0.2 to branch-0.3 - resolve conflict|
-|[#722](https://github.com/NVIDIA/spark-rapids/pull/722)|Initial code changes to support spilling outside of shuffle|
-|[#693](https://github.com/NVIDIA/spark-rapids/pull/693)|Update jenkins files for 0.3|
-|[#692](https://github.com/NVIDIA/spark-rapids/pull/692)|Merge shims dependency to spark-3.0.1 into branch-0.3|
-|[#690](https://github.com/NVIDIA/spark-rapids/pull/690)|Update the version to 0.3.0-SNAPSHOT|
-
-## Release 0.2
-
-### Features
-|||
-|:---|:---|
-|[#696](https://github.com/NVIDIA/spark-rapids/issues/696)|[FEA] run integration tests against SPARK-3.0.1|
-|[#455](https://github.com/NVIDIA/spark-rapids/issues/455)|[FEA] Support UCX shuffle with optimized AQE|
-|[#510](https://github.com/NVIDIA/spark-rapids/issues/510)|[FEA] Investigate libcudf features needed to support struct schema pruning during loads|
-|[#541](https://github.com/NVIDIA/spark-rapids/issues/541)|[FEA] Scala UDF:Support for null Value operands|
-|[#542](https://github.com/NVIDIA/spark-rapids/issues/542)|[FEA] Scala UDF: Support for Date and Time |
-|[#499](https://github.com/NVIDIA/spark-rapids/issues/499)|[FEA] disable any kind of warnings about ExecutedCommandExec not being on the GPU|
-|[#540](https://github.com/NVIDIA/spark-rapids/issues/540)|[FEA] Scala UDF: Support for String replaceFirst()|
-|[#340](https://github.com/NVIDIA/spark-rapids/issues/340)|[FEA] widen the rendered Jekyll pages|
-|[#602](https://github.com/NVIDIA/spark-rapids/issues/602)|[FEA] don't release with any -SNAPSHOT dependencies|
-|[#579](https://github.com/NVIDIA/spark-rapids/issues/579)|[FEA] Auto-merge between branches|
-|[#515](https://github.com/NVIDIA/spark-rapids/issues/515)|[FEA] Write tests for AQE skewed join optimization|
-|[#452](https://github.com/NVIDIA/spark-rapids/issues/452)|[FEA] Update HashSortOptimizerSuite to work with AQE|
-|[#454](https://github.com/NVIDIA/spark-rapids/issues/454)|[FEA] Update GpuCoalesceBatchesSuite to work with AQE enabled|
-|[#354](https://github.com/NVIDIA/spark-rapids/issues/354)|[FEA]Spark 3.1 FileSourceScanExec adds parameter optionalNumCoalescedBuckets|
-|[#566](https://github.com/NVIDIA/spark-rapids/issues/566)|[FEA] Add support for StringSplit with an array index.|
-|[#524](https://github.com/NVIDIA/spark-rapids/issues/524)|[FEA] Add GPU specific metrics to GpuFileSourceScanExec|
-|[#494](https://github.com/NVIDIA/spark-rapids/issues/494)|[FEA] Add some AQE-specific tests to the PySpark test suite|
-|[#146](https://github.com/NVIDIA/spark-rapids/issues/146)|[FEA] Python tests should support running with Adaptive Query Execution enabled|
-|[#465](https://github.com/NVIDIA/spark-rapids/issues/465)|[FEA] Audit: Update script to audit multiple versions of Spark |
-|[#488](https://github.com/NVIDIA/spark-rapids/issues/488)|[FEA] Ability to limit total GPU memory used|
-|[#70](https://github.com/NVIDIA/spark-rapids/issues/70)|[FEA] Support StringSplit|
-|[#403](https://github.com/NVIDIA/spark-rapids/issues/403)|[FEA] Add in support for GetArrayItem|
-|[#493](https://github.com/NVIDIA/spark-rapids/issues/493)|[FEA] Implement shuffle optimization when AQE is enabled|
-|[#500](https://github.com/NVIDIA/spark-rapids/issues/500)|[FEA] Add maven profiles for testing with AQE on or off|
-|[#471](https://github.com/NVIDIA/spark-rapids/issues/471)|[FEA] create a formal process for updating the github-pages branch|
-|[#233](https://github.com/NVIDIA/spark-rapids/issues/233)|[FEA] Audit DataWritingCommandExec |
-|[#240](https://github.com/NVIDIA/spark-rapids/issues/240)|[FEA] Audit Api validation script follow on - Optimize StringToTypeTag |
-|[#388](https://github.com/NVIDIA/spark-rapids/issues/388)|[FEA] Audit WindowExec|
-|[#425](https://github.com/NVIDIA/spark-rapids/issues/425)|[FEA] Add tests for configs in BatchScan Readers|
-|[#453](https://github.com/NVIDIA/spark-rapids/issues/453)|[FEA] Update HashAggregatesSuite to work with AQE|
-|[#184](https://github.com/NVIDIA/spark-rapids/issues/184)|[FEA] Enable NoScalaDoc scalastyle rule|
-|[#438](https://github.com/NVIDIA/spark-rapids/issues/438)|[FEA] Enable StringLPad|
-|[#232](https://github.com/NVIDIA/spark-rapids/issues/232)|[FEA] Audit SortExec |
-|[#236](https://github.com/NVIDIA/spark-rapids/issues/236)|[FEA] Audit ShuffleExchangeExec |
-|[#355](https://github.com/NVIDIA/spark-rapids/issues/355)|[FEA] Support Multiple Spark versions in the same jar|
-|[#385](https://github.com/NVIDIA/spark-rapids/issues/385)|[FEA] Support RangeExec on the GPU|
-|[#317](https://github.com/NVIDIA/spark-rapids/issues/317)|[FEA] Write test wrapper to run SQL queries via pyspark|
-|[#235](https://github.com/NVIDIA/spark-rapids/issues/235)|[FEA] Audit BroadcastExchangeExec|
-|[#234](https://github.com/NVIDIA/spark-rapids/issues/234)|[FEA] Audit BatchScanExec|
-|[#238](https://github.com/NVIDIA/spark-rapids/issues/238)|[FEA] Audit ShuffledHashJoinExec |
-|[#237](https://github.com/NVIDIA/spark-rapids/issues/237)|[FEA] Audit BroadcastHashJoinExec |
-|[#316](https://github.com/NVIDIA/spark-rapids/issues/316)|[FEA] Add some basic Dataframe tests for CoalesceExec|
-|[#145](https://github.com/NVIDIA/spark-rapids/issues/145)|[FEA] Scala tests should support running with Adaptive Query Execution enabled|
-|[#231](https://github.com/NVIDIA/spark-rapids/issues/231)|[FEA] Audit ProjectExec |
-|[#229](https://github.com/NVIDIA/spark-rapids/issues/229)|[FEA] Audit FileSourceScanExec |
-
-### Performance
-|||
-|:---|:---|
-|[#326](https://github.com/NVIDIA/spark-rapids/issues/326)|[DISCUSS] Shuffle read-side error handling|
-|[#601](https://github.com/NVIDIA/spark-rapids/issues/601)|[FEA] Optimize unnecessary sorts when replacing SortAggregate|
-|[#333](https://github.com/NVIDIA/spark-rapids/issues/333)|[FEA] Better handling of reading lots of small Parquet files|
-|[#511](https://github.com/NVIDIA/spark-rapids/issues/511)|[FEA] Connect shuffle table compression to shuffle exec metrics|
-|[#15](https://github.com/NVIDIA/spark-rapids/issues/15)|[FEA] Multiple threads sharing the same GPU|
-|[#272](https://github.com/NVIDIA/spark-rapids/issues/272)|[DOC] Getting started guide for UCX shuffle|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#780](https://github.com/NVIDIA/spark-rapids/issues/780)|[BUG] Inner Join dropping data with bucketed Table input|
-|[#569](https://github.com/NVIDIA/spark-rapids/issues/569)|[BUG] left_semi_join operation is abnormal and seriously time-consuming|
-|[#744](https://github.com/NVIDIA/spark-rapids/issues/744)|[BUG] TPC-DS query 6 now produces incorrect results.|
-|[#718](https://github.com/NVIDIA/spark-rapids/issues/718)|[BUG] GpuBroadcastHashJoinExec ArrayIndexOutOfBoundsException|
-|[#698](https://github.com/NVIDIA/spark-rapids/issues/698)|[BUG] batch coalesce can fail to appear between columnar shuffle and subsequent columnar operation|
-|[#658](https://github.com/NVIDIA/spark-rapids/issues/658)|[BUG] GpuCoalesceBatches collectTime metric can be underreported|
-|[#59](https://github.com/NVIDIA/spark-rapids/issues/59)|[BUG] enable tests for string literals in a select|
-|[#486](https://github.com/NVIDIA/spark-rapids/issues/486)|[BUG] GpuWindowExec does not implement requiredChildOrdering|
-|[#631](https://github.com/NVIDIA/spark-rapids/issues/631)|[BUG] Rows are dropped when AQE is enabled in some cases|
-|[#671](https://github.com/NVIDIA/spark-rapids/issues/671)|[BUG] Databricks hash_aggregate_test fails trying to canonicalize a WrappedAggFunction|
-|[#218](https://github.com/NVIDIA/spark-rapids/issues/218)|[BUG] Window function COUNT(x) includes null-values, when it shouldn't|
-|[#153](https://github.com/NVIDIA/spark-rapids/issues/153)|[BUG] Incorrect output from partial-only hash aggregates with multiple distincts and non-distinct functions|
-|[#656](https://github.com/NVIDIA/spark-rapids/issues/656)|[BUG] integration tests produce hive metadata files|
-|[#607](https://github.com/NVIDIA/spark-rapids/issues/607)|[BUG] Fix misleading "cannot run on GPU" warnings when AQE is enabled|
-|[#630](https://github.com/NVIDIA/spark-rapids/issues/630)|[BUG] GpuCustomShuffleReader metrics always show zero rows/batches output|
-|[#643](https://github.com/NVIDIA/spark-rapids/issues/643)|[BUG] race condition while registering a buffer and spilling at the same time|
-|[#606](https://github.com/NVIDIA/spark-rapids/issues/606)|[BUG] Multiple scans for same data source with TPC-DS query59 with delta format|
-|[#626](https://github.com/NVIDIA/spark-rapids/issues/626)|[BUG] parquet_test showing leaked memory buffer|
-|[#155](https://github.com/NVIDIA/spark-rapids/issues/155)|[BUG] Incorrect output from averages with filters in partial only mode|
-|[#277](https://github.com/NVIDIA/spark-rapids/issues/277)|[BUG] HashAggregateSuite failure when AQE is enabled|
-|[#276](https://github.com/NVIDIA/spark-rapids/issues/276)|[BUG] GpuCoalesceBatchSuite failure when AQE is enabled|
-|[#598](https://github.com/NVIDIA/spark-rapids/issues/598)|[BUG] Non-deterministic output from MapOutputTracker.getStatistics() with AQE on GPU|
-|[#192](https://github.com/NVIDIA/spark-rapids/issues/192)|[BUG] test_read_merge_schema fails on Databricks|
-|[#341](https://github.com/NVIDIA/spark-rapids/issues/341)|[BUG] Document compression formats for readers/writers|
-|[#587](https://github.com/NVIDIA/spark-rapids/issues/587)|[BUG] Spark3.1 changed FileScan which means our GpuScans need to be added to shim layer|
-|[#362](https://github.com/NVIDIA/spark-rapids/issues/362)|[BUG] Implement getReaderForRange in the RapidsShuffleManager|
-|[#528](https://github.com/NVIDIA/spark-rapids/issues/528)|[BUG] HashAggregateSuite "Avg Distinct with filter" no longer valid when testing against Spark 3.1.0|
-|[#416](https://github.com/NVIDIA/spark-rapids/issues/416)|[BUG] Fix Spark 3.1.0 integration tests|
-|[#556](https://github.com/NVIDIA/spark-rapids/issues/556)|[BUG] NPE when removing shuffle|
-|[#553](https://github.com/NVIDIA/spark-rapids/issues/553)|[BUG] GpuColumnVector build warnings from raw type access|
-|[#492](https://github.com/NVIDIA/spark-rapids/issues/492)|[BUG] Re-enable AQE integration tests|
-|[#275](https://github.com/NVIDIA/spark-rapids/issues/275)|[BUG] TpchLike query 2 fails when AQE is enabled|
-|[#508](https://github.com/NVIDIA/spark-rapids/issues/508)|[BUG] GpuUnion publishes metrics on the UI that are all 0|
-|[#269](https://github.com/NVIDIA/spark-rapids/issues/269)|Needed to add `--conf spark.driver.extraClassPath=` |
-|[#473](https://github.com/NVIDIA/spark-rapids/issues/473)|[BUG] PartMerge:countDistinct:sum fails sporadically|
-|[#531](https://github.com/NVIDIA/spark-rapids/issues/531)|[BUG] Temporary RMM workaround needs to be removed|
-|[#532](https://github.com/NVIDIA/spark-rapids/issues/532)|[BUG] NPE when enabling shuffle manager|
-|[#525](https://github.com/NVIDIA/spark-rapids/issues/525)|[BUG] GpuFilterExec reports incorrect nullability of output in some cases|
-|[#483](https://github.com/NVIDIA/spark-rapids/issues/483)|[BUG] Multiple scans for the same parquet data source|
-|[#382](https://github.com/NVIDIA/spark-rapids/issues/382)|[BUG] Spark3.1 StringFallbackSuite regexp_replace null cpu fall back test fails.|
-|[#489](https://github.com/NVIDIA/spark-rapids/issues/489)|[FEA] Fix Spark 3.1 GpuHashJoin since it now requires CodegenSupport|
-|[#441](https://github.com/NVIDIA/spark-rapids/issues/441)|[BUG] test_broadcast_nested_loop_join_special_case fails on databricks|
-|[#347](https://github.com/NVIDIA/spark-rapids/issues/347)|[BUG] Failed to read Parquet file generated by GPU-enabled Spark.|
-|[#433](https://github.com/NVIDIA/spark-rapids/issues/433)|`InSet` operator produces an error for Strings|
-|[#144](https://github.com/NVIDIA/spark-rapids/issues/144)|[BUG] spark.sql.legacy.parquet.datetimeRebaseModeInWrite is ignored|
-|[#323](https://github.com/NVIDIA/spark-rapids/issues/323)|[BUG] GpuBroadcastNestedLoopJoinExec can fail if there are no columns|
-|[#356](https://github.com/NVIDIA/spark-rapids/issues/356)|[BUG] Integration cache test for BroadcastNestedLoopJoin failure|
-|[#280](https://github.com/NVIDIA/spark-rapids/issues/280)|[BUG] Full Outer Join does not work on nullable keys|
-|[#149](https://github.com/NVIDIA/spark-rapids/issues/149)|[BUG] Spark driver fails to load native libs when running on node without CUDA|
-
-### PRs
-|||
-|:---|:---|
-|[#826](https://github.com/NVIDIA/spark-rapids/pull/826)|Fix link to cudf-0.15-cuda11.jar|
-|[#815](https://github.com/NVIDIA/spark-rapids/pull/815)|Update documentation for Scala UDFs in 0.2 since you need two things|
-|[#802](https://github.com/NVIDIA/spark-rapids/pull/802)|Update 0.2 CHANGELOG|
-|[#793](https://github.com/NVIDIA/spark-rapids/pull/793)|Update Jenkins scripts for release|
-|[#798](https://github.com/NVIDIA/spark-rapids/pull/798)|Fix shims provider override config not being seen by executors|
-|[#785](https://github.com/NVIDIA/spark-rapids/pull/785)|Make shuffle run on CPU if we do a join where we read from bucketed table|
-|[#765](https://github.com/NVIDIA/spark-rapids/pull/765)|Add config to override shims provider class|
-|[#759](https://github.com/NVIDIA/spark-rapids/pull/759)|Add CHANGELOG for release 0.2|
-|[#758](https://github.com/NVIDIA/spark-rapids/pull/758)|Skip the udf test that fails periodically.|
-|[#752](https://github.com/NVIDIA/spark-rapids/pull/752)|Fix snapshot plugin jar version in docs|
-|[#751](https://github.com/NVIDIA/spark-rapids/pull/751)|Correct the channel for cudf installation|
-|[#754](https://github.com/NVIDIA/spark-rapids/pull/754)|Filter nulls from joins where possible to improve performance|
-|[#732](https://github.com/NVIDIA/spark-rapids/pull/732)|Add a timeout for RapidsShuffleIterator to prevent jobs to hang infin…|
-|[#637](https://github.com/NVIDIA/spark-rapids/pull/637)|Documentation changes for 0.2 release |
-|[#747](https://github.com/NVIDIA/spark-rapids/pull/747)|Disable udf tests that fail periodically|
-|[#745](https://github.com/NVIDIA/spark-rapids/pull/745)|Revert Null Join Filter|
-|[#741](https://github.com/NVIDIA/spark-rapids/pull/741)|Fix issue with parquet partitioned reads|
-|[#733](https://github.com/NVIDIA/spark-rapids/pull/733)|Remove GPU Types from github|
-|[#720](https://github.com/NVIDIA/spark-rapids/pull/720)|Stop removing GpuCoalesceBatches from non-AQE queries when AQE is enabled|
-|[#729](https://github.com/NVIDIA/spark-rapids/pull/729)|Fix collect time metric in CoalesceBatches|
-|[#640](https://github.com/NVIDIA/spark-rapids/pull/640)|Support running Pandas UDFs on GPUs in Python processes.|
-|[#721](https://github.com/NVIDIA/spark-rapids/pull/721)|Add some more checks to databricks build scripts|
-|[#714](https://github.com/NVIDIA/spark-rapids/pull/714)|Move spark 3.0.1-shims out of snapshot-shims|
-|[#711](https://github.com/NVIDIA/spark-rapids/pull/711)|fix blossom checkout repo|
-|[#709](https://github.com/NVIDIA/spark-rapids/pull/709)|[BUG] fix unexpected indentation issue in blossom yml|
-|[#642](https://github.com/NVIDIA/spark-rapids/pull/642)|Init workflow for blossom-ci|
-|[#705](https://github.com/NVIDIA/spark-rapids/pull/705)|Enable configuration check for cast string to timestamp|
-|[#702](https://github.com/NVIDIA/spark-rapids/pull/702)|Update slack channel for Jenkins builds|
-|[#701](https://github.com/NVIDIA/spark-rapids/pull/701)|fix checkout-ref for automerge|
-|[#695](https://github.com/NVIDIA/spark-rapids/pull/695)|Fix spark-3.0.1 shim to be released|
-|[#668](https://github.com/NVIDIA/spark-rapids/pull/668)|refactor automerge to support merge for protected branch|
-|[#687](https://github.com/NVIDIA/spark-rapids/pull/687)|Include the UDF compiler in the dist jar|
-|[#689](https://github.com/NVIDIA/spark-rapids/pull/689)|Change shims dependency to spark-3.0.1|
-|[#677](https://github.com/NVIDIA/spark-rapids/pull/677)|Use multi-threaded parquet read with small files|
-|[#638](https://github.com/NVIDIA/spark-rapids/pull/638)|Add Parquet-based cache serializer|
-|[#613](https://github.com/NVIDIA/spark-rapids/pull/613)|Enable UCX + AQE|
-|[#684](https://github.com/NVIDIA/spark-rapids/pull/684)|Enable test for literal string values in a select|
-|[#686](https://github.com/NVIDIA/spark-rapids/pull/686)|Remove sorts when replacing sort aggregate if possible|
-|[#675](https://github.com/NVIDIA/spark-rapids/pull/675)|Added TimeAdd|
-|[#645](https://github.com/NVIDIA/spark-rapids/pull/645)|[window] Add GpuWindowExec requiredChildOrdering|
-|[#676](https://github.com/NVIDIA/spark-rapids/pull/676)|fixUpJoinConsistency rule now works when AQE is enabled|
-|[#683](https://github.com/NVIDIA/spark-rapids/pull/683)|Fix issues with canonicalization of WrappedAggFunction|
-|[#682](https://github.com/NVIDIA/spark-rapids/pull/682)|Fix path to start-slave.sh script in docs|
-|[#673](https://github.com/NVIDIA/spark-rapids/pull/673)|Increase build timeouts on nightly and premerge builds|
-|[#648](https://github.com/NVIDIA/spark-rapids/pull/648)|add signoff-check use github actions|
-|[#593](https://github.com/NVIDIA/spark-rapids/pull/593)|Add support for isNaN and datetime related instructions in UDF compiler|
-|[#666](https://github.com/NVIDIA/spark-rapids/pull/666)|[window] Disable GPU for COUNT(exp) queries|
-|[#655](https://github.com/NVIDIA/spark-rapids/pull/655)|Implement AQE unit test for InsertAdaptiveSparkPlan|
-|[#614](https://github.com/NVIDIA/spark-rapids/pull/614)|Fix for aggregation with multiple distinct and non distinct functions|
-|[#657](https://github.com/NVIDIA/spark-rapids/pull/657)|Fix verify build after integration tests are run|
-|[#660](https://github.com/NVIDIA/spark-rapids/pull/660)|Add in neverReplaceExec and several rules for it|
-|[#639](https://github.com/NVIDIA/spark-rapids/pull/639)|BooleanType test shouldn't xfail|
-|[#652](https://github.com/NVIDIA/spark-rapids/pull/652)|Mark UVM config as internal until supported|
-|[#653](https://github.com/NVIDIA/spark-rapids/pull/653)|Move to the cudf-0.15 release|
-|[#647](https://github.com/NVIDIA/spark-rapids/pull/647)|Improve warnings about AQE nodes not supported on GPU|
-|[#646](https://github.com/NVIDIA/spark-rapids/pull/646)|Stop reporting zero metrics for GpuCustomShuffleReader|
-|[#644](https://github.com/NVIDIA/spark-rapids/pull/644)|Small fix for race in catalog where a buffer could get spilled while …|
-|[#623](https://github.com/NVIDIA/spark-rapids/pull/623)|Fix issues with canonicalization|
-|[#599](https://github.com/NVIDIA/spark-rapids/pull/599)|[FEA] changelog generator|
-|[#563](https://github.com/NVIDIA/spark-rapids/pull/563)|cudf and spark version info in artifacts|
-|[#633](https://github.com/NVIDIA/spark-rapids/pull/633)|Fix leak if RebaseHelper throws during Parquet read|
-|[#632](https://github.com/NVIDIA/spark-rapids/pull/632)|Copy function isSearchableType from Spark because signature changed in 3.0.1|
-|[#583](https://github.com/NVIDIA/spark-rapids/pull/583)|Add udf compiler unit tests|
-|[#617](https://github.com/NVIDIA/spark-rapids/pull/617)|Documentation updates for branch 0.2|
-|[#616](https://github.com/NVIDIA/spark-rapids/pull/616)|Add config to reserve GPU memory|
-|[#612](https://github.com/NVIDIA/spark-rapids/pull/612)|[REVIEW] Fix incorrect output from averages with filters in partial only mode|
-|[#609](https://github.com/NVIDIA/spark-rapids/pull/609)|fix minor issues with instructions for building ucx|
-|[#611](https://github.com/NVIDIA/spark-rapids/pull/611)|Added in profile to enable shims for SNAPSHOT releases|
-|[#595](https://github.com/NVIDIA/spark-rapids/pull/595)|Parquet small file reading optimization|
-|[#582](https://github.com/NVIDIA/spark-rapids/pull/582)|fix #579 Auto-merge between branches|
-|[#536](https://github.com/NVIDIA/spark-rapids/pull/536)|Add test for skewed join optimization when AQE is enabled|
-|[#603](https://github.com/NVIDIA/spark-rapids/pull/603)|Fix data size metric always 0 when using RAPIDS shuffle|
-|[#600](https://github.com/NVIDIA/spark-rapids/pull/600)|Fix calculation of string data for compressed batches|
-|[#597](https://github.com/NVIDIA/spark-rapids/pull/597)|Remove the xfail for parquet test_read_merge_schema on Databricks|
-|[#591](https://github.com/NVIDIA/spark-rapids/pull/591)|Add ucx license in NOTICE-binary|
-|[#596](https://github.com/NVIDIA/spark-rapids/pull/596)|Add Spark 3.0.2 to Shim layer|
-|[#594](https://github.com/NVIDIA/spark-rapids/pull/594)|Filter nulls from joins where possible to improve performance.|
-|[#590](https://github.com/NVIDIA/spark-rapids/pull/590)|Move GpuParquetScan/GpuOrcScan into Shim|
-|[#588](https://github.com/NVIDIA/spark-rapids/pull/588)|xfail the tpch spark 3.1.0 tests that fail|
-|[#572](https://github.com/NVIDIA/spark-rapids/pull/572)|Update buffer store to return compressed batches directly, add compression NVTX ranges|
-|[#558](https://github.com/NVIDIA/spark-rapids/pull/558)|Fix unit tests when AQE is enabled|
-|[#580](https://github.com/NVIDIA/spark-rapids/pull/580)|xfail the Spark 3.1.0 integration tests that fail |
-|[#565](https://github.com/NVIDIA/spark-rapids/pull/565)|Minor improvements to TPC-DS benchmarking code|
-|[#567](https://github.com/NVIDIA/spark-rapids/pull/567)|Explicitly disable AQE in one test|
-|[#571](https://github.com/NVIDIA/spark-rapids/pull/571)|Fix Databricks shim layer for GpuFileSourceScanExec and GpuBroadcastExchangeExec|
-|[#564](https://github.com/NVIDIA/spark-rapids/pull/564)|Add GPU decode time metric to scans|
-|[#562](https://github.com/NVIDIA/spark-rapids/pull/562)|getCatalog can be called from the driver, and can return null|
-|[#555](https://github.com/NVIDIA/spark-rapids/pull/555)|Fix build warnings for ColumnViewAccess|
-|[#560](https://github.com/NVIDIA/spark-rapids/pull/560)|Fix databricks build for AQE support|
-|[#557](https://github.com/NVIDIA/spark-rapids/pull/557)|Fix tests failing on Spark 3.1|
-|[#547](https://github.com/NVIDIA/spark-rapids/pull/547)|Add GPU metrics to GpuFileSourceScanExec|
-|[#462](https://github.com/NVIDIA/spark-rapids/pull/462)|Implement optimized AQE support so that exchanges run on GPU where possible|
-|[#550](https://github.com/NVIDIA/spark-rapids/pull/550)|Document Parquet and ORC compression support|
-|[#539](https://github.com/NVIDIA/spark-rapids/pull/539)|Update script to audit multiple Spark versions|
-|[#543](https://github.com/NVIDIA/spark-rapids/pull/543)|Add metrics to GpuUnion operator|
-|[#549](https://github.com/NVIDIA/spark-rapids/pull/549)|Move spark shim properties to top level pom|
-|[#497](https://github.com/NVIDIA/spark-rapids/pull/497)|Add UDF compiler implementations|
-|[#487](https://github.com/NVIDIA/spark-rapids/pull/487)|Add framework for batch compression of shuffle partitions|
-|[#544](https://github.com/NVIDIA/spark-rapids/pull/544)|Add in driverExtraClassPath for standalone mode docs|
-|[#546](https://github.com/NVIDIA/spark-rapids/pull/546)|Fix Spark 3.1.0 shim build error in GpuHashJoin|
-|[#537](https://github.com/NVIDIA/spark-rapids/pull/537)|Use fresh SparkSession when capturing to avoid late capture of previous query|
-|[#538](https://github.com/NVIDIA/spark-rapids/pull/538)|Revert "Temporary workaround for RMM initial pool size bug (#530)"|
-|[#517](https://github.com/NVIDIA/spark-rapids/pull/517)|Add config to limit maximum RMM pool size|
-|[#527](https://github.com/NVIDIA/spark-rapids/pull/527)|Add support for split and getArrayIndex|
-|[#534](https://github.com/NVIDIA/spark-rapids/pull/534)|Fixes bugs around GpuShuffleEnv initialization|
-|[#529](https://github.com/NVIDIA/spark-rapids/pull/529)|[BUG] Degenerate table metas were not getting copied to the heap|
-|[#530](https://github.com/NVIDIA/spark-rapids/pull/530)|Temporary workaround for RMM initial pool size bug|
-|[#526](https://github.com/NVIDIA/spark-rapids/pull/526)|Fix bug with nullability reporting in GpuFilterExec|
-|[#521](https://github.com/NVIDIA/spark-rapids/pull/521)|Fix typo with databricks shim classname SparkShimServiceProvider|
-|[#522](https://github.com/NVIDIA/spark-rapids/pull/522)|Use SQLConf instead of SparkConf when looking up SQL configs|
-|[#518](https://github.com/NVIDIA/spark-rapids/pull/518)|Fix init order issue in GpuShuffleEnv when RAPIDS shuffle configured|
-|[#514](https://github.com/NVIDIA/spark-rapids/pull/514)|Added clarification of RegExpReplace, DateDiff, made descriptive text consistent|
-|[#506](https://github.com/NVIDIA/spark-rapids/pull/506)|Add in basic support for running tpcds like queries|
-|[#504](https://github.com/NVIDIA/spark-rapids/pull/504)|Add ability to ignore tests depending on spark shim version|
-|[#503](https://github.com/NVIDIA/spark-rapids/pull/503)|Remove unused async buffer spill support|
-|[#501](https://github.com/NVIDIA/spark-rapids/pull/501)|disable codegen in 3.1 shim for hash join|
-|[#466](https://github.com/NVIDIA/spark-rapids/pull/466)|Optimize and fix Api validation script|
-|[#481](https://github.com/NVIDIA/spark-rapids/pull/481)|Codeowners|
-|[#439](https://github.com/NVIDIA/spark-rapids/pull/439)|Check a PR has been committed using git signoff|
-|[#319](https://github.com/NVIDIA/spark-rapids/pull/319)|Update partitioning logic in ShuffledBatchRDD|
-|[#491](https://github.com/NVIDIA/spark-rapids/pull/491)|Temporarily ignore AQE integration tests|
-|[#490](https://github.com/NVIDIA/spark-rapids/pull/490)|Fix Spark 3.1.0 build for HashJoin changes|
-|[#482](https://github.com/NVIDIA/spark-rapids/pull/482)|Prevent bad practice in python tests|
-|[#485](https://github.com/NVIDIA/spark-rapids/pull/485)|Show plan in assertion message if test fails|
-|[#480](https://github.com/NVIDIA/spark-rapids/pull/480)|Fix link from README to getting-started.md|
-|[#448](https://github.com/NVIDIA/spark-rapids/pull/448)|Preliminary support for keeping broadcast exchanges on GPU when AQE is enabled|
-|[#478](https://github.com/NVIDIA/spark-rapids/pull/478)|Fall back to CPU for binary as string in parquet|
-|[#477](https://github.com/NVIDIA/spark-rapids/pull/477)|Fix special case joins in broadcast nested loop join|
-|[#469](https://github.com/NVIDIA/spark-rapids/pull/469)|Update HashAggregateSuite to work with AQE|
-|[#475](https://github.com/NVIDIA/spark-rapids/pull/475)|Udf compiler pom followup|
-|[#434](https://github.com/NVIDIA/spark-rapids/pull/434)|Add UDF compiler skeleton|
-|[#474](https://github.com/NVIDIA/spark-rapids/pull/474)|Re-enable noscaladoc check|
-|[#461](https://github.com/NVIDIA/spark-rapids/pull/461)|Fix comments style to pass scala style check|
-|[#468](https://github.com/NVIDIA/spark-rapids/pull/468)|fix broken link|
-|[#456](https://github.com/NVIDIA/spark-rapids/pull/456)|Add closeOnExcept to clean up code that closes resources only on exceptions|
-|[#464](https://github.com/NVIDIA/spark-rapids/pull/464)|Turn off noscaladoc rule until codebase is fixed|
-|[#449](https://github.com/NVIDIA/spark-rapids/pull/449)|Enforce NoScalaDoc rule in scalastyle checks|
-|[#450](https://github.com/NVIDIA/spark-rapids/pull/450)|Enable scalastyle for shuffle plugin|
-|[#451](https://github.com/NVIDIA/spark-rapids/pull/451)|Databricks remove unneeded files and fix build to not fail on rm when file missing|
-|[#442](https://github.com/NVIDIA/spark-rapids/pull/442)|Shim layer support for Spark 3.0.0 Databricks|
-|[#447](https://github.com/NVIDIA/spark-rapids/pull/447)|Add scalastyle plugin to shim module|
-|[#426](https://github.com/NVIDIA/spark-rapids/pull/426)|Update BufferMeta to support multiple codec buffers per table|
-|[#440](https://github.com/NVIDIA/spark-rapids/pull/440)|Run mortgage test both with AQE on and off|
-|[#445](https://github.com/NVIDIA/spark-rapids/pull/445)|Added in StringRPad and StringLPad|
-|[#422](https://github.com/NVIDIA/spark-rapids/pull/422)|Documentation updates|
-|[#437](https://github.com/NVIDIA/spark-rapids/pull/437)|Fix bug with InSet and Strings|
-|[#435](https://github.com/NVIDIA/spark-rapids/pull/435)|Add in checks for Parquet LEGACY date/time rebase|
-|[#432](https://github.com/NVIDIA/spark-rapids/pull/432)|Fix batch use-after-close in partitioning, shuffle env init|
-|[#423](https://github.com/NVIDIA/spark-rapids/pull/423)|Fix duplicates includes in assembly jar|
-|[#418](https://github.com/NVIDIA/spark-rapids/pull/418)|CI Add unit tests running for Spark 3.0.1|
-|[#421](https://github.com/NVIDIA/spark-rapids/pull/421)|Make it easier to run TPCxBB benchmarks from spark shell|
-|[#413](https://github.com/NVIDIA/spark-rapids/pull/413)|Fix download link|
-|[#414](https://github.com/NVIDIA/spark-rapids/pull/414)|Shim Layer to support multiple Spark versions |
-|[#406](https://github.com/NVIDIA/spark-rapids/pull/406)|Update cast handling to deal with new libcudf casting limitations|
-|[#405](https://github.com/NVIDIA/spark-rapids/pull/405)|Change slave->worker|
-|[#395](https://github.com/NVIDIA/spark-rapids/pull/395)|Databricks doc updates|
-|[#401](https://github.com/NVIDIA/spark-rapids/pull/401)|Extended the FAQ|
-|[#398](https://github.com/NVIDIA/spark-rapids/pull/398)|Add tests for GpuPartition|
-|[#352](https://github.com/NVIDIA/spark-rapids/pull/352)|Change spark tgz package name|
-|[#397](https://github.com/NVIDIA/spark-rapids/pull/397)|Fix small bug in ShuffleBufferCatalog.hasActiveShuffle|
-|[#286](https://github.com/NVIDIA/spark-rapids/pull/286)|[REVIEW] Updated join tests for cache|
-|[#393](https://github.com/NVIDIA/spark-rapids/pull/393)|Contributor license agreement|
-|[#389](https://github.com/NVIDIA/spark-rapids/pull/389)|Added in support for RangeExec|
-|[#390](https://github.com/NVIDIA/spark-rapids/pull/390)|Ucx getting started|
-|[#391](https://github.com/NVIDIA/spark-rapids/pull/391)|Hide slack channel in Jenkins scripts|
-|[#387](https://github.com/NVIDIA/spark-rapids/pull/387)|Remove the term whitelist|
-|[#365](https://github.com/NVIDIA/spark-rapids/pull/365)|[REVIEW] Timesub tests|
-|[#383](https://github.com/NVIDIA/spark-rapids/pull/383)|Test utility to compare SQL query results between CPU and GPU|
-|[#380](https://github.com/NVIDIA/spark-rapids/pull/380)|Fix databricks notebook link|
-|[#378](https://github.com/NVIDIA/spark-rapids/pull/378)|Added in FAQ and fixed spelling|
-|[#377](https://github.com/NVIDIA/spark-rapids/pull/377)|Update heading in configs.md|
-|[#373](https://github.com/NVIDIA/spark-rapids/pull/373)|Modifying branch name to conform with rapidsai branch name change|
-|[#376](https://github.com/NVIDIA/spark-rapids/pull/376)|Add our session extension correctly if there are other extensions configured|
-|[#374](https://github.com/NVIDIA/spark-rapids/pull/374)|Fix rat issue for notebooks|
-|[#364](https://github.com/NVIDIA/spark-rapids/pull/364)|Update Databricks patch for changes to GpuSortMergeJoin|
-|[#371](https://github.com/NVIDIA/spark-rapids/pull/371)|fix typo and use regional bucket per GCP's update|
-|[#359](https://github.com/NVIDIA/spark-rapids/pull/359)|Karthik changes|
-|[#353](https://github.com/NVIDIA/spark-rapids/pull/353)|Fix broadcast nested loop join for the no column case|
-|[#313](https://github.com/NVIDIA/spark-rapids/pull/313)|Additional tests for broadcast hash join|
-|[#342](https://github.com/NVIDIA/spark-rapids/pull/342)|Implement build-side rules for shuffle hash join|
-|[#349](https://github.com/NVIDIA/spark-rapids/pull/349)|Updated join code to treat null equality properly|
-|[#335](https://github.com/NVIDIA/spark-rapids/pull/335)|Integration tests on spark 3.0.1-SNAPSHOT & 3.1.0-SNAPSHOT|
-|[#346](https://github.com/NVIDIA/spark-rapids/pull/346)|Update the Title Header for Fine Tuning|
-|[#344](https://github.com/NVIDIA/spark-rapids/pull/344)|Fix small typo in readme|
-|[#331](https://github.com/NVIDIA/spark-rapids/pull/331)|Adds iterator and client unit tests, and prepares for more fetch failure handling|
-|[#337](https://github.com/NVIDIA/spark-rapids/pull/337)|Fix Scala compile phase to allow Java classes referencing Scala classes|
-|[#332](https://github.com/NVIDIA/spark-rapids/pull/332)|Match GPU overwritten functions with SQL functions from FunctionRegistry|
-|[#339](https://github.com/NVIDIA/spark-rapids/pull/339)|Fix databricks build|
-|[#338](https://github.com/NVIDIA/spark-rapids/pull/338)|Move GpuPartitioning to a separate file|
-|[#310](https://github.com/NVIDIA/spark-rapids/pull/310)|Update release Jenkinsfile for Databricks|
-|[#330](https://github.com/NVIDIA/spark-rapids/pull/330)|Hide private info in Jenkins scripts|
-|[#324](https://github.com/NVIDIA/spark-rapids/pull/324)|Add in basic support for GpuCartesianProductExec|
-|[#328](https://github.com/NVIDIA/spark-rapids/pull/328)|Enable slack notification for Databricks build|
-|[#321](https://github.com/NVIDIA/spark-rapids/pull/321)|update databricks patch for GpuBroadcastNestedLoopJoinExec|
-|[#322](https://github.com/NVIDIA/spark-rapids/pull/322)|Add oss.sonatype.org to download the cudf jar|
-|[#320](https://github.com/NVIDIA/spark-rapids/pull/320)|Don't mount passwd/group to the container|
-|[#258](https://github.com/NVIDIA/spark-rapids/pull/258)|Enable running TPCH tests with AQE enabled|
-|[#318](https://github.com/NVIDIA/spark-rapids/pull/318)|Build docker image with Dockerfile|
-|[#309](https://github.com/NVIDIA/spark-rapids/pull/309)|Update databricks patch to latest changes|
-|[#312](https://github.com/NVIDIA/spark-rapids/pull/312)|Trigger branch-0.2 integration test|
-|[#307](https://github.com/NVIDIA/spark-rapids/pull/307)|[Jenkins] Update the release script and Jenkinsfile|
-|[#304](https://github.com/NVIDIA/spark-rapids/pull/304)|[DOC][Minor] Fix typo in spark config name.|
-|[#303](https://github.com/NVIDIA/spark-rapids/pull/303)|Update compatibility doc for -0.0 issues|
-|[#301](https://github.com/NVIDIA/spark-rapids/pull/301)|Add info about branches in README.md|
-|[#296](https://github.com/NVIDIA/spark-rapids/pull/296)|Added in basic support for broadcast nested loop join|
-|[#297](https://github.com/NVIDIA/spark-rapids/pull/297)|Databricks CI improvements and support runtime env parameter to xfail certain tests|
-|[#292](https://github.com/NVIDIA/spark-rapids/pull/292)|Move artifacts version in version-def.sh|
-|[#254](https://github.com/NVIDIA/spark-rapids/pull/254)|Cleanup QA tests|
-|[#289](https://github.com/NVIDIA/spark-rapids/pull/289)|Clean up GpuCollectLimitMeta and add in metrics|
-|[#287](https://github.com/NVIDIA/spark-rapids/pull/287)|Add in support for right join and fix issues build right|
-|[#273](https://github.com/NVIDIA/spark-rapids/pull/273)|Added releases to the README.md|
-|[#285](https://github.com/NVIDIA/spark-rapids/pull/285)|modify run_pyspark_from_build.sh to be bash 3 friendly|
-|[#281](https://github.com/NVIDIA/spark-rapids/pull/281)|Add in support for Full Outer Join on non-null keys|
-|[#274](https://github.com/NVIDIA/spark-rapids/pull/274)|Add RapidsDiskStore tests|
-|[#259](https://github.com/NVIDIA/spark-rapids/pull/259)|Add RapidsHostMemoryStore tests|
-|[#282](https://github.com/NVIDIA/spark-rapids/pull/282)|Update Databricks patch for 0.2 branch|
-|[#261](https://github.com/NVIDIA/spark-rapids/pull/261)|Add conditional xfail test for DISTINCT aggregates with NaN|
-|[#263](https://github.com/NVIDIA/spark-rapids/pull/263)|More time ops|
-|[#256](https://github.com/NVIDIA/spark-rapids/pull/256)|Remove special cases for contains, startsWith, and endsWith|
-|[#253](https://github.com/NVIDIA/spark-rapids/pull/253)|Remove GpuAttributeReference and GpuSortOrder|
-|[#271](https://github.com/NVIDIA/spark-rapids/pull/271)|Update the versions for 0.2.0 properly for the databricks build|
-|[#162](https://github.com/NVIDIA/spark-rapids/pull/162)|Integration tests for corner cases in window functions.|
-|[#264](https://github.com/NVIDIA/spark-rapids/pull/264)|Add a local mvn repo for nightly pipeline|
-|[#262](https://github.com/NVIDIA/spark-rapids/pull/262)|Refer to branch-0.2|
-|[#255](https://github.com/NVIDIA/spark-rapids/pull/255)|Revert change to make dependencies of shaded jar optional|
-|[#257](https://github.com/NVIDIA/spark-rapids/pull/257)|Fix link to RAPIDS cudf in index.md|
-|[#252](https://github.com/NVIDIA/spark-rapids/pull/252)|Update to 0.2.0-SNAPSHOT and cudf-0.15-SNAPSHOT|
-
-## Release 0.1
-
-### Features
-|||
-|:---|:---|
-|[#74](https://github.com/NVIDIA/spark-rapids/issues/74)|[FEA] Support ToUnixTimestamp|
-|[#21](https://github.com/NVIDIA/spark-rapids/issues/21)|[FEA] NormalizeNansAndZeros|
-|[#105](https://github.com/NVIDIA/spark-rapids/issues/105)|[FEA] integration tests for equi-joins|
-
-### Bugs Fixed
-|||
-|:---|:---|
-|[#116](https://github.com/NVIDIA/spark-rapids/issues/116)|[BUG] calling replace with a NULL throws an exception|
-|[#168](https://github.com/NVIDIA/spark-rapids/issues/168)|[BUG] GpuUnitTests Date tests leak column vectors|
-|[#209](https://github.com/NVIDIA/spark-rapids/issues/209)|[BUG] Developers section in pom need to be updated|
-|[#204](https://github.com/NVIDIA/spark-rapids/issues/204)|[BUG] Code coverage docs are out of date|
-|[#154](https://github.com/NVIDIA/spark-rapids/issues/154)|[BUG] Incorrect output from partial-only averages with nulls|
-|[#61](https://github.com/NVIDIA/spark-rapids/issues/61)|[BUG] Cannot disable Parquet, ORC, CSV reading when using FileSourceScanExec|
-
-### PRs
-|||
-|:---|:---|
-|[#249](https://github.com/NVIDIA/spark-rapids/pull/249)|Compatability -> Compatibility|
-|[#247](https://github.com/NVIDIA/spark-rapids/pull/247)|Add index.md for default doc page, fix table formatting for configs|
-|[#241](https://github.com/NVIDIA/spark-rapids/pull/241)|Let default branch to master per the release rule|
-|[#177](https://github.com/NVIDIA/spark-rapids/pull/177)|Fixed leaks in unit test and use ColumnarBatch for testing|
-|[#243](https://github.com/NVIDIA/spark-rapids/pull/243)|Jenkins file for Databricks release|
-|[#225](https://github.com/NVIDIA/spark-rapids/pull/225)|Make internal project dependencies optional for shaded artifact|
-|[#242](https://github.com/NVIDIA/spark-rapids/pull/242)|Add site pages|
-|[#221](https://github.com/NVIDIA/spark-rapids/pull/221)|Databricks Build Support|
-|[#215](https://github.com/NVIDIA/spark-rapids/pull/215)|Remove CudfColumnVector|
-|[#213](https://github.com/NVIDIA/spark-rapids/pull/213)|Add RapidsDeviceMemoryStore tests|
-|[#214](https://github.com/NVIDIA/spark-rapids/pull/214)|[REVIEW] Test failure to pass Attribute as GpuAttribute|
-|[#211](https://github.com/NVIDIA/spark-rapids/pull/211)|Add project leads to pom developer list|
-|[#210](https://github.com/NVIDIA/spark-rapids/pull/210)|Updated coverage docs|
-|[#195](https://github.com/NVIDIA/spark-rapids/pull/195)|Support public release for plugin jar|
-|[#208](https://github.com/NVIDIA/spark-rapids/pull/208)|Remove unneeded comment from pom.xml|
-|[#191](https://github.com/NVIDIA/spark-rapids/pull/191)|WindowExec handle different spark distributions|
-|[#181](https://github.com/NVIDIA/spark-rapids/pull/181)|Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized|
-|[#196](https://github.com/NVIDIA/spark-rapids/pull/196)|Update Spark dependency to the released 3.0.0 artifacts|
-|[#206](https://github.com/NVIDIA/spark-rapids/pull/206)|Change groupID to 'com.nvidia' in IT scripts|
-|[#202](https://github.com/NVIDIA/spark-rapids/pull/202)|Fixed issue for contains when searching for an empty string|
-|[#201](https://github.com/NVIDIA/spark-rapids/pull/201)|Fix name of scan|
-|[#200](https://github.com/NVIDIA/spark-rapids/pull/200)|Fix issue with GpuAttributeReference not overriding references|
-|[#197](https://github.com/NVIDIA/spark-rapids/pull/197)|Fix metrics for writes|
-|[#186](https://github.com/NVIDIA/spark-rapids/pull/186)|Fixed issue with nullability on concat|
-|[#193](https://github.com/NVIDIA/spark-rapids/pull/193)|Add RapidsBufferCatalog tests|
-|[#188](https://github.com/NVIDIA/spark-rapids/pull/188)|rebrand to com.nvidia instead of ai.rapids|
-|[#189](https://github.com/NVIDIA/spark-rapids/pull/189)|Handle AggregateExpression having resultIds parameter instead of a single resultId|
-|[#190](https://github.com/NVIDIA/spark-rapids/pull/190)|FileSourceScanExec can have logicalRelation parameter on some distributions|
-|[#185](https://github.com/NVIDIA/spark-rapids/pull/185)|Update type of parameter of GpuExpandExec to make it consistent|
-|[#172](https://github.com/NVIDIA/spark-rapids/pull/172)|Merge qa test to integration test|
-|[#180](https://github.com/NVIDIA/spark-rapids/pull/180)|Add MetaUtils unit tests|
-|[#171](https://github.com/NVIDIA/spark-rapids/pull/171)|Cleanup scaladoc warnings about missing links|
-|[#176](https://github.com/NVIDIA/spark-rapids/pull/176)|Updated join tests to cover more data.|
-|[#169](https://github.com/NVIDIA/spark-rapids/pull/169)|Remove dependency on shaded Spark artifact|
-|[#174](https://github.com/NVIDIA/spark-rapids/pull/174)|Added in fallback tests|
-|[#165](https://github.com/NVIDIA/spark-rapids/pull/165)|Move input metadata tests to pyspark|
-|[#173](https://github.com/NVIDIA/spark-rapids/pull/173)|Fix setting local mode for tests|
-|[#160](https://github.com/NVIDIA/spark-rapids/pull/160)|Integration tests for normalizing NaN/zeroes.|
-|[#163](https://github.com/NVIDIA/spark-rapids/pull/163)|Ignore the order locally for repartition tests|
-|[#157](https://github.com/NVIDIA/spark-rapids/pull/157)|Add partial and final only hash aggregate tests and fix nulls corner case for Average|
-|[#159](https://github.com/NVIDIA/spark-rapids/pull/159)|Add integration tests for joins|
-|[#158](https://github.com/NVIDIA/spark-rapids/pull/158)|Orc merge schema fallback and FileScan format configs|
-|[#164](https://github.com/NVIDIA/spark-rapids/pull/164)|Fix compiler warnings|
-|[#152](https://github.com/NVIDIA/spark-rapids/pull/152)|Moved cudf to 0.14 for CI|
-|[#151](https://github.com/NVIDIA/spark-rapids/pull/151)|Switch CICD pipelines to Github|
+## Older Releases
+Changelog of older releases can be found at [docs/archives](/docs/archives)
diff --git a/NOTICE-binary b/NOTICE-binary
index 8066d8545c3..0c0021116d0 100644
--- a/NOTICE-binary
+++ b/NOTICE-binary
@@ -12,17 +12,6 @@ Copyright 2014 and onwards The Apache Software Foundation
This product includes software developed at
The Apache Software Foundation (http://www.apache.org/).
----------------------------------------------------------------------
-
-Apache ORC
-Copyright 2013-2019 The Apache Software Foundation
-
-This product includes software developed by The Apache Software
-Foundation (http://www.apache.org/).
-
-This product includes software developed by Hewlett-Packard:
-(c) Copyright [2014-2015] Hewlett-Packard Development Company, L.P
-
---------------------------------------------------------------------
UCF Consortium - Unified Communication X (UCX)
diff --git a/aggregator/pom.xml b/aggregator/pom.xml
index 4ea80017800..b4663bafa2c 100644
--- a/aggregator/pom.xml
+++ b/aggregator/pom.xml
@@ -88,53 +88,8 @@
-                <relocation>
-                  <pattern>org.apache.orc.</pattern>
-                  <shadedPattern>${rapids.shade.package}.orc.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>org.apache.hadoop.hive.</pattern>
-                  <shadedPattern>${rapids.shade.package}.hadoop.hive.</shadedPattern>
-                  <excludes>
-                    <exclude>org.apache.hadoop.hive.conf.HiveConf</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.exec.FunctionRegistry</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.exec.UDF</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.exec.UDFMethodResolver</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.udf.UDFType</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDF$DeferredObject</exclude>
-                    <exclude>org.apache.hadoop.hive.ql.udf.generic.GenericUDFUtils$ConversionHelper</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.objectinspector.ObjectInspectorFactory$ObjectInspectorOptions</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.objectinspector.StructField</exclude>
-                    <exclude>org.apache.hadoop.hive.serde2.typeinfo.TypeInfo</exclude>
-                  </excludes>
-                </relocation>
-                <relocation>
-                  <pattern>org.apache.hive.</pattern>
-                  <shadedPattern>${rapids.shade.package}.hive.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>io.airlift.compress.</pattern>
-                  <shadedPattern>${rapids.shade.package}.io.airlift.compress.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>org.apache.commons.codec.</pattern>
-                  <shadedPattern>${rapids.shade.package}.org.apache.commons.codec.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>org.apache.commons.lang.</pattern>
-                  <shadedPattern>${rapids.shade.package}.org.apache.commons.lang.</shadedPattern>
-                </relocation>
-                <relocation>
-                  <pattern>com.google</pattern>
-                  <shadedPattern>${rapids.shade.package}.com.google</shadedPattern>
+                  <pattern>com.google.flatbuffers</pattern>
+                  <shadedPattern>${rapids.shade.package}.com.google.flatbuffers</shadedPattern>
diff --git a/common/pom.xml b/common/pom.xml
new file mode 100644
index 00000000000..3f46ea8459f
--- /dev/null
+++ b/common/pom.xml
@@ -0,0 +1,96 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+  Copyright (c) 2022, NVIDIA CORPORATION.
+
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+
+     http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+-->
+<project xmlns="http://maven.apache.org/POM/4.0.0"
+         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
+    <modelVersion>4.0.0</modelVersion>
+
+    <parent>
+        <groupId>com.nvidia</groupId>
+        <artifactId>rapids-4-spark-parent</artifactId>
+        <version>22.04.0-SNAPSHOT</version>
+    </parent>
+
+    <artifactId>rapids-4-spark-common_2.12</artifactId>
+    <name>RAPIDS Accelerator for Apache Spark Common</name>
+    <description>Utility code that is common across the RAPIDS Accelerator projects</description>
+    <version>22.04.0-SNAPSHOT</version>
+
+    <dependencies>
+        <dependency>
+            <groupId>org.scala-lang</groupId>
+            <artifactId>scala-library</artifactId>
+        </dependency>
+        <dependency>
+            <groupId>org.scalatest</groupId>
+            <artifactId>scalatest_${scala.binary.version}</artifactId>
+            <scope>test</scope>
+        </dependency>
+    </dependencies>
+
+    <build>
+        <resources>
+            <resource>
+                <directory>${project.build.directory}/extra-resources</directory>
+                <filtering>true</filtering>
+            </resource>
+            <resource>
+                <directory>${project.basedir}/..</directory>
+                <targetPath>META-INF</targetPath>
+                <includes>
+                    <include>LICENSE</include>
+                </includes>
+            </resource>
+        </resources>
+        <plugins>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-jar-plugin</artifactId>
+                <executions>
+                    <execution>
+                        <id>default-test-jar</id>
+                        <phase>none</phase>
+                    </execution>
+                </executions>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.maven.plugins</groupId>
+                <artifactId>maven-surefire-plugin</artifactId>
+                <configuration>
+                    <skipTests>true</skipTests>
+                </configuration>
+            </plugin>
+            <plugin>
+                <groupId>net.alchim31.maven</groupId>
+                <artifactId>scala-maven-plugin</artifactId>
+            </plugin>
+            <plugin>
+                <groupId>org.apache.rat</groupId>
+                <artifactId>apache-rat-plugin</artifactId>
+            </plugin>
+            <plugin>
+                <groupId>org.scalatest</groupId>
+                <artifactId>scalatest-maven-plugin</artifactId>
+            </plugin>
+        </plugins>
+    </build>
+</project>
diff --git a/common/src/main/scala/com/nvidia/spark/rapids/CheckUtils.scala b/common/src/main/scala/com/nvidia/spark/rapids/CheckUtils.scala
new file mode 100644
index 00000000000..65ab724cc50
--- /dev/null
+++ b/common/src/main/scala/com/nvidia/spark/rapids/CheckUtils.scala
@@ -0,0 +1,23 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+object CheckUtils {
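+ /**
+  * Throws IllegalArgumentException with `msg` when `expression` is false,
+  * a Guava-free stand-in for Preconditions.checkArgument. For example (with
+  * `batchSize` being a hypothetical caller-side value):
+  * CheckUtils.checkArgument(batchSize > 0, s"batchSize must be positive: $batchSize")
+  */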
+ def checkArgument(expression: Boolean, msg: String): Unit = {
+ if (!expression) throw new IllegalArgumentException(msg)
+ }
+}
diff --git a/common/src/main/scala/com/nvidia/spark/rapids/ThreadFactoryBuilder.scala b/common/src/main/scala/com/nvidia/spark/rapids/ThreadFactoryBuilder.scala
new file mode 100644
index 00000000000..d61dd5a9c90
--- /dev/null
+++ b/common/src/main/scala/com/nvidia/spark/rapids/ThreadFactoryBuilder.scala
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import java.util.concurrent.{Executors, ThreadFactory}
+import java.util.concurrent.atomic.AtomicLong
+
+// This is similar to Guava's ThreadFactoryBuilder.
+// We avoid using Guava because it is a messy dependency in practice.
+class ThreadFactoryBuilder {
+ private var nameFormat = Option.empty[String]
+ private var daemon = Option.empty[Boolean]
+
+ def setNameFormat(nameFormat: String): ThreadFactoryBuilder = {
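+ // Eagerly apply the format to a dummy value so that an invalid format string
+ // fails here rather than later when threads are created.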
+ nameFormat.format(0)
+ this.nameFormat = Some(nameFormat)
+ this
+ }
+
+ def setDaemon(daemon: Boolean): ThreadFactoryBuilder = {
+ this.daemon = Some(daemon)
+ this
+ }
+
+ def build(): ThreadFactory = {
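+ // Only create the name counter when a name format was supplied; otherwise the
+ // default thread factory's names are left unchanged.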
+ val count = nameFormat.map(_ => new AtomicLong(0))
+ new ThreadFactory() {
+ private val defaultThreadFactory = Executors.defaultThreadFactory
+
+ override def newThread(r: Runnable): Thread = {
+ val thread = defaultThreadFactory.newThread(r)
+ nameFormat.foreach(f => thread.setName(f.format(count.get.getAndIncrement())))
+ daemon.foreach(b => thread.setDaemon(b))
+ thread
+ }
+ }
+ }
+}
diff --git a/common/src/test/scala/com/nvidia/spark/rapids/ThreadFactoryBuilderTest.scala b/common/src/test/scala/com/nvidia/spark/rapids/ThreadFactoryBuilderTest.scala
new file mode 100644
index 00000000000..d71915f51d0
--- /dev/null
+++ b/common/src/test/scala/com/nvidia/spark/rapids/ThreadFactoryBuilderTest.scala
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import java.util.concurrent.{Callable, Executors}
+
+import org.scalatest.FunSuite
+
+class ThreadFactoryBuilderTest extends FunSuite {
+
+ test("test thread factory builder") {
+ val pool1 = Executors.newFixedThreadPool(2,
+ new ThreadFactoryBuilder().setNameFormat("thread-pool1-1 %s").setDaemon(true).build())
+ try {
+ var ret = pool1.submit(new Callable[String] {
+ override def call(): String = {
+ assert(Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "thread-pool1-1 0")
+ ""
+ }
+ })
+ // Wait for and retrieve the result; if the asserts above failed, this throws an ExecutionException.
+ ret.get()
+ ret = pool1.submit(() => {
+ assert(Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "thread-pool1-1 1")
+ ""
+ })
+ ret.get()
+ } finally {
+ pool1.shutdown()
+ }
+
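+ // pool2 uses a %d name format and no explicit daemon flag: Executors.defaultThreadFactory
+ // creates non-daemon threads, so these should not be daemons.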
+ val pool2 = Executors.newFixedThreadPool(2,
+ new ThreadFactoryBuilder().setNameFormat("pool2-%d").build())
+ try {
+ var ret = pool2.submit(new Callable[String] {
+ override def call(): String = {
+ assert(!Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "pool2-0")
+ ""
+ }
+ })
+ ret.get()
+ ret = pool2.submit(() => {
+ assert(!Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "pool2-1")
+ ""
+ })
+ ret.get()
+ } finally {
+ pool2.shutdown()
+ }
+
+ val pool3 = Executors.newFixedThreadPool(2,
+ new ThreadFactoryBuilder().setNameFormat("pool3-%d").setDaemon(false).build())
+ try {
+ pool3.submit(new Callable[String] {
+ override def call(): String = {
+ assert(!Thread.currentThread().isDaemon)
+ assert(Thread.currentThread().getName == "pool3-0")
+ ""
+ }
+ }).get()
+ } finally {
+ pool3.shutdown()
+ }
+ }
+}
diff --git a/docs/archives/CHANGELOG_0.1_to_0.5.md b/docs/archives/CHANGELOG_0.1_to_0.5.md
new file mode 100644
index 00000000000..fa5412f7d8a
--- /dev/null
+++ b/docs/archives/CHANGELOG_0.1_to_0.5.md
@@ -0,0 +1,1325 @@
+# Change log
+Generated on 2022-01-28
+
+## Release 0.5
+
+### Features
+|||
+|:---|:---|
+|[#938](https://github.com/NVIDIA/spark-rapids/issues/938)|[FEA] Have hashed shuffle match spark|
+|[#1604](https://github.com/NVIDIA/spark-rapids/issues/1604)|[FEA] Support casting structs to strings |
+|[#1920](https://github.com/NVIDIA/spark-rapids/issues/1920)|[FEA] Support murmur3 hashing of structs|
+|[#2018](https://github.com/NVIDIA/spark-rapids/issues/2018)|[FEA] A way for user to find out the plugin version and cudf version in REPL|
+|[#77](https://github.com/NVIDIA/spark-rapids/issues/77)|[FEA] Support ArrayContains|
+|[#1721](https://github.com/NVIDIA/spark-rapids/issues/1721)|[FEA] build cudf jars with NVTX enabled|
+|[#1782](https://github.com/NVIDIA/spark-rapids/issues/1782)|[FEA] Shim layers to support spark versions|
+|[#1625](https://github.com/NVIDIA/spark-rapids/issues/1625)|[FEA] Support Decimal Casts to String and String to Decimal|
+|[#166](https://github.com/NVIDIA/spark-rapids/issues/166)|[FEA] Support get_json_object|
+|[#1698](https://github.com/NVIDIA/spark-rapids/issues/1698)|[FEA] Support casting structs to string|
+|[#1912](https://github.com/NVIDIA/spark-rapids/issues/1912)|[FEA] Let `Scalar Pandas UDF ` support array of struct type.|
+|[#1136](https://github.com/NVIDIA/spark-rapids/issues/1136)|[FEA] Audit: Script to list commits between different Spark versions/tags|
+|[#1921](https://github.com/NVIDIA/spark-rapids/issues/1921)|[FEA] cudf version check should be lenient on later patch version|
+|[#19](https://github.com/NVIDIA/spark-rapids/issues/19)|[FEA] Out of core sorts|
+
+### Performance
+|||
+|:---|:---|
+|[#2090](https://github.com/NVIDIA/spark-rapids/issues/2090)|[FEA] Make row count estimates available to the cost-based optimizer|
+|[#1341](https://github.com/NVIDIA/spark-rapids/issues/1341)|Optimize unnecessary columnar->row->columnar transitions with AQE|
+|[#1558](https://github.com/NVIDIA/spark-rapids/issues/1558)|[FEA] Initialize UCX early|
+|[#1633](https://github.com/NVIDIA/spark-rapids/issues/1633)|[FEA] Implement a cost-based optimizer|
+|[#1727](https://github.com/NVIDIA/spark-rapids/issues/1727)|[FEA] Put RangePartitioner data path on the GPU|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#2279](https://github.com/NVIDIA/spark-rapids/issues/2279)|[BUG] Hash Partitioning can fail for very small batches|
+|[#2314](https://github.com/NVIDIA/spark-rapids/issues/2314)|[BUG] v0.5.0 pre-release pytests join_test.py::test_hash_join_array FAILED on SPARK-EGX Yarn Cluster|
+|[#2317](https://github.com/NVIDIA/spark-rapids/issues/2317)|[BUG] GpuColumnarToRowIterator can stop after receiving an empty batch|
+|[#2244](https://github.com/NVIDIA/spark-rapids/issues/2244)|[BUG] Executors hanging when running NDS benchmarks|
+|[#2278](https://github.com/NVIDIA/spark-rapids/issues/2278)|[BUG] FullOuter join can produce too many results|
+|[#2220](https://github.com/NVIDIA/spark-rapids/issues/2220)|[BUG] csv_test.py::test_csv_fallback FAILED on the EMR Cluster|
+|[#2225](https://github.com/NVIDIA/spark-rapids/issues/2225)|[BUG] GpuSort fails on tables containing arrays.|
+|[#2232](https://github.com/NVIDIA/spark-rapids/issues/2232)|[BUG] hash_aggregate_test.py::test_hash_grpby_pivot FAILED on the Databricks Cluster|
+|[#2231](https://github.com/NVIDIA/spark-rapids/issues/2231)|[BUG]string_test.py::test_re_replace FAILED on the Dataproc Cluster|
+|[#2042](https://github.com/NVIDIA/spark-rapids/issues/2042)|[BUG] NDS q14a fails with "GpuColumnarToRow does not implement doExecuteBroadcast"|
+|[#2203](https://github.com/NVIDIA/spark-rapids/issues/2203)|[BUG] Spark nightly cache tests fail with -- master flag|
+|[#2230](https://github.com/NVIDIA/spark-rapids/issues/2230)|[BUG] qa_nightly_select_test.py::test_select FAILED on the Dataproc Cluster|
+|[#1711](https://github.com/NVIDIA/spark-rapids/issues/1711)|[BUG] find a way to stop allocating from RMM on the shuffle-client thread|
+|[#2109](https://github.com/NVIDIA/spark-rapids/issues/2109)|[BUG] Fix high priority violations detected by code analysis tools|
+|[#2217](https://github.com/NVIDIA/spark-rapids/issues/2217)|[BUG] qa_nightly_select_test failure in test_select |
+|[#2127](https://github.com/NVIDIA/spark-rapids/issues/2127)|[BUG] Parsing with two-digit year should fall back to CPU|
+|[#2078](https://github.com/NVIDIA/spark-rapids/issues/2078)|[BUG] java.lang.ArithmeticException: divide by zero when spark.sql.ansi.enabled=true|
+|[#2048](https://github.com/NVIDIA/spark-rapids/issues/2048)|[BUG] split function+ repartition result in "ai.rapids.cudf.CudaException: device-side assert triggered"|
+|[#2036](https://github.com/NVIDIA/spark-rapids/issues/2036)|[BUG] Stackoverflow when writing wide parquet files.|
+|[#1973](https://github.com/NVIDIA/spark-rapids/issues/1973)|[BUG] generate_expr_test FAILED on Dataproc Cluster|
+|[#2079](https://github.com/NVIDIA/spark-rapids/issues/2079)|[BUG] koalas.sql fails with java.lang.ArrayIndexOutOfBoundsException|
+|[#217](https://github.com/NVIDIA/spark-rapids/issues/217)|[BUG] CudaUtil should be removed|
+|[#1550](https://github.com/NVIDIA/spark-rapids/issues/1550)|[BUG] The ORC output data of a query is not readable|
+|[#2074](https://github.com/NVIDIA/spark-rapids/issues/2074)|[BUG] Intermittent NPE in RapidsBufferCatalog when running test suite|
+|[#2027](https://github.com/NVIDIA/spark-rapids/issues/2027)|[BUG] udf_cudf_test.py integration tests fail |
+|[#1899](https://github.com/NVIDIA/spark-rapids/issues/1899)|[BUG] Some queries fail when cost-based optimizations are enabled|
+|[#1914](https://github.com/NVIDIA/spark-rapids/issues/1914)|[BUG] Add in float, double, timestamp, and date support to murmur3|
+|[#2014](https://github.com/NVIDIA/spark-rapids/issues/2014)|[BUG] earlyStart option added in 0.5 can cause errors when starting UCX|
+|[#1984](https://github.com/NVIDIA/spark-rapids/issues/1984)|[BUG] NDS q58 Decimal scale (59) cannot be greater than precision (38).|
+|[#2001](https://github.com/NVIDIA/spark-rapids/issues/2001)|[BUG] RapidsShuffleManager didn't pass `dirs` to `getBlockData` from a wrapped `ShuffleBlockResolver`|
+|[#1797](https://github.com/NVIDIA/spark-rapids/issues/1797)|[BUG] occasional crashes in CI|
+|[#1861](https://github.com/NVIDIA/spark-rapids/issues/1861)|Encountered column data outside the range of input buffer|
+|[#1905](https://github.com/NVIDIA/spark-rapids/issues/1905)|[BUG] Large concat task time in GpuShuffleCoalesce with pinned memory pool|
+|[#1638](https://github.com/NVIDIA/spark-rapids/issues/1638)|[BUG] Tests `test_window_aggs_for_rows_collect_list` fails when there are null values in columns.|
+|[#1864](https://github.com/NVIDIA/spark-rapids/issues/1864)|[BUG]HostColumnarToGPU inefficient when only doing count()|
+|[#1862](https://github.com/NVIDIA/spark-rapids/issues/1862)|[BUG] spark 3.2.0-snapshot integration test failed due to conf change|
+|[#1844](https://github.com/NVIDIA/spark-rapids/issues/1844)|[BUG] branch-0.5 nightly IT FAILED on the The mortgage ETL test "Could not read footer for file: file:/xxx/xxx.snappy.parquet"|
+|[#1627](https://github.com/NVIDIA/spark-rapids/issues/1627)|[BUG] GDS exception when restoring spilled buffer|
+|[#1802](https://github.com/NVIDIA/spark-rapids/issues/1802)|[BUG] Many decimal integration test failures for 0.5|
+
+### PRs
+|||
+|:---|:---|
+|[#2326](https://github.com/NVIDIA/spark-rapids/pull/2326)|Update changelog for 0.5.0 release|
+|[#2316](https://github.com/NVIDIA/spark-rapids/pull/2316)|Update doc to note that single quoted json strings are not ok|
+|[#2319](https://github.com/NVIDIA/spark-rapids/pull/2319)|Disable hash partitioning on arrays|
+|[#2318](https://github.com/NVIDIA/spark-rapids/pull/2318)|Fix ColumnarToRowIterator handling of empty batches|
+|[#2304](https://github.com/NVIDIA/spark-rapids/pull/2304)|Update CHANGELOG.md|
+|[#2301](https://github.com/NVIDIA/spark-rapids/pull/2301)|Update doc to reflect nanosleep problem with 460.32.03|
+|[#2298](https://github.com/NVIDIA/spark-rapids/pull/2298)|Update changelog for v0.5.0 release [skip ci]|
+|[#2293](https://github.com/NVIDIA/spark-rapids/pull/2293)|update cudf version to 0.19.2|
+|[#2289](https://github.com/NVIDIA/spark-rapids/pull/2289)|Update docs to warn against 450.80.02 driver with 10.x toolkit|
+|[#2285](https://github.com/NVIDIA/spark-rapids/pull/2285)|Require single batch for full outer join streaming|
+|[#2281](https://github.com/NVIDIA/spark-rapids/pull/2281)|Remove download section for unreleased 0.4.2|
+|[#2264](https://github.com/NVIDIA/spark-rapids/pull/2264)|Add spark312 and spark320 versions of cache serializer|
+|[#2254](https://github.com/NVIDIA/spark-rapids/pull/2254)|updated gcp docs with custom dataproc image instructions|
+|[#2247](https://github.com/NVIDIA/spark-rapids/pull/2247)|Allow specifying a superclass for non-GPU execs|
+|[#2235](https://github.com/NVIDIA/spark-rapids/pull/2235)|Fix distributed cache to read requested schema |
+|[#2261](https://github.com/NVIDIA/spark-rapids/pull/2261)|Make CBO row count test more robust|
+|[#2237](https://github.com/NVIDIA/spark-rapids/pull/2237)|update cudf version to 0.19.1|
+|[#2240](https://github.com/NVIDIA/spark-rapids/pull/2240)|Get the correct 'PIPESTATUS' in bash [skip ci]|
+|[#2242](https://github.com/NVIDIA/spark-rapids/pull/2242)|Add shuffle doc section on the periodicGC configuration|
+|[#2251](https://github.com/NVIDIA/spark-rapids/pull/2251)|Fix issue when out of core sorting nested data types|
+|[#2204](https://github.com/NVIDIA/spark-rapids/pull/2204)|Run nightly tests for ParquetCachedBatchSerializer|
+|[#2245](https://github.com/NVIDIA/spark-rapids/pull/2245)|Fix pivot bug for decimalType|
+|[#2093](https://github.com/NVIDIA/spark-rapids/pull/2093)|Initial implementation of row count estimates in cost-based optimizer|
+|[#2188](https://github.com/NVIDIA/spark-rapids/pull/2188)|Support GPU broadcast exchange reuse to feed CPU BHJ when AQE is enabled|
+|[#2227](https://github.com/NVIDIA/spark-rapids/pull/2227)|ParquetCachedBatchSerializer broadcast AllConfs instead of SQLConf to fix distributed mode|
+|[#2223](https://github.com/NVIDIA/spark-rapids/pull/2223)|Adds subquery aggregate tests from SPARK-31620|
+|[#2222](https://github.com/NVIDIA/spark-rapids/pull/2222)|Remove groupId already specified in parent pom|
+|[#2209](https://github.com/NVIDIA/spark-rapids/pull/2209)|Fixed a few issues with out of core sort|
+|[#2218](https://github.com/NVIDIA/spark-rapids/pull/2218)|Fix incorrect RegExpReplace children handling on Spark 3.1+|
+|[#2207](https://github.com/NVIDIA/spark-rapids/pull/2207)|fix batch size default values in the tuning guide|
+|[#2208](https://github.com/NVIDIA/spark-rapids/pull/2208)|Revert "add nightly cache tests (#2083)"|
+|[#2206](https://github.com/NVIDIA/spark-rapids/pull/2206)|Fix shim301db build|
+|[#2192](https://github.com/NVIDIA/spark-rapids/pull/2192)|Fix index-based access to the head elements|
+|[#2210](https://github.com/NVIDIA/spark-rapids/pull/2210)|Avoid redundant collection conversions|
+|[#2190](https://github.com/NVIDIA/spark-rapids/pull/2190)|JNI fixes for StringWordCount native UDF example|
+|[#2086](https://github.com/NVIDIA/spark-rapids/pull/2086)|Updating documentation for data format support|
+|[#2172](https://github.com/NVIDIA/spark-rapids/pull/2172)|Remove easy unused symbols|
+|[#2089](https://github.com/NVIDIA/spark-rapids/pull/2089)|Update PandasUDF doc|
+|[#2195](https://github.com/NVIDIA/spark-rapids/pull/2195)|fix cudf 0.19.0 download link [skip ci]|
+|[#2175](https://github.com/NVIDIA/spark-rapids/pull/2175)|Branch 0.5 doc update|
+|[#2168](https://github.com/NVIDIA/spark-rapids/pull/2168)|Simplify GpuExpressions w/ withResourceIfAllowed|
+|[#2055](https://github.com/NVIDIA/spark-rapids/pull/2055)|Support PivotFirst|
+|[#2183](https://github.com/NVIDIA/spark-rapids/pull/2183)|GpuParquetScan#readBufferToTable remove dead code|
+|[#2129](https://github.com/NVIDIA/spark-rapids/pull/2129)|Fall back to CPU when parsing two-digit years|
+|[#2083](https://github.com/NVIDIA/spark-rapids/pull/2083)|add nightly cache tests|
+|[#2151](https://github.com/NVIDIA/spark-rapids/pull/2151)|add corresponding close call for HostMemoryOutputStream|
+|[#2169](https://github.com/NVIDIA/spark-rapids/pull/2169)|Work around bug in Spark for integration test|
+|[#2130](https://github.com/NVIDIA/spark-rapids/pull/2130)|Fix divide-by-zero in GpuAverage with ansi mode|
+|[#2149](https://github.com/NVIDIA/spark-rapids/pull/2149)|Auto generate the supported types for the file formats|
+|[#2072](https://github.com/NVIDIA/spark-rapids/pull/2072)|Disable CSV parsing by default and update tests to better show what is left|
+|[#2157](https://github.com/NVIDIA/spark-rapids/pull/2157)|fix merge conflict for 0.4.2 [skip ci]|
+|[#2144](https://github.com/NVIDIA/spark-rapids/pull/2144)|Allow array and struct types to pass thru when doing join|
+|[#2145](https://github.com/NVIDIA/spark-rapids/pull/2145)|Avoid GPU shuffle for round-robin of unsortable types|
+|[#2021](https://github.com/NVIDIA/spark-rapids/pull/2021)|Add in support for murmur3 hashing of structs|
+|[#2128](https://github.com/NVIDIA/spark-rapids/pull/2128)|Add in Partition type check support|
+|[#2116](https://github.com/NVIDIA/spark-rapids/pull/2116)|Add dynamic Spark configuration for Databricks|
+|[#2132](https://github.com/NVIDIA/spark-rapids/pull/2132)|Log plugin and cudf versions on startup|
+|[#2135](https://github.com/NVIDIA/spark-rapids/pull/2135)|Disable Spark 3.2 shim by default|
+|[#2125](https://github.com/NVIDIA/spark-rapids/pull/2125)|enable auto-merge from 0.5 to 0.6 [skip ci]|
+|[#2120](https://github.com/NVIDIA/spark-rapids/pull/2120)|Materialize Stream before serialization|
+|[#2119](https://github.com/NVIDIA/spark-rapids/pull/2119)|Add more comprehensive documentation on supported date formats|
+|[#1717](https://github.com/NVIDIA/spark-rapids/pull/1717)|Decimal32 support|
+|[#2114](https://github.com/NVIDIA/spark-rapids/pull/2114)|Modified the Download page for 0.4.1 and updated doc to point to K8s guide|
+|[#2106](https://github.com/NVIDIA/spark-rapids/pull/2106)|Fix some buffer leaks|
+|[#2097](https://github.com/NVIDIA/spark-rapids/pull/2097)|fix the bound row project empty issue in row frame|
+|[#2099](https://github.com/NVIDIA/spark-rapids/pull/2099)|Remove verbose log prints to make the build/test log clean|
+|[#2105](https://github.com/NVIDIA/spark-rapids/pull/2105)|Cleanup prior Spark sessions in tests consistently|
+|[#2104](https://github.com/NVIDIA/spark-rapids/pull/2104)| Clone apache spark source code to parse the git commit IDs|
+|[#2095](https://github.com/NVIDIA/spark-rapids/pull/2095)|fix refcount when materializing device buffer from GDS|
+|[#2100](https://github.com/NVIDIA/spark-rapids/pull/2100)|[BUG] add wget for fetching conda [skip ci]|
+|[#2096](https://github.com/NVIDIA/spark-rapids/pull/2096)|Adjust images for integration tests|
+|[#2094](https://github.com/NVIDIA/spark-rapids/pull/2094)|Changed name of parquet files for Mortgage ETL Integration test|
+|[#2035](https://github.com/NVIDIA/spark-rapids/pull/2035)|Accelerate data transfer for map Pandas UDF plan|
+|[#2050](https://github.com/NVIDIA/spark-rapids/pull/2050)|stream shuffle buffers from GDS to UCX|
+|[#2084](https://github.com/NVIDIA/spark-rapids/pull/2084)|Enable ORC write by default|
+|[#2088](https://github.com/NVIDIA/spark-rapids/pull/2088)|Upgrade ScalaTest plugin to respect JAVA_HOME|
+|[#1932](https://github.com/NVIDIA/spark-rapids/pull/1932)|Create a getting started on K8s page|
+|[#2080](https://github.com/NVIDIA/spark-rapids/pull/2080)|Improve error message after failed RMM shutdown|
+|[#2064](https://github.com/NVIDIA/spark-rapids/pull/2064)|Optimize unnecessary columnar->row->columnar transitions with AQE|
+|[#2025](https://github.com/NVIDIA/spark-rapids/pull/2025)|Update the doc for pandas udf on databricks|
+|[#2059](https://github.com/NVIDIA/spark-rapids/pull/2059)|Add the flag 'TEST_TYPE' to avoid integration tests silently skipping some test cases|
+|[#2075](https://github.com/NVIDIA/spark-rapids/pull/2075)|Remove debug println from CBO test|
+|[#2046](https://github.com/NVIDIA/spark-rapids/pull/2046)|support casting Decimal to String|
+|[#1812](https://github.com/NVIDIA/spark-rapids/pull/1812)|allow spilled buffers to be unspilled|
+|[#2061](https://github.com/NVIDIA/spark-rapids/pull/2061)|Run the pandas udf using cudf on Databricks|
+|[#1893](https://github.com/NVIDIA/spark-rapids/pull/1893)|Plug-in support for get_json_object|
+|[#2044](https://github.com/NVIDIA/spark-rapids/pull/2044)|Use partition for GPU hash partitioning|
+|[#1954](https://github.com/NVIDIA/spark-rapids/pull/1954)|Fix CBO bug where incompatible plans were produced with AQE on|
+|[#2049](https://github.com/NVIDIA/spark-rapids/pull/2049)|Remove incompatable int overflow checking|
+|[#2056](https://github.com/NVIDIA/spark-rapids/pull/2056)|Remove Spark 3.2 from premerge and nightly CI run|
+|[#1814](https://github.com/NVIDIA/spark-rapids/pull/1814)|Struct to string casting functionality|
+|[#2037](https://github.com/NVIDIA/spark-rapids/pull/2037)|Fix warnings from use of deprecated cudf methods|
+|[#2033](https://github.com/NVIDIA/spark-rapids/pull/2033)|Bump up pre-merge OS from ubuntu 16 to ubuntu 18 [skip ci]|
+|[#1883](https://github.com/NVIDIA/spark-rapids/pull/1883)|Enable sort for single-level nesting struct columns on GPU|
+|[#2016](https://github.com/NVIDIA/spark-rapids/pull/2016)|Refactor logic for parallel testing|
+|[#2022](https://github.com/NVIDIA/spark-rapids/pull/2022)|Update order by to not load native libraries when sorting|
+|[#2017](https://github.com/NVIDIA/spark-rapids/pull/2017)|Add in murmur3 support for float, double, date and timestamp|
+|[#1981](https://github.com/NVIDIA/spark-rapids/pull/1981)|Fix GpuSize|
+|[#1999](https://github.com/NVIDIA/spark-rapids/pull/1999)|support casting string to decimal|
+|[#2006](https://github.com/NVIDIA/spark-rapids/pull/2006)|Enable windowed `collect_list` by default|
+|[#2000](https://github.com/NVIDIA/spark-rapids/pull/2000)|Use Spark's HybridRowQueue to avoid MemoryConsumer API shim|
+|[#2015](https://github.com/NVIDIA/spark-rapids/pull/2015)|Fix bug where rkey buffer is getting advanced after the first handshake|
+|[#2007](https://github.com/NVIDIA/spark-rapids/pull/2007)|Fix unknown column name error when filtering ORC file with no names|
+|[#2005](https://github.com/NVIDIA/spark-rapids/pull/2005)|Update to new is_before_spark_311 function name|
+|[#1944](https://github.com/NVIDIA/spark-rapids/pull/1944)|Support running scalar pandas UDF with array type.|
+|[#1991](https://github.com/NVIDIA/spark-rapids/pull/1991)|Fixes creation of invalid DecimalType in GpuDivide.tagExprForGpu|
+|[#1958](https://github.com/NVIDIA/spark-rapids/pull/1958)|Support legacy behavior of parameterless count |
+|[#1919](https://github.com/NVIDIA/spark-rapids/pull/1919)|Add support for Structs for UnionExec|
+|[#2002](https://github.com/NVIDIA/spark-rapids/pull/2002)|Pass dirs to getBlockData for a wrapped shuffle resolver|
+|[#1983](https://github.com/NVIDIA/spark-rapids/pull/1983)|document building against different CUDA Toolkit versions|
+|[#1994](https://github.com/NVIDIA/spark-rapids/pull/1994)|Merge 0.4 to 0.5 [skip ci]|
+|[#1982](https://github.com/NVIDIA/spark-rapids/pull/1982)|Update ORC pushdown filter building to latest Spark logic|
+|[#1978](https://github.com/NVIDIA/spark-rapids/pull/1978)|Add audit script to list commits from Spark|
+|[#1976](https://github.com/NVIDIA/spark-rapids/pull/1976)|Temp fix for parquet write changes|
+|[#1970](https://github.com/NVIDIA/spark-rapids/pull/1970)|add maven profiles for supported CUDA versions|
+|[#1951](https://github.com/NVIDIA/spark-rapids/pull/1951)|Branch 0.5 doc remove numpartitions|
+|[#1967](https://github.com/NVIDIA/spark-rapids/pull/1967)|Update FAQ for Dataset API and format supported versions|
+|[#1972](https://github.com/NVIDIA/spark-rapids/pull/1972)|support GpuSize|
+|[#1966](https://github.com/NVIDIA/spark-rapids/pull/1966)|add xml report for codecov|
+|[#1955](https://github.com/NVIDIA/spark-rapids/pull/1955)|Fix typo in Arrow optimization config|
+|[#1956](https://github.com/NVIDIA/spark-rapids/pull/1956)|Fix NPE in plugin shutdown|
+|[#1930](https://github.com/NVIDIA/spark-rapids/pull/1930)|Relax cudf version check for patch-level versions|
+|[#1787](https://github.com/NVIDIA/spark-rapids/pull/1787)|support distributed file path in cloud environment|
+|[#1961](https://github.com/NVIDIA/spark-rapids/pull/1961)|change premege GPU_TYPE from secret to global env [skip ci]|
+|[#1957](https://github.com/NVIDIA/spark-rapids/pull/1957)|Update Spark 3.1.2 shim for float upcast behavior|
+|[#1889](https://github.com/NVIDIA/spark-rapids/pull/1889)|Decimal DIV changes |
+|[#1947](https://github.com/NVIDIA/spark-rapids/pull/1947)|Move doc of Pandas UDF to additional-functionality|
+|[#1938](https://github.com/NVIDIA/spark-rapids/pull/1938)|Add spark.executor.resource.gpu.amount=1 to YARN and K8s docs|
+|[#1937](https://github.com/NVIDIA/spark-rapids/pull/1937)|Fix merge conflict with branch-0.4|
+|[#1878](https://github.com/NVIDIA/spark-rapids/pull/1878)|spillable cache for GpuCartesianRDD|
+|[#1843](https://github.com/NVIDIA/spark-rapids/pull/1843)|Refactor GpuGenerateExec and Explode|
+|[#1933](https://github.com/NVIDIA/spark-rapids/pull/1933)|Split DB scripts to make them common for the build and IT pipeline|
+|[#1935](https://github.com/NVIDIA/spark-rapids/pull/1935)|Update Alias SQL quoting and float-to-timestamp casting to match Spark 3.2|
+|[#1926](https://github.com/NVIDIA/spark-rapids/pull/1926)|Consolidate RAT settings in parent pom|
+|[#1918](https://github.com/NVIDIA/spark-rapids/pull/1918)|Minor code cleanup in dateTImeExpressions|
+|[#1906](https://github.com/NVIDIA/spark-rapids/pull/1906)|Remove get call on timeZoneId|
+|[#1908](https://github.com/NVIDIA/spark-rapids/pull/1908)|Remove the Scala version of Mortgage ETL tests from nightly test|
+|[#1894](https://github.com/NVIDIA/spark-rapids/pull/1894)|Modified Download Page to re-order the items and change the format of download links|
+|[#1909](https://github.com/NVIDIA/spark-rapids/pull/1909)|Avoid pinned memory for shuffle host buffers|
+|[#1891](https://github.com/NVIDIA/spark-rapids/pull/1891)|Connect UCX endpoints early during app startup|
+|[#1877](https://github.com/NVIDIA/spark-rapids/pull/1877)|remove docker build in pre-merge [skip ci]|
+|[#1830](https://github.com/NVIDIA/spark-rapids/pull/1830)|Enable the tests for collect over window.|
+|[#1882](https://github.com/NVIDIA/spark-rapids/pull/1882)|GpuArrowColumnarBatchBuilder retains the references of ArrowBuf until HostToGpuCoalesceIterator put them into device|
+|[#1868](https://github.com/NVIDIA/spark-rapids/pull/1868)|Increase row limit when doing count() for HostColumnarToGpu |
+|[#1855](https://github.com/NVIDIA/spark-rapids/pull/1855)|Expose row count statistics in GpuShuffleExchangeExec|
+|[#1875](https://github.com/NVIDIA/spark-rapids/pull/1875)|Fix merge conflict with branch-0.4|
+|[#1841](https://github.com/NVIDIA/spark-rapids/pull/1841)|Add in support for DateAddInterval|
+|[#1869](https://github.com/NVIDIA/spark-rapids/pull/1869)|Fix tests for Spark 3.2.0 shim|
+|[#1858](https://github.com/NVIDIA/spark-rapids/pull/1858)|fix shuffle manager doc on ucx library path|
+|[#1836](https://github.com/NVIDIA/spark-rapids/pull/1836)|Add shim for Spark 3.1.2|
+|[#1852](https://github.com/NVIDIA/spark-rapids/pull/1852)|Fix Part Suite Tests|
+|[#1616](https://github.com/NVIDIA/spark-rapids/pull/1616)|Cost-based optimizer|
+|[#1834](https://github.com/NVIDIA/spark-rapids/pull/1834)|Add shim for Spark 3.0.3|
+|[#1839](https://github.com/NVIDIA/spark-rapids/pull/1839)|Refactor join code to reduce duplicated code|
+|[#1848](https://github.com/NVIDIA/spark-rapids/pull/1848)|Fix merge conflict with branch-0.4|
+|[#1796](https://github.com/NVIDIA/spark-rapids/pull/1796)|Have most of range partitioning run on the GPU|
+|[#1845](https://github.com/NVIDIA/spark-rapids/pull/1845)|Fix fails on the mortgage ETL test|
+|[#1829](https://github.com/NVIDIA/spark-rapids/pull/1829)|Cleanup unused Jenkins files and scripts|
+|[#1704](https://github.com/NVIDIA/spark-rapids/pull/1704)|Create a shim for Spark 3.2.0 development|
+|[#1838](https://github.com/NVIDIA/spark-rapids/pull/1838)|Make databricks build.sh more convenient for dev|
+|[#1835](https://github.com/NVIDIA/spark-rapids/pull/1835)|Fix merge conflict with branch-0.4|
+|[#1808](https://github.com/NVIDIA/spark-rapids/pull/1808)|Update mortgage tests to support reading multiple dataset formats|
+|[#1822](https://github.com/NVIDIA/spark-rapids/pull/1822)|Fix conflict 0.4 to 0.5|
+|[#1807](https://github.com/NVIDIA/spark-rapids/pull/1807)|Fix merge conflict between branch-0.4 and branch-0.5|
+|[#1788](https://github.com/NVIDIA/spark-rapids/pull/1788)|Spill metrics everywhere|
+|[#1719](https://github.com/NVIDIA/spark-rapids/pull/1719)|Add in out of core sort|
+|[#1728](https://github.com/NVIDIA/spark-rapids/pull/1728)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
+|[#1689](https://github.com/NVIDIA/spark-rapids/pull/1689)|Update docs for plugin 0.5.0-SNAPSHOT and cudf 0.19-SNAPSHOT|
+|[#1682](https://github.com/NVIDIA/spark-rapids/pull/1682)|init CI/CD dependencies branch-0.5|
+
+## Release 0.4.1
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#1985](https://github.com/NVIDIA/spark-rapids/issues/1985)|[BUG] broadcast exchange can fail on 0.4|
+
+### PRs
+|||
+|:---|:---|
+|[#1995](https://github.com/NVIDIA/spark-rapids/pull/1995)|update changelog 0.4.1 [skip ci]|
+|[#1990](https://github.com/NVIDIA/spark-rapids/pull/1990)|Prepare for v0.4.1 release|
+|[#1988](https://github.com/NVIDIA/spark-rapids/pull/1988)|broadcast exchange can fail when job group set|
+
+## Release 0.4
+
+### Features
+|||
+|:---|:---|
+|[#1773](https://github.com/NVIDIA/spark-rapids/issues/1773)|[FEA] Spark 3.0.2 release support|
+|[#80](https://github.com/NVIDIA/spark-rapids/issues/80)|[FEA] Support the struct SQL function|
+|[#76](https://github.com/NVIDIA/spark-rapids/issues/76)|[FEA] Support CreateArray|
+|[#1635](https://github.com/NVIDIA/spark-rapids/issues/1635)|[FEA] RAPIDS accelerated Java UDF|
+|[#1333](https://github.com/NVIDIA/spark-rapids/issues/1333)|[FEA] Support window operations on Decimal|
+|[#1419](https://github.com/NVIDIA/spark-rapids/issues/1419)|[FEA] Support GPU accelerated UDF alternative for higher order function "aggregate" over window|
+|[#1580](https://github.com/NVIDIA/spark-rapids/issues/1580)|[FEA] Support Decimal for ParquetCachedBatchSerializer|
+|[#1600](https://github.com/NVIDIA/spark-rapids/issues/1600)|[FEA] Support ScalarSubquery|
+|[#1072](https://github.com/NVIDIA/spark-rapids/issues/1072)|[FEA] Support for a custom DataSource V2 which supplies Arrow data|
+|[#906](https://github.com/NVIDIA/spark-rapids/issues/906)|[FEA] Clarify query explanation to directly state what will run on GPU|
+|[#1335](https://github.com/NVIDIA/spark-rapids/issues/1335)|[FEA] Support CollectLimitExec for decimal|
+|[#1485](https://github.com/NVIDIA/spark-rapids/issues/1485)|[FEA] Decimal Support for Parquet Write|
+|[#1329](https://github.com/NVIDIA/spark-rapids/issues/1329)|[FEA] Decimal support for multiply int div, add, subtract and null safe equals|
+|[#1351](https://github.com/NVIDIA/spark-rapids/issues/1351)|[FEA] Execute UDFs that provide a RAPIDS execution path|
+|[#1330](https://github.com/NVIDIA/spark-rapids/issues/1330)|[FEA] Support Decimal Casts|
+|[#1353](https://github.com/NVIDIA/spark-rapids/issues/1353)|[FEA] Example of RAPIDS UDF using custom GPU code|
+|[#1487](https://github.com/NVIDIA/spark-rapids/issues/1487)|[FEA] Change spark 3.1.0 to 3.1.1|
+|[#1334](https://github.com/NVIDIA/spark-rapids/issues/1334)|[FEA] Add support for count aggregate on decimal|
+|[#1325](https://github.com/NVIDIA/spark-rapids/issues/1325)|[FEA] Add in join support for decimal|
+|[#1326](https://github.com/NVIDIA/spark-rapids/issues/1326)|[FEA] Add in Broadcast support for decimal values|
+|[#37](https://github.com/NVIDIA/spark-rapids/issues/37)|[FEA] round and bround SQL functions|
+|[#78](https://github.com/NVIDIA/spark-rapids/issues/78)|[FEA] Support CreateNamedStruct function|
+|[#1331](https://github.com/NVIDIA/spark-rapids/issues/1331)|[FEA] UnionExec and ExpandExec support for decimal|
+|[#1332](https://github.com/NVIDIA/spark-rapids/issues/1332)|[FEA] Support CaseWhen, Coalesce and IfElse for decimal|
+|[#937](https://github.com/NVIDIA/spark-rapids/issues/937)|[FEA] have murmur3 hash function that matches exactly with spark|
+|[#1324](https://github.com/NVIDIA/spark-rapids/issues/1324)|[FEA] Support Parquet Read of Decimal FIXED_LENGTH_BYTE_ARRAY|
+|[#1428](https://github.com/NVIDIA/spark-rapids/issues/1428)|[FEA] Add support for unary decimal operations abs, floor, ceil, unary - and unary +|
+|[#1375](https://github.com/NVIDIA/spark-rapids/issues/1375)|[FEA] Add log statement for what the concurrentGpuTasks tasks is set to on executor startup|
+|[#1352](https://github.com/NVIDIA/spark-rapids/issues/1352)|[FEA] Example of RAPIDS UDF using cudf Java APIs|
+|[#1328](https://github.com/NVIDIA/spark-rapids/issues/1328)|[FEA] Support sorting and shuffle of decimal|
+|[#1316](https://github.com/NVIDIA/spark-rapids/issues/1316)|[FEA] Support simple DECIMAL aggregates|
+
+### Performance
+|||
+|:---|:---|
+|[#1435](https://github.com/NVIDIA/spark-rapids/issues/1435)|[FEA]Improve the file reading by using local file caching|
+|[#1738](https://github.com/NVIDIA/spark-rapids/issues/1738)|[FEA] Reduce regex usage in CAST string to date/timestamp|
+|[#987](https://github.com/NVIDIA/spark-rapids/issues/987)|[FEA] Optimize CAST from string to temporal types by using cuDF is_timestamp function|
+|[#1594](https://github.com/NVIDIA/spark-rapids/issues/1594)|[FEA] RAPIDS accelerated ScalaUDF|
+|[#103](https://github.com/NVIDIA/spark-rapids/issues/103)|[FEA] GPU version of TakeOrderedAndProject|
+|[#1024](https://github.com/NVIDIA/spark-rapids/issues/1024)|Cleanup RAPIDS transport calls to `receive`|
+|[#1366](https://github.com/NVIDIA/spark-rapids/issues/1366)|Seeing performance differences of multi-threaded/coalesce/perfile Parquet reader type for a single file|
+|[#1200](https://github.com/NVIDIA/spark-rapids/issues/1200)|[FEA] Accelerate the scan speed for coalescing parquet reader when reading files from multiple partitioned folders|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#1885](https://github.com/NVIDIA/spark-rapids/issues/1885)|[BUG] natural join on string key results in a data frame with spurious NULLs|
+|[#1785](https://github.com/NVIDIA/spark-rapids/issues/1785)|[BUG] Rapids pytest integration tests FAILED on Yarn cluster with unrecognized arguments: `--std_input_path=src/test/resources/`|
+|[#999](https://github.com/NVIDIA/spark-rapids/issues/999)|[BUG] test_multi_types_window_aggs_for_rows_lead_lag fails against Spark 3.1.0|
+|[#1818](https://github.com/NVIDIA/spark-rapids/issues/1818)|[BUG] unmoored doc comment warnings in GpuCast|
+|[#1817](https://github.com/NVIDIA/spark-rapids/issues/1817)|[BUG] Developer build with local modifications fails during verify phase|
+|[#1644](https://github.com/NVIDIA/spark-rapids/issues/1644)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks|
+|[#1771](https://github.com/NVIDIA/spark-rapids/issues/1771)|[BUG] Databricks AWS CI/CD failing to create cluster|
+|[#1157](https://github.com/NVIDIA/spark-rapids/issues/1157)|[BUG] Fix regression supporting to_date on GPU with Spark 3.1.0|
+|[#716](https://github.com/NVIDIA/spark-rapids/issues/716)|[BUG] Cast String to TimeStamp issues|
+|[#1117](https://github.com/NVIDIA/spark-rapids/issues/1117)|[BUG] CAST string to date returns wrong values for dates with out-of-range values|
+|[#1670](https://github.com/NVIDIA/spark-rapids/issues/1670)|[BUG] Some TPC-DS queries fail with AQE when decimal types enabled|
+|[#1730](https://github.com/NVIDIA/spark-rapids/issues/1730)|[BUG] Range Partitioning can crash when processing is in the order-by|
+|[#1726](https://github.com/NVIDIA/spark-rapids/issues/1726)|[BUG] java url decode test failing on databricks, emr, and dataproc|
+|[#1651](https://github.com/NVIDIA/spark-rapids/issues/1651)|[BUG] GDS exception when writing shuffle file|
+|[#1702](https://github.com/NVIDIA/spark-rapids/issues/1702)|[BUG] check all tests marked xfail for Spark 3.1.1|
+|[#575](https://github.com/NVIDIA/spark-rapids/issues/575)|[BUG] Spark 3.1 FAILED join_test.py::test_broadcast_join_mixed[FullOuter][IGNORE_ORDER] failed|
+|[#577](https://github.com/NVIDIA/spark-rapids/issues/577)|[BUG] Spark 3.1 log arithmetic functions fail|
+|[#1541](https://github.com/NVIDIA/spark-rapids/issues/1541)|[BUG] Tests fail in integration in distributed mode after allowing nested types through in sort and shuffle|
+|[#1626](https://github.com/NVIDIA/spark-rapids/issues/1626)|[BUG] TPC-DS-like query 77 at scale=3TB fails with maxResultSize exceeded error|
+|[#1576](https://github.com/NVIDIA/spark-rapids/issues/1576)|[BUG] loading SPARK-32639 example parquet file triggers a JVM crash |
+|[#1643](https://github.com/NVIDIA/spark-rapids/issues/1643)|[BUG] TPC-DS-Like q10, q35, and q69 - slow or hanging at leftSemiJoin|
+|[#1650](https://github.com/NVIDIA/spark-rapids/issues/1650)|[BUG] BenchmarkRunner does not include query name in JSON summary filename when running multiple queries|
+|[#1654](https://github.com/NVIDIA/spark-rapids/issues/1654)|[BUG] TPC-DS-like query 59 at scale=3TB with AQE fails with join mismatch|
+|[#1274](https://github.com/NVIDIA/spark-rapids/issues/1274)|[BUG] OutOfMemoryError - Maximum pool size exceeded while running 24 day criteo ETL Transform stage|
+|[#1497](https://github.com/NVIDIA/spark-rapids/issues/1497)|[BUG] Spark-rapids v0.3.0 pytest integration tests with UCX on FAILED on Yarn cluster|
+|[#1534](https://github.com/NVIDIA/spark-rapids/issues/1534)|[BUG] Spark 3.1.1 test failure in writing due to removal of InMemoryFileIndex.shouldFilterOut|
+|[#1155](https://github.com/NVIDIA/spark-rapids/issues/1155)|[BUG] on shutdown don't print `Socket closed` exception when shutting down UCX.scala|
+|[#1510](https://github.com/NVIDIA/spark-rapids/issues/1510)|[BUG] IllegalArgumentException during shuffle|
+|[#1513](https://github.com/NVIDIA/spark-rapids/issues/1513)|[BUG] executor not fully initialized may get calls from Spark, in the process setting the `catalog` incorrectly|
+|[#1466](https://github.com/NVIDIA/spark-rapids/issues/1466)|[BUG] Databricks build must run before the rapids nightly|
+|[#1456](https://github.com/NVIDIA/spark-rapids/issues/1456)|[BUG] Databricks 0.4 parquet integration tests fail|
+|[#1400](https://github.com/NVIDIA/spark-rapids/issues/1400)|[BUG] Regressions in spark-shell usage of benchmark utilities|
+|[#1119](https://github.com/NVIDIA/spark-rapids/issues/1119)|[BUG] inner join fails with Column size cannot be negative|
+|[#1079](https://github.com/NVIDIA/spark-rapids/issues/1079)|[BUG]The Scala UDF function cannot invoke the UDF compiler when it's passed to "explode"|
+|[#1298](https://github.com/NVIDIA/spark-rapids/issues/1298)|TPCxBB query16 failed at UnsupportedOperationException: org.apache.parquet.column.values.dictionary.PlainValuesDictionary$PlainIntegerDictionary|
+|[#1271](https://github.com/NVIDIA/spark-rapids/issues/1271)|[BUG] CastOpSuite and AnsiCastOpSuite failing with ArithmeticException on Spark 3.1|
+|[#84](https://github.com/NVIDIA/spark-rapids/issues/84)|[BUG] sort does not match spark for -0.0 and 0.0|
+|[#578](https://github.com/NVIDIA/spark-rapids/issues/578)|[BUG] Spark 3.1 qa_nightly_select_test.py Full join test failures|
+|[#586](https://github.com/NVIDIA/spark-rapids/issues/586)|[BUG] Spark3.1 tpch failures|
+|[#837](https://github.com/NVIDIA/spark-rapids/issues/837)|[BUG] Distinct count of floating point values differs with regular spark|
+|[#953](https://github.com/NVIDIA/spark-rapids/issues/953)|[BUG] 3.1.0 pos_explode tests are failing|
+|[#127](https://github.com/NVIDIA/spark-rapids/issues/127)|[BUG] String CSV parsing does not respect nullValues|
+|[#1203](https://github.com/NVIDIA/spark-rapids/issues/1203)|[BUG] tpcds query 51 fails with join error on Spark 3.1.0|
+|[#750](https://github.com/NVIDIA/spark-rapids/issues/750)|[BUG] udf_cudf_test::test_with_column fails with IPC error |
+|[#1348](https://github.com/NVIDIA/spark-rapids/issues/1348)|[BUG] Host columnar decimal conversions are failing|
+|[#1270](https://github.com/NVIDIA/spark-rapids/issues/1270)|[BUG] Benchmark runner fails to produce report if benchmark fails due to an invalid query plan|
+|[#1179](https://github.com/NVIDIA/spark-rapids/issues/1179)|[BUG] SerializeConcatHostBuffersDeserializeBatch may have thread issues|
+|[#1115](https://github.com/NVIDIA/spark-rapids/issues/1115)|[BUG] Unchecked type warning in SparkQueryCompareTestSuite|
+
+### PRs
+|||
+|:---|:---|
+|[#1963](https://github.com/NVIDIA/spark-rapids/pull/1963)|Update changelog 0.4 [skip ci]|
+|[#1960](https://github.com/NVIDIA/spark-rapids/pull/1960)|Replace sonatype staging link with maven central link|
+|[#1945](https://github.com/NVIDIA/spark-rapids/pull/1945)|Update changelog 0.4 [skip ci]|
+|[#1910](https://github.com/NVIDIA/spark-rapids/pull/1910)|Make hash partitioning match CPU|
+|[#1927](https://github.com/NVIDIA/spark-rapids/pull/1927)|Change cuDF dependency to 0.18.1|
+|[#1934](https://github.com/NVIDIA/spark-rapids/pull/1934)|Update documentation to use cudf version 0.18.1|
+|[#1871](https://github.com/NVIDIA/spark-rapids/pull/1871)|Disable coalesce batch spilling to avoid cudf contiguous_split bug|
+|[#1849](https://github.com/NVIDIA/spark-rapids/pull/1849)|Update changelog for 0.4|
+|[#1744](https://github.com/NVIDIA/spark-rapids/pull/1744)|Fix NullPointerException on null partition insert|
+|[#1842](https://github.com/NVIDIA/spark-rapids/pull/1842)|Update to note support for 3.0.2|
+|[#1832](https://github.com/NVIDIA/spark-rapids/pull/1832)|Spark 3.1.1 shim no longer a snapshot shim|
+|[#1831](https://github.com/NVIDIA/spark-rapids/pull/1831)|Spark 3.0.2 shim no longer a snapshot shim|
+|[#1826](https://github.com/NVIDIA/spark-rapids/pull/1826)|Remove benchmarks|
+|[#1828](https://github.com/NVIDIA/spark-rapids/pull/1828)|Update cudf dependency to 0.18|
+|[#1813](https://github.com/NVIDIA/spark-rapids/pull/1813)|Fix LEAD/LAG failures in Spark 3.1.1|
+|[#1819](https://github.com/NVIDIA/spark-rapids/pull/1819)|Fix scaladoc warning in GpuCast|
+|[#1820](https://github.com/NVIDIA/spark-rapids/pull/1820)|[BUG] make modified check pre-merge only|
+|[#1780](https://github.com/NVIDIA/spark-rapids/pull/1780)|Remove SNAPSHOT from test and integration_test READMEs|
+|[#1809](https://github.com/NVIDIA/spark-rapids/pull/1809)|check if modified files after update_config/supported|
+|[#1804](https://github.com/NVIDIA/spark-rapids/pull/1804)|Update UCX documentation for RX_QUEUE_LEN and Docker|
+|[#1810](https://github.com/NVIDIA/spark-rapids/pull/1810)|Pandas UDF: Sort the data before computing the sum.|
+|[#1751](https://github.com/NVIDIA/spark-rapids/pull/1751)|Exclude foldable expressions from GPU if constant folding is disabled|
+|[#1798](https://github.com/NVIDIA/spark-rapids/pull/1798)|Add documentation about explain not on GPU when AQE is on|
+|[#1766](https://github.com/NVIDIA/spark-rapids/pull/1766)|Branch 0.4 release docs|
+|[#1794](https://github.com/NVIDIA/spark-rapids/pull/1794)|Build python output schema from udf expressions|
+|[#1783](https://github.com/NVIDIA/spark-rapids/pull/1783)|Fix the collect_list over window tests failures on db|
+|[#1781](https://github.com/NVIDIA/spark-rapids/pull/1781)|Better float/double cases for casting tests|
+|[#1790](https://github.com/NVIDIA/spark-rapids/pull/1790)|Record row counts in benchmark runs that call collect|
+|[#1779](https://github.com/NVIDIA/spark-rapids/pull/1779)|Add support of DateType and TimestampType for GetTimestamp expression|
+|[#1768](https://github.com/NVIDIA/spark-rapids/pull/1768)|Updating getting started Databricks docs|
+|[#1742](https://github.com/NVIDIA/spark-rapids/pull/1742)|Fix regression supporting to_date with Spark-3.1|
+|[#1775](https://github.com/NVIDIA/spark-rapids/pull/1775)|Fix ambiguous ordering for some tests|
+|[#1760](https://github.com/NVIDIA/spark-rapids/pull/1760)|Update GpuDataSourceScanExec and GpuBroadcastExchangeExec to fix audit issues|
+|[#1750](https://github.com/NVIDIA/spark-rapids/pull/1750)|Detect task failures in benchmarks|
+|[#1767](https://github.com/NVIDIA/spark-rapids/pull/1767)|Consistent Spark version for test and production|
+|[#1741](https://github.com/NVIDIA/spark-rapids/pull/1741)|Reduce regex use in CAST|
+|[#1756](https://github.com/NVIDIA/spark-rapids/pull/1756)|Skip RAPIDS accelerated Java UDF tests if UDF fails to load|
+|[#1716](https://github.com/NVIDIA/spark-rapids/pull/1716)|Update RapidsShuffleManager documentation for branch 0.4|
+|[#1740](https://github.com/NVIDIA/spark-rapids/pull/1740)|Disable ORC writes until bug can be fixed|
+|[#1747](https://github.com/NVIDIA/spark-rapids/pull/1747)|Fix resource leaks in unit tests|
+|[#1725](https://github.com/NVIDIA/spark-rapids/pull/1725)|Branch 0.4 FAQ reorg|
+|[#1718](https://github.com/NVIDIA/spark-rapids/pull/1718)|CAST string to temporal type now calls isTimestamp|
+|[#1734](https://github.com/NVIDIA/spark-rapids/pull/1734)|Disable range partitioning if computation is needed|
+|[#1723](https://github.com/NVIDIA/spark-rapids/pull/1723)|Removed StructTypes support for ParquetCachedBatchSerializer as cudf doesn't support it yet|
+|[#1714](https://github.com/NVIDIA/spark-rapids/pull/1714)|Add support for RAPIDS accelerated Java UDFs|
+|[#1713](https://github.com/NVIDIA/spark-rapids/pull/1713)|Call GpuDeviceManager.shutdown when the executor plugin is shutting down|
+|[#1596](https://github.com/NVIDIA/spark-rapids/pull/1596)|Added in Decimal support to ParquetCachedBatchSerializer|
+|[#1706](https://github.com/NVIDIA/spark-rapids/pull/1706)|cleanup unused is_before_spark_310|
+|[#1685](https://github.com/NVIDIA/spark-rapids/pull/1685)|Fix CustomShuffleReader replacement when decimal types enabled|
+|[#1699](https://github.com/NVIDIA/spark-rapids/pull/1699)|Add docs about Spark 3.1 in standalone modes not needing extra class path|
+|[#1701](https://github.com/NVIDIA/spark-rapids/pull/1701)|remove xfail for orc test_input_meta for spark 3.1.0|
+|[#1703](https://github.com/NVIDIA/spark-rapids/pull/1703)|Remove xfail for spark 3.1.0 test_broadcast_join_mixed FullOuter|
+|[#1676](https://github.com/NVIDIA/spark-rapids/pull/1676)|BenchmarkRunner option to generate query plan diagrams in DOT format|
+|[#1695](https://github.com/NVIDIA/spark-rapids/pull/1695)|support alternate jar paths|
+|[#1694](https://github.com/NVIDIA/spark-rapids/pull/1694)|increase mem and limit parallelism for pre-merge|
+|[#1691](https://github.com/NVIDIA/spark-rapids/pull/1691)|add validate_execs_in_gpu_plan to pytest.ini|
+|[#1692](https://github.com/NVIDIA/spark-rapids/pull/1692)|Add the integration test resources to the test tarball|
+|[#1677](https://github.com/NVIDIA/spark-rapids/pull/1677)|When PTDS is enabled, print warning if the allocator is not ARENA|
+|[#1683](https://github.com/NVIDIA/spark-rapids/pull/1683)|update changelog to verify autotmerge 0.5 setup [skip ci]|
+|[#1673](https://github.com/NVIDIA/spark-rapids/pull/1673)|support auto-merge for branch 0.5 [skip ci]|
+|[#1681](https://github.com/NVIDIA/spark-rapids/pull/1681)|Xfail the collect_list tests for databricks|
+|[#1678](https://github.com/NVIDIA/spark-rapids/pull/1678)|Fix array/struct checks in Sort and HashAggregate and sorting tests in distributed mode|
+|[#1671](https://github.com/NVIDIA/spark-rapids/pull/1671)|Allow metrics to be configurable by level|
+|[#1675](https://github.com/NVIDIA/spark-rapids/pull/1675)|add run_pyspark_from_build.sh to the pytest distribution tarball|
+|[#1548](https://github.com/NVIDIA/spark-rapids/pull/1548)|Support executing collect_list on GPU with windowing.|
+|[#1593](https://github.com/NVIDIA/spark-rapids/pull/1593)|Avoid unnecessary Table instances after contiguous split|
+|[#1592](https://github.com/NVIDIA/spark-rapids/pull/1592)|Add in support for Decimal divide|
+|[#1668](https://github.com/NVIDIA/spark-rapids/pull/1668)|Implement way for python integration tests to validate Exec is in GPU plan|
+|[#1669](https://github.com/NVIDIA/spark-rapids/pull/1669)|Add FAQ entries for executor-per-GPU questions|
+|[#1661](https://github.com/NVIDIA/spark-rapids/pull/1661)|Enable Parquet test for file containing map struct key|
+|[#1664](https://github.com/NVIDIA/spark-rapids/pull/1664)|Filter nulls for left semi and left anti join to work around cudf|
+|[#1665](https://github.com/NVIDIA/spark-rapids/pull/1665)|Add better automated tests for Arrow columnar copy in HostColumnarToGpu|
+|[#1614](https://github.com/NVIDIA/spark-rapids/pull/1614)|add alluxio getting start document|
+|[#1639](https://github.com/NVIDIA/spark-rapids/pull/1639)|support GpuScalarSubquery|
+|[#1656](https://github.com/NVIDIA/spark-rapids/pull/1656)|Move UDF to Catalyst Expressions to its own document|
+|[#1663](https://github.com/NVIDIA/spark-rapids/pull/1663)|BenchmarkRunner - Include query name in JSON summary filename|
+|[#1655](https://github.com/NVIDIA/spark-rapids/pull/1655)|Fix extraneous shuffles added by AQE|
+|[#1652](https://github.com/NVIDIA/spark-rapids/pull/1652)|Fix typo in arrow optimized config name - spark.rapids.arrowCopyOptimizationEnabled|
+|[#1645](https://github.com/NVIDIA/spark-rapids/pull/1645)|Run Databricks IT with python-xdist parallel, includes test fixes and xfail|
+|[#1649](https://github.com/NVIDIA/spark-rapids/pull/1649)|Move building from source docs to contributing guide|
+|[#1637](https://github.com/NVIDIA/spark-rapids/pull/1637)|Fail DivModLike on zero divisor in ANSI mode|
+|[#1646](https://github.com/NVIDIA/spark-rapids/pull/1646)|Update links in rapids-udfs.md after moving to subfolder|
+|[#1641](https://github.com/NVIDIA/spark-rapids/pull/1641)|Xfail struct and array order by tests on Dataproc|
+|[#1565](https://github.com/NVIDIA/spark-rapids/pull/1565)|Add GPU accelerated array_contains operator|
+|[#1617](https://github.com/NVIDIA/spark-rapids/pull/1617)|Enable nightly test checks for Apache Spark|
+|[#1636](https://github.com/NVIDIA/spark-rapids/pull/1636)|RAPIDS accelerated Spark Scala UDF support|
+|[#1634](https://github.com/NVIDIA/spark-rapids/pull/1634)|Fix databricks build since Arrow code added|
+|[#1599](https://github.com/NVIDIA/spark-rapids/pull/1599)|Add division by zero tests for Spark 3.1 behavior|
+|[#1619](https://github.com/NVIDIA/spark-rapids/pull/1619)|Update GpuFileSourceScanExec to be in sync with DataSourceScanExec|
+|[#1631](https://github.com/NVIDIA/spark-rapids/pull/1631)|Explicitly add maven-jar-plugin version to improve incremental build time.|
+|[#1624](https://github.com/NVIDIA/spark-rapids/pull/1624)|Update explain format to show what will and will not run on the GPU|
+|[#1622](https://github.com/NVIDIA/spark-rapids/pull/1622)|Support faster copy for a custom DataSource V2 which supplies Arrow data|
+|[#1621](https://github.com/NVIDIA/spark-rapids/pull/1621)|Additional functionality docs|
+|[#1618](https://github.com/NVIDIA/spark-rapids/pull/1618)|update blossom-ci for security updates [skip ci]|
+|[#1562](https://github.com/NVIDIA/spark-rapids/pull/1562)|add alluxio support|
+|[#1597](https://github.com/NVIDIA/spark-rapids/pull/1597)|Documentation for Parquet serializer|
+|[#1611](https://github.com/NVIDIA/spark-rapids/pull/1611)|Add in flag for integration tests to not skip required tests|
+|[#1609](https://github.com/NVIDIA/spark-rapids/pull/1609)|Disable float round/bround by default|
+|[#1615](https://github.com/NVIDIA/spark-rapids/pull/1615)|Add in window support for average|
+|[#1610](https://github.com/NVIDIA/spark-rapids/pull/1610)|Limit length of spark app name in BenchmarkRunner|
+|[#1579](https://github.com/NVIDIA/spark-rapids/pull/1579)|Support TakeOrderedAndProject|
+|[#1581](https://github.com/NVIDIA/spark-rapids/pull/1581)|Support Decimal type for CollectLimitExec|
+|[#1591](https://github.com/NVIDIA/spark-rapids/pull/1591)|Add support for running multiple queries in BenchmarkRunner|
+|[#1595](https://github.com/NVIDIA/spark-rapids/pull/1595)|Fix Github documentation issue template|
+|[#1577](https://github.com/NVIDIA/spark-rapids/pull/1577)|rename directory from spark310 to spark311|
+|[#1578](https://github.com/NVIDIA/spark-rapids/pull/1578)|Test to track RAPIDS-side issues re SPARK-32639|
+|[#1583](https://github.com/NVIDIA/spark-rapids/pull/1583)|fix request-action issue [skip ci]|
+|[#1555](https://github.com/NVIDIA/spark-rapids/pull/1555)|Enable ANSI mode for CAST string to timestamp|
+|[#1531](https://github.com/NVIDIA/spark-rapids/pull/1531)|Decimal Support for writing Parquet|
+|[#1545](https://github.com/NVIDIA/spark-rapids/pull/1545)|Support comparing ORC data|
+|[#1570](https://github.com/NVIDIA/spark-rapids/pull/1570)|Branch 0.4 doc cleanup|
+|[#1569](https://github.com/NVIDIA/spark-rapids/pull/1569)|Add shim method shouldIgnorePath|
+|[#1564](https://github.com/NVIDIA/spark-rapids/pull/1564)|Add in support for Decimal Multiply and DIV|
+|[#1561](https://github.com/NVIDIA/spark-rapids/pull/1561)|Decimal support for add and subtract|
+|[#1560](https://github.com/NVIDIA/spark-rapids/pull/1560)|support sum in window aggregation for decimal|
+|[#1546](https://github.com/NVIDIA/spark-rapids/pull/1546)|Cleanup shutdown logging for UCX shuffle|
+|[#1551](https://github.com/NVIDIA/spark-rapids/pull/1551)|RAPIDS-accelerated Hive UDFs support all types|
+|[#1543](https://github.com/NVIDIA/spark-rapids/pull/1543)|Shuffle/transport enabled by default|
+|[#1552](https://github.com/NVIDIA/spark-rapids/pull/1552)|Disable blackduck signature check|
+|[#1540](https://github.com/NVIDIA/spark-rapids/pull/1540)|Handle ShuffleManager api calls when plugin is not fully initialized|
+|[#1547](https://github.com/NVIDIA/spark-rapids/pull/1547)|Cleanup shuffle transport receive calls|
+|[#1512](https://github.com/NVIDIA/spark-rapids/pull/1512)|Support window operations on Decimal|
+|[#1532](https://github.com/NVIDIA/spark-rapids/pull/1532)|Support casting from decimal to decimal|
+|[#1542](https://github.com/NVIDIA/spark-rapids/pull/1542)|Change the number of partitions to zero when a range is empty|
+|[#1506](https://github.com/NVIDIA/spark-rapids/pull/1506)|Add --use-decimals flag to TPC-DS ConvertFiles|
+|[#1511](https://github.com/NVIDIA/spark-rapids/pull/1511)|Remove unused Jenkinsfiles [skip ci]|
+|[#1505](https://github.com/NVIDIA/spark-rapids/pull/1505)|Add least, greatest and eqNullSafe support for DecimalType|
+|[#1484](https://github.com/NVIDIA/spark-rapids/pull/1484)|add doc for nsight systems bundled with cuda toolkit|
+|[#1478](https://github.com/NVIDIA/spark-rapids/pull/1478)|Documentation for RAPIDS-accelerated Hive UDFs|
+|[#1477](https://github.com/NVIDIA/spark-rapids/pull/1477)|Allow structs and arrays to pass through for Shuffle and Sort |
+|[#1489](https://github.com/NVIDIA/spark-rapids/pull/1489)|Adds in some support for the array sql function|
+|[#1438](https://github.com/NVIDIA/spark-rapids/pull/1438)|Cast from numeric types to decimal type|
+|[#1493](https://github.com/NVIDIA/spark-rapids/pull/1493)|Moved ParquetRecordMaterializer to the shim package to follow convention|
+|[#1495](https://github.com/NVIDIA/spark-rapids/pull/1495)|Fix merge conflict, merge branch 0.3 to branch 0.4 [skip ci]|
+|[#1472](https://github.com/NVIDIA/spark-rapids/pull/1472)|Add an example RAPIDS-accelerated Hive UDF using native code|
+|[#1488](https://github.com/NVIDIA/spark-rapids/pull/1488)|Rename Spark 3.1.0 shim to Spark 3.1.1 to match community|
+|[#1474](https://github.com/NVIDIA/spark-rapids/pull/1474)|Fix link|
+|[#1476](https://github.com/NVIDIA/spark-rapids/pull/1476)|DecimalType support for Aggregate Count|
+|[#1475](https://github.com/NVIDIA/spark-rapids/pull/1475)| Join support for DecimalType|
+|[#1244](https://github.com/NVIDIA/spark-rapids/pull/1244)|Support round and bround SQL functions |
+|[#1458](https://github.com/NVIDIA/spark-rapids/pull/1458)|Add in support for struct and named_struct|
+|[#1465](https://github.com/NVIDIA/spark-rapids/pull/1465)|DecimalType support for UnionExec and ExpandExec|
+|[#1450](https://github.com/NVIDIA/spark-rapids/pull/1450)|Add dynamic configs for the spark-rapids IT pipelines|
+|[#1207](https://github.com/NVIDIA/spark-rapids/pull/1207)|Spark SQL hash function using murmur3|
+|[#1457](https://github.com/NVIDIA/spark-rapids/pull/1457)|Support reading decimal columns from parquet files on Databricks|
+|[#1455](https://github.com/NVIDIA/spark-rapids/pull/1455)|Upgrade Scala Maven Plugin to 4.3.0|
+|[#1453](https://github.com/NVIDIA/spark-rapids/pull/1453)|DecimalType support for IfElse and Coalesce|
+|[#1452](https://github.com/NVIDIA/spark-rapids/pull/1452)|Support DecimalType for CaseWhen|
+|[#1444](https://github.com/NVIDIA/spark-rapids/pull/1444)|Improve UX when running benchmarks from Spark shell|
+|[#1294](https://github.com/NVIDIA/spark-rapids/pull/1294)|Support reading decimal columns from parquet files|
+|[#1153](https://github.com/NVIDIA/spark-rapids/pull/1153)|Scala UDF will compile children expressions in Project|
+|[#1416](https://github.com/NVIDIA/spark-rapids/pull/1416)|Optimize mvn dependency download scripts|
+|[#1430](https://github.com/NVIDIA/spark-rapids/pull/1430)|Add project for testing code that requires Spark 3.1.0 or later|
+|[#1425](https://github.com/NVIDIA/spark-rapids/pull/1425)|Add in Decimal support for abs, floor, ceil, unary - and unary +|
+|[#1427](https://github.com/NVIDIA/spark-rapids/pull/1427)|Revert "Make the multi-threaded parquet reader the default"|
+|[#1420](https://github.com/NVIDIA/spark-rapids/pull/1420)|Add udf jar to nightly integration tests|
+|[#1422](https://github.com/NVIDIA/spark-rapids/pull/1422)|Log the number of concurrent gpu tasks allowed on Executor startup|
+|[#1401](https://github.com/NVIDIA/spark-rapids/pull/1401)|Accelerate the coalescing parquet reader when reading files from multiple partitioned folders|
+|[#1413](https://github.com/NVIDIA/spark-rapids/pull/1413)|Add config for cast float to integral types|
+|[#1313](https://github.com/NVIDIA/spark-rapids/pull/1313)|Support spilling to disk directly via cuFile/GDS|
+|[#1411](https://github.com/NVIDIA/spark-rapids/pull/1411)|Add udf-examples jar to databricks build|
+|[#1412](https://github.com/NVIDIA/spark-rapids/pull/1412)|Fix a lot of tests marked with xfail for Spark 3.1.0 that no longer fail|
+|[#1414](https://github.com/NVIDIA/spark-rapids/pull/1414)|Build merged code of HEAD and BASE branch for pre-merge [skip ci]|
+|[#1409](https://github.com/NVIDIA/spark-rapids/pull/1409)|Add option to use decimals in tpc-ds csv to parquet conversion|
+|[#1410](https://github.com/NVIDIA/spark-rapids/pull/1410)|Add Decimal support for In, InSet, AtLeastNNonNulls, GetArrayItem, GetStructField, and GenerateExec|
+|[#1408](https://github.com/NVIDIA/spark-rapids/pull/1408)|Support RAPIDS-accelerated HiveGenericUDF|
+|[#1407](https://github.com/NVIDIA/spark-rapids/pull/1407)|Update docs and tests for null CSV support|
+|[#1393](https://github.com/NVIDIA/spark-rapids/pull/1393)|Support RAPIDS-accelerated HiveSimpleUDF|
+|[#1392](https://github.com/NVIDIA/spark-rapids/pull/1392)|Turn on hash partitioning for decimal support|
+|[#1402](https://github.com/NVIDIA/spark-rapids/pull/1402)|Better GPU Cast type checks|
+|[#1404](https://github.com/NVIDIA/spark-rapids/pull/1404)|Fix branch 0.4 merge conflict|
+|[#1323](https://github.com/NVIDIA/spark-rapids/pull/1323)|More advanced type checking and documentation|
+|[#1391](https://github.com/NVIDIA/spark-rapids/pull/1391)|Remove extra null join filtering because cudf is fast for this now.|
+|[#1395](https://github.com/NVIDIA/spark-rapids/pull/1395)|Fix branch-0.3 -> branch-0.4 automerge|
+|[#1382](https://github.com/NVIDIA/spark-rapids/pull/1382)|Handle "MM[/-]dd" and "dd[/-]MM" datetime formats in UnixTimeExprMeta|
+|[#1390](https://github.com/NVIDIA/spark-rapids/pull/1390)|Accelerated columnar to row/row to columnar for decimal|
+|[#1380](https://github.com/NVIDIA/spark-rapids/pull/1380)|Adds in basic support for decimal sort, sum, and some shuffle|
+|[#1367](https://github.com/NVIDIA/spark-rapids/pull/1367)|Reuse gpu expression conversion rules when checking sort order|
+|[#1349](https://github.com/NVIDIA/spark-rapids/pull/1349)|Add canonicalization tests|
+|[#1368](https://github.com/NVIDIA/spark-rapids/pull/1368)|Move to cudf 0.18-SNAPSHOT|
+|[#1361](https://github.com/NVIDIA/spark-rapids/pull/1361)|Use the correct precision when reading spark columnar data.|
+|[#1273](https://github.com/NVIDIA/spark-rapids/pull/1273)|Update docs and scripts to 0.4.0-SNAPSHOT|
+|[#1321](https://github.com/NVIDIA/spark-rapids/pull/1321)|Refactor to stop inheriting from HashJoin|
+|[#1311](https://github.com/NVIDIA/spark-rapids/pull/1311)|ParquetCachedBatchSerializer code cleanup|
+|[#1303](https://github.com/NVIDIA/spark-rapids/pull/1303)|Add explicit outputOrdering for BHJ and SHJ in spark310 shim|
+|[#1299](https://github.com/NVIDIA/spark-rapids/pull/1299)|Benchmark runner improved error handling|
+
+## Release 0.3
+
+### Features
+|||
+|:---|:---|
+|[#1002](https://github.com/NVIDIA/spark-rapids/issues/1002)|[FEA] RapidsHostColumnVectorCore should verify cudf data with respect to the expected spark type |
+|[#444](https://github.com/NVIDIA/spark-rapids/issues/444)|[FEA] Plugable Cache|
+|[#1158](https://github.com/NVIDIA/spark-rapids/issues/1158)|[FEA] Better documentation on type support|
+|[#57](https://github.com/NVIDIA/spark-rapids/issues/57)|[FEA] Support INT96 for parquet reads and writes|
+|[#1003](https://github.com/NVIDIA/spark-rapids/issues/1003)|[FEA] Reduce overlap between RapidsHostColumnVector and RapidsHostColumnVectorCore|
+|[#913](https://github.com/NVIDIA/spark-rapids/issues/913)|[FEA] In Pluggable Cache Support CalendarInterval while creating CachedBatches|
+|[#1092](https://github.com/NVIDIA/spark-rapids/issues/1092)|[FEA] In Pluggable Cache handle nested types having CalendarIntervalType and NullType|
+|[#670](https://github.com/NVIDIA/spark-rapids/issues/670)|[FEA] Support NullType|
+|[#50](https://github.com/NVIDIA/spark-rapids/issues/50)|[FEA] support `spark.sql.legacy.timeParserPolicy`|
+|[#1144](https://github.com/NVIDIA/spark-rapids/issues/1144)|[FEA] Remove Databricks 3.0.0 shim layer|
+|[#1096](https://github.com/NVIDIA/spark-rapids/issues/1096)|[FEA] Implement parquet CreateDataSourceTableAsSelectCommand|
+|[#688](https://github.com/NVIDIA/spark-rapids/issues/688)|[FEA] udf compiler should be auto-appended to `spark.sql.extensions`|
+|[#502](https://github.com/NVIDIA/spark-rapids/issues/502)|[FEA] Support Databricks 7.3 LTS Runtime|
+|[#764](https://github.com/NVIDIA/spark-rapids/issues/764)|[FEA] Sanity checks for cudf jar mismatch|
+|[#1018](https://github.com/NVIDIA/spark-rapids/issues/1018)|[FEA] Log details related to GPU memory fragmentation on GPU OOM|
+|[#619](https://github.com/NVIDIA/spark-rapids/issues/619)|[FEA] log whether libcudf and libcudfjni were built for PTDS|
+|[#905](https://github.com/NVIDIA/spark-rapids/issues/905)|[FEA] create AWS EMR 3.0.1 shim|
+|[#838](https://github.com/NVIDIA/spark-rapids/issues/838)|[FEA] Support window count for a column|
+|[#864](https://github.com/NVIDIA/spark-rapids/issues/864)|[FEA] config option to enable RMM arena memory resource|
+|[#430](https://github.com/NVIDIA/spark-rapids/issues/430)|[FEA] Audit: Parquet Writer support for TIMESTAMP_MILLIS|
+|[#818](https://github.com/NVIDIA/spark-rapids/issues/818)|[FEA] Create shim layer for AWS EMR |
+|[#608](https://github.com/NVIDIA/spark-rapids/issues/608)|[FEA] Parquet small file optimization improve handle merge schema|
+
+### Performance
+|||
+|:---|:---|
+|[#446](https://github.com/NVIDIA/spark-rapids/issues/446)|[FEA] Test jucx in 1.9.x branch|
+|[#1038](https://github.com/NVIDIA/spark-rapids/issues/1038)|[FEA] Accelerate the data transfer for plan `WindowInPandasExec`|
+|[#533](https://github.com/NVIDIA/spark-rapids/issues/533)|[FEA] Improve PTDS performance|
+|[#849](https://github.com/NVIDIA/spark-rapids/issues/849)|[FEA] Have GpuColumnarBatchSerializer return GpuColumnVectorFromBuffer instances|
+|[#784](https://github.com/NVIDIA/spark-rapids/issues/784)|[FEA] Allow Host Spilling to be more dynamic|
+|[#627](https://github.com/NVIDIA/spark-rapids/issues/627)|[FEA] Further parquet reading small file improvements|
+|[#5](https://github.com/NVIDIA/spark-rapids/issues/5)|[FEA] Support Adaptive Execution|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#1423](https://github.com/NVIDIA/spark-rapids/issues/1423)|[BUG] Mortgage ETL sample failed with spark.sql.adaptive enabled on AWS EMR 6.2 |
+|[#1369](https://github.com/NVIDIA/spark-rapids/issues/1369)|[BUG] TPC-DS Query Failing on EMR 6.2 with AQE|
+|[#1344](https://github.com/NVIDIA/spark-rapids/issues/1344)|[BUG] Spark-rapids Pytests failed on On Databricks cluster spark standalone mode|
+|[#1279](https://github.com/NVIDIA/spark-rapids/issues/1279)|[BUG] TPC-DS query 2 failing with NPE|
+|[#1280](https://github.com/NVIDIA/spark-rapids/issues/1280)|[BUG] TPC-DS query 93 failing with UnsupportedOperationException|
+|[#1308](https://github.com/NVIDIA/spark-rapids/issues/1308)|[BUG] TPC-DS query 14a runs much slower on 0.3|
+|[#1284](https://github.com/NVIDIA/spark-rapids/issues/1284)|[BUG] TPC-DS query 77 at scale=1TB fails with maxResultSize exceeded error|
+|[#1061](https://github.com/NVIDIA/spark-rapids/issues/1061)|[BUG] orc_test.py is failing|
+|[#1197](https://github.com/NVIDIA/spark-rapids/issues/1197)|[BUG] java.lang.NullPointerException when exporting delta table|
+|[#685](https://github.com/NVIDIA/spark-rapids/issues/685)|[BUG] In ParquetCachedBatchSerializer, serializing parquet buffers might blow up in certain cases|
+|[#1269](https://github.com/NVIDIA/spark-rapids/issues/1269)|[BUG] GpuSubstring is not expected to be a part of a SortOrder|
+|[#1246](https://github.com/NVIDIA/spark-rapids/issues/1246)|[BUG] Many TPC-DS benchmarks fail when writing to Parquet|
+|[#961](https://github.com/NVIDIA/spark-rapids/issues/961)|[BUG] ORC predicate pushdown should work with case-insensitive analysis|
+|[#962](https://github.com/NVIDIA/spark-rapids/issues/962)|[BUG] Loading columns from an ORC file without column names returns no data|
+|[#1245](https://github.com/NVIDIA/spark-rapids/issues/1245)|[BUG] Code adding buffers to the spillable store should synchronize|
+|[#570](https://github.com/NVIDIA/spark-rapids/issues/570)|[BUG] Continue debugging OOM after ensuring device store is empty|
+|[#972](https://github.com/NVIDIA/spark-rapids/issues/972)|[BUG] total time metric is redundant with scan time|
+|[#1039](https://github.com/NVIDIA/spark-rapids/issues/1039)|[BUG] UNBOUNDED window ranges on null timestamp columns produce incorrect results.|
+|[#1195](https://github.com/NVIDIA/spark-rapids/issues/1195)|[BUG] AcceleratedColumnarToRowIterator queue empty|
+|[#1177](https://github.com/NVIDIA/spark-rapids/issues/1177)|[BUG] leaks possible in the rapids shuffle if batches are received after the task completes|
+|[#1216](https://github.com/NVIDIA/spark-rapids/issues/1216)|[BUG] Failure to recognize ORC file format when loaded via Hive|
+|[#898](https://github.com/NVIDIA/spark-rapids/issues/898)|[BUG] count reductions are failing on databricks due to lack of Complete support|
+|[#1184](https://github.com/NVIDIA/spark-rapids/issues/1184)|[BUG] test_window_aggregate_udf_array_from_python fails on databricks 3.0.1|
+|[#1151](https://github.com/NVIDIA/spark-rapids/issues/1151)|[BUG] Add databricks 3.0.1 shim layer for GpuWindowInPandasExec.|
+|[#1199](https://github.com/NVIDIA/spark-rapids/issues/1199)|[BUG] No data size in Input column in Stages page from Spark UI when using Parquet as file source|
+|[#1031](https://github.com/NVIDIA/spark-rapids/issues/1031)|[BUG] dependency info properties file contains error messages|
+|[#1149](https://github.com/NVIDIA/spark-rapids/issues/1149)|[BUG] Scaladoc warnings in GpuDataSource|
+|[#1185](https://github.com/NVIDIA/spark-rapids/issues/1185)|[BUG] test_hash_multiple_mode_query failing|
+|[#724](https://github.com/NVIDIA/spark-rapids/issues/724)|[BUG] PySpark test_broadcast_nested_loop_join_special_case intermittent failure|
+|[#1164](https://github.com/NVIDIA/spark-rapids/issues/1164)|[BUG] ansi_cast tests are failing in 3.1.0|
+|[#1110](https://github.com/NVIDIA/spark-rapids/issues/1110)|[BUG] Special date "now" has wrong value on GPU|
+|[#1139](https://github.com/NVIDIA/spark-rapids/issues/1139)|[BUG] Host columnar to GPU can be very slow|
+|[#1094](https://github.com/NVIDIA/spark-rapids/issues/1094)|[BUG] unix_timestamp on GPU returns invalid data for special dates|
+|[#1098](https://github.com/NVIDIA/spark-rapids/issues/1098)|[BUG] unix_timestamp on GPU returns invalid data for bad input|
+|[#1082](https://github.com/NVIDIA/spark-rapids/issues/1082)|[BUG] string to timestamp conversion fails with split|
+|[#1140](https://github.com/NVIDIA/spark-rapids/issues/1140)|[BUG] ConcurrentModificationException error after scala test suite completes|
+|[#1073](https://github.com/NVIDIA/spark-rapids/issues/1073)|[BUG] java.lang.RuntimeException: BinaryExpressions must override either eval or nullSafeEval|
+|[#975](https://github.com/NVIDIA/spark-rapids/issues/975)|[BUG] BroadcastExchangeExec fails to fall back to CPU on driver node on GCP Dataproc|
+|[#773](https://github.com/NVIDIA/spark-rapids/issues/773)|[BUG] Investigate high task deserialization|
+|[#1035](https://github.com/NVIDIA/spark-rapids/issues/1035)|[BUG] TPC-DS query 90 with AQE enabled fails with doExecuteBroadcast exception|
+|[#825](https://github.com/NVIDIA/spark-rapids/issues/825)|[BUG] test_window_aggs_for_ranges intermittently fails|
+|[#1008](https://github.com/NVIDIA/spark-rapids/issues/1008)|[BUG] limit function is producing inconsistent result when type is Byte, Long, Boolean and Timestamp|
+|[#996](https://github.com/NVIDIA/spark-rapids/issues/996)|[BUG] TPC-DS benchmark via spark-submit does not provide option to disable appending .dat to path|
+|[#1006](https://github.com/NVIDIA/spark-rapids/issues/1006)|[BUG] Spark 3.1.0 changed BasicWriteTaskStats, breaking BasicColumnarWriteTaskStatsTracker|
+|[#985](https://github.com/NVIDIA/spark-rapids/issues/985)|[BUG] missing metric `dataSize`|
+|[#881](https://github.com/NVIDIA/spark-rapids/issues/881)|[BUG] cannot disable Sort by itself|
+|[#812](https://github.com/NVIDIA/spark-rapids/issues/812)|[BUG] Test failures for 0.2 when run with multiple executors|
+|[#925](https://github.com/NVIDIA/spark-rapids/issues/925)|[BUG] Range window-functions with non-timestamp order-by expressions not falling back to CPU|
+|[#852](https://github.com/NVIDIA/spark-rapids/issues/852)|[BUG] BenchUtils.compareResults cannot compare partitioned files when ignoreOrdering=false|
+|[#868](https://github.com/NVIDIA/spark-rapids/issues/868)|[BUG] Rounding error when casting timestamp to string for timestamps before 1970|
+|[#880](https://github.com/NVIDIA/spark-rapids/issues/880)|[BUG] doing a window operation with an orderby for a single constant crashes|
+|[#776](https://github.com/NVIDIA/spark-rapids/issues/776)|[BUG] Integration test fails on spark 3.1.0-SNAPSHOT|
+|[#874](https://github.com/NVIDIA/spark-rapids/issues/874)|[BUG] `RapidsConf.scala` has an inconsistency for `spark.rapids.sql.format.parquet.multiThreadedRead`|
+|[#860](https://github.com/NVIDIA/spark-rapids/issues/860)|[BUG] we need to mark columns from received shuffle buffers as `GpuColumnVectorFromBuffer`|
+|[#122](https://github.com/NVIDIA/spark-rapids/issues/122)|[BUG] CSV timestamp parsing is broken for TS < 1902 and TS > 2038|
+|[#810](https://github.com/NVIDIA/spark-rapids/issues/810)|[BUG] UDF Integration tests fail if pandas is not installed|
+|[#746](https://github.com/NVIDIA/spark-rapids/issues/746)|[BUG] cudf_udf_test.py is flakey|
+|[#811](https://github.com/NVIDIA/spark-rapids/issues/811)|[BUG] 0.3 nightly is timing out |
+|[#574](https://github.com/NVIDIA/spark-rapids/issues/574)|[BUG] Fix GpuTimeSub for Spark 3.1.0|
+
+### PRs
+|||
+|:---|:---|
+|[#1496](https://github.com/NVIDIA/spark-rapids/pull/1496)|Update changelog for v0.3.0 release [skip ci]|
+|[#1473](https://github.com/NVIDIA/spark-rapids/pull/1473)|Update documentation for 0.3 release|
+|[#1371](https://github.com/NVIDIA/spark-rapids/pull/1371)|Start Guide for RAPIDS on AWS EMR 6.2|
+|[#1446](https://github.com/NVIDIA/spark-rapids/pull/1446)|Update changelog for 0.3.0 release [skip ci]|
+|[#1439](https://github.com/NVIDIA/spark-rapids/pull/1439)|When AQE is enabled we fail to fix up exchanges properly on EMR|
+|[#1433](https://github.com/NVIDIA/spark-rapids/pull/1433)|fix pandas 1.2 compatible issue|
+|[#1424](https://github.com/NVIDIA/spark-rapids/pull/1424)|Make the multi-threaded parquet reader the default since coalescing doesn't handle partitioned files well|
+|[#1389](https://github.com/NVIDIA/spark-rapids/pull/1389)|Update project version to 0.3.0|
+|[#1387](https://github.com/NVIDIA/spark-rapids/pull/1387)|Update cudf version to 0.17|
+|[#1370](https://github.com/NVIDIA/spark-rapids/pull/1370)|[REVIEW] init changelog 0.3 [skip ci]|
+|[#1376](https://github.com/NVIDIA/spark-rapids/pull/1376)|MetaUtils.getBatchFromMeta should return batches with GpuColumnVectorFromBuffer|
+|[#1358](https://github.com/NVIDIA/spark-rapids/pull/1358)|auto-merge: instant merge after creation [skip ci]|
+|[#1359](https://github.com/NVIDIA/spark-rapids/pull/1359)|Use SortOrder from shims.|
+|[#1343](https://github.com/NVIDIA/spark-rapids/pull/1343)|Do not run UDFs when the partition is empty.|
+|[#1342](https://github.com/NVIDIA/spark-rapids/pull/1342)|Fix and edit docs for standalone mode|
+|[#1350](https://github.com/NVIDIA/spark-rapids/pull/1350)|fix GpuRangePartitioning canonicalization|
+|[#1281](https://github.com/NVIDIA/spark-rapids/pull/1281)|Documentation added for testing|
+|[#1336](https://github.com/NVIDIA/spark-rapids/pull/1336)|Fix missing post-shuffle coalesce with AQE|
+|[#1318](https://github.com/NVIDIA/spark-rapids/pull/1318)|Fix copying GpuFileSourceScanExec node|
+|[#1337](https://github.com/NVIDIA/spark-rapids/pull/1337)|Use UTC instead of GMT|
+|[#1307](https://github.com/NVIDIA/spark-rapids/pull/1307)|Fallback to cpu when reading Delta log files for stats|
+|[#1310](https://github.com/NVIDIA/spark-rapids/pull/1310)|Fix canonicalization of GpuFileSourceScanExec, GpuShuffleCoalesceExec|
+|[#1302](https://github.com/NVIDIA/spark-rapids/pull/1302)|Add GpuSubstring handling to SortOrder canonicalization|
+|[#1265](https://github.com/NVIDIA/spark-rapids/pull/1265)|Chunking input before writing a ParquetCachedBatch|
+|[#1278](https://github.com/NVIDIA/spark-rapids/pull/1278)|Add a config to disable decimal types by default|
+|[#1272](https://github.com/NVIDIA/spark-rapids/pull/1272)|Add Alias to shims|
+|[#1268](https://github.com/NVIDIA/spark-rapids/pull/1268)|Adds in support docs for 0.3 release|
+|[#1235](https://github.com/NVIDIA/spark-rapids/pull/1235)|Trigger reading and handling control data.|
+|[#1266](https://github.com/NVIDIA/spark-rapids/pull/1266)|Updating Databricks getting started for 0.3 release|
+|[#1291](https://github.com/NVIDIA/spark-rapids/pull/1291)|Increase pre-merge resource requests [skip ci]|
+|[#1275](https://github.com/NVIDIA/spark-rapids/pull/1275)|Temporarily disable more CAST tests for Spark 3.1.0|
+|[#1264](https://github.com/NVIDIA/spark-rapids/pull/1264)|Fix race condition in batch creation|
+|[#1260](https://github.com/NVIDIA/spark-rapids/pull/1260)|Update UCX license info in NOTIFY-binary for 1.9 and RAPIDS plugin copyright dates|
+|[#1247](https://github.com/NVIDIA/spark-rapids/pull/1247)|Ensure column names are valid when writing benchmark query results to file|
+|[#1240](https://github.com/NVIDIA/spark-rapids/pull/1240)|Fix loading from ORC file with no column names|
+|[#1242](https://github.com/NVIDIA/spark-rapids/pull/1242)|Remove compatibility documentation about unsupported INT96|
+|[#1192](https://github.com/NVIDIA/spark-rapids/pull/1192)|[REVIEW] Support GpuFilter and GpuCoalesceBatches for decimal data|
+|[#1170](https://github.com/NVIDIA/spark-rapids/pull/1170)|Add nested type support to MetaUtils|
+|[#1194](https://github.com/NVIDIA/spark-rapids/pull/1194)|Drop redundant total time metric from scan|
+|[#1248](https://github.com/NVIDIA/spark-rapids/pull/1248)|At BatchedTableCompressor.finish synchronize to allow for "right-size…|
+|[#1169](https://github.com/NVIDIA/spark-rapids/pull/1169)|Use CUDF's "UNBOUNDED" window boundaries for time-range queries.|
+|[#1204](https://github.com/NVIDIA/spark-rapids/pull/1204)|Avoid empty batches on columnar to row conversion|
+|[#1133](https://github.com/NVIDIA/spark-rapids/pull/1133)|Refactor batch coalesce to be based solely on batch data size|
+|[#1237](https://github.com/NVIDIA/spark-rapids/pull/1237)|In transport, limit pending transfer requests to fit within a bounce|
+|[#1232](https://github.com/NVIDIA/spark-rapids/pull/1232)|Move SortOrder creation to shims|
+|[#1068](https://github.com/NVIDIA/spark-rapids/pull/1068)|Write int96 to parquet|
+|[#1193](https://github.com/NVIDIA/spark-rapids/pull/1193)|Verify shuffle of decimal columns|
+|[#1180](https://github.com/NVIDIA/spark-rapids/pull/1180)|Remove batches if they are received after the iterator detects that t…|
+|[#1173](https://github.com/NVIDIA/spark-rapids/pull/1173)|Support relational operators for decimal type|
+|[#1220](https://github.com/NVIDIA/spark-rapids/pull/1220)|Support replacing ORC format when Hive is configured|
+|[#1219](https://github.com/NVIDIA/spark-rapids/pull/1219)|Upgrade to jucx 1.9.0|
+|[#1081](https://github.com/NVIDIA/spark-rapids/pull/1081)|Add option to upload benchmark summary JSON file|
+|[#1217](https://github.com/NVIDIA/spark-rapids/pull/1217)|Aggregate reductions in Complete mode should use updateExpressions|
+|[#1218](https://github.com/NVIDIA/spark-rapids/pull/1218)|Remove obsolete HiveStringType usage|
+|[#1214](https://github.com/NVIDIA/spark-rapids/pull/1214)|changelog update 2020-11-30. Trigger automerge check [skip ci]|
+|[#1210](https://github.com/NVIDIA/spark-rapids/pull/1210)|Support auto-merge for branch-0.4 [skip ci]|
+|[#1202](https://github.com/NVIDIA/spark-rapids/pull/1202)|Fix a bug with the support for java.lang.StringBuilder.append.|
+|[#1213](https://github.com/NVIDIA/spark-rapids/pull/1213)|Skip casting StringType to TimestampType for Spark 310|
+|[#1201](https://github.com/NVIDIA/spark-rapids/pull/1201)|Replace only window expressions on databricks.|
+|[#1208](https://github.com/NVIDIA/spark-rapids/pull/1208)|[BUG] Fix GHSL2020-239 [skip ci]|
+|[#1205](https://github.com/NVIDIA/spark-rapids/pull/1205)|Fix missing input bytes read metric for Parquet|
+|[#1206](https://github.com/NVIDIA/spark-rapids/pull/1206)|Update Spark 3.1 shim for ShuffleOrigin shuffle parameter|
+|[#1196](https://github.com/NVIDIA/spark-rapids/pull/1196)|Rename ShuffleCoalesceExec to GpuShuffleCoalesceExec|
+|[#1191](https://github.com/NVIDIA/spark-rapids/pull/1191)|Skip window array tests for databricks.|
+|[#1183](https://github.com/NVIDIA/spark-rapids/pull/1183)|Support for CalendarIntervalType and NullType|
+|[#1150](https://github.com/NVIDIA/spark-rapids/pull/1150)|udf spec|
+|[#1188](https://github.com/NVIDIA/spark-rapids/pull/1188)|Add in tests for parquet nested pruning support|
+|[#1189](https://github.com/NVIDIA/spark-rapids/pull/1189)|Enable NullType for First and Last in 3.0.1+|
+|[#1181](https://github.com/NVIDIA/spark-rapids/pull/1181)|Fix resource leaks in unit tests|
+|[#1186](https://github.com/NVIDIA/spark-rapids/pull/1186)|Fix compilation and scaladoc warnings|
+|[#1187](https://github.com/NVIDIA/spark-rapids/pull/1187)|Updated documentation for distinct count compatibility|
+|[#1182](https://github.com/NVIDIA/spark-rapids/pull/1182)|Close buffer catalog on device manager shutdown|
+|[#1137](https://github.com/NVIDIA/spark-rapids/pull/1137)|Let GpuWindowInPandas declare ArrayType supported.|
+|[#1176](https://github.com/NVIDIA/spark-rapids/pull/1176)|Add in support for null type|
+|[#1174](https://github.com/NVIDIA/spark-rapids/pull/1174)|Fix race condition in SerializeConcatHostBuffersDeserializeBatch|
+|[#1175](https://github.com/NVIDIA/spark-rapids/pull/1175)|Fix leaks seen in shuffle tests|
+|[#1138](https://github.com/NVIDIA/spark-rapids/pull/1138)|[REVIEW] Support decimal type for GpuProjectExec|
+|[#1162](https://github.com/NVIDIA/spark-rapids/pull/1162)|Set job descriptions in benchmark runner|
+|[#1172](https://github.com/NVIDIA/spark-rapids/pull/1172)|Revert "Fix race condition (#1165)"|
+|[#1060](https://github.com/NVIDIA/spark-rapids/pull/1060)|Show partition metrics for custom shuffler reader|
+|[#1152](https://github.com/NVIDIA/spark-rapids/pull/1152)|Add spark301db shim layer for WindowInPandas.|
+|[#1167](https://github.com/NVIDIA/spark-rapids/pull/1167)|Nulls out the dataframe if --gc-between-runs is set|
+|[#1165](https://github.com/NVIDIA/spark-rapids/pull/1165)|Fix race condition in SerializeConcatHostBuffersDeserializeBatch|
+|[#1163](https://github.com/NVIDIA/spark-rapids/pull/1163)|Add in support for GetStructField|
+|[#1166](https://github.com/NVIDIA/spark-rapids/pull/1166)|Fix the cast tests for 3.1.0+|
+|[#1159](https://github.com/NVIDIA/spark-rapids/pull/1159)|fix bug where 'now' had same value as 'today' for timestamps|
+|[#1161](https://github.com/NVIDIA/spark-rapids/pull/1161)|Fix nightly build pipeline failure.|
+|[#1160](https://github.com/NVIDIA/spark-rapids/pull/1160)|Fix some performance problems with columnar to columnar conversion|
+|[#1105](https://github.com/NVIDIA/spark-rapids/pull/1105)|[REVIEW] Change ColumnViewAccess usage to work with ColumnView|
+|[#1148](https://github.com/NVIDIA/spark-rapids/pull/1148)|Add in tests for Maps and extend map support where possible|
+|[#1154](https://github.com/NVIDIA/spark-rapids/pull/1154)|Mark test as xfail until we can get a fix in|
+|[#1113](https://github.com/NVIDIA/spark-rapids/pull/1113)|Support unix_timestamp on GPU for subset of formats|
+|[#1156](https://github.com/NVIDIA/spark-rapids/pull/1156)|Fix warning introduced in iterator suite|
+|[#1095](https://github.com/NVIDIA/spark-rapids/pull/1095)|Dependency info|
+|[#1145](https://github.com/NVIDIA/spark-rapids/pull/1145)|Remove support for databricks 7.0 runtime - shim spark300db|
+|[#1147](https://github.com/NVIDIA/spark-rapids/pull/1147)|Change the assert to require for handling TIMESTAMP_MILLIS in isDateTimeRebaseNeeded |
+|[#1132](https://github.com/NVIDIA/spark-rapids/pull/1132)|Add in basic support to read structs from parquet|
+|[#1121](https://github.com/NVIDIA/spark-rapids/pull/1121)|Shuffle/better error handling|
+|[#1134](https://github.com/NVIDIA/spark-rapids/pull/1134)|Support saveAsTable for writing orc and parquet|
+|[#1124](https://github.com/NVIDIA/spark-rapids/pull/1124)|Add shim layers for GpuWindowInPandasExec.|
+|[#1131](https://github.com/NVIDIA/spark-rapids/pull/1131)|Add in some basic support for Structs|
+|[#1127](https://github.com/NVIDIA/spark-rapids/pull/1127)|Add in basic support for reading lists from parquet|
+|[#1129](https://github.com/NVIDIA/spark-rapids/pull/1129)|Fix resource leaks with new shuffle optimization|
+|[#1116](https://github.com/NVIDIA/spark-rapids/pull/1116)|Optimize normal shuffle by coalescing smaller batches on host|
+|[#1102](https://github.com/NVIDIA/spark-rapids/pull/1102)|Auto-register UDF extension when main plugin is set|
+|[#1108](https://github.com/NVIDIA/spark-rapids/pull/1108)|Remove integration test pipelines on NGCC|
+|[#1123](https://github.com/NVIDIA/spark-rapids/pull/1123)|Mark Pandas udf over window tests as xfail on databricks until they can be fixed|
+|[#1120](https://github.com/NVIDIA/spark-rapids/pull/1120)|Add in support for filtering ArrayType|
+|[#1080](https://github.com/NVIDIA/spark-rapids/pull/1080)|Support for CalendarIntervalType and NullType for ParquetCachedSerializer|
+|[#994](https://github.com/NVIDIA/spark-rapids/pull/994)|Packs bounce buffers for highly partitioned shuffles|
+|[#1112](https://github.com/NVIDIA/spark-rapids/pull/1112)|Remove bad config from pytest setup|
+|[#1107](https://github.com/NVIDIA/spark-rapids/pull/1107)|closeOnExcept -> withResources in MetaUtils|
+|[#1104](https://github.com/NVIDIA/spark-rapids/pull/1104)|Support lists to/from the GPU|
+|[#1106](https://github.com/NVIDIA/spark-rapids/pull/1106)|Improve mechanism for expected exceptions in tests|
+|[#1069](https://github.com/NVIDIA/spark-rapids/pull/1069)|Accelerate the data transfer between JVM and Python for the plan 'GpuWindowInPandasExec'|
+|[#1099](https://github.com/NVIDIA/spark-rapids/pull/1099)|Update how we deal with type checking|
+|[#1077](https://github.com/NVIDIA/spark-rapids/pull/1077)|Improve AQE transitions for shuffle and coalesce batches|
+|[#1097](https://github.com/NVIDIA/spark-rapids/pull/1097)|Cleanup some instances of excess closure serialization|
+|[#1090](https://github.com/NVIDIA/spark-rapids/pull/1090)|Fix the integration build|
+|[#1086](https://github.com/NVIDIA/spark-rapids/pull/1086)|Speed up test performance using pytest-xdist|
+|[#1084](https://github.com/NVIDIA/spark-rapids/pull/1084)|Avoid issues where more scalars that expected show up in an expression|
+|[#1076](https://github.com/NVIDIA/spark-rapids/pull/1076)|[FEA] Support Databricks 7.3 LTS Runtime|
+|[#1083](https://github.com/NVIDIA/spark-rapids/pull/1083)|Revert "Get cudf/spark dependency from the correct .m2 dir"|
+|[#1062](https://github.com/NVIDIA/spark-rapids/pull/1062)|Get cudf/spark dependency from the correct .m2 dir|
+|[#1078](https://github.com/NVIDIA/spark-rapids/pull/1078)|Another round of fixes for mapping of DataType to DType|
+|[#1066](https://github.com/NVIDIA/spark-rapids/pull/1066)|More fixes for conversion to ColumnarBatch|
+|[#1029](https://github.com/NVIDIA/spark-rapids/pull/1029)|BenchmarkRunner should produce JSON summary file even when queries fail|
+|[#1055](https://github.com/NVIDIA/spark-rapids/pull/1055)|Fix build warnings|
+|[#1064](https://github.com/NVIDIA/spark-rapids/pull/1064)|Use array instead of List for from(Table, DataType)|
+|[#1057](https://github.com/NVIDIA/spark-rapids/pull/1057)|Fix empty table broadcast requiring a GPU on driver node|
+|[#1047](https://github.com/NVIDIA/spark-rapids/pull/1047)|Sanity checks for cudf jar mismatch|
+|[#1044](https://github.com/NVIDIA/spark-rapids/pull/1044)|Accelerated row to columnar and columnar to row transitions|
+|[#1056](https://github.com/NVIDIA/spark-rapids/pull/1056)|Add query number to Spark app name when running benchmarks|
+|[#1054](https://github.com/NVIDIA/spark-rapids/pull/1054)|Log total RMM allocated on GPU OOM|
+|[#1053](https://github.com/NVIDIA/spark-rapids/pull/1053)|Remove isGpuBroadcastNestedLoopJoin from shims|
+|[#1052](https://github.com/NVIDIA/spark-rapids/pull/1052)|Allow for GPUCoalesceBatch to deal with Map|
+|[#1051](https://github.com/NVIDIA/spark-rapids/pull/1051)|Add simple retry for URM dependencies [skip ci]|
+|[#1046](https://github.com/NVIDIA/spark-rapids/pull/1046)|Fix broken links|
+|[#1017](https://github.com/NVIDIA/spark-rapids/pull/1017)|Log whether PTDS is enabled|
+|[#1040](https://github.com/NVIDIA/spark-rapids/pull/1040)|Update to cudf 0.17-SNAPSHOT and fix tests|
+|[#1042](https://github.com/NVIDIA/spark-rapids/pull/1042)|Fix inconsistencies in AQE support for broadcast joins|
+|[#1037](https://github.com/NVIDIA/spark-rapids/pull/1037)|Add in support for the SQL functions Least and Greatest|
+|[#1036](https://github.com/NVIDIA/spark-rapids/pull/1036)|Increase number of retries when waiting for databricks cluster|
+|[#1034](https://github.com/NVIDIA/spark-rapids/pull/1034)|[BUG] To honor spark.rapids.memory.gpu.pool=NONE|
+|[#854](https://github.com/NVIDIA/spark-rapids/pull/854)|Arbitrary function call in UDF|
+|[#1028](https://github.com/NVIDIA/spark-rapids/pull/1028)|Update to cudf-0.16|
+|[#1023](https://github.com/NVIDIA/spark-rapids/pull/1023)|Add --gc-between-run flag for TPC* benchmarks.|
+|[#1001](https://github.com/NVIDIA/spark-rapids/pull/1001)|ColumnarBatch to CachedBatch and back|
+|[#990](https://github.com/NVIDIA/spark-rapids/pull/990)|Parquet coalesce file reader for local filesystems|
+|[#1014](https://github.com/NVIDIA/spark-rapids/pull/1014)|Add --append-dat flag for TPC-DS benchmark|
+|[#991](https://github.com/NVIDIA/spark-rapids/pull/991)|Updated GCP Dataproc Mortgage-ETL-GPU.ipynb|
+|[#886](https://github.com/NVIDIA/spark-rapids/pull/886)|Spark BinaryType and cast to BinaryType|
+|[#1016](https://github.com/NVIDIA/spark-rapids/pull/1016)|Change Hash Aggregate to allow pass-through on MapType|
+|[#984](https://github.com/NVIDIA/spark-rapids/pull/984)|Add support for MapType in selected operators |
+|[#1012](https://github.com/NVIDIA/spark-rapids/pull/1012)|Update for new position parameter in Spark 3.1.0 RegExpReplace|
+|[#995](https://github.com/NVIDIA/spark-rapids/pull/995)|Add shim for EMR 3.0.1 and EMR 3.0.1-SNAPSHOT|
+|[#998](https://github.com/NVIDIA/spark-rapids/pull/998)|Update benchmark automation script|
+|[#1000](https://github.com/NVIDIA/spark-rapids/pull/1000)|Always use RAPIDS shuffle when running TPCH and Mortgage tests|
+|[#981](https://github.com/NVIDIA/spark-rapids/pull/981)|Change databricks build to dynamically create a cluster|
+|[#986](https://github.com/NVIDIA/spark-rapids/pull/986)|Fix missing dataSize metric when using RAPIDS shuffle|
+|[#914](https://github.com/NVIDIA/spark-rapids/pull/914)|Write InternalRow to CachedBatch|
+|[#934](https://github.com/NVIDIA/spark-rapids/pull/934)|Iterator to make it easier to work with a window of blocks in the RAPIDS shuffle|
+|[#992](https://github.com/NVIDIA/spark-rapids/pull/992)|Skip post-clean if aborted before the image build stage in pre-merge [skip ci]|
+|[#988](https://github.com/NVIDIA/spark-rapids/pull/988)|Change in Spark caused the 3.1.0 CI to fail|
+|[#983](https://github.com/NVIDIA/spark-rapids/pull/983)|clean jenkins file for premerge on NGCC|
+|[#964](https://github.com/NVIDIA/spark-rapids/pull/964)|Refactor TPC benchmarks to reduce duplicate code|
+|[#978](https://github.com/NVIDIA/spark-rapids/pull/978)|Enable scalastyle checks for udf-compiler module|
+|[#949](https://github.com/NVIDIA/spark-rapids/pull/949)|Fix GpuWindowExec to work with a CPU SortExec|
+|[#973](https://github.com/NVIDIA/spark-rapids/pull/973)|Stop reporting totalTime metric for GpuShuffleExchangeExec|
+|[#968](https://github.com/NVIDIA/spark-rapids/pull/968)|XFail pos_explode tests until final fix can be put in|
+|[#970](https://github.com/NVIDIA/spark-rapids/pull/970)|Add legacy config to clear active Spark 3.1.0 session in tests|
+|[#918](https://github.com/NVIDIA/spark-rapids/pull/918)|Benchmark runner script|
+|[#915](https://github.com/NVIDIA/spark-rapids/pull/915)|Add option to control number of partitions when converting from CSV to Parquet|
+|[#944](https://github.com/NVIDIA/spark-rapids/pull/944)|Fix some issues with non-determinism|
+|[#935](https://github.com/NVIDIA/spark-rapids/pull/935)|Add in support/tests for a window count on a column|
+|[#940](https://github.com/NVIDIA/spark-rapids/pull/940)|Fix closeOnExcept suppressed exception handling|
+|[#942](https://github.com/NVIDIA/spark-rapids/pull/942)|fix github action env setup [skip ci]|
+|[#933](https://github.com/NVIDIA/spark-rapids/pull/933)|Update first/last tests to avoid non-determinisim and ordering differences|
+|[#931](https://github.com/NVIDIA/spark-rapids/pull/931)|Fix checking for nullable columns in window range query|
+|[#924](https://github.com/NVIDIA/spark-rapids/pull/924)|Benchmark guide update for command-line interface / spark-submit|
+|[#926](https://github.com/NVIDIA/spark-rapids/pull/926)|Move pandas_udf functions into the tests functions|
+|[#929](https://github.com/NVIDIA/spark-rapids/pull/929)|Pick a default tableId to use that is non 0 so that flatbuffers allow…|
+|[#928](https://github.com/NVIDIA/spark-rapids/pull/928)|Fix RapidsBufferStore NPE when no spillable buffers are available|
+|[#820](https://github.com/NVIDIA/spark-rapids/pull/820)|Benchmarking guide|
+|[#859](https://github.com/NVIDIA/spark-rapids/pull/859)|Compare partitioned files in order|
+|[#916](https://github.com/NVIDIA/spark-rapids/pull/916)|create new sparkContext explicitly in CPU notebook|
+|[#917](https://github.com/NVIDIA/spark-rapids/pull/917)|create new SparkContext in GPU notebook explicitly.|
+|[#919](https://github.com/NVIDIA/spark-rapids/pull/919)|Add label benchmark to performance subsection in changelog|
+|[#850](https://github.com/NVIDIA/spark-rapids/pull/850)| Add in basic support for lead/lag|
+|[#843](https://github.com/NVIDIA/spark-rapids/pull/843)|[REVIEW] Cache plugin to handle reading CachedBatch to an InternalRow|
+|[#904](https://github.com/NVIDIA/spark-rapids/pull/904)|Add command-line argument for benchmark result filename|
+|[#909](https://github.com/NVIDIA/spark-rapids/pull/909)|GCP preview version image name update|
+|[#903](https://github.com/NVIDIA/spark-rapids/pull/903)|update getting-started-gcp.md with new component list|
+|[#900](https://github.com/NVIDIA/spark-rapids/pull/900)|Turn off CollectLimitExec replacement by default|
+|[#907](https://github.com/NVIDIA/spark-rapids/pull/907)|remove configs from databricks that shouldn't be used by default|
+|[#893](https://github.com/NVIDIA/spark-rapids/pull/893)|Fix rounding error when casting timestamp to string for timestamps before 1970|
+|[#899](https://github.com/NVIDIA/spark-rapids/pull/899)|Mark reduction corner case tests as xfail on databricks until they can be fixed|
+|[#894](https://github.com/NVIDIA/spark-rapids/pull/894)|Replace whole-buffer slicing with direct refcounting|
+|[#891](https://github.com/NVIDIA/spark-rapids/pull/891)|Add config to dump heap on GPU OOM|
+|[#890](https://github.com/NVIDIA/spark-rapids/pull/890)|Clean up CoalesceBatch to use withResource|
+|[#892](https://github.com/NVIDIA/spark-rapids/pull/892)|Only manifest the current batch in cached block shuffle read iterator|
+|[#871](https://github.com/NVIDIA/spark-rapids/pull/871)|Add support for using the arena allocator|
+|[#889](https://github.com/NVIDIA/spark-rapids/pull/889)|Fix crash on scalar only orderby|
+|[#879](https://github.com/NVIDIA/spark-rapids/pull/879)|Update SpillableColumnarBatch to remove buffer from catalog on close|
+|[#888](https://github.com/NVIDIA/spark-rapids/pull/888)|Shrink detect scope to compile only [skip ci]|
+|[#885](https://github.com/NVIDIA/spark-rapids/pull/885)|[BUG] fix IT dockerfile arguments [skip ci]|
+|[#883](https://github.com/NVIDIA/spark-rapids/pull/883)|[BUG] fix IT dockerfile args ordering [skip ci]|
+|[#875](https://github.com/NVIDIA/spark-rapids/pull/875)|fix the inconsistency for `spark.rapids.sql.format.parquet.multiThreadedRead` in RapidsConf.scala|
+|[#862](https://github.com/NVIDIA/spark-rapids/pull/862)|Migrate nightly&integration pipelines to blossom [skip ci]|
+|[#872](https://github.com/NVIDIA/spark-rapids/pull/872)|Ensure that receive-side batches use GpuColumnVectorFromBuffer to avoid|
+|[#833](https://github.com/NVIDIA/spark-rapids/pull/833)|Add nvcomp LZ4 codec support|
+|[#870](https://github.com/NVIDIA/spark-rapids/pull/870)|Cleaned up tests and documentation for csv timestamp parsing|
+|[#823](https://github.com/NVIDIA/spark-rapids/pull/823)|Add command-line interface for TPC-* for use with spark-submit|
+|[#856](https://github.com/NVIDIA/spark-rapids/pull/856)|Move GpuWindowInPandasExec in shims layers|
+|[#756](https://github.com/NVIDIA/spark-rapids/pull/756)|Add stream-time metric|
+|[#832](https://github.com/NVIDIA/spark-rapids/pull/832)|Skip pandas tests if pandas cannot be found|
+|[#841](https://github.com/NVIDIA/spark-rapids/pull/841)|Fix a hanging issue when processing empty data.|
+|[#840](https://github.com/NVIDIA/spark-rapids/pull/840)|[REVIEW] Fixed failing cache tests|
+|[#848](https://github.com/NVIDIA/spark-rapids/pull/848)|Update task memory and disk spill metrics when buffer store spills|
+|[#851](https://github.com/NVIDIA/spark-rapids/pull/851)|Use contiguous table when deserializing columnar batch|
+|[#857](https://github.com/NVIDIA/spark-rapids/pull/857)|fix pvc scheduling issue|
+|[#853](https://github.com/NVIDIA/spark-rapids/pull/853)|Remove nodeAffinity from premerge pipeline|
+|[#796](https://github.com/NVIDIA/spark-rapids/pull/796)|Record spark plan SQL metrics to JSON when running benchmarks|
+|[#781](https://github.com/NVIDIA/spark-rapids/pull/781)|Add AQE unit tests|
+|[#824](https://github.com/NVIDIA/spark-rapids/pull/824)|Skip cudf_udf test by default|
+|[#839](https://github.com/NVIDIA/spark-rapids/pull/839)|First/Last reduction and cleanup of agg APIs|
+|[#827](https://github.com/NVIDIA/spark-rapids/pull/827)|Add Spark 3.0 EMR Shim layer |
+|[#816](https://github.com/NVIDIA/spark-rapids/pull/816)|[BUG] fix nightly is timing out|
+|[#782](https://github.com/NVIDIA/spark-rapids/pull/782)|Benchmark utility to perform diff of output from benchmark runs, allowing for precision differences|
+|[#813](https://github.com/NVIDIA/spark-rapids/pull/813)|Revert "Enable tests in udf_cudf_test.py"|
+|[#788](https://github.com/NVIDIA/spark-rapids/pull/788)|[FEA] Persist workspace data on PVC for premerge|
+|[#805](https://github.com/NVIDIA/spark-rapids/pull/805)|[FEA] nightly build trigger both IT on spark 300 and 301|
+|[#797](https://github.com/NVIDIA/spark-rapids/pull/797)|Allow host spill store to fit a buffer larger than configured max size|
+|[#807](https://github.com/NVIDIA/spark-rapids/pull/807)|Deploy integration-tests javadoc and sources|
+|[#777](https://github.com/NVIDIA/spark-rapids/pull/777)|Enable tests in udf_cudf_test.py|
+|[#790](https://github.com/NVIDIA/spark-rapids/pull/790)|CI: Update cudf python to 0.16 nightly|
+|[#772](https://github.com/NVIDIA/spark-rapids/pull/772)|Add support for empty array construction.|
+|[#783](https://github.com/NVIDIA/spark-rapids/pull/783)|Improved GpuArrowEvalPythonExec|
+|[#771](https://github.com/NVIDIA/spark-rapids/pull/771)|Various improvements to benchmarks|
+|[#763](https://github.com/NVIDIA/spark-rapids/pull/763)|[REVIEW] Allow CoalesceBatch to spill data that is not in active use|
+|[#727](https://github.com/NVIDIA/spark-rapids/pull/727)|Update cudf dependency to 0.16-SNAPSHOT|
+|[#726](https://github.com/NVIDIA/spark-rapids/pull/726)|parquet writer support for TIMESTAMP_MILLIS|
+|[#674](https://github.com/NVIDIA/spark-rapids/pull/674)|Unit test for GPU exchange re-use with AQE|
+|[#723](https://github.com/NVIDIA/spark-rapids/pull/723)|Update code coverage to find source files in new places|
+|[#766](https://github.com/NVIDIA/spark-rapids/pull/766)|Update the integration Dockerfile to reduce the image size|
+|[#762](https://github.com/NVIDIA/spark-rapids/pull/762)|Fixing conflicts in branch-0.3|
+|[#738](https://github.com/NVIDIA/spark-rapids/pull/738)|[auto-merge] branch-0.2 to branch-0.3 - resolve conflict|
+|[#722](https://github.com/NVIDIA/spark-rapids/pull/722)|Initial code changes to support spilling outside of shuffle|
+|[#693](https://github.com/NVIDIA/spark-rapids/pull/693)|Update jenkins files for 0.3|
+|[#692](https://github.com/NVIDIA/spark-rapids/pull/692)|Merge shims dependency to spark-3.0.1 into branch-0.3|
+|[#690](https://github.com/NVIDIA/spark-rapids/pull/690)|Update the version to 0.3.0-SNAPSHOT|
+
+## Release 0.2
+
+### Features
+|||
+|:---|:---|
+|[#696](https://github.com/NVIDIA/spark-rapids/issues/696)|[FEA] run integration tests against SPARK-3.0.1|
+|[#455](https://github.com/NVIDIA/spark-rapids/issues/455)|[FEA] Support UCX shuffle with optimized AQE|
+|[#510](https://github.com/NVIDIA/spark-rapids/issues/510)|[FEA] Investigate libcudf features needed to support struct schema pruning during loads|
+|[#541](https://github.com/NVIDIA/spark-rapids/issues/541)|[FEA] Scala UDF: Support for null value operands|
+|[#542](https://github.com/NVIDIA/spark-rapids/issues/542)|[FEA] Scala UDF: Support for Date and Time |
+|[#499](https://github.com/NVIDIA/spark-rapids/issues/499)|[FEA] disable any kind of warnings about ExecutedCommandExec not being on the GPU|
+|[#540](https://github.com/NVIDIA/spark-rapids/issues/540)|[FEA] Scala UDF: Support for String replaceFirst()|
+|[#340](https://github.com/NVIDIA/spark-rapids/issues/340)|[FEA] widen the rendered Jekyll pages|
+|[#602](https://github.com/NVIDIA/spark-rapids/issues/602)|[FEA] don't release with any -SNAPSHOT dependencies|
+|[#579](https://github.com/NVIDIA/spark-rapids/issues/579)|[FEA] Auto-merge between branches|
+|[#515](https://github.com/NVIDIA/spark-rapids/issues/515)|[FEA] Write tests for AQE skewed join optimization|
+|[#452](https://github.com/NVIDIA/spark-rapids/issues/452)|[FEA] Update HashSortOptimizerSuite to work with AQE|
+|[#454](https://github.com/NVIDIA/spark-rapids/issues/454)|[FEA] Update GpuCoalesceBatchesSuite to work with AQE enabled|
+|[#354](https://github.com/NVIDIA/spark-rapids/issues/354)|[FEA] Spark 3.1 FileSourceScanExec adds parameter optionalNumCoalescedBuckets|
+|[#566](https://github.com/NVIDIA/spark-rapids/issues/566)|[FEA] Add support for StringSplit with an array index.|
+|[#524](https://github.com/NVIDIA/spark-rapids/issues/524)|[FEA] Add GPU specific metrics to GpuFileSourceScanExec|
+|[#494](https://github.com/NVIDIA/spark-rapids/issues/494)|[FEA] Add some AQE-specific tests to the PySpark test suite|
+|[#146](https://github.com/NVIDIA/spark-rapids/issues/146)|[FEA] Python tests should support running with Adaptive Query Execution enabled|
+|[#465](https://github.com/NVIDIA/spark-rapids/issues/465)|[FEA] Audit: Update script to audit multiple versions of Spark |
+|[#488](https://github.com/NVIDIA/spark-rapids/issues/488)|[FEA] Ability to limit total GPU memory used|
+|[#70](https://github.com/NVIDIA/spark-rapids/issues/70)|[FEA] Support StringSplit|
+|[#403](https://github.com/NVIDIA/spark-rapids/issues/403)|[FEA] Add in support for GetArrayItem|
+|[#493](https://github.com/NVIDIA/spark-rapids/issues/493)|[FEA] Implement shuffle optimization when AQE is enabled|
+|[#500](https://github.com/NVIDIA/spark-rapids/issues/500)|[FEA] Add maven profiles for testing with AQE on or off|
+|[#471](https://github.com/NVIDIA/spark-rapids/issues/471)|[FEA] create a formal process for updating the github-pages branch|
+|[#233](https://github.com/NVIDIA/spark-rapids/issues/233)|[FEA] Audit DataWritingCommandExec |
+|[#240](https://github.com/NVIDIA/spark-rapids/issues/240)|[FEA] Audit Api validation script follow on - Optimize StringToTypeTag |
+|[#388](https://github.com/NVIDIA/spark-rapids/issues/388)|[FEA] Audit WindowExec|
+|[#425](https://github.com/NVIDIA/spark-rapids/issues/425)|[FEA] Add tests for configs in BatchScan Readers|
+|[#453](https://github.com/NVIDIA/spark-rapids/issues/453)|[FEA] Update HashAggregatesSuite to work with AQE|
+|[#184](https://github.com/NVIDIA/spark-rapids/issues/184)|[FEA] Enable NoScalaDoc scalastyle rule|
+|[#438](https://github.com/NVIDIA/spark-rapids/issues/438)|[FEA] Enable StringLPad|
+|[#232](https://github.com/NVIDIA/spark-rapids/issues/232)|[FEA] Audit SortExec |
+|[#236](https://github.com/NVIDIA/spark-rapids/issues/236)|[FEA] Audit ShuffleExchangeExec |
+|[#355](https://github.com/NVIDIA/spark-rapids/issues/355)|[FEA] Support Multiple Spark versions in the same jar|
+|[#385](https://github.com/NVIDIA/spark-rapids/issues/385)|[FEA] Support RangeExec on the GPU|
+|[#317](https://github.com/NVIDIA/spark-rapids/issues/317)|[FEA] Write test wrapper to run SQL queries via pyspark|
+|[#235](https://github.com/NVIDIA/spark-rapids/issues/235)|[FEA] Audit BroadcastExchangeExec|
+|[#234](https://github.com/NVIDIA/spark-rapids/issues/234)|[FEA] Audit BatchScanExec|
+|[#238](https://github.com/NVIDIA/spark-rapids/issues/238)|[FEA] Audit ShuffledHashJoinExec |
+|[#237](https://github.com/NVIDIA/spark-rapids/issues/237)|[FEA] Audit BroadcastHashJoinExec |
+|[#316](https://github.com/NVIDIA/spark-rapids/issues/316)|[FEA] Add some basic Dataframe tests for CoalesceExec|
+|[#145](https://github.com/NVIDIA/spark-rapids/issues/145)|[FEA] Scala tests should support running with Adaptive Query Execution enabled|
+|[#231](https://github.com/NVIDIA/spark-rapids/issues/231)|[FEA] Audit ProjectExec |
+|[#229](https://github.com/NVIDIA/spark-rapids/issues/229)|[FEA] Audit FileSourceScanExec |
+
+### Performance
+|||
+|:---|:---|
+|[#326](https://github.com/NVIDIA/spark-rapids/issues/326)|[DISCUSS] Shuffle read-side error handling|
+|[#601](https://github.com/NVIDIA/spark-rapids/issues/601)|[FEA] Optimize unnecessary sorts when replacing SortAggregate|
+|[#333](https://github.com/NVIDIA/spark-rapids/issues/333)|[FEA] Better handling of reading lots of small Parquet files|
+|[#511](https://github.com/NVIDIA/spark-rapids/issues/511)|[FEA] Connect shuffle table compression to shuffle exec metrics|
+|[#15](https://github.com/NVIDIA/spark-rapids/issues/15)|[FEA] Multiple threads sharing the same GPU|
+|[#272](https://github.com/NVIDIA/spark-rapids/issues/272)|[DOC] Getting started guide for UCX shuffle|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#780](https://github.com/NVIDIA/spark-rapids/issues/780)|[BUG] Inner Join dropping data with bucketed Table input|
+|[#569](https://github.com/NVIDIA/spark-rapids/issues/569)|[BUG] left_semi_join operation is abnormal and seriously time-consuming|
+|[#744](https://github.com/NVIDIA/spark-rapids/issues/744)|[BUG] TPC-DS query 6 now produces incorrect results.|
+|[#718](https://github.com/NVIDIA/spark-rapids/issues/718)|[BUG] GpuBroadcastHashJoinExec ArrayIndexOutOfBoundsException|
+|[#698](https://github.com/NVIDIA/spark-rapids/issues/698)|[BUG] batch coalesce can fail to appear between columnar shuffle and subsequent columnar operation|
+|[#658](https://github.com/NVIDIA/spark-rapids/issues/658)|[BUG] GpuCoalesceBatches collectTime metric can be underreported|
+|[#59](https://github.com/NVIDIA/spark-rapids/issues/59)|[BUG] enable tests for string literals in a select|
+|[#486](https://github.com/NVIDIA/spark-rapids/issues/486)|[BUG] GpuWindowExec does not implement requiredChildOrdering|
+|[#631](https://github.com/NVIDIA/spark-rapids/issues/631)|[BUG] Rows are dropped when AQE is enabled in some cases|
+|[#671](https://github.com/NVIDIA/spark-rapids/issues/671)|[BUG] Databricks hash_aggregate_test fails trying to canonicalize a WrappedAggFunction|
+|[#218](https://github.com/NVIDIA/spark-rapids/issues/218)|[BUG] Window function COUNT(x) includes null-values, when it shouldn't|
+|[#153](https://github.com/NVIDIA/spark-rapids/issues/153)|[BUG] Incorrect output from partial-only hash aggregates with multiple distincts and non-distinct functions|
+|[#656](https://github.com/NVIDIA/spark-rapids/issues/656)|[BUG] integration tests produce hive metadata files|
+|[#607](https://github.com/NVIDIA/spark-rapids/issues/607)|[BUG] Fix misleading "cannot run on GPU" warnings when AQE is enabled|
+|[#630](https://github.com/NVIDIA/spark-rapids/issues/630)|[BUG] GpuCustomShuffleReader metrics always show zero rows/batches output|
+|[#643](https://github.com/NVIDIA/spark-rapids/issues/643)|[BUG] race condition while registering a buffer and spilling at the same time|
+|[#606](https://github.com/NVIDIA/spark-rapids/issues/606)|[BUG] Multiple scans for same data source with TPC-DS query59 with delta format|
+|[#626](https://github.com/NVIDIA/spark-rapids/issues/626)|[BUG] parquet_test showing leaked memory buffer|
+|[#155](https://github.com/NVIDIA/spark-rapids/issues/155)|[BUG] Incorrect output from averages with filters in partial only mode|
+|[#277](https://github.com/NVIDIA/spark-rapids/issues/277)|[BUG] HashAggregateSuite failure when AQE is enabled|
+|[#276](https://github.com/NVIDIA/spark-rapids/issues/276)|[BUG] GpuCoalesceBatchSuite failure when AQE is enabled|
+|[#598](https://github.com/NVIDIA/spark-rapids/issues/598)|[BUG] Non-deterministic output from MapOutputTracker.getStatistics() with AQE on GPU|
+|[#192](https://github.com/NVIDIA/spark-rapids/issues/192)|[BUG] test_read_merge_schema fails on Databricks|
+|[#341](https://github.com/NVIDIA/spark-rapids/issues/341)|[BUG] Document compression formats for readers/writers|
+|[#587](https://github.com/NVIDIA/spark-rapids/issues/587)|[BUG] Spark3.1 changed FileScan which means our GpuScans need to be added to shim layer|
+|[#362](https://github.com/NVIDIA/spark-rapids/issues/362)|[BUG] Implement getReaderForRange in the RapidsShuffleManager|
+|[#528](https://github.com/NVIDIA/spark-rapids/issues/528)|[BUG] HashAggregateSuite "Avg Distinct with filter" no longer valid when testing against Spark 3.1.0|
+|[#416](https://github.com/NVIDIA/spark-rapids/issues/416)|[BUG] Fix Spark 3.1.0 integration tests|
+|[#556](https://github.com/NVIDIA/spark-rapids/issues/556)|[BUG] NPE when removing shuffle|
+|[#553](https://github.com/NVIDIA/spark-rapids/issues/553)|[BUG] GpuColumnVector build warnings from raw type access|
+|[#492](https://github.com/NVIDIA/spark-rapids/issues/492)|[BUG] Re-enable AQE integration tests|
+|[#275](https://github.com/NVIDIA/spark-rapids/issues/275)|[BUG] TpchLike query 2 fails when AQE is enabled|
+|[#508](https://github.com/NVIDIA/spark-rapids/issues/508)|[BUG] GpuUnion publishes metrics on the UI that are all 0|
+|[#269](https://github.com/NVIDIA/spark-rapids/issues/269)|Needed to add `--conf spark.driver.extraClassPath=` |
+|[#473](https://github.com/NVIDIA/spark-rapids/issues/473)|[BUG] PartMerge:countDistinct:sum fails sporadically|
+|[#531](https://github.com/NVIDIA/spark-rapids/issues/531)|[BUG] Temporary RMM workaround needs to be removed|
+|[#532](https://github.com/NVIDIA/spark-rapids/issues/532)|[BUG] NPE when enabling shuffle manager|
+|[#525](https://github.com/NVIDIA/spark-rapids/issues/525)|[BUG] GpuFilterExec reports incorrect nullability of output in some cases|
+|[#483](https://github.com/NVIDIA/spark-rapids/issues/483)|[BUG] Multiple scans for the same parquet data source|
+|[#382](https://github.com/NVIDIA/spark-rapids/issues/382)|[BUG] Spark3.1 StringFallbackSuite regexp_replace null cpu fall back test fails.|
+|[#489](https://github.com/NVIDIA/spark-rapids/issues/489)|[FEA] Fix Spark 3.1 GpuHashJoin since it now requires CodegenSupport|
+|[#441](https://github.com/NVIDIA/spark-rapids/issues/441)|[BUG] test_broadcast_nested_loop_join_special_case fails on databricks|
+|[#347](https://github.com/NVIDIA/spark-rapids/issues/347)|[BUG] Failed to read Parquet file generated by GPU-enabled Spark.|
+|[#433](https://github.com/NVIDIA/spark-rapids/issues/433)|`InSet` operator produces an error for Strings|
+|[#144](https://github.com/NVIDIA/spark-rapids/issues/144)|[BUG] spark.sql.legacy.parquet.datetimeRebaseModeInWrite is ignored|
+|[#323](https://github.com/NVIDIA/spark-rapids/issues/323)|[BUG] GpuBroadcastNestedLoopJoinExec can fail if there are no columns|
+|[#356](https://github.com/NVIDIA/spark-rapids/issues/356)|[BUG] Integration cache test for BroadcastNestedLoopJoin failure|
+|[#280](https://github.com/NVIDIA/spark-rapids/issues/280)|[BUG] Full Outer Join does not work on nullable keys|
+|[#149](https://github.com/NVIDIA/spark-rapids/issues/149)|[BUG] Spark driver fails to load native libs when running on node without CUDA|
+
+### PRs
+|||
+|:---|:---|
+|[#826](https://github.com/NVIDIA/spark-rapids/pull/826)|Fix link to cudf-0.15-cuda11.jar|
+|[#815](https://github.com/NVIDIA/spark-rapids/pull/815)|Update documentation for Scala UDFs in 0.2 since you need two things|
+|[#802](https://github.com/NVIDIA/spark-rapids/pull/802)|Update 0.2 CHANGELOG|
+|[#793](https://github.com/NVIDIA/spark-rapids/pull/793)|Update Jenkins scripts for release|
+|[#798](https://github.com/NVIDIA/spark-rapids/pull/798)|Fix shims provider override config not being seen by executors|
+|[#785](https://github.com/NVIDIA/spark-rapids/pull/785)|Make shuffle run on CPU if we do a join where we read from bucketed table|
+|[#765](https://github.com/NVIDIA/spark-rapids/pull/765)|Add config to override shims provider class|
+|[#759](https://github.com/NVIDIA/spark-rapids/pull/759)|Add CHANGELOG for release 0.2|
+|[#758](https://github.com/NVIDIA/spark-rapids/pull/758)|Skip the udf test that fails periodically.|
+|[#752](https://github.com/NVIDIA/spark-rapids/pull/752)|Fix snapshot plugin jar version in docs|
+|[#751](https://github.com/NVIDIA/spark-rapids/pull/751)|Correct the channel for cudf installation|
+|[#754](https://github.com/NVIDIA/spark-rapids/pull/754)|Filter nulls from joins where possible to improve performance|
+|[#732](https://github.com/NVIDIA/spark-rapids/pull/732)|Add a timeout for RapidsShuffleIterator to prevent jobs to hang infin…|
+|[#637](https://github.com/NVIDIA/spark-rapids/pull/637)|Documentation changes for 0.2 release |
+|[#747](https://github.com/NVIDIA/spark-rapids/pull/747)|Disable udf tests that fail periodically|
+|[#745](https://github.com/NVIDIA/spark-rapids/pull/745)|Revert Null Join Filter|
+|[#741](https://github.com/NVIDIA/spark-rapids/pull/741)|Fix issue with parquet partitioned reads|
+|[#733](https://github.com/NVIDIA/spark-rapids/pull/733)|Remove GPU Types from github|
+|[#720](https://github.com/NVIDIA/spark-rapids/pull/720)|Stop removing GpuCoalesceBatches from non-AQE queries when AQE is enabled|
+|[#729](https://github.com/NVIDIA/spark-rapids/pull/729)|Fix collect time metric in CoalesceBatches|
+|[#640](https://github.com/NVIDIA/spark-rapids/pull/640)|Support running Pandas UDFs on GPUs in Python processes.|
+|[#721](https://github.com/NVIDIA/spark-rapids/pull/721)|Add some more checks to databricks build scripts|
+|[#714](https://github.com/NVIDIA/spark-rapids/pull/714)|Move spark 3.0.1-shims out of snapshot-shims|
+|[#711](https://github.com/NVIDIA/spark-rapids/pull/711)|fix blossom checkout repo|
+|[#709](https://github.com/NVIDIA/spark-rapids/pull/709)|[BUG] fix unexpected indentation issue in blossom yml|
+|[#642](https://github.com/NVIDIA/spark-rapids/pull/642)|Init workflow for blossom-ci|
+|[#705](https://github.com/NVIDIA/spark-rapids/pull/705)|Enable configuration check for cast string to timestamp|
+|[#702](https://github.com/NVIDIA/spark-rapids/pull/702)|Update slack channel for Jenkins builds|
+|[#701](https://github.com/NVIDIA/spark-rapids/pull/701)|fix checkout-ref for automerge|
+|[#695](https://github.com/NVIDIA/spark-rapids/pull/695)|Fix spark-3.0.1 shim to be released|
+|[#668](https://github.com/NVIDIA/spark-rapids/pull/668)|refactor automerge to support merge for protected branch|
+|[#687](https://github.com/NVIDIA/spark-rapids/pull/687)|Include the UDF compiler in the dist jar|
+|[#689](https://github.com/NVIDIA/spark-rapids/pull/689)|Change shims dependency to spark-3.0.1|
+|[#677](https://github.com/NVIDIA/spark-rapids/pull/677)|Use multi-threaded parquet read with small files|
+|[#638](https://github.com/NVIDIA/spark-rapids/pull/638)|Add Parquet-based cache serializer|
+|[#613](https://github.com/NVIDIA/spark-rapids/pull/613)|Enable UCX + AQE|
+|[#684](https://github.com/NVIDIA/spark-rapids/pull/684)|Enable test for literal string values in a select|
+|[#686](https://github.com/NVIDIA/spark-rapids/pull/686)|Remove sorts when replacing sort aggregate if possible|
+|[#675](https://github.com/NVIDIA/spark-rapids/pull/675)|Added TimeAdd|
+|[#645](https://github.com/NVIDIA/spark-rapids/pull/645)|[window] Add GpuWindowExec requiredChildOrdering|
+|[#676](https://github.com/NVIDIA/spark-rapids/pull/676)|fixUpJoinConsistency rule now works when AQE is enabled|
+|[#683](https://github.com/NVIDIA/spark-rapids/pull/683)|Fix issues with canonicalization of WrappedAggFunction|
+|[#682](https://github.com/NVIDIA/spark-rapids/pull/682)|Fix path to start-slave.sh script in docs|
+|[#673](https://github.com/NVIDIA/spark-rapids/pull/673)|Increase build timeouts on nightly and premerge builds|
+|[#648](https://github.com/NVIDIA/spark-rapids/pull/648)|add signoff-check use github actions|
+|[#593](https://github.com/NVIDIA/spark-rapids/pull/593)|Add support for isNaN and datetime related instructions in UDF compiler|
+|[#666](https://github.com/NVIDIA/spark-rapids/pull/666)|[window] Disable GPU for COUNT(exp) queries|
+|[#655](https://github.com/NVIDIA/spark-rapids/pull/655)|Implement AQE unit test for InsertAdaptiveSparkPlan|
+|[#614](https://github.com/NVIDIA/spark-rapids/pull/614)|Fix for aggregation with multiple distinct and non distinct functions|
+|[#657](https://github.com/NVIDIA/spark-rapids/pull/657)|Fix verify build after integration tests are run|
+|[#660](https://github.com/NVIDIA/spark-rapids/pull/660)|Add in neverReplaceExec and several rules for it|
+|[#639](https://github.com/NVIDIA/spark-rapids/pull/639)|BooleanType test shouldn't xfail|
+|[#652](https://github.com/NVIDIA/spark-rapids/pull/652)|Mark UVM config as internal until supported|
+|[#653](https://github.com/NVIDIA/spark-rapids/pull/653)|Move to the cudf-0.15 release|
+|[#647](https://github.com/NVIDIA/spark-rapids/pull/647)|Improve warnings about AQE nodes not supported on GPU|
+|[#646](https://github.com/NVIDIA/spark-rapids/pull/646)|Stop reporting zero metrics for GpuCustomShuffleReader|
+|[#644](https://github.com/NVIDIA/spark-rapids/pull/644)|Small fix for race in catalog where a buffer could get spilled while …|
+|[#623](https://github.com/NVIDIA/spark-rapids/pull/623)|Fix issues with canonicalization|
+|[#599](https://github.com/NVIDIA/spark-rapids/pull/599)|[FEA] changelog generator|
+|[#563](https://github.com/NVIDIA/spark-rapids/pull/563)|cudf and spark version info in artifacts|
+|[#633](https://github.com/NVIDIA/spark-rapids/pull/633)|Fix leak if RebaseHelper throws during Parquet read|
+|[#632](https://github.com/NVIDIA/spark-rapids/pull/632)|Copy function isSearchableType from Spark because signature changed in 3.0.1|
+|[#583](https://github.com/NVIDIA/spark-rapids/pull/583)|Add udf compiler unit tests|
+|[#617](https://github.com/NVIDIA/spark-rapids/pull/617)|Documentation updates for branch 0.2|
+|[#616](https://github.com/NVIDIA/spark-rapids/pull/616)|Add config to reserve GPU memory|
+|[#612](https://github.com/NVIDIA/spark-rapids/pull/612)|[REVIEW] Fix incorrect output from averages with filters in partial only mode|
+|[#609](https://github.com/NVIDIA/spark-rapids/pull/609)|fix minor issues with instructions for building ucx|
+|[#611](https://github.com/NVIDIA/spark-rapids/pull/611)|Added in profile to enable shims for SNAPSHOT releases|
+|[#595](https://github.com/NVIDIA/spark-rapids/pull/595)|Parquet small file reading optimization|
+|[#582](https://github.com/NVIDIA/spark-rapids/pull/582)|fix #579 Auto-merge between branches|
+|[#536](https://github.com/NVIDIA/spark-rapids/pull/536)|Add test for skewed join optimization when AQE is enabled|
+|[#603](https://github.com/NVIDIA/spark-rapids/pull/603)|Fix data size metric always 0 when using RAPIDS shuffle|
+|[#600](https://github.com/NVIDIA/spark-rapids/pull/600)|Fix calculation of string data for compressed batches|
+|[#597](https://github.com/NVIDIA/spark-rapids/pull/597)|Remove the xfail for parquet test_read_merge_schema on Databricks|
+|[#591](https://github.com/NVIDIA/spark-rapids/pull/591)|Add ucx license in NOTICE-binary|
+|[#596](https://github.com/NVIDIA/spark-rapids/pull/596)|Add Spark 3.0.2 to Shim layer|
+|[#594](https://github.com/NVIDIA/spark-rapids/pull/594)|Filter nulls from joins where possible to improve performance.|
+|[#590](https://github.com/NVIDIA/spark-rapids/pull/590)|Move GpuParquetScan/GpuOrcScan into Shim|
+|[#588](https://github.com/NVIDIA/spark-rapids/pull/588)|xfail the tpch spark 3.1.0 tests that fail|
+|[#572](https://github.com/NVIDIA/spark-rapids/pull/572)|Update buffer store to return compressed batches directly, add compression NVTX ranges|
+|[#558](https://github.com/NVIDIA/spark-rapids/pull/558)|Fix unit tests when AQE is enabled|
+|[#580](https://github.com/NVIDIA/spark-rapids/pull/580)|xfail the Spark 3.1.0 integration tests that fail |
+|[#565](https://github.com/NVIDIA/spark-rapids/pull/565)|Minor improvements to TPC-DS benchmarking code|
+|[#567](https://github.com/NVIDIA/spark-rapids/pull/567)|Explicitly disable AQE in one test|
+|[#571](https://github.com/NVIDIA/spark-rapids/pull/571)|Fix Databricks shim layer for GpuFileSourceScanExec and GpuBroadcastExchangeExec|
+|[#564](https://github.com/NVIDIA/spark-rapids/pull/564)|Add GPU decode time metric to scans|
+|[#562](https://github.com/NVIDIA/spark-rapids/pull/562)|getCatalog can be called from the driver, and can return null|
+|[#555](https://github.com/NVIDIA/spark-rapids/pull/555)|Fix build warnings for ColumnViewAccess|
+|[#560](https://github.com/NVIDIA/spark-rapids/pull/560)|Fix databricks build for AQE support|
+|[#557](https://github.com/NVIDIA/spark-rapids/pull/557)|Fix tests failing on Spark 3.1|
+|[#547](https://github.com/NVIDIA/spark-rapids/pull/547)|Add GPU metrics to GpuFileSourceScanExec|
+|[#462](https://github.com/NVIDIA/spark-rapids/pull/462)|Implement optimized AQE support so that exchanges run on GPU where possible|
+|[#550](https://github.com/NVIDIA/spark-rapids/pull/550)|Document Parquet and ORC compression support|
+|[#539](https://github.com/NVIDIA/spark-rapids/pull/539)|Update script to audit multiple Spark versions|
+|[#543](https://github.com/NVIDIA/spark-rapids/pull/543)|Add metrics to GpuUnion operator|
+|[#549](https://github.com/NVIDIA/spark-rapids/pull/549)|Move spark shim properties to top level pom|
+|[#497](https://github.com/NVIDIA/spark-rapids/pull/497)|Add UDF compiler implementations|
+|[#487](https://github.com/NVIDIA/spark-rapids/pull/487)|Add framework for batch compression of shuffle partitions|
+|[#544](https://github.com/NVIDIA/spark-rapids/pull/544)|Add in driverExtraClassPath for standalone mode docs|
+|[#546](https://github.com/NVIDIA/spark-rapids/pull/546)|Fix Spark 3.1.0 shim build error in GpuHashJoin|
+|[#537](https://github.com/NVIDIA/spark-rapids/pull/537)|Use fresh SparkSession when capturing to avoid late capture of previous query|
+|[#538](https://github.com/NVIDIA/spark-rapids/pull/538)|Revert "Temporary workaround for RMM initial pool size bug (#530)"|
+|[#517](https://github.com/NVIDIA/spark-rapids/pull/517)|Add config to limit maximum RMM pool size|
+|[#527](https://github.com/NVIDIA/spark-rapids/pull/527)|Add support for split and getArrayIndex|
+|[#534](https://github.com/NVIDIA/spark-rapids/pull/534)|Fixes bugs around GpuShuffleEnv initialization|
+|[#529](https://github.com/NVIDIA/spark-rapids/pull/529)|[BUG] Degenerate table metas were not getting copied to the heap|
+|[#530](https://github.com/NVIDIA/spark-rapids/pull/530)|Temporary workaround for RMM initial pool size bug|
+|[#526](https://github.com/NVIDIA/spark-rapids/pull/526)|Fix bug with nullability reporting in GpuFilterExec|
+|[#521](https://github.com/NVIDIA/spark-rapids/pull/521)|Fix typo with databricks shim classname SparkShimServiceProvider|
+|[#522](https://github.com/NVIDIA/spark-rapids/pull/522)|Use SQLConf instead of SparkConf when looking up SQL configs|
+|[#518](https://github.com/NVIDIA/spark-rapids/pull/518)|Fix init order issue in GpuShuffleEnv when RAPIDS shuffle configured|
+|[#514](https://github.com/NVIDIA/spark-rapids/pull/514)|Added clarification of RegExpReplace, DateDiff, made descriptive text consistent|
+|[#506](https://github.com/NVIDIA/spark-rapids/pull/506)|Add in basic support for running tpcds like queries|
+|[#504](https://github.com/NVIDIA/spark-rapids/pull/504)|Add ability to ignore tests depending on spark shim version|
+|[#503](https://github.com/NVIDIA/spark-rapids/pull/503)|Remove unused async buffer spill support|
+|[#501](https://github.com/NVIDIA/spark-rapids/pull/501)|disable codegen in 3.1 shim for hash join|
+|[#466](https://github.com/NVIDIA/spark-rapids/pull/466)|Optimize and fix Api validation script|
+|[#481](https://github.com/NVIDIA/spark-rapids/pull/481)|Codeowners|
+|[#439](https://github.com/NVIDIA/spark-rapids/pull/439)|Check a PR has been committed using git signoff|
+|[#319](https://github.com/NVIDIA/spark-rapids/pull/319)|Update partitioning logic in ShuffledBatchRDD|
+|[#491](https://github.com/NVIDIA/spark-rapids/pull/491)|Temporarily ignore AQE integration tests|
+|[#490](https://github.com/NVIDIA/spark-rapids/pull/490)|Fix Spark 3.1.0 build for HashJoin changes|
+|[#482](https://github.com/NVIDIA/spark-rapids/pull/482)|Prevent bad practice in python tests|
+|[#485](https://github.com/NVIDIA/spark-rapids/pull/485)|Show plan in assertion message if test fails|
+|[#480](https://github.com/NVIDIA/spark-rapids/pull/480)|Fix link from README to getting-started.md|
+|[#448](https://github.com/NVIDIA/spark-rapids/pull/448)|Preliminary support for keeping broadcast exchanges on GPU when AQE is enabled|
+|[#478](https://github.com/NVIDIA/spark-rapids/pull/478)|Fall back to CPU for binary as string in parquet|
+|[#477](https://github.com/NVIDIA/spark-rapids/pull/477)|Fix special case joins in broadcast nested loop join|
+|[#469](https://github.com/NVIDIA/spark-rapids/pull/469)|Update HashAggregateSuite to work with AQE|
+|[#475](https://github.com/NVIDIA/spark-rapids/pull/475)|Udf compiler pom followup|
+|[#434](https://github.com/NVIDIA/spark-rapids/pull/434)|Add UDF compiler skeleton|
+|[#474](https://github.com/NVIDIA/spark-rapids/pull/474)|Re-enable noscaladoc check|
+|[#461](https://github.com/NVIDIA/spark-rapids/pull/461)|Fix comments style to pass scala style check|
+|[#468](https://github.com/NVIDIA/spark-rapids/pull/468)|fix broken link|
+|[#456](https://github.com/NVIDIA/spark-rapids/pull/456)|Add closeOnExcept to clean up code that closes resources only on exceptions|
+|[#464](https://github.com/NVIDIA/spark-rapids/pull/464)|Turn off noscaladoc rule until codebase is fixed|
+|[#449](https://github.com/NVIDIA/spark-rapids/pull/449)|Enforce NoScalaDoc rule in scalastyle checks|
+|[#450](https://github.com/NVIDIA/spark-rapids/pull/450)|Enable scalastyle for shuffle plugin|
+|[#451](https://github.com/NVIDIA/spark-rapids/pull/451)|Databricks remove unneeded files and fix build to not fail on rm when file missing|
+|[#442](https://github.com/NVIDIA/spark-rapids/pull/442)|Shim layer support for Spark 3.0.0 Databricks|
+|[#447](https://github.com/NVIDIA/spark-rapids/pull/447)|Add scalastyle plugin to shim module|
+|[#426](https://github.com/NVIDIA/spark-rapids/pull/426)|Update BufferMeta to support multiple codec buffers per table|
+|[#440](https://github.com/NVIDIA/spark-rapids/pull/440)|Run mortgage test both with AQE on and off|
+|[#445](https://github.com/NVIDIA/spark-rapids/pull/445)|Added in StringRPad and StringLPad|
+|[#422](https://github.com/NVIDIA/spark-rapids/pull/422)|Documentation updates|
+|[#437](https://github.com/NVIDIA/spark-rapids/pull/437)|Fix bug with InSet and Strings|
+|[#435](https://github.com/NVIDIA/spark-rapids/pull/435)|Add in checks for Parquet LEGACY date/time rebase|
+|[#432](https://github.com/NVIDIA/spark-rapids/pull/432)|Fix batch use-after-close in partitioning, shuffle env init|
+|[#423](https://github.com/NVIDIA/spark-rapids/pull/423)|Fix duplicates includes in assembly jar|
+|[#418](https://github.com/NVIDIA/spark-rapids/pull/418)|CI Add unit tests running for Spark 3.0.1|
+|[#421](https://github.com/NVIDIA/spark-rapids/pull/421)|Make it easier to run TPCxBB benchmarks from spark shell|
+|[#413](https://github.com/NVIDIA/spark-rapids/pull/413)|Fix download link|
+|[#414](https://github.com/NVIDIA/spark-rapids/pull/414)|Shim Layer to support multiple Spark versions |
+|[#406](https://github.com/NVIDIA/spark-rapids/pull/406)|Update cast handling to deal with new libcudf casting limitations|
+|[#405](https://github.com/NVIDIA/spark-rapids/pull/405)|Change slave->worker|
+|[#395](https://github.com/NVIDIA/spark-rapids/pull/395)|Databricks doc updates|
+|[#401](https://github.com/NVIDIA/spark-rapids/pull/401)|Extended the FAQ|
+|[#398](https://github.com/NVIDIA/spark-rapids/pull/398)|Add tests for GpuPartition|
+|[#352](https://github.com/NVIDIA/spark-rapids/pull/352)|Change spark tgz package name|
+|[#397](https://github.com/NVIDIA/spark-rapids/pull/397)|Fix small bug in ShuffleBufferCatalog.hasActiveShuffle|
+|[#286](https://github.com/NVIDIA/spark-rapids/pull/286)|[REVIEW] Updated join tests for cache|
+|[#393](https://github.com/NVIDIA/spark-rapids/pull/393)|Contributor license agreement|
+|[#389](https://github.com/NVIDIA/spark-rapids/pull/389)|Added in support for RangeExec|
+|[#390](https://github.com/NVIDIA/spark-rapids/pull/390)|Ucx getting started|
+|[#391](https://github.com/NVIDIA/spark-rapids/pull/391)|Hide slack channel in Jenkins scripts|
+|[#387](https://github.com/NVIDIA/spark-rapids/pull/387)|Remove the term whitelist|
+|[#365](https://github.com/NVIDIA/spark-rapids/pull/365)|[REVIEW] Timesub tests|
+|[#383](https://github.com/NVIDIA/spark-rapids/pull/383)|Test utility to compare SQL query results between CPU and GPU|
+|[#380](https://github.com/NVIDIA/spark-rapids/pull/380)|Fix databricks notebook link|
+|[#378](https://github.com/NVIDIA/spark-rapids/pull/378)|Added in FAQ and fixed spelling|
+|[#377](https://github.com/NVIDIA/spark-rapids/pull/377)|Update heading in configs.md|
+|[#373](https://github.com/NVIDIA/spark-rapids/pull/373)|Modifying branch name to conform with rapidsai branch name change|
+|[#376](https://github.com/NVIDIA/spark-rapids/pull/376)|Add our session extension correctly if there are other extensions configured|
+|[#374](https://github.com/NVIDIA/spark-rapids/pull/374)|Fix rat issue for notebooks|
+|[#364](https://github.com/NVIDIA/spark-rapids/pull/364)|Update Databricks patch for changes to GpuSortMergeJoin|
+|[#371](https://github.com/NVIDIA/spark-rapids/pull/371)|fix typo and use regional bucket per GCP's update|
+|[#359](https://github.com/NVIDIA/spark-rapids/pull/359)|Karthik changes|
+|[#353](https://github.com/NVIDIA/spark-rapids/pull/353)|Fix broadcast nested loop join for the no column case|
+|[#313](https://github.com/NVIDIA/spark-rapids/pull/313)|Additional tests for broadcast hash join|
+|[#342](https://github.com/NVIDIA/spark-rapids/pull/342)|Implement build-side rules for shuffle hash join|
+|[#349](https://github.com/NVIDIA/spark-rapids/pull/349)|Updated join code to treat null equality properly|
+|[#335](https://github.com/NVIDIA/spark-rapids/pull/335)|Integration tests on spark 3.0.1-SNAPSHOT & 3.1.0-SNAPSHOT|
+|[#346](https://github.com/NVIDIA/spark-rapids/pull/346)|Update the Title Header for Fine Tuning|
+|[#344](https://github.com/NVIDIA/spark-rapids/pull/344)|Fix small typo in readme|
+|[#331](https://github.com/NVIDIA/spark-rapids/pull/331)|Adds iterator and client unit tests, and prepares for more fetch failure handling|
+|[#337](https://github.com/NVIDIA/spark-rapids/pull/337)|Fix Scala compile phase to allow Java classes referencing Scala classes|
+|[#332](https://github.com/NVIDIA/spark-rapids/pull/332)|Match GPU overwritten functions with SQL functions from FunctionRegistry|
+|[#339](https://github.com/NVIDIA/spark-rapids/pull/339)|Fix databricks build|
+|[#338](https://github.com/NVIDIA/spark-rapids/pull/338)|Move GpuPartitioning to a separate file|
+|[#310](https://github.com/NVIDIA/spark-rapids/pull/310)|Update release Jenkinsfile for Databricks|
+|[#330](https://github.com/NVIDIA/spark-rapids/pull/330)|Hide private info in Jenkins scripts|
+|[#324](https://github.com/NVIDIA/spark-rapids/pull/324)|Add in basic support for GpuCartesianProductExec|
+|[#328](https://github.com/NVIDIA/spark-rapids/pull/328)|Enable slack notification for Databricks build|
+|[#321](https://github.com/NVIDIA/spark-rapids/pull/321)|update databricks patch for GpuBroadcastNestedLoopJoinExec|
+|[#322](https://github.com/NVIDIA/spark-rapids/pull/322)|Add oss.sonatype.org to download the cudf jar|
+|[#320](https://github.com/NVIDIA/spark-rapids/pull/320)|Don't mount passwd/group to the container|
+|[#258](https://github.com/NVIDIA/spark-rapids/pull/258)|Enable running TPCH tests with AQE enabled|
+|[#318](https://github.com/NVIDIA/spark-rapids/pull/318)|Build docker image with Dockerfile|
+|[#309](https://github.com/NVIDIA/spark-rapids/pull/309)|Update databricks patch to latest changes|
+|[#312](https://github.com/NVIDIA/spark-rapids/pull/312)|Trigger branch-0.2 integration test|
+|[#307](https://github.com/NVIDIA/spark-rapids/pull/307)|[Jenkins] Update the release script and Jenkinsfile|
+|[#304](https://github.com/NVIDIA/spark-rapids/pull/304)|[DOC][Minor] Fix typo in spark config name.|
+|[#303](https://github.com/NVIDIA/spark-rapids/pull/303)|Update compatibility doc for -0.0 issues|
+|[#301](https://github.com/NVIDIA/spark-rapids/pull/301)|Add info about branches in README.md|
+|[#296](https://github.com/NVIDIA/spark-rapids/pull/296)|Added in basic support for broadcast nested loop join|
+|[#297](https://github.com/NVIDIA/spark-rapids/pull/297)|Databricks CI improvements and support runtime env parameter to xfail certain tests|
+|[#292](https://github.com/NVIDIA/spark-rapids/pull/292)|Move artifacts version in version-def.sh|
+|[#254](https://github.com/NVIDIA/spark-rapids/pull/254)|Cleanup QA tests|
+|[#289](https://github.com/NVIDIA/spark-rapids/pull/289)|Clean up GpuCollectLimitMeta and add in metrics|
+|[#287](https://github.com/NVIDIA/spark-rapids/pull/287)|Add in support for right join and fix issues build right|
+|[#273](https://github.com/NVIDIA/spark-rapids/pull/273)|Added releases to the README.md|
+|[#285](https://github.com/NVIDIA/spark-rapids/pull/285)|modify run_pyspark_from_build.sh to be bash 3 friendly|
+|[#281](https://github.com/NVIDIA/spark-rapids/pull/281)|Add in support for Full Outer Join on non-null keys|
+|[#274](https://github.com/NVIDIA/spark-rapids/pull/274)|Add RapidsDiskStore tests|
+|[#259](https://github.com/NVIDIA/spark-rapids/pull/259)|Add RapidsHostMemoryStore tests|
+|[#282](https://github.com/NVIDIA/spark-rapids/pull/282)|Update Databricks patch for 0.2 branch|
+|[#261](https://github.com/NVIDIA/spark-rapids/pull/261)|Add conditional xfail test for DISTINCT aggregates with NaN|
+|[#263](https://github.com/NVIDIA/spark-rapids/pull/263)|More time ops|
+|[#256](https://github.com/NVIDIA/spark-rapids/pull/256)|Remove special cases for contains, startsWith, and endWith|
+|[#253](https://github.com/NVIDIA/spark-rapids/pull/253)|Remove GpuAttributeReference and GpuSortOrder|
+|[#271](https://github.com/NVIDIA/spark-rapids/pull/271)|Update the versions for 0.2.0 properly for the databricks build|
+|[#162](https://github.com/NVIDIA/spark-rapids/pull/162)|Integration tests for corner cases in window functions.|
+|[#264](https://github.com/NVIDIA/spark-rapids/pull/264)|Add a local mvn repo for nightly pipeline|
+|[#262](https://github.com/NVIDIA/spark-rapids/pull/262)|Refer to branch-0.2|
+|[#255](https://github.com/NVIDIA/spark-rapids/pull/255)|Revert change to make dependencies of shaded jar optional|
+|[#257](https://github.com/NVIDIA/spark-rapids/pull/257)|Fix link to RAPIDS cudf in index.md|
+|[#252](https://github.com/NVIDIA/spark-rapids/pull/252)|Update to 0.2.0-SNAPSHOT and cudf-0.15-SNAPSHOT|
+
+## Release 0.1
+
+### Features
+|||
+|:---|:---|
+|[#74](https://github.com/NVIDIA/spark-rapids/issues/74)|[FEA] Support ToUnixTimestamp|
+|[#21](https://github.com/NVIDIA/spark-rapids/issues/21)|[FEA] NormalizeNansAndZeros|
+|[#105](https://github.com/NVIDIA/spark-rapids/issues/105)|[FEA] integration tests for equi-joins|
+
+### Bugs Fixed
+|||
+|:---|:---|
+|[#116](https://github.com/NVIDIA/spark-rapids/issues/116)|[BUG] calling replace with a NULL throws an exception|
+|[#168](https://github.com/NVIDIA/spark-rapids/issues/168)|[BUG] GpuUnitTests Date tests leak column vectors|
+|[#209](https://github.com/NVIDIA/spark-rapids/issues/209)|[BUG] Developers section in pom need to be updated|
+|[#204](https://github.com/NVIDIA/spark-rapids/issues/204)|[BUG] Code coverage docs are out of date|
+|[#154](https://github.com/NVIDIA/spark-rapids/issues/154)|[BUG] Incorrect output from partial-only averages with nulls|
+|[#61](https://github.com/NVIDIA/spark-rapids/issues/61)|[BUG] Cannot disable Parquet, ORC, CSV reading when using FileSourceScanExec|
+
+### PRs
+|||
+|:---|:---|
+|[#249](https://github.com/NVIDIA/spark-rapids/pull/249)|Compatability -> Compatibility|
+|[#247](https://github.com/NVIDIA/spark-rapids/pull/247)|Add index.md for default doc page, fix table formatting for configs|
+|[#241](https://github.com/NVIDIA/spark-rapids/pull/241)|Let default branch to master per the release rule|
+|[#177](https://github.com/NVIDIA/spark-rapids/pull/177)|Fixed leaks in unit test and use ColumnarBatch for testing|
+|[#243](https://github.com/NVIDIA/spark-rapids/pull/243)|Jenkins file for Databricks release|
+|[#225](https://github.com/NVIDIA/spark-rapids/pull/225)|Make internal project dependencies optional for shaded artifact|
+|[#242](https://github.com/NVIDIA/spark-rapids/pull/242)|Add site pages|
+|[#221](https://github.com/NVIDIA/spark-rapids/pull/221)|Databricks Build Support|
+|[#215](https://github.com/NVIDIA/spark-rapids/pull/215)|Remove CudfColumnVector|
+|[#213](https://github.com/NVIDIA/spark-rapids/pull/213)|Add RapidsDeviceMemoryStore tests|
+|[#214](https://github.com/NVIDIA/spark-rapids/pull/214)|[REVIEW] Test failure to pass Attribute as GpuAttribute|
+|[#211](https://github.com/NVIDIA/spark-rapids/pull/211)|Add project leads to pom developer list|
+|[#210](https://github.com/NVIDIA/spark-rapids/pull/210)|Updated coverage docs|
+|[#195](https://github.com/NVIDIA/spark-rapids/pull/195)|Support public release for plugin jar|
+|[#208](https://github.com/NVIDIA/spark-rapids/pull/208)|Remove unneeded comment from pom.xml|
+|[#191](https://github.com/NVIDIA/spark-rapids/pull/191)|WindowExec handle different spark distributions|
+|[#181](https://github.com/NVIDIA/spark-rapids/pull/181)|Remove INCOMPAT for NormalizeNanAndZero, KnownFloatingPointNormalized|
+|[#196](https://github.com/NVIDIA/spark-rapids/pull/196)|Update Spark dependency to the released 3.0.0 artifacts|
+|[#206](https://github.com/NVIDIA/spark-rapids/pull/206)|Change groupID to 'com.nvidia' in IT scripts|
+|[#202](https://github.com/NVIDIA/spark-rapids/pull/202)|Fixed issue for contains when searching for an empty string|
+|[#201](https://github.com/NVIDIA/spark-rapids/pull/201)|Fix name of scan|
+|[#200](https://github.com/NVIDIA/spark-rapids/pull/200)|Fix issue with GpuAttributeReference not overriding references|
+|[#197](https://github.com/NVIDIA/spark-rapids/pull/197)|Fix metrics for writes|
+|[#186](https://github.com/NVIDIA/spark-rapids/pull/186)|Fixed issue with nullability on concat|
+|[#193](https://github.com/NVIDIA/spark-rapids/pull/193)|Add RapidsBufferCatalog tests|
+|[#188](https://github.com/NVIDIA/spark-rapids/pull/188)|rebrand to com.nvidia instead of ai.rapids|
+|[#189](https://github.com/NVIDIA/spark-rapids/pull/189)|Handle AggregateExpression having resultIds parameter instead of a single resultId|
+|[#190](https://github.com/NVIDIA/spark-rapids/pull/190)|FileSourceScanExec can have logicalRelation parameter on some distributions|
+|[#185](https://github.com/NVIDIA/spark-rapids/pull/185)|Update type of parameter of GpuExpandExec to make it consistent|
+|[#172](https://github.com/NVIDIA/spark-rapids/pull/172)|Merge qa test to integration test|
+|[#180](https://github.com/NVIDIA/spark-rapids/pull/180)|Add MetaUtils unit tests|
+|[#171](https://github.com/NVIDIA/spark-rapids/pull/171)|Cleanup scaladoc warnings about missing links|
+|[#176](https://github.com/NVIDIA/spark-rapids/pull/176)|Updated join tests to cover more data.|
+|[#169](https://github.com/NVIDIA/spark-rapids/pull/169)|Remove dependency on shaded Spark artifact|
+|[#174](https://github.com/NVIDIA/spark-rapids/pull/174)|Added in fallback tests|
+|[#165](https://github.com/NVIDIA/spark-rapids/pull/165)|Move input metadata tests to pyspark|
+|[#173](https://github.com/NVIDIA/spark-rapids/pull/173)|Fix setting local mode for tests|
+|[#160](https://github.com/NVIDIA/spark-rapids/pull/160)|Integration tests for normalizing NaN/zeroes.|
+|[#163](https://github.com/NVIDIA/spark-rapids/pull/163)|Ignore the order locally for repartition tests|
+|[#157](https://github.com/NVIDIA/spark-rapids/pull/157)|Add partial and final only hash aggregate tests and fix nulls corner case for Average|
+|[#159](https://github.com/NVIDIA/spark-rapids/pull/159)|Add integration tests for joins|
+|[#158](https://github.com/NVIDIA/spark-rapids/pull/158)|Orc merge schema fallback and FileScan format configs|
+|[#164](https://github.com/NVIDIA/spark-rapids/pull/164)|Fix compiler warnings|
+|[#152](https://github.com/NVIDIA/spark-rapids/pull/152)|Moved cudf to 0.14 for CI|
+|[#151](https://github.com/NVIDIA/spark-rapids/pull/151)|Switch CICD pipelines to Github|
+
+## Older Releases
+Changelog of older releases can be found at [docs/archives](/docs/archives)
diff --git a/docs/configs.md b/docs/configs.md
index d79514badf6..6fd15ac25b7 100644
--- a/docs/configs.md
+++ b/docs/configs.md
@@ -164,6 +164,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
spark.rapids.sql.expression.Atanh|`atanh`|Inverse hyperbolic tangent|true|None|
spark.rapids.sql.expression.AttributeReference| |References an input column|true|None|
spark.rapids.sql.expression.BRound|`bround`|Round an expression to d decimal places using HALF_EVEN rounding mode|true|None|
+spark.rapids.sql.expression.BitLength|`bit_length`|The bit length of string data|true|None|
spark.rapids.sql.expression.BitwiseAnd|`&`|Returns the bitwise AND of the operands|true|None|
spark.rapids.sql.expression.BitwiseNot|`~`|Returns the bitwise NOT of the operands|true|None|
spark.rapids.sql.expression.BitwiseOr|`\|`|Returns the bitwise OR of the operands|true|None|
@@ -255,6 +256,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
spark.rapids.sql.expression.NaNvl|`nanvl`|Evaluates to `left` iff left is not NaN, `right` otherwise|true|None|
spark.rapids.sql.expression.NamedLambdaVariable| |A parameter to a higher order SQL function|true|None|
spark.rapids.sql.expression.Not|`!`, `not`|Boolean not operator|true|None|
+spark.rapids.sql.expression.OctetLength|`octet_length`|The byte length of string data|true|None|
spark.rapids.sql.expression.Or|`or`|Logical OR|true|None|
spark.rapids.sql.expression.Pmod|`pmod`|Pmod|true|None|
spark.rapids.sql.expression.PosExplode|`posexplode_outer`, `posexplode`|Given an input array produces a sequence of rows for each value in the array|true|None|
diff --git a/docs/download.md b/docs/download.md
index a16a774d18e..7cb561a5ae6 100644
--- a/docs/download.md
+++ b/docs/download.md
@@ -18,6 +18,69 @@ cuDF jar, that is either preinstalled in the Spark classpath on all nodes or sub
that uses the RAPIDS Accelerator For Apache Spark. See the [getting-started
guide](https://nvidia.github.io/spark-rapids/Getting-Started/) for more details.
+## Release v22.02.0
+Hardware Requirements:
+
+The plugin is tested on the following architectures:
+
+    GPU Models: NVIDIA V100, T4 and A2/A10/A30/A100 GPUs
+
+Software Requirements:
+
+    OS: Ubuntu 18.04, Ubuntu 20.04 or CentOS 7, CentOS 8
+
+    CUDA & NVIDIA Drivers*: 11.x & v450.80.02+
+
+    Apache Spark 3.0.1, 3.0.2, 3.0.3, 3.1.1, 3.1.2, 3.2.0, 3.2.1, Cloudera CDP 7.1.6, 7.1.7, Databricks 7.3 ML LTS or 9.1 ML LTS Runtime and GCP Dataproc 2.0
+
+    Python 3.6+, Scala 2.12, Java 8
+
+*Some hardware may have a minimum driver version greater than v450.80.02+. Check the GPU spec sheet
+for your hardware's minimum driver version.
+
+### Download v22.02.0
+* Download the [RAPIDS
+ Accelerator for Apache Spark 22.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.02.0/rapids-4-spark_2.12-22.02.0.jar)
+* Download the [RAPIDS cuDF 22.02.0 jar](https://repo1.maven.org/maven2/ai/rapids/cudf/22.02.0/cudf-22.02.0-cuda11.jar)
+
+This package is built against CUDA 11.5 and has [CUDA forward
+compatibility](https://docs.nvidia.com/deploy/cuda-compatibility/index.html) enabled. It is tested
+on V100, T4, A2, A10, A30 and A100 GPUs with CUDA 11.0-11.5. For those using other types of GPUs which
+do not have CUDA forward compatibility (for example, GeForce), CUDA 11.5 is required. Users will
+need to ensure the minimum driver (450.80.02) and CUDA toolkit are installed on each Spark node.
+
+### Verify signature
+* Download the [RAPIDS Accelerator for Apache Spark 22.02.0 jar](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.02.0/rapids-4-spark_2.12-22.02.0.jar)
+ and [RAPIDS Accelerator for Apache Spark 22.02.0 jars.asc](https://repo1.maven.org/maven2/com/nvidia/rapids-4-spark_2.12/22.02.0/rapids-4-spark_2.12-22.02.0.jar.asc)
+* Download the [PUB_KEY](https://keys.openpgp.org/search?q=sw-spark@nvidia.com).
+* Import the public key: `gpg --import PUB_KEY`
+* Verify the signature: `gpg --verify rapids-4-spark_2.12-22.02.0.jar.asc rapids-4-spark_2.12-22.02.0.jar`
+
+The output of a successful signature verification is:
+
+    gpg: Good signature from "NVIDIA Spark (For the signature of spark-rapids release jars) <sw-spark@nvidia.com>"
+
+### Release Notes
+New functionality and performance improvements for this release include:
+* Parquet reader and writer support for decimal precision up to 38 digits (128-bits)
+* Decimal 128-bits casting
+ * Casting of decimal 128-bits values in nested types
+ * Casting to String from decimal 128-bits
+ * Casting from String to decimal 128-bits
+* MIG on YARN support
+* GPU explain-only mode for Spark 3.x and 2.x (see the sketch after this list)
+* JSON reader support
+* Sequence function support
+* regexp_extract function support
+* Min and max on single-level struct
+* CreateMap updates and enable CreateMap by default
+* Cast from array to string
+* Add regular expression support to regexp_replace function
+* Support for conditional joins using libcudf's mixed join feature
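+
+As a quick illustration (not part of the shipped docs), below is a minimal PySpark sketch of the new explain-only mode together with the new `regexp_extract` support. The config name `spark.rapids.sql.mode=explainOnly` is assumed here (see the configs documentation for the authoritative name), and the RAPIDS Accelerator and cuDF jars must already be on the classpath:
+
+```python
+from pyspark.sql import SparkSession
+
+# Explain-only mode: the query still executes on the CPU, but the plugin reports
+# which operators would have been placed on the GPU.
+spark = (SparkSession.builder
+         .appName("rapids-explain-only-sketch")
+         .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
+         .config("spark.rapids.sql.mode", "explainOnly")  # assumed config name
+         .getOrCreate())
+
+# regexp_extract is newly supported on the GPU in this release.
+df = spark.createDataFrame([("a1b22c333",)], ["s"])
+df.selectExpr("regexp_extract(s, '([0-9]+)', 1) AS first_digits").explain()
+```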
+
+For a detailed list of changes, please refer to the
+[CHANGELOG](https://github.com/NVIDIA/spark-rapids/blob/main/CHANGELOG.md).
+
## Release v21.12.0
Hardware Requirements:
diff --git a/docs/supported_ops.md b/docs/supported_ops.md
index 52d7fc28468..2a6535befa0 100644
--- a/docs/supported_ops.md
+++ b/docs/supported_ops.md
[docs/supported_ops.md hunks: the HTML table markup was lost during extraction; only the cell text survived. The recoverable changes are the addition of support-matrix rows for `BitLength` (`bit_length`, "The bit length of string data") and `OctetLength` (`octet_length`, "The byte length of string data") in the project context (STRING input supported, BINARY not supported, INT result supported), plus repositioning of the repeated column-header rows (Expression, SQL Functions(s), Description, Notes, Context, Param/Output, and the per-type columns BOOLEAN through UDT) to account for the new entries.]
diff --git a/integration_tests/src/main/python/csv_test.py b/integration_tests/src/main/python/csv_test.py
index 0aa1cb50b4d..9101ef56760 100644
--- a/integration_tests/src/main/python/csv_test.py
+++ b/integration_tests/src/main/python/csv_test.py
@@ -14,13 +14,13 @@
import pytest
-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_fallback_write
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_gpu_fallback_write, assert_cpu_and_gpu_are_equal_collect_with_capture
from conftest import get_non_gpu_allowed
from datetime import datetime, timezone
from data_gen import *
from marks import *
from pyspark.sql.types import *
-from spark_session import with_cpu_session
+from spark_session import with_cpu_session, is_before_spark_330
_acq_schema = StructType([
StructField('loan_id', LongType()),
@@ -405,3 +405,23 @@ def test_csv_save_as_table_fallback(spark_tmp_path, spark_tmp_table_factory):
lambda spark, path: spark.read.csv(path),
data_path,
'DataWritingCommandExec')
+
+@pytest.mark.skipif(is_before_spark_330(), reason='Hidden file metadata columns are a new feature of Spark 330')
+@allow_non_gpu(any = True)
+@pytest.mark.parametrize('metadata_column', ["file_path", "file_name", "file_size", "file_modification_time"])
+def test_csv_scan_with_hidden_metadata_fallback(spark_tmp_path, metadata_column):
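+    # Hidden _metadata columns are new in Spark 3.3.0 and are not yet supported on the GPU, so
+    # this scan is expected to fall back to the CPU FileSourceScanExec instead of GpuBatchScanExec.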
+ data_path = spark_tmp_path + "/hidden_metadata.csv"
+ with_cpu_session(lambda spark : spark.range(10) \
+ .selectExpr("id") \
+ .write \
+ .mode("overwrite") \
+ .csv(data_path))
+
+ def do_csv_scan(spark):
+ df = spark.read.csv(data_path).selectExpr("_c0", "_metadata.{}".format(metadata_column))
+ return df
+
+ assert_cpu_and_gpu_are_equal_collect_with_capture(
+ do_csv_scan,
+ exist_classes= "FileSourceScanExec",
+ non_exist_classes= "GpuBatchScanExec")
\ No newline at end of file
diff --git a/integration_tests/src/main/python/join_test.py b/integration_tests/src/main/python/join_test.py
index 94289193485..e8414230b1b 100644
--- a/integration_tests/src/main/python/join_test.py
+++ b/integration_tests/src/main/python/join_test.py
@@ -15,7 +15,7 @@
import pytest
from pyspark.sql.functions import broadcast
from pyspark.sql.types import *
-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture
from conftest import is_databricks_runtime, is_emr_runtime
from data_gen import *
from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan
@@ -361,7 +361,7 @@ def do_join(spark):
def test_right_broadcast_nested_loop_join_with_ast_condition(data_gen, join_type, batch_size):
def do_join(spark):
left, right = create_df(spark, data_gen, 50, 25)
- # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294
+ # This test is impacted by https://github.com/NVIDIA/spark-rapids/issues/294
# if the sizes are large enough to have both 0.0 and -0.0 show up 500 and 250
# but these take a long time to verify so we run with smaller numbers by default
# that do not expose the error
@@ -651,7 +651,7 @@ def do_join(spark):
if (cache_side == 'cache_left'):
# Try to force the shuffle to be split between CPU and GPU for the join
- # by default if the operation after the shuffle is not on the GPU then
+ # by default if the operation after the shuffle is not on the GPU then
# don't do a GPU shuffle, so do something simple after the repartition
# to make sure that the GPU shuffle is used.
left = left.repartition('a').selectExpr('b + 1 as b', 'a').cache()
@@ -659,7 +659,7 @@ def do_join(spark):
else:
#cache_right
# Try to force the shuffle to be split between CPU and GPU for the join
- # by default if the operation after the shuffle is not on the GPU then
+ # by default if the operation after the shuffle is not on the GPU then
# don't do a GPU shuffle, so do something simple after the repartition
# to make sure that the GPU shuffle is used.
right = right.repartition('r_a').selectExpr('c + 1 as c', 'r_a').cache()
@@ -785,3 +785,37 @@ def do_join(spark):
return spark.sql("select a.* from {} a, {} b where a.name=b.name".format(
resultdf_name, resultdf_name))
assert_gpu_and_cpu_are_equal_collect(do_join)
+
+# ExistenceJoin occurs in the context of existential subqueries (which are normally rewritten to a
+# SemiJoin) when there is an additional condition that may qualify left records even though they
+# have no join partner records on the right.
+#
+# In that case the query is rewritten roughly as a LeftOuter join with an additional Boolean column
+# "exists" added, which feeds into a filter "exists OR someOtherPredicate".
+# If the condition is something like an AND, the result is a subset of a SemiJoin, and
+# the optimizer won't use ExistenceJoin.
+@ignore_order(local=True)
+@pytest.mark.parametrize(
+ "allowFallback", [
+ pytest.param('true',
+ marks=pytest.mark.allow_non_gpu('SortMergeJoinExec')),
+ pytest.param('false',
+ marks=pytest.mark.xfail(reason="https://github.com/NVIDIA/spark-rapids/issues/589"))
+ ], ids=idfn
+)
+def test_existence_join(allowFallback, spark_tmp_table_factory):
+ leftTable = spark_tmp_table_factory.get()
+ rightTable = spark_tmp_table_factory.get()
+ def do_join(spark):
+ # create non-overlapping ranges to have a mix of exists=true and exists=false
+ spark.createDataFrame([v] for v in range(2, 10)).createOrReplaceTempView(leftTable)
+ spark.createDataFrame([v] for v in range(0, 8)).createOrReplaceTempView(rightTable)
+ res = spark.sql((
+ "select * "
+ "from {} as l "
+ "where l._1 < 0 "
+ " OR l._1 in (select * from {} as r)"
+ ).format(leftTable, rightTable))
+ return res
+ assert_cpu_and_gpu_are_equal_collect_with_capture(do_join, r".+Join ExistenceJoin\(exists#[0-9]+\).+")
+
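To make the plan shape that `test_existence_join` asserts on more concrete, here is a minimal Scala sketch of the same rewrite (illustrative only: the view names and value ranges are assumptions, not part of the patch):

```scala
import org.apache.spark.sql.SparkSession

object ExistenceJoinSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("existence-join-sketch").getOrCreate()
  import spark.implicits._

  // Non-overlapping ranges give a mix of exists=true and exists=false rows.
  (2 until 10).toDF("v").createOrReplaceTempView("l")
  (0 until 8).toDF("v").createOrReplaceTempView("r")

  // The extra OR branch keeps Catalyst from rewriting the IN-subquery to a plain
  // left semi join, so it plans an ExistenceJoin that appends a Boolean `exists`
  // column which the surviving filter consumes.
  val df = spark.sql("SELECT * FROM l WHERE l.v < 0 OR l.v IN (SELECT v FROM r)")
  df.explain() // expect something like `... Join ExistenceJoin(exists#...), ...` in the plan
  spark.stop()
}
```

The `l.v < 0 OR ...` disjunction is the piece that prevents a plain semi-join rewrite, matching the regular expression the test captures against.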
diff --git a/integration_tests/src/main/python/map_test.py b/integration_tests/src/main/python/map_test.py
index 72aea6fbd2e..dae5c408cdd 100644
--- a/integration_tests/src/main/python/map_test.py
+++ b/integration_tests/src/main/python/map_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -17,7 +17,7 @@
from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_and_cpu_error, assert_gpu_fallback_collect
from data_gen import *
from marks import incompat, allow_non_gpu
-from spark_session import is_before_spark_311
+from spark_session import is_before_spark_311, is_before_spark_330
from pyspark.sql.types import *
from pyspark.sql.types import IntegralType
import pyspark.sql.functions as f
@@ -145,12 +145,13 @@ def test_map_scalar_project():
@pytest.mark.skipif(is_before_spark_311(), reason="Only in Spark 3.1.1 + ANSI mode, map key throws on no such element")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
def test_simple_get_map_value_ansi_fail(data_gen):
+ message = "org.apache.spark.SparkNoSuchElementException" if not is_before_spark_330() else "java.util.NoSuchElementException"
assert_gpu_and_cpu_error(
lambda spark: unary_op_df(spark, data_gen).selectExpr(
'a["NOT_FOUND"]').collect(),
conf={'spark.sql.ansi.enabled':True,
'spark.sql.legacy.allowNegativeScaleOfDecimal': True},
- error_message='java.util.NoSuchElementException')
+ error_message=message)
@pytest.mark.skipif(not is_before_spark_311(), reason="For Spark before 3.1.1 + ANSI mode, null will be returned instead of an exception if key is not found")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
@@ -176,12 +177,13 @@ def test_simple_element_at_map(data_gen):
@pytest.mark.skipif(is_before_spark_311(), reason="Only in Spark 3.1.1 + ANSI mode, map key throws on no such element")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
def test_map_element_at_ansi_fail(data_gen):
+ message = "org.apache.spark.SparkNoSuchElementException" if not is_before_spark_330() else "java.util.NoSuchElementException"
assert_gpu_and_cpu_error(
lambda spark: unary_op_df(spark, data_gen).selectExpr(
'element_at(a, "NOT_FOUND")').collect(),
conf={'spark.sql.ansi.enabled':True,
'spark.sql.legacy.allowNegativeScaleOfDecimal': True},
- error_message='java.util.NoSuchElementException')
+ error_message=message)
@pytest.mark.skipif(not is_before_spark_311(), reason="For Spark before 3.1.1 + ANSI mode, null will be returned instead of an exception if key is not found")
@pytest.mark.parametrize('data_gen', [simple_string_to_string_map_gen], ids=idfn)
diff --git a/integration_tests/src/main/python/orc_test.py b/integration_tests/src/main/python/orc_test.py
index e644e32a180..ba5fca0711c 100644
--- a/integration_tests/src/main/python/orc_test.py
+++ b/integration_tests/src/main/python/orc_test.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -14,11 +14,11 @@
import pytest
-from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect
+from asserts import assert_gpu_and_cpu_are_equal_collect, assert_gpu_fallback_collect, assert_cpu_and_gpu_are_equal_collect_with_capture
from data_gen import *
from marks import *
from pyspark.sql.types import *
-from spark_session import with_cpu_session
+from spark_session import with_cpu_session, is_before_spark_330
from parquet_test import _nested_pruning_schemas
pytestmark = pytest.mark.nightly_resource_consuming_test
@@ -444,3 +444,24 @@ def test_read_with_more_columns(spark_tmp_path, orc_gen, reader_confs, v1_enable
assert_gpu_and_cpu_are_equal_collect(
lambda spark : spark.read.schema(rs).orc(data_path),
conf=all_confs)
+
+@pytest.mark.skipif(is_before_spark_330(), reason='Hidden file metadata columns are a new feature of Spark 330')
+@allow_non_gpu(any = True)
+@pytest.mark.parametrize('metadata_column', ["file_path", "file_name", "file_size", "file_modification_time"])
+def test_orc_scan_with_hidden_metadata_fallback(spark_tmp_path, metadata_column):
+ data_path = spark_tmp_path + "/hidden_metadata.orc"
+ with_cpu_session(lambda spark : spark.range(10) \
+ .selectExpr("id", "id % 3 as p") \
+ .write \
+ .partitionBy("p") \
+ .mode("overwrite") \
+ .orc(data_path))
+
+ def do_orc_scan(spark):
+ df = spark.read.orc(data_path).selectExpr("id", "_metadata.{}".format(metadata_column))
+ return df
+
+ assert_cpu_and_gpu_are_equal_collect_with_capture(
+ do_orc_scan,
+ exist_classes= "FileSourceScanExec",
+ non_exist_classes= "GpuBatchScanExec")
\ No newline at end of file
diff --git a/integration_tests/src/main/python/parquet_test.py b/integration_tests/src/main/python/parquet_test.py
index 516cef3e7a5..7d1d9e1a6b3 100644
--- a/integration_tests/src/main/python/parquet_test.py
+++ b/integration_tests/src/main/python/parquet_test.py
@@ -18,6 +18,7 @@
from data_gen import *
from marks import *
from pyspark.sql.types import *
+from pyspark.sql.functions import *
from spark_session import with_cpu_session, with_gpu_session, is_before_spark_330
def read_parquet_df(data_path):
@@ -728,3 +729,24 @@ def do_parquet_scan(spark):
exist_classes= "BatchScanExec",
non_exist_classes= "GpuBatchScanExec",
conf = conf_for_parquet_aggregate_pushdown)
+
+@pytest.mark.skipif(is_before_spark_330(), reason='Hidden file metadata columns are a new feature of Spark 330')
+@allow_non_gpu(any = True)
+@pytest.mark.parametrize('metadata_column', ["file_path", "file_name", "file_size", "file_modification_time"])
+def test_parquet_scan_with_hidden_metadata_fallback(spark_tmp_path, metadata_column):
+ data_path = spark_tmp_path + "/hidden_metadata.parquet"
+ with_cpu_session(lambda spark : spark.range(10) \
+ .selectExpr("id", "id % 3 as p") \
+ .write \
+ .partitionBy("p") \
+ .mode("overwrite") \
+ .parquet(data_path))
+
+ def do_parquet_scan(spark):
+ df = spark.read.parquet(data_path).selectExpr("id", "_metadata.{}".format(metadata_column))
+ return df
+
+ assert_cpu_and_gpu_are_equal_collect_with_capture(
+ do_parquet_scan,
+ exist_classes= "FileSourceScanExec",
+ non_exist_classes= "GpuBatchScanExec")
\ No newline at end of file
diff --git a/integration_tests/src/main/python/string_test.py b/integration_tests/src/main/python/string_test.py
index 296a1bad26c..857ee4295c2 100644
--- a/integration_tests/src/main/python/string_test.py
+++ b/integration_tests/src/main/python/string_test.py
@@ -401,6 +401,12 @@ def test_length():
'CHAR_LENGTH(a)',
'CHARACTER_LENGTH(a)'))
+def test_byte_length():
+ gen = mk_str_gen('.{0,5}TEST[\ud720 A]{0,5}')
+ assert_gpu_and_cpu_are_equal_collect(
+ lambda spark: unary_op_df(spark, gen).selectExpr(
+ 'BIT_LENGTH(a)', 'OCTET_LENGTH(a)'))
+
@incompat
def test_initcap():
# Because we don't use the same unicode version we need to limit
@@ -712,4 +718,4 @@ def test_rlike_fallback_possessive_quantifier():
lambda spark: unary_op_df(spark, gen).selectExpr(
'a rlike "a*+"'),
'RLike',
- conf={'spark.rapids.sql.expression.RLike': 'true'})
\ No newline at end of file
+ conf={'spark.rapids.sql.expression.RLike': 'true'})
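For reference, the `BIT_LENGTH`/`OCTET_LENGTH` semantics that `test_byte_length` above exercises are Spark's usual byte-oriented ones; a small Scala sketch with assumed example values:

```scala
import org.apache.spark.sql.SparkSession

object ByteLengthSketch extends App {
  val spark = SparkSession.builder().master("local[*]").appName("byte-length-sketch").getOrCreate()
  import spark.implicits._

  // BIT_LENGTH and OCTET_LENGTH count bits/bytes of the UTF-8 encoding, so
  // multi-byte characters diverge from CHAR_LENGTH.
  val df = Seq("abc", "é").toDF("a")
  df.selectExpr("a", "BIT_LENGTH(a)", "OCTET_LENGTH(a)", "CHAR_LENGTH(a)").show()
  // "abc" -> 24 bits, 3 bytes, 3 chars; "é" -> 16 bits, 2 bytes, 1 char
  spark.stop()
}
```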
diff --git a/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala b/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala
index a9263c2ed36..145905df727 100644
--- a/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala
+++ b/integration_tests/src/test/scala/com/nvidia/spark/rapids/tests/mortgage/MortgageSparkSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION. All rights reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -17,16 +17,21 @@
package com.nvidia.spark.rapids.tests.mortgage
import com.nvidia.spark.rapids.ShimLoader
-import org.scalatest.{BeforeAndAfterAll, FunSuite}
+import org.scalatest.FunSuite
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions._
-class MortgageSparkSuite extends FunSuite with BeforeAndAfterAll {
+class MortgageSparkSuite extends FunSuite {
/**
* This is intentionally a def rather than a val so that scalatest uses the correct value (from
* this class or the derived class) when registering tests.
+ *
+ * @note You are likely to see device/host leaks from this test when using the
+   *   RAPIDS Shuffle Manager. The reason is a race between cuDF's MemoryCleaner
+   *   and the SparkContext shutdown. Because of this, cached shuffle buffers may not get
+   *   cleaned (on shuffle unregister) when the MemoryCleaner exits.
*/
def adaptiveQueryEnabled = false
@@ -63,11 +68,6 @@ class MortgageSparkSuite extends FunSuite with BeforeAndAfterAll {
builder.getOrCreate()
}
- // Close the session to avoid hanging after all cases are completed
- override def afterAll() = {
- session.close()
- }
-
test("extract mortgage data") {
val df = Run.csv(
session,
diff --git a/jenkins/databricks/build.sh b/jenkins/databricks/build.sh
index 261dd6bec0a..ef8521aea95 100755
--- a/jenkins/databricks/build.sh
+++ b/jenkins/databricks/build.sh
@@ -1,6 +1,6 @@
#!/bin/bash
#
-# Copyright (c) 2020-2021, NVIDIA CORPORATION. All rights reserved.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -87,11 +87,19 @@ then
PARQUETHADOOPJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-hadoop--org.apache.parquet__parquet-hadoop__1.10.1-databricks9.jar
PARQUETCOMMONJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-common--org.apache.parquet__parquet-common__1.10.1-databricks9.jar
PARQUETCOLUMNJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-column--org.apache.parquet__parquet-column__1.10.1-databricks9.jar
+ ORC_CORE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-core--org.apache.orc__orc-core__1.5.12.jar
+ ORC_SHIM_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-shims--org.apache.orc__orc-shims__1.5.12.jar
+ ORC_MAPREDUCE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-mapreduce--org.apache.orc__orc-mapreduce__1.5.12.jar
else
PARQUETHADOOPJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-hadoop--org.apache.parquet__parquet-hadoop__1.10.1-databricks6.jar
PARQUETCOMMONJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-common--org.apache.parquet__parquet-common__1.10.1-databricks6.jar
PARQUETCOLUMNJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-column--org.apache.parquet__parquet-column__1.10.1-databricks6.jar
+ ORC_CORE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-core--org.apache.orc__orc-core__1.5.10.jar
+ ORC_SHIM_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-shims--org.apache.orc__orc-shims__1.5.10.jar
+ ORC_MAPREDUCE_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.orc--orc-mapreduce--org.apache.orc__orc-mapreduce__1.5.10.jar
fi
+
+PROTOBUF_JAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--com.google.protobuf--protobuf-java--com.google.protobuf__protobuf-java__2.6.1.jar
PARQUETFORMATJAR=----workspace_${SPARK_MAJOR_VERSION_STRING}--maven-trees--hive-2.3__hadoop-2.7--org.apache.parquet--parquet-format--org.apache.parquet__parquet-format__2.4.0.jar
NETWORKCOMMON=----workspace_${SPARK_MAJOR_VERSION_STRING}--common--network-common--network-common-hive-2.3__hadoop-2.7_2.12_deploy.jar
@@ -363,6 +371,38 @@ mvn -B install:install-file \
-Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
-Dpackaging=jar
+mvn -B install:install-file \
+ -Dmaven.repo.local=$M2DIR \
+ -Dfile=$JARDIR/$ORC_CORE_JAR \
+ -DgroupId=org.apache.orc \
+ -DartifactId=orc-core \
+ -Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
+ -Dpackaging=jar
+
+mvn -B install:install-file \
+ -Dmaven.repo.local=$M2DIR \
+ -Dfile=$JARDIR/$ORC_SHIM_JAR \
+ -DgroupId=org.apache.orc \
+ -DartifactId=orc-shims \
+ -Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
+ -Dpackaging=jar
+
+mvn -B install:install-file \
+ -Dmaven.repo.local=$M2DIR \
+ -Dfile=$JARDIR/$ORC_MAPREDUCE_JAR \
+ -DgroupId=org.apache.orc \
+ -DartifactId=orc-mapreduce \
+ -Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
+ -Dpackaging=jar
+
+mvn -B install:install-file \
+ -Dmaven.repo.local=$M2DIR \
+ -Dfile=$JARDIR/$PROTOBUF_JAR \
+ -DgroupId=com.google.protobuf \
+ -DartifactId=protobuf-java \
+ -Dversion=$SPARK_VERSION_TO_INSTALL_DATABRICKS_JARS \
+ -Dpackaging=jar
+
mvn -B -Ddatabricks -Dbuildver=$BUILDVER clean package -DskipTests
cd /home/ubuntu
diff --git a/jenkins/spark-tests.sh b/jenkins/spark-tests.sh
index 10611c8e3f3..a78838edcde 100755
--- a/jenkins/spark-tests.sh
+++ b/jenkins/spark-tests.sh
@@ -258,8 +258,8 @@ if [[ $TEST_MODE == "ALL" || $TEST_MODE == "IT_ONLY" ]]; then
PARALLELISM=$(nvidia-smi --query-gpu=memory.free --format=csv,noheader | \
awk '{if (MAX < $1){ MAX = $1}} END {print int(MAX / (2 * 1024))}')
fi
- # parallelism > 8 could slow down the whole process, so we have a limitation for it
- [[ ${PARALLELISM} -gt 8 ]] && PARALLELISM=8
+    # parallelism > 7 could slow down the whole process, so cap it at 7
+ [[ ${PARALLELISM} -gt 7 ]] && PARALLELISM=7
MEMORY_FRACTION=$(python -c "print(1/($PARALLELISM + 0.1))")
export MEMORY_FRACTION_CONF="--conf spark.rapids.memory.gpu.allocFraction=${MEMORY_FRACTION} \
--conf spark.rapids.memory.gpu.maxAllocFraction=${MEMORY_FRACTION}"
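As a quick sanity check of the arithmetic in this hunk (a sketch; the 16 GiB free-memory figure is an assumed example, not something the script reports):

```scala
object ParallelismSketch extends App {
  val freeMiB = 16384                                   // assumed `nvidia-smi --query-gpu=memory.free` value
  val parallelism = math.min(freeMiB / (2 * 1024), 7)   // one test worker per ~2 GiB, capped at 7
  val memoryFraction = 1.0 / (parallelism + 0.1)        // per-worker GPU alloc fraction with a little headroom
  println(s"PARALLELISM=$parallelism MEMORY_FRACTION=$memoryFraction") // 7 and ~0.141
}
```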
diff --git a/pom.xml b/pom.xml
index bc79b7eb526..23dbfa761bb 100644
--- a/pom.xml
+++ b/pom.xml
@@ (multiple hunks; the pom.xml markup was lost when this patch was captured)
 [Recoverable content: each per-Spark-version build profile (the spark302 profile and
  the spark301db/spark312db Databricks profiles are visible in the surviving context)
  adds the new common module ahead of the existing dist, integration_tests and
  shuffle-plugin modules, plus one further added line per profile whose content did
  not survive extraction. The two Databricks profiles also drop two property lines
  whose values were ${spark301db.version} and ${spark312db.version}, and the global
  properties block (22.04.0-SNAPSHOT / 2.12 / 2.12.15) drops a property whose value
  was 1.5.10, the ORC version that sql-plugin now takes from Spark instead.]
diff --git a/scripts/generate-changelog b/scripts/generate-changelog
index 0c313f6ec5a..d7459684a52 100755
--- a/scripts/generate-changelog
+++ b/scripts/generate-changelog
@@ -1,6 +1,6 @@
#!/usr/bin/env python
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2022, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
@@ -46,11 +46,11 @@ Usage:
# generate changelog for releases 0.1 to 21.12
scripts/generate-changelog --token= \
- --releases=0.1,0.2,0.3,0.4,0.4.1,0.5,21.06,21.06.1,21.06.2,21.08,21.08.1,21.10,21.12
+ --releases=21.06,21.06.1,21.06.2,21.08,21.08.1,21.10,21.12,22.02
# generate changelog for releases 0.1 to 21.12 to /tmp/CHANGELOG.md
GITHUB_TOKEN= scripts/generate-changelog \
- --releases=0.1,0.2,0.3,0.4,0.4.1,0.5,21.06,21.06.1,21.06.2,21.08,21.08.1,21.10,21.12 \
+ --releases=21.06,21.06.1,21.06.2,21.08,21.08.1,21.10,21.12,22.02 \
--path=/tmp/CHANGELOG.md
"""
import os
@@ -273,6 +273,8 @@ def form_changelog(path: str, changelog: dict):
subsections += form_subsection(issues, PRS)
markdown = f"""# Change log
Generated on {date.today()}{subsections}
+\n## Older Releases
+Changelog of older releases can be found at [docs/archives](/docs/archives)
"""
with open(path, "w") as file:
file.write(markdown)
diff --git a/shuffle-plugin/pom.xml b/shuffle-plugin/pom.xml
index ed1654161ba..410106f07ac 100644
--- a/shuffle-plugin/pom.xml
+++ b/shuffle-plugin/pom.xml
@@ -47,6 +47,11 @@
             <version>1.11</version>
             <scope>compile</scope>
         </dependency>
+        <dependency>
+            <groupId>com.nvidia</groupId>
+            <artifactId>rapids-4-spark-common_${scala.binary.version}</artifactId>
+            <version>${project.version}</version>
+        </dependency>
         <dependency>
             <groupId>com.nvidia</groupId>
             <artifactId>rapids-4-spark-sql_${scala.binary.version}</artifactId>
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
index 177eeef1a5c..b9557fb5f63 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCX.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -25,9 +25,9 @@ import java.util.concurrent.atomic.AtomicLong
import scala.collection.mutable.ArrayBuffer
import ai.rapids.cudf.{BaseDeviceMemoryBuffer, MemoryBuffer, NvtxColor, NvtxRange}
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import com.nvidia.spark.rapids.{Arm, GpuDeviceManager, RapidsConf}
import com.nvidia.spark.rapids.RapidsPluginImplicits._
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.shuffle.{ClientConnection, MemoryRegistrationCallback, MessageType, MetadataTransportBuffer, TransportBuffer, TransportUtils}
import org.openucx.jucx._
import org.openucx.jucx.ucp._
diff --git a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
index 942a519b5cf..d7b22affda7 100644
--- a/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
+++ b/shuffle-plugin/src/main/scala/com/nvidia/spark/rapids/shuffle/ucx/UCXShuffleTransport.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -23,8 +23,8 @@ import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer
import ai.rapids.cudf.{BaseDeviceMemoryBuffer, CudaMemoryBuffer, DeviceMemoryBuffer, HostMemoryBuffer, MemoryBuffer}
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import com.nvidia.spark.rapids.{GpuDeviceManager, HashedPriorityQueue, RapidsConf}
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.shuffle._
import com.nvidia.spark.rapids.shuffle.{BounceBufferManager, BufferReceiveState, ClientConnection, PendingTransferRequest, RapidsShuffleClient, RapidsShuffleRequestHandler, RapidsShuffleServer, RapidsShuffleTransport, RefCountedDirectByteBuffer}
diff --git a/sql-plugin/pom.xml b/sql-plugin/pom.xml
index 33417cc2c81..35766938d06 100644
--- a/sql-plugin/pom.xml
+++ b/sql-plugin/pom.xml
@@ (dependency hunks; the pom.xml markup was lost when this patch was captured)
 [Recoverable content: the compile-scoped com.google.flatbuffers:flatbuffers-java
  entry is replaced by com.nvidia:rapids-4-spark-common_${scala.binary.version} at
  ${project.version}; the bundled org.apache.orc:orc-core and orc-mapreduce
  dependencies (classifier ${orc.classifier}, with slf4j-api and jsr305 exclusions),
  org.apache.hive:hive-storage-api (slf4j-api exclusion) and
  com.google.protobuf:protobuf-java are removed, with flatbuffers-java re-added as a
  plain dependency; and orc-core, orc-shims, orc-mapreduce, hive-storage-api and
  protobuf-java are added back as provided dependencies at ${spark.version}.]
diff --git a/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/v2/OrcShims301until320Base.scala b/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/v2/OrcShims301until320Base.scala
new file mode 100644
index 00000000000..543cb7c9afd
--- /dev/null
+++ b/sql-plugin/src/main/301until320-all/scala/com/nvidia/spark/rapids/shims/v2/OrcShims301until320Base.scala
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.shims.v2
+
+import scala.collection.mutable.ArrayBuffer
+
+import com.nvidia.spark.rapids.OrcOutputStripe
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.hive.common.io.DiskRangeList
+import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcFile, OrcProto, PhysicalWriter, Reader, StripeInformation}
+import org.apache.orc.impl.{DataReaderProperties, OutStream, SchemaEvolution}
+import org.apache.orc.impl.RecordReaderImpl.SargApplier
+
+trait OrcShims301until320Base {
+
+ // read data to buffer
+ def readFileData(dataReader: DataReader, inputDataRanges: DiskRangeList): DiskRangeList = {
+ dataReader.readFileData(inputDataRanges, 0, false)
+ }
+
+ // create reader properties builder
+ def newDataReaderPropertiesBuilder(compressionSize: Int,
+ compressionKind: CompressionKind, typeCount: Int): DataReaderProperties.Builder = {
+ DataReaderProperties.builder()
+ .withBufferSize(compressionSize)
+ .withCompression(compressionKind)
+ .withTypeCount(typeCount)
+ }
+
+ // create ORC out stream
+ def newOrcOutStream(name: String, bufferSize: Int, codec: CompressionCodec,
+ receiver: PhysicalWriter.OutputReceiver): OutStream = {
+ new OutStream(name, bufferSize, codec, receiver)
+ }
+
+ // filter stripes by pushing down filter
+ def filterStripes(
+ stripes: Seq[StripeInformation],
+ conf: Configuration,
+ orcReader: Reader,
+ dataReader: DataReader,
+ gen: (StripeInformation, OrcProto.StripeFooter, Array[Int], Array[Int]) => OrcOutputStripe,
+ evolution: SchemaEvolution,
+ sargApp: SargApplier,
+ sargColumns: Array[Boolean],
+ ignoreNonUtf8BloomFilter: Boolean,
+ writerVersion: OrcFile.WriterVersion,
+ fileIncluded: Array[Boolean],
+ columnMapping: Array[Int],
+ idMapping: Array[Int]): ArrayBuffer[OrcOutputStripe] = {
+ val result = new ArrayBuffer[OrcOutputStripe](stripes.length)
+ stripes.foreach { stripe =>
+ val stripeFooter = dataReader.readStripeFooter(stripe)
+ val needStripe = if (sargApp != null) {
+ // An ORC schema is a single struct type describing the schema fields
+ val orcFileSchema = evolution.getFileType(0)
+ val orcIndex = dataReader.readRowIndex(stripe, orcFileSchema, stripeFooter,
+ ignoreNonUtf8BloomFilter, fileIncluded, null, sargColumns,
+ writerVersion, null, null)
+ val rowGroups = sargApp.pickRowGroups(stripe, orcIndex.getRowGroupIndex,
+ orcIndex.getBloomFilterKinds, stripeFooter.getColumnsList, orcIndex.getBloomFilterIndex,
+ true)
+ rowGroups != SargApplier.READ_NO_RGS
+ } else {
+ true
+ }
+
+ if (needStripe) {
+ result.append(gen(stripe, stripeFooter, columnMapping, idMapping))
+ }
+ }
+ result
+ }
+}
diff --git a/sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala b/sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
new file mode 100644
index 00000000000..dcac01eefe9
--- /dev/null
+++ b/sql-plugin/src/main/301until320-noncdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.shims.v2
+
+import com.nvidia.spark.rapids.RapidsPluginImplicits._
+import org.apache.orc.Reader
+
+object OrcShims extends OrcShims301until320Base {
+
+  // the ORC Reader in non-CDH Spark is closeable
+ def withReader[T <: AutoCloseable, V](r: T)(block: T => V): V = {
+ try {
+ block(r)
+ } finally {
+ r.safeClose()
+ }
+ }
+
+  // the ORC Reader in non-CDH Spark is closeable
+ def closeReader(reader: Reader): Unit = {
+ if (reader != null) {
+ reader.close()
+ }
+ }
+}
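A hedged sketch of how a caller might use these shim helpers (the path and Hadoop configuration are assumptions; the plugin's real call sites are in `GpuOrcScanBase.scala`, patched further below):

```scala
import com.nvidia.spark.rapids.shims.v2.OrcShims
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.Path
import org.apache.orc.{OrcFile, TypeDescription}

object OrcShimsUsageSketch {
  // Open an ORC reader, read its schema, and let the shim decide whether the reader
  // has to be closed afterwards (it does on every version except 311cdh).
  def readOrcSchema(path: Path, conf: Configuration): TypeDescription = {
    val opts = OrcFile.readerOptions(conf)
    OrcShims.withReader(OrcFile.createReader(path, opts)) { reader =>
      reader.getSchema
    }
  }
}
```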
diff --git a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala b/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
index 8105b2349df..650758258f6 100644
--- a/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
+++ b/sql-plugin/src/main/301until330-all/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
@@ -23,4 +23,11 @@ object RapidsErrorUtils {
throw new ArrayIndexOutOfBoundsException(s"index $index is beyond the max index allowed " +
s"${numElements - 1}")
}
+
+ def throwInvalidElementAtIndexError(
+ elementKey: String, isElementAtFunction: Boolean = false): ColumnVector = {
+ // For now, the default argument is false. The caller sets the correct value accordingly.
+ throw new NoSuchElementException(s"Key: ${elementKey} " +
+ s"does not exist in any one of the rows in the map column")
+ }
}
diff --git a/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala b/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
new file mode 100644
index 00000000000..ddc4534cb39
--- /dev/null
+++ b/sql-plugin/src/main/311cdh/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.shims.v2
+
+import org.apache.orc.Reader
+
+object OrcShims extends OrcShims301until320Base {
+
+  // The ORC Reader in 311cdh Spark has no close method;
+  // the resource is closed internally.
+ def withReader[V](r: Reader)(block: Reader => V): V = {
+ block(r)
+ }
+
+ // empty
+ def closeReader(reader: Reader): Unit = {
+ }
+
+}
diff --git a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
new file mode 100644
index 00000000000..7a0fed4abc9
--- /dev/null
+++ b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/OrcShims.scala
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package com.nvidia.spark.rapids.shims.v2
+
+import scala.collection.mutable.ArrayBuffer
+
+import com.nvidia.spark.rapids.OrcOutputStripe
+import com.nvidia.spark.rapids.RapidsPluginImplicits._
+import org.apache.hadoop.conf.Configuration
+import org.apache.hadoop.hive.common.io.DiskRangeList
+import org.apache.orc.{CompressionCodec, CompressionKind, DataReader, OrcConf, OrcFile, OrcProto, PhysicalWriter, Reader, StripeInformation}
+import org.apache.orc.impl.{BufferChunk, BufferChunkList, DataReaderProperties, InStream, OrcCodecPool, OutStream, ReaderImpl, SchemaEvolution}
+import org.apache.orc.impl.RecordReaderImpl.SargApplier
+import org.apache.orc.impl.reader.StripePlanner
+import org.apache.orc.impl.writer.StreamOptions
+
+// 320+ ORC shims
+object OrcShims {
+
+ // the ORC Reader in non-CDH Spark is closeable
+ def withReader[T <: Reader, V](r: T)(block: T => V): V = {
+ try {
+ block(r)
+ } finally {
+ r.safeClose()
+ }
+ }
+
+ // the ORC Reader in non-CDH Spark is closeable
+ def closeReader(reader: Reader): Unit = {
+ if(reader != null) {
+ reader.close()
+ }
+ }
+
+ // read data to buffer
+ def readFileData(dataReader: DataReader, inputDataRanges: DiskRangeList): DiskRangeList = {
+
+    // convert DiskRangeList to BufferChunkList
+    val chunkList = new BufferChunkList
+    var curr = inputDataRanges
+    while (curr != null) {
+      chunkList.add(new BufferChunk(curr.getOffset, curr.getLength))
+      curr = curr.next
+    }
+
+    // BufferChunk is a subclass of DiskRangeList
+    dataReader.readFileData(chunkList, false).get()
+ }
+
+ // create reader properties builder
+ def newDataReaderPropertiesBuilder(compressionSize: Int,
+ compressionKind: CompressionKind, typeCount: Int): DataReaderProperties.Builder = {
+ val compression = new InStream.StreamOptions()
+ .withBufferSize(compressionSize).withCodec(OrcCodecPool.getCodec(compressionKind))
+ DataReaderProperties.builder().withCompression(compression)
+ }
+
+ // create ORC out stream
+ def newOrcOutStream(name: String, bufferSize: Int, codec: CompressionCodec,
+ receiver: PhysicalWriter.OutputReceiver): OutStream = {
+ val options = new StreamOptions(bufferSize).withCodec(codec, codec.getDefaultOptions)
+ new OutStream(name, options, receiver)
+ }
+
+ // filter stripes by pushing down filter
+ def filterStripes(
+ stripes: Seq[StripeInformation],
+ conf: Configuration,
+ orcReader: Reader,
+ dataReader: DataReader,
+ gen: (StripeInformation, OrcProto.StripeFooter, Array[Int], Array[Int]) => OrcOutputStripe,
+ evolution: SchemaEvolution,
+ sargApp: SargApplier,
+ sargColumns: Array[Boolean],
+ ignoreNonUtf8BloomFilter: Boolean,
+ writerVersion: OrcFile.WriterVersion,
+ fileIncluded: Array[Boolean],
+ columnMapping: Array[Int],
+ idMapping: Array[Int]): ArrayBuffer[OrcOutputStripe] = {
+
+ val orcReaderImpl = orcReader.asInstanceOf[ReaderImpl]
+ val maxDiskRangeChunkLimit = OrcConf.ORC_MAX_DISK_RANGE_CHUNK_LIMIT.getInt(conf)
+ val planner = new StripePlanner(evolution.getFileSchema, orcReaderImpl.getEncryption(),
+ dataReader, writerVersion, ignoreNonUtf8BloomFilter, maxDiskRangeChunkLimit)
+
+ val result = new ArrayBuffer[OrcOutputStripe](stripes.length)
+ stripes.foreach { stripe =>
+ val stripeFooter = dataReader.readStripeFooter(stripe)
+ val needStripe = if (sargApp != null) {
+ val orcIndex = planner.parseStripe(stripe, fileIncluded).readRowIndex(sargColumns, null)
+ val rowGroups = sargApp.pickRowGroups(stripe, orcIndex.getRowGroupIndex,
+ orcIndex.getBloomFilterKinds, stripeFooter.getColumnsList, orcIndex.getBloomFilterIndex,
+ true)
+ rowGroups != SargApplier.READ_NO_RGS
+ } else {
+ true
+ }
+
+ if (needStripe) {
+ result.append(gen(stripe, stripeFooter, columnMapping, idMapping))
+ }
+ }
+ result
+
+ }
+}
diff --git a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/Spark320PlusShims.scala b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/Spark320PlusShims.scala
index 606b2f8be00..662008e9fab 100644
--- a/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/Spark320PlusShims.scala
+++ b/sql-plugin/src/main/320+/scala/com/nvidia/spark/rapids/shims/v2/Spark320PlusShims.scala
@@ -558,7 +558,7 @@ trait Spark320PlusShims extends SparkShims with RebaseShims with Logging {
// partition filters and data filters are not run on the GPU
override val childExprs: Seq[ExprMeta[_]] = Seq.empty
- override def tagPlanForGpu(): Unit = GpuFileSourceScanExec.tagSupport(this)
+ override def tagPlanForGpu(): Unit = tagFileSourceScanExec(this)
override def convertToCpu(): SparkPlan = {
wrapped.copy(partitionFilters = partitionFilters)
@@ -1051,4 +1051,8 @@ trait Spark320PlusShims extends SparkShims with RebaseShims with Logging {
}
override def supportsColumnarAdaptivePlans: Boolean = true
+
+ def tagFileSourceScanExec(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {
+ GpuFileSourceScanExec.tagSupport(meta)
+ }
}
diff --git a/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala b/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
index de67b5e5cf7..99b943a0e71 100644
--- a/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
+++ b/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/RapidsErrorUtils.scala
@@ -24,4 +24,10 @@ object RapidsErrorUtils {
def throwArrayIndexOutOfBoundsException(index: Int, numElements: Int): ColumnVector = {
throw QueryExecutionErrors.invalidArrayIndexError(index, numElements)
}
+
+ def throwInvalidElementAtIndexError(
+ elementKey: String, isElementAtFunction: Boolean = false): ColumnVector = {
+ // For now, the default argument is false. The caller sets the correct value accordingly.
+ throw QueryExecutionErrors.mapKeyNotExistError(elementKey, isElementAtFunction)
+ }
}
diff --git a/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/Spark33XShims.scala b/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/Spark33XShims.scala
index bad7c3d8e4d..0294f602a7d 100644
--- a/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/Spark33XShims.scala
+++ b/sql-plugin/src/main/330+/scala/com/nvidia/spark/rapids/shims/v2/Spark33XShims.scala
@@ -22,10 +22,10 @@ import org.apache.parquet.schema.MessageType
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
-import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression}
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Expression, MetadataAttribute}
import org.apache.spark.sql.catalyst.json.rapids.shims.v2.Spark33XFileOptionsShims
import org.apache.spark.sql.connector.read.{Scan, SupportsRuntimeFiltering}
-import org.apache.spark.sql.execution.SparkPlan
+import org.apache.spark.sql.execution.{FileSourceScanExec, SparkPlan}
import org.apache.spark.sql.execution.datasources.{DataSourceUtils, FilePartition, FileScanRDD, PartitionedFile}
import org.apache.spark.sql.execution.datasources.parquet.ParquetFilters
import org.apache.spark.sql.execution.datasources.v2.csv.CSVScan
@@ -143,6 +143,16 @@ trait Spark33XShims extends Spark33XFileOptionsShims {
conf.maxReadBatchSizeBytes)
})
).map(r => (r.getClassFor.asSubclass(classOf[Scan]), r)).toMap
+
+ override def tagFileSourceScanExec(meta: SparkPlanMeta[FileSourceScanExec]): Unit = {
+ if (meta.wrapped.expressions.exists(expr => expr match {
+ case MetadataAttribute(expr) => true
+ case _ => false
+ })) {
+ meta.willNotWorkOnGpu("hidden metadata columns are not supported on GPU")
+ }
+ super.tagFileSourceScanExec(meta)
+ }
}
// Fallback to the default definition of `deterministic`
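Seen from the query side, the guard above means that projecting a hidden metadata column keeps the scan on the CPU; a sketch (the input path is a placeholder, and this only applies on Spark 3.3.0+ with the plugin enabled):

```scala
import org.apache.spark.sql.SparkSession

object MetadataColumnFallbackSketch extends App {
  val spark = SparkSession.builder().appName("metadata-column-fallback").getOrCreate()

  // Projecting a hidden `_metadata` field marks the whole scan as unsupported on the
  // GPU, so the physical plan keeps a CPU FileSourceScanExec rather than a
  // GpuBatchScanExec, which is exactly what the new Python tests assert.
  val df = spark.read.parquet("/tmp/hidden_metadata.parquet")
    .selectExpr("id", "_metadata.file_name")
  df.explain()
  spark.stop()
}
```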
diff --git a/sql-plugin/src/main/scala/ai/rapids/cudf/HostConcatResultUtil.scala b/sql-plugin/src/main/scala/ai/rapids/cudf/HostConcatResultUtil.scala
new file mode 100644
index 00000000000..30d7289c902
--- /dev/null
+++ b/sql-plugin/src/main/scala/ai/rapids/cudf/HostConcatResultUtil.scala
@@ -0,0 +1,57 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package ai.rapids.cudf
+
+import ai.rapids.cudf.JCudfSerialization.HostConcatResult
+import com.nvidia.spark.rapids.{Arm, GpuColumnVectorFromBuffer}
+
+import org.apache.spark.sql.types.DataType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+object HostConcatResultUtil extends Arm {
+ /**
+ * Create a rows-only `HostConcatResult`.
+ */
+ def rowsOnlyHostConcatResult(numRows: Int): HostConcatResult = {
+ new HostConcatResult(
+ new JCudfSerialization.SerializedTableHeader(
+ Array.empty, numRows, 0L),
+ HostMemoryBuffer.allocate(0, false))
+ }
+
+ /**
+ * Given a `HostConcatResult` and a SparkSchema produce a `ColumnarBatch`,
+ * handling the rows-only case.
+ *
+ * @note This function does not consume the `HostConcatResult`, and
+ * callers are responsible for closing the resulting `ColumnarBatch`
+ */
+ def getColumnarBatch(
+ hostConcatResult: HostConcatResult,
+ sparkSchema: Array[DataType]): ColumnarBatch = {
+ if (hostConcatResult.getTableHeader.getNumColumns == 0) {
+ // We expect the caller to have acquired the GPU unconditionally before calling
+ // `getColumnarBatch`, as a downstream exec may need the GPU, and the assumption is
+ // that it is acquired in the coalesce code.
+ new ColumnarBatch(Array.empty, hostConcatResult.getTableHeader.getNumRows)
+ } else {
+ withResource(hostConcatResult.toContiguousTable) { ct =>
+ GpuColumnVectorFromBuffer.from(ct, sparkSchema)
+ }
+ }
+ }
+}
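A small usage sketch for the helper above (the argument names are assumptions; in the plugin the `HostConcatResult` and the Spark types come from the shuffle coalesce path patched below):

```scala
import ai.rapids.cudf.HostConcatResultUtil
import ai.rapids.cudf.JCudfSerialization.HostConcatResult
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.vectorized.ColumnarBatch

object HostConcatResultUsageSketch {
  // Turn a concatenated host-side table into a ColumnarBatch. Per the scaladoc above,
  // the helper does not consume `result`: the caller keeps ownership of it and of the
  // returned batch, and is expected to already hold the GPU semaphore.
  def toBatch(result: HostConcatResult, sparkTypes: Array[DataType]): ColumnarBatch =
    HostConcatResultUtil.getColumnarBatch(result, sparkTypes)

  // Degenerate rows-only case: no columns, just a row count
  // (cleanup of the intermediate result is elided for brevity).
  def rowsOnlyBatch(numRows: Int): ColumnarBatch =
    HostConcatResultUtil.getColumnarBatch(
      HostConcatResultUtil.rowsOnlyHostConcatResult(numRows), Array.empty[DataType])
}
```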
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
index d4333f7bc15..26b1c0c4116 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuCoalesceBatches.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -46,7 +46,20 @@ object ConcatAndConsumeAll {
* @return a single batch with all of them concated together.
*/
def buildNonEmptyBatch(arrayOfBatches: Array[ColumnarBatch],
- schema: StructType): ColumnarBatch = {
+ schema: StructType): ColumnarBatch =
+ buildNonEmptyBatchFromTypes(
+ arrayOfBatches, GpuColumnVector.extractTypes(schema))
+
+ /**
+ * Build a single batch from the batches collected so far. If array is empty this will likely
+ * blow up.
+ * @param arrayOfBatches the batches to concat. This will be consumed and you do not need to
+ * close any of the batches after this is called.
+ * @param dataTypes the output types.
+ * @return a single batch with all of them concated together.
+ */
+ def buildNonEmptyBatchFromTypes(arrayOfBatches: Array[ColumnarBatch],
+ dataTypes: Array[DataType]): ColumnarBatch = {
if (arrayOfBatches.length == 1) {
arrayOfBatches(0)
} else {
@@ -54,7 +67,7 @@ object ConcatAndConsumeAll {
try {
val combined = Table.concatenate(tables: _*)
try {
- GpuColumnVector.from(combined, GpuColumnVector.extractTypes(schema))
+ GpuColumnVector.from(combined, dataTypes)
} finally {
combined.close()
}
@@ -410,9 +423,8 @@ abstract class AbstractGpuCoalesceIterator(
}
class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
- schema: StructType,
+ sparkTypes: Array[DataType],
goal: CoalesceSizeGoal,
- maxDecompressBatchMemory: Long,
numInputRows: GpuMetric,
numInputBatches: GpuMetric,
numOutputRows: GpuMetric,
@@ -422,8 +434,7 @@ class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
opTime: GpuMetric,
peakDevMemory: GpuMetric,
spillCallback: SpillCallback,
- opName: String,
- codecConfigs: TableCompressionCodecConfig)
+ opName: String)
extends AbstractGpuCoalesceIterator(iter,
goal,
numInputRows,
@@ -435,8 +446,7 @@ class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
opTime,
opName) with Arm {
- private val sparkTypes: Array[DataType] = GpuColumnVector.extractTypes(schema)
- private val batches: ArrayBuffer[SpillableColumnarBatch] = ArrayBuffer.empty
+ protected val batches: ArrayBuffer[SpillableColumnarBatch] = ArrayBuffer.empty
private var maxDeviceMemory: Long = 0
override def initNewBatch(batch: ColumnarBatch): Unit = {
@@ -448,10 +458,85 @@ class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
batches.append(SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_BATCHING_PRIORITY,
spillCallback))
+ protected def popAll(): Array[ColumnarBatch] = {
+ closeOnExcept(batches.toArray.safeMap(_.getColumnarBatch())) { wip =>
+ batches.safeClose()
+ batches.clear()
+ wip
+ }
+ }
+
+ override def concatAllAndPutOnGPU(): ColumnarBatch = {
+ val ret = ConcatAndConsumeAll.buildNonEmptyBatchFromTypes(popAll(), sparkTypes)
+    // sum of current batches and the concatenated batch; approximately 2 * sizeof(ret).
+ maxDeviceMemory = GpuColumnVector.getTotalDeviceMemoryUsed(ret) * 2
+ ret
+ }
+
+ override def cleanupConcatIsDone(): Unit = {
+ peakDevMemory.set(maxDeviceMemory)
+ batches.clear()
+ }
+
+ private var onDeck: Option[SpillableColumnarBatch] = None
+
+ override protected def hasOnDeck: Boolean = onDeck.isDefined
+
+ override protected def saveOnDeck(batch: ColumnarBatch): Unit = {
+ assert(onDeck.isEmpty)
+ onDeck = Some(SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY,
+ spillCallback))
+ }
+
+ override protected def clearOnDeck(): Unit = {
+ onDeck.foreach(_.close())
+ onDeck = None
+ }
+
+ override protected def popOnDeck(): ColumnarBatch = {
+ val ret = onDeck.get.getColumnarBatch()
+ clearOnDeck()
+ ret
+ }
+}
+
+/**
+ * Compression codec-aware `GpuCoalesceIterator` subclass which should be used in cases
+ * where the RAPIDS Shuffle Manager could be configured, as batches to be coalesced
+ * may be compressed.
+ */
+class GpuCompressionAwareCoalesceIterator(
+ iter: Iterator[ColumnarBatch],
+ sparkTypes: Array[DataType],
+ goal: CoalesceSizeGoal,
+ maxDecompressBatchMemory: Long,
+ numInputRows: GpuMetric,
+ numInputBatches: GpuMetric,
+ numOutputRows: GpuMetric,
+ numOutputBatches: GpuMetric,
+ collectTime: GpuMetric,
+ concatTime: GpuMetric,
+ opTime: GpuMetric,
+ peakDevMemory: GpuMetric,
+ spillCallback: SpillCallback,
+ opName: String,
+ codecConfigs: TableCompressionCodecConfig)
+ extends GpuCoalesceIterator(
+ iter, sparkTypes, goal,
+ numInputRows = numInputRows,
+ numInputBatches = numInputBatches,
+ numOutputRows = numOutputRows,
+ numOutputBatches = numOutputBatches,
+ collectTime = collectTime,
+ concatTime = concatTime,
+ opTime = opTime,
+ peakDevMemory = peakDevMemory,
+ spillCallback, opName) {
+
private[this] var codec: TableCompressionCodec = _
- private[this] def popAllDecompressed(): Array[ColumnarBatch] = {
- closeOnExcept(batches.map(_.getColumnarBatch())) { wip =>
+ override protected def popAll(): Array[ColumnarBatch] = {
+ closeOnExcept(batches.toArray.safeMap(_.getColumnarBatch())) { wip =>
batches.safeClose()
batches.clear()
@@ -487,42 +572,9 @@ class GpuCoalesceIterator(iter: Iterator[ColumnarBatch],
}
}
}
- wip.toArray
+ wip
}
}
-
- override def concatAllAndPutOnGPU(): ColumnarBatch = {
- val ret = ConcatAndConsumeAll.buildNonEmptyBatch(popAllDecompressed(), schema)
- // sum of current batches and concatenating batches. Approximately sizeof(ret * 2).
- maxDeviceMemory = GpuColumnVector.getTotalDeviceMemoryUsed(ret) * 2
- ret
- }
-
- override def cleanupConcatIsDone(): Unit = {
- peakDevMemory.set(maxDeviceMemory)
- batches.clear()
- }
-
- private var onDeck: Option[SpillableColumnarBatch] = None
-
- override protected def hasOnDeck: Boolean = onDeck.isDefined
-
- override protected def saveOnDeck(batch: ColumnarBatch): Unit = {
- assert(onDeck.isEmpty)
- onDeck = Some(SpillableColumnarBatch(batch, SpillPriorities.ACTIVE_ON_DECK_PRIORITY,
- spillCallback))
- }
-
- override protected def clearOnDeck(): Unit = {
- onDeck.foreach(_.close())
- onDeck = None
- }
-
- override protected def popOnDeck(): ColumnarBatch = {
- val ret = onDeck.get.getColumnarBatch()
- clearOnDeck()
- ret
- }
}
case class GpuCoalesceBatches(child: SparkPlan, goal: CoalesceGoal)
@@ -579,6 +631,7 @@ case class GpuCoalesceBatches(child: SparkPlan, goal: CoalesceGoal)
// cache in local vars to avoid serializing the plan
val outputSchema = schema
+ val dataTypes = GpuColumnVector.extractTypes(outputSchema)
val decompressMemoryTarget = maxDecompressBatchMemory
val batches = child.executeColumnar()
@@ -593,7 +646,8 @@ case class GpuCoalesceBatches(child: SparkPlan, goal: CoalesceGoal)
goal match {
case sizeGoal: CoalesceSizeGoal =>
batches.mapPartitions { iter =>
- new GpuCoalesceIterator(iter, outputSchema, sizeGoal, decompressMemoryTarget,
+ new GpuCompressionAwareCoalesceIterator(
+ iter, dataTypes, sizeGoal, decompressMemoryTarget,
numInputRows, numInputBatches, numOutputRows, numOutputBatches, NoopMetric,
concatTime, opTime, peakDevMemory, callback, "GpuCoalesceBatches",
codecConfigs)
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
index ffba671dc2d..0bffd6cd3cb 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuMultiFileReader.scala
@@ -26,7 +26,6 @@ import scala.collection.mutable.{ArrayBuffer, LinkedHashMap, Queue}
import scala.math.max
import ai.rapids.cudf.{ColumnVector, HostMemoryBuffer, NvtxColor, NvtxRange, Table}
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import com.nvidia.spark.rapids.GpuMetric.{NUM_OUTPUT_BATCHES, PEAK_DEVICE_MEMORY, SEMAPHORE_WAIT_TIME}
import org.apache.commons.io.IOUtils
import org.apache.hadoop.conf.Configuration
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScanBase.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScanBase.scala
index e6063853eb2..6493d2cd3a7 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScanBase.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOrcScanBase.scala
@@ -36,6 +36,7 @@ import com.google.protobuf.CodedOutputStream
import com.nvidia.spark.rapids.GpuMetric._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.SchemaUtils._
+import com.nvidia.spark.rapids.shims.v2.OrcShims
import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.hive.common.io.DiskRangeList
@@ -55,10 +56,10 @@ import org.apache.spark.sql.connector.read.{InputPartition, PartitionReader, Par
import org.apache.spark.sql.execution.QueryExecutionException
import org.apache.spark.sql.execution.datasources.PartitionedFile
import org.apache.spark.sql.execution.datasources.orc.OrcUtils
+import org.apache.spark.sql.execution.datasources.rapids.OrcFiltersWrapper
import org.apache.spark.sql.execution.datasources.v2.{EmptyPartitionReader, FilePartitionReaderFactory}
import org.apache.spark.sql.execution.datasources.v2.orc.OrcScan
import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.rapids.OrcFilters
import org.apache.spark.sql.rapids.execution.TrampolineUtil
import org.apache.spark.sql.sources.Filter
import org.apache.spark.sql.types.{ArrayType, DataType, DecimalType, MapType, StructType}
@@ -319,13 +320,13 @@ trait OrcCommonFunctions extends OrcCodecWritingHelper {
withResource(OrcTools.buildDataReader(ctx)) { dataReader =>
val start = System.nanoTime()
- val bufferChunks = dataReader.readFileData(inputDataRanges, 0, false)
+ val bufferChunks = OrcShims.readFileData(dataReader, inputDataRanges)
val mid = System.nanoTime()
var current = bufferChunks
while (current != null) {
out.write(current.getData)
if (dataReader.isTrackingDiskRanges && current.isInstanceOf[BufferChunk]) {
- dataReader.releaseBuffer(current.asInstanceOf[BufferChunk].getChunk)
+ dataReader.releaseBuffer(current.getData)
}
current = current.next
}
@@ -740,17 +741,18 @@ private object OrcTools extends Arm {
}
val maxDiskRangeChunkLimit = OrcConf.ORC_MAX_DISK_RANGE_CHUNK_LIMIT.getInt(conf)
val file = filePath.getFileSystem(conf).open(filePath)
+
+ val typeCount = org.apache.orc.OrcUtils.getOrcTypes(fileSchema).size
//noinspection ScalaDeprecation
- RecordReaderUtils.createDefaultDataReader(DataReaderProperties.builder()
- .withBufferSize(compressionSize)
- .withCompression(compressionKind)
- .withFileSystem(fs)
- .withPath(filePath)
- .withFile(file) // explicitly specify the FSDataInputStream
- .withTypeCount(org.apache.orc.OrcUtils.getOrcTypes(fileSchema).size)
- .withZeroCopy(zeroCopy)
- .withMaxDiskRangeChunkLimit(maxDiskRangeChunkLimit)
- .build())
+ val reader = RecordReaderUtils.createDefaultDataReader(
+ OrcShims.newDataReaderPropertiesBuilder(compressionSize, compressionKind, typeCount)
+ .withFileSystem(fs)
+ .withPath(filePath)
+ .withZeroCopy(zeroCopy)
+ .withMaxDiskRangeChunkLimit(maxDiskRangeChunkLimit)
+ .build())
+      reader.open() // 311cdh needs this call to initialize the internal FSDataInputStream file variable.
+ reader
}
}
@@ -783,8 +785,8 @@ private case class GpuOrcFileFilterHandler(
val orcFileReaderOpts = OrcFile.readerOptions(conf).filesystem(fs)
// After getting the necessary information from ORC reader, we must close the ORC reader
- withResource(OrcFile.createReader(filePath, orcFileReaderOpts)) { orcReader =>
- val resultedColPruneInfo = requestedColumnIds(isCaseSensitive, dataSchema,
+ OrcShims.withReader(OrcFile.createReader(filePath, orcFileReaderOpts)) { orcReader =>
+ val resultedColPruneInfo = requestedColumnIds(isCaseSensitive, dataSchema,
readDataSchema, orcReader)
if (resultedColPruneInfo.isEmpty) {
// Be careful when the OrcPartitionReaderContext is null, we should change
@@ -822,7 +824,7 @@ private case class GpuOrcFileFilterHandler(
val readerOpts = OrcInputFormat.buildOptions(
conf, orcReader, partFile.start, partFile.length)
// create the search argument if we have pushed filters
- OrcFilters.createFilter(fullSchema, pushedFilters).foreach { f =>
+ OrcFiltersWrapper.createFilter(fullSchema, pushedFilters).foreach { f =>
readerOpts.searchArgument(f, fullSchema.fieldNames)
}
readerOpts
@@ -882,7 +884,7 @@ private case class GpuOrcFileFilterHandler(
if (matchedOrcFields.size > 1) {
// Need to fail if there is ambiguity, i.e. more than one field is matched.
val matchedOrcFieldsString = matchedOrcFields.mkString("[", ", ", "]")
- reader.close()
+ OrcShims.closeReader(reader)
throw new RuntimeException(s"""Found duplicate field(s) "$requiredFieldName": """
+ s"$matchedOrcFieldsString in case-insensitive mode")
} else {
@@ -1088,29 +1090,10 @@ private case class GpuOrcFileFilterHandler(
val fileIncluded = calcOrcFileIncluded(evolution)
val (columnMapping, idMapping) = columnRemap(fileIncluded, evolution.getFileSchema,
updatedReadSchema, isCaseSensitive)
- val result = new ArrayBuffer[OrcOutputStripe](stripes.length)
- stripes.foreach { stripe =>
- val stripeFooter = dataReader.readStripeFooter(stripe)
- val needStripe = if (sargApp != null) {
- // An ORC schema is a single struct type describing the schema fields
- val orcFileSchema = evolution.getFileType(0)
- val orcIndex = dataReader.readRowIndex(stripe, orcFileSchema, stripeFooter,
- ignoreNonUtf8BloomFilter, fileIncluded, null, sargColumns,
- writerVersion, null, null)
- val rowGroups = sargApp.pickRowGroups(stripe, orcIndex.getRowGroupIndex,
- orcIndex.getBloomFilterKinds, stripeFooter.getColumnsList, orcIndex.getBloomFilterIndex,
- true)
- rowGroups != SargApplier.READ_NO_RGS
- } else {
- true
- }
-
- if (needStripe) {
- result.append(buildOutputStripe(stripe, stripeFooter, columnMapping, idMapping))
- }
- }
-
- result
+ OrcShims.filterStripes(stripes, conf, orcReader, dataReader,
+ buildOutputStripe, evolution,
+ sargApp, sargColumns, ignoreNonUtf8BloomFilter,
+ writerVersion, fileIncluded, columnMapping, idMapping)
}
/**
@@ -1552,8 +1535,8 @@ trait OrcCodecWritingHelper extends Arm {
// note that this buffer is just for writing meta-data
OrcConf.BUFFER_SIZE.getDefaultValue.asInstanceOf[Int]
}
- withResource(new OutStream(getClass.getSimpleName, orcBufferSize, codec,
- outReceiver)) { codecStream =>
+ withResource(OrcShims.newOrcOutStream(
+ getClass.getSimpleName, orcBufferSize, codec, outReceiver)) { codecStream =>
val protoWriter = CodedOutputStream.newInstance(codecStream)
block(outChannel, protoWriter, codecStream)
}
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
index 16280a47179..82333fa3721 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuOverrides.scala
@@ -492,11 +492,6 @@ object GpuOverrides extends Logging {
listeners.clear()
}
- def canRegexpBeTreatedLikeARegularString(strLit: UTF8String): Boolean = {
- val s = strLit.toString
- !regexList.exists(pattern => s.contains(pattern))
- }
-
private def convertPartToGpuIfPossible(part: Partitioning, conf: RapidsConf): Partitioning = {
part match {
case _: GpuPartitioning => part
@@ -3352,7 +3347,23 @@ object GpuOverrides extends Logging {
TypeSig.DATE)),
Some(RepeatingParamCheck("step", TypeSig.integral, TypeSig.integral + TypeSig.CALENDAR))),
(a, conf, p, r) => new GpuSequenceMeta(a, conf, p, r)
- )
+ ),
+ expr[BitLength](
+ "The bit length of string data",
+ ExprChecks.unaryProject(
+ TypeSig.INT, TypeSig.INT,
+ TypeSig.STRING, TypeSig.STRING + TypeSig.BINARY),
+ (a, conf, p, r) => new UnaryExprMeta[BitLength](a, conf, p, r) {
+ override def convertToGpu(child: Expression): GpuExpression = GpuBitLength(child)
+ }),
+ expr[OctetLength](
+ "The byte length of string data",
+ ExprChecks.unaryProject(
+ TypeSig.INT, TypeSig.INT,
+ TypeSig.STRING, TypeSig.STRING + TypeSig.BINARY),
+ (a, conf, p, r) => new UnaryExprMeta[OctetLength](a, conf, p, r) {
+ override def convertToGpu(child: Expression): GpuExpression = GpuOctetLength(child)
+ })
).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap
// Shim expressions should be last to allow overrides with shim-specific versions
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffleCoalesceExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffleCoalesceExec.scala
index ba42c2d8f3e..c85d6fe1a60 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffleCoalesceExec.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffleCoalesceExec.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -18,8 +18,8 @@ package com.nvidia.spark.rapids
import java.util
-import ai.rapids.cudf.{HostMemoryBuffer, JCudfSerialization, NvtxColor, NvtxRange}
-import ai.rapids.cudf.JCudfSerialization.SerializedTableHeader
+import ai.rapids.cudf.{HostConcatResultUtil, HostMemoryBuffer, JCudfSerialization, NvtxColor, NvtxRange}
+import ai.rapids.cudf.JCudfSerialization.{HostConcatResult, SerializedTableHeader}
import com.nvidia.spark.rapids.shims.v2.ShimUnaryExecNode
import org.apache.spark.TaskContext
@@ -61,10 +61,12 @@ case class GpuShuffleCoalesceExec(child: SparkPlan, targetBatchByteSize: Long)
override def doExecuteColumnar(): RDD[ColumnarBatch] = {
val metricsMap = allMetrics
val targetSize = targetBatchByteSize
- val sparkSchema = GpuColumnVector.extractTypes(schema)
+ val dataTypes = GpuColumnVector.extractTypes(schema)
child.executeColumnar().mapPartitions { iter =>
- new GpuShuffleCoalesceIterator(iter, targetSize, sparkSchema, metricsMap)
+ new GpuShuffleCoalesceIterator(
+ new HostShuffleCoalesceIterator(iter, targetSize, dataTypes, metricsMap),
+ dataTypes, metricsMap)
}
}
}
@@ -72,22 +74,18 @@ case class GpuShuffleCoalesceExec(child: SparkPlan, targetBatchByteSize: Long)
/**
* Iterator that coalesces columnar batches that are expected to only contain
* [[SerializedTableColumn]]. The serialized tables within are collected up
- * to the target batch size and then concatenated on the host before the data
- * is transferred to the GPU.
+ * to the target batch size and then concatenated on the host before the result
+ * is handed to the caller on `.next()`.
*/
-class GpuShuffleCoalesceIterator(
+class HostShuffleCoalesceIterator(
iter: Iterator[ColumnarBatch],
targetBatchByteSize: Long,
- sparkSchema: Array[DataType],
+ dataTypes: Array[DataType],
metricsMap: Map[String, GpuMetric])
- extends Iterator[ColumnarBatch] with Arm with AutoCloseable {
- private[this] val opTimeMetric = metricsMap(GpuMetric.OP_TIME)
+ extends Iterator[HostConcatResult] with Arm with AutoCloseable {
+ private[this] val concatTimeMetric = metricsMap(GpuMetric.CONCAT_TIME)
private[this] val inputBatchesMetric = metricsMap(GpuMetric.NUM_INPUT_BATCHES)
private[this] val inputRowsMetric = metricsMap(GpuMetric.NUM_INPUT_ROWS)
- private[this] val outputBatchesMetric = metricsMap(GpuMetric.NUM_OUTPUT_BATCHES)
- private[this] val outputRowsMetric = metricsMap(GpuMetric.NUM_OUTPUT_ROWS)
- private[this] val concatTimeMetric = metricsMap(GpuMetric.CONCAT_TIME)
- private[this] val semWaitTime = metricsMap(GpuMetric.SEMAPHORE_WAIT_TIME)
private[this] val serializedTables = new util.ArrayDeque[SerializedTableColumn]
private[this] var numTablesInBatch: Int = 0
private[this] var numRowsInBatch: Int = 0
@@ -95,21 +93,44 @@ class GpuShuffleCoalesceIterator(
Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => close()))
- override def hasNext: Boolean = {
- bufferNextBatch()
- numTablesInBatch > 0
+ override def close(): Unit = {
+ serializedTables.forEach(_.close())
+ serializedTables.clear()
}
- override def next(): ColumnarBatch = {
- if (!hasNext) {
- throw new NoSuchElementException("No more columnar batches")
+ def concatenateTablesInHost(): HostConcatResult = {
+ val result = withResource(new MetricRange(concatTimeMetric)) { _ =>
+ val firstHeader = serializedTables.peekFirst().header
+ if (firstHeader.getNumColumns == 0) {
+ (0 until numTablesInBatch).foreach(_ => serializedTables.removeFirst())
+ HostConcatResultUtil.rowsOnlyHostConcatResult(numRowsInBatch)
+ } else {
+ val headers = new Array[SerializedTableHeader](numTablesInBatch)
+ withResource(new Array[HostMemoryBuffer](numTablesInBatch)) { buffers =>
+ headers.indices.foreach { i =>
+ val serializedTable = serializedTables.removeFirst()
+ headers(i) = serializedTable.header
+ buffers(i) = serializedTable.hostBuffer
+ }
+ JCudfSerialization.concatToHostBuffer(headers, buffers)
+ }
+ }
}
- concatenateBatch()
- }
- override def close(): Unit = {
- serializedTables.forEach(_.close())
- serializedTables.clear()
+ // update the stats for the next batch in progress
+ numTablesInBatch = serializedTables.size
+
+ batchByteSize = 0
+ numRowsInBatch = 0
+ if (numTablesInBatch > 0) {
+ require(numTablesInBatch == 1,
+ "should only track at most one buffer that is not in a batch")
+ val header = serializedTables.peekFirst().header
+ batchByteSize = header.getDataLen
+ numRowsInBatch = header.getNumRows
+ }
+
+ result
}
private def bufferNextBatch(): Unit = {
@@ -120,7 +141,7 @@ class GpuShuffleCoalesceIterator(
inputBatchesMetric += 1
// don't bother tracking empty tables
if (batch.numRows > 0) {
- inputRowsMetric += batch.numRows
+ inputRowsMetric += batch.numRows()
val tableColumn = batch.column(0).asInstanceOf[SerializedTableColumn]
batchCanGrow = canAddToBatch(tableColumn.header)
serializedTables.addLast(tableColumn)
@@ -138,6 +159,18 @@ class GpuShuffleCoalesceIterator(
}
}
+ override def hasNext(): Boolean = {
+ bufferNextBatch()
+ numTablesInBatch > 0
+ }
+
+ override def next(): HostConcatResult = {
+ if (!hasNext()) {
+ throw new NoSuchElementException("No more host batches to concatenate")
+ }
+ concatenateTablesInHost()
+ }
+
private def canAddToBatch(nextTable: SerializedTableHeader): Boolean = {
if (batchByteSize + nextTable.getDataLen > targetBatchByteSize) {
return false
@@ -147,60 +180,41 @@ class GpuShuffleCoalesceIterator(
}
true
}
+}
- private def concatenateBatch(): ColumnarBatch = {
- val firstHeader = serializedTables.peekFirst().header
- val batch = withResource(new MetricRange(concatTimeMetric)) { _ =>
- if (firstHeader.getNumColumns == 0) {
- // acquire the GPU unconditionally for now in this case, as a downstream exec
- // may need the GPU, and the assumption is that it is acquired in the coalesce
- // code.
- GpuSemaphore.acquireIfNecessary(TaskContext.get(), semWaitTime)
- (0 until numTablesInBatch).foreach(_ => serializedTables.removeFirst())
- new ColumnarBatch(Array.empty, numRowsInBatch)
- } else {
- concatenateTablesBatch()
- }
- }
+/**
+ * Iterator that takes the host-concatenated shuffle results produced by
+ * `HostShuffleCoalesceIterator`, acquires the GPU semaphore, and transfers
+ * each result to the GPU as a `ColumnarBatch`.
+ */
+class GpuShuffleCoalesceIterator(iter: Iterator[HostConcatResult],
+ dataTypes: Array[DataType],
+ metricsMap: Map[String, GpuMetric])
+ extends Iterator[ColumnarBatch] with Arm {
+ private[this] val semWaitTime = metricsMap(GpuMetric.SEMAPHORE_WAIT_TIME)
+ private[this] val opTimeMetric = metricsMap(GpuMetric.OP_TIME)
+ private[this] val outputBatchesMetric = metricsMap(GpuMetric.NUM_OUTPUT_BATCHES)
+ private[this] val outputRowsMetric = metricsMap(GpuMetric.NUM_OUTPUT_ROWS)
- withResource(new MetricRange(opTimeMetric)) { _ =>
- outputBatchesMetric += 1
- outputRowsMetric += batch.numRows
-
- // update the stats for the next batch in progress
- numTablesInBatch = serializedTables.size
- batchByteSize = 0
- numRowsInBatch = 0
- if (numTablesInBatch > 0) {
- require(numTablesInBatch == 1,
- "should only track at most one buffer that is not in a batch")
- val header = serializedTables.peekFirst().header
- batchByteSize = header.getDataLen
- numRowsInBatch = header.getNumRows
- }
+ override def hasNext: Boolean = iter.hasNext
- batch
+ override def next(): ColumnarBatch = {
+ if (!hasNext) {
+ throw new NoSuchElementException("No more columnar batches")
}
- }
-
- private def concatenateTablesBatch(): ColumnarBatch = {
- val headers = new Array[SerializedTableHeader](numTablesInBatch)
- withResource(new Array[HostMemoryBuffer](numTablesInBatch)) { buffers =>
- headers.indices.foreach { i =>
- val serializedTable = serializedTables.removeFirst()
- headers(i) = serializedTable.header
- buffers(i) = serializedTable.hostBuffer
- }
-
- withResource(new NvtxRange("Concat+Load Batch", NvtxColor.YELLOW)) { _ =>
- withResource(JCudfSerialization.concatToHostBuffer(headers, buffers)) { hostConcatResult =>
- // about to start using the GPU in this task
- GpuSemaphore.acquireIfNecessary(TaskContext.get(), semWaitTime)
- withResource(new MetricRange(opTimeMetric)) { _ =>
- withResource(hostConcatResult.toContiguousTable) { contigTable =>
- GpuColumnVectorFromBuffer.from(contigTable, sparkSchema)
- }
- }
+ withResource(new NvtxRange("Concat+Load Batch", NvtxColor.YELLOW)) { _ =>
+ withResource(iter.next()) { hostConcatResult =>
+ // We acquire the GPU regardless of whether `hostConcatResult`
+ // is an empty batch or not, because the downstream tasks expect
+ // the `GpuShuffleCoalesceIterator` to acquire the semaphore and may
+ // generate GPU data from batches that are empty.
+ GpuSemaphore.acquireIfNecessary(TaskContext.get(), semWaitTime)
+ withResource(new MetricRange(opTimeMetric)) { _ =>
+ val batch = HostConcatResultUtil.getColumnarBatch(hostConcatResult, dataTypes)
+ outputBatchesMetric += 1
+ outputRowsMetric += batch.numRows()
+ batch
}
}
}
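
The GpuShuffleCoalesceExec change above splits the old single iterator into a host stage (`HostShuffleCoalesceIterator`, which buffers and concatenates serialized tables without touching the GPU) and a device stage (`GpuShuffleCoalesceIterator`, which acquires the semaphore only when a concatenated result is about to be loaded). Below is a minimal, self-contained sketch of that two-stage pattern; the types and the semaphore call are stand-ins, not the plugin's APIs.

    import scala.collection.mutable.ArrayBuffer

    object TwoStageCoalesceSketch {
      final case class HostChunk(bytes: Array[Byte])

      // Stage 1: buffer and concatenate serialized chunks on the host; no GPU involvement.
      class HostCoalesceIterator(input: Iterator[HostChunk], targetBytes: Long)
          extends Iterator[HostChunk] {
        private val pending = ArrayBuffer.empty[HostChunk]

        private def buffer(): Unit = {
          var size = pending.iterator.map(_.bytes.length.toLong).sum
          while (input.hasNext && size < targetBytes) {
            val c = input.next()
            pending += c
            size += c.bytes.length
          }
        }

        override def hasNext: Boolean = { buffer(); pending.nonEmpty }

        override def next(): HostChunk = {
          if (!hasNext) throw new NoSuchElementException("no more host chunks")
          val out = HostChunk(pending.iterator.flatMap(_.bytes).toArray) // host-side concat
          pending.clear()
          out
        }
      }

      // Stage 2: the "GPU" is only touched here, once per already-concatenated result.
      class DeviceLoadIterator(host: Iterator[HostChunk]) extends Iterator[Array[Byte]] {
        override def hasNext: Boolean = host.hasNext
        override def next(): Array[Byte] = {
          val h = host.next()              // all host-side work is already done
          println("acquire GPU semaphore") // stand-in for GpuSemaphore.acquireIfNecessary
          h.bytes                          // stand-in for copying the buffer to the device
        }
      }

      def main(args: Array[String]): Unit = {
        val input = Iterator(HostChunk(Array[Byte](1, 2)), HostChunk(Array[Byte](3, 4, 5)))
        val batches = new DeviceLoadIterator(new HostCoalesceIterator(input, 1024L))
        batches.foreach(b => println(b.toSeq))
      }
    }
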
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
index 675ec01758e..8c33ede98bc 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExec.scala
@@ -16,8 +16,11 @@
package com.nvidia.spark.rapids
+import ai.rapids.cudf.{HostConcatResultUtil, NvtxColor, NvtxRange}
+import ai.rapids.cudf.JCudfSerialization.HostConcatResult
import com.nvidia.spark.rapids.shims.v2.{GpuHashPartitioning, GpuJoinUtils, ShimBinaryExecNode}
+import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
@@ -97,7 +100,9 @@ case class GpuShuffledHashJoinExec(
override val outputBatchesLevel: MetricsLevel = MODERATE_LEVEL
override lazy val additionalMetrics: Map[String, GpuMetric] = Map(
OP_TIME -> createNanoTimingMetric(MODERATE_LEVEL, DESCRIPTION_OP_TIME),
+ CONCAT_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_CONCAT_TIME),
BUILD_DATA_SIZE -> createSizeMetric(ESSENTIAL_LEVEL, DESCRIPTION_BUILD_DATA_SIZE),
+ PEAK_DEVICE_MEMORY -> createSizeMetric(MODERATE_LEVEL, DESCRIPTION_PEAK_DEVICE_MEMORY),
BUILD_TIME -> createNanoTimingMetric(ESSENTIAL_LEVEL, DESCRIPTION_BUILD_TIME),
STREAM_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_STREAM_TIME),
JOIN_TIME -> createNanoTimingMetric(DEBUG_LEVEL, DESCRIPTION_JOIN_TIME),
@@ -123,28 +128,39 @@ case class GpuShuffledHashJoinExec(
val numOutputRows = gpuLongMetric(NUM_OUTPUT_ROWS)
val numOutputBatches = gpuLongMetric(NUM_OUTPUT_BATCHES)
val opTime = gpuLongMetric(OP_TIME)
- val buildTime = gpuLongMetric(BUILD_TIME)
val streamTime = gpuLongMetric(STREAM_TIME)
val joinTime = gpuLongMetric(JOIN_TIME)
val joinOutputRows = gpuLongMetric(JOIN_OUTPUT_ROWS)
- val targetSize = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)
+ val batchSizeBytes = RapidsConf.GPU_BATCH_SIZE_BYTES.get(conf)
val spillCallback = GpuMetric.makeSpillCallback(allMetrics)
- val localBuildOutput: Seq[Attribute] = buildPlan.output
+ val localBuildOutput = buildPlan.output
+
+ // Create a map of metrics that can be handed down to the shuffle and coalesce
+ // iterators, replacing with no-ops those metrics that the coalesce iterators
+ // normally update but that would produce the wrong statistics here, because the
+ // join updates the same metrics itself.
+ val coalesceMetricsMap = allMetrics +
+ (GpuMetric.NUM_INPUT_ROWS -> NoopMetric,
+ GpuMetric.NUM_INPUT_BATCHES -> NoopMetric,
+ GpuMetric.NUM_OUTPUT_BATCHES -> NoopMetric,
+ GpuMetric.NUM_OUTPUT_ROWS -> NoopMetric)
streamedPlan.executeColumnar().zipPartitions(buildPlan.executeColumnar()) {
(streamIter, buildIter) => {
- val stIt = new CollectTimeIterator("shuffled join stream", streamIter, streamTime)
- val startTime = System.nanoTime()
+ val (builtBatch, maybeBufferedStreamIter) =
+ GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ batchSizeBytes,
+ localBuildOutput,
+ buildIter,
+ new CollectTimeIterator("shuffled join stream", streamIter, streamTime),
+ spillCallback,
+ coalesceMetricsMap)
- withResource(ConcatAndConsumeAll.getSingleBatchWithVerification(buildIter,
- localBuildOutput)) { builtBatch =>
+ withResource(builtBatch) { _ =>
// doJoin will increment the reference counts as needed for the builtBatch
- val delta = System.nanoTime() - startTime
- buildTime += delta
buildDataSize += GpuColumnVector.getTotalDeviceMemoryUsed(builtBatch)
-
- doJoin(builtBatch, stIt, targetSize, spillCallback,
- numOutputRows, joinOutputRows, numOutputBatches,
+ doJoin(builtBatch, maybeBufferedStreamIter,
+ batchSizeBytes, spillCallback, numOutputRows, joinOutputRows, numOutputBatches,
opTime, joinTime)
}
}
@@ -155,3 +171,183 @@ case class GpuShuffledHashJoinExec(
if (isSkewJoin) super.nodeName + "(skew=true)" else super.nodeName
}
}
+
+object GpuShuffledHashJoinExec extends Arm {
+ /**
+ * Helper iterator that wraps a BufferedIterator of AutoCloseable subclasses.
+ * This iterator also implements AutoCloseable, so it can be closed in case
+ * of exceptions.
+ *
+ * @param wrapped the buffered iterator
+ * @tparam T an AutoCloseable subclass
+ */
+ class CloseableBufferedIterator[T <: AutoCloseable](wrapped: BufferedIterator[T])
+ extends BufferedIterator[T] with AutoCloseable {
+ // register against task completion to close any leaked buffered items
+ Option(TaskContext.get()).foreach(_.addTaskCompletionListener[Unit](_ => close()))
+
+ private[this] var isClosed = false
+ override def head: T = wrapped.head
+ override def headOption: Option[T] = wrapped.headOption
+ override def next: T = wrapped.next
+ override def hasNext: Boolean = wrapped.hasNext
+ override def close(): Unit = {
+ if (!isClosed) {
+ headOption.foreach(_.close())
+ isClosed = true
+ }
+ }
+ }
+
+ /**
+ * Gets a `ColumnarBatch` and stream Iterator[ColumnarBatch] pair by acquiring
+ * the GPU semaphore optimally in the scenario where the build side is relatively
+ * small (less than `hostTargetBatchSize`).
+ *
+ * In the optimal case, this function will load the build side on the host up to the
+ * configured goal and, if it fits entirely, allow the stream iterator to pull its first
+ * batch to the host as well. After the first stream batch is on the host, the
+ * stream iterator acquires the semaphore and then the build side is copied to the GPU.
+ *
+ * Prior to this we would get a build batch on the GPU, acquiring
+ * the semaphore in the process, and then begin pulling from the stream iterator,
+ * which could include IO (while holding onto the semaphore).
+ *
+ * The function handles the case where the build side goes above the configured batch
+ * goal, in which case it will concatenate on the host, grab the semaphore, and continue
+ * pulling from the build iterator to build a bigger batch on the GPU. This path is not
+ * optimized, because we hold onto the semaphore for the entire time after realizing the
+ * goal has been hit.
+ *
+ * @param hostTargetBatchSize target batch size goal on the host
+ * @param buildOutput output attributes of the build plan
+ * @param buildIter build iterator
+ * @param streamIter stream iterator
+ * @param spillCallback metric updater in case downstream iterators spill
+ * @param coalesceMetricsMap metrics map with metrics to be used in downstream
+ * iterators
+ * @return a pair of `ColumnarBatch` and streamed iterator that can be
+ * used for the join
+ */
+ def getBuiltBatchAndStreamIter(
+ hostTargetBatchSize: Long,
+ buildOutput: Seq[Attribute],
+ buildIter: Iterator[ColumnarBatch],
+ streamIter: Iterator[ColumnarBatch],
+ spillCallback: SpillCallback,
+ coalesceMetricsMap: Map[String, GpuMetric]): (ColumnarBatch, Iterator[ColumnarBatch]) = {
+ val semWait = coalesceMetricsMap(GpuMetric.SEMAPHORE_WAIT_TIME)
+ val buildTime = coalesceMetricsMap(GpuMetric.BUILD_TIME)
+ var bufferedBuildIterator: CloseableBufferedIterator[ColumnarBatch] = null
+ closeOnExcept(bufferedBuildIterator) { _ =>
+ val startTime = System.nanoTime()
+ // Determine whether the build side is non-empty and whether its first batch is
+ // a serialized batch. If either condition is not met, we fall back to the
+ // `getSingleBatchWithVerification` method.
+ val firstBatchIsSerialized = {
+ if (!buildIter.hasNext) {
+ false
+ } else {
+ bufferedBuildIterator = new CloseableBufferedIterator(buildIter.buffered)
+ val firstBatch = bufferedBuildIterator.head
+ if (firstBatch.numCols() != 1) {
+ false
+ } else {
+ firstBatch.column(0).isInstanceOf[SerializedTableColumn]
+ }
+ }
+ }
+
+ if (!firstBatchIsSerialized) {
+ // In this scenario the build side batches are not serialized host-side batches;
+ // given the plan rules we expect this to be a single batch
+ val builtBatch =
+ ConcatAndConsumeAll.getSingleBatchWithVerification(
+ Option(bufferedBuildIterator).getOrElse(buildIter), buildOutput)
+ val delta = System.nanoTime() - startTime
+ buildTime += delta
+ (builtBatch, streamIter)
+ } else {
+ val dataTypes = buildOutput.map(_.dataType).toArray
+ val hostConcatIter = new HostShuffleCoalesceIterator(bufferedBuildIterator,
+ hostTargetBatchSize, dataTypes, coalesceMetricsMap)
+ withResource(hostConcatIter) { _ =>
+ closeOnExcept(hostConcatIter.next()) { hostConcatResult =>
+ if (!hostConcatIter.hasNext()) {
+ // add the time it took to fetch that first host-side build batch
+ buildTime += System.nanoTime() - startTime
+ // Optimal case: we drained the build iterator with a single host-side concatenation,
+ // so the whole build side is one batch that currently resides on the host.
+ // We peek at the stream iterator with `hasNext` on the buffered
+ // iterator, which will grab the semaphore when putting the first stream
+ // batch on the GPU, and then we bring the build batch to the GPU and return.
+ val bufferedStreamIter = new CloseableBufferedIterator(streamIter.buffered)
+ closeOnExcept(bufferedStreamIter) { _ =>
+ withResource(new NvtxRange("first stream batch", NvtxColor.RED)) { _ =>
+ if (bufferedStreamIter.hasNext) {
+ bufferedStreamIter.head
+ } else {
+ GpuSemaphore.acquireIfNecessary(TaskContext.get(), semWait)
+ }
+ }
+ val buildBatch = getBuildBatchOptimized(hostConcatResult, buildOutput, buildTime)
+ (buildBatch, bufferedStreamIter)
+ }
+ } else {
+ val buildBatch = getBuildBatchFromUnfinished(
+ Seq(hostConcatResult).iterator ++ hostConcatIter,
+ buildOutput, spillCallback, coalesceMetricsMap)
+ buildTime += System.nanoTime() - startTime
+ (buildBatch, streamIter)
+ }
+ }
+ }
+ }
+ }
+ }
+
+ private def getBuildBatchFromUnfinished(
+ iterWithPrior: Iterator[HostConcatResult],
+ buildOutput: Seq[Attribute],
+ spillCallback: SpillCallback,
+ coalesceMetricsMap: Map[String, GpuMetric]): ColumnarBatch = {
+ // In the fallback case we build the same iterator chain that the Spark plan
+ // would have produced:
+ // GpuCoalesceIterator(GpuShuffleCoalesceIterator(shuffled build side))
+ // This allows us to make the shuffle batches spillable in case we have a large
+ // build-side table, since `RequireSingleBatch` imposes virtually no limit and we
+ // know we are already above `hostTargetBatchSize` (which is 2GB by default).
+ val dataTypes = buildOutput.map(_.dataType).toArray
+ val shuffleCoalesce = new GpuShuffleCoalesceIterator(
+ iterWithPrior,
+ dataTypes,
+ coalesceMetricsMap)
+ val res = ConcatAndConsumeAll.getSingleBatchWithVerification(
+ new GpuCoalesceIterator(shuffleCoalesce,
+ dataTypes,
+ RequireSingleBatch,
+ NoopMetric, NoopMetric, NoopMetric, NoopMetric, NoopMetric,
+ coalesceMetricsMap(GpuMetric.CONCAT_TIME),
+ coalesceMetricsMap(GpuMetric.OP_TIME),
+ coalesceMetricsMap(GpuMetric.PEAK_DEVICE_MEMORY),
+ spillCallback,
+ "build batch"),
+ buildOutput)
+ res
+ }
+
+ private def getBuildBatchOptimized(
+ hostConcatResult: HostConcatResult,
+ buildOutput: Seq[Attribute],
+ buildTime: GpuMetric): ColumnarBatch = {
+ val dataTypes = buildOutput.map(_.dataType).toArray
+ // The task already holds the GPU semaphore and the build batch is within
+ // `targetSizeBytes`, so we can bring the build batch to the GPU now.
+ withResource(hostConcatResult) { _ =>
+ buildTime.ns {
+ HostConcatResultUtil.getColumnarBatch(hostConcatResult, dataTypes)
+ }
+ }
+ }
+}
+
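
The `coalesceMetricsMap` built in GpuShuffledHashJoinExec above silences selected metrics by overriding map entries with `NoopMetric` before handing the map to the nested coalesce iterators. A standalone sketch of that pattern, using trivial stand-in metric types rather than the plugin's `GpuMetric` hierarchy:

    object MetricOverrideSketch {
      trait Metric { def +=(v: Long): Unit }
      object NoopMetric extends Metric { override def +=(v: Long): Unit = () }
      class CountMetric extends Metric {
        var value = 0L
        override def +=(v: Long): Unit = value += v
      }

      def main(args: Array[String]): Unit = {
        val allMetrics: Map[String, Metric] =
          Map("numOutputRows" -> new CountMetric, "opTime" -> new CountMetric)
        // `Map + (key -> value)` replaces existing entries, so the join can silence the
        // counters its nested coalesce iterators would otherwise double-count.
        val coalesceMetrics = allMetrics + ("numOutputRows" -> NoopMetric)
        coalesceMetrics("numOutputRows") += 10 // discarded by the no-op
        coalesceMetrics("opTime") += 5         // still recorded in the shared metric
        println(allMetrics("opTime").asInstanceOf[CountMetric].value) // prints 5
      }
    }
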
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala
index 9e9c496e8b4..54bf57e94fa 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/GpuTransitionOverrides.scala
@@ -237,6 +237,28 @@ class GpuTransitionOverrides extends Rule[SparkPlan] {
p.withNewChildren(p.children.map(optimizeCoalesce))
}
+ /**
+ * Removes `GpuCoalesceBatches(GpuShuffleCoalesceExec(build side))` from the build side
+ * of a shuffled hash join. The coalesce logic for that side is instead handled inside
+ * `GpuShuffledHashJoinExec`, in a way that avoids holding onto the GPU semaphore
+ * while waiting for stream-side IO.
+ */
+ def shuffledHashJoinOptimizeShuffle(plan: SparkPlan): SparkPlan = plan match {
+ case x@GpuShuffledHashJoinExec(
+ _, _, _, buildSide, _,
+ left: GpuShuffleCoalesceExec,
+ GpuCoalesceBatches(GpuShuffleCoalesceExec(rc, _), _),_) if buildSide == GpuBuildRight =>
+ x.withNewChildren(
+ Seq(shuffledHashJoinOptimizeShuffle(left), shuffledHashJoinOptimizeShuffle(rc)))
+ case x@GpuShuffledHashJoinExec(
+ _, _, _, buildSide, _,
+ GpuCoalesceBatches(GpuShuffleCoalesceExec(lc, _), _),
+ right: GpuShuffleCoalesceExec, _) if buildSide == GpuBuildLeft =>
+ x.withNewChildren(
+ Seq(shuffledHashJoinOptimizeShuffle(lc), shuffledHashJoinOptimizeShuffle(right)))
+ case p => p.withNewChildren(p.children.map(shuffledHashJoinOptimizeShuffle))
+ }
+
private def insertCoalesce(plans: Seq[SparkPlan], goals: Seq[CoalesceGoal],
disableUntilInput: Boolean): Seq[SparkPlan] = {
plans.zip(goals).map {
@@ -550,6 +572,9 @@ class GpuTransitionOverrides extends Rule[SparkPlan] {
}
updatedPlan = fixupHostColumnarTransitions(updatedPlan)
updatedPlan = optimizeCoalesce(updatedPlan)
+ if (rapidsConf.shuffledHashJoinOptimizeShuffle) {
+ updatedPlan = shuffledHashJoinOptimizeShuffle(updatedPlan)
+ }
if (rapidsConf.exportColumnarRdd) {
updatedPlan = detectAndTagFinalColumnarOutput(updatedPlan)
}
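
`shuffledHashJoinOptimizeShuffle` above is a standard bottom-up plan rewrite: match a node shape, splice out an intermediate wrapper on one child, and recurse everywhere else via `withNewChildren`. A toy illustration of that shape-matching rewrite with made-up node types (not Spark's `SparkPlan` API):

    object PlanRewriteSketch {
      sealed trait Node {
        def children: Seq[Node]
        def withChildren(c: Seq[Node]): Node
      }
      case class Leaf(name: String) extends Node {
        def children: Seq[Node] = Nil
        def withChildren(c: Seq[Node]): Node = this
      }
      case class Coalesce(child: Node) extends Node {
        def children: Seq[Node] = Seq(child)
        def withChildren(c: Seq[Node]): Node = Coalesce(c.head)
      }
      case class Join(left: Node, right: Node) extends Node {
        def children: Seq[Node] = Seq(left, right)
        def withChildren(c: Seq[Node]): Node = Join(c(0), c(1))
      }

      // If the (build) right child is wrapped in a Coalesce, splice it out; otherwise recurse.
      def dropBuildSideCoalesce(plan: Node): Node = plan match {
        case Join(l, Coalesce(r)) => Join(dropBuildSideCoalesce(l), dropBuildSideCoalesce(r))
        case p => p.withChildren(p.children.map(dropBuildSideCoalesce))
      }

      def main(args: Array[String]): Unit =
        // prints Join(Leaf(stream),Leaf(build))
        println(dropBuildSideCoalesce(Join(Leaf("stream"), Coalesce(Leaf("build")))))
    }
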
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
index 93ce85ce3e1..16ff8eb02f2 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/Plugin.scala
@@ -21,7 +21,9 @@ import java.util.Properties
import java.util.concurrent.atomic.{AtomicBoolean, AtomicReference}
import scala.collection.JavaConverters._
+import scala.collection.mutable.{Map => MutableMap}
import scala.util.Try
+import scala.util.matching.Regex
import com.nvidia.spark.rapids.python.PythonWorkerSemaphore
@@ -391,25 +393,33 @@ object ExecutionPlanCaptureCallback {
executedPlan.expressions.exists(didFallBack(_, fallbackCpuClass))
}
- private def containsExpression(exp: Expression, className: String): Boolean = exp.find {
+ private def containsExpression(exp: Expression, className: String,
+ regexMap: MutableMap[String, Regex] // regex memoization
+ ): Boolean = exp.find {
case e if PlanUtils.getBaseNameFromClass(e.getClass.getName) == className => true
- case e: ExecSubqueryExpression => containsPlan(e.plan, className)
+ case e: ExecSubqueryExpression => containsPlan(e.plan, className, regexMap)
case _ => false
}.nonEmpty
- private def containsPlan(plan: SparkPlan, className: String): Boolean = plan.find {
+ private def containsPlan(plan: SparkPlan, className: String,
+ regexMap: MutableMap[String, Regex] = MutableMap.empty // regex memoization
+ ): Boolean = plan.find {
case p if PlanUtils.sameClass(p, className) =>
true
case p: AdaptiveSparkPlanExec =>
- containsPlan(p.executedPlan, className)
+ containsPlan(p.executedPlan, className, regexMap)
case p: QueryStageExec =>
- containsPlan(p.plan, className)
+ containsPlan(p.plan, className, regexMap)
case p: ReusedSubqueryExec =>
- containsPlan(p.child, className)
+ containsPlan(p.child, className, regexMap)
case p: ReusedExchangeExec =>
- containsPlan(p.child, className)
- case p =>
- p.expressions.exists(containsExpression(_, className))
+ containsPlan(p.child, className, regexMap)
+ case p if p.expressions.exists(containsExpression(_, className, regexMap)) =>
+ true
+ case p: SparkPlan =>
+ regexMap.getOrElseUpdate(className, className.r)
+ .findFirstIn(p.simpleStringWithNodeId())
+ .nonEmpty
}.nonEmpty
}
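
The `regexMap` parameter threaded through `containsPlan` and `containsExpression` above memoizes compiled patterns with `getOrElseUpdate`, so a class-name pattern is compiled at most once per lookup rather than on every plan node visited. A self-contained sketch of that memoization (the node strings are made up):

    import scala.collection.mutable.{Map => MutableMap}
    import scala.util.matching.Regex

    object RegexMemoSketch {
      def main(args: Array[String]): Unit = {
        val regexMap: MutableMap[String, Regex] = MutableMap.empty
        val planNodes = Seq("GpuFilter (1)", "GpuProject (2)", "GpuFilter (3)")
        val matches = planNodes.count { node =>
          // compile "GpuFilter" once, reuse the cached Regex on later visits
          regexMap.getOrElseUpdate("GpuFilter", "GpuFilter".r).findFirstIn(node).nonEmpty
        }
        println(s"matched $matches nodes using ${regexMap.size} compiled pattern(s)")
      }
    }
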
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala
index 222207c1151..f7730e84f99 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferCatalog.scala
@@ -299,4 +299,4 @@ object RapidsBufferCatalog extends Logging with Arm {
/** Remove a buffer ID from the catalog and release the resources of the registered buffer. */
def removeBuffer(id: RapidsBufferId): Unit = singleton.removeBuffer(id)
-}
\ No newline at end of file
+}
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala
index 92847350df6..e9ac9c1235e 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsBufferStore.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -70,11 +70,17 @@ abstract class RapidsBufferStore(
}
}
- def freeAll(): Unit = synchronized {
- val values = buffers.values().toArray(new Array[RapidsBufferBase](0))
+ def freeAll(): Unit = {
+ val values = synchronized {
+ val buffs = buffers.values().toArray(new Array[RapidsBufferBase](0))
+ buffers.clear()
+ spillable.clear()
+ buffs
+ }
+ // Free the buffers without holding the `RapidsBufferStore` lock to prevent a
+ // lock-order-inversion deadlock: `RapidsBufferBase.free` takes the buffer lock and then
+ // calls `RapidsBufferStore.remove` (store lock), while `freeAll` would otherwise hold
+ // the store lock while calling `RapidsBufferBase.free` (buffer lock).
values.foreach(_.free())
- buffers.clear()
- spillable.clear()
}
def nextSpillableBuffer(): RapidsBufferBase = synchronized {
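
The `freeAll` change above avoids a lock-order-inversion deadlock by snapshotting and clearing the tracking collections while holding the store lock, then calling `free()` on the snapshot after the lock is released. A minimal sketch of the same pattern with hypothetical `Store`/`Item` classes:

    object FreeAllSketch {
      class Item(store: Store) {
        // item lock -> store lock
        def free(): Unit = synchronized { store.remove(this) }
      }

      class Store {
        private val items = scala.collection.mutable.Set.empty[Item]
        def add(i: Item): Unit = synchronized { items += i }
        def remove(i: Item): Unit = synchronized { items -= i }

        def freeAll(): Unit = {
          // Snapshot and clear while holding only the store lock...
          val snapshot = synchronized {
            val s = items.toList
            items.clear()
            s
          }
          // ...then free outside the store lock, so a concurrent item.free()
          // (item lock -> store lock) cannot deadlock against freeAll
          // (store lock -> item lock).
          snapshot.foreach(_.free())
        }
      }

      def main(args: Array[String]): Unit = {
        val store = new Store
        store.add(new Item(store))
        store.freeAll()
        println("freed without deadlock")
      }
    }
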
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
index aec84a1f4a6..4042a466cbc 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsConf.scala
@@ -471,6 +471,16 @@ object RapidsConf {
.booleanConf
.createWithDefault(false)
+ val SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE =
+ conf("spark.rapids.sql.shuffledHashJoin.optimizeShuffle")
+ .doc("Enable or disable an optimization where shuffled build side batches are kept " +
+ "on the host while the first stream batch is loaded onto the GPU. The optimization " +
+ "increases off-heap host memory usage to avoid holding onto the GPU semaphore while " +
+ "waiting for stream side IO.")
+ .internal()
+ .booleanConf
+ .createWithDefault(true)
+
val STABLE_SORT = conf("spark.rapids.sql.stableSort.enabled")
.doc("Enable or disable stable sorting. Apache Spark's sorting is typically a stable " +
"sort, but sort stability cannot be guaranteed in distributed work loads because the " +
@@ -1484,6 +1494,8 @@ class RapidsConf(conf: Map[String, String]) extends Logging {
lazy val exportColumnarRdd: Boolean = get(EXPORT_COLUMNAR_RDD)
+ lazy val shuffledHashJoinOptimizeShuffle: Boolean = get(SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE)
+
lazy val stableSort: Boolean = get(STABLE_SORT)
lazy val isIncompatEnabled: Boolean = get(INCOMPATIBLE_OPS)
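
The new `spark.rapids.sql.shuffledHashJoin.optimizeShuffle` conf above is marked `internal()` and defaults to true; the test changes later in this patch toggle it explicitly. A hypothetical way to flip it from a SparkSession, shown only to illustrate the key name (the local session setup is a stand-in, and a GPU plus the plugin are still required for the setting to matter):

    import org.apache.spark.sql.SparkSession

    object ToggleShuffleOptSketch {
      def main(args: Array[String]): Unit = {
        val spark = SparkSession.builder().master("local[*]").getOrCreate()
        // Disable the build-side host buffering optimization for this session.
        spark.conf.set("spark.rapids.sql.shuffledHashJoin.optimizeShuffle", "false")
        println(spark.conf.get("spark.rapids.sql.shuffledHashJoin.optimizeShuffle"))
        spark.stop()
      }
    }
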
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
index ac3705f7004..09dfdfc1869 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsShuffleHeartbeatManager.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,7 +20,6 @@ import java.util.concurrent.{Executors, ScheduledExecutorService, TimeUnit}
import scala.collection.mutable.ArrayBuffer
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import org.apache.commons.lang3.mutable.MutableLong
import org.apache.spark.SparkEnv
diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
index 79db0286d8e..002c8b3f04b 100644
--- a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
+++ b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RegexParser.scala
@@ -404,6 +404,30 @@ class RegexParser(pattern: String) {
}
+object RegexParser {
+ private val regexpChars = Set('\u0000', '\\', '.', '^', '$', '\f')
+
+ def isRegExpString(s: String): Boolean = {
+
+ def isRegExpString(ast: RegexAST): Boolean = ast match {
+ case RegexChar(ch) => regexpChars.contains(ch)
+ case RegexEscaped(_) => true
+ case RegexSequence(parts) => parts.exists(isRegExpString)
+ case _ => true
+ }
+
+ try {
+ val parser = new RegexParser(s)
+ val ast = parser.parse()
+ isRegExpString(ast)
+ } catch {
+ case _: RegexUnsupportedException =>
+ // if we cannot parse the pattern, conservatively assume it might be a valid regexp
+ true
+ }
+ }
+}
+
/**
* Transpile Java/Spark regular expression to a format that cuDF supports, or throw an exception
* if this is not possible.
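
`RegexParser.isRegExpString` above parses the pattern into an AST and reports whether any part of it needs real regex semantics, falling back to `true` when the pattern cannot be parsed. The standalone sketch below is a simpler character-level approximation of the same question, using the metacharacter set from the diff; it is not the plugin's implementation.

    object LiteralSplitSketch {
      private val regexpMetaChars = Set('\u0000', '\\', '.', '^', '$', '\f')

      // True if the pattern needs regex handling; false if it can be treated as a literal.
      def looksLikeRegex(pattern: String): Boolean = pattern.exists(regexpMetaChars.contains)

      def main(args: Array[String]): Unit = {
        println(looksLikeRegex(","))   // false: plain delimiter, safe to split literally
        println(looksLikeRegex("a.b")) // true: '.' is a metacharacter
      }
    }
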
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/execution/datasources/rapids/OrcFiltersWrapper.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/execution/datasources/rapids/OrcFiltersWrapper.scala
new file mode 100644
index 00000000000..65792c76c82
--- /dev/null
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/execution/datasources/rapids/OrcFiltersWrapper.scala
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.execution.datasources.rapids
+
+import org.apache.hadoop.hive.ql.io.sarg.SearchArgument
+
+import org.apache.spark.sql.execution.datasources.orc.OrcFilters
+import org.apache.spark.sql.sources.Filter
+import org.apache.spark.sql.types.StructType
+
+// Wrapper for Spark's OrcFilters, which lives in a Spark-private package
+object OrcFiltersWrapper {
+ def createFilter(schema: StructType, filters: Seq[Filter]): Option[SearchArgument] = {
+ OrcFilters.createFilter(schema, filters)
+ }
+}
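
`OrcFiltersWrapper` above works because it is compiled into `org.apache.spark.sql.execution.datasources.rapids`, inside the Spark package tree where the package-private `OrcFilters` is visible. A generic sketch of that package-placement trick with made-up packages (not Spark's), compiled as one source file:

    package org.example.spark {
      // Only visible from the org.example.spark package tree.
      private[spark] object Hidden {
        def createThing(n: Int): String = s"thing-$n"
      }

      // Public re-export compiled inside the owning package, mirroring OrcFiltersWrapper.
      object HiddenWrapper {
        def createThing(n: Int): String = Hidden.createThing(n)
      }
    }

    package org.example.app {
      object Main {
        def main(args: Array[String]): Unit =
          println(org.example.spark.HiddenWrapper.createThing(1)) // prints thing-1
      }
    }
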
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFilters.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFilters.scala
deleted file mode 100644
index 2dd9973cafd..00000000000
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFilters.scala
+++ /dev/null
@@ -1,278 +0,0 @@
-/*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.rapids
-
-import java.time.{Instant, LocalDate}
-
-import org.apache.hadoop.hive.common.`type`.HiveDecimal
-import org.apache.hadoop.hive.ql.io.sarg.{PredicateLeaf, SearchArgument}
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgument.Builder
-import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory.newBuilder
-import org.apache.hadoop.hive.serde2.io.HiveDecimalWritable
-
-import org.apache.spark.SparkException
-import org.apache.spark.sql.catalyst.util.DateTimeUtils.{instantToMicros, localDateToDays, toJavaDate, toJavaTimestamp}
-import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.sources.Filter
-import org.apache.spark.sql.types._
-
-// This is derived from Apache Spark's OrcFilters code to avoid calling the
-// Spark version. Spark's version can potentially create a search argument
-// applier object that is incompatible with the orc:nohive jar that has been
-// shaded as part of this project.
-
-/**
- * Helper object for building ORC `SearchArgument`s, which are used for ORC predicate push-down.
- *
- * Due to limitation of ORC `SearchArgument` builder, we had to implement separate checking and
- * conversion passes through the Filter to make sure we only convert predicates that are known
- * to be convertible.
- *
- * An ORC `SearchArgument` must be built in one pass using a single builder. For example, you can't
- * build `a = 1` and `b = 2` first, and then combine them into `a = 1 AND b = 2`. This is quite
- * different from the cases in Spark SQL or Parquet, where complex filters can be easily built using
- * existing simpler ones.
- *
- * The annoying part is that, `SearchArgument` builder methods like `startAnd()`, `startOr()`, and
- * `startNot()` mutate internal state of the builder instance. This forces us to translate all
- * convertible filters with a single builder instance. However, if we try to translate a filter
- * before checking whether it can be converted or not, we may end up with a builder whose internal
- * state is inconsistent in the case of an inconvertible filter.
- *
- * For example, to convert an `And` filter with builder `b`, we call `b.startAnd()` first, and then
- * try to convert its children. Say we convert `left` child successfully, but find that `right`
- * child is inconvertible. Alas, `b.startAnd()` call can't be rolled back, and `b` is inconsistent
- * now.
- *
- * The workaround employed here is to trim the Spark filters before trying to convert them. This
- * way, we can only do the actual conversion on the part of the Filter that is known to be
- * convertible.
- *
- * P.S.: Hive seems to use `SearchArgument` together with `ExprNodeGenericFuncDesc` only. Usage of
- * builder methods mentioned above can only be found in test code, where all tested filters are
- * known to be convertible.
- */
-object OrcFilters extends OrcFiltersBase {
-
- /**
- * Create ORC filter as a SearchArgument instance.
- */
- def createFilter(schema: StructType, filters: Seq[Filter]): Option[SearchArgument] = {
- val dataTypeMap = OrcFilters.getSearchableTypeMap(schema, SQLConf.get.caseSensitiveAnalysis)
- // Combines all convertible filters using `And` to produce a single conjunction
- val conjunctionOptional = buildTree(convertibleFilters(dataTypeMap, filters))
- conjunctionOptional.map { conjunction =>
- // Then tries to build a single ORC `SearchArgument` for the conjunction predicate.
- // The input predicate is fully convertible. There should not be any empty result in the
- // following recursive method call `buildSearchArgument`.
- buildSearchArgument(dataTypeMap, conjunction, newBuilder).build()
- }
- }
-
- def convertibleFilters(
- dataTypeMap: Map[String, OrcPrimitiveField],
- filters: Seq[Filter]): Seq[Filter] = {
- import org.apache.spark.sql.sources._
-
- def convertibleFiltersHelper(
- filter: Filter,
- canPartialPushDown: Boolean): Option[Filter] = filter match {
- // At here, it is not safe to just convert one side and remove the other side
- // if we do not understand what the parent filters are.
- //
- // Here is an example used to explain the reason.
- // Let's say we have NOT(a = 2 AND b in ('1')) and we do not understand how to
- // convert b in ('1'). If we only convert a = 2, we will end up with a filter
- // NOT(a = 2), which will generate wrong results.
- //
- // Pushing one side of AND down is only safe to do at the top level or in the child
- // AND before hitting NOT or OR conditions, and in this case, the unsupported predicate
- // can be safely removed.
- case And(left, right) =>
- val leftResultOptional = convertibleFiltersHelper(left, canPartialPushDown)
- val rightResultOptional = convertibleFiltersHelper(right, canPartialPushDown)
- (leftResultOptional, rightResultOptional) match {
- case (Some(leftResult), Some(rightResult)) => Some(And(leftResult, rightResult))
- case (Some(leftResult), None) if canPartialPushDown => Some(leftResult)
- case (None, Some(rightResult)) if canPartialPushDown => Some(rightResult)
- case _ => None
- }
-
- // The Or predicate is convertible when both of its children can be pushed down.
- // That is to say, if one/both of the children can be partially pushed down, the Or
- // predicate can be partially pushed down as well.
- //
- // Here is an example used to explain the reason.
- // Let's say we have
- // (a1 AND a2) OR (b1 AND b2),
- // a1 and b1 is convertible, while a2 and b2 is not.
- // The predicate can be converted as
- // (a1 OR b1) AND (a1 OR b2) AND (a2 OR b1) AND (a2 OR b2)
- // As per the logical in And predicate, we can push down (a1 OR b1).
- case Or(left, right) =>
- for {
- lhs <- convertibleFiltersHelper(left, canPartialPushDown)
- rhs <- convertibleFiltersHelper(right, canPartialPushDown)
- } yield Or(lhs, rhs)
- case Not(pred) =>
- val childResultOptional = convertibleFiltersHelper(pred, canPartialPushDown = false)
- childResultOptional.map(Not)
- case other =>
- for (_ <- buildLeafSearchArgument(dataTypeMap, other, newBuilder())) yield other
- }
- filters.flatMap { filter =>
- convertibleFiltersHelper(filter, true)
- }
- }
-
- /**
- * Get PredicateLeafType which is corresponding to the given DataType.
- */
- def getPredicateLeafType(dataType: DataType): PredicateLeaf.Type = dataType match {
- case BooleanType => PredicateLeaf.Type.BOOLEAN
- case ByteType | ShortType | IntegerType | LongType => PredicateLeaf.Type.LONG
- case FloatType | DoubleType => PredicateLeaf.Type.FLOAT
- case StringType => PredicateLeaf.Type.STRING
- case DateType => PredicateLeaf.Type.DATE
- case TimestampType => PredicateLeaf.Type.TIMESTAMP
- case _: DecimalType => PredicateLeaf.Type.DECIMAL
- case _ => throw new UnsupportedOperationException(s"DataType: ${dataType.catalogString}")
- }
-
- /**
- * Cast literal values for filters.
- *
- * We need to cast to long because ORC raises exceptions
- * at 'checkLiteralType' of SearchArgumentImpl.java.
- */
- private def castLiteralValue(value: Any, dataType: DataType): Any = dataType match {
- case ByteType | ShortType | IntegerType | LongType =>
- value.asInstanceOf[Number].longValue
- case FloatType | DoubleType =>
- value.asInstanceOf[Number].doubleValue()
- case _: DecimalType =>
- new HiveDecimalWritable(HiveDecimal.create(value.asInstanceOf[java.math.BigDecimal]))
- case _: DateType if value.isInstanceOf[LocalDate] =>
- toJavaDate(localDateToDays(value.asInstanceOf[LocalDate]))
- case _: TimestampType if value.isInstanceOf[Instant] =>
- toJavaTimestamp(instantToMicros(value.asInstanceOf[Instant]))
- case _ => value
- }
-
- /**
- * Build a SearchArgument and return the builder so far.
- *
- * @param dataTypeMap a map from the attribute name to its data type.
- * @param expression the input predicates, which should be fully convertible to SearchArgument.
- * @param builder the input SearchArgument.Builder.
- * @return the builder so far.
- */
- private def buildSearchArgument(
- dataTypeMap: Map[String, OrcPrimitiveField],
- expression: Filter,
- builder: Builder): Builder = {
- import org.apache.spark.sql.sources._
-
- expression match {
- case And(left, right) =>
- val lhs = buildSearchArgument(dataTypeMap, left, builder.startAnd())
- val rhs = buildSearchArgument(dataTypeMap, right, lhs)
- rhs.end()
-
- case Or(left, right) =>
- val lhs = buildSearchArgument(dataTypeMap, left, builder.startOr())
- val rhs = buildSearchArgument(dataTypeMap, right, lhs)
- rhs.end()
-
- case Not(child) =>
- buildSearchArgument(dataTypeMap, child, builder.startNot()).end()
-
- case other =>
- buildLeafSearchArgument(dataTypeMap, other, builder).getOrElse {
- throw new SparkException(
- "The input filter of OrcFilters.buildSearchArgument should be fully convertible.")
- }
- }
- }
-
- /**
- * Build a SearchArgument for a leaf predicate and return the builder so far.
- *
- * @param dataTypeMap a map from the attribute name to its data type.
- * @param expression the input filter predicates.
- * @param builder the input SearchArgument.Builder.
- * @return the builder so far.
- */
- private def buildLeafSearchArgument(
- dataTypeMap: Map[String, OrcPrimitiveField],
- expression: Filter,
- builder: Builder): Option[Builder] = {
- def getType(attribute: String): PredicateLeaf.Type =
- getPredicateLeafType(dataTypeMap(attribute).fieldType)
-
- import org.apache.spark.sql.sources._
-
- // NOTE: For all case branches dealing with leaf predicates below, the additional `startAnd()`
- // call is mandatory. ORC `SearchArgument` builder requires that all leaf predicates must be
- // wrapped by a "parent" predicate (`And`, `Or`, or `Not`).
- expression match {
- case EqualTo(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startAnd()
- .equals(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case EqualNullSafe(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startAnd()
- .nullSafeEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case LessThan(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startAnd()
- .lessThan(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case LessThanOrEqual(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startAnd()
- .lessThanEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case GreaterThan(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startNot()
- .lessThanEquals(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case GreaterThanOrEqual(name, value) if dataTypeMap.contains(name) =>
- val castedValue = castLiteralValue(value, dataTypeMap(name).fieldType)
- Some(builder.startNot()
- .lessThan(dataTypeMap(name).fieldName, getType(name), castedValue).end())
-
- case IsNull(name) if dataTypeMap.contains(name) =>
- Some(builder.startAnd()
- .isNull(dataTypeMap(name).fieldName, getType(name)).end())
-
- case IsNotNull(name) if dataTypeMap.contains(name) =>
- Some(builder.startNot()
- .isNull(dataTypeMap(name).fieldName, getType(name)).end())
-
- case In(name, values) if dataTypeMap.contains(name) =>
- val castedValues = values.map(v => castLiteralValue(v, dataTypeMap(name).fieldType))
- Some(builder.startAnd().in(dataTypeMap(name).fieldName, getType(name),
- castedValues.map(_.asInstanceOf[AnyRef]): _*).end())
-
- case _ => None
- }
- }
-}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFiltersBase.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFiltersBase.scala
deleted file mode 100644
index d4fb2f260d6..00000000000
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/OrcFiltersBase.scala
+++ /dev/null
@@ -1,89 +0,0 @@
-/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-package org.apache.spark.sql.rapids
-
-import java.util.Locale
-
-import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap
-import org.apache.spark.sql.sources.{And, Filter}
-import org.apache.spark.sql.types.{AtomicType, BinaryType, DataType, StructField, StructType}
-
-/**
- * Methods that can be shared when upgrading the built-in Hive.
- *
- * Derived from Apache Spark to avoid depending upon it directly,
- * since its API has changed between Spark versions.
- */
-trait OrcFiltersBase {
-
- private[sql] def buildTree(filters: Seq[Filter]): Option[Filter] = {
- filters match {
- case Seq() => None
- case Seq(filter) => Some(filter)
- case Seq(filter1, filter2) => Some(And(filter1, filter2))
- case _ => // length > 2
- val (left, right) = filters.splitAt(filters.length / 2)
- Some(And(buildTree(left).get, buildTree(right).get))
- }
- }
-
- case class OrcPrimitiveField(fieldName: String, fieldType: DataType)
-
- /**
- * This method returns a map which contains ORC field name and data type. Each key
- * represents a column; `dots` are used as separators for nested columns. If any part
- * of the names contains `dots`, it is quoted to avoid confusion. See
- * `org.apache.spark.sql.connector.catalog.quoted` for implementation details.
- *
- * BinaryType, UserDefinedType, ArrayType and MapType are ignored.
- */
- protected[sql] def getSearchableTypeMap(
- schema: StructType,
- caseSensitive: Boolean): Map[String, OrcPrimitiveField] = {
- import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.MultipartIdentifierHelper
-
- def getPrimitiveFields(
- fields: Seq[StructField],
- parentFieldNames: Seq[String] = Seq.empty): Seq[(String, OrcPrimitiveField)] = {
- fields.flatMap { f =>
- f.dataType match {
- case st: StructType =>
- getPrimitiveFields(st.fields, parentFieldNames :+ f.name)
- case BinaryType => None
- case _: AtomicType =>
- val fieldName = (parentFieldNames :+ f.name).quoted
- val orcField = OrcPrimitiveField(fieldName, f.dataType)
- Some((fieldName, orcField))
- case _ => None
- }
- }
- }
-
- val primitiveFields = getPrimitiveFields(schema.fields)
- if (caseSensitive) {
- primitiveFields.toMap
- } else {
- // Don't consider ambiguity here, i.e. more than one field are matched in case insensitive
- // mode, just skip pushdown for these fields, they will trigger Exception when reading,
- // See: SPARK-25175.
- val dedupPrimitiveFields = primitiveFields
- .groupBy(_._1.toLowerCase(Locale.ROOT))
- .filter(_._2.size == 1)
- .mapValues(_.head._2)
- CaseInsensitiveMap(dedupPrimitiveFields)
- }
- }
-}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala
index 3bf981d2c63..76ab58f140b 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/collectionOperations.scala
@@ -171,9 +171,8 @@ case class GpuElementAt(left: Expression, right: Expression, failOnError: Boolea
if (!exist.isValid || exist.getBoolean) {
lhs.getBase.getMapValue(rhs.getBase)
} else {
- throw new NoSuchElementException(
- s"Key: ${rhs.getValue.asInstanceOf[UTF8String].toString} " +
- s"does not exist in one of the rows in the map column")
+ RapidsErrorUtils.throwInvalidElementAtIndexError(
+ rhs.getValue.asInstanceOf[UTF8String].toString, true)
}
}
}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeExtractors.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeExtractors.scala
index 8dd0635c988..14d152139eb 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeExtractors.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/complexTypeExtractors.scala
@@ -183,9 +183,8 @@ case class GpuGetMapValue(child: Expression, key: Expression, failOnError: Boole
withResource(lhs.getBase.getMapKeyExistence(rhs.getBase)) { keyExistenceColumn =>
withResource(keyExistenceColumn.all) { exist =>
if (exist.isValid && !exist.getBoolean) {
- throw new NoSuchElementException(
- s"Key: ${rhs.getValue.asInstanceOf[UTF8String].toString} " +
- s"does not exist in any one of the rows in the map column")
+ RapidsErrorUtils.throwInvalidElementAtIndexError(
+ rhs.getValue.asInstanceOf[UTF8String].toString)
}
}
}
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastExchangeExec.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastExchangeExec.scala
index a3ba5724c66..f3e5737c0ca 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastExchangeExec.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/execution/GpuBroadcastExchangeExec.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -26,7 +26,6 @@ import scala.util.control.NonFatal
import ai.rapids.cudf.{HostMemoryBuffer, JCudfSerialization, NvtxColor, NvtxRange}
import ai.rapids.cudf.JCudfSerialization.SerializedTableHeader
-import com.google.common.util.concurrent.ThreadFactoryBuilder
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.GpuMetric._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
diff --git a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
index 5bcd9826028..827e1d75634 100644
--- a/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
+++ b/sql-plugin/src/main/scala/org/apache/spark/sql/rapids/stringFunctions.scala
@@ -18,7 +18,7 @@ package org.apache.spark.sql.rapids
import scala.collection.mutable.ArrayBuffer
-import ai.rapids.cudf.{ColumnVector, ColumnView, DType, PadSide, Scalar, Table}
+import ai.rapids.cudf.{BinaryOp, ColumnVector, ColumnView, DType, PadSide, Scalar, Table}
import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.RapidsPluginImplicits._
import com.nvidia.spark.rapids.shims.v2.ShimExpression
@@ -60,6 +60,32 @@ case class GpuLength(child: Expression) extends GpuUnaryExpression with ExpectsI
input.getBase.getCharLengths()
}
+case class GpuBitLength(child: Expression) extends GpuUnaryExpression with ExpectsInputTypes {
+
+ override def dataType: DataType = IntegerType
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+ override def toString: String = s"bit_length($child)"
+
+ override def doColumnar(input: GpuColumnVector): ColumnVector = {
+ withResource(input.getBase.getByteCount) { byteCnt =>
+ // bit count = byte count * 8
+ withResource(GpuScalar.from(3, IntegerType)) { factor =>
+ byteCnt.binaryOp(BinaryOp.SHIFT_LEFT, factor, DType.INT32)
+ }
+ }
+ }
+}
+
+case class GpuOctetLength(child: Expression) extends GpuUnaryExpression with ExpectsInputTypes {
+
+ override def dataType: DataType = IntegerType
+ override def inputTypes: Seq[AbstractDataType] = Seq(StringType)
+ override def toString: String = s"octet_length($child)"
+
+ override def doColumnar(input: GpuColumnVector): ColumnVector =
+ input.getBase.getByteCount
+}
+
case class GpuStringLocate(substr: Expression, col: Expression, start: Expression)
extends GpuTernaryExpression with ImplicitCastInputTypes {
@@ -1296,7 +1322,7 @@ class GpuStringSplitMeta(
} else {
val str = regexp.get.value.asInstanceOf[UTF8String]
if (str != null) {
- if (!canRegexpBeTreatedLikeARegularString(str)) {
+ if (RegexParser.isRegExpString(str.toString)) {
willNotWorkOnGpu("regular expressions are not supported yet")
}
if (str.numChars() == 0) {
@@ -1320,7 +1346,7 @@ class GpuStringSplitMeta(
case class GpuStringSplit(str: Expression, regex: Expression, limit: Expression)
extends GpuTernaryExpression with ImplicitCastInputTypes {
- override def dataType: DataType = ArrayType(StringType)
+ override def dataType: DataType = ArrayType(StringType, containsNull = false)
override def inputTypes: Seq[DataType] = Seq(StringType, StringType, IntegerType)
override def first: Expression = str
override def second: Expression = regex
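
`GpuBitLength` above computes the bit length by shifting the per-row UTF-8 byte count left by 3 (a multiply by 8) with a cuDF `SHIFT_LEFT` binary op, while `GpuOctetLength` returns the byte count directly. A plain-JVM sanity sketch of the same arithmetic on single strings:

    object BitLengthSketch {
      def octetLength(s: String): Int = s.getBytes("UTF-8").length
      def bitLength(s: String): Int = octetLength(s) << 3 // byte count * 8

      def main(args: Array[String]): Unit = {
        println(octetLength("abc")) // 3
        println(bitLength("abc"))   // 24
        println(bitLength("é"))     // 16: two UTF-8 bytes
      }
    }
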
diff --git a/tests/pom.xml b/tests/pom.xml
index 356d2b18156..3329f0dee7c 100644
--- a/tests/pom.xml
+++ b/tests/pom.xml
@@ -54,12 +54,6 @@
             <classifier>${cuda.version}</classifier>
             <scope>provided</scope>
         </dependency>
-        <dependency>
-            <groupId>com.nvidia</groupId>
-            <artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
-            <version>${project.version}</version>
-            <scope>provided</scope>
-        </dependency>
         <dependency>
             <groupId>com.nvidia</groupId>
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
index 5db550ba083..78913447dae 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/AdaptiveQueryExecSuite.scala
@@ -177,34 +177,54 @@ class AdaptiveQueryExecSuite
spark.sql("INSERT INTO TABLE t1 SELECT a, b FROM testData").collect()
spark.sql("INSERT INTO TABLE t2 SELECT a, b FROM testData").collect()
- val df = spark.sql(
- "SELECT t1.a, t2.b " +
+ // This test checks that inputs to the SHJ are coalesced. We need to check both sides
+ // if we are not optimizing the build-side coalescing logic, and only the stream side
+ // if the optimization is enabled (default).
+ // See `RapidsConf.SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE` for more information.
+ Seq(true, false).foreach { shouldOptimizeHashJoinShuffle =>
+ spark.conf.set(
+ RapidsConf.SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE.key,
+ shouldOptimizeHashJoinShuffle.toString)
+ val df = spark.sql(
+ "SELECT t1.a, t2.b " +
"FROM t1 " +
"JOIN t2 " +
"ON t1.a = t2.a " +
"WHERE t2.a = 5" // filter on partition key to force dynamic partition pruning
- )
- df.collect()
+ )
+ df.collect()
- val isAdaptiveQuery = df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]
- if (cmpSparkVersion(3, 2, 0) < 0) {
- // assert that DPP did cause this to run as a non-AQE plan prior to Spark 3.2.0
- assert(!isAdaptiveQuery)
- } else {
- // In 3.2.0 AQE works with DPP
- assert(isAdaptiveQuery)
+ val isAdaptiveQuery = df.queryExecution.executedPlan.isInstanceOf[AdaptiveSparkPlanExec]
+ if (cmpSparkVersion(3, 2, 0) < 0) {
+ // assert that DPP did cause this to run as a non-AQE plan prior to Spark 3.2.0
+ assert(!isAdaptiveQuery)
+ } else {
+ // In 3.2.0 AQE works with DPP
+ assert(isAdaptiveQuery)
+ }
+
+ val shj = TestUtils.findOperator(df.queryExecution.executedPlan,
+ _.isInstanceOf[GpuShuffledHashJoinExec]).get
+ .asInstanceOf[GpuShuffledHashJoinExec]
+ assert(shj.children.length == 2)
+ val childrenToCheck = if (shouldOptimizeHashJoinShuffle) {
+ // assert that the stream side of SHJ is coalesced
+ shj.buildSide match {
+ case GpuBuildLeft => Seq(shj.right)
+ case GpuBuildRight => Seq(shj.left)
+ }
+ } else {
+ // assert that both the build and stream side of SHJ are coalesced
+ // if we are not optimizing the build side shuffle
+ shj.children
+ }
+ assert(childrenToCheck.forall {
+ case GpuShuffleCoalesceExec(_, _) => true
+ case GpuCoalesceBatches(GpuShuffleCoalesceExec(_, _), _) => true
+ case _ => false
+ })
}
- // assert that both inputs to the SHJ are coalesced
- val shj = TestUtils.findOperator(df.queryExecution.executedPlan,
- _.isInstanceOf[GpuShuffledHashJoinExec]).get
- assert(shj.children.length == 2)
- assert(shj.children.forall {
- case GpuShuffleCoalesceExec(_, _) => true
- case GpuCoalesceBatches(GpuShuffleCoalesceExec(_, _), _) => true
- case _ => false
- })
-
}, conf)
}
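The reworked DPP test above asserts coalescing on one or both inputs of the shuffled hash join, depending on `RapidsConf.SHUFFLED_HASH_JOIN_OPTIMIZE_SHUFFLE`. A hedged helper sketch of the child-selection logic the test encodes, assuming the same imports the suite already uses for `GpuBuildLeft`/`GpuBuildRight`:

```scala
import org.apache.spark.sql.execution.SparkPlan

// Pick the children that should be wrapped in a shuffle-coalesce node:
// only the stream side when the optimization is enabled, both sides otherwise.
def childrenExpectedToCoalesce(
    shj: GpuShuffledHashJoinExec,
    optimizeShuffle: Boolean): Seq[SparkPlan] = {
  if (optimizeShuffle) {
    shj.buildSide match {
      case GpuBuildLeft => Seq(shj.right)  // build is left, so the stream side is right
      case GpuBuildRight => Seq(shj.left)
    }
  } else {
    shj.children
  }
}
```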
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesSuite.scala
index bf29a411989..d5332628c5b 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/GpuCoalesceBatchesSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -475,9 +475,9 @@ class GpuCoalesceBatchesSuite extends SparkQueryCompareTestSuite {
val schema = new StructType().add("i", LongType)
.add("j", DecimalType(ai.rapids.cudf.DType.DECIMAL64_MAX_PRECISION, 3))
val dummyMetric = WrappedGpuMetric(new SQLMetric("ignored"))
- val coalesceIter = new GpuCoalesceIterator(
+ val coalesceIter = new GpuCompressionAwareCoalesceIterator(
batchIter,
- schema,
+ GpuColumnVector.extractTypes(schema),
TargetSize(coalesceTargetBytes),
maxCompressedBatchMemoryLimit,
dummyMetric,
@@ -559,9 +559,9 @@ class GpuCoalesceBatchesSuite extends SparkQueryCompareTestSuite {
val schema = new StructType().add("i", LongType)
.add("j", DecimalType(ai.rapids.cudf.DType.DECIMAL64_MAX_PRECISION, 3))
val dummyMetric = WrappedGpuMetric(new SQLMetric("ignored"))
- val coalesceIter = new GpuCoalesceIterator(
+ val coalesceIter = new GpuCompressionAwareCoalesceIterator(
batchIter,
- schema,
+ GpuColumnVector.extractTypes(schema),
TargetSize(coalesceTargetBytes),
maxCompressedBatchMemoryLimit,
dummyMetric,
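The coalesce-iterator tests now construct a `GpuCompressionAwareCoalesceIterator` from an `Array[DataType]` (via `GpuColumnVector.extractTypes`) rather than the `StructType` itself. For the schema used in these tests, that extraction is presumably equivalent to the plain Spark operation below (18 stands in for `DType.DECIMAL64_MAX_PRECISION`):

```scala
import org.apache.spark.sql.types.{DataType, DecimalType, LongType, StructType}

val schema = new StructType()
  .add("i", LongType)
  .add("j", DecimalType(18, 3))  // 18 assumed equal to DECIMAL64_MAX_PRECISION

// per-field data types, in schema order
val dataTypes: Array[DataType] = schema.fields.map(_.dataType)
// dataTypes: Array(LongType, DecimalType(18,3))
```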
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala
new file mode 100644
index 00000000000..ee11387c9ac
--- /dev/null
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/GpuShuffledHashJoinExecSuite.scala
@@ -0,0 +1,312 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids
+
+import java.io.{ByteArrayInputStream, ByteArrayOutputStream, DataInputStream}
+
+import ai.rapids.cudf.{ColumnVector, HostMemoryBuffer, JCudfSerialization, Table}
+import org.mockito.ArgumentMatchers._
+import org.mockito.Mockito._
+import org.scalatest.FunSuite
+import org.scalatest.mockito.MockitoSugar
+
+import org.apache.spark.SparkConf
+import org.apache.spark.sql.catalyst.expressions.AttributeReference
+import org.apache.spark.sql.types.IntegerType
+import org.apache.spark.sql.vectorized.ColumnarBatch
+
+class GpuShuffledHashJoinExecSuite extends FunSuite with Arm with MockitoSugar {
+ val metricMap = mock[Map[String, GpuMetric]]
+ when(metricMap(any())).thenReturn(NoopMetric)
+
+ test("fallback with empty build iterator") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ val mockBuildIter = mock[Iterator[ColumnarBatch]]
+ when(mockBuildIter.hasNext).thenReturn(false)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 0,
+ Seq.empty,
+ mockBuildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ // we get an empty batch with no columns or rows
+ assertResult(builtBatch.numCols())(0)
+ assertResult(builtBatch.numRows())(0)
+ // 2 invocations, once in the `getBuiltBatchAndStreamIter`
+ // method, and a second one in `getSingleBatchWithVerification`
+ verify(mockBuildIter, times(2)).hasNext
+ verify(mockBuildIter, times(0)).next
+ verify(mockStreamIter, times(0)).hasNext
+ }
+ }
+ }
+
+ test("fallback with 0 column build batches") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ withResource(GpuColumnVector.emptyBatchFromTypes(Array.empty)) {
+ emptyBatch =>
+ val buildIter = mock[Iterator[ColumnarBatch]]
+ when(buildIter.hasNext).thenReturn(true, false)
+ val buildBufferedIter = mock[BufferedIterator[ColumnarBatch]]
+ when(buildBufferedIter.hasNext).thenReturn(true, false)
+ when(buildBufferedIter.head).thenReturn(emptyBatch)
+ when(buildBufferedIter.next).thenReturn(emptyBatch)
+ when(buildIter.buffered).thenReturn(buildBufferedIter)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 0,
+ Seq.empty,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ assertResult(builtBatch.numCols())(0)
+ assertResult(builtBatch.numRows())(0)
+ // 1 invocation in the `getBuiltBatchAndStreamIter`
+ // after which a buffered iterator is obtained and used for the fallback case
+ verify(buildIter, times(1)).hasNext
+ verify(buildIter, times(1)).buffered
+ // we ask the buffered iterator for `head` to inspect the number of columns
+ verify(buildBufferedIter, times(1)).head
+ // the buffered iterator is passed to `getSingleBatchWithVerification`,
+ // and that code calls hasNext twice
+ verify(buildBufferedIter, times(2)).hasNext
+ // and calls next to get that batch we buffered
+ verify(buildBufferedIter, times(1)).next
+ verify(mockStreamIter, times(0)).hasNext
+ }
+ }
+ }
+ }
+
+ test("fallback with a non-SerializedTableColumn 1 col and 0 rows") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ val emptyBatch = GpuColumnVector.emptyBatchFromTypes(Seq(IntegerType).toArray)
+ val buildIter = Seq(emptyBatch).iterator
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 0,
+ Seq.empty,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(0)
+ // 2 invocations, once in the `getBuiltBatchAndStreamIter`
+ // method, and one in `getSingleBatchWithVerification`
+ verify(mockStreamIter, times(0)).hasNext
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+
+ test("fallback with a non-SerialiedTableColumn") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol =>
+ val cv = GpuColumnVector.from(cudfCol, IntegerType)
+ val batch = new ColumnarBatch(Seq(cv).toArray, 5)
+ val buildIter = Seq(batch).iterator
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 0,
+ Seq.empty,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(5)
+ // 2 invocations, once in the `getBuiltBatchAndStreamIter`
+ // method, and one in `getSingleBatchWithVerification`
+ verify(mockStreamIter, times(0)).hasNext
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+
+ def getSerializedBatch(tbl: Table): ColumnarBatch = {
+ val outStream = new ByteArrayOutputStream()
+ JCudfSerialization.writeToStream(tbl, outStream, 0, tbl.getRowCount)
+ val dIn = new DataInputStream(new ByteArrayInputStream(outStream.toByteArray))
+ val header = new JCudfSerialization.SerializedTableHeader(dIn)
+ closeOnExcept(HostMemoryBuffer.allocate(header.getDataLen, false)) { hostBuffer =>
+ JCudfSerialization.readTableIntoBuffer(dIn, header, hostBuffer)
+ SerializedTableColumn.from(header, hostBuffer)
+ }
+ }
+
+ def getSerializedBatch(numRows: Int): ColumnarBatch = {
+ val outStream = new ByteArrayOutputStream()
+ JCudfSerialization.writeRowsToStream(outStream, numRows)
+ val dIn = new DataInputStream(new ByteArrayInputStream(outStream.toByteArray))
+ val header = new JCudfSerialization.SerializedTableHeader(dIn)
+ closeOnExcept(HostMemoryBuffer.allocate(header.getDataLen, false)) { hostBuffer =>
+ JCudfSerialization.readTableIntoBuffer(dIn, header, hostBuffer)
+ SerializedTableColumn.from(header, hostBuffer)
+ }
+ }
+
+ test("test a 0-column SerializedTableColumn") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ val serializedBatch = getSerializedBatch(5)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val mockBufferedStreamIterator = mock[BufferedIterator[ColumnarBatch]]
+ when(mockStreamIter.hasNext).thenReturn(true)
+ when(mockStreamIter.buffered).thenReturn(mockBufferedStreamIterator)
+ when(mockBufferedStreamIterator.hasNext).thenReturn(true)
+ closeOnExcept(serializedBatch) { _ =>
+ val buildIter = Seq(serializedBatch).iterator
+ val attrs = AttributeReference("a", IntegerType, false)() :: Nil
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 1024,
+ attrs,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ verify(mockBufferedStreamIterator, times(1)).hasNext
+ assertResult(builtBatch.numCols())(0)
+ assertResult(builtBatch.numRows())(5)
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+
+ test("test a SerializedTableColumn") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol =>
+ val cv = GpuColumnVector.from(cudfCol, IntegerType)
+ val batch = new ColumnarBatch(Seq(cv).toArray, 5)
+ withResource(GpuColumnVector.from(batch)) { tbl =>
+ val serializedBatch = getSerializedBatch(tbl)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val mockBufferedStreamIterator = mock[BufferedIterator[ColumnarBatch]]
+ when(mockStreamIter.hasNext).thenReturn(true)
+ when(mockStreamIter.buffered).thenReturn(mockBufferedStreamIterator)
+ when(mockBufferedStreamIterator.hasNext).thenReturn(true)
+ closeOnExcept(serializedBatch) { _ =>
+ val buildIter = Seq(serializedBatch).iterator
+ val attrs = AttributeReference("a", IntegerType, false)() :: Nil
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 1024,
+ attrs,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ verify(mockBufferedStreamIterator, times(1)).hasNext
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(5)
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+ }
+ }
+
+ test("test two batches, going over the limit") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol =>
+ val cv = GpuColumnVector.from(cudfCol, IntegerType)
+ val batch = new ColumnarBatch(Seq(cv).toArray, 5)
+ withResource(GpuColumnVector.from(batch)) { tbl =>
+ val serializedBatch = getSerializedBatch(tbl)
+ val serializedBatch2 = getSerializedBatch(tbl)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val mockBufferedStreamIterator = mock[BufferedIterator[ColumnarBatch]]
+ when(mockStreamIter.hasNext).thenReturn(true)
+ when(mockStreamIter.buffered).thenReturn(mockBufferedStreamIterator)
+ when(mockBufferedStreamIterator.hasNext).thenReturn(true)
+ closeOnExcept(serializedBatch) { _ =>
+ closeOnExcept(serializedBatch2) { _ =>
+ val buildIter = Seq(serializedBatch, serializedBatch2).iterator
+ val attrs = AttributeReference("a", IntegerType, false)() :: Nil
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 1,
+ attrs,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ verify(mockBufferedStreamIterator, times(0)).hasNext
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(10)
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ test("test two batches, stating within the limit") {
+ TestUtils.withGpuSparkSession(new SparkConf()) { _ =>
+ closeOnExcept(ColumnVector.fromInts(1, 2, 3, 4, 5)) { cudfCol =>
+ val cv = GpuColumnVector.from(cudfCol, IntegerType)
+ val batch = new ColumnarBatch(Seq(cv).toArray, 5)
+ withResource(GpuColumnVector.from(batch)) { tbl =>
+ val serializedBatch = getSerializedBatch(tbl)
+ val serializedBatch2 = getSerializedBatch(tbl)
+ val mockStreamIter = mock[Iterator[ColumnarBatch]]
+ val mockBufferedStreamIterator = mock[BufferedIterator[ColumnarBatch]]
+ when(mockStreamIter.hasNext).thenReturn(true)
+ when(mockStreamIter.buffered).thenReturn(mockBufferedStreamIterator)
+ when(mockBufferedStreamIterator.hasNext).thenReturn(true)
+ closeOnExcept(serializedBatch) { _ =>
+ closeOnExcept(serializedBatch2) { _ =>
+ val buildIter = Seq(serializedBatch, serializedBatch2).iterator
+ val attrs = AttributeReference("a", IntegerType, false)() :: Nil
+ val (builtBatch, bStreamIter) = GpuShuffledHashJoinExec.getBuiltBatchAndStreamIter(
+ 1024,
+ attrs,
+ buildIter,
+ mockStreamIter,
+ mock[SpillCallback],
+ metricMap)
+ withResource(builtBatch) { _ =>
+ verify(mockBufferedStreamIterator, times(1)).hasNext
+ assertResult(builtBatch.numCols())(1)
+ assertResult(builtBatch.numRows())(10)
+ // the buffered iterator drained the build iterator
+ assertResult(buildIter.hasNext)(false)
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
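The new suite above leans on Mockito's consecutive stubbing, `thenReturn(true, false)`, to make a mocked iterator report one batch and then run dry. A small standalone illustration of that idiom:

```scala
import org.mockito.Mockito._

// thenReturn(a, b) stubs consecutive calls: the first call returns a,
// every later call returns b
val iter = mock(classOf[Iterator[Int]])
when(iter.hasNext).thenReturn(true, false)

assert(iter.hasNext)   // first call: true
assert(!iter.hasNext)  // second and later calls: false
```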
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala
index 997409412fb..a94affbf08d 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/OrcScanSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -106,6 +106,9 @@ class OrcScanSuite extends SparkQueryCompareTestSuite {
* is actually 1582-09-23 in proleptic Gregorian calendar.
*/
test("test hybrid Julian Gregorian calendar vs proleptic Gregorian calendar") {
+ // Since Spark 3.1.1, ORC fails to prune when converting the hybrid calendar to the proleptic calendar
+ // Orc bug: https://issues.apache.org/jira/browse/ORC-1083
+ assumePriorToSpark311
withCpuSparkSession(spark => {
val df = frameFromOrcWithSchema("hybrid-Julian-calendar.orc",
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala
index 2d31835cf55..389fe7800af 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/RegularExpressionParserSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -21,6 +21,24 @@ import org.scalatest.FunSuite
class RegularExpressionParserSuite extends FunSuite {
+ test("detect regexp strings") {
+ // Based on https://docs.oracle.com/javase/8/docs/api/java/util/regex/Pattern.html
+ val strings: Seq[String] = Seq("\\", "\u0000", "\\x00", "\\.",
+ "\f", "\\a", "\\e", "\\cx", "[abc]", "^", "[a-z&&[def]]", ".", "*", "\\d", "\\D",
+ "\\h", "\\H", "\\s", "\\S", "\\v", "\\V", "\\w", "\\w", "\\p", "$", "\\b", "\\B",
+ "\\A", "\\G", "\\Z", "\\z", "\\R", "?", "|", "(abc)", "a{1,}", "\\k", "\\Q", "\\E")
+ for (string <- strings) {
+ assert(RegexParser.isRegExpString(string))
+ }
+ }
+
+ test("detect non-regexp strings") {
+ val strings = Seq("A", ",", "\t", ":", "")
+ for (string <- strings) {
+ assert(!RegexParser.isRegExpString(string))
+ }
+ }
+
test("empty pattern") {
assert(parse("") === RegexSequence(ListBuffer()))
}
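`RegexParser.isRegExpString` is exercised above with patterns drawn from the Java `Pattern` documentation. Purely as an illustration of the idea (the plugin's real check is more thorough, e.g. it also flags control characters such as `\u0000` and `\f`), a naive metacharacter scan could look like:

```scala
// Naive illustration only, not the plugin's implementation: treat a string as
// "regexp-like" if it contains any common regular-expression metacharacter.
def looksLikeRegExp(s: String): Boolean =
  s.exists(c => "\\^$.|?*+()[]{}".indexOf(c) >= 0)

assert(looksLikeRegExp("[abc]"))
assert(looksLikeRegExp("a{1,}"))
assert(!looksLikeRegExp("A"))
assert(!looksLikeRegExp(","))
```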
diff --git a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
index 1f84b04ad77..b8357c9db15 100644
--- a/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
+++ b/tests/src/test/scala/com/nvidia/spark/rapids/SparkQueryCompareTestSuite.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -1835,6 +1835,9 @@ trait SparkQueryCompareTestSuite extends FunSuite with Arm {
def assumeSpark320orLater: Assertion =
assume(VersionUtils.isSpark320OrLater, "Spark version not 3.2.0+")
+ def assumePriorToSpark311: Assertion =
+ assume(!VersionUtils.isSpark311OrLater, "Spark version not before 3.1.1")
+
def cmpSparkVersion(major: Int, minor: Int, bugfix: Int): Int = {
val sparkShimVersion = ShimLoader.getSparkShims.getSparkShimVersion
val (sparkMajor, sparkMinor, sparkBugfix) = sparkShimVersion match {
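The new `assumePriorToSpark311` helper wraps ScalaTest's `assume`, which cancels (rather than fails) a test when its condition does not hold. A hedged standalone sketch of the same pattern; `sparkIs311OrLater` is a hypothetical stand-in for `VersionUtils.isSpark311OrLater`:

```scala
import org.scalatest.FunSuite

class HybridCalendarSuite extends FunSuite {
  // hypothetical stand-in for VersionUtils.isSpark311OrLater
  val sparkIs311OrLater: Boolean = false

  test("hybrid Julian Gregorian calendar vs proleptic Gregorian calendar") {
    // cancels the test on Spark 3.1.1+ where ORC fails to prune during
    // calendar conversion (ORC-1083)
    assume(!sparkIs311OrLater, "Spark version not before 3.1.1")
    // ... exercise the ORC read as in OrcScanSuite ...
  }
}
```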
diff --git a/tools/pom.xml b/tools/pom.xml
index d8fffb17ccb..25fe91d7f5f 100644
--- a/tools/pom.xml
+++ b/tools/pom.xml
@@ -40,6 +40,11 @@
+        <dependency>
+            <groupId>com.nvidia</groupId>
+            <artifactId>rapids-4-spark-common_${scala.binary.version}</artifactId>
+            <version>${project.version}</version>
+        </dependency>
         <dependency>
             <groupId>org.scala-lang</groupId>
             <artifactId>scala-library</artifactId>
@@ -100,18 +105,14 @@
                                     <include>org.rogach:scallop_${scala.binary.version}</include>
+                                    <include>com.nvidia:rapids-4-spark-common_${scala.binary.version}</include>
                                 </includes>
                             </artifactSet>
                             <filters>
-                                <filter>
-                                    <artifact>org.rogach:scallop_${scala.binary.version}:*</artifact>
-                                    <excludes>
-                                        <exclude>META-INF/*.MF</exclude>
-                                    </excludes>
-                                </filter>
                                 <filter>
                                     <artifact>*:*</artifact>
                                     <excludes>
+                                        <exclude>META-INF/*.MF</exclude>
+                                        <exclude>META-INF/*.SF</exclude>
+                                        <exclude>META-INF/*.DSA</exclude>
+                                        <exclude>META-INF/*.RSA</exclude>
                                     </excludes>
diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
index c8ef1f7cd9d..137e35a8f7b 100644
--- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
+++ b/tools/src/main/scala/com/nvidia/spark/rapids/tool/profiling/Profiler.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@ import scala.collection.JavaConverters._
import scala.collection.mutable.{ArrayBuffer, HashMap}
import scala.util.control.NonFatal
-import com.google.common.util.concurrent.ThreadFactoryBuilder
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.tool.{EventLogInfo, EventLogPathProcessor}
import org.apache.hadoop.conf.Configuration
diff --git a/tools/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala b/tools/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
index 33579a98b3d..a895d28fe69 100644
--- a/tools/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
+++ b/tools/src/main/scala/com/nvidia/spark/rapids/tool/qualification/Qualification.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -20,7 +20,7 @@ import java.util.concurrent.{ConcurrentLinkedQueue, Executors, ThreadPoolExecuto
import scala.collection.JavaConverters._
-import com.google.common.util.concurrent.ThreadFactoryBuilder
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.tool.EventLogInfo
import org.apache.hadoop.conf.Configuration
diff --git a/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppFilterImpl.scala b/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppFilterImpl.scala
index 2b95e7639cd..5ebae2a075b 100644
--- a/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppFilterImpl.scala
+++ b/tools/src/main/scala/org/apache/spark/sql/rapids/tool/AppFilterImpl.scala
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -22,7 +22,7 @@ import java.util.regex.PatternSyntaxException
import scala.collection.JavaConverters._
-import com.google.common.util.concurrent.ThreadFactoryBuilder
+import com.nvidia.spark.rapids.ThreadFactoryBuilder
import com.nvidia.spark.rapids.tool.EventLogInfo
import com.nvidia.spark.rapids.tool.qualification.QualificationArgs
import org.apache.hadoop.conf.Configuration
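The tools module now uses an in-repo `ThreadFactoryBuilder` instead of Guava's, removing the need to pull the Guava class into the shaded jar. Purely as a sketch of the idea (the actual `com.nvidia.spark.rapids.ThreadFactoryBuilder` may differ), a minimal builder that names its threads and marks them daemon could look like this:

```scala
import java.util.concurrent.{Executors, ThreadFactory}
import java.util.concurrent.atomic.AtomicLong

// Hypothetical minimal builder, for illustration only
class SimpleThreadFactoryBuilder {
  private var nameFormat = "worker-%d"
  private var daemon = true

  def setNameFormat(format: String): this.type = { nameFormat = format; this }
  def setDaemon(d: Boolean): this.type = { daemon = d; this }

  def build(): ThreadFactory = new ThreadFactory {
    private val counter = new AtomicLong(0)
    override def newThread(r: Runnable): Thread = {
      // name each thread from the format string plus an incrementing counter
      val t = new Thread(r, nameFormat.format(counter.getAndIncrement()))
      t.setDaemon(daemon)
      t
    }
  }
}

// usage, mirroring how the profiling/qualification thread pools are built
val factory = new SimpleThreadFactoryBuilder()
  .setNameFormat("profileTool-%d")
  .setDaemon(true)
  .build()
val pool = Executors.newFixedThreadPool(4, factory)
```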
diff --git a/udf-examples/pom.xml b/udf-examples/pom.xml
index e5bc938f9f7..f44cce94e81 100644
--- a/udf-examples/pom.xml
+++ b/udf-examples/pom.xml
@@ -142,6 +142,12 @@
             <version>${spark.version}</version>
             <scope>provided</scope>
         </dependency>
+        <dependency>
+            <groupId>org.apache.hive</groupId>
+            <artifactId>hive-storage-api</artifactId>
+            <version>${spark.version}</version>
+            <scope>provided</scope>
+        </dependency>