Merge remote-tracking branch 'origin/branch-21.12' into spark321-issue3470
gerashegalov committed Oct 29, 2021
2 parents 29576f3 + 0d88779 commit f2e3088
Showing 34 changed files with 1,737 additions and 860 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,5 @@
# Change log
Generated on 2021-10-26
Generated on 2021-10-28

## Release 21.10

@@ -55,6 +55,7 @@ Generated on 2021-10-26
### Bugs Fixed
|||
|:---|:---|
|[#3929](https://github.com/NVIDIA/spark-rapids/issues/3929)|[BUG] published rapids-4-spark dist artifact references aggregator|
|[#3837](https://github.com/NVIDIA/spark-rapids/issues/3837)|[BUG] Spark-rapids v21.10.0 release candidate jars failed on the OSS validation check.|
|[#3769](https://github.com/NVIDIA/spark-rapids/issues/3769)|[BUG] dedupe fails with find: './parallel-world/spark301/ ...' No such file or directory|
|[#3783](https://github.com/NVIDIA/spark-rapids/issues/3783)|[BUG] spark-rapids v21.10.0 release build failed on script "dist/scripts/binary-dedupe.sh"|
@@ -145,6 +146,8 @@ Generated on 2021-10-26
### PRs
|||
|:---|:---|
|[#3930](https://github.com/NVIDIA/spark-rapids/pull/3930)|Dist artifact with provided aggregator dependency|
|[#3918](https://github.com/NVIDIA/spark-rapids/pull/3918)|Update changelog [skip ci]|
|[#3906](https://github.com/NVIDIA/spark-rapids/pull/3906)|Doc updated for v2110[skip ci]|
|[#3840](https://github.com/NVIDIA/spark-rapids/pull/3840)|Update changelog [skip ci]|
|[#3838](https://github.com/NVIDIA/spark-rapids/pull/3838)|Update deploy script [skip ci]|
10 changes: 9 additions & 1 deletion api_validation/pom.xml
@@ -127,6 +127,14 @@
<classifier>${cuda.version}</classifier>
<scope>provided</scope>
</dependency>
<!-- use aggregator jar because accessing internal classes -->
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
@@ -149,7 +157,7 @@
<configuration>
<mainClass>com.nvidia.spark.rapids.api.ApiValidation</mainClass>
</configuration>
</plugin>
</plugin>
</plugins>
</build>
</project>
21 changes: 21 additions & 0 deletions dist/pom.xml
@@ -41,6 +41,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
@@ -187,6 +188,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -333,6 +335,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -441,6 +444,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -594,6 +598,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -771,6 +776,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -842,6 +848,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -1036,6 +1043,20 @@
</excludes>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<version>3.0.0-M1</version>
<executions>
<execution>
<id>default-install</id>
<phase>install</phase>
<configuration>
<pomFile>${project.build.directory}/dependency-reduced-pom.xml</pomFile>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

6 changes: 5 additions & 1 deletion dist/scripts/binary-dedupe.sh
@@ -220,5 +220,9 @@ time (
echo "$((++STEP))/ deleting all class files listed in $DELETE_DUPLICATES_TXT"
time (< "$DELETE_DUPLICATES_TXT" sort -u | xargs rm) 2>&1

echo "Generating dependency-reduced-pom.xml"
# which amounts to deleting the dependencies list altogether
sed -e '/<dependencies>/,/<\/dependencies>/d' ../pom.xml > dependency-reduced-pom.xml

end_time=$(date +%s)
echo "binary-dedupe completed in $((end_time - start_time)) seconds"
echo "binary-dedupe completed in $((end_time - start_time)) seconds"
52 changes: 51 additions & 1 deletion docs/compatibility.md
@@ -624,4 +624,54 @@ The GPU implementation of RLike does not support empty groups correctly.
| Pattern | Input | Spark on CPU | Spark on GPU |
|-----------|--------|--------------|--------------|
| `z()?` | `a` | No Match | Match |
| `z()*` | `a` | No Match | Match |
| `z()*` | `a` | No Match | Match |

## Conditionals and operations with side effects (ANSI mode)

In Apache Spark, conditional operations like `if`, `coalesce`, and `case/when` lazily evaluate
their parameters on a row-by-row basis. On the GPU it is generally more efficient to
evaluate the parameters regardless of the condition and then select which result to return
based on the condition. This is fine as long as evaluating a parameter causes no side effects.
For most expressions in Spark this is true, but in ANSI mode many expressions can
throw exceptions, such as the `Add` expression when an overflow happens. This is also true of
UDFs, because by their nature they are user-defined and can have side effects such as throwing
exceptions.
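
To make the eager-versus-lazy difference concrete, here is a minimal plain-Scala sketch;
it is not Spark code, and `Math.addExact` merely stands in for an ANSI-mode `Add` that
throws on overflow.

```scala
// Plain Scala illustration, not Spark: contrast lazy, row-by-row evaluation
// with eager evaluation of every parameter followed by selection.
val vals = Seq(0L, Long.MaxValue)

// Lazy, row-by-row (CPU-like): the add only runs when the condition is false.
val lazyResult = vals.map(v => if (v > 1000L) None else Some(Math.addExact(v, 1L)))

// Eager, columnar (GPU-like): the add runs for every row before the result is
// selected, so Long.MaxValue overflows and an ArithmeticException is thrown.
val eagerAdds = vals.map(v => Math.addExact(v, 1L))
```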

Currently, the RAPIDS Accelerator
[assumes that there are no side effects](https://github.com/NVIDIA/spark-rapids/issues/3849).
This can result in situations, specifically in ANSI mode, where the RAPIDS Accelerator will
always throw an exception but Spark on the CPU will not. For example:

```scala
spark.conf.set("spark.sql.ansi.enabled", "true")

Seq(0L, Long.MaxValue).toDF("val")
.repartition(1) // The repartition makes Spark not optimize selectExpr away
.selectExpr("IF(val > 1000, null, val + 1) as ret")
.show()
```

If the above example is run on the CPU, you will get a result like this:
```
+----+
| ret|
+----+
| 1|
|null|
+----+
```

But if it is run on the GPU, an overflow exception is thrown. As explained above, this
is because the RAPIDS Accelerator evaluates both `val + 1` and `null` regardless of
the result of the condition. In some cases you can work around this. The above example
could be rewritten so the `if` happens before the `Add` operation:

```scala
Seq(0L, Long.MaxValue).toDF("val")
.repartition(1) // The repartition makes Spark not optimize selectExpr away
.selectExpr("IF(val > 1000, null, val) + 1 as ret")
.show()
```

But this is not something that can be done generically; it requires inner knowledge of
what can trigger a side effect, as the sketch below illustrates.
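
As an illustration, here is a hypothetical sketch built on the standard Spark `udf`, `when`,
and `col` APIs; the parsing UDF and its inputs are invented for this example. Only the UDF's
author knows it can throw, so there is no mechanical rewrite that hoists the side effect out
of the conditional the way the `Add` was moved above.

```scala
import org.apache.spark.sql.functions.{col, udf, when}

// Hypothetical UDF: it throws NumberFormatException for non-numeric input,
// a side effect only its author knows about.
val parseLong = udf((s: String) => s.toLong)

Seq("1", "oops").toDF("s")
  .repartition(1) // keep the projection from being optimized away
  .withColumn("ret", when(col("s").rlike("^[0-9]+$"), parseLong(col("s"))))
  .show()
// Lazy, row-by-row evaluation only calls parseLong on "1". Evaluating the UDF
// eagerly for every row would throw on "oops" before the condition is applied,
// and no generic rewrite of the query can prevent that.
```
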
3 changes: 2 additions & 1 deletion docs/configs.md
@@ -113,6 +113,7 @@ Name | Description | Default Value
<a name="sql.reader.batchSizeBytes"></a>spark.rapids.sql.reader.batchSizeBytes|Soft limit on the maximum number of bytes the reader reads per batch. The readers will read chunks of data until this limit is met or exceeded. Note that the reader may estimate the number of bytes that will be used on the GPU in some cases based on the schema and number of rows in each batch.|2147483647
<a name="sql.reader.batchSizeRows"></a>spark.rapids.sql.reader.batchSizeRows|Soft limit on the maximum number of rows the reader will read per batch. The orc and parquet readers will read row groups until this limit is met or exceeded. The limit is respected by the csv reader.|2147483647
<a name="sql.replaceSortMergeJoin.enabled"></a>spark.rapids.sql.replaceSortMergeJoin.enabled|Allow replacing sortMergeJoin with HashJoin|true
<a name="sql.rowBasedUDF.enabled"></a>spark.rapids.sql.rowBasedUDF.enabled|When set to true, optimizes a row-based UDF in a GPU operation by transferring only the data it needs between GPU and CPU inside a query operation, instead of falling this operation back to CPU. This is an experimental feature, and this config might be removed in the future.|false
<a name="sql.shuffle.spillThreads"></a>spark.rapids.sql.shuffle.spillThreads|Number of threads used to spill shuffle data to disk in the background.|6
<a name="sql.stableSort.enabled"></a>spark.rapids.sql.stableSort.enabled|Enable or disable stable sorting. Apache Spark's sorting is typically a stable sort, but sort stability cannot be guaranteed in distributed work loads because the order in which upstream data arrives to a task is not guaranteed. Sort stability then only matters when reading and sorting data from a file using a single task/partition. Because of limitations in the plugin when you enable stable sorting all of the data for a single task will be combined into a single batch before sorting. This currently disables spilling from GPU memory if the data size is too large.|false
<a name="sql.udfCompiler.enabled"></a>spark.rapids.sql.udfCompiler.enabled|When set to true, Scala UDFs will be considered for compilation as Catalyst expressions|false
@@ -264,7 +265,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.Rint"></a>spark.rapids.sql.expression.Rint|`rint`|Rounds up a double value to the nearest double equal to an integer|true|None|
<a name="sql.expression.Round"></a>spark.rapids.sql.expression.Round|`round`|Round an expression to d decimal places using HALF_UP rounding mode|true|None|
<a name="sql.expression.RowNumber"></a>spark.rapids.sql.expression.RowNumber|`row_number`|Window function that returns the index for the row within the aggregation window|true|None|
<a name="sql.expression.ScalaUDF"></a>spark.rapids.sql.expression.ScalaUDF| |User Defined Function, support requires the UDF to implement a RAPIDS accelerated interface|true|None|
<a name="sql.expression.ScalaUDF"></a>spark.rapids.sql.expression.ScalaUDF| |User Defined Function, the UDF can choose to implement a RAPIDS accelerated interface to get better performance.|true|None|
<a name="sql.expression.Second"></a>spark.rapids.sql.expression.Second|`second`|Returns the second component of the string/timestamp|true|None|
<a name="sql.expression.ShiftLeft"></a>spark.rapids.sql.expression.ShiftLeft|`shiftleft`|Bitwise shift left (<<)|true|None|
<a name="sql.expression.ShiftRight"></a>spark.rapids.sql.expression.ShiftRight|`shiftright`|Bitwise shift right (>>)|true|None|
2 changes: 1 addition & 1 deletion docs/supported_ops.md
@@ -10192,7 +10192,7 @@
<tr>
<td rowSpan="2">ScalaUDF</td>
<td rowSpan="2"> </td>
<td rowSpan="2">User Defined Function, support requires the UDF to implement a RAPIDS accelerated interface</td>
<td rowSpan="2">User Defined Function, the UDF can choose to implement a RAPIDS accelerated interface to get better performance.</td>
<td rowSpan="2">None</td>
<td rowSpan="2">project</td>
<td>param</td>
8 changes: 8 additions & 0 deletions integration_tests/pom.xml
@@ -62,6 +62,14 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<!-- use aggregator jar because accessing internal classes -->
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
<scope>test</scope>
</dependency>
<dependency>
2 changes: 1 addition & 1 deletion jenkins/deploy.sh
@@ -82,7 +82,7 @@ $DEPLOY_CMD -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
# Distribution jar is a shaded artifact so use the reduced dependency pom.
$DEPLOY_CMD -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
$SRC_DOC_JARS \
-Dfile=$FPATH.jar -DgroupId=com.nvidia -DartifactId=$ART_ID -Dversion=$ART_VER -DpomFile=./dist/pom.xml
-Dfile=$FPATH.jar -DgroupId=com.nvidia -DartifactId=$ART_ID -Dversion=$ART_VER -DpomFile=./dist/target/dependency-reduced-pom.xml

###### Deploy integration tests jar(s) ######
TESTS_ART_ID=`mvn help:evaluate -q -pl $TESTS_PL -Dexpression=project.artifactId -DforceStdout`
55 changes: 41 additions & 14 deletions jenkins/spark-tests.sh
@@ -34,7 +34,10 @@ $MVN_GET_CMD -DremoteRepositories=$PROJECT_REPO \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark_$SCALA_BINARY_VER -Dversion=$PROJECT_VER
$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark-udf-examples_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER

# TODO remove -Dtransitive=false workaround once pom is fixed
$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \
-Dtransitive=false \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER -Dclassifier=$SHUFFLE_SPARK_SHIM
if [ "$CUDA_CLASSIFIER"x == x ];then
CUDF_JAR="$ARTF_ROOT/cudf-$CUDF_VER.jar"
@@ -45,29 +48,53 @@ export RAPIDS_PLUGIN_JAR="$ARTF_ROOT/rapids-4-spark_${SCALA_BINARY_VER}-$PROJECT
RAPIDS_UDF_JAR="$ARTF_ROOT/rapids-4-spark-udf-examples_${SCALA_BINARY_VER}-$PROJECT_TEST_VER.jar"
RAPIDS_TEST_JAR="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_TEST_VER-$SHUFFLE_SPARK_SHIM.jar"

# TODO remove -Dtransitive=false workaround once pom is fixed
$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \
-Dtransitive=false \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER -Dclassifier=pytest -Dpackaging=tar.gz

RAPIDS_INT_TESTS_HOME="$ARTF_ROOT/integration_tests/"
# The version of pytest.tar.gz that is uploaded is the one built against spark301 but it's being pushed without a classifier for now
RAPIDS_INT_TESTS_TGZ="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_TEST_VER-pytest.tar.gz"

tmp_info=${TMP_INFO_FILE:-'/tmp/artifacts-build.info'}
rm -rf "$tmp_info"
TEE_CMD="tee -a $tmp_info"
GREP_CMD="grep revision"
AWK_CMD=(awk -F'=' '{print $2}')
getRevision() {
local file=$1
local properties=$2
local revision
if [[ $file == *.jar || $file == *.zip ]]; then
revision=$(unzip -p "$file" "$properties" | $TEE_CMD | $GREP_CMD | "${AWK_CMD[@]}" || true)
elif [[ $file == *.tgz || $file == *.tar.gz ]]; then
revision=$(tar -xzf "$file" --to-command=cat "$properties" | $TEE_CMD | $GREP_CMD | "${AWK_CMD[@]}" || true)
fi
echo "$revision"
}

set +x
rm -rf /tmp/artifacts-build.info
TEE_CMD="tee -a /tmp/artifacts-build.info"
echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" | $TEE_CMD
echo "-------------------- cudf JNI BUILD INFO --------------------" | $TEE_CMD
unzip -p $CUDF_JAR cudf-java-version-info.properties | $TEE_CMD || true
echo "-------------------- rapids-4-spark BUILD INFO --------------------" | $TEE_CMD
unzip -p $RAPIDS_PLUGIN_JAR rapids4spark-version-info.properties | $TEE_CMD || true
echo "-------------------- rapids-4-spark-udf-examples BUILD INFO --------------------" | $TEE_CMD
unzip -p $RAPIDS_UDF_JAR rapids4spark-version-info.properties | $TEE_CMD || true
echo "-------------------- rapids-4-spark-integration-tests BUILD INFO --------------------" | $TEE_CMD
unzip -p $RAPIDS_TEST_JAR rapids4spark-version-info.properties | $TEE_CMD || true
echo "-------------------- rapids-4-spark-integration-tests pytest BUILD INFO --------------------" | $TEE_CMD
tar -xzf $RAPIDS_INT_TESTS_TGZ --to-command=cat integration_tests/rapids4spark-version-info.properties | $TEE_CMD || true
echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" | $TEE_CMD
echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" >> "$tmp_info"
echo "-------------------- cudf JNI BUILD INFO --------------------" >> "$tmp_info"
c_ver=$(getRevision $JARS_PATH/$CUDF_JAR cudf-java-version-info.properties)
echo "-------------------- rapids-4-spark BUILD INFO --------------------" >> "$tmp_info"
p_ver=$(getRevision $JARS_PATH/$RAPIDS_PLUGIN_JAR rapids4spark-version-info.properties)
echo "-------------------- rapids-4-spark-integration-tests BUILD INFO --------------------" >> "$tmp_info"
it_ver=$(getRevision $JARS_PATH/$RAPIDS_TEST_JAR rapids4spark-version-info.properties)
echo "-------------------- rapids-4-spark-integration-tests pytest BUILD INFO --------------------" >> "$tmp_info"
pt_ver=$(getRevision $JARS_PATH/$RAPIDS_INT_TESTS_TGZ integration_tests/rapids4spark-version-info.properties)
echo "-------------------- rapids-4-spark-udf-examples BUILD INFO --------------------" >> "$tmp_info"
u_ver=$(getRevision $JARS_PATH/$RAPIDS_UDF_JAR rapids4spark-version-info.properties)
echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" >> "$tmp_info"
set -x
cat "$tmp_info" || true

if [[ -z "$c_ver" || -z "$p_ver"|| \
"$p_ver" != "$it_ver" || "$p_ver" != "$pt_ver" || "$p_ver" != "$u_ver" ]]; then
echo "Artifacts versions are inconsistent!"
exit 1
fi

tar xzf "$RAPIDS_INT_TESTS_TGZ" -C $ARTF_ROOT && rm -f "$RAPIDS_INT_TESTS_TGZ"

@@ -343,7 +343,8 @@ abstract class SparkBaseShims extends Spark30XShims {
}
override def convertToGpu(lhs: Expression, regexp: Expression,
rep: Expression): GpuExpression = GpuStringReplace(lhs, regexp, rep)
})
}),
GpuScalaUDFMeta.exprMeta
).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap
}
