Merge remote-tracking branch 'origin/branch-21.12' into spark321-issue3470
gerashegalov committed Oct 29, 2021
2 parents 29576f3 + 0d88779 commit f2e3088
Showing 34 changed files with 1,737 additions and 860 deletions.
5 changes: 4 additions & 1 deletion CHANGELOG.md
@@ -1,5 +1,5 @@
# Change log
Generated on 2021-10-26
Generated on 2021-10-28

## Release 21.10

@@ -55,6 +55,7 @@ Generated on 2021-10-26
### Bugs Fixed
|||
|:---|:---|
|[#3929](https://github.com/NVIDIA/spark-rapids/issues/3929)|[BUG] published rapids-4-spark dist artifact references aggregator|
|[#3837](https://github.com/NVIDIA/spark-rapids/issues/3837)|[BUG] Spark-rapids v21.10.0 release candidate jars failed on the OSS validation check.|
|[#3769](https://github.com/NVIDIA/spark-rapids/issues/3769)|[BUG] dedupe fails with find: './parallel-world/spark301/ ...' No such file or directory|
|[#3783](https://github.com/NVIDIA/spark-rapids/issues/3783)|[BUG] spark-rapids v21.10.0 release build failed on script "dist/scripts/binary-dedupe.sh"|
@@ -145,6 +146,8 @@ Generated on 2021-10-26
### PRs
|||
|:---|:---|
|[#3930](https://github.com/NVIDIA/spark-rapids/pull/3930)|Dist artifact with provided aggregator dependency|
|[#3918](https://github.com/NVIDIA/spark-rapids/pull/3918)|Update changelog [skip ci]|
|[#3906](https://github.com/NVIDIA/spark-rapids/pull/3906)|Doc updated for v2110[skip ci]|
|[#3840](https://github.com/NVIDIA/spark-rapids/pull/3840)|Update changelog [skip ci]|
|[#3838](https://github.com/NVIDIA/spark-rapids/pull/3838)|Update deploy script [skip ci]|
10 changes: 9 additions & 1 deletion api_validation/pom.xml
@@ -127,6 +127,14 @@
<classifier>${cuda.version}</classifier>
<scope>provided</scope>
</dependency>
<!-- use aggregator jar because accessing internal classes -->
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
@@ -149,7 +157,7 @@
<configuration>
<mainClass>com.nvidia.spark.rapids.api.ApiValidation</mainClass>
</configuration>
</plugin>
</plugin>
</plugins>
</build>
</project>
21 changes: 21 additions & 0 deletions dist/pom.xml
@@ -41,6 +41,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
</dependencies>
<build>
@@ -187,6 +188,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -333,6 +335,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -441,6 +444,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -594,6 +598,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -771,6 +776,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<groupId>org.apache.spark</groupId>
@@ -842,6 +848,7 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<dependency>
<!-- required for conf generation script -->
@@ -1036,6 +1043,20 @@
</excludes>
</configuration>
</plugin>
<plugin>
<groupId>org.apache.maven.plugins</groupId>
<artifactId>maven-install-plugin</artifactId>
<version>3.0.0-M1</version>
<executions>
<execution>
<id>default-install</id>
<phase>install</phase>
<configuration>
<pomFile>${project.build.directory}/dependency-reduced-pom.xml</pomFile>
</configuration>
</execution>
</executions>
</plugin>
</plugins>
</build>

6 changes: 5 additions & 1 deletion dist/scripts/binary-dedupe.sh
@@ -220,5 +220,9 @@ time (
echo "$((++STEP))/ deleting all class files listed in $DELETE_DUPLICATES_TXT"
time (< "$DELETE_DUPLICATES_TXT" sort -u | xargs rm) 2>&1

echo "Generating dependency-reduced-pom.xml"
# which amounts to deleting the dependencies list altogether
sed -e '/<dependencies>/,/<\/dependencies>/d' ../pom.xml > dependency-reduced-pom.xml

end_time=$(date +%s)
echo "binary-dedupe completed in $((end_time - start_time)) seconds"
echo "binary-dedupe completed in $((end_time - start_time)) seconds"
52 changes: 51 additions & 1 deletion docs/compatibility.md
@@ -624,4 +624,54 @@ The GPU implementation of RLike does not support empty groups correctly.
| Pattern | Input | Spark on CPU | Spark on GPU |
|-----------|--------|--------------|--------------|
| `z()?` | `a` | No Match | Match |
| `z()*` | `a` | No Match | Match |
| `z()*` | `a` | No Match | Match |

## Conditionals and operations with side effects (ANSI mode)

In Apache Spark, conditional operations like `if`, `coalesce`, and `case/when` lazily evaluate
their parameters on a row-by-row basis. On the GPU it is generally more efficient to
evaluate the parameters regardless of the condition and then select which result to return
based on the condition. This is fine as long as evaluating a parameter causes no side effects.
For most expressions in Spark this is true, but in ANSI mode many expressions can
throw exceptions, such as the `Add` expression when an overflow happens. This is also true of
UDFs, because by their nature they are user-defined and can have side effects such as throwing
exceptions.
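
To make the eager-versus-lazy difference concrete, here is a minimal plain-Scala sketch;
it is not Spark code, and `Math.addExact` merely stands in for an ANSI-mode `Add` that
throws on overflow.

```scala
// Plain Scala illustration, not Spark: contrast lazy, row-by-row evaluation
// with eager evaluation of every parameter followed by selection.
val vals = Seq(0L, Long.MaxValue)

// Lazy, row-by-row (CPU-like): the add only runs when the condition is false.
val lazyResult = vals.map(v => if (v > 1000L) None else Some(Math.addExact(v, 1L)))

// Eager, columnar (GPU-like): the add runs for every row before the result is
// selected, so Long.MaxValue overflows and an ArithmeticException is thrown.
val eagerAdds = vals.map(v => Math.addExact(v, 1L))
```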

Currently, the RAPIDS Accelerator
[assumes that there are no side effects](https://github.com/NVIDIA/spark-rapids/issues/3849).
This can result in situations, specifically in ANSI mode, where the RAPIDS Accelerator will
always throw an exception but Spark on the CPU will not. For example:

```scala
spark.conf.set("spark.sql.ansi.enabled", "true")

Seq(0L, Long.MaxValue).toDF("val")
.repartition(1) // The repartition makes Spark not optimize selectExpr away
.selectExpr("IF(val > 1000, null, val + 1) as ret")
.show()
```

If the above example is run on the CPU, you will get a result like this:
```
+----+
| ret|
+----+
| 1|
|null|
+----+
```

But if it is run on the GPU, an overflow exception is thrown. As explained above, this
is because the RAPIDS Accelerator evaluates both `val + 1` and `null` regardless of
the result of the condition. In some cases you can work around this. The above example
could be rewritten so the `if` happens before the `Add` operation:

```scala
Seq(0L, Long.MaxValue).toDF("val")
.repartition(1) // The repartition makes Spark not optimize selectExpr away
.selectExpr("IF(val > 1000, null, val) + 1 as ret")
.show()
```

But this is not something that can be done generically; it requires inner knowledge of
what can trigger a side effect, as the sketch below illustrates.
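
As an illustration, here is a hypothetical sketch built on the standard Spark `udf`, `when`,
and `col` APIs; the parsing UDF and its inputs are invented for this example. Only the UDF's
author knows it can throw, so there is no mechanical rewrite that hoists the side effect out
of the conditional the way the `Add` was moved above.

```scala
import org.apache.spark.sql.functions.{col, udf, when}

// Hypothetical UDF: it throws NumberFormatException for non-numeric input,
// a side effect only its author knows about.
val parseLong = udf((s: String) => s.toLong)

Seq("1", "oops").toDF("s")
  .repartition(1) // keep the projection from being optimized away
  .withColumn("ret", when(col("s").rlike("^[0-9]+$"), parseLong(col("s"))))
  .show()
// Lazy, row-by-row evaluation only calls parseLong on "1". Evaluating the UDF
// eagerly for every row would throw on "oops" before the condition is applied,
// and no generic rewrite of the query can prevent that.
```
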
3 changes: 2 additions & 1 deletion docs/configs.md
@@ -113,6 +113,7 @@ Name | Description | Default Value
<a name="sql.reader.batchSizeBytes"></a>spark.rapids.sql.reader.batchSizeBytes|Soft limit on the maximum number of bytes the reader reads per batch. The readers will read chunks of data until this limit is met or exceeded. Note that the reader may estimate the number of bytes that will be used on the GPU in some cases based on the schema and number of rows in each batch.|2147483647
<a name="sql.reader.batchSizeRows"></a>spark.rapids.sql.reader.batchSizeRows|Soft limit on the maximum number of rows the reader will read per batch. The orc and parquet readers will read row groups until this limit is met or exceeded. The limit is respected by the csv reader.|2147483647
<a name="sql.replaceSortMergeJoin.enabled"></a>spark.rapids.sql.replaceSortMergeJoin.enabled|Allow replacing sortMergeJoin with HashJoin|true
<a name="sql.rowBasedUDF.enabled"></a>spark.rapids.sql.rowBasedUDF.enabled|When set to true, optimizes a row-based UDF in a GPU operation by transferring only the data it needs between GPU and CPU inside a query operation, instead of falling this operation back to CPU. This is an experimental feature, and this config might be removed in the future.|false
<a name="sql.shuffle.spillThreads"></a>spark.rapids.sql.shuffle.spillThreads|Number of threads used to spill shuffle data to disk in the background.|6
<a name="sql.stableSort.enabled"></a>spark.rapids.sql.stableSort.enabled|Enable or disable stable sorting. Apache Spark's sorting is typically a stable sort, but sort stability cannot be guaranteed in distributed work loads because the order in which upstream data arrives to a task is not guaranteed. Sort stability then only matters when reading and sorting data from a file using a single task/partition. Because of limitations in the plugin when you enable stable sorting all of the data for a single task will be combined into a single batch before sorting. This currently disables spilling from GPU memory if the data size is too large.|false
<a name="sql.udfCompiler.enabled"></a>spark.rapids.sql.udfCompiler.enabled|When set to true, Scala UDFs will be considered for compilation as Catalyst expressions|false
@@ -264,7 +265,7 @@ Name | SQL Function(s) | Description | Default Value | Notes
<a name="sql.expression.Rint"></a>spark.rapids.sql.expression.Rint|`rint`|Rounds up a double value to the nearest double equal to an integer|true|None|
<a name="sql.expression.Round"></a>spark.rapids.sql.expression.Round|`round`|Round an expression to d decimal places using HALF_UP rounding mode|true|None|
<a name="sql.expression.RowNumber"></a>spark.rapids.sql.expression.RowNumber|`row_number`|Window function that returns the index for the row within the aggregation window|true|None|
<a name="sql.expression.ScalaUDF"></a>spark.rapids.sql.expression.ScalaUDF| |User Defined Function, support requires the UDF to implement a RAPIDS accelerated interface|true|None|
<a name="sql.expression.ScalaUDF"></a>spark.rapids.sql.expression.ScalaUDF| |User Defined Function, the UDF can choose to implement a RAPIDS accelerated interface to get better performance.|true|None|
<a name="sql.expression.Second"></a>spark.rapids.sql.expression.Second|`second`|Returns the second component of the string/timestamp|true|None|
<a name="sql.expression.ShiftLeft"></a>spark.rapids.sql.expression.ShiftLeft|`shiftleft`|Bitwise shift left (<<)|true|None|
<a name="sql.expression.ShiftRight"></a>spark.rapids.sql.expression.ShiftRight|`shiftright`|Bitwise shift right (>>)|true|None|
2 changes: 1 addition & 1 deletion docs/supported_ops.md
@@ -10192,7 +10192,7 @@
<tr>
<td rowSpan="2">ScalaUDF</td>
<td rowSpan="2"> </td>
<td rowSpan="2">User Defined Function, support requires the UDF to implement a RAPIDS accelerated interface</td>
<td rowSpan="2">User Defined Function, the UDF can choose to implement a RAPIDS accelerated interface to get better performance.</td>
<td rowSpan="2">None</td>
<td rowSpan="2">project</td>
<td>param</td>
8 changes: 8 additions & 0 deletions integration_tests/pom.xml
@@ -62,6 +62,14 @@
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<scope>provided</scope>
</dependency>
<!-- use aggregator jar because accessing internal classes -->
<dependency>
<groupId>com.nvidia</groupId>
<artifactId>rapids-4-spark-aggregator_${scala.binary.version}</artifactId>
<version>${project.version}</version>
<classifier>${spark.version.classifier}</classifier>
<scope>test</scope>
</dependency>
<dependency>
2 changes: 1 addition & 1 deletion jenkins/deploy.sh
@@ -82,7 +82,7 @@ $DEPLOY_CMD -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
# Distribution jar is a shaded artifact so use the reduced dependency pom.
$DEPLOY_CMD -Durl=$SERVER_URL -DrepositoryId=$SERVER_ID \
$SRC_DOC_JARS \
-Dfile=$FPATH.jar -DgroupId=com.nvidia -DartifactId=$ART_ID -Dversion=$ART_VER -DpomFile=./dist/pom.xml
-Dfile=$FPATH.jar -DgroupId=com.nvidia -DartifactId=$ART_ID -Dversion=$ART_VER -DpomFile=./dist/target/dependency-reduced-pom.xml

###### Deploy integration tests jar(s) ######
TESTS_ART_ID=`mvn help:evaluate -q -pl $TESTS_PL -Dexpression=project.artifactId -DforceStdout`
55 changes: 41 additions & 14 deletions jenkins/spark-tests.sh
@@ -34,7 +34,10 @@ $MVN_GET_CMD -DremoteRepositories=$PROJECT_REPO \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark_$SCALA_BINARY_VER -Dversion=$PROJECT_VER
$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark-udf-examples_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER

# TODO remove -Dtransitive=false workaround once pom is fixed
$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \
-Dtransitive=false \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER -Dclassifier=$SHUFFLE_SPARK_SHIM
if [ "$CUDA_CLASSIFIER"x == x ];then
CUDF_JAR="$ARTF_ROOT/cudf-$CUDF_VER.jar"
@@ -45,29 +48,53 @@ export RAPIDS_PLUGIN_JAR="$ARTF_ROOT/rapids-4-spark_${SCALA_BINARY_VER}-$PROJECT
RAPIDS_UDF_JAR="$ARTF_ROOT/rapids-4-spark-udf-examples_${SCALA_BINARY_VER}-$PROJECT_TEST_VER.jar"
RAPIDS_TEST_JAR="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_TEST_VER-$SHUFFLE_SPARK_SHIM.jar"

# TODO remove -Dtransitive=false workaround once pom is fixed
$MVN_GET_CMD -DremoteRepositories=$PROJECT_TEST_REPO \
-Dtransitive=false \
-DgroupId=com.nvidia -DartifactId=rapids-4-spark-integration-tests_$SCALA_BINARY_VER -Dversion=$PROJECT_TEST_VER -Dclassifier=pytest -Dpackaging=tar.gz

RAPIDS_INT_TESTS_HOME="$ARTF_ROOT/integration_tests/"
# The version of pytest.tar.gz that is uploaded is the one built against spark301 but it's being pushed without a classifier for now
RAPIDS_INT_TESTS_TGZ="$ARTF_ROOT/rapids-4-spark-integration-tests_${SCALA_BINARY_VER}-$PROJECT_TEST_VER-pytest.tar.gz"

tmp_info=${TMP_INFO_FILE:-'/tmp/artifacts-build.info'}
rm -rf "$tmp_info"
TEE_CMD="tee -a $tmp_info"
GREP_CMD="grep revision"
AWK_CMD=(awk -F'=' '{print $2}')
getRevision() {
local file=$1
local properties=$2
local revision
if [[ $file == *.jar || $file == *.zip ]]; then
revision=$(unzip -p "$file" "$properties" | $TEE_CMD | $GREP_CMD | "${AWK_CMD[@]}" || true)
elif [[ $file == *.tgz || $file == *.tar.gz ]]; then
revision=$(tar -xzf "$file" --to-command=cat "$properties" | $TEE_CMD | $GREP_CMD | "${AWK_CMD[@]}" || true)
fi
echo "$revision"
}

set +x
rm -rf /tmp/artifacts-build.info
TEE_CMD="tee -a /tmp/artifacts-build.info"
echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" | $TEE_CMD
echo "-------------------- cudf JNI BUILD INFO --------------------" | $TEE_CMD
unzip -p $CUDF_JAR cudf-java-version-info.properties | $TEE_CMD || true
echo "-------------------- rapids-4-spark BUILD INFO --------------------" | $TEE_CMD
unzip -p $RAPIDS_PLUGIN_JAR rapids4spark-version-info.properties | $TEE_CMD || true
echo "-------------------- rapids-4-spark-udf-examples BUILD INFO --------------------" | $TEE_CMD
unzip -p $RAPIDS_UDF_JAR rapids4spark-version-info.properties | $TEE_CMD || true
echo "-------------------- rapids-4-spark-integration-tests BUILD INFO --------------------" | $TEE_CMD
unzip -p $RAPIDS_TEST_JAR rapids4spark-version-info.properties | $TEE_CMD || true
echo "-------------------- rapids-4-spark-integration-tests pytest BUILD INFO --------------------" | $TEE_CMD
tar -xzf $RAPIDS_INT_TESTS_TGZ --to-command=cat integration_tests/rapids4spark-version-info.properties | $TEE_CMD || true
echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" | $TEE_CMD
echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" >> "$tmp_info"
echo "-------------------- cudf JNI BUILD INFO --------------------" >> "$tmp_info"
c_ver=$(getRevision $JARS_PATH/$CUDF_JAR cudf-java-version-info.properties)
echo "-------------------- rapids-4-spark BUILD INFO --------------------" >> "$tmp_info"
p_ver=$(getRevision $JARS_PATH/$RAPIDS_PLUGIN_JAR rapids4spark-version-info.properties)
echo "-------------------- rapids-4-spark-integration-tests BUILD INFO --------------------" >> "$tmp_info"
it_ver=$(getRevision $JARS_PATH/$RAPIDS_TEST_JAR rapids4spark-version-info.properties)
echo "-------------------- rapids-4-spark-integration-tests pytest BUILD INFO --------------------" >> "$tmp_info"
pt_ver=$(getRevision $JARS_PATH/$RAPIDS_INT_TESTS_TGZ integration_tests/rapids4spark-version-info.properties)
echo "-------------------- rapids-4-spark-udf-examples BUILD INFO --------------------" >> "$tmp_info"
u_ver=$(getRevision $JARS_PATH/$RAPIDS_UDF_JAR rapids4spark-version-info.properties)
echo -e "\n==================== ARTIFACTS BUILD INFO ====================\n" >> "$tmp_info"
set -x
cat "$tmp_info" || true

if [[ -z "$c_ver" || -z "$p_ver"|| \
"$p_ver" != "$it_ver" || "$p_ver" != "$pt_ver" || "$p_ver" != "$u_ver" ]]; then
echo "Artifacts versions are inconsistent!"
exit 1
fi

tar xzf "$RAPIDS_INT_TESTS_TGZ" -C $ARTF_ROOT && rm -f "$RAPIDS_INT_TESTS_TGZ"

@@ -343,7 +343,8 @@ abstract class SparkBaseShims extends Spark30XShims {
}
override def convertToGpu(lhs: Expression, regexp: Expression,
rep: Expression): GpuExpression = GpuStringReplace(lhs, regexp, rep)
})
}),
GpuScalaUDFMeta.exprMeta
).map(r => (r.getClassFor.asSubclass(classOf[Expression]), r)).toMap
}
