Add explain Plugin API for CPU plan #3850

Merged · 67 commits · Oct 21, 2021

Commits
413872a
Start making the Qualification tool programmatically callable
tgravescs Sep 27, 2021
2062e66
remove unneeded numRows
tgravescs Sep 27, 2021
c67daff
add test
tgravescs Sep 27, 2021
030a167
handle the listener rest of events
tgravescs Sep 27, 2021
1cd2a65
create RunningQualApp
tgravescs Sep 27, 2021
8943e54
update test and start looking at api for explain
tgravescs Sep 28, 2021
b4147d4
copyright
tgravescs Sep 30, 2021
2f9b9dc
refactor writer so we can get output strings
tgravescs Oct 1, 2021
e14a844
refactor applyOverrides and add config to explain only
tgravescs Oct 1, 2021
d6d458e
fix param
tgravescs Oct 1, 2021
6ffe9a0
fix param headerCSV
tgravescs Oct 1, 2021
aea7a4c
Add explain function
tgravescs Oct 5, 2021
8d28954
refactor
tgravescs Oct 5, 2021
3a9f237
try shimloader
tgravescs Oct 5, 2021
a55d9dd
expose GpuExplainPlan
tgravescs Oct 5, 2021
98ab5bb
rename ExplainPlan
tgravescs Oct 5, 2021
302dd7a
debug
tgravescs Oct 5, 2021
748fee4
working except subquery
tgravescs Oct 6, 2021
3379c00
add expressions subqueries
tgravescs Oct 6, 2021
ff0f284
get subquery plans
tgravescs Oct 6, 2021
aea57ff
fix
tgravescs Oct 6, 2021
e116aba
find
tgravescs Oct 6, 2021
e3e24d4
handle children
tgravescs Oct 6, 2021
83e1541
print
tgravescs Oct 6, 2021
e314883
fix plans
tgravescs Oct 6, 2021
44d12ce
working subqueries
tgravescs Oct 7, 2021
6d83bcb
Merge remote-tracking branch 'origin/branch-21.12' into qualCallable
tgravescs Oct 7, 2021
a57ca73
comments
tgravescs Oct 7, 2021
b9e7e8a
cleanup
tgravescs Oct 7, 2021
f9c2022
rework overrides
tgravescs Oct 7, 2021
8755a86
fix missing conf
tgravescs Oct 7, 2021
4472898
Add separate file explain gpu plan
tgravescs Oct 7, 2021
f6e0f0b
Merge remote-tracking branch 'origin/branch-21.12' into qualCallable2
tgravescs Oct 7, 2021
5306a81
put stuff back in overrides
tgravescs Oct 7, 2021
2772ebb
start shim aqe get init plan
tgravescs Oct 7, 2021
a681645
add shims
tgravescs Oct 8, 2021
7b5d5bf
Merge remote-tracking branch 'origin/branch-21.12' into qualCallable2
tgravescs Oct 8, 2021
c46a33f
remove config
tgravescs Oct 8, 2021
da4fa3e
remove qualification tool changes
tgravescs Oct 8, 2021
c9ecd10
add javadoc
tgravescs Oct 8, 2021
394e17d
remove file not needed
tgravescs Oct 8, 2021
0729fc7
updates to explain parameter
tgravescs Oct 8, 2021
ac1513a
change to string
tgravescs Oct 8, 2021
6ef2e1b
rework
tgravescs Oct 8, 2021
bf12fac
rename ExplainGPUPlan
tgravescs Oct 11, 2021
874e4f7
Update name
tgravescs Oct 11, 2021
52f3500
Merge remote-tracking branch 'origin/branch-21.12' into qualExplain
tgravescs Oct 14, 2021
4fdb4e9
Merge remote-tracking branch 'origin/branch-21.12' into qualExplain
tgravescs Oct 18, 2021
bd4bbeb
update javadoc
tgravescs Oct 18, 2021
f51be6d
revert docs
tgravescs Oct 18, 2021
9499fcb
update doc
tgravescs Oct 18, 2021
f9901c6
Add pytest for explain api
tgravescs Oct 18, 2021
268502c
more tests
tgravescs Oct 18, 2021
f64a3e0
Merge remote-tracking branch 'origin/branch-21.12' into qualExplain
tgravescs Oct 18, 2021
d4f9718
add python api example
tgravescs Oct 18, 2021
b4466ee
remove import
tgravescs Oct 18, 2021
e14fb76
change comment
tgravescs Oct 18, 2021
edf9253
revert docs
tgravescs Oct 18, 2021
f85c530
change names
tgravescs Oct 18, 2021
0565cf0
change func name
tgravescs Oct 18, 2021
a09dc0e
revert docs
tgravescs Oct 18, 2021
a3087e7
Merge remote-tracking branch 'origin/branch-21.12' into qualExplain
tgravescs Oct 18, 2021
437abb8
revert docs
tgravescs Oct 18, 2021
64475af
Add a test for setting rapids conf before calling explain
tgravescs Oct 19, 2021
456201e
review comments
tgravescs Oct 19, 2021
3640aff
update docs
tgravescs Oct 19, 2021
4f86cba
Define what we throw and update to not use reflection
tgravescs Oct 20, 2021
1 change: 1 addition & 0 deletions dist/unshimmed-common-from-spark301.txt
@@ -7,6 +7,7 @@ com/nvidia/spark/RapidsUDF*
 com/nvidia/spark/SQLPlugin*
 com/nvidia/spark/rapids/ColumnarRdd*
 com/nvidia/spark/rapids/ExecutionPlanCaptureCallback*
+com/nvidia/spark/rapids/ExplainPlan*
 com/nvidia/spark/rapids/GpuKryoRegistrator*
 com/nvidia/spark/rapids/PlanUtils*
 com/nvidia/spark/rapids/RapidsExecutorHeartbeatMsg*
93 changes: 93 additions & 0 deletions integration_tests/src/main/python/explain_test.py
@@ -0,0 +1,93 @@
# Copyright (c) 2021, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest

from data_gen import *
from marks import *
from pyspark.sql.functions import *
from pyspark.sql.types import *
from spark_session import with_cpu_session

def create_df(spark, data_gen, left_length, right_length):
    left = binary_op_df(spark, data_gen, length=left_length)
    right = binary_op_df(spark, data_gen, length=right_length).withColumnRenamed("a", "r_a")\
        .withColumnRenamed("b", "r_b")
    return left, right


@pytest.mark.parametrize('data_gen', [StringGen()], ids=idfn)
def test_explain_join(spark_tmp_path, data_gen):
    data_path1 = spark_tmp_path + '/PARQUET_DATA1'
    data_path2 = spark_tmp_path + '/PARQUET_DATA2'

    def do_join_explain(spark):
        left, right = create_df(spark, data_gen, 500, 500)
        left.write.parquet(data_path1)
        right.write.parquet(data_path2)
        df1 = spark.read.parquet(data_path1)
        df2 = spark.read.parquet(data_path2)
        df3 = df1.join(df2, df1.a == df2.r_a, "inner")
        explain_str = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df3._jdf, "ALL")
        remove_isnotnull = explain_str.replace("isnotnull", "")
        # everything should be on GPU
        assert "not" not in remove_isnotnull

    with_cpu_session(do_join_explain)

def test_explain_set_config():
    conf = {'spark.rapids.sql.hasExtendedYearValues': 'false',
            'spark.rapids.sql.castStringToTimestamp.enabled': 'true'}

    def do_explain(spark):
        df = unary_op_df(spark, StringGen('[0-9]{1,4}-[0-9]{1,2}-[0-9]{1,2}')).select(col('a').cast(TimestampType()))
        # a bit brittle if these get turned on by default
        spark.conf.set('spark.rapids.sql.hasExtendedYearValues', 'false')
        spark.conf.set('spark.rapids.sql.castStringToTimestamp.enabled', 'true')
        explain_str = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df._jdf, "ALL")
        print(explain_str)
        assert "timestamp) will run on GPU" in explain_str
        spark.conf.set('spark.rapids.sql.castStringToTimestamp.enabled', 'false')
        explain_str_cast_off = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df._jdf, "ALL")
        print(explain_str_cast_off)
        assert "timestamp) cannot run on GPU" in explain_str_cast_off

    with_cpu_session(do_explain)

def test_explain_udf():
    slen = udf(lambda s: len(s), IntegerType())

    @udf
    def to_upper(s):
        if s is not None:
            return s.upper()

    @udf(returnType=IntegerType())
    def add_one(x):
        if x is not None:
            return x + 1

    def do_explain(spark):
        df = spark.createDataFrame([(1, "John Doe", 21)], ("id", "name", "age"))
        df2 = df.select(slen("name").alias("slen(name)"), to_upper("name"), add_one("age"))
        explain_str = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df2._jdf, "ALL")
        # udf shouldn't be on GPU
        udf_str_not = 'cannot run on GPU because no GPU enabled version of operator class org.apache.spark.sql.execution.python.BatchEvalPythonExec'
        assert udf_str_not in explain_str
        not_on_gpu_str = spark.sparkContext._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df2._jdf, "NOT")
        assert udf_str_not in not_on_gpu_str
        assert "will run on GPU" not in not_on_gpu_str

    with_cpu_session(do_explain)
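
These tests reach the API through the py4j gateway (spark.sparkContext._jvm...) because ExplainPlan lives on the JVM side; from Scala the call is direct. A minimal sketch of the same two modes the tests exercise, assuming an active SparkSession `spark` with the spark-rapids and cudf jars on the classpath and the plugin itself disabled:

    import com.nvidia.spark.rapids.ExplainPlan

    val df = spark.range(100).selectExpr("id", "id % 10 as key")

    // "ALL" reports every operator, whether or not it could run on the GPU
    println(ExplainPlan.explainPotentialGpuPlan(df, "ALL"))

    // "NOT" reports only what could not run on the GPU,
    // mirroring the "NOT" assertion in test_explain_udf above
    println(ExplainPlan.explainPotentialGpuPlan(df, "NOT"))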

@@ -1004,6 +1004,10 @@ class Spark320Shims extends Spark32XShims {
       new KryoJavaSerializer())
   }

+  override def getAdaptiveInputPlan(adaptivePlan: AdaptiveSparkPlanExec): SparkPlan = {
+    adaptivePlan.initialPlan
+  }
+
   override def getLegacyStatisticalAggregate(): Boolean =
     SQLConf.get.legacyStatisticalAggregate
 }
@@ -43,7 +43,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.connector.read.Scan
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec, ShuffleQueryStageExec}
+import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, ShuffleQueryStageExec}
 import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, RunnableCommand}
 import org.apache.spark.sql.execution.datasources.{FileIndex, FilePartition, FileScanRDD, HadoopFsRelation, InMemoryFileIndex, PartitionDirectory, PartitionedFile, PartitioningAwareFileIndex}
 import org.apache.spark.sql.execution.datasources.json.JsonFileFormat
@@ -688,5 +688,9 @@ abstract class SparkBaseShims extends Spark30XShims {
       new KryoJavaSerializer())
   }

+  override def getAdaptiveInputPlan(adaptivePlan: AdaptiveSparkPlanExec): SparkPlan = {
+    adaptivePlan.initialPlan
+  }
+
   override def getLegacyStatisticalAggregate(): Boolean = true
 }
@@ -41,7 +41,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.connector.read.Scan
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec, ShuffleQueryStageExec}
+import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, ShuffleQueryStageExec}
 import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, RunnableCommand}
 import org.apache.spark.sql.execution.datasources.{FileIndex, FilePartition, FileScanRDD, HadoopFsRelation, InMemoryFileIndex, PartitionDirectory, PartitionedFile, PartitioningAwareFileIndex}
 import org.apache.spark.sql.execution.datasources.rapids.GpuPartitioningUtils
@@ -647,4 +647,8 @@ abstract class SparkBaseShims extends Spark30XShims {
     kryo.register(classOf[SerializeBatchDeserializeHostBuffer],
       new KryoJavaSerializer())
   }
+
+  override def getAdaptiveInputPlan(adaptivePlan: AdaptiveSparkPlanExec): SparkPlan = {
+    adaptivePlan.initialPlan
+  }
 }
@@ -42,7 +42,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.connector.read.Scan
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec, ShuffleQueryStageExec}
+import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, ShuffleQueryStageExec}
 import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
 import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, RunnableCommand}
 import org.apache.spark.sql.execution.datasources._
@@ -874,6 +874,10 @@ abstract class SparkBaseShims extends Spark30XShims {

   override def shouldFallbackOnAnsiTimestamp(): Boolean = SQLConf.get.ansiEnabled

+  override def getAdaptiveInputPlan(adaptivePlan: AdaptiveSparkPlanExec): SparkPlan = {
+    adaptivePlan.inputPlan
+  }
+
   override def getLegacyStatisticalAggregate(): Boolean =
     SQLConf.get.legacyStatisticalAggregate
 }
@@ -41,7 +41,7 @@ import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning
 import org.apache.spark.sql.catalyst.trees.TreeNode
 import org.apache.spark.sql.connector.read.Scan
 import org.apache.spark.sql.execution._
-import org.apache.spark.sql.execution.adaptive.{BroadcastQueryStageExec, ShuffleQueryStageExec}
+import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, BroadcastQueryStageExec, ShuffleQueryStageExec}
 import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
 import org.apache.spark.sql.execution.command.{AlterTableRecoverPartitionsCommand, RunnableCommand}
 import org.apache.spark.sql.execution.datasources._
@@ -845,6 +845,10 @@ abstract class SparkBaseShims extends Spark31XShims {

   override def shouldFallbackOnAnsiTimestamp(): Boolean = SQLConf.get.ansiEnabled

+  override def getAdaptiveInputPlan(adaptivePlan: AdaptiveSparkPlanExec): SparkPlan = {
+    adaptivePlan.inputPlan
+  }
+
   override def getLegacyStatisticalAggregate(): Boolean =
     SQLConf.get.legacyStatisticalAggregate
 }
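
Each Spark version shim overrides getAdaptiveInputPlan with whichever accessor that version exposes for the plan as it existed before adaptive query execution re-optimizes it: initialPlan on some versions, inputPlan on others. A hypothetical sketch (not part of this diff) of how the explain path can use the shim to unwrap AQE; `ShimLoader.getSparkShims` and the `PlanForExplain` helper are assumptions here, and the PR's actual call sites may differ:

    import com.nvidia.spark.rapids.ShimLoader
    import org.apache.spark.sql.DataFrame
    import org.apache.spark.sql.execution.SparkPlan
    import org.apache.spark.sql.execution.adaptive.AdaptiveSparkPlanExec

    object PlanForExplain {
      // AQE wraps the whole query in AdaptiveSparkPlanExec; ask the shim for the
      // plan it started from so the explain output reflects the initial CPU plan.
      def apply(df: DataFrame): SparkPlan = df.queryExecution.executedPlan match {
        case adaptive: AdaptiveSparkPlanExec =>
          ShimLoader.getSparkShims.getAdaptiveInputPlan(adaptive)
        case plan => plan // non-AQE plans are used as-is
      }
    }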
@@ -0,0 +1,74 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package com.nvidia.spark.rapids

import scala.util.control.NonFatal

import org.apache.spark.sql.DataFrame

// Base trait that is visible publicly outside of the parallel world packaging.
// It can't be named the same as the ExplainPlan object so that it can still be
// called from PySpark.
trait ExplainPlanBase {
  def explainPotentialGpuPlan(df: DataFrame, explain: String = "ALL"): String
}

object ExplainPlan {
  /**
   * Looks at the CPU plan associated with the dataframe and outputs information
   * about which parts of the query the RAPIDS Accelerator for Apache Spark
   * could place on the GPU. This only applies to the initial plan, so if running
   * with adaptive query execution enabled, it will not be able to show any changes
   * made to the plan at runtime due to that.
   *
   * This is very similar to the output you would get by running the query with the
   * RAPIDS Accelerator enabled and with the config `spark.rapids.sql.explain` enabled.
   *
   * Requires the RAPIDS Accelerator for Apache Spark jar and the RAPIDS cudf jar to be
   * included on the classpath, but the RAPIDS Accelerator for Apache Spark should be
   * disabled.
   *
   * {{{
   *   val output = com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df)
   * }}}
   *
   * Calling from PySpark:
   *
   * {{{
   *   output = sc._jvm.com.nvidia.spark.rapids.ExplainPlan.explainPotentialGpuPlan(df._jdf, "ALL")
   * }}}
   *
   * @param df The Spark DataFrame to get the query plan from
   * @param explain If "ALL", returns all of the explain data; otherwise returns only what
   *                does not work on the GPU. Default is ALL.
   * @return String containing the explained plan.
   * @throws IllegalArgumentException if an argument is invalid or it is unable to determine
   *                                  the Spark version
   * @throws IllegalStateException if the plugin gets into an invalid state while trying
   *                               to process the plan or there is an unexpected exception.
   */
  @throws[IllegalArgumentException]
  @throws[IllegalStateException]
  def explainPotentialGpuPlan(df: DataFrame, explain: String = "ALL"): String = {
    try {
      ShimLoader.newExplainPlan.explainPotentialGpuPlan(df, explain)
    } catch {
      case ia: IllegalArgumentException => throw ia
      case is: IllegalStateException => throw is
      case NonFatal(e) =>
        val msg = "Unexpected exception trying to run explain on the plan!"
        throw new IllegalStateException(msg, e)
    }
  }
}
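
The entry point above declares its two failure modes, so a caller can handle them explicitly instead of matching on plugin internals. A minimal caller sketch, assuming an existing DataFrame `df` and the setup described in the Scaladoc:

    import com.nvidia.spark.rapids.ExplainPlan

    val result: Either[String, String] =
      try {
        Right(ExplainPlan.explainPotentialGpuPlan(df, "ALL"))
      } catch {
        // bad `explain` argument, or the Spark version could not be determined
        case e: IllegalArgumentException => Left(s"invalid request: ${e.getMessage}")
        // the plugin hit an invalid state or an unexpected error processing the plan
        case e: IllegalStateException => Left(s"explain failed: ${e.getMessage}")
      }

    result match {
      case Right(out) => println(out)
      case Left(err)  => println(s"ERROR: $err")
    }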