
Prevent approx_percentile aggregate from being split between CPU and GPU #3862

Merged — 21 commits, merged Nov 2, 2021
25 changes: 25 additions & 0 deletions integration_tests/src/main/python/hash_aggregate_test.py
@@ -1164,6 +1164,31 @@ def test_hash_groupby_approx_percentile_double_scalar():
('v', DoubleGen())], length=100),
0.05)

@pytest.mark.parametrize('aqe_enabled', ['false', 'true'], ids=idfn)
@ignore_order(local=True)
@allow_non_gpu('TakeOrderedAndProjectExec', 'Alias', 'Cast', 'ObjectHashAggregateExec', 'AggregateExpression',
               'ApproximatePercentile', 'Literal', 'ShuffleExchangeExec', 'HashPartitioning', 'CollectLimitExec')
def test_hash_groupby_approx_percentile_partial_fallback_to_cpu(aqe_enabled):
    conf = copy_and_update(_approx_percentile_conf, {
        'spark.sql.adaptive.enabled': aqe_enabled,
        'spark.rapids.sql.explain': 'ALL'
Review comment (Collaborator): debug?

Review comment (Collaborator): Could this test work in a similar way to assert_cpu_and_gpu_are_equal_collect_with_capture, where a list of "exist" and "non_exist" classes is used to assert that the query has indeed fallen back? Thinking of the case where the cast gets "pushed up" to a projection after the hash agg in the future.
    })

    def create_and_show_df(spark):
        df = gen_df(spark, [('k', StringGen(nullable=False)),
                            ('v', DoubleGen())], length=100)
        df.createOrReplaceTempView("t")
        df2 = spark.sql("SELECT k, approx_percentile(v, array(0.1, 0.2)) from t group by k")

        # the "show" introduces a `CAST(approx_percentile(...) AS string)` on the final aggregate,
        # which is not supported on GPU and so falls back to CPU; the purpose of this test is to
        # make sure that the partial aggregate also falls back to CPU
        df2.show()

        return df2

    run_with_cpu_and_gpu(create_and_show_df, 'COLLECT', conf)
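The reviewer's suggestion of asserting on "exist" and "non_exist" operator classes could be realized with a plan-text check along these lines. This is a hypothetical helper for illustration only — `assert_plan_fallback` and its signature are assumptions, not the suite's actual `assert_cpu_and_gpu_are_equal_collect_with_capture` implementation:

```python
# Hypothetical sketch: verify a fallback by checking which operator names
# appear (or must not appear) in the captured physical plan text.
def assert_plan_fallback(plan_string, exist_classes, non_exist_classes):
    """Assert every name in exist_classes occurs in the plan text and
    none of the names in non_exist_classes do."""
    for cls in exist_classes:
        assert cls in plan_string, f"expected {cls} in plan"
    for cls in non_exist_classes:
        assert cls not in plan_string, f"did not expect {cls} in plan"
```

With such a helper, the test could assert that the CPU `ObjectHashAggregateExec` is present and that no GPU aggregate appears, which would keep the test robust even if the cast later moves into a projection above the hash aggregate.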

# The percentile approx tests differ from other tests because we do not expect the CPU and GPU to produce the same
# results due to the different algorithms being used. Instead we compute an exact percentile on the CPU and then
# compute approximate percentiles on CPU and GPU and assert that the GPU numbers are accurate within some percentage
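The tolerance-based comparison described in that comment can be sketched as follows. This is a standalone illustration — the helper name and the 5% tolerance are assumptions, not the suite's actual check:

```python
import random
import statistics

def within_tolerance(exact, approx, rel_tol):
    """True when approx deviates from exact by at most rel_tol (a fraction)."""
    if exact == 0:
        return abs(approx) <= rel_tol
    return abs(approx - exact) / abs(exact) <= rel_tol

# Compute an exact median, then accept an approximate answer within 5%.
random.seed(0)
values = [random.gauss(100.0, 15.0) for _ in range(1000)]
exact_p50 = statistics.median(values)
approx_p50 = exact_p50 * 1.01  # stand-in for an approximate algorithm's answer
assert within_tolerance(exact_p50, approx_p50, rel_tol=0.05)
```

The point is that CPU and GPU approximate results are each validated against the exact CPU percentile, rather than compared to one another bit-for-bit.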
11 changes: 11 additions & 0 deletions sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala
@@ -26,6 +26,7 @@ import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.trees.TreeNodeTag
import org.apache.spark.sql.connector.read.Scan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.adaptive.QueryStageExec
import org.apache.spark.sql.execution.aggregate.BaseAggregateExec
import org.apache.spark.sql.execution.command.DataWritingCommand
import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
@@ -627,6 +628,16 @@ abstract class SparkPlanMeta[INPUT <: SparkPlan](plan: INPUT,
}
}

  def recursivelyCheckTags(): Unit = {
    if (wrapped.isInstanceOf[QueryStageExec] ||
        ShimLoader.getSparkShims.isCustomReaderExec(wrapped)) {
      // stop recursion once we hit an already-executed query stage or a reader for it
    } else {
      wrapped.getTagValue(gpuSupportedTag).foreach(_.foreach(willNotWorkOnGpu))
      childPlans.foreach(_.recursivelyCheckTags())
    }
  }
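The walk above surfaces each node's stored fallback reasons and recurses into children, stopping at already-executed query stages so their stale tags are not re-applied. The shape of the recursion can be modeled generically — this is illustrative Python with made-up class and field names, not the plugin's Scala types:

```python
# Illustrative model of the recursivelyCheckTags walk: each node reports its
# own stored fallback reasons, then recurses into children, but the walk stops
# at nodes representing already-executed query stages (or readers for them).
class PlanMeta:
    def __init__(self, name, is_stage_boundary=False, tagged_reasons=None, children=()):
        self.name = name
        self.is_stage_boundary = is_stage_boundary
        self.tagged_reasons = list(tagged_reasons or [])
        self.children = list(children)
        self.cpu_reasons = []  # reasons this node will run on CPU

    def will_not_work_on_gpu(self, reason):
        self.cpu_reasons.append(reason)

    def recursively_check_tags(self):
        if self.is_stage_boundary:
            return  # stop: stage already executed, its tags no longer apply
        for reason in self.tagged_reasons:
            self.will_not_work_on_gpu(reason)
        for child in self.children:
            child.recursively_check_tags()
```

In the PR's scenario, this is what lets a CPU-only final aggregate force the partial aggregate below the exchange onto the CPU as well, instead of only checking the exchange's own tags.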

/**
* Run rules that happen for the entire tree after it has been tagged initially.
*/
@@ -253,7 +253,7 @@ class GpuBroadcastMeta(
}
// when AQE is enabled and we are planning a new query stage, we need to look at meta-data
// previously stored on the spark plan to determine whether this exchange can run on GPU
-    wrapped.getTagValue(gpuSupportedTag).foreach(_.foreach(willNotWorkOnGpu))
+    recursivelyCheckTags()
}

override def convertToGpu(): GpuExec = {
@@ -81,7 +81,7 @@ class GpuShuffleMeta(
override def tagPlanForGpu(): Unit = {
// when AQE is enabled and we are planning a new query stage, we need to look at meta-data
// previously stored on the spark plan to determine whether this exchange can run on GPU
-    wrapped.getTagValue(gpuSupportedTag).foreach(_.foreach(willNotWorkOnGpu))
+    recursivelyCheckTags()

shuffle.outputPartitioning match {
case _: RoundRobinPartitioning