NVIDIA · tgravescs · Jun 9, 2021 · Jun 4, 2021 · Jun 5, 2021 · Jun 5, 2021
diff --git a/integration_tests/src/main/python/window_function_test.py b/integration_tests/src/main/python/window_function_test.py
@@ -455,3 +455,34 @@ def test_window_aggs_for_rows_collect_list():
             (partition by a order by b,c_int rows between CURRENT ROW and UNBOUNDED FOLLOWING) as collect_struct
         from window_collect_table
         ''')
+
+
+# Running frames (UNBOUNDED PRECEDING..CURRENT ROW); SortExec does not support arrays, so sort locally.
+@ignore_order(local=True)
+def test_running_window_aggs_for_rows_collect_list():
+    assert_gpu_and_cpu_are_equal_sql(
+        lambda spark : gen_df(spark, _gen_data_for_collect),
+        "window_collect_table",
+        '''
+        select
+          sum(c_int) over
+            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as sum_int,
+          min(c_long) over
+            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as min_long,
+          max(c_time) over
+            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as max_time,
+          count(1) over
+            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as count_1,
+          count(*) over
+            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as count_star,
+          row_number() over
+            (partition by a order by b,c_int) as row_num,
+          collect_list(c_float) over
+            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as collect_float,
+          collect_list(c_decimal) over
+            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as collect_decimal,
+          collect_list(c_struct) over
+            (partition by a order by b,c_int rows between UNBOUNDED PRECEDING AND CURRENT ROW) as collect_struct
+        from window_collect_table
+        ''')
+
diff --git a/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/SparkBaseShims.scala b/shims/spark301/src/main/scala/com/nvidia/spark/rapids/shims/spark301/SparkBaseShims.scala
@@ -47,6 +47,7 @@ import org.apache.spark.sql.execution.datasources.v2.parquet.ParquetScan
 import org.apache.spark.sql.execution.exchange.{ReusedExchangeExec, ShuffleExchangeExec}
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BroadcastNestedLoopJoinExec, HashJoin, ShuffledHashJoinExec, SortMergeJoinExec}
 import org.apache.spark.sql.execution.python.{AggregateInPandasExec, ArrowEvalPythonExec, FlatMapGroupsInPandasExec, MapInPandasExec, WindowInPandasExec}
+import org.apache.spark.sql.execution.window.WindowExecBase
 import org.apache.spark.sql.internal.SQLConf
 import org.apache.spark.sql.rapids.{GpuFileSourceScanExec, GpuStringReplace, GpuTimeSub, ShuffleManagerShimBase}
 import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExecBase, GpuBroadcastNestedLoopJoinExecBase, GpuShuffleExchangeExecBase}
@@ -141,6 +142,8 @@ abstract class SparkBaseShims extends SparkShims {
     }
   }
 
+  override def isWindowFunctionExec(plan: SparkPlan): Boolean = plan.isInstanceOf[WindowExecBase]
+
   override def isGpuShuffledHashJoin(plan: SparkPlan): Boolean = {
     plan match {
       case _: GpuShuffledHashJoinExec => true

diff --git a/...k301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuRunningWindowExec.scala b/...k301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/GpuRunningWindowExec.scala
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.shims.spark301db
+
+import com.databricks.sql.execution.window.RunningWindowFunctionExec
+import com.nvidia.spark.rapids.{BaseExprMeta, DataFromReplacementRule, GpuExec, GpuOverrides, GpuWindowExec, RapidsConf, RapidsMeta, SparkPlanMeta}
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression, SortOrder}
+
+/**
+ * Meta for RunningWindowFunctionExec: wraps its expressions and converts it to GpuWindowExec.
+ */
+class GpuRunningWindowExecMeta(runningWindowFunctionExec: RunningWindowFunctionExec,
+    conf: RapidsConf,
+    parent: Option[RapidsMeta[_, _, _]],
+    rule: DataFromReplacementRule)
+    extends SparkPlanMeta[RunningWindowFunctionExec](runningWindowFunctionExec, conf, parent, rule) {
+
+  val windowExpressions: Seq[BaseExprMeta[NamedExpression]] =
+    runningWindowFunctionExec.windowExpressionList.map(GpuOverrides.wrapExpr(_, conf, Some(this)))
+  val partitionSpec: Seq[BaseExprMeta[Expression]] =
+    runningWindowFunctionExec.partitionSpec.map(GpuOverrides.wrapExpr(_, conf, Some(this)))
+  val orderSpec: Seq[BaseExprMeta[SortOrder]] =
+    runningWindowFunctionExec.orderSpec.map(GpuOverrides.wrapExpr(_, conf, Some(this)))
+
+  override def tagPlanForGpu(): Unit = {
+    windowExpressions.map(meta => meta.wrapped)
+        .filter(expr => !expr.isInstanceOf[NamedExpression])
+        .foreach(_ => willNotWorkOnGpu(because = "Unexpected query plan with Windowing functions; " +
+            "cannot convert for GPU execution. " +
+            "(Detail: WindowExpression not wrapped in `NamedExpression`.)"))
+  }
+
+  override def convertToGpu(): GpuExec = {
+    GpuWindowExec(
+      windowExpressions.map(_.convertToGpu()),
+      partitionSpec.map(_.convertToGpu()),
+      orderSpec.map(_.convertToGpu().asInstanceOf[SortOrder]),
+      childPlans.head.convertIfNeeded(),
+      true
+    )
+  }
+}
+
diff --git a/.../spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/Spark301dbShims.scala b/.../spark301db/src/main/scala/com/nvidia/spark/rapids/shims/spark301db/Spark301dbShims.scala
@@ -16,6 +16,7 @@
 
 package com.nvidia.spark.rapids.shims.spark301db
 
+import com.databricks.sql.execution.window.RunningWindowFunctionExec
 import com.nvidia.spark.rapids._
 import com.nvidia.spark.rapids.shims.spark301.Spark301Shims
 import org.apache.hadoop.fs.Path
@@ -36,6 +37,7 @@ import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BroadcastNestedLoopJoinExec, HashJoin, SortMergeJoinExec}
 import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec
 import org.apache.spark.sql.execution.python.{AggregateInPandasExec, ArrowEvalPythonExec, FlatMapGroupsInPandasExec, MapInPandasExec, WindowInPandasExec}
+import org.apache.spark.sql.execution.window.WindowExecBase
 import org.apache.spark.sql.rapids.GpuFileSourceScanExec
 import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExecBase, GpuBroadcastNestedLoopJoinExecBase, GpuShuffleExchangeExecBase}
 import org.apache.spark.sql.rapids.execution.python.{GpuAggregateInPandasExecMeta, GpuArrowEvalPythonExec, GpuFlatMapGroupsInPandasExecMeta, GpuMapInPandasExecMeta, GpuPythonUDF, GpuWindowInPandasExecMetaBase}
@@ -75,6 +77,9 @@ class Spark301dbShims extends Spark301Shims {
     }
   }
 
+  override def isWindowFunctionExec(plan: SparkPlan): Boolean =
+    plan.isInstanceOf[WindowExecBase] || plan.isInstanceOf[RunningWindowFunctionExec]
+
   override def getExecs: Map[Class[_ <: SparkPlan], ExecRule[_ <: SparkPlan]] = {
     Seq(
       GpuOverrides.exec[WindowInPandasExec](
@@ -96,6 +101,17 @@ class Spark301dbShims extends Spark301Shims {
             )
           }
         }).disabledByDefault("it only supports row based frame for now"),
+      GpuOverrides.exec[RunningWindowFunctionExec](
+        "Databricks-specific window function exec, for \"running\" windows, " +
+            "i.e. (UNBOUNDED PRECEDING TO CURRENT ROW)",
+        ExecChecks(
+          TypeSig.commonCudfTypes + TypeSig.DECIMAL +
+              TypeSig.STRUCT.nested(TypeSig.commonCudfTypes + TypeSig.DECIMAL) +
+              TypeSig.ARRAY.nested(TypeSig.commonCudfTypes + TypeSig.DECIMAL + TypeSig.STRUCT
+                  + TypeSig.ARRAY),
+          TypeSig.all),
+        (runningWindow, conf, p, r) => new GpuRunningWindowExecMeta(runningWindow, conf, p, r)
+      ),
       GpuOverrides.exec[FileSourceScanExec](
         "Reading data from files, often from Hive tables",
         ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.STRUCT + TypeSig.MAP +

diff --git a/...k311db/src/main/scala/com/nvidia/spark/rapids/shims/spark311db/GpuRunningWindowExec.scala b/...k311db/src/main/scala/com/nvidia/spark/rapids/shims/spark311db/GpuRunningWindowExec.scala
@@ -0,0 +1,58 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.nvidia.spark.rapids.shims.spark311db
+
+import com.databricks.sql.execution.window.RunningWindowFunctionExec
+import com.nvidia.spark.rapids.{BaseExprMeta, DataFromReplacementRule, GpuExec, GpuOverrides, GpuWindowExec, RapidsConf, RapidsMeta, SparkPlanMeta}
+
+import org.apache.spark.sql.catalyst.expressions.{Expression, NamedExpression, SortOrder}
+
+/**
+ * Meta for RunningWindowFunctionExec: wraps its expressions and converts it to GpuWindowExec.
+ */
+class GpuRunningWindowExecMeta(runningWindowFunctionExec: RunningWindowFunctionExec,
+    conf: RapidsConf,
+    parent: Option[RapidsMeta[_, _, _]],
+    rule: DataFromReplacementRule)
+    extends SparkPlanMeta[RunningWindowFunctionExec](runningWindowFunctionExec, conf, parent, rule) {
+
+  val windowExpressions: Seq[BaseExprMeta[NamedExpression]] =
+    runningWindowFunctionExec.windowExpressionList.map(GpuOverrides.wrapExpr(_, conf, Some(this)))
+  val partitionSpec: Seq[BaseExprMeta[Expression]] =
+    runningWindowFunctionExec.partitionSpec.map(GpuOverrides.wrapExpr(_, conf, Some(this)))
+  val orderSpec: Seq[BaseExprMeta[SortOrder]] =
+    runningWindowFunctionExec.orderSpec.map(GpuOverrides.wrapExpr(_, conf, Some(this)))
+
+  override def tagPlanForGpu(): Unit = {
+    windowExpressions.map(meta => meta.wrapped)
+        .filter(expr => !expr.isInstanceOf[NamedExpression])
+        .foreach(_ => willNotWorkOnGpu(because = "Unexpected query plan with Windowing functions; " +
+            "cannot convert for GPU execution. " +
+            "(Detail: WindowExpression not wrapped in `NamedExpression`.)"))
+  }
+
+  override def convertToGpu(): GpuExec = {
+    GpuWindowExec(
+      windowExpressions.map(_.convertToGpu()),
+      partitionSpec.map(_.convertToGpu()),
+      orderSpec.map(_.convertToGpu().asInstanceOf[SortOrder]),
+      childPlans.head.convertIfNeeded(),
+      true
+    )
+  }
+}
+
diff --git a/.../spark311db/src/main/scala/com/nvidia/spark/rapids/shims/spark311db/Spark311dbShims.scala b/.../spark311db/src/main/scala/com/nvidia/spark/rapids/shims/spark311db/Spark311dbShims.scala
@@ -16,6 +16,7 @@
 
 package com.nvidia.spark.rapids.shims.spark311db
 
+import com.databricks.sql.execution.window.RunningWindowFunctionExec
 import com.nvidia.spark.rapids._
 import com.nvidia.spark.rapids.shims.spark311.Spark311Shims
 import org.apache.hadoop.fs.Path
@@ -37,6 +38,7 @@ import org.apache.spark.sql.execution.exchange.ShuffleExchangeExec
 import org.apache.spark.sql.execution.joins.{BroadcastHashJoinExec, BroadcastNestedLoopJoinExec, HashJoin, SortMergeJoinExec}
 import org.apache.spark.sql.execution.joins.ShuffledHashJoinExec
 import org.apache.spark.sql.execution.python.{AggregateInPandasExec, ArrowEvalPythonExec, FlatMapGroupsInPandasExec, MapInPandasExec, WindowInPandasExec}
+import org.apache.spark.sql.execution.window.WindowExecBase
 import org.apache.spark.sql.rapids.GpuFileSourceScanExec
 import org.apache.spark.sql.rapids.execution.{GpuBroadcastExchangeExecBase, GpuBroadcastNestedLoopJoinExecBase, GpuShuffleExchangeExecBase}
 import org.apache.spark.sql.rapids.execution.python.{GpuPythonUDF, GpuWindowInPandasExecMetaBase}
@@ -77,6 +79,9 @@ class Spark311dbShims extends Spark311Shims {
     }
   }
 
+  override def isWindowFunctionExec(plan: SparkPlan): Boolean =
+    plan.isInstanceOf[WindowExecBase] || plan.isInstanceOf[RunningWindowFunctionExec]
+
   override def getExecs: Map[Class[_ <: SparkPlan], ExecRule[_ <: SparkPlan]] = {
     Seq(
       GpuOverrides.exec[WindowInPandasExec](
@@ -99,6 +104,17 @@ class Spark311dbShims extends Spark311Shims {
             )
           }
         }).disabledByDefault("it only supports row based frame for now"),
+      GpuOverrides.exec[RunningWindowFunctionExec](
+        "Databricks-specific window function exec, for \"running\" windows, " +
+            "i.e. (UNBOUNDED PRECEDING TO CURRENT ROW)",
+        ExecChecks(
+          TypeSig.commonCudfTypes + TypeSig.DECIMAL +
+              TypeSig.STRUCT.nested(TypeSig.commonCudfTypes + TypeSig.DECIMAL) +
+              TypeSig.ARRAY.nested(TypeSig.commonCudfTypes + TypeSig.DECIMAL + TypeSig.STRUCT
+                  + TypeSig.ARRAY),
+          TypeSig.all),
+        (runningWindow, conf, p, r) => new GpuRunningWindowExecMeta(runningWindow, conf, p, r)
+      ),
       GpuOverrides.exec[FileSourceScanExec](
         "Reading data from files, often from Hive tables",
         ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.STRUCT + TypeSig.MAP +

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/RapidsMeta.scala
@@ -712,7 +712,8 @@ object ExpressionContext {
     val parent = findParentPlanMeta(meta)
     assert(parent.isDefined, "It is expected that an aggregate function is a child of a SparkPlan")
     parent.get.wrapped match {
-      case _: WindowExecBase => WindowAggExprContext
+      case agg if ShimLoader.getSparkShims.isWindowFunctionExec(agg.asInstanceOf[SparkPlan]) =>
+        WindowAggExprContext
       case agg: BaseAggregateExec =>
         if (agg.groupingExpressions.isEmpty) {
           ReductionAggExprContext

diff --git a/sql-plugin/src/main/scala/com/nvidia/spark/rapids/SparkShims.scala b/sql-plugin/src/main/scala/com/nvidia/spark/rapids/SparkShims.scala
@@ -90,6 +90,7 @@ trait SparkShims {
 
   def isGpuBroadcastHashJoin(plan: SparkPlan): Boolean
   def isGpuShuffledHashJoin(plan: SparkPlan): Boolean
+  def isWindowFunctionExec(plan: SparkPlan): Boolean
   def getRapidsShuffleManagerClass: String
   def getBuildSide(join: HashJoin): GpuBuildSide
   def getBuildSide(join: BroadcastNestedLoopJoinExec): GpuBuildSide