
Added ParquetCachedBatchSerializer support for Databricks #2880

Merged: 10 commits, Jul 12, 2021
2 changes: 1 addition & 1 deletion jenkins/databricks/run-tests.py
@@ -35,7 +35,7 @@ def main():
print("rsync command: %s" % rsync_command)
subprocess.check_call(rsync_command, shell = True)

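# base_spark_pom_version is appended to the ssh command so the remote test script receives it as its third argument ($3 -> BASE_SPARK_VER in test.sh)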
ssh_command = "bash -c 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s 2>&1 | tee testout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % (master_addr, params.private_key_file, params.script_dest, params.jar_path, params.spark_conf)
ssh_command = "bash -c 'ssh -o StrictHostKeyChecking=no -o UserKnownHostsFile=/dev/null ubuntu@%s -p 2200 -i %s %s %s %s %s 2>&1 | tee testout; if [ `echo ${PIPESTATUS[0]}` -ne 0 ]; then false; else true; fi'" % (master_addr, params.private_key_file, params.script_dest, params.jar_path, params.spark_conf, params.base_spark_pom_version)
print("ssh command: %s" % ssh_command)
subprocess.check_call(ssh_command, shell = True)

20 changes: 19 additions & 1 deletion jenkins/databricks/test.sh
@@ -17,8 +17,9 @@

set -ex

LOCAL_JAR_PATH=$1
LOCAL_JAR_PATH=${LOCAL_JAR_PATH:-$1}
SPARK_CONF=$2
BASE_SPARK_VER=${BASE_SPARK_VER:-$3}

# tests
export PATH=/databricks/conda/envs/databricks-ml-gpu/bin:/databricks/conda/condabin:$PATH
@@ -54,19 +55,36 @@ if [ -n "$SPARK_CONF" ]; then
SPARK_CONF="--conf ${SPARK_CONF/','/' --conf '}"
fi

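# The PCBS cache tests require Spark 3.1.1 or later (spark.sql.cache.serializer was added in Spark 3.1).
# sort -V does a version-aware compare: if "3.1.1" sorts first, BASE_SPARK_VER is at least 3.1.1.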
IS_SPARK_311_OR_LATER=0
[[ "$(printf '%s\n' "3.1.1" "$BASE_SPARK_VER" | sort -V | head -n1)" = "3.1.1" ]] && IS_SPARK_311_OR_LATER=1

TEST_TYPE="nightly"
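# Serializer class from the Databricks (spark311db) shim; exported below as PYSP_TEST_spark_sql_cache_serializer, which the test harness applies as the spark.sql.cache.serializer conf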
PCBS_CONF="com.nvidia.spark.rapids.shims.spark311db.ParquetCachedBatchSerializer"
if [ -d "$LOCAL_JAR_PATH" ]; then
## Run tests with jars in the LOCAL_JAR_PATH dir downloaded from the dependency repo
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE

## Run cache tests
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
LOCAL_JAR_PATH=$LOCAL_JAR_PATH bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi

## Run cudf-udf tests
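# Expose the dist jar (excluding the tests jar) on the executors' PYTHONPATH; the cudf-udf tests rely on Python code shipped inside the plugin jar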
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls $LOCAL_JAR_PATH/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
LOCAL_JAR_PATH=$LOCAL_JAR_PATH SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
bash $LOCAL_JAR_PATH/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" -m "cudf_udf" --cudf_udf --test_type=$TEST_TYPE

else
## Run tests with jars building from the spark-rapids source code
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE

## Run cache tests
if [[ "$IS_SPARK_311_OR_LATER" -eq "1" ]]; then
PYSP_TEST_spark_sql_cache_serializer=${PCBS_CONF} \
bash /home/ubuntu/spark-rapids/integration_tests/run_pyspark_from_build.sh --runtime_env="databricks" --test_type=$TEST_TYPE -k cache_test
fi

## Run cudf-udf tests
CUDF_UDF_TEST_ARGS="$CUDF_UDF_TEST_ARGS --conf spark.executorEnv.PYTHONPATH=`ls /home/ubuntu/spark-rapids/dist/target/rapids-4-spark_*.jar | grep -v 'tests.jar'`"
SPARK_SUBMIT_FLAGS="$SPARK_CONF $CUDF_UDF_TEST_ARGS" TEST_PARALLEL=1 \
@@ -0,0 +1,22 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.nvidia.spark.rapids.shims.spark311db

import com.nvidia.spark.rapids.shims

class ParquetCachedBatchSerializer extends shims.spark311.ParquetCachedBatchSerializer {
Collaborator:
This file shouldn't be needed now, right? The 311 shim version of this works. Or is the intention to keep the db one so that the user specifies this one? https://nvidia.github.io/spark-rapids/docs/additional-functionality/cache-serializer.html. If that is the case we need to update the docs.

Originally the intention was that, since the user has to specify it, it should match the version of Spark they are using, so that it's hopefully the least confusing. I don't know how much it matters whether they specify spark311 vs spark311db.
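For reference, the way a user enables the serializer today (per the cache-serializer doc linked above) looks roughly like the sketch below. This is a minimal illustration, not code from this PR; it assumes the Spark 3.1.1 shim class name from that doc and Spark's `spark.sql.cache.serializer` static conf.

```scala
import org.apache.spark.sql.SparkSession

// spark.sql.cache.serializer is a static conf, so it has to be set before the
// session (and the RAPIDS plugin) starts up.
val spark = SparkSession.builder()
  .appName("pcbs-example")
  .config("spark.plugins", "com.nvidia.spark.SQLPlugin")
  .config("spark.sql.cache.serializer",
    "com.nvidia.spark.rapids.shims.spark311.ParquetCachedBatchSerializer")
  .getOrCreate()

// With the serializer configured, cached data is stored as compressed Parquet
// and reads of it become candidates for the GPU InMemoryTableScanExec override.
val df = spark.range(0, 1000).toDF("id")
df.cache().count()
```

On Databricks the same two confs would typically go into the cluster's Spark config rather than application code, since the session is created for the notebook.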

Collaborator:
If it's possible it would be nice to have one class that just loads the proper shim version, but that is a separate issue.

Collaborator Author:
> This file shouldn't be needed now, right? The 311 shim version of this works. Or is the intention to keep the db one so that the user specifies this one? https://nvidia.github.io/spark-rapids/docs/additional-functionality/cache-serializer.html. If that is the case we need to update the docs.
>
> Originally the intention was that, since the user has to specify it, it should match the version of Spark they are using, so that it's hopefully the least confusing. I don't know how much it matters whether they specify spark311 vs spark311db.

If we add spark311db to the documentation we will then also have to add spark311cdh. I almost feel like they are all Spark 3.1.1, and since we aren't doing anything specific in the extended versions of the PCBS we should just get rid of them.

> If it's possible it would be nice to have one class that just loads the proper shim version, but that is a separate issue.

This is a good idea, I can look into it as a follow-on.

Collaborator Author:
@jlowe thoughts?

Member:
> If it's possible it would be nice to have one class that just loads the proper shim version, but that is a separate issue.

☝️ This. We should not have Spark-specific versions of user-visible classes unless they are truly required (e.g., as in the shuffle case, unfortunately). If we know one class will work going forward, as is the case with the main executor plugin, then we should strive to use a common class name without a Spark version in it. If this is indeed possible, we should deprecate the old 311 version and eventually remove it.

So it really all comes down to that question. If we can have a common version, my vote is to use the one class. We can change the package name and deprecate the existing spark311 package version in a new PR if it's too tricky to do in this one.

Collaborator Author:
OK, I can do this as a follow-on. In the interim, do we just update the doc, or remove the spark311db and spark311cdh versions of the serializer? I feel removing the db and cdh versions of the serializer is the way to go, since we will do more work as part of the follow-on.

Member:
If we're planning on removing these classes in the near future then we should not document them only to rip them out immediately afterward. Let's keep the number of classes to deprecate to a minimum.

}
@@ -21,6 +21,7 @@ import com.nvidia.spark.rapids._
import com.nvidia.spark.rapids.shims.spark311.Spark311Shims
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.rapids.shims.spark311.GpuInMemoryTableScanExec
import org.apache.spark.sql.rapids.shims.spark311db._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
@@ -30,6 +31,7 @@ import org.apache.spark.sql.catalyst.plans.JoinType
import org.apache.spark.sql.catalyst.plans.physical.{BroadcastMode, Partitioning}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution._
import org.apache.spark.sql.execution.columnar.InMemoryTableScanExec
import org.apache.spark.sql.execution.adaptive.ShuffleQueryStageExec
import org.apache.spark.sql.execution.exchange.{ENSURE_REQUIREMENTS, ShuffleExchangeExec}
import org.apache.spark.sql.execution.datasources.{FilePartition, HadoopFsRelation, PartitionDirectory, PartitionedFile}
@@ -166,6 +168,24 @@ class Spark311dbShims extends Spark311Shims {
wrapped.tableIdentifier)(conf)
}
}),
GpuOverrides.exec[InMemoryTableScanExec](
"Implementation of InMemoryTableScanExec to use GPU accelerated Caching",
ExecChecks((TypeSig.commonCudfTypes + TypeSig.DECIMAL + TypeSig.STRUCT).nested(),
TypeSig.all),
(scan, conf, p, r) => new SparkPlanMeta[InMemoryTableScanExec](scan, conf, p, r) {
override def tagPlanForGpu(): Unit = {
if (!scan.relation.cacheBuilder.serializer.isInstanceOf[ParquetCachedBatchSerializer]) {
willNotWorkOnGpu("ParquetCachedBatchSerializer is not being used")
}
}

/**
* Convert InMemoryTableScanExec to a GPU enabled version.
*/
override def convertToGpu(): GpuExec = {
GpuInMemoryTableScanExec(scan.attributes, scan.predicates, scan.relation)
}
}),
GpuOverrides.exec[SortMergeJoinExec](
"Sort merge join, replacing with shuffled hash join",
ExecChecks((TypeSig.commonCudfTypes + TypeSig.NULL + TypeSig.DECIMAL + TypeSig.ARRAY +