Split pytest by 'slow_test' tag and run from different k8s pods to reduce premerge job duration (NVIDIA#3241)

* Add a slow_test tag for pytest files that require a long duration or a large amount of memory

Signed-off-by: Alex Zhang <alex4zhang@gmail.com>

* Split pytest by 'slow_test' tag and run from different k8s pods to reduce premerge job duration

and keep only one Spark version of the unit tests for the premerge job

Signed-off-by: Alex Zhang <alex4zhang@gmail.com>

* Simplify description of 'slow_test' mark

Signed-off-by: Alex Zhang <alex4zhang@gmail.com>
Signed-off-by: Raza Jafri <rjafri@nvidia.com>
zhanga5 authored and razajafri committed Aug 23, 2021
1 parent d5b79e2 commit 5dbdbf2
Showing 4 changed files with 42 additions and 28 deletions.
1 change: 1 addition & 0 deletions integration_tests/pytest.ini
@@ -25,3 +25,4 @@ markers =
rapids_udf_example_native: test UDFs that require custom cuda compilation
validate_execs_in_gpu_plan([execs]): Exec class names to validate they exist in the GPU plan.
shuffle_test: Mark to include test in the RAPIDS Shuffle Manager
slow_test: Mark tests that run a long time or require a large amount of memory, to help split tests
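
For illustration, the marker added above can be combined with pytest's standard -m expression to include or exclude the tagged tests. This is only a sketch of the mechanism; in this project the integration tests are normally launched through run_pyspark_from_build.sh or Maven rather than bare pytest, as shown in the jenkins script changes below.

    # run only the tests marked with slow_test (module-level pytestmark or per-test decorator)
    pytest -m "slow_test" integration_tests/src/main/python

    # run everything except the slow tests
    pytest -m "not slow_test" integration_tests/src/main/python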
4 changes: 4 additions & 0 deletions integration_tests/src/main/python/join_test.py
@@ -20,6 +20,10 @@
from marks import ignore_order, allow_non_gpu, incompat, validate_execs_in_gpu_plan
from spark_session import with_cpu_session, with_spark_session


# Mark all tests in the current file as slow tests since they take ~30 minutes in total
pytestmark = pytest.mark.slow_test

all_gen = [StringGen(), ByteGen(), ShortGen(), IntegerGen(), LongGen(),
BooleanGen(), DateGen(), TimestampGen(), null_gen,
pytest.param(FloatGen(), marks=[incompat]),
3 changes: 3 additions & 0 deletions integration_tests/src/main/python/udf_test.py
@@ -40,6 +40,9 @@
import pandas as pd
from typing import Iterator, Tuple

# Mark all tests in the current file as slow tests since they require more memory than the others
pytestmark = pytest.mark.slow_test

arrow_udf_conf = {
'spark.sql.execution.arrow.pyspark.enabled': 'true',
'spark.rapids.sql.exec.WindowInPandasExec': 'true',
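
As a rough local-reproduction sketch of the split, both halves can be driven through run_pyspark_from_build.sh with the TEST_TAGS variable that the premerge script uses below. This assumes the plugin jars are already built and SPARK_HOME points at a matching Spark distribution; the TEST_TAGS="slow_test" variant is an extrapolation, since CI actually runs the slow half through Maven.

    # slow half (in CI this is run via 'mvn verify -Dpytest.TEST_TAGS="slow_test"' on one pod)
    TEST_TAGS="slow_test" TEST_TYPE="pre-commit" TEST_PARALLEL=5 ./integration_tests/run_pyspark_from_build.sh

    # remaining tests (this matches the unit_test() invocation on the other pod)
    TEST_TAGS="not slow_test" TEST_TYPE="pre-commit" TEST_PARALLEL=5 ./integration_tests/run_pyspark_from_build.sh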
62 changes: 34 additions & 28 deletions jenkins/spark-premerge-build.sh
@@ -35,24 +35,21 @@ mvn_verify() {
# file size check for pull request. The size of a committed file should be less than 1.5MiB
pre-commit run check-added-large-files --from-ref $BASE_REF --to-ref HEAD

ARTF_ROOT="$WORKSPACE/.download"
MVN_GET_CMD="mvn org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \
$MVN_URM_MIRROR -DremoteRepositories=$URM_URL \
-Ddest=$ARTF_ROOT"
# Only run the Python integration tests tagged with 'slow_test' here, since they require a long duration or a large amount of memory. This split helps
# balance test duration and memory consumption between the two k8s pods running in parallel, which execute 'mvn_verify()' and 'unit_test()' respectively.
mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TEST_TAGS="slow_test" \
-Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=5 -Dcuda.version=$CUDA_CLASSIFIER

rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT

# Download a full version of spark
$MVN_GET_CMD \
-DgroupId=org.apache -DartifactId=spark -Dversion=$SPARK_VER -Dclassifier=bin-hadoop3.2 -Dpackaging=tgz

export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2"
export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
rm -f $SPARK_HOME.tgz

mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TEST_TAGS='' \
-Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=3 -Dcuda.version=$CUDA_CLASSIFIER
# Run the unit tests for other Spark versions, but don't run the full Python integration tests
# NOT ALL TESTS NEEDED FOR PREMERGE
# Just test one 3.0.X version (the base version covers this) and one 3.1.X version.
# All other shims tests should be covered in the nightly pipelines
# Disabled until the Spark 3.2 source incompatibility is fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
# env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
# env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark303tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
# env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark304tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
# env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark312tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER

# The jacoco coverage should have been collected, but because of how the shade plugin
# works and jacoco we need to clean some things up so jacoco will only report for the
@@ -96,24 +93,33 @@ rapids_shuffle_smoke_test() {
}

unit_test() {
echo "Run unit testings..."
# Run the unit tests for other Spark versions, but don't run the full Python integration tests
# NOT ALL TESTS NEEDED FOR PREMERGE
# Just test one 3.0.X version (the base version covers this) and one 3.1.X version.
# All other shims tests should be covered in the nightly pipelines
# Disabled until the Spark 3.2 source incompatibility is fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
#env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark303tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark304tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark312tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
# TODO: this function should be named 'integration_test()', but renaming it would break backward compatibility. Need to find a way to fix this.
echo "Run integration tests..."
mvn -U -B $MVN_URM_MIRROR clean package -DskipTests=true -Dcuda.version=$CUDA_CLASSIFIER
TEST_TAGS="not slow_test" TEST_TYPE="pre-commit" TEST_PARALLEL=5 ./integration_tests/run_pyspark_from_build.sh
}


nvidia-smi

. jenkins/version-def.sh

ARTF_ROOT="$WORKSPACE/.download"
MVN_GET_CMD="mvn org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \
$MVN_URM_MIRROR -DremoteRepositories=$URM_URL \
-Ddest=$ARTF_ROOT"

rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT

# Download a full version of spark
$MVN_GET_CMD \
-DgroupId=org.apache -DartifactId=spark -Dversion=$SPARK_VER -Dclassifier=bin-hadoop3.2 -Dpackaging=tgz

export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2"
export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
rm -f $SPARK_HOME.tgz

case $BUILD_TYPE in

all)
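
Taken together, the two k8s pods now split the premerge work roughly as follows. This is a simplified sketch of the commands in the diff above, with the Maven profiles and most flags trimmed.

    # pod 1 (mvn_verify): build and verify, running only the 'slow_test' Python integration tests,
    # plus the single extra Spark version unit test (spark313tests)
    mvn -U -B clean verify -Dpytest.TEST_TAGS="slow_test" -Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=5

    # pod 2 (unit_test): package without tests, then run the remaining integration tests
    mvn -U -B clean package -DskipTests=true
    TEST_TAGS="not slow_test" TEST_TYPE="pre-commit" TEST_PARALLEL=5 ./integration_tests/run_pyspark_from_build.sh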
