
Commit

Enable parallel build for pre-merge job to reduce overall duration [skip ci] (NVIDIA#3028)

* Enable parallel build for pre-merge job to reduce overall duration

and upload logs to GitHub only for the failed stage (a minimal pipeline sketch follows this commit message)

Signed-off-by: Alex Zhang <alex4zhang@gmail.com>

* Refined based on review comments

* Parameterize the premerge build script to support running unit tests independently and other cases (see the usage sketch after this commit message)

Signed-off-by: Alex Zhang <alex4zhang@gmail.com>

* Refine some comments per review feedback

Signed-off-by: Alex Zhang <alex4zhang@gmail.com>
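
The heart of the change is the declarative parallel/failFast pattern visible in the Jenkinsfile diff below. A minimal sketch of that pattern, assuming placeholder node labels, a PREMERGE_SCRIPT variable, and archiveArtifacts as a stand-in for the githubHelper log upload (the real pipeline also locks a GPU pool per stage):

pipeline {
    agent none
    stages {
        stage('Premerge Test') {
            failFast true  // abort the surviving stage as soon as its sibling fails
            parallel {
                stage('mvn verify') {
                    agent { label 'gpu' }  // placeholder; the real job allocates a locked GPU pod
                    steps {
                        sh "$PREMERGE_SCRIPT mvn_verify"  // build + integration tests
                    }
                }
                stage('Unit Test') {
                    agent { label 'gpu' }  // placeholder
                    steps {
                        sh "$PREMERGE_SCRIPT unit_test"   // unit tests for the extra Spark shims
                    }
                }
            }
        }
    }
    post {
        failure {
            // upload logs only for failed runs; stand-in for githubHelper.uploadParallelLogs
            archiveArtifacts artifacts: '**/*.log', allowEmptyArchive: true
        }
    }
}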
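
The second half of the change makes jenkins/spark-premerge-build.sh dispatch on its first argument, so each parallel stage can run just its own slice of the work. A usage sketch (the bash prefix and the surrounding sh steps are illustrative; the argument values come from the case statement in the diff):

sh 'bash jenkins/spark-premerge-build.sh'             // no argument: BUILD_TYPE defaults to all, runs mvn_verify then unit_test
sh 'bash jenkins/spark-premerge-build.sh mvn_verify'  // build plus integration tests only
sh 'bash jenkins/spark-premerge-build.sh ut'          // unit tests only ("ut" or "unit_test")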
zhanga5 committed Aug 9, 2021
1 parent f20d465 commit 5eee291
Showing 2 changed files with 156 additions and 59 deletions.
94 changes: 72 additions & 22 deletions jenkins/Jenkinsfile-blossom.premerge
@@ -117,6 +117,8 @@ pipeline {
]
)

stash(name: "source_tree", includes: "**")

container('docker-build') {
// check if pre-merge dockerfile modified
def dockerfileModified = sh(returnStdout: true,
@@ -146,7 +148,6 @@
}
} // end of Build docker image

// TODO: support parallel testing for different spark versions
stage('Premerge Test') {
when {
beforeAgent true
@@ -155,29 +156,72 @@
}
}

agent {
kubernetes {
label "premerge-test-${BUILD_TAG}"
cloud 'sc-ipp-blossom-prod'
yaml "$pluginPremerge"
workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false)
customWorkspace "${CUSTOM_WORKSPACE}"
}
}
// Run mvn verify (build and integration tests) and unit tests (for multiple Spark versions) in parallel.
// If either one fails, abort the other (if it has not finished yet) and upload the failure log to GitHub.
failFast true
parallel {
stage('mvn verify') {
options {
// We have to use params to pass the resource label in the options block;
// this is a limitation of declarative pipelines, and we need to lock the resource before the agent starts
lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
}
agent {
kubernetes {
label "premerge-test-it-${BUILD_TAG}"
cloud 'sc-ipp-blossom-prod'
yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi
workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false)
customWorkspace "${CUSTOM_WORKSPACE}"
}
}

steps {
script {
container('gpu') {
sh "$PREMERGE_SCRIPT"
step([$class : 'JacocoPublisher',
execPattern : '**/target/jacoco.exec',
classPattern : 'target/jacoco_classes/',
sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark312/src/main/scala/',
sourceInclusionPattern: '**/*.java,**/*.scala'
])
steps {
script {
container('gpu') {
timeout(time: 4, unit: 'HOURS') { // step only timeout for test run
sh "$PREMERGE_SCRIPT mvn_verify"
step([$class : 'JacocoPublisher',
execPattern : '**/target/jacoco.exec',
classPattern : 'target/jacoco_classes/',
sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/',
sourceInclusionPattern: '**/*.java,**/*.scala'
])
}
}
}
}
}
}
} // end of mvn verify stage

stage('Unit Test') {
options {
// We have to use params to pass the resource label in the options block;
// this is a limitation of declarative pipelines, and we need to lock the resource before the agent starts
lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
}
agent {
kubernetes {
label "premerge-test-ut-${BUILD_TAG}"
cloud 'sc-ipp-blossom-prod'
yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi
workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false)
customWorkspace "${CUSTOM_WORKSPACE}-ut" // Use different workspace to avoid conflict with IT
}
}

steps {
script {
unstash "source_tree"

container('gpu') {
timeout(time: 4, unit: 'HOURS') {
sh "$PREMERGE_SCRIPT unit_test"
}
}
}
}
} // end of Unit Test stage
} // end of parallel
} // end of Premerge Test
} // end of stages

@@ -191,6 +235,12 @@ pipeline {
if (currentBuild.currentResult == "SUCCESS") {
githubHelper.updateCommitStatus("$BUILD_URL", "Success", GitHubCommitState.SUCCESS)
} else {
// upload log only in case of build failure
def guardWords = ["gitlab.*?\\.com", "urm.*?\\.com"]
guardWords.add("nvidia-smi(?s)(.*?)(?=jenkins/version-def.sh)") // hide GPU info

githubHelper.uploadParallelLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords)

githubHelper.updateCommitStatus("$BUILD_URL", "Fail", GitHubCommitState.FAILURE)
}

121 changes: 84 additions & 37 deletions jenkins/spark-premerge-build.sh
@@ -17,44 +17,91 @@

set -ex

BUILD_TYPE=all

if [[ $# -eq 1 ]]; then
BUILD_TYPE=$1

elif [[ $# -gt 1 ]]; then
echo "ERROR: too many parameters are provided"
exit 1
fi


mvn_verify() {
echo "Run mvn verify..."
# get merge BASE from merged pull request. Log message e.g. "Merge HEAD into BASE"
BASE_REF=$(git --no-pager log --oneline -1 | awk '{ print $NF }')
# file size check for pull request. The size of a committed file should be less than 1.5MiB
pre-commit run check-added-large-files --from-ref $BASE_REF --to-ref HEAD

ARTF_ROOT="$WORKSPACE/.download"
MVN_GET_CMD="mvn org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \
$MVN_URM_MIRROR -DremoteRepositories=$URM_URL \
-Ddest=$ARTF_ROOT"

rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT

# Download a full version of spark
$MVN_GET_CMD \
-DgroupId=org.apache -DartifactId=spark -Dversion=$SPARK_VER -Dclassifier=bin-hadoop3.2 -Dpackaging=tgz

export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2"
export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
rm -f $SPARK_HOME.tgz

mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TEST_TAGS='' \
-Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CUDA_CLASSIFIER

# The jacoco coverage should have been collected, but because of how the shade plugin
# works and jacoco we need to clean some things up so jacoco will only report for the
# things we care about
mkdir -p target/jacoco_classes/
FILE=$(ls dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)
pushd target/jacoco_classes/
jar xf $FILE
rm -rf com/nvidia/shaded/ org/openucx/
popd
}


unit_test() {
echo "Run unit testings..."
# Run the unit tests for other Spark versions but dont run full python integration tests
# NOT ALL TESTS NEEDED FOR PREMERGE
# Just test one 3.0.X version (base version covers this) and one 3.1.X version.
# All others shims test should be covered in nightly pipelines
# Disabled until Spark 3.2 source incompatibility fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
#env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark303tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark304tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark312tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
}


nvidia-smi

. jenkins/version-def.sh

ARTF_ROOT="$WORKSPACE/.download"
MVN_GET_CMD="mvn org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \
$MVN_URM_MIRROR -DremoteRepositories=$URM_URL \
-Ddest=$ARTF_ROOT"

rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT

# Download a full version of spark
$MVN_GET_CMD \
-DgroupId=org.apache -DartifactId=spark -Dversion=$SPARK_VER -Dclassifier=bin-hadoop3.2 -Dpackaging=tgz

export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2"
export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
rm -f $SPARK_HOME.tgz

mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TEST_TAGS='' \
-Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CUDA_CLASSIFIER
# Run the unit tests for other Spark versions but dont run full python integration tests
# NOT ALL TESTS NEEDED FOR PREMERGE
#env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark301tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark302tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark303tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark311tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark312tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
# Disabled until Spark 3.2 source incompatibility fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
#env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER

# The jacoco coverage should have been collected, but because of how the shade plugin
# works and jacoco we need to clean some things up so jacoco will only report for the
# things we care about
mkdir -p target/jacoco_classes/
FILE=$(ls dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)
pushd target/jacoco_classes/
jar xf $FILE
rm -rf com/nvidia/shaded/ org/openucx/
popd
case $BUILD_TYPE in

all)
echo "Run all testings..."
mvn_verify
unit_test
;;

mvn_verify)
mvn_verify
;;

ut | unit_test)
unit_test
;;

*)
echo "ERROR: unknown parameter: $BUILD_TYPE"
;;
esac
