Enable parallel build for pre-merge job to reduce overall duration [skip ci] #3028

Merged (4 commits) on Jul 28, 2021
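In short, the pre-merge test stage is split into two stages, 'mvn verify' and 'Unit Test', that run concurrently under a declarative parallel block. Each stage locks its own GPU resource, runs in its own Kubernetes pod, and invokes the pre-merge script with a build-type argument. A condensed sketch of the structure the diff below introduces (bodies abbreviated for illustration, not the exact Jenkinsfile):

    stage('Premerge Test') {
        failFast true  // a failure in either branch aborts the other
        parallel {
            stage('mvn verify') {
                options { lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE') }
                agent { kubernetes { /* GPU pod, workspace ${CUSTOM_WORKSPACE} */ } }
                steps { script { sh "$PREMERGE_SCRIPT mvn_verify" } }
            }
            stage('Unit Test') {
                options { lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE') }
                agent { kubernetes { /* GPU pod, workspace ${CUSTOM_WORKSPACE}-ut to avoid clashes */ } }
                steps { script { unstash "source_tree"; sh "$PREMERGE_SCRIPT unit_test" } }
            }
        }
    }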
97 changes: 67 additions & 30 deletions jenkins/Jenkinsfile-blossom.premerge
@@ -116,6 +116,8 @@ pipeline {
]
)

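// Stash the whole source tree so the parallel Unit Test stage, which runs in a separate pod and workspace, can restore it with unstash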
stash(name: "source_tree", includes: "**")

container('docker-build') {
// check if pre-merge dockerfile modified
def dockerfileModified = sh(returnStdout: true,
@@ -151,37 +153,72 @@ pipeline {
!skipped
}
}
options {
// We have to use params to pass the resource label in the options block;
// this is a limitation of declarative pipelines, and we need to lock the resource before the agent starts
lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
}
agent {
kubernetes {
label "premerge-test-${BUILD_TAG}"
cloud 'sc-ipp-blossom-prod'
yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi
workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false)
customWorkspace "${CUSTOM_WORKSPACE}"
}
}

steps {
script {
container('gpu') {
// TODO: improve resource management
timeout(time: 4, unit: 'HOURS') { // step only timeout for test run
sh "$PREMERGE_SCRIPT"
step([$class : 'JacocoPublisher',
execPattern : '**/target/jacoco.exec',
classPattern : 'target/jacoco_classes/',
sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/',
sourceInclusionPattern: '**/*.java,**/*.scala'
])
failFast true
parallel {
stage('mvn verify') {
options {
// We have to use params to pass the resource label in the options block;
// this is a limitation of declarative pipelines, and we need to lock the resource before the agent starts
lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
}
agent {
kubernetes {
label "premerge-test-it-${BUILD_TAG}"
cloud 'sc-ipp-blossom-prod'
yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi
workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false)
customWorkspace "${CUSTOM_WORKSPACE}"
}
}
}
}

steps {
script {
container('gpu') {
// TODO: improve resource management
timeout(time: 4, unit: 'HOURS') { // step only timeout for test run
sh "$PREMERGE_SCRIPT mvn_verify"
step([$class : 'JacocoPublisher',
execPattern : '**/target/jacoco.exec',
classPattern : 'target/jacoco_classes/',
sourcePattern : 'shuffle-plugin/src/main/scala/,udf-compiler/src/main/scala/,sql-plugin/src/main/java/,sql-plugin/src/main/scala/,shims/spark311/src/main/scala/,shims/spark301db/src/main/scala/,shims/spark301/src/main/scala/,shims/spark302/src/main/scala/,shims/spark303/src/main/scala/,shims/spark304/src/main/scala/,shims/spark312/src/main/scala/,shims/spark313/src/main/scala/',
sourceInclusionPattern: '**/*.java,**/*.scala'
])
}
}
}
}
} // end of mvn verify stage

stage('Unit Test') {
options {
// We have to use params to pass the resource label in the options block;
// this is a limitation of declarative pipelines, and we need to lock the resource before the agent starts
lock(label: "${params.GPU_POOL}", quantity: 1, variable: 'GPU_RESOURCE')
}
agent {
kubernetes {
label "premerge-test-ut-${BUILD_TAG}"
cloud 'sc-ipp-blossom-prod'
yaml pod.getGPUYAML("${IMAGE_PREMERGE}", "${env.GPU_RESOURCE}", '8', '32Gi') // cpu: 8, memory: 32Gi
workspaceVolume persistentVolumeClaimWorkspaceVolume(claimName: "${PVC}", readOnly: false)
customWorkspace "${CUSTOM_WORKSPACE}-ut" // Use different workspace to avoid conflict with IT
}
}

steps {
script {
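// Restore the source tree stashed right after the initial checkout, since this stage uses a fresh workspace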
unstash "source_tree"

container('gpu') {
timeout(time: 4, unit: 'HOURS') {
sh "$PREMERGE_SCRIPT unit_test"
}
}
}
}
} // end of Unit Test stage
} // end of parallel
} // end of Premerge Test
} // end of stages

@@ -197,9 +234,9 @@ pipeline {
} else {
// upload log only in case of build failure
def guardWords = ["gitlab.*?\\.com", "urm.*?\\.com"]
def logPattern = "### BEGIN OF TEST LOG ###.*### END OF TEST LOG ###"
guardWords.add("nvidia-smi(?s)(.*?)(?=jenkins/version-def.sh)") // hide GPU info

githubHelper.uploadPartialLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords, logPattern)
githubHelper.uploadParallelLogs(this, env.JOB_NAME, env.BUILD_NUMBER, null, guardWords)

githubHelper.updateCommitStatus("$BUILD_URL", "Fail", GitHubCommitState.FAILURE)
}
127 changes: 82 additions & 45 deletions jenkins/spark-premerge-build.sh
@@ -17,54 +17,91 @@

set -ex

nvidia-smi
BUILD_TYPE=all

if [[ $# -eq 1 ]]; then
BUILD_TYPE=$1

elif [[ $# -gt 1 ]]; then
echo "ERROR: too many parameters are provided"
exit 1
fi


mvn_verify() {
echo "Run mvn verify..."
# get merge BASE from merged pull request. Log message e.g. "Merge HEAD into BASE"
BASE_REF=$(git --no-pager log --oneline -1 | awk '{ print $NF }')
# file size check for pull request. The size of a committed file should be less than 1.5MiB
pre-commit run check-added-large-files --from-ref $BASE_REF --to-ref HEAD

ARTF_ROOT="$WORKSPACE/.download"
MVN_GET_CMD="mvn org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \
$MVN_URM_MIRROR -DremoteRepositories=$URM_URL \
-Ddest=$ARTF_ROOT"

rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT

# Download a full version of spark
$MVN_GET_CMD \
-DgroupId=org.apache -DartifactId=spark -Dversion=$SPARK_VER -Dclassifier=bin-hadoop3.2 -Dpackaging=tgz

export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2"
export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
rm -f $SPARK_HOME.tgz

mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TEST_TAGS='' \
-Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CUDA_CLASSIFIER

function on_exit {
echo '### END OF TEST LOG ###'
# The jacoco coverage should have been collected, but because of how the shade plugin
# and jacoco interact, we need to clean some things up so jacoco will only report on the
# things we care about
mkdir -p target/jacoco_classes/
FILE=$(ls dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)
pushd target/jacoco_classes/
jar xf $FILE
rm -rf com/nvidia/shaded/ org/openucx/
popd
}
trap on_exit EXIT

echo '### BEGIN OF TEST LOG ###'

unit_test() {
echo "Run unit testings..."
# Run the unit tests for other Spark versions but dont run full python integration tests
# NOT ALL TESTS NEEDED FOR PREMERGE
# Just test one 3.0.X version (base version covers this) and one 3.1.X version.
# All others shims test should be covered in nightly pipelines
# Disabled until Spark 3.2 source incompatibility fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
#env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark303tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark304tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark312tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
}


nvidia-smi

. jenkins/version-def.sh

# get merge BASE from merged pull request. Log message e.g. "Merge HEAD into BASE"
BASE_REF=$(git --no-pager log --oneline -1 | awk '{ print $NF }')
# file size check for pull request. The size of a committed file should be less than 1.5MiB
pre-commit run check-added-large-files --from-ref $BASE_REF --to-ref HEAD

ARTF_ROOT="$WORKSPACE/.download"
MVN_GET_CMD="mvn org.apache.maven.plugins:maven-dependency-plugin:2.8:get -B \
$MVN_URM_MIRROR -DremoteRepositories=$URM_URL \
-Ddest=$ARTF_ROOT"

rm -rf $ARTF_ROOT && mkdir -p $ARTF_ROOT

# Download a full version of spark
$MVN_GET_CMD \
-DgroupId=org.apache -DartifactId=spark -Dversion=$SPARK_VER -Dclassifier=bin-hadoop3.2 -Dpackaging=tgz

export SPARK_HOME="$ARTF_ROOT/spark-$SPARK_VER-bin-hadoop3.2"
export PATH="$SPARK_HOME/bin:$SPARK_HOME/sbin:$PATH"
tar zxf $SPARK_HOME.tgz -C $ARTF_ROOT && \
rm -f $SPARK_HOME.tgz

mvn -U -B $MVN_URM_MIRROR '-P!snapshot-shims,pre-merge' clean verify -Dpytest.TEST_TAGS='' \
-Dpytest.TEST_TYPE="pre-commit" -Dpytest.TEST_PARALLEL=4 -Dcuda.version=$CUDA_CLASSIFIER
# Run the unit tests for other Spark versions, but don't run the full Python integration tests
# NOT ALL TESTS NEEDED FOR PREMERGE
# Just test one 3.0.X version (the base version covers this) and one 3.1.X version.
# All other shims tests should be covered in the nightly pipelines
env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark313tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER
# Disabled until Spark 3.2 source incompatibility fixed, see https://github.com/NVIDIA/spark-rapids/issues/2052
#env -u SPARK_HOME mvn -U -B $MVN_URM_MIRROR -Pspark320tests,snapshot-shims test -Dpytest.TEST_TAGS='' -Dcuda.version=$CUDA_CLASSIFIER

# The jacoco coverage should have been collected, but because of how the shade plugin
# and jacoco interact, we need to clean some things up so jacoco will only report on the
# things we care about
mkdir -p target/jacoco_classes/
FILE=$(ls dist/target/rapids-4-spark_2.12-*.jar | grep -v test | xargs readlink -f)
pushd target/jacoco_classes/
jar xf $FILE
rm -rf com/nvidia/shaded/ org/openucx/
popd
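# Dispatch on the requested build type; the default 'all' preserves the previous serial behavior (mvn_verify, then unit_test)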
case $BUILD_TYPE in

all)
echo "Run all testings..."
mvn_verify
unit_test
;;

mvn_verify)
mvn_verify
;;

ut | unit_test)
unit_test
;;

*)
echo "ERROR: unknown parameter: $BUILD_TYPE"
exit 1 # fail explicitly so an unknown argument doesn't let the job pass silently
;;
esac
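For reference, a sketch of how the script can now be invoked (assuming the environment the Jenkins job provides, e.g. WORKSPACE, MVN_URM_MIRROR, URM_URL, SPARK_VER, and CUDA_CLASSIFIER, is already set):

    jenkins/spark-premerge-build.sh              # default 'all': mvn_verify followed by unit_test
    jenkins/spark-premerge-build.sh mvn_verify   # Maven build/verify and jacoco coverage only
    jenkins/spark-premerge-build.sh unit_test    # cross-version unit tests only ('ut' also works)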