Implement back off for jobs that are continually failing
Also track GHA triggers so that they get back off too when they have been triggered many times (i.e., repeatedly failed to push to staging)
yosifkit committed Nov 20, 2024
1 parent 2d86436 commit f8916c4
Showing 2 changed files with 125 additions and 72 deletions.
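For context, the renamed past-jobs.json artifact now tracks every image that needed building (not just failures), so the trigger can both count attempts and record how often an entry has been skipped. A rough sketch of the per-buildId shape, assembled from the fields used in the diff below (the buildId, tag, timestamp, and URL values are invented for illustration):

{
  "example-build-id": {
    "count": 3,
    "skips": 0,
    "identifier": "hello-world:latest",
    "lastTime": 1732060800,
    "url": "https://jenkins.example.test/job/trigger/123/"
  }
}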
145 changes: 73 additions & 72 deletions Jenkinsfile.trigger
@@ -13,9 +13,9 @@ env.BASHBREW_ARCH = env.JOB_NAME.minus('/trigger').split('/')[-1] // "windows-am
def queue = []
def breakEarly = false // thanks Jenkins...

// this includes the number of attempts per failing buildId
// { buildId: { "count": 1, ... }, ... }
def pastFailedJobsJson = '{}'
// JSON string of all images needing build and whether they were skipped this time; recorded after the queue completes
// { buildId: { "count": 1, "skips": 0, ... }, ... }
def currentJobsJson = ''

node {
stage('Checkout') {
@@ -36,74 +36,72 @@ node {
[$class: 'RelativeTargetDirectory', relativeTargetDir: 'meta'],
],
))
pastFailedJobsJson = sh(returnStdout: true, script: '''#!/usr/bin/env bash
set -Eeuo pipefail -x
if ! json="$(wget --timeout=5 -qO- "$JOB_URL/lastSuccessfulBuild/artifact/pastFailedJobs.json")"; then
echo >&2 'failed to get pastFailedJobs.json'
json='{}'
fi
jq <<<"$json" '.'
''').trim()
}

dir('meta') {
def queueJson = ''
stage('Queue') {
withEnv([
'pastFailedJobsJson=' + pastFailedJobsJson,
]) {
// using pastFailedJobsJson, sort the needs_build queue so that failing builds always live at the bottom of the queue
queueJson = sh(returnStdout: true, script: '''
jq -L.scripts '
include "meta";
include "jenkins";
(env.pastFailedJobsJson | fromjson) as $pastFailedJobs
| [
.[]
| select(
needs_build
and .build.arch == env.BASHBREW_ARCH
)
| if .build.arch | IN("amd64", "i386", "windows-amd64") then
# "GHA" architectures (anything we add a "gha_payload" to will be run on GHA in the queue)
.gha_payload = (gha_payload | @json)
else . end
]
# this Jenkins job exports a JSON file that includes the number of attempts so far per failing buildId so that this can sort by attempts which means failing builds always live at the bottom of the queue (sorted by the number of times they have failed, so the most failing is always last)
| sort_by($pastFailedJobs[.buildId].count // 0)
' builds.json
''').trim()
// using pastJobsJson, sort the needs_build queue so that previously attempted builds always live at the bottom of the queue
// list of builds that have been failing and will be skipped this trigger
def queueAndFailsJson = sh(returnStdout: true, script: '''
if \\
! wget --timeout=5 -qO past-jobs.json "$JOB_URL/lastSuccessfulBuild/artifact/past-jobs.json" \\
|| ! jq 'empty' past-jobs.json \\
; then
# temporary migration of old data
if ! wget --timeout=5 -qO past-jobs.json "$JOB_URL/lastSuccessfulBuild/artifact/pastFailedJobs.json" || ! jq 'empty' past-jobs.json; then
echo '{}' > past-jobs.json
fi
fi
jq -c -L.scripts --slurpfile pastJobs past-jobs.json '
include "jenkins";
get_arch_queue as $rawQueue
| $rawQueue | jobs_record($pastJobs[0]) as $newJobs
| $rawQueue | filter_skips_queue($newJobs) as $filteredQueue
| (
($rawQueue | length) - ($filteredQueue | length)
) as $skippedCount
# queue, skips/builds record, number of skipped items
| $filteredQueue, $newJobs, $skippedCount
' builds.json
''').tokenize('\r\n')

def queueJson = queueAndFailsJson[0]
currentJobsJson = queueAndFailsJson[1]
def skips = queueAndFailsJson[2]
//echo(queueJson)

def jobName = ''
if (queueJson && queueJson != '[]') {
queue = readJSON(text: queueJson)
jobName += 'queue: ' + queue.size()
} else {
jobName += 'queue: 0'
breakEarly = true
}
}
if (queueJson && queueJson != '[]') {
queue = readJSON(text: queueJson)
currentBuild.displayName = 'queue size: ' + queue.size() + ' (#' + currentBuild.number + ')'
} else {
currentBuild.displayName = 'empty queue (#' + currentBuild.number + ')'
breakEarly = true
return
if (skips > 0) {
jobName += ' skip: ' + skips
// queue to build might be empty, but we still need to record these skipped builds
breakEarly = false
}
currentBuild.displayName = jobName + ' (#' + currentBuild.number + ')'
}
}
}
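Because the jq script above runs with -c and prints three values, the Groovy side reads them back as three newline-separated lines, which tokenize('\r\n') splits into the filtered queue, the jobs record, and the skip count. A hedged sketch of what that raw output could look like for a one-item queue plus one skipped build (buildIds, tags, and counts invented):

[{"buildId":"aaa","identifier":"hello-world:latest","build":{"arch":"amd64"}}]
{"aaa":{"count":2,"skips":0,"identifier":"hello-world:latest"},"bbb":{"count":30,"skips":7,"identifier":"busybox:latest"}}
1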

// with an empty queue and nothing to skip we can end early
if (breakEarly) { return } // thanks Jenkins...

// now that we have our parsed queue, we can release the node we're holding up (since we handle GHA builds above)
def pastFailedJobs = readJSON(text: pastFailedJobsJson)
def newFailedJobs = [:]
// new data to be added to the past-jobs.json
// { lastTime: unixTimestamp, url: "" }
def buildCompletionData = [:]

for (buildObj in queue) {
def identifier = buildObj.source.arches[buildObj.build.arch].tags[0]
if (buildObj.build.arch != env.BASHBREW_ARCH) {
identifier += ' (' + buildObj.build.arch + ')'
}
stage(identifier) {
def json = writeJSON(json: buildObj, returnText: true)
echo(json) // for debugging/data purposes
stage(buildObj.identifier) {
//def json = writeJSON(json: buildObj, returnText: true)
//echo(json) // for debugging/data purposes

// "catchError" to set "stageResult" :(
catchError(message: 'Build of "' + identifier + '" failed', buildResult: 'UNSTABLE', stageResult: 'FAILURE') {
catchError(message: 'Build of "' + buildObj.identifier + '" failed', buildResult: 'UNSTABLE', stageResult: 'FAILURE') {
if (buildObj.gha_payload) {
node {
withEnv([
@@ -129,6 +127,11 @@ for (buildObj in queue) {
'''
}
}
// record that GHA was triggered (for tracking continued triggers that fail to push an image)
buildCompletionData[buildObj.buildId] = [
lastTime: System.currentTimeMillis() / 1000, // convert to seconds
url: currentBuild.absoluteUrl,
]
}
} else {
def res = build(
@@ -139,39 +142,37 @@
propagate: false,
quietPeriod: 5, // seconds
)
// record the job failure
buildCompletionData[buildObj.buildId] = [
lastTime: (res.startTimeInMillis + res.duration) / 1000, // convert to seconds
url: res.absoluteUrl,
]
if (res.result != 'SUCCESS') {
def c = 1
if (pastFailedJobs[buildObj.buildId]) {
// TODO more defensive access of .count? (it is created just below, so it should be safe)
c += pastFailedJobs[buildObj.buildId].count
}
// TODO maybe implement some amount of backoff? keep first url/endTime?
newFailedJobs[buildObj.buildId] = [
count: c,
identifier: identifier,
url: res.absoluteUrl,
endTime: (res.startTimeInMillis + res.duration) / 1000.0, // convert to seconds
]
// set stage result via catchError
error(res.result)
}
}
}
}
}

// save newFailedJobs so we can use it next run as pastFailedJobs
// save currentJobs so we can use it next run as pastJobs
node {
def newFailedJobsJson = writeJSON(json: newFailedJobs, returnText: true)
def buildCompletionDataJson = writeJSON(json: buildCompletionData, returnText: true)
withEnv([
'newFailedJobsJson=' + newFailedJobsJson,
'buildCompletionDataJson=' + buildCompletionDataJson,
'currentJobsJson=' + currentJobsJson,
]) {
stage('Archive') {
dir('builds') {
deleteDir()
sh '''#!/usr/bin/env bash
set -Eeuo pipefail -x
jq <<<"$newFailedJobsJson" '.' | tee pastFailedJobs.json
jq <<<"$currentJobsJson" '
# merge the two objects recursively, preferring data from "buildCompletionDataJson"
. * ( env.buildCompletionDataJson | fromjson )
' | tee past-jobs.json
'''
archiveArtifacts(
artifacts: '*.json',
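Note on the archive step: jq's * operator merges objects recursively, so the completion data recorded during this run only overrides the keys it actually carries (lastTime, url) while the count/skips bookkeeping from queue time is preserved. A minimal standalone sketch of that behavior, with invented values:

jq -n '
{ "example-build-id": { count: 5, skips: 0, identifier: "hello-world:latest" } }
* { "example-build-id": { lastTime: 1732060800, url: "https://jenkins.example.test/job/worker/42/" } }
'
# => { "example-build-id": { "count": 5, "skips": 0, "identifier": "hello-world:latest", "lastTime": 1732060800, "url": "https://jenkins.example.test/job/worker/42/" } }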
52 changes: 52 additions & 0 deletions jenkins.jq
@@ -1,3 +1,5 @@
include "meta";

# input: list of build objects i.e., builds.json
# output: stream of crane copy command strings
def crane_deploy_commands:
@@ -47,3 +49,53 @@ def gha_payload:
)
}
;

# input: full "build" object list (with "buildId" top level key)
# output: filtered build list { "buildId value": { build object } }
def get_arch_queue($arch):
map_values(
select(
needs_build
and .build.arch == $arch
)
| if .build.arch | IN("amd64", "i386", "windows-amd64") then
# "GHA" architectures (anything we add a "gha_payload" to will be run on GHA in the queue)
.gha_payload = (gha_payload | @json)
else . end
| .identifier = .source.arches[.build.arch].tags[0]
)
;
def get_arch_queue:
get_arch_queue(env.BASHBREW_ARCH)
;
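Assuming builds.json maps each buildId to a full build object (as the meta/jenkins includes expect), get_arch_queue keeps only entries for the current architecture and annotates them with a JSON-encoded gha_payload (on GHA-capable arches) and a human-readable identifier. An illustrative output shape for BASHBREW_ARCH=amd64, with buildId, tags, and payload contents invented and other keys of the build object omitted:

{
  "example-build-id": {
    "build": { "arch": "amd64" },
    "source": { "arches": { "amd64": { "tags": ["hello-world:latest"] } } },
    "gha_payload": "{\"ref\":\"...\"}",
    "identifier": "hello-world:latest"
  }
}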

# input: filtered "needs_build" build object list, like from get_arch_queue
# output: simplified list of builds with record of (build/trigger) count and number of current skips
def jobs_record($pastJobs):
map_values(
.identifier as $identifier
| $pastJobs[.buildId] // { count: 0, skips: 0 }
| .identifier = $identifier
# start skipping after 24 attempts, try once every 24 skips
| if .count > 24 and .skips < 24 then
.skips += 1
else
# these ones should be built
.skips = 0
| .count += 1
end
)
;
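The practical effect of those thresholds is easiest to see on a single entry. Assuming a build that keeps failing (and therefore stays in needs_build) on every trigger run, its record would evolve roughly like this:

# run 1:   { count: 1,  skips: 0 }   -> attempted
# run 25:  { count: 25, skips: 0 }   -> attempted
# run 26:  { count: 25, skips: 1 }   -> skipped (count is now > 24)
# run 49:  { count: 25, skips: 24 }  -> skipped
# run 50:  { count: 26, skips: 0 }   -> retried once, then the skip cycle starts again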

# input: filtered "needs_build" build object list, like from get_arch_queue
# newJobs list, output of jobs_record: used for filtering and sorting the queue
# output: sorted build queue with skipped items removed
def filter_skips_queue($newJobs):
map(
select(
$newJobs[.buildId].skips == 0
)
)
# this Jenkins job exports a JSON file with the number of attempts so far per buildId; sorting by attempts keeps failing builds at the bottom of the queue (the most-failed build is always last)
| sort_by($newJobs[.buildId].count)
;
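Putting jobs_record and filter_skips_queue together, the queue handling collapses to a select on skips == 0 plus a sort_by on count. A self-contained jq sketch of just that filter/sort step (buildIds and counts invented; the real pipeline goes through builds.json and the includes above):

jq -n '
{ aaa: { count: 2, skips: 0 }, bbb: { count: 30, skips: 3 }, ccc: { count: 9, skips: 0 } } as $newJobs
| [ { buildId: "aaa" }, { buildId: "bbb" }, { buildId: "ccc" } ]
| map(select($newJobs[.buildId].skips == 0))
| sort_by($newJobs[.buildId].count)
'
# => [ { "buildId": "aaa" }, { "buildId": "ccc" } ]   ("bbb" is backed off; least-failed builds first)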
