Implement back off for jobs that are continually failing
Also track GHA triggers so that they can back off too when they have triggered many times (i.e., failed to push to staging)
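
In rough terms, the back off added here works as follows. This is a minimal Groovy sketch of the policy encoded by the jq filters in the diff below; the helper names shouldSkip and sortQueue are illustrative only and are not part of the commit:

	// Sketch only: a failing build is skipped once it has failed more than 24 times,
	// then retried once after it has been skipped 24 times; builds that stay in the
	// queue are sorted by failure count so the most-failing entries land at the end.
	boolean shouldSkip(Map failRecord) {
		if (failRecord == null) { return false } // never failed: always queue it
		return failRecord.count > 24 && failRecord.skips < 24
	}

	List sortQueue(List queue, Map pastFailedJobs) {
		// sort(false) returns a new list instead of mutating the input
		return queue.sort(false) { buildObj -> pastFailedJobs[buildObj.buildId]?.count ?: 0 }
	}
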
yosifkit committed Nov 4, 2024
1 parent 468bf7d commit 0bcb99c
Showing 1 changed file with 126 additions and 52 deletions.
178 changes: 126 additions & 52 deletions Jenkinsfile.trigger
@@ -17,6 +17,35 @@ def breakEarly = false // thanks Jenkins...
// { buildId: { "count": 1, ... }, ... }
def pastFailedJobsJson = '{}'

// globals: (so they can be accessed in the setFailedJob function)
// pastFailedJobsJson as a native object
pastFailedJobs = [:]
// object to fill failed jobs json after queue completion, includes skipped
newFailedJobs = [:]

def setFailedJob(buildId, identifier, timeNow, url) {
if (pastFailedJobs[buildId]) {
// carry forward count(GHA triggers)/time
newFailedJobs[buildId] = [
count: pastFailedJobs[buildId].count + 1,
firstTime: pastFailedJobs[buildId].firstTime,
identifier: identifier,
lastTime: timeNow,
skips: 0,
url: url,
]
} else {
newFailedJobs[buildId] = [
count: 1,
firstTime: timeNow,
identifier: identifier,
lastTime: timeNow,
skips: 0,
url: url,
]
}
}

node {
stage('Checkout') {
checkout(scmGit(
@@ -45,44 +74,98 @@ node {
fi
jq <<<"$json" '.'
''').trim()
pastFailedJobs = readJSON(text: pastFailedJobsJson)
}

dir('meta') {
def queueJson = ''
def newFailedJobsJson = ''
stage('Queue') {
withEnv([
'pastFailedJobsJson=' + pastFailedJobsJson,
]) {
// list of builds that have been failing and will be skipped this trigger
newFailedJobsJson = sh(returnStdout: true, script: '''
jq -L.scripts '
include "meta";
map(
select(
needs_build
and (
.build.arch | IN(
if env.BASHBREW_ARCH == "gha" then
"amd64", "i386", "windows-amd64"
else env.BASHBREW_ARCH end
)
)
)
| .buildId
) as $needsBuild
| env.pastFailedJobsJson | fromjson
| with_entries(
# filter out items no longer needing build
select( .key | IN($needsBuild[]) )
# start skipping after 24 attempts
# try once every 24 skips
| select(
.value
| .count > 24
and .skips < 24
)
| .value.skips+=1
)
' builds.json
''').trim()
echo(newFailedJobsJson)

// using pastFailedJobsJson, sort the needs_build queue so that failing builds always live at the bottom of the queue
queueJson = sh(returnStdout: true, script: '''
jq -L.scripts '
include "meta";
(env.pastFailedJobsJson | fromjson) as $pastFailedJobs
| [
.[]
| select(
needs_build
and (
.build.arch as $arch
| if env.BASHBREW_ARCH == "gha" then
[ "amd64", "i386", "windows-amd64" ]
else [ env.BASHBREW_ARCH ] end
| index($arch)
| map( select(
needs_build
and (
.build.arch | IN(
if env.BASHBREW_ARCH == "gha" then
"amd64", "i386", "windows-amd64"
else env.BASHBREW_ARCH end
)
)
]
and (
# see newFailedJobsJson above
($pastFailedJobs[.buildId].count // 0) <= 24
or
($pastFailedJobs[.buildId].skips // 0) >= 24
)
) )
# this Jenkins job exports a JSON file that records the number of attempts so far per failing buildId;
# sorting by that count keeps failing builds at the bottom of the queue (the most-failing build is always last)
| sort_by($pastFailedJobs[.buildId].count // 0)
' builds.json
''').trim()
}
}

def jobName = ''
if (queueJson && queueJson != '[]') {
queue = readJSON(text: queueJson)
currentBuild.displayName = 'queue size: ' + queue.size() + ' (#' + currentBuild.number + ')'
jobName += 'queue: ' + queue.size()
} else {
currentBuild.displayName = 'empty queue (#' + currentBuild.number + ')'
jobName += 'queue: 0'
breakEarly = true
}
if (newFailedJobsJson && newFailedJobsJson != '[]') {
newFailedJobs = readJSON(text: newFailedJobsJson)
jobName += ' skip: ' + newFailedJobs.size()
// queue to build might be empty, but we still need to record these skipped builds
breakEarly = false
} else {
jobName += ' skip: 0'
}
currentBuild.displayName = jobName + ' (#' + currentBuild.number + ')'

// with an empty queue and nothing to skip we can end early
if (breakEarly) {
return
}

@@ -97,6 +180,8 @@ node {
for (buildObj in queue) {
def identifier = buildObj.source.arches[buildObj.build.arch].tags[0] + ' (' + buildObj.build.arch + ')'
def json = writeJSON(json: buildObj, returnText: true)
// long so that readJSON doesn't change the value via float precision
def now = (long)(System.currentTimeMillis() / 1000) // convert to seconds
withEnv([
'json=' + json,
]) {
@@ -125,56 +210,45 @@ node {
'''
}
}
// record that GHA was triggered (for tracking continued triggers that fail to push an image)
setFailedJob(buildObj.buildId, identifier, now, currentBuild.absoluteUrl)
}
}
// we're done triggering GHA, so we're completely done with this job
breakEarly = true
return
}
}
}

if (breakEarly) { return } // thanks Jenkins...

// now that we have our parsed queue, we can release the node we're holding up (since we handle GHA builds above)
def pastFailedJobs = readJSON(text: pastFailedJobsJson)
def newFailedJobs = [:]
if (env.BASHBREW_ARCH != 'gha') {
for (buildObj in queue) {
def identifier = buildObj.source.arches[buildObj.build.arch].tags[0]
def json = writeJSON(json: buildObj, returnText: true)
withEnv([
'json=' + json,
]) {
stage(identifier) {
echo(json) // for debugging/data purposes

for (buildObj in queue) {
def identifier = buildObj.source.arches[buildObj.build.arch].tags[0]
def json = writeJSON(json: buildObj, returnText: true)
withEnv([
'json=' + json,
]) {
stage(identifier) {
echo(json) // for debugging/data purposes

def res = build(
job: 'build-' + env.BASHBREW_ARCH,
parameters: [
string(name: 'buildId', value: buildObj.buildId),
],
propagate: false,
quietPeriod: 5, // seconds
)
// TODO do something useful with "res.result" (especially "res.result != 'SUCCESS'")
echo(res.result)
if (res.result != 'SUCCESS') {
def c = 1
if (pastFailedJobs[buildObj.buildId]) {
// TODO more defensive access of .count? (it is created just below, so it should be safe)
c += pastFailedJobs[buildObj.buildId].count
def res = build(
job: 'build-' + env.BASHBREW_ARCH,
parameters: [
string(name: 'buildId', value: buildObj.buildId),
],
propagate: false,
quietPeriod: 5, // seconds
)
// TODO something more with "res.result"? (like "res.result == 'SUCCESS'")
echo(res.result)
if (res.result != 'SUCCESS') {
// long so that readJSON doesn't change the value via float precision
def now = (long)((res.startTimeInMillis + res.duration) / 1000) // convert to seconds
// record the job failure
setFailedJob(buildObj.buildId, identifier, now, res.absoluteUrl)
// "catchError" is the only way to set "stageResult" :(
catchError(message: 'Build of "' + identifier + '" failed', buildResult: 'UNSTABLE', stageResult: 'FAILURE') { error() }
}
// TODO maybe implement some amount of backoff? keep first url/endTime?
newFailedJobs[buildObj.buildId] = [
count: c,
identifier: identifier,
url: res.absoluteUrl,
endTime: (res.startTimeInMillis + res.duration) / 1000.0, // convert to seconds
]

// "catchError" is the only way to set "stageResult" :(
catchError(message: 'Build of "' + identifier + '" failed', buildResult: 'UNSTABLE', stageResult: 'FAILURE') { error() }
}
}
}
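
For reference, one entry in the failure-tracking map that setFailedJob builds (and that the next run of this job reads back in as pastFailedJobsJson) looks roughly like the following; the buildId, identifier, timestamps, and URL are invented for illustration:

	// hypothetical entry, values invented for illustration
	newFailedJobs['0123456789abcdef-amd64'] = [
		count: 3,              // failed builds (or GHA triggers) so far
		firstTime: 1730246400, // whole seconds, so readJSON round-trips the value exactly
		identifier: 'debian:bookworm (amd64)',
		lastTime: 1730419200,
		skips: 0,              // reset to 0 whenever the build is actually attempted again
		url: 'https://jenkins.example.org/job/trigger-amd64/123/',
	]

Entries whose buildId no longer needs building are dropped by the Queue stage's jq filter on the next trigger run.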