Implement back off for jobs that are continually failing
Also track GHA triggers so that they can back off too when they have triggered many times (i.e., failed to push to staging)
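
In rough terms, the back off added here works as follows. This is a minimal Groovy sketch of the policy encoded by the jq filters in the diff below; the helper names shouldSkip and sortQueue are illustrative only and are not part of the commit:

	// Sketch only: a failing build is skipped once it has failed more than 24 times,
	// then retried once after it has been skipped 24 times; builds that stay in the
	// queue are sorted by failure count so the most-failing entries land at the end.
	boolean shouldSkip(Map failRecord) {
		if (failRecord == null) { return false } // never failed: always queue it
		return failRecord.count > 24 && failRecord.skips < 24
	}

	List sortQueue(List queue, Map pastFailedJobs) {
		// sort(false) returns a new list instead of mutating the input
		return queue.sort(false) { buildObj -> pastFailedJobs[buildObj.buildId]?.count ?: 0 }
	}
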
yosifkit committed Nov 4, 2024
1 parent 468bf7d commit 0bcb99c
Showing 1 changed file with 126 additions and 52 deletions.
178 changes: 126 additions & 52 deletions Jenkinsfile.trigger
@@ -17,6 +17,35 @@ def breakEarly = false // thanks Jenkins...
// { buildId: { "count": 1, ... }, ... }
def pastFailedJobsJson = '{}'

// globals: (so they can be accessed in the setFailedJob function)
// pastFailedJobsJson as a native object
pastFailedJobs = [:]
// object to fill failed jobs json after queue completion, includes skipped
newFailedJobs = [:]

def setFailedJob(buildId, identifier, timeNow, url) {
if (pastFailedJobs[buildId]) {
// carry forward count(GHA triggers)/time
newFailedJobs[buildId] = [
count: pastFailedJobs[buildId].count + 1,
firstTime: pastFailedJobs[buildId].firstTime,
identifier: identifier,
lastTime: timeNow,
skips: 0,
url: url,
]
} else {
newFailedJobs[buildId] = [
count: 1,
firstTime: timeNow,
identifier: identifier,
lastTime: timeNow,
skips: 0,
url: url,
]
}
}

node {
stage('Checkout') {
checkout(scmGit(
@@ -45,44 +74,98 @@ node {
fi
jq <<<"$json" '.'
''').trim()
pastFailedJobs = readJSON(text: pastFailedJobsJson)
}

dir('meta') {
def queueJson = ''
def newFailedJobsJson = ''
stage('Queue') {
withEnv([
'pastFailedJobsJson=' + pastFailedJobsJson,
]) {
// list of builds that have been failing and will be skipped this trigger
newFailedJobsJson = sh(returnStdout: true, script: '''
jq -L.scripts '
include "meta";
map(
select(
needs_build
and (
.build.arch | IN(
if env.BASHBREW_ARCH == "gha" then
"amd64", "i386", "windows-amd64"
else env.BASHBREW_ARCH end
)
)
)
| .buildId
) as $needsBuild
| env.pastFailedJobsJson | fromjson
| with_entries(
# filter out items no longer needing build
select( .key | IN($needsBuild[]) )
# start skipping after 24 attempts
# try once every 24 skips
| select(
.value
| .count > 24
and .skips < 24
)
| .value.skips+=1
)
' builds.json
''').trim()
echo(newFailedJobsJson)

// using pastFailedJobsJson, sort the needs_build queue so that failing builds always live at the bottom of the queue
queueJson = sh(returnStdout: true, script: '''
jq -L.scripts '
include "meta";
(env.pastFailedJobsJson | fromjson) as $pastFailedJobs
| [
.[]
| select(
needs_build
and (
.build.arch as $arch
| if env.BASHBREW_ARCH == "gha" then
[ "amd64", "i386", "windows-amd64" ]
else [ env.BASHBREW_ARCH ] end
| index($arch)
| map( select(
needs_build
and (
.build.arch | IN(
if env.BASHBREW_ARCH == "gha" then
"amd64", "i386", "windows-amd64"
else env.BASHBREW_ARCH end
)
)
]
and (
# see newFailedJobsJson above
($pastFailedJobs[.buildId].count // 0) <= 24
or
($pastFailedJobs[.buildId].skips // 0) >= 24
)
) )
# this Jenkins job exports a JSON file that records the number of attempts so far per failing buildId;
# sorting by that count keeps failing builds at the bottom of the queue (the most-failing build is always last)
| sort_by($pastFailedJobs[.buildId].count // 0)
' builds.json
''').trim()
}
}

def jobName = ''
if (queueJson && queueJson != '[]') {
queue = readJSON(text: queueJson)
currentBuild.displayName = 'queue size: ' + queue.size() + ' (#' + currentBuild.number + ')'
jobName += 'queue: ' + queue.size()
} else {
currentBuild.displayName = 'empty queue (#' + currentBuild.number + ')'
jobName += 'queue: 0'
breakEarly = true
}
if (newFailedJobsJson && newFailedJobsJson != '[]') {
newFailedJobs = readJSON(text: newFailedJobsJson)
jobName += ' skip: ' + newFailedJobs.size()
// queue to build might be empty, but we still need to record these skipped builds
breakEarly = false
} else {
jobName += ' skip: 0'
}
currentBuild.displayName = jobName + ' (#' + currentBuild.number + ')'

// with an empty queue and nothing to skip we can end early
if (breakEarly) {
return
}

@@ -97,6 +180,8 @@ node {
for (buildObj in queue) {
def identifier = buildObj.source.arches[buildObj.build.arch].tags[0] + ' (' + buildObj.build.arch + ')'
def json = writeJSON(json: buildObj, returnText: true)
// long so that readJSON doesn't change the value via float precision
def now = (long)(System.currentTimeMillis() / 1000) // convert to seconds
withEnv([
'json=' + json,
]) {
@@ -125,56 +210,45 @@ node {
'''
}
}
// record that GHA was triggered (for tracking continued triggers that fail to push an image)
setFailedJob(buildObj.buildId, identifier, now, currentBuild.absoluteUrl)
}
}
// we're done triggering GHA, so we're completely done with this job
breakEarly = true
return
}
}
}

if (breakEarly) { return } // thanks Jenkins...

// now that we have our parsed queue, we can release the node we're holding up (since we handle GHA builds above)
def pastFailedJobs = readJSON(text: pastFailedJobsJson)
def newFailedJobs = [:]
if (env.BASHBREW_ARCH != 'gha') {
for (buildObj in queue) {
def identifier = buildObj.source.arches[buildObj.build.arch].tags[0]
def json = writeJSON(json: buildObj, returnText: true)
withEnv([
'json=' + json,
]) {
stage(identifier) {
echo(json) // for debugging/data purposes

for (buildObj in queue) {
def identifier = buildObj.source.arches[buildObj.build.arch].tags[0]
def json = writeJSON(json: buildObj, returnText: true)
withEnv([
'json=' + json,
]) {
stage(identifier) {
echo(json) // for debugging/data purposes

def res = build(
job: 'build-' + env.BASHBREW_ARCH,
parameters: [
string(name: 'buildId', value: buildObj.buildId),
],
propagate: false,
quietPeriod: 5, // seconds
)
// TODO do something useful with "res.result" (especially "res.result != 'SUCCESS'")
echo(res.result)
if (res.result != 'SUCCESS') {
def c = 1
if (pastFailedJobs[buildObj.buildId]) {
// TODO more defensive access of .count? (it is created just below, so it should be safe)
c += pastFailedJobs[buildObj.buildId].count
def res = build(
job: 'build-' + env.BASHBREW_ARCH,
parameters: [
string(name: 'buildId', value: buildObj.buildId),
],
propagate: false,
quietPeriod: 5, // seconds
)
// TODO something more with "res.result"? (like "res.result == 'SUCCESS'")
echo(res.result)
if (res.result != 'SUCCESS') {
// long so that readJSON doesn't change the value via float precision
def now = (long)((res.startTimeInMillis + res.duration) / 1000) // convert to seconds
// record the job failure
setFailedJob(buildObj.buildId, identifier, now, res.absoluteUrl)
// "catchError" is the only way to set "stageResult" :(
catchError(message: 'Build of "' + identifier + '" failed', buildResult: 'UNSTABLE', stageResult: 'FAILURE') { error() }
}
// TODO maybe implement some amount of backoff? keep first url/endTime?
newFailedJobs[buildObj.buildId] = [
count: c,
identifier: identifier,
url: res.absoluteUrl,
endTime: (res.startTimeInMillis + res.duration) / 1000.0, // convert to seconds
]

// "catchError" is the only way to set "stageResult" :(
catchError(message: 'Build of "' + identifier + '" failed', buildResult: 'UNSTABLE', stageResult: 'FAILURE') { error() }
}
}
}
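
For reference, one entry in the failure-tracking map that setFailedJob builds (and that the next run of this job reads back in as pastFailedJobsJson) looks roughly like the following; the buildId, identifier, timestamps, and URL are invented for illustration:

	// hypothetical entry, values invented for illustration
	newFailedJobs['0123456789abcdef-amd64'] = [
		count: 3,              // failed builds (or GHA triggers) so far
		firstTime: 1730246400, // whole seconds, so readJSON round-trips the value exactly
		identifier: 'debian:bookworm (amd64)',
		lastTime: 1730419200,
		skips: 0,              // reset to 0 whenever the build is actually attempted again
		url: 'https://jenkins.example.org/job/trigger-amd64/123/',
	]

Entries whose buildId no longer needs building are dropped by the Queue stage's jq filter on the next trigger run.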