From d1a7c49abe00683aa4a9df859d146bc9eb1cf46f Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 6 Dec 2021 16:17:12 -0500 Subject: [PATCH 01/28] Pick up https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/180 --- pom.xml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/pom.xml b/pom.xml index fc0c7f1d9a..d38a27aa43 100644 --- a/pom.xml +++ b/pom.xml @@ -47,7 +47,7 @@ 8 - 2.263.1 + 2.303.3 false true @@ -135,6 +135,11 @@ org.jenkins-ci.plugins credentials-binding + + org.jenkins-ci.plugins.workflow + workflow-durable-task-step + 1122.v2d03a46f6f49 + @@ -147,11 +152,6 @@ workflow-basic-steps test - - org.jenkins-ci.plugins.workflow - workflow-durable-task-step - test - org.jenkins-ci.plugins.workflow workflow-support @@ -256,7 +256,7 @@ io.jenkins.tools.bom - bom-2.263.x + bom-2.303.x 984.vb5eaac999a7e import pom From 8fa0aabd69b8f61270d85d95702493fcaa29d122 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 6 Dec 2021 16:17:37 -0500 Subject: [PATCH 02/28] Some `@CheckForNull`s --- .../org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud.java | 1 + .../csanchez/jenkins/plugins/kubernetes/PodTemplateUtils.java | 1 + 2 files changed, 2 insertions(+) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud.java index 15833ff530..bb9a58bee0 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesCloud.java @@ -571,6 +571,7 @@ public boolean canProvision(@NonNull Cloud.CloudState state) { * @param label label to look for in templates * @return the template */ + @CheckForNull public PodTemplate getTemplate(@CheckForNull Label label) { return PodTemplateUtils.getTemplateByLabel(label, getAllTemplates()); } diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/PodTemplateUtils.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/PodTemplateUtils.java index fce974a0b8..023cc26a0e 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/PodTemplateUtils.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/PodTemplateUtils.java @@ -531,6 +531,7 @@ static PodTemplate unwrap(PodTemplate template, Collection allTempl * @param templates The list of all templates. * @return The first pod template from the collection that has a matching label. */ + @CheckForNull public static PodTemplate getTemplateByLabel(@CheckForNull Label label, Collection templates) { for (PodTemplate t : templates) { if ((label == null && t.getNodeUsageMode() == Node.Mode.NORMAL) || (label != null && label.matches(t.getLabelSet()))) { From 36554aea158f612cadf1d7e0f5540dd09913fb27 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 6 Dec 2021 16:18:30 -0500 Subject: [PATCH 03/28] `ContainerExecDecorator.ws` no longer unused, but now actively harmful as it forces use of `FilePathPickle` --- .../pipeline/ContainerExecDecorator.java | 14 -------------- .../pipeline/ContainerStepExecution.java | 1 - 2 files changed, 15 deletions(-) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/ContainerExecDecorator.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/ContainerExecDecorator.java index 95433dc7b6..bfdf393eb2 100755 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/ContainerExecDecorator.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/ContainerExecDecorator.java @@ -104,9 +104,6 @@ public class ContainerExecDecorator extends LauncherDecorator implements Seriali private String containerName; private EnvironmentExpander environmentExpander; private EnvVars globalVars; - /** @deprecated no longer used */ - @Deprecated - private FilePath ws; private EnvVars rcEnvVars; private String shell; private KubernetesNodeContext nodeContext; @@ -118,7 +115,6 @@ public ContainerExecDecorator() { public ContainerExecDecorator(KubernetesClient client, String podName, String containerName, String namespace, EnvironmentExpander environmentExpander, FilePath ws) { this.containerName = containerName; this.environmentExpander = environmentExpander; - this.ws = ws; } @Deprecated @@ -222,16 +218,6 @@ public EnvVars getRunContextEnvVars() { return this.rcEnvVars; } - /** @deprecated unused */ - @Deprecated - public FilePath getWs() { - return ws; - } - - public void setWs(FilePath ws) { - this.ws = ws; - } - public void setShell(String shell) { this.shell = shell; } diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/ContainerStepExecution.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/ContainerStepExecution.java index 94bd68dbb9..73668234be 100755 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/ContainerStepExecution.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/ContainerStepExecution.java @@ -77,7 +77,6 @@ public boolean start() throws Exception { decorator.setNodeContext(nodeContext); decorator.setContainerName(containerName); decorator.setEnvironmentExpander(env); - decorator.setWs(getContext().get(FilePath.class)); decorator.setGlobalVars(globalVars); decorator.setRunContextEnvVars(rcEnvVars); decorator.setShell(shell); From b34d34a2f60bc3223e04b93c0168c806ffa4e833 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 6 Dec 2021 16:20:19 -0500 Subject: [PATCH 04/28] `Reaper` was activated only by `onOnline`, making it useless for cleaning up after a backup; `preLaunch` is called even for a broken agent, better suiting our purposes --- .../jenkins/plugins/kubernetes/pod/retention/Reaper.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index 7f9de5300b..7b5f35a4b8 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -91,7 +91,7 @@ public static Reaper getInstance() { private Watch watch; @Override - public void onOnline(Computer c, TaskListener listener) throws IOException, InterruptedException { + public void preLaunch(Computer c, TaskListener taskListener) throws IOException, InterruptedException { if (c instanceof KubernetesComputer && activated.compareAndSet(false, true)) { activate(); } From 178a4d9915a19da24b511467758aa17c15d6a09c Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 6 Dec 2021 16:22:48 -0500 Subject: [PATCH 05/28] `RestartPipelineTest.terminatedPodAfterRestart` improvements: logging, and resetting the Jenkins URL --- .../jenkins/plugins/kubernetes/KubernetesTestUtil.java | 2 +- .../plugins/kubernetes/pipeline/RestartPipelineTest.java | 4 ++++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesTestUtil.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesTestUtil.java index 58b0f1df36..1524269867 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesTestUtil.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/KubernetesTestUtil.java @@ -127,7 +127,7 @@ public static void setupHost() throws Exception { System.err.println("Calling home to address: " + hostAddress); URL nonLocalhostUrl = new URL(url.getProtocol(), hostAddress, url.getPort(), url.getFile()); - // TODO better to set KUBERNETES_JENKINS_URL + // TODO better to set KUBERNETES_JENKINS_URL, or better yet KubernetesCloud.setJenkinsUrl JenkinsLocationConfiguration.get().setUrl(nonLocalhostUrl.toString()); Integer slaveAgentPort = Integer.getInteger("slaveAgentPort"); diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java index 2136deb675..d650f34796 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java @@ -45,8 +45,10 @@ import org.csanchez.jenkins.plugins.kubernetes.model.KeyValueEnvVar; import org.csanchez.jenkins.plugins.kubernetes.model.SecretEnvVar; import org.csanchez.jenkins.plugins.kubernetes.model.TemplateEnvVar; +import org.csanchez.jenkins.plugins.kubernetes.pod.retention.Reaper; import org.jenkinsci.plugins.workflow.job.WorkflowJob; import org.jenkinsci.plugins.workflow.job.WorkflowRun; +import org.jenkinsci.plugins.workflow.steps.durable_task.DurableTaskStep; import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; import org.junit.BeforeClass; import org.junit.ClassRule; @@ -245,7 +247,9 @@ public void terminatedPodAfterRestart() throws Exception { projectName.set(b.getParent().getFullName()); r.waitForMessage("+ sleep", b); }); + logs.record(DurableTaskStep.class, Level.FINE).record(Reaper.class, Level.FINE); story.then(r -> { + setupHost(); // otherwise JenkinsLocationConfiguration will be clobbered WorkflowRun b = r.jenkins.getItemByFullName(projectName.get(), WorkflowJob.class).getBuildByNumber(1); r.waitForMessage("Ready to run", b); // Note that the test is cheating here slightly. From 748f4ea9c2c1694b97addae843a10e41fff7d80c Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 6 Dec 2021 16:24:10 -0500 Subject: [PATCH 06/28] Removing comment about `Reaper` rendered incorrect by #714 --- .../plugins/kubernetes/pipeline/RestartPipelineTest.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java index d650f34796..1414c2bdfb 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java @@ -252,12 +252,6 @@ public void terminatedPodAfterRestart() throws Exception { setupHost(); // otherwise JenkinsLocationConfiguration will be clobbered WorkflowRun b = r.jenkins.getItemByFullName(projectName.get(), WorkflowJob.class).getBuildByNumber(1); r.waitForMessage("Ready to run", b); - // Note that the test is cheating here slightly. - // The watch in Reaper is still running across the in-JVM restarts, - // whereas in production it would have been cancelled during the shutdown. - // But it does not matter since we are waiting for the agent to come back online after the restart, - // which is sufficient trigger to reactivate the reaper. - // Indeed we get two Reaper instances running, which independently remove the node. deletePods(cloud.connect(), getLabels(this, name), false); r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b); From 965178632b01579f8c62ae73eafcebba2a5a2d25 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 6 Dec 2021 16:24:57 -0500 Subject: [PATCH 07/28] `RestartPipelineTest.terminatedPodAfterRestart` overriding `terminationGracePeriodSeconds` to avoid 30s delay --- .../pipeline/terminatedPodAfterRestart.groovy | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy index d1eca4791b..f82f2c3d5a 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy @@ -1,11 +1,18 @@ package org.csanchez.jenkins.plugins.kubernetes.pipeline -podTemplate(label: '$NAME', containers: [ - containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'), -]) { - node ('$NAME') { +podTemplate(yaml: ''' +spec: + containers: + - name: busybox + image: busybox + command: + - sleep + - 99d + terminationGracePeriodSeconds: 3 +''') { + node(POD_LABEL) { container('busybox') { - sh 'sleep 9999999' + sh 'sleep 9999999' } } } From 9395b2f8b9a7442e9cdc0541c0f73b4280f56aac Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 6 Dec 2021 16:30:58 -0500 Subject: [PATCH 08/28] Implementing `ExecutorStepRetryEligibility` --- .../retention/KubernetesRetryEligibility.java | 59 +++++++++++++++++++ .../pipeline/RestartPipelineTest.java | 6 +- .../pipeline/terminatedPodAfterRestart.groovy | 2 +- 3 files changed, 63 insertions(+), 4 deletions(-) create mode 100644 src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/KubernetesRetryEligibility.java diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/KubernetesRetryEligibility.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/KubernetesRetryEligibility.java new file mode 100644 index 0000000000..57c38a2726 --- /dev/null +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/KubernetesRetryEligibility.java @@ -0,0 +1,59 @@ +/* + * Copyright 2021 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.csanchez.jenkins.plugins.kubernetes.pod.retention; + +import hudson.Extension; +import hudson.model.Label; +import hudson.model.Node; +import hudson.model.TaskListener; +import hudson.slaves.Cloud; +import jenkins.model.Jenkins; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave; +import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepRetryEligibility; + +/** + * Qualifies {@code node} blocks associated with {@link KubernetesSlave} to be retried if the node was deleted. + */ +@Extension +public class KubernetesRetryEligibility implements ExecutorStepRetryEligibility { + + @Override + public boolean shouldRetry(Throwable t, String node, String label, TaskListener listener) { + if (ExecutorStepRetryEligibility.isRemovedNode(t) && isKubernetesAgent(node, label)) { + listener.getLogger().println("Will retry failed node block from deleted pod " + node); + return true; + } + return false; + } + + private static boolean isKubernetesAgent(String node, String label) { + Node current = Jenkins.get().getNode(node); + if (current instanceof KubernetesSlave) { + return true; + } else if (current == null) { + Label l = Label.get(label); + for (Cloud c : Jenkins.get().clouds) { + if (c instanceof KubernetesCloud && ((KubernetesCloud) c).getTemplate(l) != null) { + return true; + } + } + } + return false; + } + +} diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java index 1414c2bdfb..810870feb6 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java @@ -49,7 +49,6 @@ import org.jenkinsci.plugins.workflow.job.WorkflowJob; import org.jenkinsci.plugins.workflow.job.WorkflowRun; import org.jenkinsci.plugins.workflow.steps.durable_task.DurableTaskStep; -import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Rule; @@ -253,8 +252,9 @@ public void terminatedPodAfterRestart() throws Exception { WorkflowRun b = r.jenkins.getItemByFullName(projectName.get(), WorkflowJob.class).getBuildByNumber(1); r.waitForMessage("Ready to run", b); deletePods(cloud.connect(), getLabels(this, name), false); - r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); - r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b); + r.waitForMessage("assuming it is not coming back", b); + r.waitForMessage("Will retry failed node block from deleted pod", b); + r.assertBuildStatusSuccess(r.waitForCompletion(b)); }); } diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy index f82f2c3d5a..916ca3d284 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy @@ -12,7 +12,7 @@ spec: ''') { node(POD_LABEL) { container('busybox') { - sh 'sleep 9999999' + sh 'sleep 15' } } } From 143387539083ab3ed48439b35bce5fed54276cd2 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Tue, 7 Dec 2021 10:47:32 -0500 Subject: [PATCH 09/28] `KubernetesPipelineTest.terminatedPod` is analogous to `RestartPipelineTest.terminatedPodAfterRestart`: we now expect it to retry --- .../pipeline/KubernetesPipelineTest.java | 6 +++--- .../kubernetes/pipeline/terminatedPod.groovy | 17 ++++++++++++----- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java index be232f4241..037cbc6f01 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java @@ -72,7 +72,6 @@ import org.hamcrest.MatcherAssert; import org.hamcrest.Matchers; import org.jenkinsci.plugins.workflow.job.WorkflowRun; -import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; import org.jenkinsci.plugins.workflow.test.steps.SemaphoreStep; import org.junit.After; import org.junit.Before; @@ -477,10 +476,11 @@ public void runInPodWithRetention() throws Exception { public void terminatedPod() throws Exception { r.waitForMessage("+ sleep", b); deletePods(cloud.connect(), getLabels(this, name), false); - r.assertBuildStatus(Result.ABORTED, r.waitForCompletion(b)); - r.waitForMessage(new ExecutorStepExecution.RemovedNodeCause().getShortDescription(), b); r.waitForMessage("busybox -- terminated", b); r.waitForMessage("jnlp -- terminated", b); + r.waitForMessage("was deleted; cancelling node body", b); + r.waitForMessage("Will retry failed node block from deleted pod", b); + r.assertBuildStatusSuccess(r.waitForCompletion(b)); } @Issue("JENKINS-59340") diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy index d757048650..6e20fbb07f 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy @@ -1,10 +1,17 @@ -podTemplate(label: '$NAME', containers: [ - containerTemplate(name: 'busybox', image: 'busybox', ttyEnabled: true, command: '/bin/cat'), - ]) { - node ('$NAME') { +podTemplate(yaml: ''' +spec: + containers: + - name: busybox + image: busybox + command: + - sleep + - 99d + terminationGracePeriodSeconds: 3 +''') { + node(POD_LABEL) { container('busybox') { sh 'echo hello world' - sh 'sleep 9999999' + sh 'sleep 15' } } } From dff6ce4eede8337a5fa28075b6351d2432a40633 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 8 Dec 2021 12:51:46 -0500 Subject: [PATCH 10/28] Pick up https://github.com/jenkinsci/workflow-step-api-plugin/pull/73 --- pom.xml | 1 + 1 file changed, 1 insertion(+) diff --git a/pom.xml b/pom.xml index be94a68fc5..85f96dc38b 100644 --- a/pom.xml +++ b/pom.xml @@ -101,6 +101,7 @@ org.jenkins-ci.plugins.workflow workflow-step-api + 2.25-rc586.6e93334f902d org.jenkins-ci.plugins.workflow From 91c0b4bde9bf577fffa0cb0aceea4b428465bc5a Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 8 Dec 2021 13:20:56 -0500 Subject: [PATCH 11/28] `KubernetesRetryEligibility` makes more sense in the `pipeline` subpackage --- .../{pod/retention => pipeline}/KubernetesRetryEligibility.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) rename src/main/java/org/csanchez/jenkins/plugins/kubernetes/{pod/retention => pipeline}/KubernetesRetryEligibility.java (97%) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/KubernetesRetryEligibility.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java similarity index 97% rename from src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/KubernetesRetryEligibility.java rename to src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java index 57c38a2726..260412cf4a 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/KubernetesRetryEligibility.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java @@ -14,7 +14,7 @@ * limitations under the License. */ -package org.csanchez.jenkins.plugins.kubernetes.pod.retention; +package org.csanchez.jenkins.plugins.kubernetes.pipeline; import hudson.Extension; import hudson.model.Label; From 7884be6bde7416c14417019c3bd3edbdd13ef656 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 8 Dec 2021 14:32:43 -0500 Subject: [PATCH 12/28] Trying to fix `KubernetesPipelineTest.containerTerminated` by skipping retry when a container was terminated for certain reasons https://github.com/jenkinsci/kubernetes-plugin/pull/786#issuecomment-989131043 --- .../pipeline/KubernetesRetryEligibility.java | 32 ++++++++++++++++--- .../kubernetes/pod/retention/Reaper.java | 25 +++++++++++++++ .../AbstractKubernetesPipelineTest.java | 3 +- .../pipeline/KubernetesPipelineTest.java | 2 ++ 4 files changed, 57 insertions(+), 5 deletions(-) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java index 260412cf4a..4f88dd8cea 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java @@ -17,13 +17,18 @@ package org.csanchez.jenkins.plugins.kubernetes.pipeline; import hudson.Extension; +import hudson.ExtensionList; import hudson.model.Label; import hudson.model.Node; import hudson.model.TaskListener; import hudson.slaves.Cloud; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Logger; import jenkins.model.Jenkins; import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave; +import org.csanchez.jenkins.plugins.kubernetes.pod.retention.Reaper; import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepRetryEligibility; /** @@ -32,13 +37,32 @@ @Extension public class KubernetesRetryEligibility implements ExecutorStepRetryEligibility { + private static final Logger LOGGER = Logger.getLogger(KubernetesRetryEligibility.class.getName()); + + private static final Set IGNORED_CONTAINER_TERMINATION_REASONS = new HashSet(); + static { + IGNORED_CONTAINER_TERMINATION_REASONS.add("OOMKiller"); + IGNORED_CONTAINER_TERMINATION_REASONS.add("Completed"); + } + @Override public boolean shouldRetry(Throwable t, String node, String label, TaskListener listener) { - if (ExecutorStepRetryEligibility.isRemovedNode(t) && isKubernetesAgent(node, label)) { - listener.getLogger().println("Will retry failed node block from deleted pod " + node); - return true; + if (!ExecutorStepRetryEligibility.isRemovedNode(t)) { + LOGGER.fine(() -> "Not a RemovedNode failure: " + t); + return false; } - return false; + if (!isKubernetesAgent(node, label)) { + LOGGER.fine(() -> node + " was not a K8s agent"); + return false; + } + Set terminationReasons = ExtensionList.lookupSingleton(Reaper.TerminateAgentOnContainerTerminated.class).terminationReasons(node); + if (terminationReasons.stream().anyMatch(r -> IGNORED_CONTAINER_TERMINATION_REASONS.contains(r))) { + LOGGER.fine(() -> "ignored termination reason(s) for " + node + ": " + terminationReasons); + return false; + } + LOGGER.fine(() -> "active on " + node); + listener.getLogger().println("Will retry failed node block from deleted pod " + node); + return true; } private static boolean isKubernetesAgent(String node, String label) { diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index 7b5f35a4b8..11c622f1d5 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -46,6 +46,11 @@ import java.util.logging.Logger; import io.fabric8.kubernetes.client.WatcherException; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.Map; +import java.util.Set; import jenkins.model.Jenkins; import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; import org.csanchez.jenkins.plugins.kubernetes.KubernetesComputer; @@ -215,6 +220,9 @@ public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave nod @Extension public static class TerminateAgentOnContainerTerminated implements Listener { + + private final Map> terminationReasons = new HashMap<>(); + @Override public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException { if (action != Action.MODIFIED) { @@ -229,10 +237,27 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN ContainerStateTerminated t = c.getState().getTerminated(); LOGGER.info(() -> ns + "/" + name + " Container " + c.getName() + " was just terminated, so removing the corresponding Jenkins agent"); runListener.getLogger().printf("%s/%s Container %s was terminated (Exit Code: %d, Reason: %s)%n", ns, name, c.getName(), t.getExitCode(), t.getReason()); + synchronized (terminationReasons) { + terminationReasons.computeIfAbsent(node.getNodeName(), k -> new HashSet<>()).add(t.getReason()); + } }); node.terminate(); } } + + /** + * Get any reason(s) why a node was terminated by this listener. + * @param node a {@link Node#getNodeName} + * @return a possibly empty set of {@link ContainerStateTerminated#getReason} + */ + @NonNull + public Set terminationReasons(@NonNull String node) { + synchronized (terminationReasons) { + Set reasons = terminationReasons.get(node); + return reasons == null ? Collections.emptySet() : new HashSet<>(reasons); + } + } + } @Extension diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java index e92b001251..bd88e510fc 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java @@ -76,7 +76,8 @@ public abstract class AbstractKubernetesPipelineTest { public LoggerRule logs = new LoggerRule() .recordPackage(KubernetesCloud.class, Level.FINE) .recordPackage(NoDelayProvisionerStrategy.class, Level.FINE) - .record(NodeProvisioner.class, Level.FINE); + .record(NodeProvisioner.class, Level.FINE) + .record(KubernetesRetryEligibility.class, Level.FINE); @BeforeClass public static void isKubernetesConfigured() throws Exception { diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java index ce4b370299..c0f4377fa5 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java @@ -488,7 +488,9 @@ public void terminatedPod() throws Exception { public void containerTerminated() throws Exception { assertBuildStatus(r.waitForCompletion(b), Result.FAILURE, Result.ABORTED); r.waitForMessage("Container stress-ng was terminated", b); + /* TODO sometimes instead get: Container stress-ng was terminated (Exit Code: 0, Reason: Completed) r.waitForMessage("Reason: OOMKilled", b); + */ } @Test From 0f00e23c5a00c18b67d12048c2c57a17da9e4a4c Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 8 Dec 2021 15:17:32 -0500 Subject: [PATCH 13/28] Delaying `Reaper.activate` seems to help? https://github.com/jenkinsci/kubernetes-plugin/pull/1077#issuecomment-989151545 --- .../jenkins/plugins/kubernetes/pod/retention/Reaper.java | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index 11c622f1d5..a7101da56b 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -51,7 +51,9 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.concurrent.TimeUnit; import jenkins.model.Jenkins; +import jenkins.util.Timer; import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; import org.csanchez.jenkins.plugins.kubernetes.KubernetesComputer; import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave; @@ -98,7 +100,7 @@ public static Reaper getInstance() { @Override public void preLaunch(Computer c, TaskListener taskListener) throws IOException, InterruptedException { if (c instanceof KubernetesComputer && activated.compareAndSet(false, true)) { - activate(); + Timer.get().schedule(this::activate, 10, TimeUnit.SECONDS); } } From d802e291bc91fdd0c57425ec5a5802cd9b34dd4d Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 8 Dec 2021 16:07:04 -0500 Subject: [PATCH 14/28] Making `KubernetesPipelineTest.podDeadlineExceeded` pass --- .../pipeline/KubernetesRetryEligibility.java | 3 +- .../kubernetes/pod/retention/Reaper.java | 50 +++++++++---------- 2 files changed, 27 insertions(+), 26 deletions(-) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java index 4f88dd8cea..60aa0c3309 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java @@ -43,6 +43,7 @@ public class KubernetesRetryEligibility implements ExecutorStepRetryEligibility static { IGNORED_CONTAINER_TERMINATION_REASONS.add("OOMKiller"); IGNORED_CONTAINER_TERMINATION_REASONS.add("Completed"); + IGNORED_CONTAINER_TERMINATION_REASONS.add("DeadlineExceeded"); } @Override @@ -55,7 +56,7 @@ public boolean shouldRetry(Throwable t, String node, String label, TaskListener LOGGER.fine(() -> node + " was not a K8s agent"); return false; } - Set terminationReasons = ExtensionList.lookupSingleton(Reaper.TerminateAgentOnContainerTerminated.class).terminationReasons(node); + Set terminationReasons = ExtensionList.lookupSingleton(Reaper.class).terminationReasons(node); if (terminationReasons.stream().anyMatch(r -> IGNORED_CONTAINER_TERMINATION_REASONS.contains(r))) { LOGGER.fine(() -> "ignored termination reason(s) for " + node + ": " + terminationReasons); return false; diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index a7101da56b..6dbc356419 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -33,6 +33,7 @@ import io.fabric8.kubernetes.api.model.ContainerStateWaiting; import io.fabric8.kubernetes.api.model.ContainerStatus; import io.fabric8.kubernetes.api.model.Pod; +import io.fabric8.kubernetes.api.model.PodStatus; import io.fabric8.kubernetes.client.KubernetesClient; import io.fabric8.kubernetes.client.Watch; import io.fabric8.kubernetes.client.Watcher; @@ -97,6 +98,8 @@ public static Reaper getInstance() { private Watch watch; + private final Map> terminationReasons = new HashMap<>(); + @Override public void preLaunch(Computer c, TaskListener taskListener) throws IOException, InterruptedException { if (c instanceof KubernetesComputer && activated.compareAndSet(false, true)) { @@ -161,7 +164,7 @@ public void eventReceived(Watcher.Action action, Pod pod) { } ExtensionList.lookup(Listener.class).forEach(listener -> { try { - listener.onEvent(action, optionalNode.get(), pod); + listener.onEvent(action, optionalNode.get(), pod, terminationReasons.computeIfAbsent(optionalNode.get().getNodeName(), k -> new HashSet<>())); } catch (Exception x) { LOGGER.log(Level.WARNING, "Listener " + listener + " failed for " + ns + "/" + name, x); } @@ -191,6 +194,19 @@ private void closeWatch() { } } + /** + * Get any reason(s) why a node was terminated by a listener. + * @param node a {@link Node#getNodeName} + * @return a possibly empty set of {@link ContainerStateTerminated#getReason} or {@link PodStatus#getReason} + */ + @NonNull + public Set terminationReasons(@NonNull String node) { + synchronized (terminationReasons) { + Set reasons = terminationReasons.get(node); + return reasons == null ? Collections.emptySet() : new HashSet<>(reasons); + } + } + /** * Listener called when a Kubernetes event related to a Kubernetes agent happens. */ @@ -201,13 +217,13 @@ public interface Listener extends ExtensionPoint { * @param node The affected node * @param pod The affected pod */ - void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException; + void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReaons) throws IOException, InterruptedException; } @Extension public static class RemoveAgentOnPodDeleted implements Listener { @Override - public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException { + public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReasons) throws IOException { if (action != Action.DELETED) { return; } @@ -223,10 +239,8 @@ public void onEvent(@NonNull Watcher.Action action, @NonNull KubernetesSlave nod @Extension public static class TerminateAgentOnContainerTerminated implements Listener { - private final Map> terminationReasons = new HashMap<>(); - @Override - public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException { + public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReasons) throws IOException, InterruptedException { if (action != Action.MODIFIED) { return; } @@ -239,33 +253,17 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN ContainerStateTerminated t = c.getState().getTerminated(); LOGGER.info(() -> ns + "/" + name + " Container " + c.getName() + " was just terminated, so removing the corresponding Jenkins agent"); runListener.getLogger().printf("%s/%s Container %s was terminated (Exit Code: %d, Reason: %s)%n", ns, name, c.getName(), t.getExitCode(), t.getReason()); - synchronized (terminationReasons) { - terminationReasons.computeIfAbsent(node.getNodeName(), k -> new HashSet<>()).add(t.getReason()); - } + terminationReasons.add(t.getReason()); }); node.terminate(); } } - - /** - * Get any reason(s) why a node was terminated by this listener. - * @param node a {@link Node#getNodeName} - * @return a possibly empty set of {@link ContainerStateTerminated#getReason} - */ - @NonNull - public Set terminationReasons(@NonNull String node) { - synchronized (terminationReasons) { - Set reasons = terminationReasons.get(node); - return reasons == null ? Collections.emptySet() : new HashSet<>(reasons); - } - } - } @Extension public static class TerminateAgentOnPodFailed implements Listener { @Override - public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException { + public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReasons) throws IOException, InterruptedException { if (action != Action.MODIFIED) { return; } @@ -275,6 +273,7 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN TaskListener runListener = node.getTemplate().getListener(); LOGGER.info(() -> ns + "/" + name + " Pod just failed. Removing the corresponding Jenkins agent. Reason: " + pod.getStatus().getReason() + ", Message: " + pod.getStatus().getMessage()); runListener.getLogger().printf("%s/%s Pod just failed (Reason: %s, Message: %s)%n", ns, name, pod.getStatus().getReason(), pod.getStatus().getMessage()); + terminationReasons.add(pod.getStatus().getReason()); try { String lines = PodUtils.logLastLines(pod, node.getKubernetesCloud().connect()); if (lines != null) { @@ -293,7 +292,7 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN public static class TerminateAgentOnImagePullBackOff implements Listener { @Override - public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod) throws IOException, InterruptedException { + public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonNull Pod pod, @NonNull Set terminationReasons) throws IOException, InterruptedException { List backOffContainers = PodUtils.getContainers(pod, cs -> { ContainerStateWaiting waiting = cs.getState().getWaiting(); return waiting != null && waiting.getMessage() != null && waiting.getMessage().contains("Back-off pulling image"); @@ -305,6 +304,7 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN TaskListener runListener = node.getTemplate().getListener(); runListener.error("Unable to pull Docker image \""+cs.getImage()+"\". Check if image tag name is spelled correctly."); }); + terminationReasons.add("ImagePullBackOff"); try (ACLContext _ = ACL.as(ACL.SYSTEM)) { PodUtils.cancelQueueItemFor(pod, "ImagePullBackOff"); } From d54a0470d242f0856af7729a850811f53ff0c55f Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 8 Dec 2021 17:20:53 -0500 Subject: [PATCH 15/28] Typo in `IGNORED_CONTAINER_TERMINATION_REASONS` --- .../kubernetes/pipeline/KubernetesRetryEligibility.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java index 60aa0c3309..ffae913fe0 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java @@ -41,7 +41,7 @@ public class KubernetesRetryEligibility implements ExecutorStepRetryEligibility private static final Set IGNORED_CONTAINER_TERMINATION_REASONS = new HashSet(); static { - IGNORED_CONTAINER_TERMINATION_REASONS.add("OOMKiller"); + IGNORED_CONTAINER_TERMINATION_REASONS.add("OOMKilled"); IGNORED_CONTAINER_TERMINATION_REASONS.add("Completed"); IGNORED_CONTAINER_TERMINATION_REASONS.add("DeadlineExceeded"); } @@ -61,7 +61,7 @@ public boolean shouldRetry(Throwable t, String node, String label, TaskListener LOGGER.fine(() -> "ignored termination reason(s) for " + node + ": " + terminationReasons); return false; } - LOGGER.fine(() -> "active on " + node); + LOGGER.fine(() -> "active on " + node + " (termination reasons: " + terminationReasons + ")"); listener.getLogger().println("Will retry failed node block from deleted pod " + node); return true; } From f45d69f98e1c36aae116336fcad28c149efa7e82 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 8 Dec 2021 18:23:24 -0500 Subject: [PATCH 16/28] https://github.com/jenkinsci/workflow-step-api-plugin/pull/73 released --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 85f96dc38b..5c107ea866 100644 --- a/pom.xml +++ b/pom.xml @@ -101,7 +101,7 @@ org.jenkins-ci.plugins.workflow workflow-step-api - 2.25-rc586.6e93334f902d + 604.vffcf73c782e7 org.jenkins-ci.plugins.workflow From 0d391e769f18545bdf02eeb0a8610953059ea194 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 8 Dec 2021 18:24:14 -0500 Subject: [PATCH 17/28] `RestartPipelineTest.terminatedPodAfterRestart` requires https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/180/commits/b65ff1cd10a77700c017ac5bf660b75de1eb8c56 --- pom.xml | 2 +- .../plugins/kubernetes/pipeline/RestartPipelineTest.java | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 5c107ea866..aea4a95b3c 100644 --- a/pom.xml +++ b/pom.xml @@ -139,7 +139,7 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1122.v2d03a46f6f49 + 1123.vb65ff1cd10a7 diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java index 810870feb6..76cfa195c0 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java @@ -49,6 +49,7 @@ import org.jenkinsci.plugins.workflow.job.WorkflowJob; import org.jenkinsci.plugins.workflow.job.WorkflowRun; import org.jenkinsci.plugins.workflow.steps.durable_task.DurableTaskStep; +import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepDynamicContext; import org.junit.BeforeClass; import org.junit.ClassRule; import org.junit.Rule; @@ -246,7 +247,7 @@ public void terminatedPodAfterRestart() throws Exception { projectName.set(b.getParent().getFullName()); r.waitForMessage("+ sleep", b); }); - logs.record(DurableTaskStep.class, Level.FINE).record(Reaper.class, Level.FINE); + logs.record(DurableTaskStep.class, Level.FINE).record(Reaper.class, Level.FINE).record(ExecutorStepDynamicContext.class, Level.FINE); story.then(r -> { setupHost(); // otherwise JenkinsLocationConfiguration will be clobbered WorkflowRun b = r.jenkins.getItemByFullName(projectName.get(), WorkflowJob.class).getBuildByNumber(1); From 494b7269fe39c54b9da0100035d3d72d28d3d054 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Fri, 10 Dec 2021 16:59:24 -0500 Subject: [PATCH 18/28] Picking up https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/180/commits/245514952d215052557b9a1bd3c4ffea86965912 --- pom.xml | 2 +- .../kubernetes/pipeline/KubernetesRetryEligibility.java | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index aea4a95b3c..4cc4f2a4e6 100644 --- a/pom.xml +++ b/pom.xml @@ -139,7 +139,7 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1123.vb65ff1cd10a7 + 1129.v245514952d21 diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java index ffae913fe0..4256a24ea6 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java @@ -24,6 +24,7 @@ import hudson.slaves.Cloud; import java.util.HashSet; import java.util.Set; +import java.util.logging.Level; import java.util.logging.Logger; import jenkins.model.Jenkins; import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; @@ -48,8 +49,8 @@ public class KubernetesRetryEligibility implements ExecutorStepRetryEligibility @Override public boolean shouldRetry(Throwable t, String node, String label, TaskListener listener) { - if (!ExecutorStepRetryEligibility.isRemovedNode(t)) { - LOGGER.fine(() -> "Not a RemovedNode failure: " + t); + if (!ExecutorStepRetryEligibility.isRemovedNode(t) && !ExecutorStepRetryEligibility.isClosedChannel(t)) { + LOGGER.log(Level.FINE, "Not a recognized failure", t); return false; } if (!isKubernetesAgent(node, label)) { From 2c885270c44dcd57b561ef665ee97c437d7c37bd Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Tue, 14 Dec 2021 17:48:48 -0500 Subject: [PATCH 19/28] Adapting to https://github.com/jenkinsci/workflow-durable-task-step-plugin/pull/180/commits/64c123a7a95b309cc2880e2652af0e6ec2db951c --- pom.xml | 2 +- .../plugins/kubernetes/pipeline/KubernetesRetryEligibility.java | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/pom.xml b/pom.xml index 4cc4f2a4e6..b8287617c2 100644 --- a/pom.xml +++ b/pom.xml @@ -139,7 +139,7 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1129.v245514952d21 + 1130.v64c123a7a95b diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java index 4256a24ea6..3fe526c35a 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java @@ -49,7 +49,7 @@ public class KubernetesRetryEligibility implements ExecutorStepRetryEligibility @Override public boolean shouldRetry(Throwable t, String node, String label, TaskListener listener) { - if (!ExecutorStepRetryEligibility.isRemovedNode(t) && !ExecutorStepRetryEligibility.isClosedChannel(t)) { + if (!ExecutorStepRetryEligibility.isGenerallyEligible(t)) { LOGGER.log(Level.FINE, "Not a recognized failure", t); return false; } From 2141a03fe3c1920171a26182910de3b79e41960d Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Mon, 2 May 2022 18:19:35 -0400 Subject: [PATCH 20/28] Initial work with `KubernetesAgentErrorCondition` --- pom.xml | 13 +- .../KubernetesAgentErrorCondition.java | 126 ++++++++++++++++++ .../pipeline/KubernetesRetryEligibility.java | 85 ------------ .../kubernetes/pipeline/SecretsMasker.java | 8 +- .../AbstractKubernetesPipelineTest.java | 2 +- .../pipeline/KubernetesPipelineTest.java | 3 +- .../pipeline/RestartPipelineTest.java | 2 +- .../kubernetes/pipeline/terminatedPod.groovy | 2 + .../pipeline/terminatedPodAfterRestart.groovy | 2 + 9 files changed, 148 insertions(+), 95 deletions(-) create mode 100644 src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java delete mode 100644 src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java diff --git a/pom.xml b/pom.xml index 9fa70bc462..5e2b7b6552 100644 --- a/pom.xml +++ b/pom.xml @@ -46,7 +46,7 @@ 8 - 2.303.3 + 2.332.1 false true @@ -100,11 +100,11 @@ org.jenkins-ci.plugins.workflow workflow-step-api - 604.vffcf73c782e7 org.jenkins-ci.plugins.workflow workflow-api + 1151.vb_c885fd4869b_ org.jenkinsci.plugins @@ -138,7 +138,7 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1130.v64c123a7a95b + 999999-SNAPSHOT @@ -150,6 +150,7 @@ org.jenkins-ci.plugins.workflow workflow-basic-steps + 999999-SNAPSHOT test @@ -256,8 +257,8 @@ io.jenkins.tools.bom - bom-2.303.x - 1090.v0a_33df40457a_ + bom-2.332.x + 1289.v5c4b_1c43511b_ import pom @@ -276,7 +277,7 @@ joda-time joda-time - 2.10.2 + 2.10.5 org.apache.commons diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java new file mode 100644 index 0000000000..9155d2c13b --- /dev/null +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java @@ -0,0 +1,126 @@ +/* + * Copyright 2021 CloudBees, Inc. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.csanchez.jenkins.plugins.kubernetes.pipeline; + +import hudson.Extension; +import hudson.ExtensionList; +import hudson.model.Node; +import hudson.model.labels.LabelAtom; +import java.io.IOException; +import java.util.HashSet; +import java.util.Set; +import java.util.logging.Logger; +import jenkins.model.Jenkins; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; +import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave; +import org.csanchez.jenkins.plugins.kubernetes.pod.retention.Reaper; +import org.jenkinsci.Symbol; +import org.jenkinsci.plugins.workflow.actions.ErrorAction; +import org.jenkinsci.plugins.workflow.actions.WorkspaceAction; +import org.jenkinsci.plugins.workflow.flow.ErrorCondition; +import org.jenkinsci.plugins.workflow.flow.FlowExecution; +import org.jenkinsci.plugins.workflow.graph.BlockEndNode; +import org.jenkinsci.plugins.workflow.graph.FlowNode; +import org.jenkinsci.plugins.workflow.graphanalysis.LinearBlockHoppingScanner; +import org.jenkinsci.plugins.workflow.steps.FlowInterruptedException; +import org.jenkinsci.plugins.workflow.steps.StepContext; +import org.jenkinsci.plugins.workflow.support.steps.AgentErrorCondition; +import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepExecution; +import org.kohsuke.stapler.DataBoundConstructor; + +/** + * Qualifies {@code node} blocks associated with {@link KubernetesSlave} to be retried if the node was deleted. + * A more specific version of {@link AgentErrorCondition}. + */ +public class KubernetesAgentErrorCondition extends ErrorCondition { + + private static final Logger LOGGER = Logger.getLogger(KubernetesAgentErrorCondition.class.getName()); + + private static final Set IGNORED_CONTAINER_TERMINATION_REASONS = new HashSet<>(); + static { + IGNORED_CONTAINER_TERMINATION_REASONS.add("OOMKilled"); + IGNORED_CONTAINER_TERMINATION_REASONS.add("Completed"); + IGNORED_CONTAINER_TERMINATION_REASONS.add("DeadlineExceeded"); + } + + @DataBoundConstructor public KubernetesAgentErrorCondition() {} + + @Override + public boolean test(Throwable t, StepContext context) throws IOException, InterruptedException { + if (context == null) { + LOGGER.fine("Cannot check error without context"); + return false; + } + if (!new AgentErrorCondition().test(t, context)) { + if (t instanceof FlowInterruptedException && ((FlowInterruptedException) t).getCauses().stream().anyMatch(ExecutorStepExecution.QueueTaskCancelled.class::isInstance)) { + LOGGER.fine(() -> "QueueTaskCancelled normally ignored by AgentErrorCondition but might be delivered here from Reaper.TerminateAgentOnContainerTerminated"); + // TODO cleaner to somehow suppress that QueueTaskCancelled and let the underlying RemovedNodeCause be delivered + } else { + LOGGER.fine(() -> "Not a recognized failure: " + t); + return false; + } + } + FlowNode _origin = ErrorAction.findOrigin(t, context.get(FlowExecution.class)); + if (_origin == null) { + LOGGER.fine(() -> "No recognized origin of error: " + t); + return false; + } + FlowNode origin = _origin instanceof BlockEndNode ? ((BlockEndNode) _origin).getStartNode() : _origin; + LOGGER.fine(() -> "Found origin " + origin + " " + origin.getDisplayFunctionName()); + LinearBlockHoppingScanner scanner = new LinearBlockHoppingScanner(); + scanner.setup(origin); + for (FlowNode callStack : scanner) { + WorkspaceAction ws = callStack.getPersistentAction(WorkspaceAction.class); + if (ws != null) { + String node = ws.getNode(); + Node n = Jenkins.get().getNode(node); + if (n != null) { + if (!(n instanceof KubernetesSlave)) { + LOGGER.fine(() -> node + " was not a K8s agent"); + return false; + } + } else { + // May have been removed already, but we can look up the labels to see what it was. + Set labels = ws.getLabels(); + if (labels.stream().noneMatch(l -> Jenkins.get().clouds.stream().anyMatch(c -> c instanceof KubernetesCloud && ((KubernetesCloud) c).getTemplate(l) != null))) { + LOGGER.fine(() -> node + " was not a K8s agent judging by " + labels); + return false; + } + } + Set terminationReasons = ExtensionList.lookupSingleton(Reaper.class).terminationReasons(node); + if (terminationReasons.stream().anyMatch(r -> IGNORED_CONTAINER_TERMINATION_REASONS.contains(r))) { + LOGGER.fine(() -> "ignored termination reason(s) for " + node + ": " + terminationReasons); + return false; + } + LOGGER.fine(() -> "active on " + node + " (termination reasons: " + terminationReasons + ")"); + return true; + } + } + LOGGER.fine(() -> "found no WorkspaceAction starting from " + origin); + return false; + } + + @Symbol("kubernetesAgent") + @Extension public static final class DescriptorImpl extends ErrorConditionDescriptor { + + @Override public String getDisplayName() { + return "Kubernetes agent errors"; + } + + } + +} diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java deleted file mode 100644 index 3fe526c35a..0000000000 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesRetryEligibility.java +++ /dev/null @@ -1,85 +0,0 @@ -/* - * Copyright 2021 CloudBees, Inc. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.csanchez.jenkins.plugins.kubernetes.pipeline; - -import hudson.Extension; -import hudson.ExtensionList; -import hudson.model.Label; -import hudson.model.Node; -import hudson.model.TaskListener; -import hudson.slaves.Cloud; -import java.util.HashSet; -import java.util.Set; -import java.util.logging.Level; -import java.util.logging.Logger; -import jenkins.model.Jenkins; -import org.csanchez.jenkins.plugins.kubernetes.KubernetesCloud; -import org.csanchez.jenkins.plugins.kubernetes.KubernetesSlave; -import org.csanchez.jenkins.plugins.kubernetes.pod.retention.Reaper; -import org.jenkinsci.plugins.workflow.support.steps.ExecutorStepRetryEligibility; - -/** - * Qualifies {@code node} blocks associated with {@link KubernetesSlave} to be retried if the node was deleted. - */ -@Extension -public class KubernetesRetryEligibility implements ExecutorStepRetryEligibility { - - private static final Logger LOGGER = Logger.getLogger(KubernetesRetryEligibility.class.getName()); - - private static final Set IGNORED_CONTAINER_TERMINATION_REASONS = new HashSet(); - static { - IGNORED_CONTAINER_TERMINATION_REASONS.add("OOMKilled"); - IGNORED_CONTAINER_TERMINATION_REASONS.add("Completed"); - IGNORED_CONTAINER_TERMINATION_REASONS.add("DeadlineExceeded"); - } - - @Override - public boolean shouldRetry(Throwable t, String node, String label, TaskListener listener) { - if (!ExecutorStepRetryEligibility.isGenerallyEligible(t)) { - LOGGER.log(Level.FINE, "Not a recognized failure", t); - return false; - } - if (!isKubernetesAgent(node, label)) { - LOGGER.fine(() -> node + " was not a K8s agent"); - return false; - } - Set terminationReasons = ExtensionList.lookupSingleton(Reaper.class).terminationReasons(node); - if (terminationReasons.stream().anyMatch(r -> IGNORED_CONTAINER_TERMINATION_REASONS.contains(r))) { - LOGGER.fine(() -> "ignored termination reason(s) for " + node + ": " + terminationReasons); - return false; - } - LOGGER.fine(() -> "active on " + node + " (termination reasons: " + terminationReasons + ")"); - listener.getLogger().println("Will retry failed node block from deleted pod " + node); - return true; - } - - private static boolean isKubernetesAgent(String node, String label) { - Node current = Jenkins.get().getNode(node); - if (current instanceof KubernetesSlave) { - return true; - } else if (current == null) { - Label l = Label.get(label); - for (Cloud c : Jenkins.get().clouds) { - if (c instanceof KubernetesCloud && ((KubernetesCloud) c).getTemplate(l) != null) { - return true; - } - } - } - return false; - } - -} diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/SecretsMasker.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/SecretsMasker.java index 60f51458fd..d65d1edbc6 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/SecretsMasker.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/SecretsMasker.java @@ -84,7 +84,13 @@ protected Class type() { @Override protected TaskListenerDecorator get(DelegatedContext context) throws IOException, InterruptedException { - KubernetesComputer c = context.get(KubernetesComputer.class); + KubernetesComputer c; + try { + c = context.get(KubernetesComputer.class); + } catch (IOException | InterruptedException x) { + LOGGER.log(Level.FINE, "Unable to look up KubernetesComputer", x); + return null; + } if (c == null) { return null; } diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java index 26cc71fe72..81ab16ba1f 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/AbstractKubernetesPipelineTest.java @@ -82,7 +82,7 @@ public abstract class AbstractKubernetesPipelineTest { .recordPackage(KubernetesCloud.class, Level.FINE) .recordPackage(NoDelayProvisionerStrategy.class, Level.FINE) .record(NodeProvisioner.class, Level.FINE) - .record(KubernetesRetryEligibility.class, Level.FINE); + .record(KubernetesAgentErrorCondition.class, Level.FINE); @BeforeClass public static void isKubernetesConfigured() throws Exception { diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java index 15e888182b..d9cd29faaf 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesPipelineTest.java @@ -471,12 +471,13 @@ public void runInPodWithRetention() throws Exception { @Issue("JENKINS-49707") @Test public void terminatedPod() throws Exception { + logs.record(KubernetesAgentErrorCondition.class, Level.FINE); r.waitForMessage("+ sleep", b); deletePods(cloud.connect(), getLabels(this, name), false); r.waitForMessage("busybox --", b); r.waitForMessage("jnlp --", b); r.waitForMessage("was deleted; cancelling node body", b); - r.waitForMessage("Will retry failed node block from deleted pod", b); + r.waitForMessage("Retrying", b); r.assertBuildStatusSuccess(r.waitForCompletion(b)); } diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java index ba9c8b6ced..e86815f2bf 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java @@ -253,7 +253,7 @@ public void terminatedPodAfterRestart() throws Exception { r.waitForMessage("Ready to run", b); deletePods(cloud.connect(), getLabels(this, name), false); r.waitForMessage("assuming it is not coming back", b); - r.waitForMessage("Will retry failed node block from deleted pod", b); + r.waitForMessage("Retrying", b); r.assertBuildStatusSuccess(r.waitForCompletion(b)); }); } diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy index 6e20fbb07f..e10bdd5c82 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy @@ -8,10 +8,12 @@ spec: - 99d terminationGracePeriodSeconds: 3 ''') { + retry(count: 2, errorConditions: [kubernetesAgent()]) { node(POD_LABEL) { container('busybox') { sh 'echo hello world' sh 'sleep 15' } } + } } diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy index 916ca3d284..962075aa5d 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy @@ -10,9 +10,11 @@ spec: - 99d terminationGracePeriodSeconds: 3 ''') { + retry(count: 2, errorConditions: [kubernetesAgent()]) { node(POD_LABEL) { container('busybox') { sh 'sleep 15' } } + } } From 71510c6e35a49ac15d5a8650a6ff880589b0f495 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Tue, 3 May 2022 17:28:07 -0400 Subject: [PATCH 21/28] Pick up incremental builds --- pom.xml | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/pom.xml b/pom.xml index 5e2b7b6552..aa6743ee42 100644 --- a/pom.xml +++ b/pom.xml @@ -138,7 +138,7 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 999999-SNAPSHOT + 1189.v39b_0b_d397ccf @@ -150,7 +150,7 @@ org.jenkins-ci.plugins.workflow workflow-basic-steps - 999999-SNAPSHOT + 956.v357f20c65eb_f test @@ -237,6 +237,12 @@ io.jenkins.configuration-as-code test-harness test + + + org.jenkins-ci.main + jenkins-test-harness + + org.jenkins-ci.plugins @@ -258,7 +264,7 @@ io.jenkins.tools.bom bom-2.332.x - 1289.v5c4b_1c43511b_ + 1342.v729ca_3818e88 import pom From 52c55ffcc3cfbf58ffc51566f9c455d84731c7e5 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Thu, 12 May 2022 09:14:16 -0400 Subject: [PATCH 22/28] Comment --- .../kubernetes/pipeline/KubernetesAgentErrorCondition.java | 1 + 1 file changed, 1 insertion(+) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java index 9155d2c13b..345ea013b1 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesAgentErrorCondition.java @@ -69,6 +69,7 @@ public boolean test(Throwable t, StepContext context) throws IOException, Interr if (t instanceof FlowInterruptedException && ((FlowInterruptedException) t).getCauses().stream().anyMatch(ExecutorStepExecution.QueueTaskCancelled.class::isInstance)) { LOGGER.fine(() -> "QueueTaskCancelled normally ignored by AgentErrorCondition but might be delivered here from Reaper.TerminateAgentOnContainerTerminated"); // TODO cleaner to somehow suppress that QueueTaskCancelled and let the underlying RemovedNodeCause be delivered + // (or just let AgentErrorCondition trigger on QueueTaskCancelled) } else { LOGGER.fine(() -> "Not a recognized failure: " + t); return false; From 417f590534227220cd86bb693b167b0095dd3bd4 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Thu, 12 May 2022 09:21:18 -0400 Subject: [PATCH 23/28] Updating deps --- pom.xml | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index aa6743ee42..0acbefa554 100644 --- a/pom.xml +++ b/pom.xml @@ -104,7 +104,7 @@ org.jenkins-ci.plugins.workflow workflow-api - 1151.vb_c885fd4869b_ + 1159.v27cb_4545c3ff org.jenkinsci.plugins @@ -120,6 +120,7 @@ org.jenkins-ci.plugins.workflow workflow-cps + 2691.va_688a_c3d8fd0 true @@ -138,19 +139,20 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1189.v39b_0b_d397ccf + 1200.v7231de192754 org.jenkins-ci.plugins.workflow workflow-job + 1181.vcea_0362753c3 test org.jenkins-ci.plugins.workflow workflow-basic-steps - 956.v357f20c65eb_f + 960.v0004499239c3 test @@ -168,6 +170,7 @@ org.jenkins-ci.plugins.workflow workflow-cps tests + 2691.va_688a_c3d8fd0 test @@ -264,7 +267,7 @@ io.jenkins.tools.bom bom-2.332.x - 1342.v729ca_3818e88 + 1370.vfa_e23fe119c3 import pom From 2f5c29739723f49dc8dbc07be34aa5fc43c34e04 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Thu, 12 May 2022 09:59:12 -0400 Subject: [PATCH 24/28] Expiring `terminationReasons` entries after a day https://github.com/jenkinsci/kubernetes-plugin/pull/1083#discussion_r765262823 --- .../kubernetes/pod/retention/Reaper.java | 29 +++++++++++++------ .../pipeline/RestartPipelineTest.java | 2 +- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index abdda78957..fd4b749077 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -16,6 +16,9 @@ package org.csanchez.jenkins.plugins.kubernetes.pod.retention; +import com.github.benmanes.caffeine.cache.CacheLoader; +import com.github.benmanes.caffeine.cache.Caffeine; +import com.github.benmanes.caffeine.cache.LoadingCache; import edu.umd.cs.findbugs.annotations.NonNull; import hudson.Extension; import hudson.ExtensionList; @@ -52,6 +55,7 @@ import java.util.HashSet; import java.util.Map; import java.util.Set; +import java.util.concurrent.ConcurrentSkipListSet; import java.util.concurrent.TimeUnit; import jenkins.model.Jenkins; import jenkins.util.Timer; @@ -98,7 +102,9 @@ public static Reaper getInstance() { private Watch watch; - private final Map> terminationReasons = new HashMap<>(); + private final LoadingCache> terminationReasons = Caffeine.newBuilder(). + expireAfterAccess(1, TimeUnit.DAYS). + build(k -> new ConcurrentSkipListSet<>()); @Override public void preLaunch(Computer c, TaskListener taskListener) throws IOException, InterruptedException { @@ -174,7 +180,7 @@ public void eventReceived(Watcher.Action action, Pod pod) { } ExtensionList.lookup(Listener.class).forEach(listener -> { try { - listener.onEvent(action, optionalNode.get(), pod, terminationReasons.computeIfAbsent(optionalNode.get().getNodeName(), k -> new HashSet<>())); + listener.onEvent(action, optionalNode.get(), pod, terminationReasons.get(optionalNode.get().getNodeName())); } catch (Exception x) { LOGGER.log(Level.WARNING, "Listener " + listener + " failed for " + ns + "/" + name, x); } @@ -212,8 +218,7 @@ private void closeWatch() { @NonNull public Set terminationReasons(@NonNull String node) { synchronized (terminationReasons) { - Set reasons = terminationReasons.get(node); - return reasons == null ? Collections.emptySet() : new HashSet<>(reasons); + return new HashSet<>(terminationReasons.get(node)); } } @@ -262,8 +267,11 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN terminatedContainers.forEach(c -> { ContainerStateTerminated t = c.getState().getTerminated(); LOGGER.info(() -> ns + "/" + name + " Container " + c.getName() + " was just terminated, so removing the corresponding Jenkins agent"); - runListener.getLogger().printf("%s/%s Container %s was terminated (Exit Code: %d, Reason: %s)%n", ns, name, c.getName(), t.getExitCode(), t.getReason()); - terminationReasons.add(t.getReason()); + String reason = t.getReason(); + runListener.getLogger().printf("%s/%s Container %s was terminated (Exit Code: %d, Reason: %s)%n", ns, name, c.getName(), t.getExitCode(), reason); + if (reason != null) { + terminationReasons.add(reason); + } }); try (ACLContext _ = ACL.as(ACL.SYSTEM)) { PodUtils.cancelQueueItemFor(pod, "ContainerError"); @@ -284,9 +292,12 @@ public void onEvent(@NonNull Action action, @NonNull KubernetesSlave node, @NonN String ns = pod.getMetadata().getNamespace(); String name = pod.getMetadata().getName(); TaskListener runListener = node.getTemplate().getListener(); - LOGGER.info(() -> ns + "/" + name + " Pod just failed. Removing the corresponding Jenkins agent. Reason: " + pod.getStatus().getReason() + ", Message: " + pod.getStatus().getMessage()); - runListener.getLogger().printf("%s/%s Pod just failed (Reason: %s, Message: %s)%n", ns, name, pod.getStatus().getReason(), pod.getStatus().getMessage()); - terminationReasons.add(pod.getStatus().getReason()); + String reason = pod.getStatus().getReason(); + LOGGER.info(() -> ns + "/" + name + " Pod just failed. Removing the corresponding Jenkins agent. Reason: " + reason + ", Message: " + pod.getStatus().getMessage()); + runListener.getLogger().printf("%s/%s Pod just failed (Reason: %s, Message: %s)%n", ns, name, reason, pod.getStatus().getMessage()); + if (reason != null) { + terminationReasons.add(reason); + } logLastLinesThenTerminateNode(node, pod, runListener); } } diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java index e86815f2bf..2bc9ffa2be 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/RestartPipelineTest.java @@ -252,7 +252,7 @@ public void terminatedPodAfterRestart() throws Exception { WorkflowRun b = r.jenkins.getItemByFullName(projectName.get(), WorkflowJob.class).getBuildByNumber(1); r.waitForMessage("Ready to run", b); deletePods(cloud.connect(), getLabels(this, name), false); - r.waitForMessage("assuming it is not coming back", b); + r.waitForMessage("Agent was removed", b); r.waitForMessage("Retrying", b); r.assertBuildStatusSuccess(r.waitForCompletion(b)); }); From d918d8b55ec11ab81a7a2d0976684616728403b2 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Thu, 12 May 2022 10:49:02 -0400 Subject: [PATCH 25/28] SpotBugs --- .../jenkins/plugins/kubernetes/pod/retention/Reaper.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java index fd4b749077..d513219606 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pod/retention/Reaper.java @@ -20,6 +20,7 @@ import com.github.benmanes.caffeine.cache.Caffeine; import com.github.benmanes.caffeine.cache.LoadingCache; import edu.umd.cs.findbugs.annotations.NonNull; +import edu.umd.cs.findbugs.annotations.SuppressFBWarnings; import hudson.Extension; import hudson.ExtensionList; import hudson.ExtensionPoint; @@ -215,6 +216,7 @@ private void closeWatch() { * @param node a {@link Node#getNodeName} * @return a possibly empty set of {@link ContainerStateTerminated#getReason} or {@link PodStatus#getReason} */ + @SuppressFBWarnings(value = "NP_NULL_ON_SOME_PATH_FROM_RETURN_VALUE", justification = "Confused by @org.checkerframework.checker.nullness.qual.Nullable on LoadingCache.get? Never null here.") @NonNull public Set terminationReasons(@NonNull String node) { synchronized (terminationReasons) { From 5d2e4d8e1384ba7a8d08c0a05eed5d2b1f435b48 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Thu, 19 May 2022 18:41:37 -0400 Subject: [PATCH 26/28] =?UTF-8?q?`errorConditions`=20=E2=86=92=20`conditio?= =?UTF-8?q?ns`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- pom.xml | 4 ++-- .../jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy | 2 +- .../kubernetes/pipeline/terminatedPodAfterRestart.groovy | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/pom.xml b/pom.xml index 900bc39dce..161411dee8 100644 --- a/pom.xml +++ b/pom.xml @@ -105,7 +105,7 @@ org.jenkins-ci.plugins.workflow workflow-api - 1159.v27cb_4545c3ff + 1166.v459a_d69b_3271 org.jenkinsci.plugins @@ -153,7 +153,7 @@ org.jenkins-ci.plugins.workflow workflow-basic-steps - 960.v0004499239c3 + 965.v145f716def48 test diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy index e10bdd5c82..73f1f453ee 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPod.groovy @@ -8,7 +8,7 @@ spec: - 99d terminationGracePeriodSeconds: 3 ''') { - retry(count: 2, errorConditions: [kubernetesAgent()]) { + retry(count: 2, conditions: [kubernetesAgent()]) { node(POD_LABEL) { container('busybox') { sh 'echo hello world' diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy index 962075aa5d..b4b5ef61ab 100644 --- a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/terminatedPodAfterRestart.groovy @@ -10,7 +10,7 @@ spec: - 99d terminationGracePeriodSeconds: 3 ''') { - retry(count: 2, errorConditions: [kubernetesAgent()]) { + retry(count: 2, conditions: [kubernetesAgent()]) { node(POD_LABEL) { container('busybox') { sh 'sleep 15' From e912f1c7e28d402fcabfa7b22b7c0d7913b202bf Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Wed, 25 May 2022 18:38:16 -0400 Subject: [PATCH 27/28] Pick up https://github.com/jenkinsci/pipeline-model-definition-plugin/pull/533 to add a `retries` option to `agent kubernetes` --- pom.xml | 42 +++++++++++++++--- .../pipeline/KubernetesDeclarativeAgent.java | 4 +- .../KubernetesDeclarativeAgent/config.jelly | 3 +- .../KubernetesDeclarativeAgentScript.groovy | 43 +++++++++++-------- .../KubernetesDeclarativeAgentTest.java | 16 +++++++ .../pipeline/declarativeRetries.groovy | 26 +++++++++++ 6 files changed, 109 insertions(+), 25 deletions(-) create mode 100644 src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/declarativeRetries.groovy diff --git a/pom.xml b/pom.xml index 161411dee8..79ae8e1637 100644 --- a/pom.xml +++ b/pom.xml @@ -50,6 +50,7 @@ false true jenkinsci/${project.artifactId}-plugin + 999999-SNAPSHOT @@ -121,7 +122,7 @@ org.jenkins-ci.plugins.workflow workflow-cps - 2691.va_688a_c3d8fd0 + 2713.v8b_f3c8cb_97a_0 true @@ -140,20 +141,20 @@ org.jenkins-ci.plugins.workflow workflow-durable-task-step - 1200.v7231de192754 + 1204.v2a_40b_1127a_a_8 org.jenkins-ci.plugins.workflow workflow-job - 1181.vcea_0362753c3 + 1183.vf1b_f64585092 test org.jenkins-ci.plugins.workflow workflow-basic-steps - 965.v145f716def48 + 967.v241f45274471 test @@ -171,7 +172,7 @@ org.jenkins-ci.plugins.workflow workflow-cps tests - 2691.va_688a_c3d8fd0 + 2713.v8b_f3c8cb_97a_0 test @@ -272,6 +273,37 @@ import pom + + org.jenkins-ci.plugins + script-security + 1172.v35f6a_0b_8207e + + + org.jenkinsci.plugins + pipeline-model-api + ${pipeline-model-definition-plugin.version} + + + org.jenkinsci.plugins + pipeline-model-definition + ${pipeline-model-definition-plugin.version} + + + org.jenkinsci.plugins + pipeline-model-definition + ${pipeline-model-definition-plugin.version} + tests + + + org.jenkinsci.plugins + pipeline-model-extensions + ${pipeline-model-definition-plugin.version} + + + org.jenkinsci.plugins + pipeline-stage-tags-metadata + ${pipeline-model-definition-plugin.version} + org.jenkins-ci.plugins diff --git a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgent.java b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgent.java index d3a397b359..103962e3ea 100644 --- a/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgent.java +++ b/src/main/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgent.java @@ -13,7 +13,6 @@ import org.csanchez.jenkins.plugins.kubernetes.pod.yaml.YamlMergeStrategy; import org.csanchez.jenkins.plugins.kubernetes.volumes.workspace.WorkspaceVolume; import org.jenkinsci.Symbol; -import org.jenkinsci.plugins.pipeline.modeldefinition.agent.DeclarativeAgent; import org.jenkinsci.plugins.pipeline.modeldefinition.agent.DeclarativeAgentDescriptor; import org.jenkinsci.plugins.variant.OptionalExtension; import org.kohsuke.accmod.Restricted; @@ -30,8 +29,9 @@ import java.util.logging.Level; import java.util.logging.Logger; import java.util.stream.Collectors; +import org.jenkinsci.plugins.pipeline.modeldefinition.agent.RetryableDeclarativeAgent; -public class KubernetesDeclarativeAgent extends DeclarativeAgent { +public class KubernetesDeclarativeAgent extends RetryableDeclarativeAgent { private static final Logger LOGGER = Logger.getLogger(KubernetesDeclarativeAgent.class.getName()); diff --git a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgent/config.jelly b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgent/config.jelly index c023738226..78a4c0047d 100644 --- a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgent/config.jelly +++ b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgent/config.jelly @@ -1,5 +1,5 @@ - + @@ -49,5 +49,6 @@ + diff --git a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgentScript.groovy b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgentScript.groovy index 8023becaeb..fe316e2e19 100644 --- a/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgentScript.groovy +++ b/src/main/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgentScript.groovy @@ -49,27 +49,36 @@ public class KubernetesDeclarativeAgentScript extends DeclarativeAgentScript 1) { + script.retry(count: describable.retries, conditions: [script.kubernetesAgent(), script.nonresumable()]) { + run.call() + } + } else { + run.call() } } } diff --git a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgentTest.java b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgentTest.java index e18d27eeae..b965e225f4 100644 --- a/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgentTest.java +++ b/src/test/java/org/csanchez/jenkins/plugins/kubernetes/pipeline/KubernetesDeclarativeAgentTest.java @@ -33,6 +33,8 @@ import hudson.model.Result; import jenkins.plugins.git.GitSampleRepoRule; import jenkins.plugins.git.GitStep; +import static org.csanchez.jenkins.plugins.kubernetes.KubernetesTestUtil.deletePods; +import static org.csanchez.jenkins.plugins.kubernetes.KubernetesTestUtil.getLabels; import org.csanchez.jenkins.plugins.kubernetes.pod.retention.OnFailure; import org.jenkinsci.plugins.structs.describable.UninstantiatedDescribable; import org.jenkinsci.plugins.workflow.actions.ArgumentsAction; @@ -194,4 +196,18 @@ public void declarativeShowRawYamlFalse() throws Exception { // check yaml metadata labels not logged r.assertLogNotContains("class: KubernetesDeclarativeAgentTest", b); } + + @Issue("JENKINS-49707") + @Test + public void declarativeRetries() throws Exception { + assertNotNull(createJobThenScheduleRun()); + r.waitForMessage("+ sleep", b); + deletePods(cloud.connect(), getLabels(this, name), false); + r.waitForMessage("busybox --", b); + r.waitForMessage("jnlp --", b); + r.waitForMessage("was deleted; cancelling node body", b); + r.waitForMessage("Retrying", b); + r.assertBuildStatusSuccess(r.waitForCompletion(b)); + } + } diff --git a/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/declarativeRetries.groovy b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/declarativeRetries.groovy new file mode 100644 index 0000000000..bf63cb20fc --- /dev/null +++ b/src/test/resources/org/csanchez/jenkins/plugins/kubernetes/pipeline/declarativeRetries.groovy @@ -0,0 +1,26 @@ +pipeline { + agent { + kubernetes { + yaml ''' +spec: + containers: + - name: busybox + image: busybox + command: + - sleep + - 99d + terminationGracePeriodSeconds: 3 + ''' + defaultContainer 'busybox' + retries 2 + } + } + stages { + stage('Run') { + steps { + sh 'echo hello world' + sh 'sleep 15' + } + } + } +} From da10da221cd52a669807eb9fe63f12c4fd593a09 Mon Sep 17 00:00:00 2001 From: Jesse Glick Date: Thu, 26 May 2022 09:17:06 -0400 Subject: [PATCH 28/28] Got an incremental deployment of https://github.com/jenkinsci/pipeline-model-definition-plugin/pull/533 --- pom.xml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pom.xml b/pom.xml index 79ae8e1637..c7fb61a93c 100644 --- a/pom.xml +++ b/pom.xml @@ -50,7 +50,7 @@ false true jenkinsci/${project.artifactId}-plugin - 999999-SNAPSHOT + 2.2093.v2fb_2b_ea_cfc23