Skip to content

Commit

Permalink
Merge pull request #768 from Metaswitch/feature-jenkins-69399-prevent…
Browse files Browse the repository at this point in the history
…-node-offlining

[JENKINS-69399] Prevent offlining ec2 node if jobs in node queue
  • Loading branch information
res0nance authored Aug 25, 2022
2 parents 0e129de + 772f8da commit 1a236e4
Show file tree
Hide file tree
Showing 2 changed files with 224 additions and 2 deletions.
34 changes: 32 additions & 2 deletions src/main/java/hudson/plugins/ec2/EC2RetentionStrategy.java
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
import hudson.model.ExecutorListener;
import hudson.model.Queue;
import hudson.plugins.ec2.util.MinimumInstanceChecker;
import hudson.model.Label;
import hudson.slaves.RetentionStrategy;
import jenkins.model.Jenkins;

Expand Down Expand Up @@ -199,7 +200,8 @@ private long internalCheck(EC2Computer computer) {
// TODO: really think about the right strategy here, see
// JENKINS-23792

if (idleMilliseconds > TimeUnit.MINUTES.toMillis(idleTerminationMinutes)) {
if (idleMilliseconds > TimeUnit.MINUTES.toMillis(idleTerminationMinutes) &&
!itemsInQueueForThisSlave(computer)){

LOGGER.info("Idle timeout of " + computer.getName() + " after "
+ TimeUnit.MILLISECONDS.toMinutes(idleMilliseconds) +
Expand All @@ -218,7 +220,7 @@ private long internalCheck(EC2Computer computer) {
// if we have less "free" (aka already paid for) time left than
// our idle time, stop/terminate the instance
// See JENKINS-23821
if (freeSecondsLeft <= TimeUnit.MINUTES.toSeconds(Math.abs(idleTerminationMinutes))) {
if (freeSecondsLeft <= TimeUnit.MINUTES.toSeconds(Math.abs(idleTerminationMinutes)) && !itemsInQueueForThisSlave(computer)) {
LOGGER.info("Idle timeout of " + computer.getName() + " after "
+ TimeUnit.MILLISECONDS.toMinutes(idleMilliseconds) + " idle minutes, with "
+ TimeUnit.SECONDS.toMinutes(freeSecondsLeft)
Expand All @@ -233,6 +235,34 @@ private long internalCheck(EC2Computer computer) {
return 1;
}

/*
* Checks if there are any items in the queue that are waiting for this node explicitly.
* This prevents a node from being taken offline while there are Ivy/Maven Modules waiting to build.
* Need to check entire queue as some modules may be blocked by upstream dependencies.
* Accessing the queue in this way can block other threads, so only perform this check just prior
* to timing out the slave.
*/
private boolean itemsInQueueForThisSlave(EC2Computer c) {
final EC2AbstractSlave selfNode = c.getNode();
/* null checking is required here because in the event that a computer
* doesn't have a node it will return null. In this case we want to
* return false because there's no slave to prevent a timeout of.
*/
if (selfNode == null) return false;
final Label selfLabel = selfNode.getSelfLabel();
Queue.Item[] items = Jenkins.getInstance().getQueue().getItems();
for (int i = 0; i < items.length; i++) {
Queue.Item item = items[i];
final Label assignedLabel = item.getAssignedLabel();
if (assignedLabel == selfLabel) {
LOGGER.fine("Preventing idle timeout of " + c.getName()
+ " as there is at least one item in the queue explicitly waiting for this slave");
return true;
}
}
return false;
}

/**
* Called when a new {@link EC2Computer} object is introduced (such as when Hudson started, or when
* a new agent is added.)
Expand Down
192 changes: 192 additions & 0 deletions src/test/java/hudson/plugins/ec2/EC2RetentionStrategyTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,19 +4,41 @@
import com.amazonaws.services.ec2.model.InstanceType;
import hudson.model.Executor;
import hudson.model.Node;
import hudson.model.Label;
import hudson.model.Queue;
import hudson.model.ResourceList;
import hudson.model.Queue.Executable;
import hudson.model.Queue.Task;
import hudson.model.queue.CauseOfBlockage;
import hudson.plugins.ec2.util.AmazonEC2FactoryMockImpl;
import hudson.plugins.ec2.util.MinimumInstanceChecker;
import hudson.plugins.ec2.util.MinimumNumberOfInstancesTimeRangeConfig;
import hudson.plugins.ec2.util.PrivateKeyHelper;
import hudson.plugins.ec2.util.SSHCredentialHelper;
import hudson.security.ACL;
import hudson.security.AccessControlled;
import hudson.security.Permission;import hudson.model.Executor;
import hudson.slaves.NodeProperty;
import hudson.slaves.OfflineCause;
import jenkins.util.NonLocalizable;
import jenkins.model.Jenkins;
import net.sf.json.JSONObject;
import org.acegisecurity.Authentication;
import org.junit.Rule;
import org.junit.Test;
import org.jvnet.hudson.test.JenkinsRule;
import org.testcontainers.shaded.org.bouncycastle.jce.provider.BouncyCastleProvider;
import org.jvnet.hudson.test.LoggerRule;

import javax.annotation.Nonnull;
import java.security.Security;
import java.time.Clock;
import java.time.Duration;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.ZoneId;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
Expand All @@ -31,6 +53,7 @@
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasItem;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertFalse;
import static org.junit.Assert.assertTrue;
import org.junit.Rule;
import org.junit.Test;
Expand Down Expand Up @@ -75,6 +98,175 @@ public void testOnBillingHourRetention() throws Exception {
}
}

@Test
public void testRetentionWhenQueueHasWaitingItemForThisNode() throws Exception {
EC2RetentionStrategy rs = new EC2RetentionStrategy("-2");
EC2Computer computer = computerWithIdleTime(59, 0);
final Label selfLabel = computer.getNode().getSelfLabel();
final Queue queue = Jenkins.get().getQueue();
final Task task = taskForLabel(selfLabel, false);
queue.schedule(task, 500);
checkRetentionStrategy(rs, computer);
assertFalse("Expected computer to be left running", idleTimeoutCalled.get());
queue.cancel(task);
EC2RetentionStrategy rs2 = new EC2RetentionStrategy("-2");
checkRetentionStrategy(rs2, computer);
assertTrue("Expected computer to be idled", idleTimeoutCalled.get());
}

@Test
public void testRetentionWhenQueueHasBlockedItemForThisNode() throws Exception {
EC2RetentionStrategy rs = new EC2RetentionStrategy("-2");
EC2Computer computer = computerWithIdleTime(59, 0);
final Label selfLabel = computer.getNode().getSelfLabel();
final Queue queue = Jenkins.get().getQueue();
final Task task = taskForLabel(selfLabel, true);
queue.schedule(task, 0);
checkRetentionStrategy(rs, computer);
assertFalse("Expected computer to be left running", idleTimeoutCalled.get());
queue.cancel(task);
EC2RetentionStrategy rs2 = new EC2RetentionStrategy("-2");
checkRetentionStrategy(rs2, computer);
assertTrue("Expected computer to be idled", idleTimeoutCalled.get());
}

private interface AccessControlledTask extends Queue.Task, AccessControlled {
}

private Queue.Task taskForLabel(final Label label, boolean blocked) {
final CauseOfBlockage cob = blocked ? new CauseOfBlockage() {
@Override
public String getShortDescription() {
return "Blocked";
}
} : null;
return new AccessControlledTask() {
@Nonnull
public ACL getACL() {
return new ACL() {
public boolean hasPermission(@Nonnull Authentication a, @Nonnull Permission permission) {
return true;
}
};
}
public ResourceList getResourceList() {
return null;
}

public Node getLastBuiltOn() {
return null;
}

public long getEstimatedDuration() {
return -1;
}

public Label getAssignedLabel() {
return label;
}

public Executable createExecutable() {
return null;
}

public String getDisplayName() {
return null;
}

@Override
public CauseOfBlockage getCauseOfBlockage() {
return cob;
}

public boolean hasAbortPermission() {
return false;
}

public String getUrl() {
return null;
}

public String getName() {
return null;
}

public String getFullDisplayName() {
return null;
}

public void checkAbortPermission() {
}
};
}

private EC2Computer computerWithIdleTime(final int minutes, final int seconds) throws Exception {
return computerWithIdleTime(minutes, seconds, false, null);
}

/*
* Creates a computer with the params passed. If isOnline is null, the computer returns the real value, otherwise,
* the computer returns the value established.
*/
private EC2Computer computerWithIdleTime(final int minutes, final int seconds, final Boolean isOffline, final Boolean isConnecting) throws Exception {
final EC2AbstractSlave slave = new EC2AbstractSlave("name", "id", "description", "fs", 1, null, "label", null, null, "init", "tmpDir", new ArrayList<NodeProperty<?>>(), "remote", "jvm", false, "idle", null, "cloud", false, Integer.MAX_VALUE, null, ConnectionStrategy.PRIVATE_IP, -1) {
@Override
public void terminate() {
}

@Override
public String getEc2Type() {
return null;
}

@Override
void idleTimeout() {
idleTimeoutCalled.set(true);
}
};
EC2Computer computer = new EC2Computer(slave) {

private final long launchedAtMs = Instant.now().minus(Duration.ofSeconds(minutes * 60L + seconds)).toEpochMilli();

@Override
public EC2AbstractSlave getNode() {
return slave;
}

@Override
public long getUptime() throws AmazonClientException, InterruptedException {
return ((minutes * 60L) + seconds) * 1000L;
}

@Override
public long getLaunchTime() throws InterruptedException {
return this.launchedAtMs;
}

@Override
public boolean isOffline() {
return isOffline == null ? super.isOffline() : isOffline;
}

@Override
public InstanceState getState() {
return InstanceState.RUNNING;
}

@Override
public SlaveTemplate getSlaveTemplate() {
return new SlaveTemplate("ami-123", EC2AbstractSlave.TEST_ZONE, null, "default", "foo", InstanceType.M1Large, false, "ttt", Node.Mode.NORMAL, "AMI description", "bar", "bbb", "aaa", "10", "fff", null, "-Xmx1g", false, "subnet-123 subnet-456", null, null, true, null, "", false, false, "", false, "");
}

@Override
public boolean isConnecting() {
return isConnecting == null ? super.isConnecting() : isConnecting;
}
};
assertTrue(computer.isIdle());
assertTrue(isOffline == null || computer.isOffline() == isOffline);
return computer;
}

private EC2Computer computerWithUpTime(final int minutes, final int seconds) throws Exception {
return computerWithUpTime(minutes, seconds, false, null);
}
Expand Down

0 comments on commit 1a236e4

Please sign in to comment.