From ce86e8cbc082fd8197f4c2f8769b98b3404abedc Mon Sep 17 00:00:00 2001 From: bhou Date: Fri, 20 Oct 2023 14:22:34 -0700 Subject: [PATCH] Force updating the job status to KILLED when killing a job that has a connected agent but no response observer --- .../v4/endpoints/GRpcJobKillServiceImpl.java | 34 +++++++++++++++---- .../GRpcJobKillServiceImplSpec.groovy | 5 ++- 2 files changed, 29 insertions(+), 10 deletions(-) diff --git a/genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java b/genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java index 91b7fb05aac..7ff7f4dbfca 100644 --- a/genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java +++ b/genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java @@ -154,15 +154,35 @@ public void killJob( = this.parkedJobKillResponseObservers.remove(jobId); if (responseObserver == null) { - log.error("Job {} not killed. Expected local agent connection not found", jobId); - throw new GenieServerException( - "Job " + jobId + " not killed. Expected local agent connection not found." + // This might happen when the agent has gone but its status is not updated + // In this case, we force updating the job status to KILLED. + log.warn("Job {} not killed. Expected local agent connection not found. " + + "Trying to force updating the job status to {}", + jobId, + JobStatus.KILLED ); - } - responseObserver.onNext(JobKillRegistrationResponse.newBuilder().build()); - responseObserver.onCompleted(); + try { + this.persistenceService.updateJobStatus(jobId, currentJobStatus, JobStatus.KILLED, reason); + log.info("Succeeded to force updating the status of Job {} to {}", + jobId, + JobStatus.KILLED + ); + } catch (final Exception e) { + log.error("Succeeded to force updating the status of Job {} to {}", + jobId, + JobStatus.KILLED + ); + throw new GenieServerException("Failed to force updating the status of Job " + + jobId + " to " + JobStatus.KILLED, + e + ); + } + } else { + responseObserver.onNext(JobKillRegistrationResponse.newBuilder().build()); + responseObserver.onCompleted(); - log.info("Agent notified for killing job {}", jobId); + log.info("Agent notified for killing job {}", jobId); + } } else { // Agent is running somewhere else try to forward the request final String hostname = this.agentRoutingService diff --git a/genie-web/src/test/groovy/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImplSpec.groovy b/genie-web/src/test/groovy/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImplSpec.groovy index 496aeeab98c..013fb382894 100644 --- a/genie-web/src/test/groovy/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImplSpec.groovy +++ b/genie-web/src/test/groovy/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImplSpec.groovy @@ -148,13 +148,12 @@ class GRpcJobKillServiceImplSpec extends Specification { when: "The job is active, the agent is connected, the job is local but no observer" this.serviceSpy.killJob(this.jobId, this.reason, this.servletRequest) - then: "Correct exception is thrown" + then: "Force updating job status" 1 * this.persistenceService.getJobStatus(this.jobId) >> JobStatus.CLAIMED - 0 * this.persistenceService.updateJobStatus(_ as String, _ as JobStatus, _ as JobStatus, _ as String) + 1 * this.persistenceService.updateJobStatus(_ as String, _ as JobStatus, _ as JobStatus, _ as String) 1 * this.agentRoutingService.isAgentConnectionLocal(this.jobId) >> true 0 * this.responseObserver.onNext(_ as JobKillRegistrationResponse) 0 * this.responseObserver.onCompleted() - thrown(GenieServerException) when: "The job is active, the agent is connected, and there is an observer" this.serviceSpy.registerForKillNotification(this.request, this.responseObserver)