Skip to content

Commit

Permalink
Force updating the job status to KILLED when killing a job that has a connected agent but no response observer
Browse files Browse the repository at this point in the history
  • Loading branch information
bhou committed Oct 20, 2023
1 parent c6c81dc commit ce86e8c
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 10 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -154,15 +154,35 @@ public void killJob(
= this.parkedJobKillResponseObservers.remove(jobId);

if (responseObserver == null) {
log.error("Job {} not killed. Expected local agent connection not found", jobId);
throw new GenieServerException(
"Job " + jobId + " not killed. Expected local agent connection not found."
// This might happen when the agent has gone but its status is not updated
// In this case, we force updating the job status to KILLED.
log.warn("Job {} not killed. Expected local agent connection not found. "
+ "Trying to force updating the job status to {}",
jobId,
JobStatus.KILLED
);
}
responseObserver.onNext(JobKillRegistrationResponse.newBuilder().build());
responseObserver.onCompleted();
try {
this.persistenceService.updateJobStatus(jobId, currentJobStatus, JobStatus.KILLED, reason);
log.info("Succeeded to force updating the status of Job {} to {}",
jobId,
JobStatus.KILLED
);
} catch (final Exception e) {
log.error("Succeeded to force updating the status of Job {} to {}",

Check warning on line 171 in genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java

View check run for this annotation

Codecov / codecov/patch

genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java#L170-L171

Added lines #L170 - L171 were not covered by tests
jobId,
JobStatus.KILLED
);
throw new GenieServerException("Failed to force updating the status of Job "

Check warning on line 175 in genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java

View check run for this annotation

Codecov / codecov/patch

genie-web/src/main/java/com/netflix/genie/web/agent/apis/rpc/v4/endpoints/GRpcJobKillServiceImpl.java#L175

Added line #L175 was not covered by tests
+ jobId + " to " + JobStatus.KILLED,
e
);
}
} else {
responseObserver.onNext(JobKillRegistrationResponse.newBuilder().build());
responseObserver.onCompleted();

log.info("Agent notified for killing job {}", jobId);
log.info("Agent notified for killing job {}", jobId);
}
} else {
// Agent is running somewhere else try to forward the request
final String hostname = this.agentRoutingService
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -148,13 +148,12 @@ class GRpcJobKillServiceImplSpec extends Specification {
when: "The job is active, the agent is connected, the job is local but no observer"
this.serviceSpy.killJob(this.jobId, this.reason, this.servletRequest)

then: "Correct exception is thrown"
then: "Force updating job status"
1 * this.persistenceService.getJobStatus(this.jobId) >> JobStatus.CLAIMED
0 * this.persistenceService.updateJobStatus(_ as String, _ as JobStatus, _ as JobStatus, _ as String)
1 * this.persistenceService.updateJobStatus(_ as String, _ as JobStatus, _ as JobStatus, _ as String)
1 * this.agentRoutingService.isAgentConnectionLocal(this.jobId) >> true
0 * this.responseObserver.onNext(_ as JobKillRegistrationResponse)
0 * this.responseObserver.onCompleted()
thrown(GenieServerException)

when: "The job is active, the agent is connected, and there is an observer"
this.serviceSpy.registerForKillNotification(this.request, this.responseObserver)
Expand Down

0 comments on commit ce86e8c

Please sign in to comment.