Skip to content

Commit

Permalink
Kill job reason (#1367)
Browse files Browse the repository at this point in the history
Adding requester information to JobKillRequest

This feature requires more information from job kill
actions requested through the API.

Link the Issue(s) this Pull Request is related to.
This feature is motivated by a situation where a script was misusing the API and calling kill on all the jobs for a show on a regular basis. Without this feature, finding where the requests were coming from was a big endeavor.

Summarize your change.
This change requires that a kill request also provide username, pid, host_kill and reason.

---------

Signed-off-by: Diego Tavares <dtavares@imageworks.com>
Co-authored-by: Roula O'Regan <roregan@imageworks.com>
  • Loading branch information
DiegoTavares and roulaoregan-spi authored Jun 17, 2024
1 parent 7da773a commit a314c6c
Show file tree
Hide file tree
Showing 16 changed files with 223 additions and 97 deletions.
3 changes: 2 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ htmlcov/
/.env
.envrc
.vscode
.venv/
.venv/
.eggs/*
2 changes: 1 addition & 1 deletion VERSION.in
Original file line number Diff line number Diff line change
@@ -1 +1 @@
0.25
0.26
22 changes: 21 additions & 1 deletion cuebot/src/main/java/com/imageworks/spcue/Source.java
Original file line number Diff line number Diff line change
Expand Up @@ -25,15 +25,35 @@
public class Source {

public String source = "unknown";
public String username = "";
public String pid = "";
public String host_kill = "";
public String reason = "";

public Source() {}

public Source(String source) {
this.source = source;
}

public Source(String source, String username, String pid, String host_kill, String reason) {
this.source = source;
this.username = username;
this.pid = pid;
this.host_kill = host_kill;
this.reason = reason;
}

public String getReason() {
return this.reason;
}

public String toString() {
return this.source;
return "User: " + this.username +
", Pid: " + this.pid +
", Hostname: " + this.host_kill +
", Reason: " + this.reason +
"\n" + this.source;
}
}

Original file line number Diff line number Diff line change
Expand Up @@ -275,7 +275,9 @@ public void kill(JobKillRequest request, StreamObserver<JobKillResponse> respons
try {
setupJobData(request.getJob());
manageQueue.execute(new DispatchJobComplete(job,
new Source(request.toString()), true, jobManagerSupport));
new Source(request.toString(), request.getUsername(), request.getPid(),
request.getHostKill(), request.getReason()),
true, jobManagerSupport));
responseObserver.onNext(JobKillResponse.newBuilder().build());
responseObserver.onCompleted();
}
Expand Down Expand Up @@ -486,7 +488,8 @@ public void killFrames(JobKillFramesRequest request, StreamObserver<JobKillFrame
manageQueue.execute(
new DispatchKillFrames(
frameSearchFactory.create(job, request.getReq()),
new Source(request.toString()),
new Source(request.toString(), request.getUsername(), request.getPid(),
request.getHostKill(), request.getReason()),
jobManagerSupport));
responseObserver.onNext(JobKillFramesResponse.newBuilder().build());
responseObserver.onCompleted();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -197,12 +197,11 @@ public void getFrames(LayerGetFramesRequest request, StreamObserver<LayerGetFram
@Override
public void killFrames(LayerKillFramesRequest request, StreamObserver<LayerKillFramesResponse> responseObserver) {
updateLayer(request.getLayer());
if (attemptChange(env, property, jobManager, layer, responseObserver)) {
manageQueue.execute(new DispatchKillFrames(frameSearch,
new Source(request.toString()), jobManagerSupport));
responseObserver.onNext(LayerKillFramesResponse.newBuilder().build());
responseObserver.onCompleted();
}
manageQueue.execute(new DispatchKillFrames(frameSearch,
new Source(request.toString(), request.getUsername(), request.getPid(),
request.getHostKill(), request.getReason()), jobManagerSupport));
responseObserver.onNext(LayerKillFramesResponse.newBuilder().build());
responseObserver.onCompleted();
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -75,76 +75,82 @@ public void queueShutdownJob(JobInterface job, Source source, boolean isManualKi

public boolean shutdownJob(JobInterface job, Source source, boolean isManualKill) {

if (jobManager.shutdownJob(job)) {

/*
* Satisfy any dependencies on just the
* job record, not layers or frames.
*/
satisfyWhatDependsOn(job);

if (departmentManager.isManaged(job)) {
departmentManager.syncJobsWithTask(job);
}

if (isManualKill) {
if (isManualKill && source.getReason().isEmpty()) {
logger.info(job.getName() + "/" + job.getId() +
" **Invalid Job Kill Request** for " + source.toString());
}
else {
if (jobManager.shutdownJob(job)) {

logger.info(job.getName() + "/" + job.getId() +
" is being manually killed by " + source.toString());
/*
* Satisfy any dependencies on just the
* job record, not layers or frames.
*/
satisfyWhatDependsOn(job);

/**
* Sleep a bit here in case any frames were
* dispatched during the job shutdown process.
*/
try {
Thread.sleep(3000);
} catch (InterruptedException e1) {
logger.info(job.getName() + "/" + job.getId() +
" shutdown thread was interrupted.");
Thread.currentThread().interrupt();
if (departmentManager.isManaged(job)) {
departmentManager.syncJobsWithTask(job);
}

FrameSearchInterface search = frameSearchFactory.create(job);
FrameSearchCriteria newCriteria = search.getCriteria();
FrameStateSeq states = newCriteria.getStates().toBuilder()
.addFrameStates(FrameState.RUNNING)
.build();
search.setCriteria(newCriteria.toBuilder().setStates(states).build());
if (isManualKill) {

for (FrameInterface frame: jobManager.findFrames(search)) {
logger.info(job.getName() + "/" + job.getId() +
" is being manually killed by " + source.toString());

VirtualProc proc = null;
/**
* Sleep a bit here in case any frames were
* dispatched during the job shutdown process.
*/
try {
proc = hostManager.findVirtualProc(frame);
}
catch (DataAccessException e) {
logger.warn("Unable to find proc to kill frame " + frame +
" on job shutdown operation, " + e);
Thread.sleep(3000);
} catch (InterruptedException e1) {
logger.info(job.getName() + "/" + job.getId() +
" shutdown thread was interrupted.");
Thread.currentThread().interrupt();
}

if (manualStopFrame(frame, FrameState.WAITING)) {
FrameSearchInterface search = frameSearchFactory.create(job);
FrameSearchCriteria newCriteria = search.getCriteria();
FrameStateSeq states = newCriteria.getStates().toBuilder()
.addFrameStates(FrameState.RUNNING)
.build();
search.setCriteria(newCriteria.toBuilder().setStates(states).build());

for (FrameInterface frame: jobManager.findFrames(search)) {

VirtualProc proc = null;
try {
if (proc != null) {
kill(proc, source);
}
} catch (DataAccessException e) {
logger.warn("Failed to kill frame " + frame +
proc = hostManager.findVirtualProc(frame);
}
catch (DataAccessException e) {
logger.warn("Unable to find proc to kill frame " + frame +
" on job shutdown operation, " + e);
}
catch (Exception e) {
logger.warn("error killing frame: " + frame);

if (manualStopFrame(frame, FrameState.WAITING)) {
try {
if (proc != null) {
kill(proc, source);
}
} catch (DataAccessException e) {
logger.warn("Failed to kill frame " + frame +
" on job shutdown operation, " + e);
}
catch (Exception e) {
logger.warn("error killing frame: " + frame);
}
}
}
}
}

/*
* Send mail after all frames have been stopped or else the email
* will have inaccurate numbers.
*/
emailSupport.sendShutdownEmail(job);
/*
* Send mail after all frames have been stopped or else the email
* will have inaccurate numbers.
*/
emailSupport.sendShutdownEmail(job);

return true;
return true;
}
}

return false;
Expand Down
13 changes: 8 additions & 5 deletions cuegui/cuegui/MenuActions.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
from builtins import filter
from builtins import str
from builtins import object
import getpass
import glob
import subprocess
import time
Expand Down Expand Up @@ -65,7 +66,8 @@
TITLE = 0
TOOLTIP = 1
ICON = 2

DEFAULT_JOB_KILL_REASON = "Manual Job Kill Request in Cuegui by " + getpass.getuser()
DEFAULT_FRAME_KILL_REASON = "Manual Frame(s) Kill Request in Cuegui by " + getpass.getuser()

# pylint: disable=missing-function-docstring,no-self-use,unused-argument

Expand Down Expand Up @@ -368,7 +370,7 @@ def kill(self, rpcObjects=None):
if cuegui.Utils.questionBoxYesNo(self._caller, "Kill jobs?", msg,
[job.data.name for job in jobs]):
for job in jobs:
job.kill()
job.kill(reason=DEFAULT_JOB_KILL_REASON)
self.killDependents(jobs)
self._update()

Expand All @@ -384,7 +386,7 @@ def killDependents(self, jobs):
sorted([dep.name() for dep in dependents])):
for depJob in dependents:
try:
depJob.kill()
depJob.kill(reason=DEFAULT_JOB_KILL_REASON)
except opencue.exception.CueException as e:
errMsg = "Failed to kill depending job: %s - %s" % (depJob.name(), e)
logger.warning(errMsg)
Expand Down Expand Up @@ -769,7 +771,7 @@ def kill(self, rpcObjects=None):
"Kill ALL frames in selected layers?",
[layer.data.name for layer in layers]):
for layer in layers:
layer.kill()
layer.kill(reason=DEFAULT_FRAME_KILL_REASON)
self._update()

eat_info = ["&Eat", None, "eat"]
Expand Down Expand Up @@ -1080,7 +1082,8 @@ def kill(self, rpcObjects=None):
if cuegui.Utils.questionBoxYesNo(self._caller, "Confirm",
"Kill selected frames?",
names):
self._getSource().killFrames(name=names)
self._getSource().killFrames(reason=DEFAULT_FRAME_KILL_REASON,
name=names)
self._update()

markAsWaiting_info = ["Mark as &waiting", None, "configure"]
Expand Down
12 changes: 6 additions & 6 deletions cuegui/cuegui/plugins/StuckFramePlugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,12 +23,12 @@
from builtins import str
from builtins import map
import datetime
import re
import getpass
import os
from datetime import datetime
import time
import socket
import re
import signal
import socket
import time
import yaml

from qtpy import QtGui
Expand Down Expand Up @@ -63,7 +63,7 @@
LLU_COLUMN = 3
RUNTIME_COLUMN = 4
LASTLINE_COLUMN = 7

DEFAULT_FRAME_KILL_REASON = "Manual Frame Kill Request in Cuegui by " + getpass.getuser()

class StuckWidget(cuegui.AbstractDockWidget.AbstractDockWidget):
"""This builds what is displayed on the dock widget"""
Expand Down Expand Up @@ -1345,7 +1345,7 @@ def logKill(self):
if cuegui.Utils.questionBoxYesNo(self, "Confirm", "Kill selected frames?", names):
self.log()
for frame in self.selectedObjects():
frame.kill()
frame.kill(reason=DEFAULT_FRAME_KILL_REASON)
self.remove()

def retryFrame(self):
Expand Down
4 changes: 3 additions & 1 deletion cuegui/tests/MenuActions_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -958,7 +958,9 @@ def test_kill(self, yesNoMock):

self.frame_actions.kill(rpcObjects=[frame])

self.job.killFrames.assert_called_with(name=[frame_name])
self.job.killFrames.assert_called_with(
name=[frame_name],
reason="Manual Frame(s) Kill Request in Cuegui by root")

@mock.patch('cuegui.Utils.questionBoxYesNo', return_value=True)
def test_markAsWaiting(self, yesNoMock):
Expand Down
16 changes: 16 additions & 0 deletions proto/job.proto
Original file line number Diff line number Diff line change
Expand Up @@ -885,6 +885,10 @@ message FrameGetWhatThisDependsOnResponse {
// Kill
message FrameKillRequest {
Frame frame = 1;
string username = 2;
string pid = 3;
string host_kill = 4;
string reason = 5;
}

message FrameKillResponse {} // Empty
Expand Down Expand Up @@ -1283,6 +1287,10 @@ message JobIsJobPendingResponse {
// Kill
message JobKillRequest {
Job job = 1;
string username = 2;
string pid = 3;
string host_kill = 4;
string reason = 5;
}

message JobKillResponse {} // Empty
Expand All @@ -1291,6 +1299,10 @@ message JobKillResponse {} // Empty
message JobKillFramesRequest {
Job job = 1;
FrameSearchCriteria req = 2;
string username = 3;
string pid = 4;
string host_kill = 5;
string reason = 6;
}

message JobKillFramesResponse {} // Empty
Expand Down Expand Up @@ -1613,6 +1625,10 @@ message LayerGetWhatThisDependsOnResponse {
// KillFrames
message LayerKillFramesRequest {
Layer layer = 1;
string username = 2;
string pid = 3;
string host_kill = 4;
string reason = 5;
}

message LayerKillFramesResponse {} // Empty
Expand Down
13 changes: 11 additions & 2 deletions pycue/opencue/wrappers/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
"""Module for classes related to frames."""

import enum
import getpass
import time
import os

Expand Down Expand Up @@ -59,10 +60,18 @@ def eat(self):
if self.data.state != job_pb2.FrameState.Value('EATEN'):
self.stub.Eat(job_pb2.FrameEatRequest(frame=self.data), timeout=Cuebot.Timeout)

def kill(self):
def kill(self, username=None, pid=None, host_kill=None, reason=None):
"""Kills the frame."""
username = username if username else getpass.getuser()
pid = pid if pid else os.getpid()
host_kill = host_kill if host_kill else os.uname()[1]
if self.data.state == job_pb2.FrameState.Value('RUNNING'):
self.stub.Kill(job_pb2.FrameKillRequest(frame=self.data), timeout=Cuebot.Timeout)
self.stub.Kill(job_pb2.FrameKillRequest(frame=self.data,
username=username,
pid=str(pid),
host_kill=host_kill,
reason=reason),
timeout=Cuebot.Timeout)

def retry(self):
"""Retries the frame."""
Expand Down
Loading

0 comments on commit a314c6c

Please sign in to comment.