From a314c6c228e7aac9f7f2247ef4fa20cf2d0fb300 Mon Sep 17 00:00:00 2001 From: Diego Tavares Date: Mon, 17 Jun 2024 10:31:14 -0700 Subject: [PATCH] Kill job reason (#1367) Adding requester information to JobKillRequest This feature requires more information from job kill actions requested through the API. Link the Issue(s) this Pull Request is related to. This feature is motivated by a situation where a script was misusing the API and calling kill on all the jobs for a show on a regular basis. Without this feature, finding where the requests were coming from was a big endeavor. Summarize your change. This change requires that a kill request also provide username, pid, host_kill and reason. --------- Signed-off-by: Diego Tavares Co-authored-by: Roula O'Regan --- .gitignore | 3 +- VERSION.in | 2 +- .../java/com/imageworks/spcue/Source.java | 22 +++- .../imageworks/spcue/servant/ManageJob.java | 7 +- .../imageworks/spcue/servant/ManageLayer.java | 11 +- .../spcue/service/JobManagerSupport.java | 112 +++++++++--------- cuegui/cuegui/MenuActions.py | 13 +- cuegui/cuegui/plugins/StuckFramePlugin.py | 12 +- cuegui/tests/MenuActions_tests.py | 4 +- proto/job.proto | 16 +++ pycue/opencue/wrappers/frame.py | 13 +- pycue/opencue/wrappers/job.py | 35 ++++-- pycue/opencue/wrappers/layer.py | 12 +- pycue/tests/wrappers/frame_test.py | 14 ++- pycue/tests/wrappers/job_test.py | 30 ++++- pycue/tests/wrappers/layer_test.py | 14 ++- 16 files changed, 223 insertions(+), 97 deletions(-) diff --git a/.gitignore b/.gitignore index 7f4857d09..370dbd6b1 100644 --- a/.gitignore +++ b/.gitignore @@ -12,4 +12,5 @@ htmlcov/ /.env .envrc .vscode -.venv/ \ No newline at end of file +.venv/ +.eggs/* diff --git a/VERSION.in b/VERSION.in index 7d385d419..992221508 100644 --- a/VERSION.in +++ b/VERSION.in @@ -1 +1 @@ -0.25 +0.26 diff --git a/cuebot/src/main/java/com/imageworks/spcue/Source.java b/cuebot/src/main/java/com/imageworks/spcue/Source.java index a54207c92..8f6d78ee3 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/Source.java +++ b/cuebot/src/main/java/com/imageworks/spcue/Source.java @@ -25,6 +25,10 @@ public class Source { public String source = "unknown"; + public String username = ""; + public String pid = ""; + public String host_kill = ""; + public String reason = ""; public Source() {} @@ -32,8 +36,24 @@ public Source(String source) { this.source = source; } + public Source(String source, String username, String pid, String host_kill, String reason) { + this.source = source; + this.username = username; + this.pid = pid; + this.host_kill = host_kill; + this.reason = reason; + } + + public String getReason() { + return this.reason; + } + public String toString() { - return this.source; + return "User: " + this.username + + ", Pid: " + this.pid + + ", Hostname: " + this.host_kill + + ", Reason: " + this.reason + + "\n" + this.source; } } diff --git a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java index e3cfa5178..fc01365a6 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java +++ b/cuebot/src/main/java/com/imageworks/spcue/servant/ManageJob.java @@ -275,7 +275,9 @@ public void kill(JobKillRequest request, StreamObserver respons try { setupJobData(request.getJob()); manageQueue.execute(new DispatchJobComplete(job, - new Source(request.toString()), true, jobManagerSupport)); + new Source(request.toString(), request.getUsername(), request.getPid(), + request.getHostKill(), request.getReason()), + true, jobManagerSupport)); responseObserver.onNext(JobKillResponse.newBuilder().build()); responseObserver.onCompleted(); } @@ -486,7 +488,8 @@ public void killFrames(JobKillFramesRequest request, StreamObserver responseObserver) { updateLayer(request.getLayer()); - if (attemptChange(env, property, jobManager, layer, responseObserver)) { - manageQueue.execute(new DispatchKillFrames(frameSearch, - new Source(request.toString()), jobManagerSupport)); - responseObserver.onNext(LayerKillFramesResponse.newBuilder().build()); - responseObserver.onCompleted(); - } + manageQueue.execute(new DispatchKillFrames(frameSearch, + new Source(request.toString(), request.getUsername(), request.getPid(), + request.getHostKill(), request.getReason()), jobManagerSupport)); + responseObserver.onNext(LayerKillFramesResponse.newBuilder().build()); + responseObserver.onCompleted(); } @Override diff --git a/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java b/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java index b2db74d59..6f5789ee2 100644 --- a/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java +++ b/cuebot/src/main/java/com/imageworks/spcue/service/JobManagerSupport.java @@ -75,76 +75,82 @@ public void queueShutdownJob(JobInterface job, Source source, boolean isManualKi public boolean shutdownJob(JobInterface job, Source source, boolean isManualKill) { - if (jobManager.shutdownJob(job)) { - - /* - * Satisfy any dependencies on just the - * job record, not layers or frames. - */ - satisfyWhatDependsOn(job); - - if (departmentManager.isManaged(job)) { - departmentManager.syncJobsWithTask(job); - } - - if (isManualKill) { + if (isManualKill && source.getReason().isEmpty()) { + logger.info(job.getName() + "/" + job.getId() + + " **Invalid Job Kill Request** for " + source.toString()); + } + else { + if (jobManager.shutdownJob(job)) { - logger.info(job.getName() + "/" + job.getId() + - " is being manually killed by " + source.toString()); + /* + * Satisfy any dependencies on just the + * job record, not layers or frames. + */ + satisfyWhatDependsOn(job); - /** - * Sleep a bit here in case any frames were - * dispatched during the job shutdown process. - */ - try { - Thread.sleep(3000); - } catch (InterruptedException e1) { - logger.info(job.getName() + "/" + job.getId() + - " shutdown thread was interrupted."); - Thread.currentThread().interrupt(); + if (departmentManager.isManaged(job)) { + departmentManager.syncJobsWithTask(job); } - FrameSearchInterface search = frameSearchFactory.create(job); - FrameSearchCriteria newCriteria = search.getCriteria(); - FrameStateSeq states = newCriteria.getStates().toBuilder() - .addFrameStates(FrameState.RUNNING) - .build(); - search.setCriteria(newCriteria.toBuilder().setStates(states).build()); + if (isManualKill) { - for (FrameInterface frame: jobManager.findFrames(search)) { + logger.info(job.getName() + "/" + job.getId() + + " is being manually killed by " + source.toString()); - VirtualProc proc = null; + /** + * Sleep a bit here in case any frames were + * dispatched during the job shutdown process. + */ try { - proc = hostManager.findVirtualProc(frame); - } - catch (DataAccessException e) { - logger.warn("Unable to find proc to kill frame " + frame + - " on job shutdown operation, " + e); + Thread.sleep(3000); + } catch (InterruptedException e1) { + logger.info(job.getName() + "/" + job.getId() + + " shutdown thread was interrupted."); + Thread.currentThread().interrupt(); } - if (manualStopFrame(frame, FrameState.WAITING)) { + FrameSearchInterface search = frameSearchFactory.create(job); + FrameSearchCriteria newCriteria = search.getCriteria(); + FrameStateSeq states = newCriteria.getStates().toBuilder() + .addFrameStates(FrameState.RUNNING) + .build(); + search.setCriteria(newCriteria.toBuilder().setStates(states).build()); + + for (FrameInterface frame: jobManager.findFrames(search)) { + + VirtualProc proc = null; try { - if (proc != null) { - kill(proc, source); - } - } catch (DataAccessException e) { - logger.warn("Failed to kill frame " + frame + + proc = hostManager.findVirtualProc(frame); + } + catch (DataAccessException e) { + logger.warn("Unable to find proc to kill frame " + frame + " on job shutdown operation, " + e); } - catch (Exception e) { - logger.warn("error killing frame: " + frame); + + if (manualStopFrame(frame, FrameState.WAITING)) { + try { + if (proc != null) { + kill(proc, source); + } + } catch (DataAccessException e) { + logger.warn("Failed to kill frame " + frame + + " on job shutdown operation, " + e); + } + catch (Exception e) { + logger.warn("error killing frame: " + frame); + } } } } - } - /* - * Send mail after all frames have been stopped or else the email - * will have inaccurate numbers. - */ - emailSupport.sendShutdownEmail(job); + /* + * Send mail after all frames have been stopped or else the email + * will have inaccurate numbers. + */ + emailSupport.sendShutdownEmail(job); - return true; + return true; + } } return false; diff --git a/cuegui/cuegui/MenuActions.py b/cuegui/cuegui/MenuActions.py index 6e1664671..7642eb383 100644 --- a/cuegui/cuegui/MenuActions.py +++ b/cuegui/cuegui/MenuActions.py @@ -24,6 +24,7 @@ from builtins import filter from builtins import str from builtins import object +import getpass import glob import subprocess import time @@ -65,7 +66,8 @@ TITLE = 0 TOOLTIP = 1 ICON = 2 - +DEFAULT_JOB_KILL_REASON = "Manual Job Kill Request in Cuegui by " + getpass.getuser() +DEFAULT_FRAME_KILL_REASON = "Manual Frame(s) Kill Request in Cuegui by " + getpass.getuser() # pylint: disable=missing-function-docstring,no-self-use,unused-argument @@ -368,7 +370,7 @@ def kill(self, rpcObjects=None): if cuegui.Utils.questionBoxYesNo(self._caller, "Kill jobs?", msg, [job.data.name for job in jobs]): for job in jobs: - job.kill() + job.kill(reason=DEFAULT_JOB_KILL_REASON) self.killDependents(jobs) self._update() @@ -384,7 +386,7 @@ def killDependents(self, jobs): sorted([dep.name() for dep in dependents])): for depJob in dependents: try: - depJob.kill() + depJob.kill(reason=DEFAULT_JOB_KILL_REASON) except opencue.exception.CueException as e: errMsg = "Failed to kill depending job: %s - %s" % (depJob.name(), e) logger.warning(errMsg) @@ -769,7 +771,7 @@ def kill(self, rpcObjects=None): "Kill ALL frames in selected layers?", [layer.data.name for layer in layers]): for layer in layers: - layer.kill() + layer.kill(reason=DEFAULT_FRAME_KILL_REASON) self._update() eat_info = ["&Eat", None, "eat"] @@ -1080,7 +1082,8 @@ def kill(self, rpcObjects=None): if cuegui.Utils.questionBoxYesNo(self._caller, "Confirm", "Kill selected frames?", names): - self._getSource().killFrames(name=names) + self._getSource().killFrames(reason=DEFAULT_FRAME_KILL_REASON, + name=names) self._update() markAsWaiting_info = ["Mark as &waiting", None, "configure"] diff --git a/cuegui/cuegui/plugins/StuckFramePlugin.py b/cuegui/cuegui/plugins/StuckFramePlugin.py index a44c90196..870dcaff4 100644 --- a/cuegui/cuegui/plugins/StuckFramePlugin.py +++ b/cuegui/cuegui/plugins/StuckFramePlugin.py @@ -23,12 +23,12 @@ from builtins import str from builtins import map import datetime -import re +import getpass import os -from datetime import datetime -import time -import socket +import re import signal +import socket +import time import yaml from qtpy import QtGui @@ -63,7 +63,7 @@ LLU_COLUMN = 3 RUNTIME_COLUMN = 4 LASTLINE_COLUMN = 7 - +DEFAULT_FRAME_KILL_REASON = "Manual Frame Kill Request in Cuegui by " + getpass.getuser() class StuckWidget(cuegui.AbstractDockWidget.AbstractDockWidget): """This builds what is displayed on the dock widget""" @@ -1345,7 +1345,7 @@ def logKill(self): if cuegui.Utils.questionBoxYesNo(self, "Confirm", "Kill selected frames?", names): self.log() for frame in self.selectedObjects(): - frame.kill() + frame.kill(reason=DEFAULT_FRAME_KILL_REASON) self.remove() def retryFrame(self): diff --git a/cuegui/tests/MenuActions_tests.py b/cuegui/tests/MenuActions_tests.py index f7a041e74..dea85612e 100644 --- a/cuegui/tests/MenuActions_tests.py +++ b/cuegui/tests/MenuActions_tests.py @@ -958,7 +958,9 @@ def test_kill(self, yesNoMock): self.frame_actions.kill(rpcObjects=[frame]) - self.job.killFrames.assert_called_with(name=[frame_name]) + self.job.killFrames.assert_called_with( + name=[frame_name], + reason="Manual Frame(s) Kill Request in Cuegui by root") @mock.patch('cuegui.Utils.questionBoxYesNo', return_value=True) def test_markAsWaiting(self, yesNoMock): diff --git a/proto/job.proto b/proto/job.proto index 79a0dc234..80b8cceec 100644 --- a/proto/job.proto +++ b/proto/job.proto @@ -885,6 +885,10 @@ message FrameGetWhatThisDependsOnResponse { // Kill message FrameKillRequest { Frame frame = 1; + string username = 2; + string pid = 3; + string host_kill = 4; + string reason = 5; } message FrameKillResponse {} // Empty @@ -1283,6 +1287,10 @@ message JobIsJobPendingResponse { // Kill message JobKillRequest { Job job = 1; + string username = 2; + string pid = 3; + string host_kill = 4; + string reason = 5; } message JobKillResponse {} // Empty @@ -1291,6 +1299,10 @@ message JobKillResponse {} // Empty message JobKillFramesRequest { Job job = 1; FrameSearchCriteria req = 2; + string username = 3; + string pid = 4; + string host_kill = 5; + string reason = 6; } message JobKillFramesResponse {} // Empty @@ -1613,6 +1625,10 @@ message LayerGetWhatThisDependsOnResponse { // KillFrames message LayerKillFramesRequest { Layer layer = 1; + string username = 2; + string pid = 3; + string host_kill = 4; + string reason = 5; } message LayerKillFramesResponse {} // Empty diff --git a/pycue/opencue/wrappers/frame.py b/pycue/opencue/wrappers/frame.py index 5aa528b5d..9c341d8a7 100644 --- a/pycue/opencue/wrappers/frame.py +++ b/pycue/opencue/wrappers/frame.py @@ -15,6 +15,7 @@ """Module for classes related to frames.""" import enum +import getpass import time import os @@ -59,10 +60,18 @@ def eat(self): if self.data.state != job_pb2.FrameState.Value('EATEN'): self.stub.Eat(job_pb2.FrameEatRequest(frame=self.data), timeout=Cuebot.Timeout) - def kill(self): + def kill(self, username=None, pid=None, host_kill=None, reason=None): """Kills the frame.""" + username = username if username else getpass.getuser() + pid = pid if pid else os.getpid() + host_kill = host_kill if host_kill else os.uname()[1] if self.data.state == job_pb2.FrameState.Value('RUNNING'): - self.stub.Kill(job_pb2.FrameKillRequest(frame=self.data), timeout=Cuebot.Timeout) + self.stub.Kill(job_pb2.FrameKillRequest(frame=self.data, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason), + timeout=Cuebot.Timeout) def retry(self): """Retries the frame.""" diff --git a/pycue/opencue/wrappers/job.py b/pycue/opencue/wrappers/job.py index cbb1869d9..2a86714f9 100644 --- a/pycue/opencue/wrappers/job.py +++ b/pycue/opencue/wrappers/job.py @@ -15,6 +15,7 @@ """Module for classes related to jobs.""" import enum +import getpass import os import time @@ -44,9 +45,17 @@ def __init__(self, job=None): self.stub = Cuebot.getStub('job') self.__frameStateTotals = {} - def kill(self): + def kill(self, username=None, pid=None, host_kill=None, reason=None): """Kills the job.""" - self.stub.Kill(job_pb2.JobKillRequest(job=self.data), timeout=Cuebot.Timeout) + username = username if username else getpass.getuser() + pid = pid if pid else os.getpid() + host_kill = host_kill if host_kill else os.uname()[1] + self.stub.Kill(job_pb2.JobKillRequest(job=self.data, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason), + timeout=Cuebot.Timeout) def pause(self): """Pauses the job.""" @@ -56,15 +65,23 @@ def resume(self): """Resumes the job.""" self.stub.Resume(job_pb2.JobResumeRequest(job=self.data), timeout=Cuebot.Timeout) - def killFrames(self, **request): + def killFrames(self, username=None, pid=None, host_kill=None, reason=None, **request): """Kills all frames that match the FrameSearch. :type request: Dict :param request: FrameSearch parameters """ + username = username if username else getpass.getuser() + pid = pid if pid else os.getpid() + host_kill = host_kill if host_kill else os.uname()[1] criteria = opencue.search.FrameSearch.criteriaFromOptions(**request) - self.stub.KillFrames(job_pb2.JobKillFramesRequest(job=self.data, req=criteria), - timeout=Cuebot.Timeout) + self.stub.KillFrames(job_pb2.JobKillFramesRequest(job=self.data, + req=criteria, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason), + timeout=Cuebot.Timeout) def eatFrames(self, **request): """Eats all frames that match the FrameSearch. @@ -791,9 +808,9 @@ def children(self): """Returns all job children.""" return self.__children - def kill(self): + def kill(self, username=None, pid=None, host_kill=None, reason=None): """Kills the job.""" - self.asJob().kill() + self.asJob().kill(username, pid, host_kill, reason) def pause(self): """Pauses the job.""" @@ -803,13 +820,13 @@ def resume(self): """Resumes the job.""" self.asJob().resume() - def killFrames(self, **request): + def killFrames(self, username=None, pid=None, host_kill=None, reason=None, **request): """Kills all frames that match the FrameSearch. :type request: Dict :param request: FrameSearch parameters """ - self.asJob().killFrames(**request) + self.asJob().killFrames(username, pid, host_kill, reason, **request) def eatFrames(self, **request): """Eats all frames that match the FrameSearch. diff --git a/pycue/opencue/wrappers/layer.py b/pycue/opencue/wrappers/layer.py index 0e34985bf..7a9782e0e 100644 --- a/pycue/opencue/wrappers/layer.py +++ b/pycue/opencue/wrappers/layer.py @@ -15,6 +15,7 @@ """Module for classes related to job layers.""" import enum +import getpass import os import opencue.api @@ -46,9 +47,16 @@ def __init__(self, layer=None): self.data = layer self.stub = Cuebot.getStub('layer') - def kill(self): + def kill(self, username=None, pid=None, host_kill=None, reason=None): """Kills the entire layer.""" - return self.stub.KillFrames(job_pb2.LayerKillFramesRequest(layer=self.data), + username = username if username else getpass.getuser() + pid = pid if pid else os.getpid() + host_kill = host_kill if host_kill else os.uname()[1] + return self.stub.KillFrames(job_pb2.LayerKillFramesRequest(layer=self.data, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason), timeout=Cuebot.Timeout) def eat(self): diff --git a/pycue/tests/wrappers/frame_test.py b/pycue/tests/wrappers/frame_test.py index d0e7c7ba6..82dcc11ae 100644 --- a/pycue/tests/wrappers/frame_test.py +++ b/pycue/tests/wrappers/frame_test.py @@ -19,6 +19,8 @@ from __future__ import print_function from __future__ import division from __future__ import absolute_import +import getpass +import os import time import unittest @@ -56,10 +58,18 @@ def testKill(self, getStubMock): frame = opencue.wrappers.frame.Frame( job_pb2.Frame(name=TEST_FRAME_NAME, state=job_pb2.RUNNING)) - frame.kill() + username = getpass.getuser() + pid = os.getpid() + host_kill = os.uname()[1] + reason = "Frames Kill Request" + frame.kill(username=username, pid=pid, host_kill=host_kill, reason=reason) stubMock.Kill.assert_called_with( - job_pb2.FrameKillRequest(frame=frame.data), timeout=mock.ANY) + job_pb2.FrameKillRequest(frame=frame.data, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason), timeout=mock.ANY) def testRetry(self, getStubMock): stubMock = mock.Mock() diff --git a/pycue/tests/wrappers/job_test.py b/pycue/tests/wrappers/job_test.py index 0ad2504de..1b9d2d420 100644 --- a/pycue/tests/wrappers/job_test.py +++ b/pycue/tests/wrappers/job_test.py @@ -19,6 +19,7 @@ from __future__ import print_function from __future__ import division from __future__ import absolute_import +import getpass import os import unittest @@ -46,10 +47,18 @@ def testKill(self, getStubMock): job = opencue.wrappers.job.Job( job_pb2.Job(name=TEST_JOB_NAME)) - job.kill() + username = getpass.getuser() + pid = os.getpid() + host_kill = os.uname()[1] + reason = "Job Kill Request" + job.kill(username=username, pid=pid, host_kill=host_kill, reason=reason) stubMock.Kill.assert_called_with( - job_pb2.JobKillRequest(job=job.data), timeout=mock.ANY) + job_pb2.JobKillRequest(job=job.data, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason), timeout=mock.ANY) def testPause(self, getStubMock): stubMock = mock.Mock() @@ -84,10 +93,23 @@ def testKillFrames(self, getStubMock): criteria = opencue.search.FrameSearch.criteriaFromOptions(range=frameRange) job = opencue.wrappers.job.Job( job_pb2.Job(name=TEST_JOB_NAME)) - job.killFrames(range=frameRange) + username = getpass.getuser() + pid = os.getpid() + host_kill = os.uname()[1] + reason = "Job Kill Request" + job.killFrames(range=frameRange, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason) stubMock.KillFrames.assert_called_with( - job_pb2.JobKillFramesRequest(job=job.data, req=criteria), timeout=mock.ANY) + job_pb2.JobKillFramesRequest(job=job.data, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason, + req=criteria), timeout=mock.ANY) def testEatFrames(self, getStubMock): stubMock = mock.Mock() diff --git a/pycue/tests/wrappers/layer_test.py b/pycue/tests/wrappers/layer_test.py index 4f5578681..8b71047a0 100644 --- a/pycue/tests/wrappers/layer_test.py +++ b/pycue/tests/wrappers/layer_test.py @@ -19,6 +19,8 @@ from __future__ import print_function from __future__ import division from __future__ import absolute_import +import getpass +import os import unittest import mock @@ -45,10 +47,18 @@ def testKill(self, getStubMock): layer = opencue.wrappers.layer.Layer( job_pb2.Layer(name=TEST_LAYER_NAME)) - layer.kill() + username = getpass.getuser() + pid = os.getpid() + host_kill = os.uname()[1] + reason = "Frames Kill Request" + layer.kill(username=username, pid=pid, host_kill=host_kill, reason=reason) stubMock.KillFrames.assert_called_with( - job_pb2.LayerKillFramesRequest(layer=layer.data), timeout=mock.ANY) + job_pb2.LayerKillFramesRequest(layer=layer.data, + username=username, + pid=str(pid), + host_kill=host_kill, + reason=reason), timeout=mock.ANY) def testEat(self, getStubMock): stubMock = mock.Mock()