From b48b886fab5510b3becd22b1908c38a2fde830fa Mon Sep 17 00:00:00 2001 From: david-alber Date: Fri, 19 Jan 2024 15:39:03 +0100 Subject: [PATCH] Gateway: add clarifying message for failed jobs with no logs (#1173) --- .../management/commands/update_jobs_statuses.py | 4 ++-- gateway/api/utils.py | 17 ++++++++++++++++- gateway/tests/api/management/test_commands.py | 17 +++++++++++++++-- gateway/tests/api/test_utils.py | 17 +++++++++++++++++ gateway/tests/fixtures/schedule_fixtures.json | 2 +- 5 files changed, 51 insertions(+), 6 deletions(-) diff --git a/gateway/api/management/commands/update_jobs_statuses.py b/gateway/api/management/commands/update_jobs_statuses.py index 753e72d1a..f8b652458 100644 --- a/gateway/api/management/commands/update_jobs_statuses.py +++ b/gateway/api/management/commands/update_jobs_statuses.py @@ -7,7 +7,7 @@ from api.models import Job from api.ray import get_job_handler from api.schedule import check_job_timeout, handle_job_status_not_available -from api.utils import ray_job_status_to_model_job_status +from api.utils import ray_job_status_to_model_job_status, check_logs logger = logging.getLogger("commands") @@ -55,7 +55,7 @@ def handle(self, *args, **options): if job_handler: logs = job_handler.logs(job.ray_job_id) - job.logs = logs + job.logs = check_logs(logs, job) try: job.save() diff --git a/gateway/api/utils.py b/gateway/api/utils.py index 495d71465..c53eb8eb4 100644 --- a/gateway/api/utils.py +++ b/gateway/api/utils.py @@ -6,7 +6,7 @@ import re import time import uuid -from typing import Optional, Tuple, Callable, Dict, Any +from typing import Optional, Tuple, Union, Callable, Dict, Any from cryptography.fernet import Fernet from ray.dashboard.modules.job.common import JobStatus @@ -182,3 +182,18 @@ def generate_cluster_name(username: str) -> str: pattern = re.compile("[^a-zA-Z0-9-.]") cluster_name = f"c-{re.sub(pattern,'-',username)}-{str(uuid.uuid4())[:8]}" return cluster_name + + +def check_logs(logs: Union[str, None], job: Job) -> str: + """Add error message to logs for faild jobs with empty logs. + Args: + logs: logs of the job + job: job model + + Returns: + logs with error message and metadata. + """ + if job.status == Job.FAILED and logs in ["", None]: + logs = f"Job {job.id} failed due to an internal error." + logger.warning("Job %s failed due to an internal error.", job.id) + return logs diff --git a/gateway/tests/api/management/test_commands.py b/gateway/tests/api/management/test_commands.py index a308fbf81..8f0cbb3db 100644 --- a/gateway/tests/api/management/test_commands.py +++ b/gateway/tests/api/management/test_commands.py @@ -32,8 +32,9 @@ def test_free_resources(self): @patch("api.ray.get_job_handler") def test_update_jobs_statuses(self, get_job_handler): """Tests update of job statuses.""" + # Test status change from PENDING to RUNNING ray_client = MagicMock() - ray_client.get_job_status.return_value = JobStatus.SUCCEEDED + ray_client.get_job_status.return_value = JobStatus.RUNNING ray_client.get_job_logs.return_value = "No logs yet." ray_client.stop_job.return_value = True ray_client.submit_job.return_value = "AwesomeJobId" @@ -42,7 +43,19 @@ def test_update_jobs_statuses(self, get_job_handler): call_command("update_jobs_statuses") job = Job.objects.get(id__exact="1a7947f9-6ae8-4e3d-ac1e-e7d608deec84") - self.assertEqual(job.status, "SUCCEEDED") + self.assertEqual(job.status, "RUNNING") + + # Test job logs for FAILED job with empty logs + ray_client.get_job_status.return_value = JobStatus.FAILED + ray_client.get_job_logs.return_value = "" + + call_command("update_jobs_statuses") + + job = Job.objects.get(id__exact="1a7947f9-6ae8-4e3d-ac1e-e7d608deec84") + self.assertEqual( + job.logs, + "Job 1a7947f9-6ae8-4e3d-ac1e-e7d608deec84 failed due to an internal error.", + ) @patch("api.schedule.execute_job") def test_schedule_queued_jobs(self, execute_job): diff --git a/gateway/tests/api/test_utils.py b/gateway/tests/api/test_utils.py index 03c0316db..05aee11e7 100644 --- a/gateway/tests/api/test_utils.py +++ b/gateway/tests/api/test_utils.py @@ -9,6 +9,7 @@ decrypt_string, encrypt_env_vars, decrypt_env_vars, + check_logs, ) @@ -75,3 +76,19 @@ def test_env_vars_encryption(self): self.assertEqual( env_vars_with_qiskit_runtime, decrypt_env_vars(encrypted_env_vars) ) + + def test_check_empty_logs(self): + """Test error notification for failed and empty logs.""" + job = MagicMock() + job.id = "42" + job.status = "FAILED" + logs = check_logs(logs="", job=job) + self.assertEqual(logs, "Job 42 failed due to an internal error.") + + def test_check_non_empty_logs(self): + """Test logs checker for non empty logs.""" + job = MagicMock() + job.id = "42" + job.status = "FAILED" + logs = check_logs(logs="awsome logs", job=job) + self.assertEqual(logs, "awsome logs") diff --git a/gateway/tests/fixtures/schedule_fixtures.json b/gateway/tests/fixtures/schedule_fixtures.json index 63f46e41d..53eef1f56 100644 --- a/gateway/tests/fixtures/schedule_fixtures.json +++ b/gateway/tests/fixtures/schedule_fixtures.json @@ -90,7 +90,7 @@ "program": "1a7947f9-6ae8-4e3d-ac1e-e7d608deec82", "created": "2024-02-01T15:30:43.281796Z", "result": "{\"somekey\":1}", - "status": "RUNNING", + "status": "PENDING", "author": 3, "compute_resource": "1a7947f9-6ae8-4e3d-ac1e-e7d608deec99" }