Skip to content

Commit

Permalink
Gateway: add clarifying message for failed jobs with no logs (#1173)
Browse files Browse the repository at this point in the history
  • Loading branch information
david-alber authored Jan 19, 2024
1 parent 3f1910b commit b48b886
Show file tree
Hide file tree
Showing 5 changed files with 51 additions and 6 deletions.
4 changes: 2 additions & 2 deletions gateway/api/management/commands/update_jobs_statuses.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from api.models import Job
from api.ray import get_job_handler
from api.schedule import check_job_timeout, handle_job_status_not_available
from api.utils import ray_job_status_to_model_job_status
from api.utils import ray_job_status_to_model_job_status, check_logs

logger = logging.getLogger("commands")

Expand Down Expand Up @@ -55,7 +55,7 @@ def handle(self, *args, **options):

if job_handler:
logs = job_handler.logs(job.ray_job_id)
job.logs = logs
job.logs = check_logs(logs, job)

try:
job.save()
Expand Down
17 changes: 16 additions & 1 deletion gateway/api/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import re
import time
import uuid
from typing import Optional, Tuple, Callable, Dict, Any
from typing import Optional, Tuple, Union, Callable, Dict, Any

from cryptography.fernet import Fernet
from ray.dashboard.modules.job.common import JobStatus
Expand Down Expand Up @@ -182,3 +182,18 @@ def generate_cluster_name(username: str) -> str:
pattern = re.compile("[^a-zA-Z0-9-.]")
cluster_name = f"c-{re.sub(pattern,'-',username)}-{str(uuid.uuid4())[:8]}"
return cluster_name


def check_logs(logs: Union[str, None], job: Job) -> Union[str, None]:
    """Add an error message to the logs of failed jobs that have empty logs.

    Args:
        logs: logs of the job; may be an empty string or None
        job: job model

    Returns:
        The logs unchanged, unless the job status is ``Job.FAILED`` and the
        logs are empty or None; in that case a message naming the job id is
        returned instead. Note that None is passed through as-is for jobs
        that have not failed.
    """
    # Both "" and None are falsy, so a single truthiness check covers the
    # "no logs available" case.
    if job.status == Job.FAILED and not logs:
        # A failed job with no output gives the user nothing to act on, so
        # substitute an explicit message and record the event server-side.
        logs = f"Job {job.id} failed due to an internal error."
        logger.warning("Job %s failed due to an internal error.", job.id)
    return logs
17 changes: 15 additions & 2 deletions gateway/tests/api/management/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,8 +32,9 @@ def test_free_resources(self):
@patch("api.ray.get_job_handler")
def test_update_jobs_statuses(self, get_job_handler):
"""Tests update of job statuses."""
# Test status change from PENDING to RUNNING
ray_client = MagicMock()
ray_client.get_job_status.return_value = JobStatus.SUCCEEDED
ray_client.get_job_status.return_value = JobStatus.RUNNING
ray_client.get_job_logs.return_value = "No logs yet."
ray_client.stop_job.return_value = True
ray_client.submit_job.return_value = "AwesomeJobId"
Expand All @@ -42,7 +43,19 @@ def test_update_jobs_statuses(self, get_job_handler):
call_command("update_jobs_statuses")

job = Job.objects.get(id__exact="1a7947f9-6ae8-4e3d-ac1e-e7d608deec84")
self.assertEqual(job.status, "SUCCEEDED")
self.assertEqual(job.status, "RUNNING")

# Test job logs for FAILED job with empty logs
ray_client.get_job_status.return_value = JobStatus.FAILED
ray_client.get_job_logs.return_value = ""

call_command("update_jobs_statuses")

job = Job.objects.get(id__exact="1a7947f9-6ae8-4e3d-ac1e-e7d608deec84")
self.assertEqual(
job.logs,
"Job 1a7947f9-6ae8-4e3d-ac1e-e7d608deec84 failed due to an internal error.",
)

@patch("api.schedule.execute_job")
def test_schedule_queued_jobs(self, execute_job):
Expand Down
17 changes: 17 additions & 0 deletions gateway/tests/api/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
decrypt_string,
encrypt_env_vars,
decrypt_env_vars,
check_logs,
)


Expand Down Expand Up @@ -75,3 +76,19 @@ def test_env_vars_encryption(self):
self.assertEqual(
env_vars_with_qiskit_runtime, decrypt_env_vars(encrypted_env_vars)
)

def test_check_empty_logs(self):
    """Failed job with empty logs gets the internal-error notification."""
    failed_job = MagicMock()
    failed_job.status = "FAILED"
    failed_job.id = "42"

    expected_message = "Job 42 failed due to an internal error."
    self.assertEqual(check_logs(logs="", job=failed_job), expected_message)

def test_check_non_empty_logs(self):
    """Failed job with non-empty logs keeps its logs untouched."""
    failed_job = MagicMock()
    failed_job.status = "FAILED"
    failed_job.id = "42"

    existing_logs = "awsome logs"
    checked = check_logs(logs=existing_logs, job=failed_job)
    self.assertEqual(checked, "awsome logs")
2 changes: 1 addition & 1 deletion gateway/tests/fixtures/schedule_fixtures.json
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@
"program": "1a7947f9-6ae8-4e3d-ac1e-e7d608deec82",
"created": "2024-02-01T15:30:43.281796Z",
"result": "{\"somekey\":1}",
"status": "RUNNING",
"status": "PENDING",
"author": 3,
"compute_resource": "1a7947f9-6ae8-4e3d-ac1e-e7d608deec99"
}
Expand Down

0 comments on commit b48b886

Please sign in to comment.