Skip to content

Commit

Permalink
added double check on job info
Browse files Browse the repository at this point in the history
  • Loading branch information
jpdorsch committed Nov 30, 2022
1 parent fa1e0f3 commit 651d048
Showing 1 changed file with 19 additions and 4 deletions.
23 changes: 19 additions & 4 deletions src/compute/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from job_time import check_sacctTime

import logging
import time

from math import ceil
import os
Expand Down Expand Up @@ -266,6 +267,7 @@ def get_slurm_files(headers, system_name, system_addr, job_info, output=False, u
control_info = job_info
control_info["job_file_out"] = "Not available"
control_info["job_file_err"] = "Not available"
control_info["job_info_extra"] = "Job info returned successfully" # field for extra information about metadata of the job

ID = headers.get(TRACER_HEADER, '')
# scontrol command :
Expand All @@ -274,11 +276,24 @@ def get_slurm_files(headers, system_name, system_addr, job_info, output=False, u

app.logger.info(f"scontrol command: {action}")

resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin)
n_tries = 2 #tries 2 times to get the information of the jobs, otherwise returns error msg

for n_try in range(n_tries):

# if there was an error, the result will be SUCCESS but the outputs will not be available
if resp["error"] != 0:
return control_info
resp = exec_remote_command(headers, system_name, system_addr, action, no_home=use_plugin)

# if there was an error, the result will be SUCCESS but the outputs will not be available
if resp["error"] == 0:
break

app.logger.warning(f"Error getting job info. Reason: {resp['msg']}")

if n_try == n_tries - 1:
app.logger.warning(f"Returning default values")
control_info["job_info_extra"] = resp["msg"]
return control_info

time.sleep(TIMEOUT) # wait until next try

# if it's ok, we can add information
control_resp = resp["msg"]
Expand Down

0 comments on commit 651d048

Please sign in to comment.