From 442bbbb14ad5303908d0962ffdcac0b82d859785 Mon Sep 17 00:00:00 2001 From: zhaoyu Date: Wed, 13 Nov 2024 05:50:55 -0500 Subject: [PATCH 01/19] Add customJob from ctrl_bps --- config/bps_cmd.yaml | 31 +++++ .../panda/edgenode/build_cmd_line_decoder.py | 41 +------ .../bps/panda/edgenode/cmd_line_decoder.py | 110 +++++++++++++++++- python/lsst/ctrl/bps/panda/panda_service.py | 24 ++-- python/lsst/ctrl/bps/panda/utils.py | 61 +++++++++- 5 files changed, 218 insertions(+), 49 deletions(-) create mode 100644 config/bps_cmd.yaml diff --git a/config/bps_cmd.yaml b/config/bps_cmd.yaml new file mode 100644 index 0000000..50f1dd4 --- /dev/null +++ b/config/bps_cmd.yaml @@ -0,0 +1,31 @@ +payloadCommand: > + cd {jobInitDir}; + ls -al; + {setupLSSTEnv} + if [[ ! -z \"\${PANDA_AUTH_DIR}\" ]] && [[ ! -z \"\${PANDA_AUTH_ORIGIN}\" ]]; + then export PANDA_AUTH_ID_TOKEN=\$(cat $PANDA_AUTH_DIR); + export PANDA_AUTH_VO=\$PANDA_AUTH_ORIGIN; + export IDDS_OIDC_TOKEN=\$(cat \$PANDA_AUTH_DIR); + export IDDS_VO=\$PANDA_AUTH_ORIGIN; + export PANDA_AUTH=oidc; + else unset PANDA_AUTH; + export IDDS_AUTH_TYPE=x509_proxy; fi; + export PANDA_CONFIG_ROOT=\$(pwd); + export PANDA_VERIFY_HOST=off; + export PANDA_SYS=\$CONDA_PREFIX; + export PANDA_URL_SSL=\${PANDA_SERVER_URL}/server/panda; + export PANDACACHE_URL=\$PANDA_URL_SSL; + export PANDA_URL=\$PANDA_URL_SSL; + export PANDA_BEHIND_REAL_LB=true; + pwd; + export RUBIN_ES_MAP_FILE=orderIdMapFilename; + python3 \${CTRL_BPS_PANDA_DIR}/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py _cmd_line_ & pJob=\$!; + prmon -i 5 + -f ${logDir}/memory_monitor_output.txt + -j ${logDir}/memory_monitor_summary.json + -p \$pJob & mJob=\$!; + wait \$pJob; + ret=\$?; + wait \$mJob; + {jobCleanup} + exit \$ret; diff --git a/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py b/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py index 41a835e..b326b8f 100644 --- a/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py +++ 
b/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py @@ -11,7 +11,6 @@ import logging import os import sys -import tarfile from lsst.ctrl.bps.constants import DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT from lsst.ctrl.bps.drivers import prepare_driver @@ -19,6 +18,7 @@ from lsst.ctrl.bps.panda.utils import copy_files_for_distribution, get_idds_client from lsst.resources import ResourcePath from lsst.utils.timer import time_this +from lsst.ctrl.bps.panda.utils import download_extract_archive logging.basicConfig( stream=sys.stdout, @@ -29,45 +29,6 @@ _LOG = logging.getLogger(__name__) -def download_extract_archive(filename): - """Download and extract the tarball from pandacache. - - Parameters - ---------- - filename : `str` - The filename to download. - """ - archive_basename = os.path.basename(filename) - target_dir = os.getcwd() - full_output_filename = os.path.join(target_dir, archive_basename) - - if filename.startswith("https:"): - panda_cache_url = os.path.dirname(os.path.dirname(filename)) - os.environ["PANDACACHE_URL"] = panda_cache_url - elif "PANDACACHE_URL" not in os.environ and "PANDA_URL_SSL" in os.environ: - os.environ["PANDACACHE_URL"] = os.environ["PANDA_URL_SSL"] - panda_cache_url = os.environ.get("PANDACACHE_URL", None) - print(f"PANDACACHE_URL: {panda_cache_url}") - - from pandaclient import Client - - attempt = 0 - max_attempts = 3 - done = False - while attempt < max_attempts and not done: - status, output = Client.getFile(archive_basename, output_path=full_output_filename) - if status == 0: - done = True - print(f"Download archive file from pandacache status: {status}, output: {output}") - if status != 0: - raise RuntimeError("Failed to download archive file from pandacache") - with tarfile.open(full_output_filename, "r:gz") as f: - f.extractall(target_dir) - print(f"Extract {full_output_filename} to {target_dir}") - os.remove(full_output_filename) - print(f"Remove {full_output_filename}") - - def create_idds_workflow(config_file, 
compute_site): """Create pipeline workflow at remote site. diff --git a/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py b/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py index 771babf..3ad268a 100644 --- a/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py +++ b/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py @@ -41,7 +41,7 @@ import sys from lsst.resources import ResourcePath - +from lsst.ctrl.bps.panda.utils import download_extract_archive def replace_placeholders(cmd_line: str, tag: str, replacements: dict[str, str]) -> str: """Replace the placeholders. @@ -139,6 +139,14 @@ def deliver_input_files(src_path, files, skip_copy): """ files = files.split("+") src_uri = ResourcePath(src_path, forceDirectory=True) + + if 'jobO' in skip_copy: + download_extract_archive(skip_copy) + for script in files: + file_name_placeholder, file_pfn = script.split(":") + os.chmod(file_pfn, 0o755) + return + for file in files: file_name_placeholder, file_pfn = file.split(":") if file_name_placeholder not in skip_copy.split("+"): @@ -163,6 +171,106 @@ def deliver_input_files(src_path, files, skip_copy): if file_name_placeholder == "job_executable": os.chmod(dest.path, 0o777) +def replace_event_file(params, files): + """Replace events with node id. + + Parameters + ---------- + params : `str` + String with parameters separated by the '+' sign. + Example params: + isr:eventservice_90^10+somethingelse. This part + 'isr:eventservice_90^10' is the EventService parameter. + The format for the EventService parameter for LSST is + 'label:eventservice_^'. The '' should + start from 1, which means the first event of the file + 'label:eventservice_'. In EventService, all pseudo files + for a label is recorded in the 'orderIdMapFilename' file, with + a dict {'label0':{"0":"pseudo_file0", "1":..},'label1':..}. 
+ For example, for a workflow with 100 pseudo files for the 'isr' label, + the dict will be {'isr': {"0": "pseudo0", "1": "pseudo_file1", + "99": "pseudo_file99"}}. If we split the 100 pseudo files into 5 PanDA + jobs with 20 files per PanDA job, the 5 eventservice group name will be + 'isr:event_service_0' for events ["0"~"19"], 'isr:event_service_20' for + events ["20"~"39"], ..., and 'isr:event_service_80' for events + ["80"~"99"]. The EventService param 'isr:event_service_80^5' means the + 5th event in the group 'isr:event_service_80', which is '80 + 5 -1=84' + and will be mapped to file 'pseudo_file84'. + files : `str` + String with file names separated by the '+' sign. + Example: + orderIdMapFilename:panda_order_id_map.json+runQgraphFile:a.qgraph + + Returns + ------- + ret_status: `bool` + Status of this function. If eventservice is enabled but this function + cannot handle it, it should return False. Otherwise it should + return True. + with_events: `bool` + Whether there are event parameters. + params_map: `dict` [`str`, `dict`] + Parameter map for event information. 
+ """ + ret_status = True + with_events = False + files = files.split("+") + file_map = {} + for file in files: + file_name_placeholder, file_pfn = file.split(":") + file_map[file_name_placeholder] = file_pfn + order_id_map_file = file_map.get("orderIdMapFilename", None) + order_id_map = {} + try: + # The orderIdMapFilename should exist locally or copied to current + # directory by deliver_input_files + if order_id_map_file and os.path.exists(order_id_map_file): + with open(order_id_map_file) as f: + order_id_map = json.load(f) + except Exception as ex: + print(f"failed to load orderIdMapFilename: {ex}") + + params_map = {} + params_list = params.split("+") + for param in params_list: + if "eventservice_" in param: + with_events = True + label, event = param.split(":") + event_id = event.split("_")[1] + event_base_id = event_id.split("^")[0] + # The original format for EventService parameter is + # 'label:eventservice_^^', + # which can have multiple events per EventService job. + # However, for LSST, the '' is always 1. + # When is 1, it will not show. So for LSST, + # we will see 'label:eventservice_^'. + # However, to leave posibilities for future updates, + # the line below has two splits based on '^', which is from + # the original EventService parameter format. 
+ event_order = event_id.split("^")[1].split("^")[0] + event_index = str(int(event_base_id) + int(event_order) - 1) + if not order_id_map: + print("EventSerice is enabled but order_id_map file doesn't exist.") + ret_status = False + break + + if label not in order_id_map: + print( + f"EventSerice is enabled but label {label} doesn't in the keys" + f" of order_id_map {order_id_map.keys()}" + ) + ret_status = False + break + if event_index not in order_id_map[label]: + print( + f"EventSerice is enabled but event_index {event_index} is not" + f" in order_id_map[{label}] {order_id_map[label].keys()}" + ) + ret_status = False + break + + params_map[param] = {"event_index": event_index, "order_id_map": order_id_map[label]} + return ret_status, with_events, params_map def replace_event_file(params, files): """Replace events with node id. diff --git a/python/lsst/ctrl/bps/panda/panda_service.py b/python/lsst/ctrl/bps/panda/panda_service.py index c496e13..f792a29 100644 --- a/python/lsst/ctrl/bps/panda/panda_service.py +++ b/python/lsst/ctrl/bps/panda/panda_service.py @@ -68,6 +68,7 @@ def prepare(self, config, generic_workflow, out_prefix=None): def submit(self, workflow, **kwargs): config = kwargs["config"] if "config" in kwargs else None remote_build = kwargs["remote_build"] if "remote_build" in kwargs else None + _, submit_cmd = config.search("submitCmd", opt={"default": False}) if config and remote_build: _LOG.info("remote build") @@ -87,6 +88,7 @@ def submit(self, workflow, **kwargs): _LOG.info("Submitted into iDDs with request id=%s", request_id) idds_build_workflow.run_id = request_id return idds_build_workflow + else: _, max_copy_workers = self.config.search( "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS} @@ -96,14 +98,19 @@ def submit(self, workflow, **kwargs): lsst_temp = "LSST_RUN_TEMP_SPACE" if lsst_temp in file_distribution_uri and lsst_temp not in os.environ: file_distribution_uri = self.config["fileDistributionEndPointDefault"] - 
copy_files_for_distribution( - workflow.files_to_pre_stage, - ResourcePath(file_distribution_uri, forceDirectory=True), - max_copy_workers, - ) + if not submit_cmd: + copy_files_for_distribution( + workflow.files_to_pre_stage, + ResourcePath(file_distribution_uri, forceDirectory=True), + max_copy_workers, + ) + #''' idds_client = get_idds_client(self.config) - ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False) + ret = idds_client.submit( + workflow.idds_client_workflow, + username=None, + use_dataset_name=False) _LOG.debug("iDDS client manager submit returned = %s", ret) # Check submission success @@ -111,10 +118,13 @@ def submit(self, workflow, **kwargs): if status: request_id = int(result) else: - raise RuntimeError(f"Error submitting to PanDA service: {error}") + raise RuntimeError( + f"Error submitting to PanDA service: {error}" + ) _LOG.info("Submitted into iDDs with request id=%s", request_id) workflow.run_id = request_id + #''' def restart(self, wms_workflow_id): # Docstring inherited from BaseWmsService.restart. diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index 7915a0b..22a5252 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ b/python/lsst/ctrl/bps/panda/utils.py @@ -222,6 +222,7 @@ def _make_doma_work( es_label=None, max_payloads_per_panda_job=PANDA_DEFAULT_MAX_PAYLOADS_PER_PANDA_JOB, max_wms_job_wall_time=None, + remote_filename=None, ): """Make the DOMA Work object for a PanDA task. 
@@ -331,12 +332,19 @@ def _make_doma_work( if gwfile.job_access_remote: direct_io_files.add(gwfile.name) + _, submit_cmd = config.search("submitCmd", opt={"default": False}) + if not direct_io_files: - direct_io_files.add("cmdlineplaceholder") + if submit_cmd: + direct_io_files.add(remote_filename) + else: + direct_io_files.add("cmdlineplaceholder") lsst_temp = "LSST_RUN_TEMP_SPACE" if lsst_temp in file_distribution_end_point and lsst_temp not in os.environ: file_distribution_end_point = file_distribution_end_point_default + if submit_cmd and not file_distribution_end_point: + file_distribution_end_point = "FileDistribution" executable = add_decoder_prefix( config, cmd_line, file_distribution_end_point, (local_pfns, direct_io_files) @@ -554,6 +562,9 @@ def add_idds_work(config, generic_workflow, idds_workflow): RuntimeError If cannot recover from dependency issues after pass through workflow. """ + # custom job + _, submit_cmd = config.search("submitCmd", opt={"default": False}) + # event service _, enable_event_service = config.search("enableEventService", opt={"default": None}) _, max_payloads_per_panda_job = config.search( @@ -574,6 +585,16 @@ def add_idds_work(config, generic_workflow, idds_workflow): job_to_task = {} job_to_pseudo_filename = {} task_count = 0 # Task number/ID in idds workflow used for unique name + remote_filename = None + + if submit_cmd: + files = [] + _, script = config["customJob"].search("executable", opt={"default": ""}) + files.append(script) + submit_path = config["submitPath"] + archive_filename = f"jobO.{uuid.uuid4()}.tar.gz" + archive_filename = create_archive_file(submit_path, archive_filename, files) + remote_filename = copy_files_to_pandacache(archive_filename) es_files = {} name_works = {} @@ -649,6 +670,7 @@ def add_idds_work(config, generic_workflow, idds_workflow): es_label=job_label, max_payloads_per_panda_job=max_payloads_per_panda_job, max_wms_job_wall_time=max_wms_job_wall_time, + remote_filename=remote_filename, ) 
name_works[work.task_name] = work files_to_pre_stage.update(files) @@ -759,6 +781,43 @@ def copy_files_to_pandacache(filename): filename = os.path.join(cache_path, filename) return filename +def download_extract_archive(filename): + """Download and extract the tarball from pandacache. + + Parameters + ---------- + filename : `str` + The filename to download. + """ + archive_basename = os.path.basename(filename) + target_dir = os.getcwd() + full_output_filename = os.path.join(target_dir, archive_basename) + + if filename.startswith("https:"): + panda_cache_url = os.path.dirname(os.path.dirname(filename)) + os.environ["PANDACACHE_URL"] = panda_cache_url + elif "PANDACACHE_URL" not in os.environ and "PANDA_URL_SSL" in os.environ: + os.environ["PANDACACHE_URL"] = os.environ["PANDA_URL_SSL"] + panda_cache_url = os.environ.get("PANDACACHE_URL", None) + print(f"PANDACACHE_URL: {panda_cache_url}") + + from pandaclient import Client + + attempt = 0 + max_attempts = 3 + done = False + while attempt < max_attempts and not done: + status, output = Client.getFile(archive_basename, output_path=full_output_filename) + if status == 0: + done = True + print(f"Download archive file from pandacache status: {status}, output: {output}") + if status != 0: + raise RuntimeError("Failed to download archive file from pandacache") + with tarfile.open(full_output_filename, "r:gz") as f: + f.extractall(target_dir) + print(f"Extract {full_output_filename} to {target_dir}") + os.remove(full_output_filename) + print(f"Remove {full_output_filename}") def get_task_parameter(config, remote_build, key): search_opt = {"replaceVars": True, "expandEnvVars": False, "replaceEnvVars": False, "required": False} From b011b973d76467d9fe105922392baabbac1f71e8 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Tue, 19 Nov 2024 11:41:13 -0500 Subject: [PATCH 02/19] Reformat panda_service.py module with black **black** wasn't quite happy with formatting in `panda_service.py`. I let it do its thing. 
--- python/lsst/ctrl/bps/panda/panda_service.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/panda_service.py b/python/lsst/ctrl/bps/panda/panda_service.py index f792a29..1c958e8 100644 --- a/python/lsst/ctrl/bps/panda/panda_service.py +++ b/python/lsst/ctrl/bps/panda/panda_service.py @@ -107,10 +107,7 @@ def submit(self, workflow, **kwargs): ) #''' idds_client = get_idds_client(self.config) - ret = idds_client.submit( - workflow.idds_client_workflow, - username=None, - use_dataset_name=False) + ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False) _LOG.debug("iDDS client manager submit returned = %s", ret) # Check submission success @@ -118,9 +115,7 @@ def submit(self, workflow, **kwargs): if status: request_id = int(result) else: - raise RuntimeError( - f"Error submitting to PanDA service: {error}" - ) + raise RuntimeError(f"Error submitting to PanDA service: {error}") _LOG.info("Submitted into iDDs with request id=%s", request_id) workflow.run_id = request_id From 1c1f844e802b1485d5142627b221604eae54a454 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Tue, 19 Nov 2024 11:44:51 -0500 Subject: [PATCH 03/19] Remove comments that seemed to be out of place --- python/lsst/ctrl/bps/panda/panda_service.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/panda_service.py b/python/lsst/ctrl/bps/panda/panda_service.py index 1c958e8..525db75 100644 --- a/python/lsst/ctrl/bps/panda/panda_service.py +++ b/python/lsst/ctrl/bps/panda/panda_service.py @@ -93,7 +93,6 @@ def submit(self, workflow, **kwargs): _, max_copy_workers = self.config.search( "maxCopyWorkers", opt={"default": PANDA_DEFAULT_MAX_COPY_WORKERS} ) - # Docstring inherited from BaseWmsService.submit. 
file_distribution_uri = self.config["fileDistributionEndPoint"] lsst_temp = "LSST_RUN_TEMP_SPACE" if lsst_temp in file_distribution_uri and lsst_temp not in os.environ: @@ -105,7 +104,7 @@ def submit(self, workflow, **kwargs): ResourcePath(file_distribution_uri, forceDirectory=True), max_copy_workers, ) - #''' + idds_client = get_idds_client(self.config) ret = idds_client.submit(workflow.idds_client_workflow, username=None, use_dataset_name=False) _LOG.debug("iDDS client manager submit returned = %s", ret) @@ -119,7 +118,6 @@ def submit(self, workflow, **kwargs): _LOG.info("Submitted into iDDs with request id=%s", request_id) workflow.run_id = request_id - #''' def restart(self, wms_workflow_id): # Docstring inherited from BaseWmsService.restart. From 73abf92aeb434314e4930ec7ea4e3ac8e206f927 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Tue, 19 Nov 2024 12:18:45 -0500 Subject: [PATCH 04/19] Pass run attribs between generic and WMS workflows The run attributes of the generic workflow were not being passed to the WMS workflow. Changed that. --- python/lsst/ctrl/bps/panda/panda_service.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/lsst/ctrl/bps/panda/panda_service.py b/python/lsst/ctrl/bps/panda/panda_service.py index 525db75..c8cb6d4 100644 --- a/python/lsst/ctrl/bps/panda/panda_service.py +++ b/python/lsst/ctrl/bps/panda/panda_service.py @@ -373,12 +373,16 @@ def __init__(self, name, config=None): super().__init__(name, config) self.files_to_pre_stage = {} # src, dest self.idds_client_workflow = IDDS_client_workflow(name=name) + self.run_attrs = {} @classmethod def from_generic_workflow(cls, config, generic_workflow, out_prefix, service_class): # Docstring inherited from BaseWmsWorkflow.from_generic_workflow. 
wms_workflow = cls(generic_workflow.name, config) + if generic_workflow.run_attrs: + wms_workflow.run_attrs.update(generic_workflow.run_attrs) + files, dag_sink_work, task_count = add_idds_work( config, generic_workflow, wms_workflow.idds_client_workflow ) From f35de3ad346e2a8059b9e234dd0d28a1cd1549f0 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Tue, 19 Nov 2024 12:22:52 -0500 Subject: [PATCH 05/19] Use run attribute to determine if custom job The plugin were using an extra configuration, `submitCmd`, to determine if the workflow being submitted is a regular one (i.e. with payload jobs) or special (i.e. running a custom script). I make changes so a run attribute, `bps_iscustom`, is used instead. --- python/lsst/ctrl/bps/panda/panda_service.py | 2 +- python/lsst/ctrl/bps/panda/utils.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/panda_service.py b/python/lsst/ctrl/bps/panda/panda_service.py index c8cb6d4..45d514c 100644 --- a/python/lsst/ctrl/bps/panda/panda_service.py +++ b/python/lsst/ctrl/bps/panda/panda_service.py @@ -68,7 +68,6 @@ def prepare(self, config, generic_workflow, out_prefix=None): def submit(self, workflow, **kwargs): config = kwargs["config"] if "config" in kwargs else None remote_build = kwargs["remote_build"] if "remote_build" in kwargs else None - _, submit_cmd = config.search("submitCmd", opt={"default": False}) if config and remote_build: _LOG.info("remote build") @@ -98,6 +97,7 @@ def submit(self, workflow, **kwargs): if lsst_temp in file_distribution_uri and lsst_temp not in os.environ: file_distribution_uri = self.config["fileDistributionEndPointDefault"] + submit_cmd = workflow.run_attrs.get("bps_iscustom", False) if not submit_cmd: copy_files_for_distribution( workflow.files_to_pre_stage, diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index 22a5252..a32f0c5 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ 
b/python/lsst/ctrl/bps/panda/utils.py @@ -332,7 +332,7 @@ def _make_doma_work( if gwfile.job_access_remote: direct_io_files.add(gwfile.name) - _, submit_cmd = config.search("submitCmd", opt={"default": False}) + submit_cmd = generic_workflow.run_attrs.get("bps_iscustom", False) if not direct_io_files: if submit_cmd: @@ -562,9 +562,6 @@ def add_idds_work(config, generic_workflow, idds_workflow): RuntimeError If cannot recover from dependency issues after pass through workflow. """ - # custom job - _, submit_cmd = config.search("submitCmd", opt={"default": False}) - # event service _, enable_event_service = config.search("enableEventService", opt={"default": None}) _, max_payloads_per_panda_job = config.search( @@ -587,6 +584,7 @@ def add_idds_work(config, generic_workflow, idds_workflow): task_count = 0 # Task number/ID in idds workflow used for unique name remote_filename = None + submit_cmd = generic_workflow.run_attrs.get("bps_iscustom", False) if submit_cmd: files = [] _, script = config["customJob"].search("executable", opt={"default": ""}) @@ -781,6 +779,7 @@ def copy_files_to_pandacache(filename): filename = os.path.join(cache_path, filename) return filename + def download_extract_archive(filename): """Download and extract the tarball from pandacache. 
@@ -819,6 +818,7 @@ def download_extract_archive(filename): os.remove(full_output_filename) print(f"Remove {full_output_filename}") + def get_task_parameter(config, remote_build, key): search_opt = {"replaceVars": True, "expandEnvVars": False, "replaceEnvVars": False, "required": False} _, value = remote_build.search(key, search_opt) From b0abadc4391db85a670d78bab87f912848568c81 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Tue, 19 Nov 2024 17:12:51 -0500 Subject: [PATCH 06/19] Document submitting custom scripts --- doc/lsst.ctrl.bps.panda/userguide.rst | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/doc/lsst.ctrl.bps.panda/userguide.rst b/doc/lsst.ctrl.bps.panda/userguide.rst index e7c3cf0..a400921 100644 --- a/doc/lsst.ctrl.bps.panda/userguide.rst +++ b/doc/lsst.ctrl.bps.panda/userguide.rst @@ -71,6 +71,46 @@ See `bps submit`_ and https://panda.lsst.io for details. .. __: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#submitting-a-run +.. _panda-plugin-submitting-custom-script: + +Submitting a custom script +-------------------------- + +See `bps submitcmd`_ for details. + +.. Describe any plugin specific aspects of a submission below if any. + +To execute custom scripts you need to specify the version of the LSST Stack to +use *and* include the settings from +``${CTRL_BPS_PANDA_DIR}/config/bps_panda_DF.yaml`` and +``${CTRL_BPS_PANDA_DIR}/config/bps_cmd.yaml`` in your BPS config. + +.. code-block:: yaml + + LSST_VERSION: + + includeConfigs: + - ${CTRL_BPS_PANDA_DIR}/config/bps_panda_DF.yaml + - ${CTRL_BPS_PANDA_DIR}/config/bps_cmd.yaml + + customJob: + executable: + arguments: + +where ```` is the version of the LSST Stack to use while ```` +and ```` are respectively the script to run and arguments it takes. + +By default, the script will be executed at USDF (SLAC). 
If you would like your +script to be executed at FrDF (CC-IN2P3), set ``computeSite`` and +``computeCloud`` in your BPS config to the values shown below: + +.. code-block:: yaml + + computeSite: "CC-IN2P3" + computeCloud: "EU" + +To execute the script in UKDF, set ``computeSite`` to ``LANCS``. + .. _panda-plugin-status: Checking status @@ -111,4 +151,5 @@ Restarting a failed run .. _bps report: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#checking-status .. _bps restart: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#restarting-a-failed-run .. _bps submit: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#submitting-a-run +.. _bps submitcmd: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#submitting-a-custom-script .. _ctrl_bps: https://github.com/lsst/ctrl_bps.git From 6ae0aad81ba831cab9bcf41fd6964f8c1d85a46f Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Thu, 21 Nov 2024 17:03:29 -0500 Subject: [PATCH 07/19] Add the news item describing the changes --- doc/changes/DM-46307.feature.rst | 1 + 1 file changed, 1 insertion(+) create mode 100644 doc/changes/DM-46307.feature.rst diff --git a/doc/changes/DM-46307.feature.rst b/doc/changes/DM-46307.feature.rst new file mode 100644 index 0000000..3991f23 --- /dev/null +++ b/doc/changes/DM-46307.feature.rst @@ -0,0 +1 @@ +The changes were made to allow the plugin to execute jobs remotely without the necessity to use remote build approach. From 70f47db7df9501471fcdf49253fcfda863326df7 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Thu, 21 Nov 2024 16:57:30 -0500 Subject: [PATCH 08/19] Remove duplicated code For some reasons (merge went wrong?) `replace_event_file()` function was duplicated in `cmd_line_decoder.py` module. Removed the duplicate. 
--- .../panda/edgenode/build_cmd_line_decoder.py | 3 +- .../bps/panda/edgenode/cmd_line_decoder.py | 105 +----------------- 2 files changed, 4 insertions(+), 104 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py b/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py index b326b8f..c49edc1 100644 --- a/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py +++ b/python/lsst/ctrl/bps/panda/edgenode/build_cmd_line_decoder.py @@ -15,10 +15,9 @@ from lsst.ctrl.bps.constants import DEFAULT_MEM_FMT, DEFAULT_MEM_UNIT from lsst.ctrl.bps.drivers import prepare_driver from lsst.ctrl.bps.panda.constants import PANDA_DEFAULT_MAX_COPY_WORKERS -from lsst.ctrl.bps.panda.utils import copy_files_for_distribution, get_idds_client +from lsst.ctrl.bps.panda.utils import copy_files_for_distribution, download_extract_archive, get_idds_client from lsst.resources import ResourcePath from lsst.utils.timer import time_this -from lsst.ctrl.bps.panda.utils import download_extract_archive logging.basicConfig( stream=sys.stdout, diff --git a/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py b/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py index 3ad268a..46858e9 100644 --- a/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py +++ b/python/lsst/ctrl/bps/panda/edgenode/cmd_line_decoder.py @@ -40,8 +40,9 @@ import re import sys -from lsst.resources import ResourcePath from lsst.ctrl.bps.panda.utils import download_extract_archive +from lsst.resources import ResourcePath + def replace_placeholders(cmd_line: str, tag: str, replacements: dict[str, str]) -> str: """Replace the placeholders. 
@@ -140,7 +141,7 @@ def deliver_input_files(src_path, files, skip_copy): files = files.split("+") src_uri = ResourcePath(src_path, forceDirectory=True) - if 'jobO' in skip_copy: + if "jobO" in skip_copy: download_extract_archive(skip_copy) for script in files: file_name_placeholder, file_pfn = script.split(":") @@ -171,106 +172,6 @@ def deliver_input_files(src_path, files, skip_copy): if file_name_placeholder == "job_executable": os.chmod(dest.path, 0o777) -def replace_event_file(params, files): - """Replace events with node id. - - Parameters - ---------- - params : `str` - String with parameters separated by the '+' sign. - Example params: - isr:eventservice_90^10+somethingelse. This part - 'isr:eventservice_90^10' is the EventService parameter. - The format for the EventService parameter for LSST is - 'label:eventservice_^'. The '' should - start from 1, which means the first event of the file - 'label:eventservice_'. In EventService, all pseudo files - for a label is recorded in the 'orderIdMapFilename' file, with - a dict {'label0':{"0":"pseudo_file0", "1":..},'label1':..}. - For example, for a workflow with 100 pseudo files for the 'isr' label, - the dict will be {'isr': {"0": "pseudo0", "1": "pseudo_file1", - "99": "pseudo_file99"}}. If we split the 100 pseudo files into 5 PanDA - jobs with 20 files per PanDA job, the 5 eventservice group name will be - 'isr:event_service_0' for events ["0"~"19"], 'isr:event_service_20' for - events ["20"~"39"], ..., and 'isr:event_service_80' for events - ["80"~"99"]. The EventService param 'isr:event_service_80^5' means the - 5th event in the group 'isr:event_service_80', which is '80 + 5 -1=84' - and will be mapped to file 'pseudo_file84'. - files : `str` - String with file names separated by the '+' sign. - Example: - orderIdMapFilename:panda_order_id_map.json+runQgraphFile:a.qgraph - - Returns - ------- - ret_status: `bool` - Status of this function. 
If eventservice is enabled but this function - cannot handle it, it should return False. Otherwise it should - return True. - with_events: `bool` - Whether there are event parameters. - params_map: `dict` [`str`, `dict`] - Parameter map for event information. - """ - ret_status = True - with_events = False - files = files.split("+") - file_map = {} - for file in files: - file_name_placeholder, file_pfn = file.split(":") - file_map[file_name_placeholder] = file_pfn - order_id_map_file = file_map.get("orderIdMapFilename", None) - order_id_map = {} - try: - # The orderIdMapFilename should exist locally or copied to current - # directory by deliver_input_files - if order_id_map_file and os.path.exists(order_id_map_file): - with open(order_id_map_file) as f: - order_id_map = json.load(f) - except Exception as ex: - print(f"failed to load orderIdMapFilename: {ex}") - - params_map = {} - params_list = params.split("+") - for param in params_list: - if "eventservice_" in param: - with_events = True - label, event = param.split(":") - event_id = event.split("_")[1] - event_base_id = event_id.split("^")[0] - # The original format for EventService parameter is - # 'label:eventservice_^^', - # which can have multiple events per EventService job. - # However, for LSST, the '' is always 1. - # When is 1, it will not show. So for LSST, - # we will see 'label:eventservice_^'. - # However, to leave posibilities for future updates, - # the line below has two splits based on '^', which is from - # the original EventService parameter format. 
- event_order = event_id.split("^")[1].split("^")[0] - event_index = str(int(event_base_id) + int(event_order) - 1) - if not order_id_map: - print("EventSerice is enabled but order_id_map file doesn't exist.") - ret_status = False - break - - if label not in order_id_map: - print( - f"EventSerice is enabled but label {label} doesn't in the keys" - f" of order_id_map {order_id_map.keys()}" - ) - ret_status = False - break - if event_index not in order_id_map[label]: - print( - f"EventSerice is enabled but event_index {event_index} is not" - f" in order_id_map[{label}] {order_id_map[label].keys()}" - ) - ret_status = False - break - - params_map[param] = {"event_index": event_index, "order_id_map": order_id_map[label]} - return ret_status, with_events, params_map def replace_event_file(params, files): """Replace events with node id. From 39cbf5308ebe407cb5d0a0ac21381cd1f7d6d640 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Wed, 4 Dec 2024 16:26:00 -0500 Subject: [PATCH 09/19] Rename config YAML needed to run commands remotely To make it clear that the config YAML ``bps_cmd.yaml`` contains PanDA-specific settings and should not be included while other BPS plugins are used, I renamed it to ``bps_panda_cmd.yaml``. --- config/{bps_cmd.yaml => bps_panda_cmd.yaml} | 0 doc/lsst.ctrl.bps.panda/userguide.rst | 4 ++-- 2 files changed, 2 insertions(+), 2 deletions(-) rename config/{bps_cmd.yaml => bps_panda_cmd.yaml} (100%) diff --git a/config/bps_cmd.yaml b/config/bps_panda_cmd.yaml similarity index 100% rename from config/bps_cmd.yaml rename to config/bps_panda_cmd.yaml diff --git a/doc/lsst.ctrl.bps.panda/userguide.rst b/doc/lsst.ctrl.bps.panda/userguide.rst index a400921..07f38ce 100644 --- a/doc/lsst.ctrl.bps.panda/userguide.rst +++ b/doc/lsst.ctrl.bps.panda/userguide.rst @@ -83,7 +83,7 @@ See `bps submitcmd`_ for details.
To execute custom scripts you need to specify the version of the LSST Stack to use *and* include the settings from ``${CTRL_BPS_PANDA_DIR}/config/bps_panda_DF.yaml`` and -``${CTRL_BPS_PANDA_DIR}/config/bps_cmd.yaml`` in your BPS config. +``${CTRL_BPS_PANDA_DIR}/config/bps_panda_cmd.yaml`` in your BPS config. .. code-block:: yaml @@ -91,7 +91,7 @@ use *and* include the settings from includeConfigs: - ${CTRL_BPS_PANDA_DIR}/config/bps_panda_DF.yaml - - ${CTRL_BPS_PANDA_DIR}/config/bps_cmd.yaml + - ${CTRL_BPS_PANDA_DIR}/config/bps_panda_cmd.yaml customJob: executable: From da7cec3f502af6b0f10b5ebbfb8bb350ff46a7ea Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:04:15 -0500 Subject: [PATCH 10/19] Remove default butler config location File ``bps_panda_DF.yaml`` is supposed to contain generic settings that can be used regardless of the compute site. However, it contained a default location of the Butler config. However, these locations differ between the compute site so I removed it. 
--- config/bps_panda_DF.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/config/bps_panda_DF.yaml b/config/bps_panda_DF.yaml index 01efc28..dfd84a9 100644 --- a/config/bps_panda_DF.yaml +++ b/config/bps_panda_DF.yaml @@ -10,10 +10,6 @@ s3EndpointUrl: "https://storage.googleapis.com" payloadFolder: payload fileDistributionEndPoint: "${LSST_RUN_TEMP_SPACE}/panda_cache_box/{payloadFolder}/{uniqProcName}/" -# location of main butler repo at USDF -payload: - butlerConfig: panda-test-med-1 - # Job environment setup custom_lsst_setup: "" setupLSSTEnv: > From 6662299652160c7d141e5134591da303ff451a31 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:10:44 -0500 Subject: [PATCH 11/19] Rename a variable to improve code readability --- python/lsst/ctrl/bps/panda/utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index a32f0c5..cdb6c2b 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ b/python/lsst/ctrl/bps/panda/utils.py @@ -582,7 +582,7 @@ def add_idds_work(config, generic_workflow, idds_workflow): job_to_task = {} job_to_pseudo_filename = {} task_count = 0 # Task number/ID in idds workflow used for unique name - remote_filename = None + remote_archive_filename = None submit_cmd = generic_workflow.run_attrs.get("bps_iscustom", False) if submit_cmd: @@ -592,7 +592,7 @@ def add_idds_work(config, generic_workflow, idds_workflow): submit_path = config["submitPath"] archive_filename = f"jobO.{uuid.uuid4()}.tar.gz" archive_filename = create_archive_file(submit_path, archive_filename, files) - remote_filename = copy_files_to_pandacache(archive_filename) + remote_archive_filename = copy_files_to_pandacache(archive_filename) es_files = {} name_works = {} @@ -668,7 +668,7 @@ def add_idds_work(config, generic_workflow, idds_workflow): es_label=job_label, max_payloads_per_panda_job=max_payloads_per_panda_job, 
max_wms_job_wall_time=max_wms_job_wall_time, - remote_filename=remote_filename, + remote_filename=remote_archive_filename, ) name_works[work.task_name] = work files_to_pre_stage.update(files) From 89b86b099cb9dd71431f10ee2bd8175abf280c53 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:12:08 -0500 Subject: [PATCH 12/19] Make download_extract_archive() more generic Added an optional parameter to download_extract_archive() function that allows the programmer to control to what directory the archive will be downloaded and extracted to. --- python/lsst/ctrl/bps/panda/utils.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index cdb6c2b..651e594 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ b/python/lsst/ctrl/bps/panda/utils.py @@ -780,16 +780,19 @@ def copy_files_to_pandacache(filename): return filename -def download_extract_archive(filename): +def download_extract_archive(filename, prefix=None): """Download and extract the tarball from pandacache. Parameters ---------- filename : `str` The filename to download. + prefix : `str`, optional + The target directory the tarball will be downloaded and extracted to. + If None (default), the current directory will be used. """ archive_basename = os.path.basename(filename) - target_dir = os.getcwd() + target_dir = prefix if prefix is not None else os.getcwd() full_output_filename = os.path.join(target_dir, archive_basename) if filename.startswith("https:"): From 91b4a80ed6da4f18ef6980ca904bfb54852a6663 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:16:54 -0500 Subject: [PATCH 13/19] Add a comment explaining in-function import There's an import inside the ``download_extract_archive()`` function. Added a comment explaining why it cannot be done at the beginning of the module containing it.
--- python/lsst/ctrl/bps/panda/utils.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index 651e594..41837c3 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ b/python/lsst/ctrl/bps/panda/utils.py @@ -803,6 +803,8 @@ def download_extract_archive(filename, prefix=None): panda_cache_url = os.environ.get("PANDACACHE_URL", None) print(f"PANDACACHE_URL: {panda_cache_url}") + # The import of PanDA client must happen *after* the PANDACACHE_URL is set. + # Otherwise, the PanDA client will not parse the environment setting. from pandaclient import Client attempt = 0 From 7cd9ff928ae85ed4010ad6c375d2a983e178087e Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:20:36 -0500 Subject: [PATCH 14/19] Add a random sleep between downloads Added a random sleep period between successive attempts at downloading the archive to mitigate potential transient network issues. --- python/lsst/ctrl/bps/panda/utils.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index 41837c3..cd4bff6 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ b/python/lsst/ctrl/bps/panda/utils.py @@ -39,7 +39,9 @@ import concurrent.futures import logging import os +import random import tarfile +import time import uuid import idds.common.utils as idds_utils @@ -809,11 +811,17 @@ def download_extract_archive(filename, prefix=None): attempt = 0 max_attempts = 3 - done = False - while attempt < max_attempts and not done: + while attempt < max_attempts: status, output = Client.getFile(archive_basename, output_path=full_output_filename) if status == 0: - done = True + break + if attempt <= 1: + secs = random.randint(1, 10) + elif attempt <= 2: + secs = random.randint(1, 60) + else: + secs = random.randint(1, 120) + time.sleep(secs) print(f"Download archive file from pandacache status: {status},
output: {output}") if status != 0: raise RuntimeError("Failed to download archive file from pandacache") From 66eded401b28375d1b07f5c9f109dcadc8a520dd Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:24:12 -0500 Subject: [PATCH 15/19] Use the correct location of the script While preparing the archive with the script to execute, the original was used instead of the copy in the submit directory. Fixed this. --- python/lsst/ctrl/bps/panda/utils.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index cd4bff6..478cbd2 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ b/python/lsst/ctrl/bps/panda/utils.py @@ -588,9 +588,7 @@ def add_idds_work(config, generic_workflow, idds_workflow): submit_cmd = generic_workflow.run_attrs.get("bps_iscustom", False) if submit_cmd: - files = [] - _, script = config["customJob"].search("executable", opt={"default": ""}) - files.append(script) + files = generic_workflow.get_executables(data=False, transfer_only=True) submit_path = config["submitPath"] archive_filename = f"jobO.{uuid.uuid4()}.tar.gz" archive_filename = create_archive_file(submit_path, archive_filename, files) From 34d2911b32d444a2b1d153c5ed1437eb2825ad1f Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:27:49 -0500 Subject: [PATCH 16/19] Fix typos in log messages --- python/lsst/ctrl/bps/panda/utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/lsst/ctrl/bps/panda/utils.py b/python/lsst/ctrl/bps/panda/utils.py index 478cbd2..7260505 100644 --- a/python/lsst/ctrl/bps/panda/utils.py +++ b/python/lsst/ctrl/bps/panda/utils.py @@ -825,9 +825,9 @@ def download_extract_archive(filename, prefix=None): raise RuntimeError("Failed to download archive file from pandacache") with tarfile.open(full_output_filename, "r:gz") as f: f.extractall(target_dir) - print(f"Extract {full_output_filename} to {target_dir}") + 
print(f"Extracted {full_output_filename} to {target_dir}") os.remove(full_output_filename) - print(f"Remove {full_output_filename}") + print(f"Removed {full_output_filename}") def get_task_parameter(config, remote_build, key): From a25a585beed1e7b2cfca24671021eaa867250d18 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:29:10 -0500 Subject: [PATCH 17/19] Fix few typos in the documentation --- doc/lsst.ctrl.bps.panda/userguide.rst | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/doc/lsst.ctrl.bps.panda/userguide.rst b/doc/lsst.ctrl.bps.panda/userguide.rst index 07f38ce..870f22e 100644 --- a/doc/lsst.ctrl.bps.panda/userguide.rst +++ b/doc/lsst.ctrl.bps.panda/userguide.rst @@ -38,7 +38,7 @@ Defining a submission BPS configuration files are YAML files with some reserved keywords and some special features. See `BPS configuration file`__ for details. -.. Describe any plugin specific ascpects of a definiing a submissinon below if +.. Describe any plugin specific aspects of a definiing a submission below if any. The memory autoscaling is *not* supported supported by the ``ctrl_bps_panda``, i.e., @@ -56,7 +56,7 @@ will have not effect on workflows submitted with this plugin. Authenticating -------------- -.. Describe any plugin specific ascpects of a authentication below if any. +.. Describe any plugin specific aspects of a authentication below if any. See https://panda.lsst.io for details. @@ -67,7 +67,7 @@ Submitting a run See `bps submit`_ and https://panda.lsst.io for details. -.. Describe any plugin specific ascpects of a submissinon below if any. +.. Describe any plugin specific aspects of a submission below if any. .. __: https://pipelines.lsst.io/v/weekly/modules/lsst.ctrl.bps/quickstart.html#submitting-a-run @@ -78,7 +78,7 @@ Submitting a custom script See `bps submitcmd`_ for details. -.. Describe any plugin specific ascpects of a submissinon below if any. +.. 
Describe any plugin specific aspects of a submission below if any. To execute custom scripts you need to specify the version of the LSST Stack to use *and* include the settings from @@ -118,7 +118,7 @@ Checking status `bps report`_ is *not* supported, use the WMS commands/tools directly. -.. Describe any plugin specific ascpects of a checking submission status below +.. Describe any plugin specific aspects of a checking submission status below if any. .. _panda-plugin-cancelling: @@ -128,7 +128,7 @@ Canceling submitted jobs `bps cancel`_ is *not* supported, use the WMS commands/tools directly. -.. Describe any plugin specific ascpects of a canceling submitted jobs below +.. Describe any plugin specific aspects of a canceling submitted jobs below if any. .. _panda-plugin-restarting: @@ -138,7 +138,7 @@ Restarting a failed run `bps restart`_ is *not* supported, use the WMS commands/tools directly. -.. Describe any plugin specific ascpects of restarting a failed jobs below +.. Describe any plugin specific aspects of restarting a failed jobs below if any. .. .. _panda-plugin-troubleshooting: From 7bc1737a8879ba4a16481b7a0208275e6de225f9 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 11:34:32 -0500 Subject: [PATCH 18/19] Update documentation Described alternatives for specifying ``computeSite``. --- doc/lsst.ctrl.bps.panda/userguide.rst | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/lsst.ctrl.bps.panda/userguide.rst b/doc/lsst.ctrl.bps.panda/userguide.rst index 870f22e..1a53a92 100644 --- a/doc/lsst.ctrl.bps.panda/userguide.rst +++ b/doc/lsst.ctrl.bps.panda/userguide.rst @@ -81,7 +81,7 @@ See `bps submitcmd`_ for details. .. Describe any plugin specific aspects of a submission below if any. 
To execute custom scripts you need to specify the version of the LSST Stack to -use *and* include the settings from +use and include the settings from ``${CTRL_BPS_PANDA_DIR}/config/bps_panda_DF.yaml`` and ``${CTRL_BPS_PANDA_DIR}/config/bps_panda_cmd.yaml`` in your BPS config. @@ -111,6 +111,12 @@ script to be executed at FrDF (CC-IN2P3), set ``computeSite`` and To execute the script in UKDF, set ``computeSite`` to ``LANCS``. +.. note:: + + Alternatively, you can include ``bps_panda_frdf.yaml`` or + ``bps_panda_ukdf.yaml`` instead of the ``bps_panda_DF.yaml`` which will set + the right ``computeSite`` (and ``computeCloud``) for you. + .. _panda-plugin-status: Checking status From a0023bbcabbc7a4999700054025bcda6971ab0b2 Mon Sep 17 00:00:00 2001 From: Mikolaj Kowalik Date: Mon, 9 Dec 2024 15:11:25 -0500 Subject: [PATCH 19/19] Clean up configs files Brought ``fileDistributionEndPoint`` values in the site-specific configs in sync with the value in ``bps_remote_DF.yaml`` and added missing scheme parts of these URLs. 
--- config/bps_frdf.yaml | 4 ++-- config/bps_panda_DF.yaml | 2 +- config/bps_ukdf.yaml | 4 ++-- config/bps_usdf.yaml | 4 ++-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/config/bps_frdf.yaml b/config/bps_frdf.yaml index 414ecd9..4560e89 100644 --- a/config/bps_frdf.yaml +++ b/config/bps_frdf.yaml @@ -9,8 +9,8 @@ computeCloud: EU computeSite: CC-IN2P3 s3EndpointUrl: "https://storage.googleapis.com" payloadFolder: payload -fileDistributionEndPoint: "file://${LSST_RUN_TEMP_SPACE}/{operator}/panda_cache_box/{payloadFolder}/{uniqProcName}/" -fileDistributionEndPointDefault: "file:///sps/lsst/users/lsstgrid/{operator}/panda_cache_box/{payloadFolder}/{uniqProcName}/" +fileDistributionEndPoint: "file://${LSST_RUN_TEMP_SPACE}/panda_cache_box/{payloadFolder}/{uniqProcName}/" +fileDistributionEndPointDefault: "file:///sps/lsst/users/lsstgrid/panda_cache_box/{payloadFolder}/{uniqProcName}/" # location of main butler repo at USDF payload: diff --git a/config/bps_panda_DF.yaml b/config/bps_panda_DF.yaml index dfd84a9..83baf95 100644 --- a/config/bps_panda_DF.yaml +++ b/config/bps_panda_DF.yaml @@ -8,7 +8,7 @@ project: dev campaign: quick s3EndpointUrl: "https://storage.googleapis.com" payloadFolder: payload -fileDistributionEndPoint: "${LSST_RUN_TEMP_SPACE}/panda_cache_box/{payloadFolder}/{uniqProcName}/" +fileDistributionEndPoint: "file://${LSST_RUN_TEMP_SPACE}/panda_cache_box/{payloadFolder}/{uniqProcName}/" # Job environment setup custom_lsst_setup: "" diff --git a/config/bps_ukdf.yaml b/config/bps_ukdf.yaml index f436238..5feb3c7 100644 --- a/config/bps_ukdf.yaml +++ b/config/bps_ukdf.yaml @@ -9,8 +9,8 @@ computeCloud: EU computeSite: LANCS s3EndpointUrl: "https://storage.googleapis.com" payloadFolder: payload -fileDistributionEndPoint: "file://${LSST_RUN_TEMP_SPACE}/{operator}/panda_cache_box/{payloadFolder}/{uniqProcName}/" -fileDistributionEndPointDefault: "file:///cephfs/pool/rubin/{operator}/panda_cache_box/{payloadFolder}/{uniqProcName}/" 
+fileDistributionEndPoint: "file://${LSST_RUN_TEMP_SPACE}/panda_cache_box/{payloadFolder}/{uniqProcName}/" +fileDistributionEndPointDefault: "file:///cephfs/pool/rubin/panda_cache_box/{payloadFolder}/{uniqProcName}/" # location of main butler repo at USDF payload: diff --git a/config/bps_usdf.yaml b/config/bps_usdf.yaml index 846afb5..a1ad118 100644 --- a/config/bps_usdf.yaml +++ b/config/bps_usdf.yaml @@ -9,8 +9,8 @@ computeCloud: US computeSite: SLAC s3EndpointUrl: "https://storage.googleapis.com" payloadFolder: payload -fileDistributionEndPoint: "${LSST_RUN_TEMP_SPACE}/{operator}/panda_cache_box/{payloadFolder}/{uniqProcName}/" -fileDistributionEndPointDefault: "file:///sdf/data/rubin/panda_jobs/panda_cache/{operator}/panda_cache_box/{payloadFolder}/{uniqProcName}/" +fileDistributionEndPoint: "file://${LSST_RUN_TEMP_SPACE}/panda_cache_box/{payloadFolder}/{uniqProcName}/" +fileDistributionEndPointDefault: "file:///sdf/data/rubin/panda_jobs/panda_cache/panda_cache_box/{payloadFolder}/{uniqProcName}/" # location of main butler repo at USDF payload: