From a69360241f184b5efba079c4ac2a91a1623c8c41 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 13:19:09 +0200 Subject: [PATCH 01/26] add nhr hosts and storages --- src/utils/operandi_utils/hpc/constants.py | 41 +++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/src/utils/operandi_utils/hpc/constants.py b/src/utils/operandi_utils/hpc/constants.py index ae69abf1..94dfab6c 100644 --- a/src/utils/operandi_utils/hpc/constants.py +++ b/src/utils/operandi_utils/hpc/constants.py @@ -19,6 +19,47 @@ "HPC_TRANSFER_PROXY_HOSTS" ] +HPC_NHR_PROJECT: str = "project_pwieder_ocr_nhr" + +# https://docs.hpc.gwdg.de/start_here/connecting/login_nodes_and_example_commands/index.html#the-login-nodes +# https://docs.hpc.gwdg.de/how_to_use/the_storage_systems/data_stores/scratch_work/index.html#nhr +HPC_NHR_SCRATCH_EMMY_HDD: str = f"/mnt/lustre-emmy-hdd/projects/{HPC_NHR_PROJECT}" # Capacity - 110 TiB +HPC_NHR_SCRATCH_EMMY_SSD: str = f"/mnt/lustre-emmy-ssd/projects/{HPC_NHR_PROJECT}" # Capacity - 8.4 PiB +HPC_NHR_SCRATCH_GRETE_SSD: str = f"/mnt/lustre-grete/projects/{HPC_NHR_PROJECT}" # Capacity - 110 TiB + +HPC_NHR_CLUSTER = { + "EmmyPhase1": { + "host": "glogin-p1.hpc.gwdg.de", + "scratch-emmy-hdd": HPC_NHR_SCRATCH_EMMY_HDD, + "scratch-emmy-ssd": HPC_NHR_SCRATCH_EMMY_SSD, + "scratch-grete-ssd": "" # No Access + }, + "EmmyPhase2": { + "host": "glogin-p2.hpc.gwdg.de", + "scratch-emmy-hdd": HPC_NHR_SCRATCH_EMMY_HDD, + "scratch-emmy-ssd": HPC_NHR_SCRATCH_EMMY_SSD, + "scratch-grete-ssd": "" # No Access + }, + "EmmyPhase3": { + "host": "glogin-p3.hpc.gwdg.de", + "scratch-emmy-hdd": HPC_NHR_SCRATCH_EMMY_HDD, + "scratch-emmy-ssd": HPC_NHR_SCRATCH_EMMY_SSD, + "scratch-grete-ssd": HPC_NHR_SCRATCH_GRETE_SSD + }, + "Grete": { + "host": "glogin-gpu.hpc.gwdg.de", + "scratch-emmy-hdd": HPC_NHR_SCRATCH_EMMY_HDD, + "scratch-emmy-ssd": HPC_NHR_SCRATCH_EMMY_SSD, + "scratch-grete-ssd": HPC_NHR_SCRATCH_GRETE_SSD + }, + "KISSKI": { + "host": "glogin9.hpc.gwdg.de", + "scratch-emmy-hdd": HPC_NHR_SCRATCH_EMMY_HDD, + "scratch-emmy-ssd": HPC_NHR_SCRATCH_EMMY_SSD, + "scratch-grete-ssd": HPC_NHR_SCRATCH_GRETE_SSD + } +} + # "gwdu103.hpc.gwdg.de" - bad host entry, has no access to /scratch1, but to /scratch2 HPC_EXECUTOR_HOSTS = ["login-mdc.hpc.gwdg.de", "gwdu101.hpc.gwdg.de", "gwdu102.hpc.gwdg.de"] HPC_EXECUTOR_PROXY_HOSTS = ["login.gwdg.de"] From 0656a95fa1a5dbc263ca51a081ceacbe3677af38 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 16:39:20 +0200 Subject: [PATCH 02/26] adapt: .env variables --- .env | 3 +-- .github/workflows/tests/.env | 5 ++--- docker-compose.yml | 3 +-- docker-compose_image_based.yml | 1 - docker.env | 3 +-- tests/.env | 5 ++--- 6 files changed, 7 insertions(+), 13 deletions(-) diff --git a/.env b/.env index 65946ba0..c7f9abaa 100644 --- a/.env +++ b/.env @@ -4,8 +4,7 @@ OPERANDI_DB_ROOT_PASS=db_operandi OPERANDI_DB_URL=mongodb://db_operandi:db_operandi@localhost:27017 OPERANDI_HARVESTER_DEFAULT_USERNAME=harvester_operandi OPERANDI_HARVESTER_DEFAULT_PASSWORD=harvester_operandi -OPERANDI_HPC_USERNAME=mmustaf -OPERANDI_HPC_PROJECT_USERNAME=u11874 +OPERANDI_HPC_PROJECT_USERNAME=u12198 OPERANDI_HPC_SSH_KEYPATH=/home/mm/.ssh/gwdg-cluster OPERANDI_HPC_PROJECT_NAME=operandi OPERANDI_LOGS_DIR=/tmp/operandi_logs diff --git a/.github/workflows/tests/.env b/.github/workflows/tests/.env index d3142907..79b9eb6f 100644 --- a/.github/workflows/tests/.env +++ b/.github/workflows/tests/.env @@ -4,10 +4,9 @@ OPERANDI_DB_ROOT_PASS=db_operandi 
OPERANDI_DB_URL=mongodb://db_operandi:db_operandi@localhost:27017 OPERANDI_HARVESTER_DEFAULT_USERNAME=harvester_operandi OPERANDI_HARVESTER_DEFAULT_PASSWORD=harvester_operandi -OPERANDI_HPC_USERNAME=mmustaf -OPERANDI_HPC_PROJECT_USERNAME=u11874 +OPERANDI_HPC_PROJECT_USERNAME=u12198 OPERANDI_HPC_SSH_KEYPATH=/home/runner/.ssh/key_hpc -OPERANDI_HPC_PROJECT_NAME=operandi +OPERANDI_HPC_PROJECT_NAME=operandi_test_cicd OPERANDI_LOGS_DIR=/tmp/operandi_logs OPERANDI_RABBITMQ_CONFIG_JSON=./src/rabbitmq_definitions.json OPERANDI_RABBITMQ_URL=amqp://operandi_user:operandi_password@localhost:5672/ diff --git a/docker-compose.yml b/docker-compose.yml index e01fb8d1..baea255e 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -58,7 +58,7 @@ services: container_name: operandi-server build: context: ./src - dockerfile: ./Dockerfile_server + dockerfile: ./Dockerfile_server depends_on: operandi-rabbitmq: condition: service_healthy @@ -102,7 +102,6 @@ services: - OPERANDI_DB_NAME=${OPERANDI_DB_NAME} - OPERANDI_DB_URL=${OPERANDI_DB_URL} - OPERANDI_HPC_PROJECT_NAME=${OPERANDI_HPC_PROJECT_NAME} - - OPERANDI_HPC_USERNAME=${OPERANDI_HPC_USERNAME} - OPERANDI_HPC_PROJECT_USERNAME=${OPERANDI_HPC_PROJECT_USERNAME} - OPERANDI_HPC_SSH_KEYPATH=/home/root/.ssh/gwdg_hpc_key - OPERANDI_LOGS_DIR=${OPERANDI_LOGS_DIR} diff --git a/docker-compose_image_based.yml b/docker-compose_image_based.yml index 454c9e9c..20fa44cb 100644 --- a/docker-compose_image_based.yml +++ b/docker-compose_image_based.yml @@ -96,7 +96,6 @@ services: - OPERANDI_DB_NAME=${OPERANDI_DB_NAME} - OPERANDI_DB_URL=${OPERANDI_DB_URL} - OPERANDI_HPC_PROJECT_NAME=${OPERANDI_HPC_PROJECT_NAME} - - OPERANDI_HPC_USERNAME=${OPERANDI_HPC_USERNAME} - OPERANDI_HPC_PROJECT_USERNAME=${OPERANDI_HPC_PROJECT_USERNAME} - OPERANDI_HPC_SSH_KEYPATH=/home/root/.ssh/gwdg_hpc_key - OPERANDI_LOGS_DIR=${OPERANDI_LOGS_DIR} diff --git a/docker.env b/docker.env index 41a17abb..fc231a96 100644 --- a/docker.env +++ b/docker.env @@ -4,8 +4,7 @@ OPERANDI_DB_ROOT_PASS=db_operandi OPERANDI_DB_URL=mongodb://db_operandi:db_operandi@mongo-db-host:27017 OPERANDI_HARVESTER_DEFAULT_USERNAME=harvester_operandi OPERANDI_HARVESTER_DEFAULT_PASSWORD=harvester_operandi -OPERANDI_HPC_USERNAME=mmustaf -OPERANDI_HPC_PROJECT_USERNAME=u11874 +OPERANDI_HPC_PROJECT_USERNAME=u12198 OPERANDI_HPC_SSH_KEYPATH=/home/mm/.ssh/gwdg-cluster OPERANDI_HPC_PROJECT_NAME=operandi OPERANDI_LOGS_DIR=/tmp/operandi_logs diff --git a/tests/.env b/tests/.env index 65946ba0..5ea3a794 100644 --- a/tests/.env +++ b/tests/.env @@ -4,10 +4,9 @@ OPERANDI_DB_ROOT_PASS=db_operandi OPERANDI_DB_URL=mongodb://db_operandi:db_operandi@localhost:27017 OPERANDI_HARVESTER_DEFAULT_USERNAME=harvester_operandi OPERANDI_HARVESTER_DEFAULT_PASSWORD=harvester_operandi -OPERANDI_HPC_USERNAME=mmustaf -OPERANDI_HPC_PROJECT_USERNAME=u11874 +OPERANDI_HPC_PROJECT_USERNAME=u12198 OPERANDI_HPC_SSH_KEYPATH=/home/mm/.ssh/gwdg-cluster -OPERANDI_HPC_PROJECT_NAME=operandi +OPERANDI_HPC_PROJECT_NAME=operandi_test_local OPERANDI_LOGS_DIR=/tmp/operandi_logs OPERANDI_RABBITMQ_CONFIG_JSON=./src/rabbitmq_definitions.json OPERANDI_RABBITMQ_URL=amqp://operandi_user:operandi_password@localhost:5672/ From 5c6824b798da37b3006ffb8a718a6b85c0c88b6f Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 16:39:47 +0200 Subject: [PATCH 03/26] adapt: hpc utility tests --- tests/fixtures/hpc_nhr.py | 13 +++++ tests/tests_utils/conftest.py | 1 + tests/tests_utils/test_hpc_executor.py | 43 -------------- tests/tests_utils/test_hpc_nhr_executor.py | 45 
++++++++++++++ ...c_transfer.py => test_hpc_nhr_transfer.py} | 20 ++++--- ...xcombined.py => test_hpc_nhr_xcombined.py} | 58 ++++++++++--------- 6 files changed, 102 insertions(+), 78 deletions(-) create mode 100644 tests/fixtures/hpc_nhr.py delete mode 100644 tests/tests_utils/test_hpc_executor.py create mode 100644 tests/tests_utils/test_hpc_nhr_executor.py rename tests/tests_utils/{test_hpc_transfer.py => test_hpc_nhr_transfer.py} (54%) rename tests/tests_utils/{test_hpc_xcombined.py => test_hpc_nhr_xcombined.py} (56%) diff --git a/tests/fixtures/hpc_nhr.py b/tests/fixtures/hpc_nhr.py new file mode 100644 index 00000000..3ac676fb --- /dev/null +++ b/tests/fixtures/hpc_nhr.py @@ -0,0 +1,13 @@ +from pytest import fixture +from operandi_utils.hpc import NHRExecutor, NHRTransfer + +@fixture(scope="package", name="hpc_nhr_data_transfer") +def fixture_hpc_nhr_transfer_connector(): + hpc_transfer_connector = NHRTransfer() + yield hpc_transfer_connector + + +@fixture(scope="package", name="hpc_nhr_command_executor") +def fixture_hpc_nhr_execution_connector(): + hpc_paramiko_connector = NHRExecutor() + yield hpc_paramiko_connector diff --git a/tests/tests_utils/conftest.py b/tests/tests_utils/conftest.py index baf0d913..3bbf7e18 100644 --- a/tests/tests_utils/conftest.py +++ b/tests/tests_utils/conftest.py @@ -1,4 +1,5 @@ pytest_plugins = [ "tests.fixtures.hpc", + "tests.fixtures.hpc_nhr", "tests.fixtures.rabbitmq" ] diff --git a/tests/tests_utils/test_hpc_executor.py b/tests/tests_utils/test_hpc_executor.py deleted file mode 100644 index 57ea86b4..00000000 --- a/tests/tests_utils/test_hpc_executor.py +++ /dev/null @@ -1,43 +0,0 @@ -from datetime import datetime -from os.path import join -from time import sleep - -current_time = datetime.now().strftime("%Y%m%d_%H%M") - - -def test_hpc_connector_executor_mk_dir(hpc_command_executor): - test_dir_name = join(hpc_command_executor.project_root_dir, f"test_dir_{current_time}") - sleep(0.5) - output, err, return_code = hpc_command_executor.execute_blocking(command=f"bash -lc 'mkdir -p {test_dir_name}'") - assert return_code == 0, err - assert err == [] - assert output == [] - - -def test_hpc_connector_executor_rm_dir_negative(hpc_command_executor): - test_dir_name = join(hpc_command_executor.project_root_dir, f"test_dir_{current_time}") - sleep(0.5) - output, err, return_code = hpc_command_executor.execute_blocking(command=f"bash -lc 'rm {test_dir_name}'") - assert return_code == 1 - # The test dir name will be part of the returned error message - assert f'{test_dir_name}' in err[0] - assert output == [] - - -def test_hpc_connector_executor_rm_dir_positive(hpc_command_executor): - test_dir_name = join(hpc_command_executor.project_root_dir, f"test_dir_{current_time}") - sleep(0.5) - output, err, return_code = hpc_command_executor.execute_blocking(command=f"bash -lc 'rm -rf {test_dir_name}'") - assert return_code == 0 - assert err == [] - assert output == [] - - -def test_hpc_connector_executor_cd_dir(hpc_command_executor): - test_dir_name = join(hpc_command_executor.project_root_dir, f"test_dir_{current_time}") - sleep(0.5) - output, err, return_code = hpc_command_executor.execute_blocking(command=f"bash -lc 'cd {test_dir_name}'") - assert return_code == 1 - # The test dir name will be part of the returned error message - assert f'{test_dir_name}' in err[0] - assert output == [] diff --git a/tests/tests_utils/test_hpc_nhr_executor.py b/tests/tests_utils/test_hpc_nhr_executor.py new file mode 100644 index 00000000..7487a235 --- /dev/null +++ 
b/tests/tests_utils/test_hpc_nhr_executor.py @@ -0,0 +1,45 @@ +from datetime import datetime +from os.path import join +from time import sleep + +from operandi_utils.hpc.constants import HPC_NHR_SCRATCH_EMMY_HDD + +current_time = datetime.now().strftime("%Y%m%d_%H%M") +project_root = join(HPC_NHR_SCRATCH_EMMY_HDD, "operandi_test") + +def test_hpc_connector_executor_mk_dir(hpc_nhr_command_executor): + test_dir_name = join(project_root, f"test_dir_{current_time}") + sleep(0.5) + output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'mkdir -p {test_dir_name}'") + assert return_code == 0, err + assert err == [] + assert output == [] + + +def test_hpc_connector_executor_rm_dir_negative(hpc_nhr_command_executor): + test_dir_name = join(project_root, f"test_dir_{current_time}") + sleep(0.5) + output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'rm {test_dir_name}'") + assert return_code == 1 + # The test dir name will be part of the returned error message + assert f'{test_dir_name}' in err[0] + assert output == [] + + +def test_hpc_connector_executor_rm_dir_positive(hpc_nhr_command_executor): + test_dir_name = join(project_root, f"test_dir_{current_time}") + sleep(0.5) + output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'rm -rf {test_dir_name}'") + assert return_code == 0 + assert err == [] + assert output == [] + + +def test_hpc_connector_executor_cd_dir(hpc_nhr_command_executor): + test_dir_name = join(project_root, f"test_dir_{current_time}") + sleep(0.5) + output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'cd {test_dir_name}'") + assert return_code == 1 + # The test dir name will be part of the returned error message + assert f'{test_dir_name}' in err[0] + assert output == [] diff --git a/tests/tests_utils/test_hpc_transfer.py b/tests/tests_utils/test_hpc_nhr_transfer.py similarity index 54% rename from tests/tests_utils/test_hpc_transfer.py rename to tests/tests_utils/test_hpc_nhr_transfer.py index 52debd4d..948b9ebd 100644 --- a/tests/tests_utils/test_hpc_transfer.py +++ b/tests/tests_utils/test_hpc_nhr_transfer.py @@ -5,35 +5,37 @@ from tests.helpers_asserts import assert_exists_dir, assert_exists_file from tests.constants import BATCH_SCRIPT_EMPTY +from operandi_utils.hpc.constants import HPC_NHR_SCRATCH_EMMY_HDD + OPERANDI_SERVER_BASE_DIR = environ.get("OPERANDI_SERVER_BASE_DIR") current_time = datetime.now().strftime("%Y%m%d_%H%M") ID_WORKSPACE = f"test_folder_{current_time}" +project_root = join(HPC_NHR_SCRATCH_EMMY_HDD, "operandi_test") - -def test_hpc_connector_transfer_file(hpc_data_transfer, path_batch_script_empty): +def test_hpc_connector_transfer_file(hpc_nhr_data_transfer, path_batch_script_empty): """ Testing the put_file and get_file functionality of the HPC transfer """ assert_exists_file(path_batch_script_empty) - test_hpc_file_path = join(hpc_data_transfer.project_root_dir, BATCH_SCRIPT_EMPTY) - hpc_data_transfer.put_file(local_src=path_batch_script_empty, remote_dst=test_hpc_file_path) + test_hpc_file_path = join(project_root, BATCH_SCRIPT_EMPTY) + hpc_nhr_data_transfer.put_file(local_src=path_batch_script_empty, remote_dst=test_hpc_file_path) sleep(2) test_local_received_file_path = join(OPERANDI_SERVER_BASE_DIR, BATCH_SCRIPT_EMPTY) - hpc_data_transfer.get_file(remote_src=test_hpc_file_path, local_dst=test_local_received_file_path) + hpc_nhr_data_transfer.get_file(remote_src=test_hpc_file_path, 
local_dst=test_local_received_file_path) sleep(2) assert_exists_file(test_local_received_file_path) -def test_hpc_connector_transfer_dir(hpc_data_transfer, path_dummy_workspace_data_dir): +def test_hpc_connector_transfer_dir(hpc_nhr_data_transfer, path_dummy_workspace_data_dir): """ Testing the put_dir and get_dir functionality of the HPC transfer """ assert_exists_dir(path_dummy_workspace_data_dir) - test_hpc_dir_path = join(hpc_data_transfer.project_root_dir, ID_WORKSPACE) - hpc_data_transfer.put_dir(local_src=path_dummy_workspace_data_dir, remote_dst=test_hpc_dir_path) + test_hpc_dir_path = join(project_root, ID_WORKSPACE) + hpc_nhr_data_transfer.put_dir(local_src=path_dummy_workspace_data_dir, remote_dst=test_hpc_dir_path) sleep(5) test_local_received_dir_path = join(OPERANDI_SERVER_BASE_DIR, ID_WORKSPACE) - hpc_data_transfer.get_dir(remote_src=test_hpc_dir_path, local_dst=test_local_received_dir_path) + hpc_nhr_data_transfer.get_dir(remote_src=test_hpc_dir_path, local_dst=test_local_received_dir_path) sleep(2) assert_exists_dir(test_local_received_dir_path) diff --git a/tests/tests_utils/test_hpc_xcombined.py b/tests/tests_utils/test_hpc_nhr_xcombined.py similarity index 56% rename from tests/tests_utils/test_hpc_xcombined.py rename to tests/tests_utils/test_hpc_nhr_xcombined.py index 791b2330..98aa9fac 100644 --- a/tests/tests_utils/test_hpc_xcombined.py +++ b/tests/tests_utils/test_hpc_nhr_xcombined.py @@ -6,7 +6,9 @@ from operandi_server.constants import ( DEFAULT_FILE_GRP, DEFAULT_METS_BASENAME, SERVER_WORKFLOW_JOBS_ROUTER, SERVER_WORKSPACES_ROUTER ) -from operandi_utils.hpc.constants import HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_TEST_PARTITION, HPC_JOB_QOS_SHORT +from operandi_utils.hpc.constants import ( + HPC_JOB_DEADLINE_TIME_TEST, HPC_NHR_JOB_TEST_PARTITION, HPC_JOB_QOS_SHORT, HPC_NHR_CLUSTERS +) from tests.constants import BATCH_SUBMIT_WORKFLOW_JOB from tests.helpers_asserts import assert_exists_dir, assert_exists_file @@ -18,21 +20,24 @@ ID_WORKFLOW_JOB = f"test_wf_job_{current_time}" ID_WORKSPACE = f"test_ws_{current_time}" +project_root = join(HPC_NHR_CLUSTERS[""], "operandi_test") +batch_scripts_dir = join(project_root, "batch_scripts") + def helper_pack_and_put_slurm_workspace( - hpc_data_transfer, workflow_job_id: str, workspace_id: str, path_workflow: str, path_workspace_dir: str + hpc_nhr_data_transfer, workflow_job_id: str, workspace_id: str, path_workflow: str, path_workspace_dir: str ): # Move the test asset to actual workspaces location dst_path = join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKSPACES_ROUTER, workspace_id) local_workspace_dir = copytree(src=path_workspace_dir, dst=dst_path) assert_exists_dir(local_workspace_dir) - local_slurm_workspace_zip_path = hpc_data_transfer.create_slurm_workspace_zip( + local_slurm_workspace_zip_path = hpc_nhr_data_transfer.create_slurm_workspace_zip( ocrd_workspace_dir=local_workspace_dir, workflow_job_id=workflow_job_id, nextflow_script_path=path_workflow, tempdir_prefix="test_slurm_workspace-") assert_exists_file(local_slurm_workspace_zip_path) - hpc_dst_slurm_zip = hpc_data_transfer.put_slurm_workspace( + hpc_dst_slurm_zip = hpc_nhr_data_transfer.put_slurm_workspace( local_src_slurm_zip=local_slurm_workspace_zip_path, workflow_job_id=workflow_job_id) # TODO: implement this @@ -41,60 +46,61 @@ def helper_pack_and_put_slurm_workspace( Path(local_slurm_workspace_zip_path).unlink(missing_ok=True) -def test_hpc_connector_put_batch_script(hpc_data_transfer, path_batch_script_submit_workflow_job): - hpc_batch_script_path = 
join(hpc_data_transfer.batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB) - hpc_data_transfer.put_file(local_src=path_batch_script_submit_workflow_job, remote_dst=hpc_batch_script_path) +def test_hpc_connector_put_batch_script(hpc_nhr_data_transfer, path_batch_script_submit_workflow_job): + hpc_batch_script_path = join(batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB) + hpc_nhr_data_transfer.put_file(local_src=path_batch_script_submit_workflow_job, remote_dst=hpc_batch_script_path) -def test_pack_and_put_slurm_workspace(hpc_data_transfer, path_small_workspace_data_dir, template_workflow): +def test_pack_and_put_slurm_workspace(hpc_nhr_data_transfer, path_small_workspace_data_dir, template_workflow): helper_pack_and_put_slurm_workspace( - hpc_data_transfer=hpc_data_transfer, workflow_job_id=ID_WORKFLOW_JOB, workspace_id=ID_WORKSPACE, + hpc_nhr_data_transfer=hpc_nhr_data_transfer, workflow_job_id=ID_WORKFLOW_JOB, workspace_id=ID_WORKSPACE, path_workflow=template_workflow, path_workspace_dir=path_small_workspace_data_dir) def test_pack_and_put_slurm_workspace_with_ms( - hpc_data_transfer, path_small_workspace_data_dir, template_workflow_with_ms + hpc_nhr_data_transfer, path_small_workspace_data_dir, template_workflow_with_ms ): helper_pack_and_put_slurm_workspace( - hpc_data_transfer=hpc_data_transfer, workflow_job_id=ID_WORKFLOW_JOB_WITH_MS, workspace_id=ID_WORKSPACE_WITH_MS, - path_workflow=template_workflow_with_ms, path_workspace_dir=path_small_workspace_data_dir) + hpc_nhr_data_transfer=hpc_nhr_data_transfer, workflow_job_id=ID_WORKFLOW_JOB_WITH_MS, + workspace_id=ID_WORKSPACE_WITH_MS, path_workflow=template_workflow_with_ms, + path_workspace_dir=path_small_workspace_data_dir) -def _test_hpc_connector_run_batch_script(hpc_command_executor, template_workflow): - hpc_batch_script_path = join(hpc_command_executor.batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB) - slurm_job_id = hpc_command_executor.trigger_slurm_job( +def test_hpc_connector_run_batch_script(hpc_nhr_command_executor, template_workflow): + hpc_batch_script_path = join(batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB) + slurm_job_id = hpc_nhr_command_executor.trigger_slurm_job( batch_script_path=hpc_batch_script_path, workflow_job_id=ID_WORKFLOW_JOB, nextflow_script_path=template_workflow, input_file_grp=DEFAULT_FILE_GRP, workspace_id=ID_WORKSPACE, mets_basename=DEFAULT_METS_BASENAME, nf_process_forks=2, ws_pages_amount=8, use_mets_server=False, file_groups_to_remove="", cpus=2, ram=16, job_deadline_time=HPC_JOB_DEADLINE_TIME_TEST, - partition=HPC_JOB_TEST_PARTITION, qos=HPC_JOB_QOS_SHORT) - finished_successfully = hpc_command_executor.poll_till_end_slurm_job_state( + partition=HPC_NHR_JOB_TEST_PARTITION, qos=HPC_JOB_QOS_SHORT) + finished_successfully = hpc_nhr_command_executor.poll_till_end_slurm_job_state( slurm_job_id=slurm_job_id, interval=5, timeout=300) assert finished_successfully -def _test_hpc_connector_run_batch_script_with_ms(hpc_command_executor, template_workflow_with_ms): - hpc_batch_script_path = join(hpc_command_executor.batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB) - slurm_job_id = hpc_command_executor.trigger_slurm_job( +def test_hpc_connector_run_batch_script_with_ms(hpc_nhr_command_executor, template_workflow_with_ms): + hpc_batch_script_path = join(batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB) + slurm_job_id = hpc_nhr_command_executor.trigger_slurm_job( batch_script_path=hpc_batch_script_path, workflow_job_id=ID_WORKFLOW_JOB_WITH_MS, nextflow_script_path=template_workflow_with_ms, input_file_grp=DEFAULT_FILE_GRP, 
workspace_id=ID_WORKSPACE_WITH_MS, mets_basename=DEFAULT_METS_BASENAME, nf_process_forks=2, ws_pages_amount=8, use_mets_server=True, file_groups_to_remove="", cpus=3, ram=16, job_deadline_time=HPC_JOB_DEADLINE_TIME_TEST, - partition=HPC_JOB_TEST_PARTITION, qos=HPC_JOB_QOS_SHORT) - finished_successfully = hpc_command_executor.poll_till_end_slurm_job_state( + partition=HPC_NHR_JOB_TEST_PARTITION, qos=HPC_JOB_QOS_SHORT) + finished_successfully = hpc_nhr_command_executor.poll_till_end_slurm_job_state( slurm_job_id=slurm_job_id, interval=5, timeout=300) assert finished_successfully -def _test_get_and_unpack_slurm_workspace(hpc_data_transfer): - hpc_data_transfer.get_and_unpack_slurm_workspace( +def _test_get_and_unpack_slurm_workspace(hpc_nhr_data_transfer): + hpc_nhr_data_transfer.get_and_unpack_slurm_workspace( ocrd_workspace_dir=join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKSPACES_ROUTER, ID_WORKSPACE), workflow_job_dir=join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKFLOW_JOBS_ROUTER, ID_WORKFLOW_JOB) ) -def _test_get_and_unpack_slurm_workspace_with_ms(hpc_data_transfer): - hpc_data_transfer.get_and_unpack_slurm_workspace( +def _test_get_and_unpack_slurm_workspace_with_ms(hpc_nhr_data_transfer): + hpc_nhr_data_transfer.get_and_unpack_slurm_workspace( ocrd_workspace_dir=join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKSPACES_ROUTER, ID_WORKSPACE_WITH_MS), workflow_job_dir=join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKFLOW_JOBS_ROUTER, ID_WORKFLOW_JOB_WITH_MS) ) From 0c1e54029f3ff914952b0b991742d5056c8a167e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 16:41:14 +0200 Subject: [PATCH 04/26] adapt: hpc scc to nhr --- src/utils/operandi_utils/hpc/connector.py | 206 ------------------ src/utils/operandi_utils/hpc/nhr_connector.py | 62 ++++++ .../hpc/{executor.py => nhr_executor.py} | 93 ++++---- .../hpc/{transfer.py => nhr_transfer.py} | 137 +++++------- 4 files changed, 161 insertions(+), 337 deletions(-) delete mode 100644 src/utils/operandi_utils/hpc/connector.py create mode 100644 src/utils/operandi_utils/hpc/nhr_connector.py rename src/utils/operandi_utils/hpc/{executor.py => nhr_executor.py} (56%) rename src/utils/operandi_utils/hpc/{transfer.py => nhr_transfer.py} (60%) diff --git a/src/utils/operandi_utils/hpc/connector.py b/src/utils/operandi_utils/hpc/connector.py deleted file mode 100644 index dda1be36..00000000 --- a/src/utils/operandi_utils/hpc/connector.py +++ /dev/null @@ -1,206 +0,0 @@ -from logging import Logger -from paramiko import AutoAddPolicy, Channel, RSAKey, SSHClient -from pathlib import Path -from typing import List, Union - -from .constants import HPC_SSH_CONNECTION_TRY_TIMES -from .connection_utils import is_ssh_conn_responsive, is_sftp_conn_responsive -from .utils import ( - check_keyfile_existence, resolve_hpc_user_home_dir, resolve_hpc_project_root_dir, resolve_hpc_batch_scripts_dir, - resolve_hpc_slurm_workspaces_dir) - - -class HPCConnector: - def __init__( - self, hpc_hosts: List[str], proxy_hosts: List[str], username: str, project_username: str, key_path: Path, - key_pass: Union[str, None], project_name: str, log: Logger, - channel_keep_alive_interval: int = 30, connection_keep_alive_interval: int = 30, tunnel_host: str = 'localhost', - tunnel_port: int = 0 - ) -> None: - if not username: - raise ValueError("Environment variable is probably not set: OPERANDI_HPC_USERNAME") - if not project_username: - raise ValueError("Environment variable is probably not set: OPERANDI_HPC_PROJECT_USERNAME") - if not key_path: - raise ValueError("Environment variable is 
probably not set: OPERANDI_HPC_SSH_KEYPATH") - if not project_name: - raise ValueError("Environment variable is probably not set: OPERANDI_HPC_PROJECT_NAME") - self.log = log - - # The username is used to connect to the proxy server - self.username = username - # The project username is used to connect to the HPC front end to access the shared project folder - self.project_username = project_username - - check_keyfile_existence(hpc_key_path=key_path) - - # Use the same private key for both the proxy and hpc connections - self.proxy_key_path = key_path - self.proxy_key_pass = key_pass - self.hpc_key_path = key_path - self.hpc_key_pass = key_pass - - self.connection_keep_alive_interval = connection_keep_alive_interval - self.channel_keep_alive_interval = channel_keep_alive_interval - - # A list of hpc hosts - tries to connect to all until one is successful - self.hpc_hosts = hpc_hosts - self.last_used_hpc_host = None - - # A list of proxy hosts - tries to connect to all until one is successful - self.proxy_hosts = proxy_hosts - self.last_used_proxy_host = None - - self.ssh_proxy_client = None - self.proxy_tunnel = None - self.ssh_hpc_client = None - self.sftp_client = None - - self.log.info(f""" - HPCConnector initialized with: - Username: {self.username} - HPC hosts: {self.hpc_hosts} - Private key for hpc hosts: {self.hpc_key_path} - Proxy hosts: {self.proxy_hosts} - Private key for proxy hosts: {self.proxy_key_path} - """) - - self.project_name = project_name - self.user_home_dir = resolve_hpc_user_home_dir(project_username) - self.project_root_dir = resolve_hpc_project_root_dir(project_name) - self.batch_scripts_dir = resolve_hpc_batch_scripts_dir(project_name) - self.slurm_workspaces_dir = resolve_hpc_slurm_workspaces_dir(project_name) - - self.log.info(f""" - Project name: {self.project_name} - User home dir: {self.user_home_dir} - Project root dir: {self.project_root_dir} - Batch scripts root dir: {self.batch_scripts_dir} - Slurm workspaces root dir: {self.slurm_workspaces_dir} - """) - - self.tunnel_host = tunnel_host - self.tunnel_port = tunnel_port - self.create_ssh_connection_to_hpc_by_iteration(tunnel_host=tunnel_host, tunnel_port=tunnel_port) - - def connect_to_proxy_server(self, host: str, port: int = 22) -> SSHClient: - if self.ssh_proxy_client: - self.log.warning(f"Closing the previously existing ssh proxy client") - self.ssh_proxy_client.close() - self.ssh_proxy_client = None - self.ssh_proxy_client = SSHClient() - self.log.debug(f"Setting missing host key policy for the proxy client") - self.ssh_proxy_client.set_missing_host_key_policy(AutoAddPolicy()) - self.log.debug(f"Retrieving proxy server private key file from path: {self.proxy_key_path}") - proxy_pkey = RSAKey.from_private_key_file(str(self.proxy_key_path), self.proxy_key_pass) - self.log.info(f"Connecting to proxy server {host}:{port} with username: {self.username}") - self.ssh_proxy_client.connect( - hostname=host, port=port, username=self.username, pkey=proxy_pkey, passphrase=self.proxy_key_pass) - # self.ssh_proxy_client.get_transport().set_keepalive(self.connection_keep_alive_interval) - self.last_used_proxy_host = host - self.log.debug(f"Successfully connected to the proxy server") - return self.ssh_proxy_client - - def establish_proxy_tunnel( - self, dst_host: str, dst_port: int = 22, src_host: str = 'localhost', src_port: int = 0, - channel_kind: str = 'direct-tcpip', - ) -> Channel: - proxy_transport = self.ssh_proxy_client.get_transport() - if self.proxy_tunnel: - self.log.warning(f"Closing the previously 
existing ssh proxy tunel") - self.proxy_tunnel.close() - self.proxy_tunnel = None - self.log.info(f"Configuring a tunnel to destination {dst_host}:{dst_port} from {src_host}:{src_port}") - self.proxy_tunnel = proxy_transport.open_channel( - kind=channel_kind, src_addr=(src_host, src_port), dest_addr=(dst_host, dst_port)) - # self.proxy_tunnel.get_transport().set_keepalive(self.channel_keep_alive_interval) - self.last_used_hpc_host = dst_host - self.log.debug(f"Successfully configured a proxy tunnel") - return self.proxy_tunnel - - def connect_to_hpc_frontend_server(self, host: str, port: int = 22, proxy_tunnel: Channel = None) -> SSHClient: - if self.ssh_hpc_client: - self.log.warning(f"Closing the previously existing ssh hpc client") - self.ssh_hpc_client.close() - self.ssh_hpc_client = None - self.ssh_hpc_client = SSHClient() - self.log.debug(f"Setting missing host key policy for the hpc frontend client") - self.ssh_hpc_client.set_missing_host_key_policy(AutoAddPolicy()) - self.log.debug(f"Retrieving hpc frontend server private key file from path: {self.hpc_key_path}") - hpc_pkey = RSAKey.from_private_key_file(str(self.hpc_key_path), self.hpc_key_pass) - self.log.info(f"Connecting to hpc frontend server {host}:{port} with project username: {self.project_username}") - self.ssh_hpc_client.connect(hostname=host, port=port, username=self.project_username, pkey=hpc_pkey, - passphrase=self.hpc_key_pass, sock=proxy_tunnel) - # self.ssh_hpc_client.get_transport().set_keepalive(self.connection_keep_alive_interval) - self.last_used_hpc_host = host - self.log.debug(f"Successfully connected to the hpc frontend server") - return self.ssh_hpc_client - - def reconnect_if_required( - self, hpc_host: str = None, hpc_port: int = 22, proxy_host: str = None, proxy_port: int = 22, - tunnel_host: str = 'localhost', tunnel_port: int = 0 - ) -> None: - if not hpc_host: - hpc_host = self.last_used_hpc_host - if not proxy_host: - proxy_host = self.last_used_proxy_host - if not is_ssh_conn_responsive(self.log, self.ssh_proxy_client): - self.log.warning("The connection to proxy server is not responsive, trying to open a new connection") - self.ssh_proxy_client = self.connect_to_proxy_server(host=proxy_host, port=proxy_port) - if not is_ssh_conn_responsive(self.log, self.proxy_tunnel): - self.log.warning("The proxy tunnel is not responsive, trying to establish a new proxy tunnel") - self.proxy_tunnel = self.establish_proxy_tunnel(hpc_host, hpc_port, tunnel_host, tunnel_port) - if not is_ssh_conn_responsive(self.log, self.ssh_hpc_client): - self.log.warning("The connection to hpc frontend server is not responsive, trying to open a new connection") - self.ssh_hpc_client = self.connect_to_hpc_frontend_server(proxy_host, proxy_port, self.proxy_tunnel) - - def recreate_sftp_if_required( - self, hpc_host: str = None, hpc_port: int = 22, proxy_host: str = None, proxy_port: int = 22, - tunnel_host: str = 'localhost', tunnel_port: int = 0 - ) -> None: - if not hpc_host: - hpc_host = self.last_used_hpc_host - if not proxy_host: - proxy_host = self.last_used_proxy_host - self.reconnect_if_required(hpc_host=hpc_host, hpc_port=hpc_port, proxy_host=proxy_host, proxy_port=proxy_port, - tunnel_host=tunnel_host, tunnel_port=tunnel_port) - if not is_sftp_conn_responsive(self.log, self.sftp_client): - self.log.warning("The SFTP client is not responsive, trying to create a new SFTP client") - if self.sftp_client: - self.sftp_client.close() - self.sftp_client = None - self.sftp_client = self.ssh_hpc_client.open_sftp() - # 
self.sftp_client.get_channel().get_transport().set_keepalive(self.channel_keep_alive_interval) - - def create_ssh_connection_to_hpc_by_iteration( - self, try_times: int = HPC_SSH_CONNECTION_TRY_TIMES, tunnel_host: str = 'localhost', tunnel_port: int = 0 - ) -> None: - while try_times > 0: - for proxy_host in self.proxy_hosts: - self.ssh_proxy_client = None - self.last_used_proxy_host = None - for hpc_host in self.hpc_hosts: - self.ssh_hpc_client = None - self.last_used_hpc_host = None - try: - self.reconnect_if_required( - hpc_host=hpc_host, hpc_port=22, - proxy_host=proxy_host, proxy_port=22, - tunnel_host=tunnel_host, tunnel_port=tunnel_port) - return # all connections were successful - except Exception as error: - self.log.error(f""" - Failed to connect to hpc host: {hpc_host} - Over proxy host: {proxy_host} - Exception Error: {error} - """) - continue - try_times -= 1 - - raise Exception(f""" - Failed to establish connection to any of the HPC hosts: {self.hpc_hosts} - Using the hpc private key: {self.hpc_key_path} - Over any of the proxy hosts: {self.proxy_hosts} - Using the proxy private key: {self.proxy_key_path} - Performed connection iterations: {try_times} - """) diff --git a/src/utils/operandi_utils/hpc/nhr_connector.py b/src/utils/operandi_utils/hpc/nhr_connector.py new file mode 100644 index 00000000..35c42fd8 --- /dev/null +++ b/src/utils/operandi_utils/hpc/nhr_connector.py @@ -0,0 +1,62 @@ +from logging import Logger +from os import environ +from os.path import join +from pathlib import Path +from typing import Optional + +from paramiko import AutoAddPolicy, RSAKey, SSHClient + +from .utils import check_keyfile_existence +from .constants import HPC_NHR_CLUSTERS + +class NHRConnector: + def __init__( + self, + logger: Logger, + project_username: Optional[str] = environ.get("OPERANDI_HPC_PROJECT_USERNAME", None), + project_env: Optional[str] = environ.get("OPERANDI_HPC_PROJECT_NAME", None), + key_path: Optional[str] = environ.get("OPERANDI_HPC_SSH_KEYPATH", None), + key_pass: Optional[str] = environ.get("OPERANDI_HPC_SSH_KEYPASS", None), + ) -> None: + if not project_username: + raise ValueError("Environment variable is probably not set: OPERANDI_HPC_PROJECT_USERNAME") + if not project_env: + raise ValueError("Environment variable is probably not set: OPERANDI_HPC_PROJECT_NAME") + if not key_path: + raise ValueError("Environment variable is probably not set: OPERANDI_HPC_SSH_KEYPATH") + self.logger = logger + self.project_username = project_username + self.key_path = Path(key_path) + self.key_pass = key_pass + check_keyfile_existence(hpc_key_path=self.key_path) + self.logger.debug(f"Retrieving hpc frontend server private key file from path: {self.key_path}") + self._ssh_client = None + self._sftp_client = None + # TODO: Make the sub cluster options selectable + self.project_root_dir = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], project_env) + self.slurm_workspaces_dir = join(self.project_root_dir, "slurm_workspaces") + + @property + def ssh_client(self): + if not self._ssh_client: + # TODO: Make the sub cluster option selectable + self._ssh_client = self.connect_to_hpc_nhr_frontend_server(host=HPC_NHR_CLUSTERS["EmmyPhase3"]["host"]) + # self._ssh_client.get_transport().set_keepalive(30) + return self._ssh_client + + @staticmethod + def check_keyfile_existence(key_path: Path): + if not key_path.exists(): + raise FileNotFoundError(f"HPC private key path does not exists: {key_path}") + if not key_path.is_file(): + raise FileNotFoundError(f"HPC private key path is not 
a file: {key_path}") + + def connect_to_hpc_nhr_frontend_server(self, host: str, port: int = 22) -> SSHClient: + self._ssh_client = SSHClient() + self.ssh_client.set_missing_host_key_policy(AutoAddPolicy()) + hpc_pkey = RSAKey.from_private_key_file(str(self.key_path), self.key_pass) + self.logger.info(f"Connecting to hpc frontend server {host}:{port} with username: {self.project_username}") + self.ssh_client.connect( + hostname=host, port=port, username=self.project_username, pkey=hpc_pkey, passphrase=self.key_pass) + self.logger.debug(f"Successfully connected to the hpc frontend server") + return self.ssh_client diff --git a/src/utils/operandi_utils/hpc/executor.py b/src/utils/operandi_utils/hpc/nhr_executor.py similarity index 56% rename from src/utils/operandi_utils/hpc/executor.py rename to src/utils/operandi_utils/hpc/nhr_executor.py index 321c2562..e9c70c51 100644 --- a/src/utils/operandi_utils/hpc/executor.py +++ b/src/utils/operandi_utils/hpc/nhr_executor.py @@ -1,35 +1,20 @@ from logging import getLogger -from os import environ -from pathlib import Path from time import sleep -from typing import List + from operandi_utils.constants import StateJobSlurm -from .connector import HPCConnector from .constants import ( - HPC_EXECUTOR_HOSTS, HPC_EXECUTOR_PROXY_HOSTS, HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_DEFAULT, - HPC_JOB_DEFAULT_PARTITION, HPC_ROOT_BASH_SCRIPT + HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_DEFAULT, HPC_NHR_JOB_DEFAULT_PARTITION, HPC_ROOT_BASH_SCRIPT ) +from .nhr_connector import NHRConnector - -class HPCExecutor(HPCConnector): - def __init__( - self, - executor_hosts: List[str] = HPC_EXECUTOR_HOSTS, proxy_hosts: List[str] = HPC_EXECUTOR_PROXY_HOSTS, - username: str = environ.get("OPERANDI_HPC_USERNAME", None), - project_username: str = environ.get("OPERANDI_HPC_PROJECT_USERNAME", None), - key_path: str = environ.get("OPERANDI_HPC_SSH_KEYPATH", None), - project_name: str = environ.get("OPERANDI_HPC_PROJECT_NAME", None), - tunnel_host: str = 'localhost', tunnel_port: int = 0 - ) -> None: - super().__init__( - hpc_hosts=executor_hosts, proxy_hosts=proxy_hosts, project_name=project_name, - log=getLogger("operandi_utils.hpc.executor"), username=username, project_username=project_username, - key_path=Path(key_path), key_pass=None, tunnel_host=tunnel_host, tunnel_port=tunnel_port) +class NHRExecutor(NHRConnector): + def __init__(self) -> None: + logger = getLogger(name=self.__class__.__name__) + super().__init__(logger) # Execute blocking commands and wait for an output and return code def execute_blocking(self, command, timeout=None, environment=None): - self.reconnect_if_required() - stdin, stdout, stderr = self.ssh_hpc_client.exec_command( + stdin, stdout, stderr = self.ssh_client.exec_command( command=command, timeout=timeout, environment=environment) # TODO: Not satisfied with this but fast conversion from @@ -47,18 +32,18 @@ def trigger_slurm_job( self, batch_script_path: str, workflow_job_id: str, nextflow_script_path: str, input_file_grp: str, workspace_id: str, mets_basename: str, nf_process_forks: int, ws_pages_amount: int, use_mets_server: bool, file_groups_to_remove: str, cpus: int = 2, ram: int = 8, job_deadline_time: str = HPC_JOB_DEADLINE_TIME_TEST, - partition: str = HPC_JOB_DEFAULT_PARTITION, qos: str = HPC_JOB_QOS_DEFAULT + partition: str = HPC_NHR_JOB_DEFAULT_PARTITION, qos: str = HPC_JOB_QOS_DEFAULT ) -> str: if ws_pages_amount < nf_process_forks: - self.log.warning( - "The amount of workspace pages is less than the amount of requested Nextflow process forks. 
" - f"The pages amount: {ws_pages_amount}, forks requested: {nf_process_forks}. " - f"Setting the forks value to the value of amount of pages.") + self.logger.warning( + "The amount of workspace pages is less than the amount of requested Nextflow process forks. " + f"The pages amount: {ws_pages_amount}, forks requested: {nf_process_forks}. " + f"Setting the forks value to the value of amount of pages.") nf_process_forks = ws_pages_amount nextflow_script_id = nextflow_script_path.split('/')[-1] use_mets_server_bash_flag = "true" if use_mets_server else "false" - + command = f"{HPC_ROOT_BASH_SCRIPT}" # SBATCH arguments passed to the batch script @@ -84,36 +69,38 @@ def trigger_slurm_job( command += f" {use_mets_server_bash_flag}" command += f" {file_groups_to_remove}" - self.log.info(f"About to execute a blocking command: {command}") + self.logger.info(f"About to execute a blocking command: {command}") output, err, return_code = self.execute_blocking(command) - self.log.info(f"Command output: {output}") - self.log.info(f"Command err: {err}") - self.log.info(f"Command return code: {return_code}") + self.logger.info(f"Command output: {output}") + self.logger.info(f"Command err: {err}") + self.logger.info(f"Command return code: {return_code}") slurm_job_id = output[0].strip('\n').split(' ')[-1] - self.log.info(f"Slurm job id: {slurm_job_id}") + self.logger.info(f"Slurm job id: {slurm_job_id}") assert int(slurm_job_id) return slurm_job_id def check_slurm_job_state(self, slurm_job_id: str, tries: int = 10, wait_time: int = 2) -> str: + # TODO: Use force command here command = f"bash -lc 'sacct -j {slurm_job_id} --format=jobid,state,exitcode'" slurm_job_state = None while not slurm_job_state and tries > 0: - self.log.info(f"About to execute a blocking command: {command}") + self.logger.info(f"About to execute a blocking command: {command}") output, err, return_code = self.execute_blocking(command) - self.log.info(f"Command output: {output}") - self.log.info(f"Command err: {err}") - self.log.info(f"Command return code: {return_code}") + self.logger.info(f"Command output: {output}") + self.logger.info(f"Command err: {err}") + self.logger.info(f"Command return code: {return_code}") if output: if len(output) < 3: - self.log.warning("The output has returned with less than 3 lines. The job has not been listed yet.") + self.logger.warning("The output has returned with less than 3 lines. " + "The job has not been listed yet.") continue # Split the last line and get the second element, # i.e., the state element in the requested output format slurm_job_state = output[-2].split()[1] # TODO: dirty fast fix, improve this if slurm_job_state.startswith('---'): - self.log.warning("The output is dashes. The job has not been listed yet.") + self.logger.warning("The output is dashes. 
The job has not been listed yet.") slurm_job_state = None continue if slurm_job_state: @@ -121,37 +108,37 @@ def check_slurm_job_state(self, slurm_job_id: str, tries: int = 10, wait_time: i tries -= 1 sleep(wait_time) if not slurm_job_state: - self.log.warning(f"Returning a None slurm job state") - self.log.info(f"Slurm job state of {slurm_job_id}: {slurm_job_state}") + self.logger.warning(f"Returning a None slurm job state") + self.logger.info(f"Slurm job state of {slurm_job_id}: {slurm_job_state}") return slurm_job_state def poll_till_end_slurm_job_state(self, slurm_job_id: str, interval: int = 5, timeout: int = 300) -> bool: - self.log.info(f"Polling slurm job status till end") - tries_left = timeout/interval - self.log.info(f"Tries to be performed: {tries_left}") + self.logger.info(f"Polling slurm job status till end") + tries_left = timeout / interval + self.logger.info(f"Tries to be performed: {tries_left}") while tries_left: - self.log.info(f"Sleeping for {interval} secs") + self.logger.info(f"Sleeping for {interval} secs") sleep(interval) tries_left -= 1 - self.log.info(f"Tries left: {tries_left}") + self.logger.info(f"Tries left: {tries_left}") slurm_job_state = self.check_slurm_job_state(slurm_job_id) if not slurm_job_state: - self.log.info(f"Slurm job state is not available yet") + self.logger.info(f"Slurm job state is not available yet") continue if StateJobSlurm.is_state_success(slurm_job_state): - self.log.info(f"Slurm job state is in: {StateJobSlurm.success_states()}") + self.logger.info(f"Slurm job state is in: {StateJobSlurm.success_states()}") return True if StateJobSlurm.is_state_waiting(slurm_job_state): - self.log.info(f"Slurm job state is in: {StateJobSlurm.waiting_states()}") + self.logger.info(f"Slurm job state is in: {StateJobSlurm.waiting_states()}") continue if StateJobSlurm.is_state_fail(slurm_job_state): - self.log.info(f"Slurm job state is in: {StateJobSlurm.failing_states()}") + self.logger.info(f"Slurm job state is in: {StateJobSlurm.failing_states()}") return False # Sometimes the slurm state is still # not initialized inside the HPC environment. 
# This is not a problem that requires a raise of Exception - self.log.warning(f"Invalid SLURM job state: {slurm_job_state}") + self.logger.warning(f"Invalid SLURM job state: {slurm_job_state}") # Timeout reached - self.log.info("Polling slurm job status timeout reached") + self.logger.info("Polling slurm job status timeout reached") return False diff --git a/src/utils/operandi_utils/hpc/transfer.py b/src/utils/operandi_utils/hpc/nhr_transfer.py similarity index 60% rename from src/utils/operandi_utils/hpc/transfer.py rename to src/utils/operandi_utils/hpc/nhr_transfer.py index 51773d42..f1df397d 100644 --- a/src/utils/operandi_utils/hpc/transfer.py +++ b/src/utils/operandi_utils/hpc/nhr_transfer.py @@ -6,78 +6,65 @@ from stat import S_ISDIR from tempfile import mkdtemp from time import sleep -from typing import List, Tuple +from typing import Tuple from operandi_utils import make_zip_archive, unpack_zip_archive -from .connector import HPCConnector -from .constants import HPC_TRANSFER_HOSTS, HPC_TRANSFER_PROXY_HOSTS - - -class HPCTransfer(HPCConnector): - def __init__( - self, - transfer_hosts: List[str] = HPC_TRANSFER_HOSTS, proxy_hosts: List[str] = HPC_TRANSFER_PROXY_HOSTS, - username: str = environ.get("OPERANDI_HPC_USERNAME", None), - project_username: str = environ.get("OPERANDI_HPC_PROJECT_USERNAME", None), - key_path: str = environ.get("OPERANDI_HPC_SSH_KEYPATH", None), - project_name: str = environ.get("OPERANDI_HPC_PROJECT_NAME", None), - tunnel_host: str = 'localhost', - tunnel_port: int = 0 - ) -> None: - super().__init__( - hpc_hosts=transfer_hosts, proxy_hosts=proxy_hosts, project_name=project_name, - log=getLogger("operandi_utils.hpc.transfer"), username=username, project_username=project_username, - key_path=Path(key_path), key_pass=None, tunnel_host=tunnel_host, tunnel_port=tunnel_port) - - def put_batch_script(self, batch_script_id: str) -> str: - local_batch_script_path = join(dirname(__file__), "batch_scripts", batch_script_id) - hpc_batch_script_path = join(self.batch_scripts_dir, batch_script_id) - self.put_file(local_src=local_batch_script_path, remote_dst=hpc_batch_script_path) - self.log.info(f"Put file from local src: {local_batch_script_path}, to dst: {hpc_batch_script_path}") - return hpc_batch_script_path +from .nhr_connector import NHRConnector + +class NHRTransfer(NHRConnector): + def __init__(self) -> None: + logger = getLogger(name=self.__class__.__name__) + super().__init__(logger) + + @property + def sftp_client(self): + if not self._sftp_client: + self._sftp_client = self.ssh_client.open_sftp() + # self._sftp_client.get_channel().get_transport().set_keepalive(30) + return self._sftp_client def create_slurm_workspace_zip( self, ocrd_workspace_dir: str, workflow_job_id: str, nextflow_script_path: str, tempdir_prefix: str = "slurm_workspace-" ) -> str: - self.log.info(f"Entering pack_slurm_workspace") - self.log.info(f"ocrd_workspace_dir: {ocrd_workspace_dir}") - self.log.info(f"workflow_job_id: {workflow_job_id}") - self.log.info(f"nextflow_script_path: {nextflow_script_path}") - self.log.info(f"tempdir_prefix: {tempdir_prefix}") + self.logger.info(f"Entering pack_slurm_workspace") + self.logger.info(f"ocrd_workspace_dir: {ocrd_workspace_dir}") + self.logger.info(f"workflow_job_id: {workflow_job_id}") + self.logger.info(f"nextflow_script_path: {nextflow_script_path}") + self.logger.info(f"tempdir_prefix: {tempdir_prefix}") # Parse the nextflow file name from the script path nextflow_filename = nextflow_script_path.split('/')[-1] - self.log.info(f"Nextflow 
file name to be used: {nextflow_filename}") + self.logger.info(f"Nextflow file name to be used: {nextflow_filename}") # Parse the ocrd workspace id from the dir path ocrd_workspace_id = ocrd_workspace_dir.split('/')[-1] - self.log.info(f"OCR-D workspace id to be used: {ocrd_workspace_id}") + self.logger.info(f"OCR-D workspace id to be used: {ocrd_workspace_id}") tempdir = mkdtemp(prefix=tempdir_prefix) - self.log.info(f"Created a temp dir name: {tempdir}") + self.logger.info(f"Created a temp dir name: {tempdir}") temp_workflow_job_dir = join(tempdir, workflow_job_id) makedirs(temp_workflow_job_dir) - self.log.info(f"Created a slurm workspace dir: {temp_workflow_job_dir}") + self.logger.info(f"Created a slurm workspace dir: {temp_workflow_job_dir}") dst_script_path = join(temp_workflow_job_dir, nextflow_filename) symlink(src=nextflow_script_path, dst=dst_script_path) - self.log.info(f"Symlink created from src: {nextflow_script_path}, to dst: {dst_script_path}") + self.logger.info(f"Symlink created from src: {nextflow_script_path}, to dst: {dst_script_path}") dst_workspace_path = join(temp_workflow_job_dir, ocrd_workspace_id) copytree(src=ocrd_workspace_dir, dst=dst_workspace_path) - self.log.info(f"Copied tree from src: {ocrd_workspace_dir}, to dst: {dst_workspace_path}") + self.logger.info(f"Copied tree from src: {ocrd_workspace_dir}, to dst: {dst_workspace_path}") dst_zip_path = f"{temp_workflow_job_dir}.zip" make_zip_archive(source=temp_workflow_job_dir, destination=dst_zip_path) - self.log.info(f"Zip archive created from src: {temp_workflow_job_dir}, to dst: {dst_zip_path}") + self.logger.info(f"Zip archive created from src: {temp_workflow_job_dir}, to dst: {dst_zip_path}") return dst_zip_path def put_slurm_workspace(self, local_src_slurm_zip: str, workflow_job_id: str) -> str: - self.log.info(f"Workflow job id to be used: {workflow_job_id}") + self.logger.info(f"Workflow job id to be used: {workflow_job_id}") hpc_dst_slurm_zip = join(self.slurm_workspaces_dir, f"{workflow_job_id}.zip") self.put_file(local_src=local_src_slurm_zip, remote_dst=hpc_dst_slurm_zip) - self.log.info(f"Put file from local src: {local_src_slurm_zip}, to remote dst: {hpc_dst_slurm_zip}") - self.log.info(f"Leaving put_slurm_workspace, returning: {hpc_dst_slurm_zip}") + self.logger.info(f"Put file from local src: {local_src_slurm_zip}, to remote dst: {hpc_dst_slurm_zip}") + self.logger.info(f"Leaving put_slurm_workspace, returning: {hpc_dst_slurm_zip}") # Zip path inside the HPC environment return hpc_dst_slurm_zip @@ -85,38 +72,38 @@ def pack_and_put_slurm_workspace( self, ocrd_workspace_dir: str, workflow_job_id: str, nextflow_script_path: str, tempdir_prefix: str = "slurm_workspace-" ) -> Tuple[str, str]: - self.log.info(f"Entering put_slurm_workspace") - self.log.info(f"ocrd_workspace_dir: {ocrd_workspace_dir}") - self.log.info(f"workflow_job_id: {workflow_job_id}") - self.log.info(f"nextflow_script_path: {nextflow_script_path}") - self.log.info(f"tempdir_prefix: {tempdir_prefix}") + self.logger.info(f"Entering put_slurm_workspace") + self.logger.info(f"ocrd_workspace_dir: {ocrd_workspace_dir}") + self.logger.info(f"workflow_job_id: {workflow_job_id}") + self.logger.info(f"nextflow_script_path: {nextflow_script_path}") + self.logger.info(f"tempdir_prefix: {tempdir_prefix}") local_src_slurm_zip = self.create_slurm_workspace_zip( ocrd_workspace_dir=ocrd_workspace_dir, workflow_job_id=workflow_job_id, nextflow_script_path=nextflow_script_path, tempdir_prefix=tempdir_prefix) - self.log.info(f"Created slurm 
workspace zip: {local_src_slurm_zip}") + self.logger.info(f"Created slurm workspace zip: {local_src_slurm_zip}") hpc_dst = self.put_slurm_workspace(local_src_slurm_zip=local_src_slurm_zip, workflow_job_id=workflow_job_id) - self.log.info(f"Leaving pack_and_put_slurm_workspace") + self.logger.info(f"Leaving pack_and_put_slurm_workspace") Path(local_src_slurm_zip).unlink(missing_ok=True) return local_src_slurm_zip, hpc_dst def get_and_unpack_slurm_workspace(self, ocrd_workspace_dir: str, workflow_job_dir: str): - self.log.info(f"Entering get_and_unpack_slurm_workspace") - self.log.info(f"ocrd_workspace_dir: {ocrd_workspace_dir}") - self.log.info(f"workflow_job_dir: {workflow_job_dir}") + self.logger.info(f"Entering get_and_unpack_slurm_workspace") + self.logger.info(f"ocrd_workspace_dir: {ocrd_workspace_dir}") + self.logger.info(f"workflow_job_dir: {workflow_job_dir}") # Parse the ocrd workspace id from the dir path ocrd_workspace_id = ocrd_workspace_dir.split('/')[-1] - self.log.info(f"OCR-D workspace id to be used: {ocrd_workspace_id}") + self.logger.info(f"OCR-D workspace id to be used: {ocrd_workspace_id}") workflow_job_id = workflow_job_dir.split('/')[-1] - self.log.info(f"Workflow job id to be used: {workflow_job_id}") + self.logger.info(f"Workflow job id to be used: {workflow_job_id}") get_src = join(self.slurm_workspaces_dir, workflow_job_id, f"{workflow_job_id}.zip") get_dst = join(Path(workflow_job_dir).parent.absolute(), f"{workflow_job_id}.zip") self._get_file_with_retries(remote_src=get_src, local_dst=get_dst) - self.log.info(f"Got workflow job zip file from src: {get_src}, to dst: {get_dst}") + self.logger.info(f"Got workflow job zip file from src: {get_src}, to dst: {get_dst}") unpack_src = get_dst unpack_dst = workflow_job_dir @@ -125,22 +112,22 @@ def get_and_unpack_slurm_workspace(self, ocrd_workspace_dir: str, workflow_job_d except Exception as error: raise Exception( f"Error when unpacking workflow job zip: {error}, unpack_src: {unpack_src}, unpack_dst: {unpack_dst}") - self.log.info(f"Unpacked workflow job zip from src: {unpack_src}, to dst: {unpack_dst}") + self.logger.info(f"Unpacked workflow job zip from src: {unpack_src}, to dst: {unpack_dst}") # Remove the temporary workflow job zip Path(unpack_src).unlink(missing_ok=True) - self.log.info(f"Removed the temp workflow job zip: {unpack_src}") + self.logger.info(f"Removed the temp workflow job zip: {unpack_src}") # Remove the workspace dir from the local storage, # before transferring the results to avoid potential # overwrite errors or duplications rmtree(ocrd_workspace_dir, ignore_errors=True) - self.log.info(f"Removed tree dirs: {ocrd_workspace_dir}") + self.logger.info(f"Removed tree dirs: {ocrd_workspace_dir}") get_src = join(self.slurm_workspaces_dir, workflow_job_id, ocrd_workspace_id, f"{ocrd_workspace_id}.zip") get_dst = join(Path(ocrd_workspace_dir).parent.absolute(), f"{ocrd_workspace_id}.zip") self._get_file_with_retries(remote_src=get_src, local_dst=get_dst) - self.log.info(f"Got workspace zip file from src: {get_src}, to dst: {get_dst}") + self.logger.info(f"Got workspace zip file from src: {get_src}, to dst: {get_dst}") unpack_src = join(Path(ocrd_workspace_dir).parent.absolute(), f"{ocrd_workspace_id}.zip") unpack_dst = ocrd_workspace_dir @@ -149,11 +136,11 @@ def get_and_unpack_slurm_workspace(self, ocrd_workspace_dir: str, workflow_job_d except Exception as error: raise Exception( f"Error when unpacking workspace zip: {error}, unpack_src: {unpack_src}, unpack_dst: {unpack_dst}") - 
self.log.info(f"Unpacked workspace zip from src: {unpack_src}, to dst: {unpack_dst}") + self.logger.info(f"Unpacked workspace zip from src: {unpack_src}, to dst: {unpack_dst}") # Remove the temporary workspace zip Path(unpack_src).unlink(missing_ok=True) - self.log.info(f"Removed the temp workspace zip: {unpack_src}") + self.logger.info(f"Removed the temp workspace zip: {unpack_src}") # Remove the workspace dir from the local workflow job dir, # and. Then create a symlink of the workspace dir inside the @@ -164,12 +151,12 @@ def get_and_unpack_slurm_workspace(self, ocrd_workspace_dir: str, workflow_job_d except Exception as error: raise Exception( f"Error when symlink: {error}, src: {ocrd_workspace_dir}, dst: {workspace_dir_in_workflow_job}") - self.log.info(f"Symlinked from src: {ocrd_workspace_dir}, to dst: {workspace_dir_in_workflow_job}") - self.log.info(f"Leaving get_and_unpack_slurm_workspace") + self.logger.info(f"Symlinked from src: {ocrd_workspace_dir}, to dst: {workspace_dir_in_workflow_job}") + self.logger.info(f"Leaving get_and_unpack_slurm_workspace") def _get_file_with_retries(self, remote_src, local_dst, try_times: int = 100, sleep_time: int = 3): if try_times < 0 or sleep_time < 0: - raise ValueError("Negative values passed for the times") + raise ValueError("Negative value passed as a parameter for time") tries = try_times while tries > 0: try: @@ -178,29 +165,26 @@ def _get_file_with_retries(self, remote_src, local_dst, try_times: int = 100, sl except Exception as error: tries -= 1 if tries <= 0: - raise Exception(f"Error when getting zip file: {error}, " - f"remote_src: {remote_src}, local_dst: {local_dst}") + raise Exception(f"Error getting zip file: {error}, remote_src:{remote_src}, local_dst:{local_dst}") sleep(sleep_time) continue - def mkdir_p(self, remotepath, mode=0o766): - self.recreate_sftp_if_required() - if remotepath == '/': + def mkdir_p(self, remote_path, mode=0o766): + if remote_path == '/': self.sftp_client.chdir('/') # absolute path so change directory to root return False - if remotepath == '': + if remote_path == '': return False # top-level relative directory must exist try: - self.sftp_client.chdir(remotepath) # subdirectory exists + self.sftp_client.chdir(remote_path) # subdirectory exists except IOError as error: - dir_name, base_name = split(remotepath.rstrip('/')) + dir_name, base_name = split(remote_path.rstrip('/')) self.mkdir_p(dir_name) # make parent directories self.sftp_client.mkdir(path=base_name, mode=mode) # subdirectory missing, so created it self.sftp_client.chdir(base_name) return True def get_file(self, remote_src, local_dst): - self.recreate_sftp_if_required() makedirs(name=Path(local_dst).parent.absolute(), exist_ok=True) self.sftp_client.get(remotepath=remote_src, localpath=local_dst) @@ -210,7 +194,6 @@ def get_dir(self, remote_src, local_dst, mode=0o766): The remote source directory needs to exist. All subdirectories in source are created under destination. 
""" - self.recreate_sftp_if_required() makedirs(name=local_dst, mode=mode, exist_ok=True) for item in self.sftp_client.listdir(remote_src): item_src = join(remote_src, item) @@ -221,8 +204,7 @@ def get_dir(self, remote_src, local_dst, mode=0o766): self.get_file(remote_src=item_src, local_dst=item_dst) def put_file(self, local_src, remote_dst): - self.recreate_sftp_if_required() - self.mkdir_p(remotepath=str(Path(remote_dst).parent.absolute())) + self.mkdir_p(remote_path=str(Path(remote_dst).parent.absolute())) self.sftp_client.put(localpath=local_src, remotepath=remote_dst) def put_dir(self, local_src, remote_dst, mode=0o766): @@ -231,8 +213,7 @@ def put_dir(self, local_src, remote_dst, mode=0o766): The remote destination directory needs to exist. All subdirectories in source are created under destination. """ - self.recreate_sftp_if_required() - self.mkdir_p(remotepath=remote_dst, mode=mode) + self.mkdir_p(remote_path=remote_dst, mode=mode) for item in listdir(local_src): item_src = join(local_src, item) item_dst = join(remote_dst, item) From abd68e2b6304e273de6ee0416d5379f42313005e Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 16:42:26 +0200 Subject: [PATCH 05/26] fix: hpc __init__ --- src/utils/operandi_utils/hpc/__init__.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/src/utils/operandi_utils/hpc/__init__.py b/src/utils/operandi_utils/hpc/__init__.py index c9e1d29c..ac8fb93a 100644 --- a/src/utils/operandi_utils/hpc/__init__.py +++ b/src/utils/operandi_utils/hpc/__init__.py @@ -1,9 +1,5 @@ -__all__ = [ - "HPCConnector", - "HPCExecutor", - "HPCTransfer" -] +__all__ = ["NHRConnector", "NHRExecutor", "NHRTransfer"] -from operandi_utils.hpc.connector import HPCConnector -from operandi_utils.hpc.executor import HPCExecutor -from operandi_utils.hpc.transfer import HPCTransfer +from operandi_utils.hpc.nhr_connector import NHRConnector +from operandi_utils.hpc.nhr_executor import NHRExecutor +from operandi_utils.hpc.nhr_transfer import NHRTransfer From 1a45ae06afe68e1fa58501864e4bf8684c904e2c Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 16:43:51 +0200 Subject: [PATCH 06/26] refactor: remove hpc utils file --- src/utils/operandi_utils/hpc/constants.py | 17 +++++++--- src/utils/operandi_utils/hpc/nhr_connector.py | 1 - src/utils/operandi_utils/hpc/utils.py | 31 ------------------- 3 files changed, 12 insertions(+), 37 deletions(-) delete mode 100644 src/utils/operandi_utils/hpc/utils.py diff --git a/src/utils/operandi_utils/hpc/constants.py b/src/utils/operandi_utils/hpc/constants.py index 94dfab6c..37f3a5b8 100644 --- a/src/utils/operandi_utils/hpc/constants.py +++ b/src/utils/operandi_utils/hpc/constants.py @@ -7,16 +7,22 @@ "HPC_JOB_DEADLINE_TIME_TEST", "HPC_JOB_DEFAULT_PARTITION", "HPC_JOB_TEST_PARTITION", + "HPC_NHR_JOB_DEFAULT_PARTITION", + "HPC_NHR_JOB_TEST_PARTITION", "HPC_JOB_QOS_DEFAULT", "HPC_JOB_QOS_LONG", "HPC_JOB_QOS_SHORT", "HPC_JOB_QOS_VERY_LONG", "HPC_PATH_HOME_USERS", - "HPC_PATH_SCRATCH1_OCR_PROJECT", "HPC_ROOT_BASH_SCRIPT", "HPC_SSH_CONNECTION_TRY_TIMES", "HPC_TRANSFER_HOSTS", - "HPC_TRANSFER_PROXY_HOSTS" + "HPC_TRANSFER_PROXY_HOSTS", + + "HPC_NHR_PROJECT", + "HPC_NHR_CLUSTERS", + # TODO: Remove once CI/CD works + "HPC_NHR_SCRATCH_EMMY_HDD" ] HPC_NHR_PROJECT: str = "project_pwieder_ocr_nhr" @@ -27,7 +33,7 @@ HPC_NHR_SCRATCH_EMMY_SSD: str = f"/mnt/lustre-emmy-ssd/projects/{HPC_NHR_PROJECT}" # Capacity - 8.4 PiB HPC_NHR_SCRATCH_GRETE_SSD: str = f"/mnt/lustre-grete/projects/{HPC_NHR_PROJECT}" # 
Capacity - 110 TiB -HPC_NHR_CLUSTER = { +HPC_NHR_CLUSTERS = { "EmmyPhase1": { "host": "glogin-p1.hpc.gwdg.de", "scratch-emmy-hdd": HPC_NHR_SCRATCH_EMMY_HDD, @@ -66,8 +72,7 @@ HPC_TRANSFER_HOSTS = ["transfer-scc.gwdg.de", "transfer-mdc.hpc.gwdg.de"] HPC_TRANSFER_PROXY_HOSTS = ["transfer.gwdg.de", "login.gwdg.de"] HPC_PATH_HOME_USERS = "/home/users" -HPC_PATH_SCRATCH1_OCR_PROJECT = "/scratch1/projects/project_pwieder_ocr" -HPC_ROOT_BASH_SCRIPT = "/scratch1/projects/project_pwieder_ocr/invoke_batch_script.sh" +HPC_ROOT_BASH_SCRIPT = f"{HPC_NHR_SCRATCH_EMMY_HDD}/invoke_batch_script.sh" HPC_DIR_BATCH_SCRIPTS = "batch_scripts" HPC_DIR_SLURM_WORKSPACES = "slurm_workspaces" @@ -75,6 +80,8 @@ HPC_JOB_DEADLINE_TIME_TEST = "0:30:00" HPC_JOB_DEFAULT_PARTITION = "medium" HPC_JOB_TEST_PARTITION = "medium" +HPC_NHR_JOB_DEFAULT_PARTITION = "standard96:shared" +HPC_NHR_JOB_TEST_PARTITION = "standard96:shared" # Check here: https://docs.hpc.gwdg.de/getting_started/transition/index.html HPC_JOB_QOS_SHORT = "2h" diff --git a/src/utils/operandi_utils/hpc/nhr_connector.py b/src/utils/operandi_utils/hpc/nhr_connector.py index 35c42fd8..a9d0c662 100644 --- a/src/utils/operandi_utils/hpc/nhr_connector.py +++ b/src/utils/operandi_utils/hpc/nhr_connector.py @@ -6,7 +6,6 @@ from paramiko import AutoAddPolicy, RSAKey, SSHClient -from .utils import check_keyfile_existence from .constants import HPC_NHR_CLUSTERS class NHRConnector: diff --git a/src/utils/operandi_utils/hpc/utils.py b/src/utils/operandi_utils/hpc/utils.py deleted file mode 100644 index f0aac1f5..00000000 --- a/src/utils/operandi_utils/hpc/utils.py +++ /dev/null @@ -1,31 +0,0 @@ -from pathlib import Path - -from .constants import ( - HPC_DIR_BATCH_SCRIPTS, HPC_DIR_SLURM_WORKSPACES, HPC_PATH_HOME_USERS, HPC_PATH_SCRATCH1_OCR_PROJECT) - - -def check_keyfile_existence(hpc_key_path: Path): - if not hpc_key_path.exists(): - raise FileNotFoundError(f"HPC private key path does not exists: {hpc_key_path}") - if not hpc_key_path.is_file(): - raise FileNotFoundError(f"HPC private key path is not a file: {hpc_key_path}") - - -def resolve_hpc_user_home_dir(username: str) -> str: - return f"{HPC_PATH_HOME_USERS}/{username}" - - -def resolve_hpc_project_root_dir(project_root_dir: str) -> str: - return f"{HPC_PATH_SCRATCH1_OCR_PROJECT}/{project_root_dir}" - - -def resolve_hpc_batch_scripts_dir( - project_root_dir: str, batch_scripts_dir: str = HPC_DIR_BATCH_SCRIPTS -) -> str: - return f"{resolve_hpc_project_root_dir(project_root_dir)}/{batch_scripts_dir}" - - -def resolve_hpc_slurm_workspaces_dir( - project_root_dir: str, slurm_workspaces_dir: str = HPC_DIR_SLURM_WORKSPACES -) -> str: - return f"{resolve_hpc_project_root_dir(project_root_dir)}/{slurm_workspaces_dir}" From e08bf5ae81f424afb8714dfe669d7152a65877c3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 16:44:06 +0200 Subject: [PATCH 07/26] adapt: batch scripts --- .../batch_check_ocrd_all_version.sh | 17 ++++++----- .../batch_create_ocrd_all_maximum_sif.sh | 20 ++++++------- .../batch_download_ocrd_all_models.sh | 29 +++++++++---------- .../batch_submit_workflow_job.sh | 17 +++++------ 4 files changed, 40 insertions(+), 43 deletions(-) diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_check_ocrd_all_version.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_check_ocrd_all_version.sh index be72b3af..24466582 100644 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_check_ocrd_all_version.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_check_ocrd_all_version.sh @@ 
-1,10 +1,10 @@ #!/bin/bash -#SBATCH --constraint scratch -#SBATCH --partition medium -#SBATCH --time 0:05:00 -#SBATCH --output /scratch1/projects/project_pwieder_ocr/batch_job_logs/batch_check_ocrd_all_version_job-%J.txt +#SBATCH --partition standard96:shared +#SBATCH --time 00:05:00 +#SBATCH --qos 2h +#SBATCH --output check_ocrd_all_version_job-%J.txt #SBATCH --cpus-per-task 1 -#STABCH --mem 16G +#SBATCH --mem 16G set -e @@ -12,6 +12,9 @@ hostname /opt/slurm/etc/scripts/misc/slurm_resources module purge -module load singularity +module load apptainer +SIF_PATH="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_all_maximum_image.sif" -singularity exec "/scratch1/projects/project_pwieder_ocr/ocrd_all_maximum_image.sif" ocrd --version +apptainer exec "$SIF_PATH" ocrd --version +apptainer exec "$SIF_PATH" ocrd-tesserocr-recognize --dump-module-dir +apptainer exec "$SIF_PATH" ls -la /models diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_all_maximum_sif.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_all_maximum_sif.sh index 3a732bd3..96500594 100755 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_all_maximum_sif.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_create_ocrd_all_maximum_sif.sh @@ -1,23 +1,23 @@ #!/bin/bash -#SBATCH --constraint scratch -#SBATCH --partition medium -#SBATCH --time 02:00:00 -#SBATCH --output /scratch1/projects/project_pwieder_ocr/batch_job_logs/batch_create_ocrd_all_sif_job-%J.txt +#SBATCH --partition standard96:shared +#SBATCH --time 2:00:00 +#SBATCH --output create_ocrd_all_sif_job-%J.txt #SBATCH --cpus-per-task 16 #SBATCH --mem 64G set -e module purge -module load singularity +module load apptainer hostname /opt/slurm/etc/scripts/misc/slurm_resources -SINGULARITY_CACHE_DIR="/scratch1/projects/project_pwieder_ocr" -SIF_NAME="ocrd_all_maximum_image.sif" +APPTAINER_TMPDIR="$LOCAL_TMPDIR" +APPTAINER_CACHE_DIR="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr" +SIF_NAME="ocrd_all_maximum_image_new.sif" OCRD_ALL_MAXIMUM_IMAGE="docker://ocrd/all:latest" -cd "${SINGULARITY_CACHE_DIR}" || exit -singularity build --disable-cache "${SIF_NAME}" "${OCRD_ALL_MAXIMUM_IMAGE}" -singularity exec "${SIF_NAME}" ocrd --version +cd "${APPTAINER_CACHE_DIR}" || exit +apptainer build --disable-cache "${SIF_NAME}" "${OCRD_ALL_MAXIMUM_IMAGE}" +apptainer exec "${SIF_NAME}" ocrd --version diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_download_ocrd_all_models.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_download_ocrd_all_models.sh index 9ae5bcf7..05e543ab 100644 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_download_ocrd_all_models.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_download_ocrd_all_models.sh @@ -1,22 +1,21 @@ #!/bin/bash -#SBATCH --constraint scratch -#SBATCH --partition medium -#SBATCH --time 06:00:00 -#SBATCH --output /scratch1/projects/project_pwieder_ocr/batch_job_logs/batch_download_all_ocrd_models_job-%J.txt +#SBATCH --partition standard96:shared +#SBATCH --time 6:00:00 +#SBATCH --output download_all_ocrd_models_job-%J.txt #SBATCH --cpus-per-task 16 -#SBATCH --mem 64G +#SBATCH --mem 32G set -e module purge -module load singularity +module load apptainer hostname /opt/slurm/etc/scripts/misc/slurm_resources # This sif file is generated with another batch script -SIF_PATH="/scratch1/projects/project_pwieder_ocr/ocrd_all_maximum_image.sif" -OCRD_MODELS_DIR="/scratch1/projects/project_pwieder_ocr/ocrd_models" 
+SIF_PATH="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_all_maximum_image.sif" +OCRD_MODELS_DIR="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_models" OCRD_MODELS_DIR_IN_DOCKER="/usr/local/share" if [ ! -f "${SIF_PATH}" ]; then @@ -28,11 +27,9 @@ if [ ! -d "${OCRD_MODELS_DIR}" ]; then mkdir -p "${OCRD_MODELS_DIR}" fi -# Download all available ocrd models -singularity exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download '*' -# Download models for ocrd-tesserocr-recognize which are not downloaded with the '*' glob -singularity exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download ocrd-tesserocr-recognize '*' -# Download models for ocrd-kraken-recognize which are not downloaded with the '*' glob -singularity exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download ocrd-kraken-recognize '*' -# Download models for ocrd-calamari-recognize which are not downloaded with the '*' glob -singularity exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download ocrd-calamari-recognize '*' +apptainer exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download -o '*' +apptainer exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download -o ocrd-tesserocr-recognize '*' +apptainer exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download -o ocrd-calamari-recognize '*' +apptainer exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download -o ocrd-kraken-recognize '*' +apptainer exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download -o ocrd-sbb-binarize '*' +apptainer exec --bind "${OCRD_MODELS_DIR}:${OCRD_MODELS_DIR_IN_DOCKER}" "${SIF_PATH}" ocrd resmgr download -o ocrd-cis-ocropy-recognize '*' \ No newline at end of file diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh index a927c064..718071ef 100755 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh @@ -1,5 +1,4 @@ #!/bin/bash -#SBATCH --constraint scratch set -e @@ -18,9 +17,9 @@ set -e # $11 - Boolean flag showing whether a mets server is utilized or not # $12 - File groups to be removed from the workspace after the processing -SIF_PATH="/scratch1/projects/project_pwieder_ocr/ocrd_all_maximum_image.sif" +SIF_PATH="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_all_maximum_image.sif" SIF_PATH_IN_NODE="${TMP_LOCAL}/ocrd_all_maximum_image.sif" -OCRD_MODELS_DIR="/scratch1/projects/project_pwieder_ocr/ocrd_models" +OCRD_MODELS_DIR="/mnt/lustre-emmy-hdd/projects/project_pwieder_ocr_nhr/ocrd_models" OCRD_MODELS_DIR_IN_NODE="${TMP_LOCAL}/ocrd_models" OCRD_MODELS_DIR_IN_DOCKER="/usr/local/share/ocrd-resources" BIND_OCRD_MODELS="${OCRD_MODELS_DIR_IN_NODE}/ocrd-resources:${OCRD_MODELS_DIR_IN_DOCKER}" @@ -51,7 +50,7 @@ hostname /opt/slurm/etc/scripts/misc/slurm_resources module purge -module load singularity +module load apptainer module load nextflow # module load spack-user; eval "$(spack load --sh curl%gcc@10.2.0)" @@ -61,7 +60,6 @@ echo "Workspace dir: $WORKSPACE_DIR" echo "Nextflow script path: $NF_SCRIPT_PATH" echo "Use mets server: $USE_METS_SERVER" echo "Used 
file group: $IN_FILE_GRP" -echo "File groups to remove: $FILE_GROUPS_TO_REMOVE" echo "Pages: $PAGES" # To submit separate jobs for each process in the NF script @@ -107,7 +105,7 @@ transfer_requirements_to_node_storage() { exit 1 else echo "Successfully transferred SIF to node local storage" - singularity exec "$SIF_PATH_IN_NODE" ocrd --version + apptainer exec "$SIF_PATH_IN_NODE" ocrd --version fi cp -R "${OCRD_MODELS_DIR}" "${OCRD_MODELS_DIR_IN_NODE}" @@ -150,7 +148,7 @@ start_mets_server () { if [ "$1" == "true" ] ; then echo "Starting the mets server for the specific workspace in the background" - singularity exec \ + apptainer exec \ --bind "${BIND_WORKSPACE_DIR}" \ "${SIF_PATH_IN_NODE}" \ ocrd workspace -U "${BIND_METS_SOCKET_PATH}" -d "${WORKSPACE_DIR_IN_DOCKER}" server start \ @@ -168,7 +166,7 @@ stop_mets_server () { if [ "$1" == "true" ] ; then echo "Stopping the mets server" - singularity exec \ + apptainer exec \ --bind "${BIND_WORKSPACE_DIR}" \ "${SIF_PATH_IN_NODE}" \ ocrd workspace -U "${BIND_METS_SOCKET_PATH}" -d "${WORKSPACE_DIR_IN_DOCKER}" server stop @@ -227,7 +225,7 @@ list_file_groups_from_workspace () { remove_file_group_from_workspace () { echo "Removing file group: $1" - singularity exec --bind "${BIND_WORKSPACE_DIR}" "${SIF_PATH_IN_NODE}" \ + apptainer exec --bind "${BIND_WORKSPACE_DIR}" "${SIF_PATH_IN_NODE}" \ ocrd workspace -d "${WORKSPACE_DIR_IN_DOCKER}" remove-group -r -f "$1" \ > "${WORKSPACE_DIR}/remove_file_groups.log" 2>&1 } @@ -276,4 +274,3 @@ execute_nextflow_workflow "$USE_METS_SERVER" stop_mets_server "$USE_METS_SERVER" remove_file_groups_from_workspace "$FILE_GROUPS_TO_REMOVE" zip_results -clear_data_from_computing_node From 602b5107a2636afde4b3b82948acb5e410e673f3 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 16:56:24 +0200 Subject: [PATCH 08/26] improve: use standard96s partition, wrapper --- ...voke_batch_script.sh => wrapper_submit_workflow_job.sh} | 0 src/utils/operandi_utils/hpc/constants.py | 7 ++++--- src/utils/operandi_utils/hpc/nhr_connector.py | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) rename src/utils/operandi_utils/hpc/batch_scripts/{invoke_batch_script.sh => wrapper_submit_workflow_job.sh} (100%) diff --git a/src/utils/operandi_utils/hpc/batch_scripts/invoke_batch_script.sh b/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh similarity index 100% rename from src/utils/operandi_utils/hpc/batch_scripts/invoke_batch_script.sh rename to src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh diff --git a/src/utils/operandi_utils/hpc/constants.py b/src/utils/operandi_utils/hpc/constants.py index 37f3a5b8..3991d765 100644 --- a/src/utils/operandi_utils/hpc/constants.py +++ b/src/utils/operandi_utils/hpc/constants.py @@ -72,16 +72,17 @@ HPC_TRANSFER_HOSTS = ["transfer-scc.gwdg.de", "transfer-mdc.hpc.gwdg.de"] HPC_TRANSFER_PROXY_HOSTS = ["transfer.gwdg.de", "login.gwdg.de"] HPC_PATH_HOME_USERS = "/home/users" -HPC_ROOT_BASH_SCRIPT = f"{HPC_NHR_SCRATCH_EMMY_HDD}/invoke_batch_script.sh" HPC_DIR_BATCH_SCRIPTS = "batch_scripts" HPC_DIR_SLURM_WORKSPACES = "slurm_workspaces" +# TODO: Fix the constant file name - it should be automatically resolved +HPC_ROOT_BASH_SCRIPT = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/wrapper_submit_workflow_job.sh" HPC_JOB_DEADLINE_TIME_REGULAR = "48:00:00" HPC_JOB_DEADLINE_TIME_TEST = "0:30:00" HPC_JOB_DEFAULT_PARTITION = "medium" HPC_JOB_TEST_PARTITION = "medium" -HPC_NHR_JOB_DEFAULT_PARTITION = "standard96:shared" 
-HPC_NHR_JOB_TEST_PARTITION = "standard96:shared" +HPC_NHR_JOB_DEFAULT_PARTITION = "standard96s:shared" +HPC_NHR_JOB_TEST_PARTITION = "standard96s:shared" # Check here: https://docs.hpc.gwdg.de/getting_started/transition/index.html HPC_JOB_QOS_SHORT = "2h" diff --git a/src/utils/operandi_utils/hpc/nhr_connector.py b/src/utils/operandi_utils/hpc/nhr_connector.py index a9d0c662..ecfe56f7 100644 --- a/src/utils/operandi_utils/hpc/nhr_connector.py +++ b/src/utils/operandi_utils/hpc/nhr_connector.py @@ -27,7 +27,7 @@ def __init__( self.project_username = project_username self.key_path = Path(key_path) self.key_pass = key_pass - check_keyfile_existence(hpc_key_path=self.key_path) + self.check_keyfile_existence(key_path=self.key_path) self.logger.debug(f"Retrieving hpc frontend server private key file from path: {self.key_path}") self._ssh_client = None self._sftp_client = None From 0d36a34ddb37d8d7eb67f43b3e70f87b1c5d5416 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 17:03:35 +0200 Subject: [PATCH 09/26] adapt: broker workers --- .../operandi_broker/job_status_worker.py | 6 +++--- src/broker/operandi_broker/worker.py | 6 +++--- src/utils/operandi_utils/hpc/constants.py | 15 --------------- tests/fixtures/hpc.py | 18 ------------------ tests/integration_tests/test_full_cycle.py | 4 ++-- tests/tests_utils/conftest.py | 1 - 6 files changed, 8 insertions(+), 42 deletions(-) delete mode 100644 tests/fixtures/hpc.py diff --git a/src/broker/operandi_broker/job_status_worker.py b/src/broker/operandi_broker/job_status_worker.py index 77fe636b..537b0780 100644 --- a/src/broker/operandi_broker/job_status_worker.py +++ b/src/broker/operandi_broker/job_status_worker.py @@ -10,7 +10,7 @@ DBHPCSlurmJob, DBWorkflowJob, DBWorkspace, sync_db_initiate_database, sync_db_get_hpc_slurm_job, sync_db_get_workflow_job, sync_db_get_workspace, sync_db_update_hpc_slurm_job, sync_db_update_workflow_job, sync_db_update_workspace) -from operandi_utils.hpc import HPCExecutor, HPCTransfer +from operandi_utils.hpc import NHRExecutor, NHRTransfer from operandi_utils.rabbitmq import get_connection_consumer @@ -51,9 +51,9 @@ def run(self): signal.signal(signal.SIGTERM, self.signal_handler) sync_db_initiate_database(self.db_url) - self.hpc_executor = HPCExecutor(tunnel_host='localhost', tunnel_port=self.tunnel_port_executor) + self.hpc_executor = NHRExecutor() self.log.info("HPC executor connection successful.") - self.hpc_io_transfer = HPCTransfer(tunnel_host='localhost', tunnel_port=self.tunnel_port_transfer) + self.hpc_io_transfer = NHRTransfer() self.log.info("HPC transfer connection successful.") self.rmq_consumer = get_connection_consumer(rabbitmq_url=self.rmq_url) diff --git a/src/broker/operandi_broker/worker.py b/src/broker/operandi_broker/worker.py index 142392e2..f4cd7fdf 100644 --- a/src/broker/operandi_broker/worker.py +++ b/src/broker/operandi_broker/worker.py @@ -10,7 +10,7 @@ from operandi_utils.database import ( sync_db_initiate_database, sync_db_get_workflow, sync_db_get_workspace, sync_db_create_hpc_slurm_job, sync_db_update_workflow_job, sync_db_update_workspace) -from operandi_utils.hpc import HPCExecutor, HPCTransfer +from operandi_utils.hpc import NHRExecutor, NHRTransfer from operandi_utils.hpc.constants import ( HPC_JOB_DEADLINE_TIME_REGULAR, HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_SHORT, HPC_JOB_QOS_DEFAULT ) @@ -58,9 +58,9 @@ def run(self): signal.signal(signal.SIGTERM, self.signal_handler) sync_db_initiate_database(self.db_url) - self.hpc_executor = 
HPCExecutor(tunnel_host='localhost', tunnel_port=self.tunnel_port_executor) + self.hpc_executor = NHRExecutor() self.log.info("HPC executor connection successful.") - self.hpc_io_transfer = HPCTransfer(tunnel_host='localhost', tunnel_port=self.tunnel_port_transfer) + self.hpc_io_transfer = NHRTransfer() self.log.info("HPC transfer connection successful.") self.rmq_consumer = get_connection_consumer(rabbitmq_url=self.rmq_url) diff --git a/src/utils/operandi_utils/hpc/constants.py b/src/utils/operandi_utils/hpc/constants.py index 3991d765..ca33913e 100644 --- a/src/utils/operandi_utils/hpc/constants.py +++ b/src/utils/operandi_utils/hpc/constants.py @@ -1,23 +1,16 @@ __all__ = [ "HPC_DIR_BATCH_SCRIPTS", "HPC_DIR_SLURM_WORKSPACES", - "HPC_EXECUTOR_HOSTS", - "HPC_EXECUTOR_PROXY_HOSTS", "HPC_JOB_DEADLINE_TIME_REGULAR", "HPC_JOB_DEADLINE_TIME_TEST", - "HPC_JOB_DEFAULT_PARTITION", - "HPC_JOB_TEST_PARTITION", "HPC_NHR_JOB_DEFAULT_PARTITION", "HPC_NHR_JOB_TEST_PARTITION", "HPC_JOB_QOS_DEFAULT", "HPC_JOB_QOS_LONG", "HPC_JOB_QOS_SHORT", "HPC_JOB_QOS_VERY_LONG", - "HPC_PATH_HOME_USERS", "HPC_ROOT_BASH_SCRIPT", "HPC_SSH_CONNECTION_TRY_TIMES", - "HPC_TRANSFER_HOSTS", - "HPC_TRANSFER_PROXY_HOSTS", "HPC_NHR_PROJECT", "HPC_NHR_CLUSTERS", @@ -66,12 +59,6 @@ } } -# "gwdu103.hpc.gwdg.de" - bad host entry, has no access to /scratch1, but to /scratch2 -HPC_EXECUTOR_HOSTS = ["login-mdc.hpc.gwdg.de", "gwdu101.hpc.gwdg.de", "gwdu102.hpc.gwdg.de"] -HPC_EXECUTOR_PROXY_HOSTS = ["login.gwdg.de"] -HPC_TRANSFER_HOSTS = ["transfer-scc.gwdg.de", "transfer-mdc.hpc.gwdg.de"] -HPC_TRANSFER_PROXY_HOSTS = ["transfer.gwdg.de", "login.gwdg.de"] -HPC_PATH_HOME_USERS = "/home/users" HPC_DIR_BATCH_SCRIPTS = "batch_scripts" HPC_DIR_SLURM_WORKSPACES = "slurm_workspaces" # TODO: Fix the constant file name - it should be automatically resolved @@ -79,8 +66,6 @@ HPC_JOB_DEADLINE_TIME_REGULAR = "48:00:00" HPC_JOB_DEADLINE_TIME_TEST = "0:30:00" -HPC_JOB_DEFAULT_PARTITION = "medium" -HPC_JOB_TEST_PARTITION = "medium" HPC_NHR_JOB_DEFAULT_PARTITION = "standard96s:shared" HPC_NHR_JOB_TEST_PARTITION = "standard96s:shared" diff --git a/tests/fixtures/hpc.py b/tests/fixtures/hpc.py deleted file mode 100644 index 9e5e4f51..00000000 --- a/tests/fixtures/hpc.py +++ /dev/null @@ -1,18 +0,0 @@ -from pytest import fixture -from operandi_utils.hpc import HPCExecutor, HPCTransfer - - -@fixture(scope="package", name="hpc_data_transfer") -def fixture_hpc_transfer_connector(): - hpc_transfer_connector = HPCTransfer(tunnel_host="localhost", tunnel_port=22) - # print(hpc_transfer_connector.proxy_hosts) - # print(hpc_transfer_connector.hpc_hosts) - yield hpc_transfer_connector - - -@fixture(scope="package", name="hpc_command_executor") -def fixture_hpc_execution_connector(): - hpc_paramiko_connector = HPCExecutor(tunnel_host="localhost", tunnel_port=22) - # print(hpc_paramiko_connector.proxy_hosts) - # print(hpc_paramiko_connector.hpc_hosts) - yield hpc_paramiko_connector diff --git a/tests/integration_tests/test_full_cycle.py b/tests/integration_tests/test_full_cycle.py index 8a9a1931..2c493603 100644 --- a/tests/integration_tests/test_full_cycle.py +++ b/tests/integration_tests/test_full_cycle.py @@ -5,7 +5,7 @@ from operandi_server.constants import DEFAULT_METS_BASENAME, DEFAULT_FILE_GRP from operandi_utils.constants import StateJob from operandi_utils.rabbitmq import RABBITMQ_QUEUE_HARVESTER, RABBITMQ_QUEUE_JOB_STATUSES -from operandi_utils.hpc.constants import HPC_JOB_TEST_PARTITION +from operandi_utils.hpc.constants import HPC_NHR_JOB_TEST_PARTITION 
from tests.tests_server.helpers_asserts import assert_response_status_code @@ -77,7 +77,7 @@ def test_full_cycle(auth_harvester, operandi, service_broker, bytes_small_worksp "remove_file_grps": ",".join(remove_file_grps_list_odem), "mets_name": DEFAULT_METS_BASENAME }, - "sbatch_args": {"partition": HPC_JOB_TEST_PARTITION, "cpus": 2, "ram": 8} + "sbatch_args": {"partition": HPC_NHR_JOB_TEST_PARTITION, "cpus": 2, "ram": 8} } response = operandi.post(url=f"/workflow/{workflow_id}", json=req_data, auth=auth_harvester) assert_response_status_code(response.status_code, expected_floor=2) diff --git a/tests/tests_utils/conftest.py b/tests/tests_utils/conftest.py index 3bbf7e18..f4732f97 100644 --- a/tests/tests_utils/conftest.py +++ b/tests/tests_utils/conftest.py @@ -1,5 +1,4 @@ pytest_plugins = [ - "tests.fixtures.hpc", "tests.fixtures.hpc_nhr", "tests.fixtures.rabbitmq" ] From dc171ed6ee6b98225f853ffcf4100871b5db4fdf Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 17:05:53 +0200 Subject: [PATCH 10/26] fix: server request model --- src/server/operandi_server/models/base.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/server/operandi_server/models/base.py b/src/server/operandi_server/models/base.py index 6391c313..617c2438 100644 --- a/src/server/operandi_server/models/base.py +++ b/src/server/operandi_server/models/base.py @@ -2,7 +2,7 @@ from typing import Optional from operandi_utils import StateJob -from operandi_utils.hpc.constants import HPC_JOB_DEFAULT_PARTITION +from operandi_utils.hpc.constants import HPC_NHR_JOB_DEFAULT_PARTITION from ..constants import DEFAULT_FILE_GRP, DEFAULT_METS_BASENAME @@ -35,6 +35,6 @@ class WorkflowArguments(BaseModel): class SbatchArguments(BaseModel): - partition: str = HPC_JOB_DEFAULT_PARTITION # partition to be used + partition: str = HPC_NHR_JOB_DEFAULT_PARTITION # partition to be used cpus: int = 4 # cpus per job allocated by default ram: int = 32 # RAM (in GB) per job allocated by default From b6dc3cf74197a83d17cbb02fb3028eb1d55c6b17 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 17:10:56 +0200 Subject: [PATCH 11/26] fix: determine project env --- tests/tests_utils/test_hpc_nhr_executor.py | 8 +++++--- tests/tests_utils/test_hpc_nhr_transfer.py | 7 ++++--- tests/tests_utils/test_hpc_nhr_xcombined.py | 3 ++- 3 files changed, 11 insertions(+), 7 deletions(-) diff --git a/tests/tests_utils/test_hpc_nhr_executor.py b/tests/tests_utils/test_hpc_nhr_executor.py index 7487a235..c2d88b2c 100644 --- a/tests/tests_utils/test_hpc_nhr_executor.py +++ b/tests/tests_utils/test_hpc_nhr_executor.py @@ -1,11 +1,13 @@ from datetime import datetime +from os import environ from os.path import join from time import sleep - -from operandi_utils.hpc.constants import HPC_NHR_SCRATCH_EMMY_HDD +from operandi_utils.hpc.constants import HPC_NHR_CLUSTERS current_time = datetime.now().strftime("%Y%m%d_%H%M") -project_root = join(HPC_NHR_SCRATCH_EMMY_HDD, "operandi_test") +# TODO: Make project root more flexible based on the sub cluster +project_root = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], environ["OPERANDI_HPC_PROJECT_NAME"]) + def test_hpc_connector_executor_mk_dir(hpc_nhr_command_executor): test_dir_name = join(project_root, f"test_dir_{current_time}") diff --git a/tests/tests_utils/test_hpc_nhr_transfer.py b/tests/tests_utils/test_hpc_nhr_transfer.py index 948b9ebd..a9b32c9a 100644 --- a/tests/tests_utils/test_hpc_nhr_transfer.py +++ b/tests/tests_utils/test_hpc_nhr_transfer.py @@ -4,13 
+4,14 @@ from time import sleep from tests.helpers_asserts import assert_exists_dir, assert_exists_file from tests.constants import BATCH_SCRIPT_EMPTY - -from operandi_utils.hpc.constants import HPC_NHR_SCRATCH_EMMY_HDD +from operandi_utils.hpc.constants import HPC_NHR_CLUSTERS OPERANDI_SERVER_BASE_DIR = environ.get("OPERANDI_SERVER_BASE_DIR") current_time = datetime.now().strftime("%Y%m%d_%H%M") ID_WORKSPACE = f"test_folder_{current_time}" -project_root = join(HPC_NHR_SCRATCH_EMMY_HDD, "operandi_test") + +# TODO: Make project root more flexible based on the sub cluster +project_root = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], environ["OPERANDI_HPC_PROJECT_NAME"]) def test_hpc_connector_transfer_file(hpc_nhr_data_transfer, path_batch_script_empty): """ diff --git a/tests/tests_utils/test_hpc_nhr_xcombined.py b/tests/tests_utils/test_hpc_nhr_xcombined.py index 98aa9fac..1f753c0a 100644 --- a/tests/tests_utils/test_hpc_nhr_xcombined.py +++ b/tests/tests_utils/test_hpc_nhr_xcombined.py @@ -20,7 +20,8 @@ ID_WORKFLOW_JOB = f"test_wf_job_{current_time}" ID_WORKSPACE = f"test_ws_{current_time}" -project_root = join(HPC_NHR_CLUSTERS[""], "operandi_test") +# TODO: Make project root more flexible based on the sub cluster +project_root = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], environ["OPERANDI_HPC_PROJECT_NAME"]) batch_scripts_dir = join(project_root, "batch_scripts") From 0e1821b441485f24dbbdec23e7ad326c7f2bbbb8 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 17:18:26 +0200 Subject: [PATCH 12/26] enable: the last 2 hpc tests --- src/utils/operandi_utils/hpc/constants.py | 3 --- tests/tests_utils/test_hpc_nhr_xcombined.py | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/src/utils/operandi_utils/hpc/constants.py b/src/utils/operandi_utils/hpc/constants.py index ca33913e..ee6aef62 100644 --- a/src/utils/operandi_utils/hpc/constants.py +++ b/src/utils/operandi_utils/hpc/constants.py @@ -11,11 +11,8 @@ "HPC_JOB_QOS_VERY_LONG", "HPC_ROOT_BASH_SCRIPT", "HPC_SSH_CONNECTION_TRY_TIMES", - "HPC_NHR_PROJECT", "HPC_NHR_CLUSTERS", - # TODO: Remove once CI/CD works - "HPC_NHR_SCRATCH_EMMY_HDD" ] HPC_NHR_PROJECT: str = "project_pwieder_ocr_nhr" diff --git a/tests/tests_utils/test_hpc_nhr_xcombined.py b/tests/tests_utils/test_hpc_nhr_xcombined.py index 1f753c0a..d9f46e61 100644 --- a/tests/tests_utils/test_hpc_nhr_xcombined.py +++ b/tests/tests_utils/test_hpc_nhr_xcombined.py @@ -93,14 +93,14 @@ def test_hpc_connector_run_batch_script_with_ms(hpc_nhr_command_executor, templa assert finished_successfully -def _test_get_and_unpack_slurm_workspace(hpc_nhr_data_transfer): +def test_get_and_unpack_slurm_workspace(hpc_nhr_data_transfer): hpc_nhr_data_transfer.get_and_unpack_slurm_workspace( ocrd_workspace_dir=join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKSPACES_ROUTER, ID_WORKSPACE), workflow_job_dir=join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKFLOW_JOBS_ROUTER, ID_WORKFLOW_JOB) ) -def _test_get_and_unpack_slurm_workspace_with_ms(hpc_nhr_data_transfer): +def test_get_and_unpack_slurm_workspace_with_ms(hpc_nhr_data_transfer): hpc_nhr_data_transfer.get_and_unpack_slurm_workspace( ocrd_workspace_dir=join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKSPACES_ROUTER, ID_WORKSPACE_WITH_MS), workflow_job_dir=join(OPERANDI_SERVER_BASE_DIR, SERVER_WORKFLOW_JOBS_ROUTER, ID_WORKFLOW_JOB_WITH_MS) From a62764a1499ffc76f93620f57091f8e4c762c513 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 17:21:47 +0200 Subject: [PATCH 13/26] add: put_batch_script --- 
src/utils/operandi_utils/hpc/nhr_connector.py | 1 + src/utils/operandi_utils/hpc/nhr_transfer.py | 7 +++++++ 2 files changed, 8 insertions(+) diff --git a/src/utils/operandi_utils/hpc/nhr_connector.py b/src/utils/operandi_utils/hpc/nhr_connector.py index ecfe56f7..1d14f944 100644 --- a/src/utils/operandi_utils/hpc/nhr_connector.py +++ b/src/utils/operandi_utils/hpc/nhr_connector.py @@ -33,6 +33,7 @@ def __init__( self._sftp_client = None # TODO: Make the sub cluster options selectable self.project_root_dir = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], project_env) + self.batch_scripts_dir = join(self.project_root_dir, "batch_scripts") self.slurm_workspaces_dir = join(self.project_root_dir, "slurm_workspaces") @property diff --git a/src/utils/operandi_utils/hpc/nhr_transfer.py b/src/utils/operandi_utils/hpc/nhr_transfer.py index f1df397d..aa05866d 100644 --- a/src/utils/operandi_utils/hpc/nhr_transfer.py +++ b/src/utils/operandi_utils/hpc/nhr_transfer.py @@ -23,6 +23,13 @@ def sftp_client(self): # self._sftp_client.get_channel().get_transport().set_keepalive(30) return self._sftp_client + def put_batch_script(self, batch_script_id: str) -> str: + local_batch_script_path = join(dirname(__file__), "batch_scripts", batch_script_id) + hpc_batch_script_path = join(self.batch_scripts_dir, batch_script_id) + self.put_file(local_src=local_batch_script_path, remote_dst=hpc_batch_script_path) + self.logger.info(f"Put file from local src: {local_batch_script_path}, to dst: {hpc_batch_script_path}") + return hpc_batch_script_path + def create_slurm_workspace_zip( self, ocrd_workspace_dir: str, workflow_job_id: str, nextflow_script_path: str, tempdir_prefix: str = "slurm_workspace-" From 2c8a3fbd41c4ad5ada7ae18f6e0b32a16bf3de74 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 17:28:18 +0200 Subject: [PATCH 14/26] increase integration test resources --- tests/integration_tests/test_full_cycle.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_full_cycle.py b/tests/integration_tests/test_full_cycle.py index 2c493603..ec1c6ec6 100644 --- a/tests/integration_tests/test_full_cycle.py +++ b/tests/integration_tests/test_full_cycle.py @@ -77,7 +77,7 @@ def test_full_cycle(auth_harvester, operandi, service_broker, bytes_small_worksp "remove_file_grps": ",".join(remove_file_grps_list_odem), "mets_name": DEFAULT_METS_BASENAME }, - "sbatch_args": {"partition": HPC_NHR_JOB_TEST_PARTITION, "cpus": 2, "ram": 8} + "sbatch_args": {"partition": HPC_NHR_JOB_TEST_PARTITION, "cpus": 8, "ram": 32} } response = operandi.post(url=f"/workflow/{workflow_id}", json=req_data, auth=auth_harvester) assert_response_status_code(response.status_code, expected_floor=2) From 352723c4ef293e22e5f1e3847167d20821bc0270 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Mon, 26 Aug 2024 17:34:10 +0200 Subject: [PATCH 15/26] fix: singularity -> apptainer --- .../hpc/batch_scripts/batch_submit_workflow_job.sh | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh index 718071ef..3f0b328c 100755 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh @@ -140,7 +140,7 @@ unzip_workflow_job_dir () { start_mets_server () { # TODO: Would be better to start the mets server as an instance, but this is still 
broken - # singularity instance start \ + # apptainer instance start \ # --bind "${BIND_WORKSPACE_DIR}" \ # "${SIF_PATH_IN_NODE}" \ # instance_mets_server \ @@ -162,7 +162,7 @@ stop_mets_server () { # curl -X DELETE --unix-socket "${WORKSPACE_DIR}/${METS_SOCKET_BASENAME}" "http://localhost/" # TODO Stop the instance here - # singularity instance stop instance_mets_server + # apptainer instance stop instance_mets_server if [ "$1" == "true" ] ; then echo "Stopping the mets server" @@ -174,7 +174,7 @@ stop_mets_server () { } execute_nextflow_workflow () { - local SINGULARITY_CMD="singularity exec --bind ${BIND_WORKSPACE_DIR} --bind ${BIND_OCRD_MODELS} --env OCRD_METS_CACHING=false ${SIF_PATH_IN_NODE}" + local APPTAINER_CMD="apptainer exec --bind ${BIND_WORKSPACE_DIR} --bind ${BIND_OCRD_MODELS} --env OCRD_METS_CACHING=false ${SIF_PATH_IN_NODE}" if [ "$1" == "true" ] ; then echo "Executing the nextflow workflow with mets server" nextflow run "${NF_SCRIPT_PATH}" \ @@ -185,7 +185,7 @@ execute_nextflow_workflow () { --mets_socket "${BIND_METS_SOCKET_PATH}" \ --workspace_dir "${WORKSPACE_DIR_IN_DOCKER}" \ --pages "${PAGES}" \ - --singularity_wrapper "${SINGULARITY_CMD}" \ + --singularity_wrapper "${APPTAINER_CMD}" \ --cpus "${CPUS}" \ --ram "${RAM}" \ --forks "${FORKS}" @@ -198,7 +198,7 @@ execute_nextflow_workflow () { --mets "${BIND_METS_FILE_PATH}" \ --workspace_dir "${WORKSPACE_DIR_IN_DOCKER}" \ --pages "${PAGES}" \ - --singularity_wrapper "${SINGULARITY_CMD}" \ + --singularity_wrapper "${APPTAINER_CMD}" \ --cpus "${CPUS}" \ --ram "${RAM}" \ --forks "${FORKS}" @@ -212,7 +212,7 @@ execute_nextflow_workflow () { list_file_groups_from_workspace () { all_file_groups=() - mapfile -t all_file_groups < <(singularity exec --bind "${BIND_WORKSPACE_DIR}" "${SIF_PATH_IN_NODE}" ocrd workspace -d "${WORKSPACE_DIR_IN_DOCKER}" list-group) + mapfile -t all_file_groups < <(apptainer exec --bind "${BIND_WORKSPACE_DIR}" "${SIF_PATH_IN_NODE}" ocrd workspace -d "${WORKSPACE_DIR_IN_DOCKER}" list-group) file_groups_length=${#all_file_groups[@]} echo -n "File groups: " for file_group in "${all_file_groups[@]}" From 7198391091c9dde2836f612858db5056c00e5600 Mon Sep 17 00:00:00 2001 From: Mehmed Mustafa Date: Tue, 27 Aug 2024 01:56:02 +0200 Subject: [PATCH 16/26] refactor: improve code, remove constants --- Makefile | 2 +- src/broker/operandi_broker/worker.py | 12 +++---- .../batch_submit_workflow_job.sh | 33 ++++++++++++------- .../wrapper_submit_workflow_job.sh | 3 ++ src/utils/operandi_utils/hpc/constants.py | 6 ++-- src/utils/operandi_utils/hpc/nhr_executor.py | 12 ++++--- src/utils/operandi_utils/hpc/nhr_transfer.py | 7 ---- tests/constants.py | 3 -- tests/fixtures/batch_scripts.py | 11 ++----- ..._executor.py => _test_hpc_nhr_executor.py} | 12 +++---- ..._transfer.py => _test_hpc_nhr_transfer.py} | 8 ++--- tests/tests_utils/test_hpc_nhr_xcombined.py | 26 ++++----------- 12 files changed, 57 insertions(+), 78 deletions(-) rename tests/tests_utils/{test_hpc_nhr_executor.py => _test_hpc_nhr_executor.py} (73%) rename tests/tests_utils/{test_hpc_nhr_transfer.py => _test_hpc_nhr_transfer.py} (81%) diff --git a/Makefile b/Makefile index 41ec20c5..ffcbc8d2 100755 --- a/Makefile +++ b/Makefile @@ -115,7 +115,7 @@ run-tests: run-tests-utils run-tests-broker run-tests-server run-tests-harvester run-tests-utils: export $(shell sed 's/=.*//' ./tests/.env) - pytest tests/tests_utils/test_*.py -v + pytest tests/tests_utils/test_*.py -s -v run-tests-broker: export $(shell sed 's/=.*//' ./tests/.env) diff --git 
a/src/broker/operandi_broker/worker.py b/src/broker/operandi_broker/worker.py index f4cd7fdf..cfe469ae 100644 --- a/src/broker/operandi_broker/worker.py +++ b/src/broker/operandi_broker/worker.py @@ -12,8 +12,8 @@ sync_db_update_workflow_job, sync_db_update_workspace) from operandi_utils.hpc import NHRExecutor, NHRTransfer from operandi_utils.hpc.constants import ( - HPC_JOB_DEADLINE_TIME_REGULAR, HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_SHORT, HPC_JOB_QOS_DEFAULT -) + HPC_BATCH_SUBMIT_WORKFLOW_JOB, HPC_JOB_DEADLINE_TIME_REGULAR, HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_SHORT, + HPC_JOB_QOS_DEFAULT) from operandi_utils.rabbitmq import get_connection_consumer @@ -214,8 +214,6 @@ def prepare_and_trigger_slurm_job( # self.hpc_io_transfer = HPCTransfer(tunel_host='localhost', tunel_port=4023) # self.log.info("HPC transfer connection renewed successfully.") - hpc_batch_script_path = self.hpc_io_transfer.put_batch_script(batch_script_id="batch_submit_workflow_job.sh") - try: sync_db_update_workspace(find_workspace_id=workspace_id, state=StateWorkspace.TRANSFERRING_TO_HPC) sync_db_update_workflow_job(find_job_id=workflow_job_id, job_state=StateJob.TRANSFERRING_TO_HPC) @@ -228,8 +226,8 @@ def prepare_and_trigger_slurm_job( try: # NOTE: The paths below must be a valid existing path inside the HPC slurm_job_id = self.hpc_executor.trigger_slurm_job( - batch_script_path=hpc_batch_script_path, workflow_job_id=workflow_job_id, - nextflow_script_path=workflow_script_path, workspace_id=workspace_id, mets_basename=workspace_base_mets, + workflow_job_id=workflow_job_id, nextflow_script_path=workflow_script_path, + workspace_id=workspace_id, mets_basename=workspace_base_mets, input_file_grp=input_file_grp, nf_process_forks=nf_process_forks, ws_pages_amount=ws_pages_amount, use_mets_server=use_mets_server, file_groups_to_remove=file_groups_to_remove, cpus=cpus, ram=ram, job_deadline_time=job_deadline_time, partition=partition, qos=qos) @@ -239,7 +237,7 @@ def prepare_and_trigger_slurm_job( try: sync_db_create_hpc_slurm_job( workflow_job_id=workflow_job_id, hpc_slurm_job_id=slurm_job_id, - hpc_batch_script_path=hpc_batch_script_path, + hpc_batch_script_path=HPC_BATCH_SUBMIT_WORKFLOW_JOB, hpc_slurm_workspace_path=join(self.hpc_io_transfer.slurm_workspaces_dir, workflow_job_id)) except Exception as error: raise Exception(f"Failed to save the hpc slurm job in DB: {error}") diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh index 3f0b328c..3ace254d 100755 --- a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh @@ -67,18 +67,20 @@ echo "Pages: $PAGES" # Define functions to be used -check_existence_of_paths () { +check_existence_of_paths() { # The SIF file of the OCR-D All docker image must be previously created if [ ! -f "${SIF_PATH}" ]; then echo "Required ocrd_all_image sif file not found at: ${SIF_PATH}" exit 1 fi + echo "Required ocrd_all_image sif file found at: ${SIF_PATH}" # Models directory must be previously filled with OCR-D models if [ ! -d "${OCRD_MODELS_DIR}" ]; then echo "Ocrd models directory not found at: ${OCRD_MODELS_DIR}" exit 1 fi + echo "Ocrd models directory found at: ${OCRD_MODELS_DIR}" if [ ! 
-d "${SCRATCH_BASE}" ]; then mkdir -p "${SCRATCH_BASE}" @@ -88,9 +90,10 @@ check_existence_of_paths () { echo "Required scratch base dir was not created: ${SCRATCH_BASE}" exit 1 fi + echo "Scratch base found/created at: ${SCRATCH_BASE}" } -clear_data_from_computing_node () { +clear_data_from_computing_node() { echo "If existing, removing the SIF from the computing node, path: ${SIF_PATH_IN_NODE}" rm -f "${SIF_PATH_IN_NODE}" echo "If existing, removing the OCR-D models from the computing node, path: ${OCRD_MODELS_DIR_IN_NODE}" @@ -118,7 +121,7 @@ transfer_requirements_to_node_storage() { fi } -unzip_workflow_job_dir () { +unzip_workflow_job_dir() { if [ ! -f "${WORKFLOW_JOB_DIR}.zip" ]; then echo "Required scratch slurm workspace zip is not available: ${WORKFLOW_JOB_DIR}.zip" exit 1 @@ -138,7 +141,7 @@ unzip_workflow_job_dir () { cd "${WORKFLOW_JOB_DIR}" || exit 1 } -start_mets_server () { +start_mets_server() { # TODO: Would be better to start the mets server as an instance, but this is still broken # apptainer instance start \ # --bind "${BIND_WORKSPACE_DIR}" \ @@ -157,12 +160,12 @@ start_mets_server () { sleep 10 } -stop_mets_server () { +stop_mets_server() { # Not supported in the HPC (the version there is <7.40) # curl -X DELETE --unix-socket "${WORKSPACE_DIR}/${METS_SOCKET_BASENAME}" "http://localhost/" # TODO Stop the instance here - # apptainer instance stop instance_mets_server + # singularity instance stop instance_mets_server if [ "$1" == "true" ] ; then echo "Stopping the mets server" @@ -173,7 +176,7 @@ stop_mets_server () { fi } -execute_nextflow_workflow () { +execute_nextflow_workflow() { local APPTAINER_CMD="apptainer exec --bind ${BIND_WORKSPACE_DIR} --bind ${BIND_OCRD_MODELS} --env OCRD_METS_CACHING=false ${SIF_PATH_IN_NODE}" if [ "$1" == "true" ] ; then echo "Executing the nextflow workflow with mets server" @@ -210,7 +213,7 @@ execute_nextflow_workflow () { esac } -list_file_groups_from_workspace () { +list_file_groups_from_workspace() { all_file_groups=() mapfile -t all_file_groups < <(apptainer exec --bind "${BIND_WORKSPACE_DIR}" "${SIF_PATH_IN_NODE}" ocrd workspace -d "${WORKSPACE_DIR_IN_DOCKER}" list-group) file_groups_length=${#all_file_groups[@]} @@ -223,14 +226,14 @@ list_file_groups_from_workspace () { echo } -remove_file_group_from_workspace () { +remove_file_group_from_workspace() { echo "Removing file group: $1" apptainer exec --bind "${BIND_WORKSPACE_DIR}" "${SIF_PATH_IN_NODE}" \ ocrd workspace -d "${WORKSPACE_DIR_IN_DOCKER}" remove-group -r -f "$1" \ > "${WORKSPACE_DIR}/remove_file_groups.log" 2>&1 } -remove_file_groups_from_workspace () { +remove_file_groups_from_workspace() { list_file_groups_from_workspace if [ "$1" != "" ] ; then echo "Splitting file groups to an array" @@ -250,7 +253,7 @@ remove_file_groups_from_workspace () { fi } -zip_results () { +zip_results() { # Delete symlinks created for the Nextflow workers find "${WORKFLOW_JOB_DIR}" -type l -delete # Create a zip of the ocrd workspace dir @@ -266,11 +269,19 @@ zip_results () { # Main loop for workflow job execution +echo "About to check existence of paths" check_existence_of_paths +echo "About to unzip workflow job dir" unzip_workflow_job_dir +echo "About to transfer requirements to node storage" transfer_requirements_to_node_storage +echo "About to start mets server if enabled" start_mets_server "$USE_METS_SERVER" +echo "About to execute nextflow workflow" execute_nextflow_workflow "$USE_METS_SERVER" +echo "About to stop mets server if enabled" stop_mets_server "$USE_METS_SERVER" +echo 
"About to remove file groups from workspace" remove_file_groups_from_workspace "$FILE_GROUPS_TO_REMOVE" +echo "About to zip results" zip_results diff --git a/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh b/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh index e2ad3028..a3f6255a 100644 --- a/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh +++ b/src/utils/operandi_utils/hpc/batch_scripts/wrapper_submit_workflow_job.sh @@ -28,3 +28,6 @@ if [ "$6" == "48h" ] ; then else sbatch --partition="$1" --time="$2" --output="$3" --cpus-per-task="$4" --mem="$5" --qos="$6" "$7" "$8" "$9" "${10}" "${11}" "${12}" "${13}" "${14}" "${15}" "${16}" "${17}" "${18}" "${19}" fi + +echo "0:$0 1:$1 2:$2 3:$3 4:$4 5:$5 6:$6 7:$7 8:$8 9:$9 10:${10}" +echo "11:${11} 12:${12} 13:${13} 14:${14} 15:${15} 16:${16} 17:${17} 18:${18} 19:${19}" diff --git a/src/utils/operandi_utils/hpc/constants.py b/src/utils/operandi_utils/hpc/constants.py index ee6aef62..024202ef 100644 --- a/src/utils/operandi_utils/hpc/constants.py +++ b/src/utils/operandi_utils/hpc/constants.py @@ -1,4 +1,5 @@ __all__ = [ + "HPC_BATCH_SUBMIT_WORKFLOW_JOB", "HPC_DIR_BATCH_SCRIPTS", "HPC_DIR_SLURM_WORKSPACES", "HPC_JOB_DEADLINE_TIME_REGULAR", @@ -9,10 +10,10 @@ "HPC_JOB_QOS_LONG", "HPC_JOB_QOS_SHORT", "HPC_JOB_QOS_VERY_LONG", - "HPC_ROOT_BASH_SCRIPT", "HPC_SSH_CONNECTION_TRY_TIMES", "HPC_NHR_PROJECT", "HPC_NHR_CLUSTERS", + "HPC_WRAPPER_SUBMIT_WORKFLOW_JOB" ] HPC_NHR_PROJECT: str = "project_pwieder_ocr_nhr" @@ -59,7 +60,8 @@ HPC_DIR_BATCH_SCRIPTS = "batch_scripts" HPC_DIR_SLURM_WORKSPACES = "slurm_workspaces" # TODO: Fix the constant file name - it should be automatically resolved -HPC_ROOT_BASH_SCRIPT = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/wrapper_submit_workflow_job.sh" +HPC_BATCH_SUBMIT_WORKFLOW_JOB = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/batch_submit_workflow_job.sh" +HPC_WRAPPER_SUBMIT_WORKFLOW_JOB = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/wrapper_submit_workflow_job.sh" HPC_JOB_DEADLINE_TIME_REGULAR = "48:00:00" HPC_JOB_DEADLINE_TIME_TEST = "0:30:00" diff --git a/src/utils/operandi_utils/hpc/nhr_executor.py b/src/utils/operandi_utils/hpc/nhr_executor.py index e9c70c51..de1db376 100644 --- a/src/utils/operandi_utils/hpc/nhr_executor.py +++ b/src/utils/operandi_utils/hpc/nhr_executor.py @@ -3,7 +3,8 @@ from operandi_utils.constants import StateJobSlurm from .constants import ( - HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_DEFAULT, HPC_NHR_JOB_DEFAULT_PARTITION, HPC_ROOT_BASH_SCRIPT + HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_DEFAULT, HPC_NHR_JOB_DEFAULT_PARTITION, HPC_BATCH_SUBMIT_WORKFLOW_JOB, + HPC_WRAPPER_SUBMIT_WORKFLOW_JOB ) from .nhr_connector import NHRConnector @@ -29,7 +30,7 @@ def execute_blocking(self, command, timeout=None, environment=None): return output, err, return_code def trigger_slurm_job( - self, batch_script_path: str, workflow_job_id: str, nextflow_script_path: str, input_file_grp: str, + self, workflow_job_id: str, nextflow_script_path: str, input_file_grp: str, workspace_id: str, mets_basename: str, nf_process_forks: int, ws_pages_amount: int, use_mets_server: bool, file_groups_to_remove: str, cpus: int = 2, ram: int = 8, job_deadline_time: str = HPC_JOB_DEADLINE_TIME_TEST, partition: str = HPC_NHR_JOB_DEFAULT_PARTITION, qos: str = HPC_JOB_QOS_DEFAULT @@ -44,7 +45,7 @@ def trigger_slurm_job( nextflow_script_id = nextflow_script_path.split('/')[-1] use_mets_server_bash_flag = "true" if use_mets_server else "false" 
- command = f"{HPC_ROOT_BASH_SCRIPT}" + command = f"{HPC_WRAPPER_SUBMIT_WORKFLOW_JOB}" # SBATCH arguments passed to the batch script command += f" {partition}" @@ -55,7 +56,7 @@ def trigger_slurm_job( command += f" {qos}" # Regular arguments passed to the batch script - command += f" {batch_script_path}" + command += f" {HPC_BATCH_SUBMIT_WORKFLOW_JOB}" command += f" {self.slurm_workspaces_dir}" command += f" {workflow_job_id}" command += f" {nextflow_script_id}" @@ -71,6 +72,9 @@ def trigger_slurm_job( self.logger.info(f"About to execute a blocking command: {command}") output, err, return_code = self.execute_blocking(command) + print(f"Command output: {output}") + print(f"Command err: {err}") + print(f"Command return code: {return_code}") self.logger.info(f"Command output: {output}") self.logger.info(f"Command err: {err}") self.logger.info(f"Command return code: {return_code}") diff --git a/src/utils/operandi_utils/hpc/nhr_transfer.py b/src/utils/operandi_utils/hpc/nhr_transfer.py index aa05866d..f1df397d 100644 --- a/src/utils/operandi_utils/hpc/nhr_transfer.py +++ b/src/utils/operandi_utils/hpc/nhr_transfer.py @@ -23,13 +23,6 @@ def sftp_client(self): # self._sftp_client.get_channel().get_transport().set_keepalive(30) return self._sftp_client - def put_batch_script(self, batch_script_id: str) -> str: - local_batch_script_path = join(dirname(__file__), "batch_scripts", batch_script_id) - hpc_batch_script_path = join(self.batch_scripts_dir, batch_script_id) - self.put_file(local_src=local_batch_script_path, remote_dst=hpc_batch_script_path) - self.logger.info(f"Put file from local src: {local_batch_script_path}, to dst: {hpc_batch_script_path}") - return hpc_batch_script_path - def create_slurm_workspace_zip( self, ocrd_workspace_dir: str, workflow_job_id: str, nextflow_script_path: str, tempdir_prefix: str = "slurm_workspace-" diff --git a/tests/constants.py b/tests/constants.py index 671c0a1d..ab8203b0 100644 --- a/tests/constants.py +++ b/tests/constants.py @@ -1,6 +1,3 @@ BATCH_SCRIPT_EMPTY = "test_empty.sh" -BATCH_SCRIPTS_ROUTER_DIR = "batch_scripts" -BATCH_SUBMIT_WORKFLOW_JOB = "batch_submit_workflow_job.sh" - WORKFLOWS_ROUTER_DIR = "workflows" WORKSPACES_ROUTER_DIR = "workspaces" diff --git a/tests/fixtures/batch_scripts.py b/tests/fixtures/batch_scripts.py index ceae6204..efdaa2d4 100644 --- a/tests/fixtures/batch_scripts.py +++ b/tests/fixtures/batch_scripts.py @@ -1,14 +1,7 @@ from pytest import fixture +from tests.constants import BATCH_SCRIPT_EMPTY from tests.helpers_utils import to_asset_path -from ..constants import BATCH_SCRIPT_EMPTY, BATCH_SCRIPTS_ROUTER_DIR, BATCH_SUBMIT_WORKFLOW_JOB - - @fixture(scope="package", name="path_batch_script_empty") def fixture_path_batch_script_empty(): - yield to_asset_path(resource_type=BATCH_SCRIPTS_ROUTER_DIR, name=BATCH_SCRIPT_EMPTY) - - -@fixture(scope="package", name="path_batch_script_submit_workflow_job") -def fixture_path_batch_script_submit_workflow_job(): - yield to_asset_path(resource_type=BATCH_SCRIPTS_ROUTER_DIR, name=BATCH_SUBMIT_WORKFLOW_JOB) + yield to_asset_path(resource_type="batch_scripts", name=BATCH_SCRIPT_EMPTY) diff --git a/tests/tests_utils/test_hpc_nhr_executor.py b/tests/tests_utils/_test_hpc_nhr_executor.py similarity index 73% rename from tests/tests_utils/test_hpc_nhr_executor.py rename to tests/tests_utils/_test_hpc_nhr_executor.py index c2d88b2c..9dd4bcfe 100644 --- a/tests/tests_utils/test_hpc_nhr_executor.py +++ b/tests/tests_utils/_test_hpc_nhr_executor.py @@ -1,16 +1,12 @@ from datetime import datetime 
-from os import environ from os.path import join from time import sleep -from operandi_utils.hpc.constants import HPC_NHR_CLUSTERS current_time = datetime.now().strftime("%Y%m%d_%H%M") -# TODO: Make project root more flexible based on the sub cluster -project_root = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], environ["OPERANDI_HPC_PROJECT_NAME"]) def test_hpc_connector_executor_mk_dir(hpc_nhr_command_executor): - test_dir_name = join(project_root, f"test_dir_{current_time}") + test_dir_name = join(hpc_nhr_command_executor.project_root_dir, f"test_dir_{current_time}") sleep(0.5) output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'mkdir -p {test_dir_name}'") assert return_code == 0, err @@ -19,7 +15,7 @@ def test_hpc_connector_executor_mk_dir(hpc_nhr_command_executor): def test_hpc_connector_executor_rm_dir_negative(hpc_nhr_command_executor): - test_dir_name = join(project_root, f"test_dir_{current_time}") + test_dir_name = join(hpc_nhr_command_executor.project_root_dir, f"test_dir_{current_time}") sleep(0.5) output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'rm {test_dir_name}'") assert return_code == 1 @@ -29,7 +25,7 @@ def test_hpc_connector_executor_rm_dir_negative(hpc_nhr_command_executor): def test_hpc_connector_executor_rm_dir_positive(hpc_nhr_command_executor): - test_dir_name = join(project_root, f"test_dir_{current_time}") + test_dir_name = join(hpc_nhr_command_executor.project_root_dir, f"test_dir_{current_time}") sleep(0.5) output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'rm -rf {test_dir_name}'") assert return_code == 0 @@ -38,7 +34,7 @@ def test_hpc_connector_executor_rm_dir_positive(hpc_nhr_command_executor): def test_hpc_connector_executor_cd_dir(hpc_nhr_command_executor): - test_dir_name = join(project_root, f"test_dir_{current_time}") + test_dir_name = join(hpc_nhr_command_executor.project_root_dir, f"test_dir_{current_time}") sleep(0.5) output, err, return_code = hpc_nhr_command_executor.execute_blocking(command=f"bash -lc 'cd {test_dir_name}'") assert return_code == 1 diff --git a/tests/tests_utils/test_hpc_nhr_transfer.py b/tests/tests_utils/_test_hpc_nhr_transfer.py similarity index 81% rename from tests/tests_utils/test_hpc_nhr_transfer.py rename to tests/tests_utils/_test_hpc_nhr_transfer.py index a9b32c9a..e30747a4 100644 --- a/tests/tests_utils/test_hpc_nhr_transfer.py +++ b/tests/tests_utils/_test_hpc_nhr_transfer.py @@ -4,22 +4,18 @@ from time import sleep from tests.helpers_asserts import assert_exists_dir, assert_exists_file from tests.constants import BATCH_SCRIPT_EMPTY -from operandi_utils.hpc.constants import HPC_NHR_CLUSTERS OPERANDI_SERVER_BASE_DIR = environ.get("OPERANDI_SERVER_BASE_DIR") current_time = datetime.now().strftime("%Y%m%d_%H%M") ID_WORKSPACE = f"test_folder_{current_time}" -# TODO: Make project root more flexible based on the sub cluster -project_root = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], environ["OPERANDI_HPC_PROJECT_NAME"]) - def test_hpc_connector_transfer_file(hpc_nhr_data_transfer, path_batch_script_empty): """ Testing the put_file and get_file functionality of the HPC transfer """ assert_exists_file(path_batch_script_empty) - test_hpc_file_path = join(project_root, BATCH_SCRIPT_EMPTY) + test_hpc_file_path = join(hpc_nhr_data_transfer.project_root_dir, BATCH_SCRIPT_EMPTY) hpc_nhr_data_transfer.put_file(local_src=path_batch_script_empty, remote_dst=test_hpc_file_path) sleep(2) 
test_local_received_file_path = join(OPERANDI_SERVER_BASE_DIR, BATCH_SCRIPT_EMPTY) @@ -33,7 +29,7 @@ def test_hpc_connector_transfer_dir(hpc_nhr_data_transfer, path_dummy_workspace_ Testing the put_dir and get_dir functionality of the HPC transfer """ assert_exists_dir(path_dummy_workspace_data_dir) - test_hpc_dir_path = join(project_root, ID_WORKSPACE) + test_hpc_dir_path = join(hpc_nhr_data_transfer.project_root_dir, ID_WORKSPACE) hpc_nhr_data_transfer.put_dir(local_src=path_dummy_workspace_data_dir, remote_dst=test_hpc_dir_path) sleep(5) test_local_received_dir_path = join(OPERANDI_SERVER_BASE_DIR, ID_WORKSPACE) diff --git a/tests/tests_utils/test_hpc_nhr_xcombined.py b/tests/tests_utils/test_hpc_nhr_xcombined.py index d9f46e61..7dfd5d1b 100644 --- a/tests/tests_utils/test_hpc_nhr_xcombined.py +++ b/tests/tests_utils/test_hpc_nhr_xcombined.py @@ -7,9 +7,8 @@ DEFAULT_FILE_GRP, DEFAULT_METS_BASENAME, SERVER_WORKFLOW_JOBS_ROUTER, SERVER_WORKSPACES_ROUTER ) from operandi_utils.hpc.constants import ( - HPC_JOB_DEADLINE_TIME_TEST, HPC_NHR_JOB_TEST_PARTITION, HPC_JOB_QOS_SHORT, HPC_NHR_CLUSTERS + HPC_JOB_DEADLINE_TIME_TEST, HPC_NHR_JOB_TEST_PARTITION, HPC_JOB_QOS_SHORT ) -from tests.constants import BATCH_SUBMIT_WORKFLOW_JOB from tests.helpers_asserts import assert_exists_dir, assert_exists_file OPERANDI_SERVER_BASE_DIR = environ.get("OPERANDI_SERVER_BASE_DIR") @@ -20,11 +19,6 @@ ID_WORKFLOW_JOB = f"test_wf_job_{current_time}" ID_WORKSPACE = f"test_ws_{current_time}" -# TODO: Make project root more flexible based on the sub cluster -project_root = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], environ["OPERANDI_HPC_PROJECT_NAME"]) -batch_scripts_dir = join(project_root, "batch_scripts") - - def helper_pack_and_put_slurm_workspace( hpc_nhr_data_transfer, workflow_job_id: str, workspace_id: str, path_workflow: str, path_workspace_dir: str ): @@ -46,12 +40,6 @@ def helper_pack_and_put_slurm_workspace( Path(local_slurm_workspace_zip_path).unlink(missing_ok=True) - -def test_hpc_connector_put_batch_script(hpc_nhr_data_transfer, path_batch_script_submit_workflow_job): - hpc_batch_script_path = join(batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB) - hpc_nhr_data_transfer.put_file(local_src=path_batch_script_submit_workflow_job, remote_dst=hpc_batch_script_path) - - def test_pack_and_put_slurm_workspace(hpc_nhr_data_transfer, path_small_workspace_data_dir, template_workflow): helper_pack_and_put_slurm_workspace( hpc_nhr_data_transfer=hpc_nhr_data_transfer, workflow_job_id=ID_WORKFLOW_JOB, workspace_id=ID_WORKSPACE, @@ -68,10 +56,9 @@ def test_pack_and_put_slurm_workspace_with_ms( def test_hpc_connector_run_batch_script(hpc_nhr_command_executor, template_workflow): - hpc_batch_script_path = join(batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB) slurm_job_id = hpc_nhr_command_executor.trigger_slurm_job( - batch_script_path=hpc_batch_script_path, workflow_job_id=ID_WORKFLOW_JOB, - nextflow_script_path=template_workflow, input_file_grp=DEFAULT_FILE_GRP, workspace_id=ID_WORKSPACE, + workflow_job_id=ID_WORKFLOW_JOB, nextflow_script_path=template_workflow, + input_file_grp=DEFAULT_FILE_GRP, workspace_id=ID_WORKSPACE, mets_basename=DEFAULT_METS_BASENAME, nf_process_forks=2, ws_pages_amount=8, use_mets_server=False, file_groups_to_remove="", cpus=2, ram=16, job_deadline_time=HPC_JOB_DEADLINE_TIME_TEST, partition=HPC_NHR_JOB_TEST_PARTITION, qos=HPC_JOB_QOS_SHORT) @@ -81,11 +68,10 @@ def test_hpc_connector_run_batch_script(hpc_nhr_command_executor, template_workf def 
 def test_hpc_connector_run_batch_script_with_ms(hpc_nhr_command_executor, template_workflow_with_ms):
-    hpc_batch_script_path = join(batch_scripts_dir, BATCH_SUBMIT_WORKFLOW_JOB)
     slurm_job_id = hpc_nhr_command_executor.trigger_slurm_job(
-        batch_script_path=hpc_batch_script_path, workflow_job_id=ID_WORKFLOW_JOB_WITH_MS,
-        nextflow_script_path=template_workflow_with_ms, input_file_grp=DEFAULT_FILE_GRP,
-        workspace_id=ID_WORKSPACE_WITH_MS, mets_basename=DEFAULT_METS_BASENAME, nf_process_forks=2, ws_pages_amount=8,
+        workflow_job_id=ID_WORKFLOW_JOB_WITH_MS, nextflow_script_path=template_workflow_with_ms,
+        input_file_grp=DEFAULT_FILE_GRP, workspace_id=ID_WORKSPACE_WITH_MS,
+        mets_basename=DEFAULT_METS_BASENAME, nf_process_forks=2, ws_pages_amount=8,
         use_mets_server=True, file_groups_to_remove="", cpus=3, ram=16,
         job_deadline_time=HPC_JOB_DEADLINE_TIME_TEST, partition=HPC_NHR_JOB_TEST_PARTITION, qos=HPC_JOB_QOS_SHORT)
     finished_successfully = hpc_nhr_command_executor.poll_till_end_slurm_job_state(

From 56ee9f1e96722d38a6195be86c5fa6c8ad587d74 Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 11:27:20 +0200
Subject: [PATCH 17/26] fix freeze issue: use EmmyPhase2, not EmmyPhase3

---
 src/utils/operandi_utils/hpc/nhr_connector.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/utils/operandi_utils/hpc/nhr_connector.py b/src/utils/operandi_utils/hpc/nhr_connector.py
index 1d14f944..ac0453dc 100644
--- a/src/utils/operandi_utils/hpc/nhr_connector.py
+++ b/src/utils/operandi_utils/hpc/nhr_connector.py
@@ -32,7 +32,7 @@ def __init__(
         self._ssh_client = None
         self._sftp_client = None
         # TODO: Make the sub cluster options selectable
-        self.project_root_dir = join(HPC_NHR_CLUSTERS["EmmyPhase3"]["scratch-emmy-hdd"], project_env)
+        self.project_root_dir = join(HPC_NHR_CLUSTERS["EmmyPhase2"]["scratch-emmy-hdd"], project_env)
         self.batch_scripts_dir = join(self.project_root_dir, "batch_scripts")
         self.slurm_workspaces_dir = join(self.project_root_dir, "slurm_workspaces")
@@ -40,7 +40,7 @@ def __init__(
     def ssh_client(self):
         if not self._ssh_client:
             # TODO: Make the sub cluster option selectable
-            self._ssh_client = self.connect_to_hpc_nhr_frontend_server(host=HPC_NHR_CLUSTERS["EmmyPhase3"]["host"])
+            self._ssh_client = self.connect_to_hpc_nhr_frontend_server(host=HPC_NHR_CLUSTERS["EmmyPhase2"]["host"])
             # self._ssh_client.get_transport().set_keepalive(30)
         return self._ssh_client
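The two TODO comments retained in the hunks above still ask for making the sub cluster selectable instead of hard-coding it. Purely as an illustrative sketch outside this patch series, and assuming a hypothetical OPERANDI_HPC_SUB_CLUSTER environment variable that the repository does not define, the selection could look roughly like this:

    # Hypothetical sketch, not part of these patches.
    from os import environ
    from os.path import join

    from operandi_utils.hpc.constants import HPC_NHR_CLUSTERS

    # Fall back to EmmyPhase2 so the freeze fix above stays the default behaviour.
    sub_cluster = environ.get("OPERANDI_HPC_SUB_CLUSTER", "EmmyPhase2")
    if sub_cluster not in HPC_NHR_CLUSTERS:
        raise ValueError(f"Unknown NHR sub cluster: {sub_cluster}")
    project_root_dir = join(
        HPC_NHR_CLUSTERS[sub_cluster]["scratch-emmy-hdd"], environ["OPERANDI_HPC_PROJECT_NAME"])
    frontend_host = HPC_NHR_CLUSTERS[sub_cluster]["host"]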
From 0fe8f518738a16fd1922530b37faeea26e32d52a Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 11:37:10 +0200
Subject: [PATCH 18/26] get rid of extra echos in batch

---
 .../hpc/batch_scripts/batch_submit_workflow_job.sh | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh
index 3ace254d..bd9d3702 100755
--- a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh
+++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh
@@ -269,19 +269,11 @@ zip_results() {
 
 # Main loop for workflow job execution
-echo "About to check existence of paths"
 check_existence_of_paths
-echo "About to unzip workflow job dir"
 unzip_workflow_job_dir
-echo "About to transfer requirements to node storage"
 transfer_requirements_to_node_storage
-echo "About to start mets server if enabled"
 start_mets_server "$USE_METS_SERVER"
-echo "About to execute nextflow workflow"
 execute_nextflow_workflow "$USE_METS_SERVER"
-echo "About to stop mets server if enabled"
 stop_mets_server "$USE_METS_SERVER"
-echo "About to remove file groups from workspace"
 remove_file_groups_from_workspace "$FILE_GROUPS_TO_REMOVE"
-echo "About to zip results"
 zip_results

From 238ec87b91ed26af58ce1c2160a61974c17a4910 Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 11:53:11 +0200
Subject: [PATCH 19/26] remove: print debugging

---
 Makefile                                     | 2 +-
 src/utils/operandi_utils/hpc/nhr_executor.py | 3 ---
 2 files changed, 1 insertion(+), 4 deletions(-)

diff --git a/Makefile b/Makefile
index ffcbc8d2..41ec20c5 100755
--- a/Makefile
+++ b/Makefile
@@ -115,7 +115,7 @@ run-tests: run-tests-utils run-tests-broker run-tests-server run-tests-harvester
 
 run-tests-utils:
 	export $(shell sed 's/=.*//' ./tests/.env)
-	pytest tests/tests_utils/test_*.py -s -v
+	pytest tests/tests_utils/test_*.py -v
 
 run-tests-broker:
 	export $(shell sed 's/=.*//' ./tests/.env)
diff --git a/src/utils/operandi_utils/hpc/nhr_executor.py b/src/utils/operandi_utils/hpc/nhr_executor.py
index de1db376..cc7589ec 100644
--- a/src/utils/operandi_utils/hpc/nhr_executor.py
+++ b/src/utils/operandi_utils/hpc/nhr_executor.py
@@ -72,9 +72,6 @@ def trigger_slurm_job(
 
         self.logger.info(f"About to execute a blocking command: {command}")
         output, err, return_code = self.execute_blocking(command)
-        print(f"Command output: {output}")
-        print(f"Command err: {err}")
-        print(f"Command return code: {return_code}")
         self.logger.info(f"Command output: {output}")
         self.logger.info(f"Command err: {err}")
         self.logger.info(f"Command return code: {return_code}")

From 989ca7e95328aaee569b67e34556bc57ddb192fa Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 12:31:36 +0200
Subject: [PATCH 20/26] clean local node temp data

---
 .../hpc/batch_scripts/batch_submit_workflow_job.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh
index bd9d3702..04ebe84c 100755
--- a/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh
+++ b/src/utils/operandi_utils/hpc/batch_scripts/batch_submit_workflow_job.sh
@@ -277,3 +277,4 @@ execute_nextflow_workflow "$USE_METS_SERVER"
 stop_mets_server "$USE_METS_SERVER"
 remove_file_groups_from_workspace "$FILE_GROUPS_TO_REMOVE"
 zip_results
+clear_data_from_computing_node

From f5660819ba7b5a8b658b1343dfa3554a3899bb8d Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 12:47:22 +0200
Subject: [PATCH 21/26] increase rabbitmq and database healthcheck timeout

---
 docker-compose.yml             | 8 ++++----
 docker-compose_image_based.yml | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/docker-compose.yml b/docker-compose.yml
index baea255e..cb93b330 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -31,8 +31,8 @@ services:
     healthcheck:
       test: rabbitmq-diagnostics check_port_connectivity
       interval: 1s
-      timeout: 3s
-      retries: 30
+      timeout: 5s
+      retries: 120
 
   operandi-mongodb:
     image: "mongo"
@@ -50,8 +50,8 @@ services:
     healthcheck:
       test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
       interval: 1s
-      timeout: 3s
-      retries: 30
+      timeout: 5s
+      retries: 120
 
   operandi-server:
     image: operandi-server
diff --git a/docker-compose_image_based.yml b/docker-compose_image_based.yml
index 20fa44cb..8d85b105 100644
--- a/docker-compose_image_based.yml
+++ b/docker-compose_image_based.yml
@@ -31,8 +31,8 @@ services:
     healthcheck:
       test: rabbitmq-diagnostics check_port_connectivity
       interval: 1s
-      timeout: 3s
-      retries: 30
+      timeout: 5s
+      retries: 120
 
   operandi-mongodb:
     image: "mongo"
@@ -50,8 +50,8 @@ services:
     healthcheck:
       test: echo 'db.runCommand("ping").ok' | mongosh localhost:27017/test --quiet
       interval: 1s
-      timeout: 3s
-      retries: 30
+      timeout: 5s
+      retries: 120
 
   operandi-server:
     image: ghcr.io/subugoe/operandi-server:main

From 41e5137b2f86823096ef29739d26c9a8c5759af0 Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 13:19:06 +0200
Subject: [PATCH 22/26] try fixing: pulling from PR not main

---
 .github/workflows/ci_cd.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci_cd.yml b/.github/workflows/ci_cd.yml
index aa599d11..f055dedb 100644
--- a/.github/workflows/ci_cd.yml
+++ b/.github/workflows/ci_cd.yml
@@ -41,7 +41,7 @@ jobs:
     - name: Checkout repository
       uses: actions/checkout@v3
       with:
-        ref: main
+        ref: ${{ github.head_ref }}
 
     - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v4

From 456debb23a48033b7b009e997226d16732e74a44 Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 13:38:46 +0200
Subject: [PATCH 23/26] enable all utils tests

---
 .../{_test_hpc_nhr_executor.py => test_hpc_nhr_executor.py} | 0
 .../{_test_hpc_nhr_transfer.py => test_hpc_nhr_transfer.py} | 0
 2 files changed, 0 insertions(+), 0 deletions(-)
 rename tests/tests_utils/{_test_hpc_nhr_executor.py => test_hpc_nhr_executor.py} (100%)
 rename tests/tests_utils/{_test_hpc_nhr_transfer.py => test_hpc_nhr_transfer.py} (100%)

diff --git a/tests/tests_utils/_test_hpc_nhr_executor.py b/tests/tests_utils/test_hpc_nhr_executor.py
similarity index 100%
rename from tests/tests_utils/_test_hpc_nhr_executor.py
rename to tests/tests_utils/test_hpc_nhr_executor.py
diff --git a/tests/tests_utils/_test_hpc_nhr_transfer.py b/tests/tests_utils/test_hpc_nhr_transfer.py
similarity index 100%
rename from tests/tests_utils/_test_hpc_nhr_transfer.py
rename to tests/tests_utils/test_hpc_nhr_transfer.py

From ca824f5423f555a24b6cb2b7eb3e0c91d8a0a9de Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 13:41:19 +0200
Subject: [PATCH 24/26] fix: avoid name collisions in tests

---
 tests/tests_utils/test_hpc_nhr_executor.py  | 2 +-
 tests/tests_utils/test_hpc_nhr_transfer.py  | 2 +-
 tests/tests_utils/test_hpc_nhr_xcombined.py | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/tests/tests_utils/test_hpc_nhr_executor.py b/tests/tests_utils/test_hpc_nhr_executor.py
index 9dd4bcfe..2ba80b1d 100644
--- a/tests/tests_utils/test_hpc_nhr_executor.py
+++ b/tests/tests_utils/test_hpc_nhr_executor.py
@@ -2,7 +2,7 @@
 from os.path import join
 from time import sleep
 
-current_time = datetime.now().strftime("%Y%m%d_%H%M")
+current_time = datetime.now().strftime("%Y%m%d_%H%M%S%f")
 
 
 def test_hpc_connector_executor_mk_dir(hpc_nhr_command_executor):
diff --git a/tests/tests_utils/test_hpc_nhr_transfer.py b/tests/tests_utils/test_hpc_nhr_transfer.py
index e30747a4..1b4be9a5 100644
--- a/tests/tests_utils/test_hpc_nhr_transfer.py
+++ b/tests/tests_utils/test_hpc_nhr_transfer.py
@@ -6,7 +6,7 @@
 from tests.constants import BATCH_SCRIPT_EMPTY
 
 OPERANDI_SERVER_BASE_DIR = environ.get("OPERANDI_SERVER_BASE_DIR")
-current_time = datetime.now().strftime("%Y%m%d_%H%M")
+current_time = datetime.now().strftime("%Y%m%d_%H%M%S%f")
 ID_WORKSPACE = f"test_folder_{current_time}"
 
 def test_hpc_connector_transfer_file(hpc_nhr_data_transfer, path_batch_script_empty):
diff --git a/tests/tests_utils/test_hpc_nhr_xcombined.py b/tests/tests_utils/test_hpc_nhr_xcombined.py
index 7dfd5d1b..e062cd50 100644
--- a/tests/tests_utils/test_hpc_nhr_xcombined.py
+++ b/tests/tests_utils/test_hpc_nhr_xcombined.py
@@ -13,7 +13,7 @@
 
 OPERANDI_SERVER_BASE_DIR = environ.get("OPERANDI_SERVER_BASE_DIR")
 
-current_time = datetime.now().strftime("%Y%m%d_%H%M")
+current_time = datetime.now().strftime("%Y%m%d_%H%M%S%f")
 ID_WORKFLOW_JOB_WITH_MS = f"test_wf_job_ms_{current_time}"
 ID_WORKSPACE_WITH_MS = f"test_ws_ms_{current_time}"
 ID_WORKFLOW_JOB = f"test_wf_job_{current_time}"
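The collision fixed above comes from the minute-level resolution of the old format: two utils test runs started within the same minute produced identical workspace and job identifiers. Appending %S%f extends the stamp by seconds and microseconds. A small standalone illustration (not taken from the repository):

    from datetime import datetime

    stamp = datetime(2024, 8, 27, 13, 41, 19, 123456)
    # old format: shared by every run within the same minute
    print(stamp.strftime("%Y%m%d_%H%M"))      # 20240827_1341
    # new format: unique down to the microsecond
    print(stamp.strftime("%Y%m%d_%H%M%S%f"))  # 20240827_134119123456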
From 177188773f96ae119e4d8218781e294a3ad345a4 Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 14:03:58 +0200
Subject: [PATCH 25/26] utilize: force command to check job status

---
 .../batch_scripts/wrapper_check_workflow_job_status.sh | 6 ++++++
 src/utils/operandi_utils/hpc/constants.py              | 4 +++-
 src/utils/operandi_utils/hpc/nhr_executor.py            | 9 ++++-----
 3 files changed, 13 insertions(+), 6 deletions(-)
 create mode 100644 src/utils/operandi_utils/hpc/batch_scripts/wrapper_check_workflow_job_status.sh

diff --git a/src/utils/operandi_utils/hpc/batch_scripts/wrapper_check_workflow_job_status.sh b/src/utils/operandi_utils/hpc/batch_scripts/wrapper_check_workflow_job_status.sh
new file mode 100644
index 00000000..ad40812a
--- /dev/null
+++ b/src/utils/operandi_utils/hpc/batch_scripts/wrapper_check_workflow_job_status.sh
@@ -0,0 +1,6 @@
+#!/bin/bash
+
+# $0 - This bash script
+# $1 - Slurm job id
+
+sacct -j "$1" --format=jobid,state,exitcode
diff --git a/src/utils/operandi_utils/hpc/constants.py b/src/utils/operandi_utils/hpc/constants.py
index 024202ef..5e0210d4 100644
--- a/src/utils/operandi_utils/hpc/constants.py
+++ b/src/utils/operandi_utils/hpc/constants.py
@@ -13,7 +13,8 @@
     "HPC_SSH_CONNECTION_TRY_TIMES",
     "HPC_NHR_PROJECT",
     "HPC_NHR_CLUSTERS",
-    "HPC_WRAPPER_SUBMIT_WORKFLOW_JOB"
+    "HPC_WRAPPER_SUBMIT_WORKFLOW_JOB",
+    "HPC_WRAPPER_CHECK_WORKFLOW_JOB_STATUS"
 ]
 
 HPC_NHR_PROJECT: str = "project_pwieder_ocr_nhr"
@@ -62,6 +63,7 @@
 # TODO: Fix the constant file name - it should be automatically resolved
 HPC_BATCH_SUBMIT_WORKFLOW_JOB = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/batch_submit_workflow_job.sh"
 HPC_WRAPPER_SUBMIT_WORKFLOW_JOB = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/wrapper_submit_workflow_job.sh"
+HPC_WRAPPER_CHECK_WORKFLOW_JOB_STATUS = f"{HPC_NHR_SCRATCH_EMMY_HDD}/{HPC_DIR_BATCH_SCRIPTS}/wrapper_check_workflow_job_status.sh"
 
 HPC_JOB_DEADLINE_TIME_REGULAR = "48:00:00"
 HPC_JOB_DEADLINE_TIME_TEST = "0:30:00"
diff --git a/src/utils/operandi_utils/hpc/nhr_executor.py b/src/utils/operandi_utils/hpc/nhr_executor.py
index cc7589ec..50a1b136 100644
--- a/src/utils/operandi_utils/hpc/nhr_executor.py
+++ b/src/utils/operandi_utils/hpc/nhr_executor.py
@@ -4,7 +4,7 @@
 from operandi_utils.constants import StateJobSlurm
 from .constants import (
     HPC_JOB_DEADLINE_TIME_TEST, HPC_JOB_QOS_DEFAULT, HPC_NHR_JOB_DEFAULT_PARTITION, HPC_BATCH_SUBMIT_WORKFLOW_JOB,
-    HPC_WRAPPER_SUBMIT_WORKFLOW_JOB
+    HPC_WRAPPER_SUBMIT_WORKFLOW_JOB, HPC_WRAPPER_CHECK_WORKFLOW_JOB_STATUS
 )
 from .nhr_connector import NHRConnector
@@ -70,7 +70,7 @@ def trigger_slurm_job(
         command += f" {use_mets_server_bash_flag}"
         command += f" {file_groups_to_remove}"
 
-        self.logger.info(f"About to execute a blocking command: {command}")
+        self.logger.info(f"About to execute a force command: {command}")
         output, err, return_code = self.execute_blocking(command)
         self.logger.info(f"Command output: {output}")
         self.logger.info(f"Command err: {err}")
@@ -81,12 +81,11 @@ def trigger_slurm_job(
         return slurm_job_id
 
     def check_slurm_job_state(self, slurm_job_id: str, tries: int = 10, wait_time: int = 2) -> str:
-        # TODO: Use force command here
-        command = f"bash -lc 'sacct -j {slurm_job_id} --format=jobid,state,exitcode'"
+        command = f"{HPC_WRAPPER_CHECK_WORKFLOW_JOB_STATUS} {slurm_job_id}"
         slurm_job_state = None
 
         while not slurm_job_state and tries > 0:
-            self.logger.info(f"About to execute a blocking command: {command}")
+            self.logger.info(f"About to execute a force command: {command}")
             output, err, return_code = self.execute_blocking(command)
             self.logger.info(f"Command output: {output}")
             self.logger.info(f"Command err: {err}")
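The new wrapper script only prints the sacct table for the given job id, so check_slurm_job_state() still has to pull a state string out of that text. As a rough, hypothetical illustration of such parsing - the actual logic in nhr_executor.py is not part of this hunk and may differ:

    def parse_sacct_state(sacct_output: str, slurm_job_id: str) -> str:
        # sacct prints a header plus one row per job or job step,
        # e.g. "1234567  COMPLETED  0:0"; only the row whose first
        # column equals the plain job id is of interest here.
        for line in sacct_output.splitlines():
            fields = line.split()
            if fields and fields[0] == slurm_job_id:
                return fields[1]
        return ""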
From 2ee8edbd26ce42f4e0b5e455f09bfcdfd6982bbc Mon Sep 17 00:00:00 2001
From: Mehmed Mustafa
Date: Tue, 27 Aug 2024 16:42:21 +0200
Subject: [PATCH 26/26] release: v2.15.0

---
 src/utils/setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/utils/setup.py b/src/utils/setup.py
index 23b5100b..4e6fe527 100644
--- a/src/utils/setup.py
+++ b/src/utils/setup.py
@@ -5,7 +5,7 @@
 
 setup(
     name='operandi_utils',
-    version='2.14.0',
+    version='2.15.0',
     description='OPERANDI - Utils',
     long_description=open('README.md').read(),
     long_description_content_type='text/markdown',