Skip to content

Commit

Permalink
implement: reconnect times for clients
Browse files Browse the repository at this point in the history
  • Loading branch information
MehmedGIT committed Aug 28, 2024
1 parent 80d9006 commit e0a41fb
Show file tree
Hide file tree
Showing 2 changed files with 18 additions and 8 deletions.
13 changes: 9 additions & 4 deletions src/utils/operandi_utils/hpc/nhr_connector.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ def __init__(
self.check_keyfile_existence(key_path=self.key_path)
self.logger.debug(f"Retrieving hpc frontend server private key file from path: {self.key_path}")
self._ssh_client = None
self._ssh_reconnect_tries = 5
self._ssh_reconnect_tries_remaining = self._ssh_reconnect_tries
# TODO: Make the sub cluster options selectable
self.project_root_dir = join(HPC_NHR_CLUSTERS["EmmyPhase2"]["scratch-emmy-hdd"], project_env)
self.batch_scripts_dir = join(self.project_root_dir, "batch_scripts")
Expand All @@ -44,14 +46,17 @@ def ssh_client(self):
# Note: This extra check is required against aggressive
# Firewalls that ignore the keepalive option!
self._ssh_client.get_transport().send_ignore()
self._ssh_reconnect_tries_remaining = self._ssh_reconnect_tries
except Exception as error:
self.logger.warning(f"SSH client error: {error}")
self.logger.warning(f"SSH client failed to send ignore, connection is broken: {error}")
if self._ssh_client:
self._ssh_client.close()
self._ssh_client = None
self.logger.info(f"Reconnecting the SSH client")
self._ssh_client = self.connect_to_hpc_nhr_frontend_server(host=HPC_NHR_CLUSTERS["EmmyPhase2"]["host"])
self._ssh_client.get_transport().set_keepalive(30)
if self._ssh_reconnect_tries_remaining < 0:
raise Exception(f"Failed to reconnect {self._ssh_reconnect_tries} times: {error}")
self.logger.info(f"Reconnecting the SSH client, try times: {self._ssh_reconnect_tries_remaining}")
self._ssh_reconnect_tries_remaining -= 1
return self.ssh_client # recursive call to itself to try again
return self._ssh_client

@staticmethod
Expand Down
13 changes: 9 additions & 4 deletions src/utils/operandi_utils/hpc/nhr_transfer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@ def __init__(self) -> None:
logger = getLogger(name=self.__class__.__name__)
super().__init__(logger)
self._sftp_client = None
self._sftp_reconnect_tries = 5
self._sftp_reconnect_tries_remaining = self._sftp_reconnect_tries

@property
def sftp_client(self):
Expand All @@ -26,14 +28,17 @@ def sftp_client(self):
# Note: This extra check is required against aggressive
# Firewalls that ignore the keepalive option!
self._sftp_client.get_channel().get_transport().send_ignore()
self._sftp_reconnect_tries_remaining = self._sftp_reconnect_tries
except Exception as error:
self.logger.warning(f"SFTP client error: {error}")
self.logger.warning(f"SFTP client failed to send ignore, connection is broken: {error}")
if self._sftp_client:
self._sftp_client.close()
self._sftp_client = None
self.logger.info(f"Reconnecting the SFTP client")
self._sftp_client = self.ssh_client.open_sftp()
self._sftp_client.get_channel().get_transport().set_keepalive(30)
if self._sftp_reconnect_tries_remaining < 0:
raise Exception(f"Failed to reconnect {self._sftp_reconnect_tries} times: {error}")
self.logger.info(f"Reconnecting the SFTP client, try times: {self._sftp_reconnect_tries_remaining}")
self._sftp_reconnect_tries_remaining -= 1
return self.sftp_client # recursive call to itself to try again
return self._sftp_client

def create_slurm_workspace_zip(
Expand Down

0 comments on commit e0a41fb

Please sign in to comment.