From 9cf940e582d039a5dc369f5256b879235cb331da Mon Sep 17 00:00:00 2001 From: Rafael Sarmiento Date: Tue, 26 Nov 2024 17:37:01 +0100 Subject: [PATCH] improve error handling --- chart/f7t4jhub/files/jupyterhub-config.py | 2 +- chart/values.yaml | 147 +++++++++++++++------- firecrestspawner/spawner.py | 69 ++++++---- tests/test_spawner.py | 2 +- 4 files changed, 153 insertions(+), 67 deletions(-) diff --git a/chart/f7t4jhub/files/jupyterhub-config.py b/chart/f7t4jhub/files/jupyterhub-config.py index 121d311..bb076e8 100644 --- a/chart/f7t4jhub/files/jupyterhub-config.py +++ b/chart/f7t4jhub/files/jupyterhub-config.py @@ -86,7 +86,7 @@ async def get_node_ip_from_output(spawner): c.JupyterHub.hub_connect_ip = socket.gethostbyname(hostname) c.JupyterHub.spawner_class = 'firecrestspawner.spawner.SlurmSpawner' -c.Spawner.enable_aux_fc_client = {{ .Values.serviceAccount.enabled | toJson | replace "true" "True" | replace "false" "False" }} +c.Spawner.polling_with_service_account = {{ .Values.serviceAccount.enabled | toJson | replace "true" "True" | replace "false" "False" }} c.Spawner.req_host = '{{ .Values.config.spawner.host }}' c.Spawner.node_name_template = '{{ .Values.config.spawner.nodeNameTemplate }}' c.Spawner.req_partition = '{{ .Values.config.spawner.partition }}' diff --git a/chart/values.yaml b/chart/values.yaml index 4e1a11c..a2dedaf 100644 --- a/chart/values.yaml +++ b/chart/values.yaml @@ -1,7 +1,9 @@ reloader: + # This section configures the reloader settings. + # In general this is kept unchanged from one deployment to another. reloader: # Set to true to enable the reloader for automatically restarting pods on ConfigMap/Secret changes. - enabled: false + enabled: true # If true, the reloader will watch ConfigMaps and Secrets in all namespaces, not just its own. watchGlobally: false @@ -16,51 +18,54 @@ reloader: # Ensures the reloader container's filesystem is mounted as read-only to enhance security.
securityContext: readOnlyRootFilesystem: true + allowPrivilegeEscalation: false seccompProfile: type: RuntimeDefault f7t4jhub: + # This section configures the deployment of JupyterHub and the proxy setup: - # URL for the Firecrest service (replace with your own Firecrest URL) - firecrestUrl: "https://firecrest.example.com" + # URL for the Firecrest service + # Replace with the URL of the FirecREST service targeting your cluster + firecrestUrl: "https://firecrest.cscs.ch" # URL to obtain an auth token from your identity provider (replace with your own token URL) - authTokenUrl: "https://auth.example.com/auth/realms/yourrealm/protocol/openid-connect/token" + authTokenUrl: "https://auth.cscs.ch/auth/realms//protocol/openid-connect/token" proxy: # Image for the configurable HTTP proxy - image: 'quay.io/jupyterhub/configurable-http-proxy:4.6.1' + image: 'ghcr.io/eth-cscs/chp:4.6.2' hub: - # Image for the JupyterHub application (replace with your own JupyterHub image) - image: 'ghcr.io/eth-cscs/f7t4jhub:4.1.5' + # Image for the JupyterHub application + image: 'ghcr.io/eth-cscs/f7t4jhub:4.1.6' - # Set log level to logging.DEBUG + # Set JupyterHub's log level to logging.DEBUG debug: false - reloader: - # Enable or disable reloader integration - enabled: false - vault: # URL for the Vault service (replace with your own Vault URL) - url: 'https://vault.example.com' + url: 'https://vault.example.cscs.ch' # Secret engine used in Vault (replace with your own secret engine) - secretEngine: 'secret-engine' + secretEngine: 'jupyterhub' # Role ID for accessing Vault secrets (replace with your own role ID) - roleId: 'role-id' + roleId: '' - # keycloack credentials + # Credentials for both the Keycloak Authorization Code Flow client, which is + # used to manage the access to JupyterHub as well as the authentication with + # FirecREST and the Client Credentials client, which can be used as service + # account for job status polling keycloak: # Enable or disable Vault integration - 
enabled: false + enabled: true # Secret path in Vault (replace with your own secret path) secretPath: 'secret/path/keycloack' - # container registry credentials + # Container registry credentials + # This can be used for docker container registries requiring authentication containerRegistry: # Enable or disable Vault integration enabled: false @@ -68,28 +73,32 @@ f7t4jhub: # Secret path in Vault (replace with your own secret path) secretPath: 'secret/path/containers' - # proxy authentication token + # Proxy-Hub authentication token + # This is used in JupyterHub to secure communication between the hub and + # the proxy configProxyAuthToken: # Enable or disable Vault integration - enabled: false + enabled: true # Secret path in Vault (replace with your own secret path) secretPath: 'secret/path/proxy' - # service account for polling jobs + # Service account for polling jobs serviceAccount: # Enable or disable service account for polling jobs + # If enabled, the client's id and secret for the service accounts + # are accessed with the same secrets used for keycloak enabled: true # URL to obtain an auth token from your identity provider (replace with the SA's token URL) - authTokenUrl: 'https://auth-sa.example.com/auth/realms/yourrealm/protocol/openid-connect/token' + authTokenUrl: 'https://auth.cscs.ch/auth/realms//protocol/openid-connect/token' metricbeat: # Enable or disable annotations for metric beat monitoring enabled: false # Allow or deny access to /hub/metrics - deny_metrics_endpoint: false + deny_metrics_endpoint: true network: # Ports configuration for the application @@ -98,30 +107,30 @@ f7t4jhub: externalPort: 8081 config: - # Common name for the JupyterHub instance (replace with your own domain) - commonName: 'jupyterhub.example.com' + # URL for the JupyterHub instance (replace with your own domain) + commonName: 'jupyterhub-.cscs.ch' # Admin users for the JupyterHub instance (replace with your own admin users) - adminUsers: "{'adminuser'}" + adminUsers:
"{''}" # Default URL for the hub hubDefaultUrl: '/hub/home' auth: # OAuth callback URL (replace with your own callback URL) - oauthCallbackUrl: "https://jupyterhub.example.com/hub/oauth_callback" + oauthCallbackUrl: "https://jupyterhub-.cscs.ch/hub/oauth_callback" # Authorization URL for your identity provider (replace with your own authorize URL) - authorizeUrl: "https://auth.example.com/auth/realms/yourrealm/protocol/openid-connect/auth" + authorizeUrl: "https://auth.cscs.ch/auth/realms//protocol/openid-connect/auth" # Token URL for your identity provider (replace with your own token URL) - tokenUrl: "https://auth.example.com/auth/realms/yourrealm/protocol/openid-connect/token" + tokenUrl: "https://auth.cscs.ch/auth/realms//protocol/openid-connect/token" # User data URL for your identity provider (replace with your own user info URL) - userDataUrl: "https://auth.example.com/auth/realms/yourrealm/protocol/openid-connect/userinfo" + userDataUrl: "https://auth.cscs.ch/auth/realms//protocol/openid-connect/userinfo" # Login service URL (replace with your own login service URL) - loginService: "https://auth.example.com" + loginService: "https://auth.cscs.ch" # Key for the username field in the user data response userNameKey: "preferred_username" @@ -130,45 +139,95 @@ f7t4jhub: userDataParams: "{'state': 'state'}" # Scopes for the authentication request (customize as needed) - scope: "['openid', 'profile', 'customscope']" + scope: "['openid', 'profile', 'firecrest']" spawner: - # Host for the spawner (replace with your own host) - host: 'dom' + # Name of the cluster where the notebooks are going to be launched + host: '' # Port for the single-user server.
Set to 0 to use random port port: 57001 - # Node name template (replace with your own node name template) - nodeNameTemplate: '{}.example.com' + # Node name template for the cluster (replace with your own node name template) + nodeNameTemplate: '{}.example.cscs.ch' - # Job name for the spawner (customize as needed) + # Name of the job that runs the notebook server jobName: 'spawner-jupyterhub' - # Partition for the job scheduler (customize as needed) - partition: 'slurm_partition' + # Name of the partition of the job scheduler (e.g. normal, debug, long) + partition: '' - # Constraint for the job scheduler (customize as needed) - constraint: 'slurm_constraint' + # Constraint for the job scheduler (e.g. gpu, mc, nvgpu) + constraint: '' - # Command to run srun (customize as needed) + # Command to run srun + # Can be left as empty string, meaning that the notebook server will run + # on the master node without using srun + # + # Can be used to pass command line options to slurm such as + # `srun --interactive` srun: '' # Command to start the JupyterHub single-user server (customize as needed) + # In general the command must be `firecrestspawner-singleuser jupyterhub-singleuser` + # but sometimes customization is needed. For instance, to run something before the + # command, like when using a uenv: + # 'bash -c ". /user-environment/env/default/activate.sh && firecrestspawner-singleuser jupyterhub-singleuser"' cmd: 'firecrestspawner-singleuser jupyterhub-singleuser' # Pre-launch commands (customize as needed) + # This is to add logic before the `srun` line + # It can be used to set up environment variables for instance prelaunchCmds: '' # Virtual environment setup (customize as needed) vclusterEnv: '.
/path/to/venv/bin/activate' # Custom state get host function (customize as needed) + # This is used if it's not possible to set a `nodeNameTemplate` + # One can pass here the name of a function, defined in the configuration, + # to fetch the DNS name or IP of the compute node where the + # notebooks will run customStateGetHost: None - # Literal python code to add at the end of jupyterhub's configuration + # JupyterHub options form + # It allows users to customize their environment + # (e.g., selecting resources, environment type, or packages) + # before launching their Jupyter notebook server + # The settings here are reflected on the batch script + # that submits the JupyterLab job optionsForm: | +
+
+
+ + +
+
+
+
+ + +
+
+
+
+ + +
+
# Literal python code to add at the end of jupyterhub's configuration extraConfig: | - # ... + c.JupyterHub.authenticate_prometheus = False diff --git a/firecrestspawner/spawner.py b/firecrestspawner/spawner.py index dc252f4..e2d8555 100644 --- a/firecrestspawner/spawner.py +++ b/firecrestspawner/spawner.py @@ -11,6 +11,7 @@ import hostlist import httpx import inspect +import json import jupyterhub import os import pwd @@ -23,6 +24,7 @@ from jinja2 import Template from jupyterhub.spawner import Spawner from time import sleep +from tornado.web import HTTPError from traitlets import Any, Bool, Integer, Unicode, Float, default from typing import AsyncGenerator, Optional @@ -105,7 +107,20 @@ def get_access_token(self) -> Optional[str]: response = requests.post(self.token_url, data=params, headers=headers) if response.status_code != 200: - return None + # if the refresh token is expired, Keycloak returns + # + # HTTP 400: Bad Request + # ({ + # "error": "invalid_grant", + # "error_description": "Session not active" + # }) + # + err = HTTPError(response.status_code, json.dumps(response.json())) + err.html_message = ( + "The credentials for spawning a new job have expired.
" + "Please log out and log back in." + ) + raise err json_response = response.json() self.refresh_token = json_response["refresh_token"] @@ -227,10 +242,9 @@ def _req_username_default(self): "needs specification.", ).tag(config=True) - enable_aux_fc_client = Bool( + polling_with_service_account = Bool( True, - help="If ``True``, use an auxiliary client to poll when client " - "credentials are expired.", + help="If ``True``, use a service account client for job polling.", ).tag(config=True) # Raw output of job submission command unless overridden @@ -259,12 +273,22 @@ async def get_firecrest_client(self): Flow method""" auth_state = await self.user.get_auth_state() - auth = AuthorizationCodeFlowAuth( - client_id=self.user.authenticator.client_id, - client_secret=self.user.authenticator.client_secret, - refresh_token=auth_state["refresh_token"], - token_url=self.user.authenticator.token_url, - ) + try: + auth = AuthorizationCodeFlowAuth( + client_id=self.user.authenticator.client_id, + client_secret=self.user.authenticator.client_secret, + refresh_token=auth_state["refresh_token"], + token_url=self.user.authenticator.token_url, + ) + except TypeError as e: + # If `auth_state` is None, then `auth_state["refresh_token"]` can + # throw a `TypeError` + # That can happen in some case where jupyterhub starts from scratch + # after the user has input the login and password + err = HTTPError(401, f"{e}") + err.html_message = ("Please log out and " + "log back in to refresh the credentials.") + raise err client = firecrest.AsyncFirecrest( firecrest_url=self.firecrest_url, authorization=auth @@ -315,7 +339,7 @@ async def get_firecrest_client_service_account(self): async def firecrest_poll(self): """Helper function to poll jobs.""" - if self.enable_aux_fc_client: + if self.polling_with_service_account: client = await self.get_firecrest_client_service_account() else: client = await self.get_firecrest_client() @@ -366,7 +390,7 @@ async def submit_batch_script(self): script = 
await self._get_batch_script(**subvars) self.log.info("Spawner submitting job using firecREST") - self.log.info("Spawner submitted script:\n" + script) + self.log.info(f"Spawner submitted script:\n{script}") try: client = await self.get_firecrest_client() @@ -375,7 +399,7 @@ async def submit_batch_script(self): self.host, script_str=script, env_vars=job_env ) self.log.debug(f"[client.submit] {self.job}") - self.job_id = str(self.job["jobid"]) + self.job_id = f"{self.job['jobid']}" self.log.info(f"Job {self.job_id} submitted") # In case the connection to the firecrest server timesout # catch httpx.ConnectTimeout since httpx.ConnectTimeout @@ -435,7 +459,6 @@ async def cancel_batch_job(self) -> None: def load_state(self, state) -> None: """Load ``job_id`` from state""" - super(FirecRESTSpawnerBase, self).load_state(state) self.job_id = state.get("job_id", "") self.job_status = state.get("job_status", "") @@ -489,13 +512,17 @@ async def poll(self) -> Optional[int]: if class_name == "HomeHandler": auth_state = await self.user.get_auth_state() - auth = AuthorizationCodeFlowAuth( - client_id=self.user.authenticator.client_id, - client_secret=self.user.authenticator.client_secret, - refresh_token=auth_state["refresh_token"], - token_url=self.user.authenticator.token_url, - ) - self.access_token_is_valid = bool(auth.get_access_token()) + try: + auth = AuthorizationCodeFlowAuth( + client_id=self.user.authenticator.client_id, + client_secret=self.user.authenticator.client_secret, + refresh_token=auth_state["refresh_token"], + token_url=self.user.authenticator.token_url, + ) + self.access_token_is_valid = bool(auth.get_access_token()) + except HTTPError: + self.log.info("Credentials expired.") + self.access_token_is_valid = False status = await self.query_job_status() if status in (JobStatus.PENDING, JobStatus.RUNNING, JobStatus.UNKNOWN): diff --git a/tests/test_spawner.py b/tests/test_spawner.py index 24673de..3106ad9 100644 --- a/tests/test_spawner.py +++ 
b/tests/test_spawner.py @@ -58,7 +58,7 @@ def new_spawner(db, spawner_class=SlurmSpawner, **kwargs): req_host="cluster1", port=testport, node_name_template="{}.cluster1.ch", - enable_aux_fc_client=False, + polling_with_service_account=False, ) return _spawner