diff --git a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh index 47c1ac2cd2..9a3a590023 100644 --- a/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh +++ b/community/front-end/ofe/infrastructure_files/gcs_bucket/webserver/startup.sh @@ -57,6 +57,14 @@ chmod +x /usr/local/bin/yq curl --silent --show-error --location https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz --output /tmp/shellcheck.tar.xz tar xfa /tmp/shellcheck.tar.xz --strip=1 --directory /usr/local/bin +# Packages for https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_enable_cleanup_compute +pip3.8 install google-api-python-client \ + google-cloud-secret-manager \ + google.cloud.pubsub \ + pyyaml addict httplib2 + +# Set Python3.8 as default Python3 +echo '2' | update-alternatives --config python3 # Download configuration file # gsutil cp "gs://${config_bucket}/webserver/config" /tmp/config diff --git a/community/front-end/ofe/script/service_account.sh b/community/front-end/ofe/script/service_account.sh index e92f72f4d2..53b1a6a2d9 100755 --- a/community/front-end/ofe/script/service_account.sh +++ b/community/front-end/ofe/script/service_account.sh @@ -59,7 +59,8 @@ SA_ROLES=('aiplatform.admin' 'iam.serviceAccountUser' 'notebooks.admin' 'resourcemanager.projectIamAdmin' - 'monitoring.viewer') + 'monitoring.viewer' + 'pubsub.admin') # # diff --git a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py index dec018ba14..31ed09cf31 100644 --- a/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py +++ b/community/front-end/ofe/website/ghpcfe/cluster_manager/clusterinfo.py @@ -217,7 +217,8 @@ def _prepare_ghpc_partitions(self, part_uses): settings: enable_smt: {part.enable_hyperthreads} machine_type: {part.machine_type} - node_count_dynamic_max: {part.max_node_count} + node_count_dynamic_max: {part.dynamic_node_count} + node_count_static: {part.static_node_count} {instance_image_yaml} """ ) @@ -342,6 +343,14 @@ def _prepare_ghpc_yaml(self): kind: terraform id: slurm_controller settings: + enable_cleanup_compute: True + enable_cleanup_subscriptions: True + cloud_parameters: + resume_rate: 0 + resume_timeout: 500 + suspend_rate: 0 + suspend_timeout: 300 + no_comma_params: false machine_type: {self.cluster.controller_instance_type} disk_type: {self.cluster.controller_disk_type} disk_size_gb: {self.cluster.controller_disk_size} diff --git a/community/front-end/ofe/website/ghpcfe/forms.py b/community/front-end/ofe/website/ghpcfe/forms.py index 07f6e00fbe..9b6d4f68aa 100644 --- a/community/front-end/ofe/website/ghpcfe/forms.py +++ b/community/front-end/ofe/website/ghpcfe/forms.py @@ -244,7 +244,8 @@ class Meta: "name", "machine_type", "image", - "max_node_count", + "dynamic_node_count", + "static_node_count", "enable_placement", "enable_hyperthreads", "enable_node_reuse", diff --git a/community/front-end/ofe/website/ghpcfe/models.py b/community/front-end/ofe/website/ghpcfe/models.py index 977136a1b8..c5b544e234 100644 --- a/community/front-end/ofe/website/ghpcfe/models.py +++ b/community/front-end/ofe/website/ghpcfe/models.py @@ -885,11 +885,16 @@ class ClusterPartition(models.Model): default=None, on_delete=models.SET_NULL, ) - max_node_count = models.PositiveIntegerField( - validators=[MinValueValidator(1)], - help_text="The maximum number of nodes in the partition", + dynamic_node_count = models.PositiveIntegerField( + validators=[MinValueValidator(0)], + help_text="The maximum number of dynamic nodes in the partition", default=2, ) + static_node_count = models.PositiveIntegerField( + validators=[MinValueValidator(0)], + help_text="The number of statically created nodes in the partition", + default=0, + ) enable_placement = models.BooleanField( default=True, help_text=( diff --git a/community/front-end/ofe/website/ghpcfe/templates/cluster/detail.html b/community/front-end/ofe/website/ghpcfe/templates/cluster/detail.html index a771936ad2..fc1e50b778 100644 --- a/community/front-end/ofe/website/ghpcfe/templates/cluster/detail.html +++ b/community/front-end/ofe/website/ghpcfe/templates/cluster/detail.html @@ -93,7 +93,8 @@

Cluster Detail

vCPUs per Node GPU Type GPUs per Node - Maximum Instances + Dynamic Instances + Static Instances {% for part in object.partitions.all %} @@ -102,7 +103,8 @@

Cluster Detail

{{ part.vCPU_per_node }} {% if part.GPU_per_node > 0 %}{{ part.GPU_type }}{% else %}-{% endif %} {% if part.GPU_per_node > 0 %}{{ part.GPU_per_node }}{% else %}-{% endif %} - {{ part.max_node_count }} + {{ part.dynamic_node_count }} + {{ part.static_node_count }} {% endfor %} diff --git a/community/front-end/ofe/website/ghpcfe/views/clusters.py b/community/front-end/ofe/website/ghpcfe/views/clusters.py index f68c52cd78..12a06df794 100644 --- a/community/front-end/ofe/website/ghpcfe/views/clusters.py +++ b/community/front-end/ofe/website/ghpcfe/views/clusters.py @@ -230,7 +230,7 @@ def form_valid(self, form): **{ "name": "batch", "machine_type": self.find_default_instance_type(), - "max_node_count": 4, + "dynamic_node_count": 4, "vCPU_per_node": self.find_default_instance_type_vcpus(), } )