Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Static nodes in cluster partition via OFE #1738

Merged
merged 7 commits into from
Sep 6, 2023
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,14 @@ chmod +x /usr/local/bin/yq
curl --silent --show-error --location https://github.com/koalaman/shellcheck/releases/download/stable/shellcheck-stable.linux.x86_64.tar.xz --output /tmp/shellcheck.tar.xz
tar xfa /tmp/shellcheck.tar.xz --strip=1 --directory /usr/local/bin

# Python packages required by the Slurm controller's enable_cleanup_compute option; see
# https://github.com/GoogleCloudPlatform/hpc-toolkit/tree/main/community/modules/scheduler/schedmd-slurm-gcp-v5-controller#input_enable_cleanup_compute
pip3.8 install google-api-python-client \
google-cloud-secret-manager \
google.cloud.pubsub \
pyyaml addict httplib2

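# Set Python 3.8 as the default python3 by answering the interactive
# update-alternatives prompt with menu entry 2 (assumes entry 2 is Python 3.8)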
echo '2' | update-alternatives --config python3
# Download configuration file
#
gsutil cp "gs://${config_bucket}/webserver/config" /tmp/config
Expand Down
3 changes: 2 additions & 1 deletion community/front-end/ofe/script/service_account.sh
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ SA_ROLES=('aiplatform.admin'
'iam.serviceAccountUser'
'notebooks.admin'
'resourcemanager.projectIamAdmin'
'monitoring.viewer')
'monitoring.viewer'
'pubsub.admin')

#
#
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,8 @@ def _prepare_ghpc_partitions(self, part_uses):
settings:
enable_smt: {part.enable_hyperthreads}
machine_type: {part.machine_type}
node_count_dynamic_max: {part.max_node_count}
node_count_dynamic_max: {part.dynamic_node_count}
node_count_static: {part.static_node_count}
{instance_image_yaml}
"""
)
Expand Down Expand Up @@ -342,6 +343,14 @@ def _prepare_ghpc_yaml(self):
kind: terraform
id: slurm_controller
settings:
enable_cleanup_compute: True
enable_cleanup_subscriptions: True
cloud_parameters:
resume_rate: 0
resume_timeout: 500
suspend_rate: 0
suspend_timeout: 300
no_comma_params: false
machine_type: {self.cluster.controller_instance_type}
disk_type: {self.cluster.controller_disk_type}
disk_size_gb: {self.cluster.controller_disk_size}
Expand Down
3 changes: 2 additions & 1 deletion community/front-end/ofe/website/ghpcfe/forms.py
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,8 @@ class Meta:
"name",
"machine_type",
"image",
"max_node_count",
"dynamic_node_count",
"static_node_count",
"enable_placement",
"enable_hyperthreads",
"enable_node_reuse",
Expand Down
11 changes: 8 additions & 3 deletions community/front-end/ofe/website/ghpcfe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -885,11 +885,16 @@ class ClusterPartition(models.Model):
default=None,
on_delete=models.SET_NULL,
)
max_node_count = models.PositiveIntegerField(
validators=[MinValueValidator(1)],
help_text="The maximum number of nodes in the partition",
dynamic_node_count = models.PositiveIntegerField(
validators=[MinValueValidator(0)],
help_text="The maximum number of dynamic nodes in the partition",
default=2,
)
static_node_count = models.PositiveIntegerField(
validators=[MinValueValidator(0)],
help_text="The number of statically created nodes in the partition",
default=0,
)
enable_placement = models.BooleanField(
default=True,
help_text=(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,8 @@ <h2>Cluster Detail</h2>
<th>vCPUs per Node</th>
<th>GPU Type</th>
<th>GPUs per Node</th>
<th>Maximum Instances</th>
<th>Dynamic Instances</th>
<th>Static Instances</th>
</tr>
{% for part in object.partitions.all %}
<tr>
Expand All @@ -102,7 +103,8 @@ <h2>Cluster Detail</h2>
<td>{{ part.vCPU_per_node }}</td>
<td>{% if part.GPU_per_node > 0 %}{{ part.GPU_type }}{% else %}-{% endif %}</td>
<td>{% if part.GPU_per_node > 0 %}{{ part.GPU_per_node }}{% else %}-{% endif %}</td>
<td>{{ part.max_node_count }}</td>
<td>{{ part.dynamic_node_count }}</td>
<td>{{ part.static_node_count }}</td>
</tr>
{% endfor %}
</table>
Expand Down
2 changes: 1 addition & 1 deletion community/front-end/ofe/website/ghpcfe/views/clusters.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def form_valid(self, form):
**{
"name": "batch",
"machine_type": self.find_default_instance_type(),
"max_node_count": 4,
"dynamic_node_count": 4,
"vCPU_per_node": self.find_default_instance_type_vcpus(),
}
)
Expand Down
Loading