Skip to content

Commit

Permalink
Merge branch 'develop' into template_pstore_mount
Browse files Browse the repository at this point in the history
  • Loading branch information
harshthakkar01 authored Dec 21, 2024
2 parents 850c370 + 66804e7 commit 8e83ef6
Show file tree
Hide file tree
Showing 10 changed files with 182 additions and 38 deletions.
16 changes: 8 additions & 8 deletions cmd/create.go
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ func printAdvancedInstructionsMessage(deplDir string) {
logging.Info("Find instructions for cleanly destroying infrastructure and advanced manual")
logging.Info("deployment instructions at:")
logging.Info("")
logging.Info(modulewriter.InstructionsPath(deplDir))
logging.Info("%s", modulewriter.InstructionsPath(deplDir))
}

// TODO: move to expand.go
Expand Down Expand Up @@ -135,10 +135,10 @@ func v5DeprecationWarning(bp config.Blueprint) {
alreadyContainsV5 := false
bp.WalkModulesSafe(func(mp config.ModulePath, m *config.Module) {
if strings.Contains(m.Source, "schedmd-slurm-gcp-v5-controller") && !alreadyContainsV5 {
logging.Info(boldYellow(
"We have been supporting slurm-gcp v5 since July 2022 and are now deprecating it, as we've launched slurm-gcp v6 in June 2024. \n" +
"Toolkit blueprints using Slurm-gcp v5 will be marked “deprecated” starting October 2024 and slurm-gcp v6 will be the default deployment. \n" +
"However we won't begin removing slurm-gcp v5 blueprints until January 6, 2025. Beginning on January 6, 2025, the Cluster Toolkit team will cease their support for Slurm-gcp v5. \n" +
logging.Info("%s", boldYellow(
"We have been supporting slurm-gcp v5 since July 2022 and are now deprecating it, as we've launched slurm-gcp v6 in June 2024. \n"+
"Toolkit blueprints using Slurm-gcp v5 will be marked “deprecated” starting October 2024 and slurm-gcp v6 will be the default deployment. \n"+
"However we won't begin removing slurm-gcp v5 blueprints until January 6, 2025. Beginning on January 6, 2025, the Cluster Toolkit team will cease their support for Slurm-gcp v5. \n"+
"While this will not directly or immediately impact running clusters, we recommend replacing any v5 clusters with Slurm-gcp v6.",
))
alreadyContainsV5 = true // This is to avoid the logging message showing repeatedly for multiple v5 controllers
Expand All @@ -152,7 +152,7 @@ func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) {
if err == nil {
return
}
logging.Error(renderError(err, ctx))
logging.Error("%s", renderError(err, ctx))

logging.Error("One or more blueprint validators has failed. See messages above for suggested")
logging.Error("actions. General troubleshooting guidance and instructions for configuring")
Expand All @@ -169,12 +169,12 @@ func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) {
switch bp.ValidationLevel {
case config.ValidationWarning:
{
logging.Error(boldYellow("Validation failures were treated as a warning, continuing to create blueprint."))
logging.Error("%s", boldYellow("Validation failures were treated as a warning, continuing to create blueprint."))
logging.Error("")
}
case config.ValidationError:
{
logging.Fatal(boldRed("validation failed due to the issues listed above"))
logging.Fatal("%s", boldRed("validation failed due to the issues listed above"))
}
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,6 +262,6 @@ func checkErr(err error, ctx *config.YamlCtx) {
ctx = &config.YamlCtx{}
}
if err != nil {
logging.Fatal(renderError(err, *ctx))
logging.Fatal("%s", renderError(err, *ctx))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@ metadata:
data:
schedule-daemon.py: |
#!/usr/bin/env python
"""schedule-daemon.py is a Topology-aware Kubernetes pod scheduler."""
# Copyright 2024 Google Inc. All Rights Reserved.
#
Expand All @@ -21,6 +20,7 @@ data:
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""schedule-daemon.py is a Topology-aware Kubernetes pod scheduler."""
import argparse
import collections
Expand Down Expand Up @@ -293,6 +293,16 @@ data:
)
continue
          # skip nodes that are not in the Ready state
if any(
condition.type == "Ready" and condition.status != "True" for condition in node.status.conditions
):
logging.info(
'Skipping node %s because it is NotReady',
node_name
)
continue
allocatable = node.status.allocatable
used_cpu, used_memory, used_gpu = 0, 0, 0
Expand Down Expand Up @@ -445,7 +455,7 @@ data:
v1: kubernetes.client.CoreV1Api,
pod_name: str,
pod_namespace: str,
node_name: str,
node: dict[str, Any],
gate_name: str,
) -> bool:
"""Schedules a pod on a given node using affinity for direct assignment.
Expand All @@ -454,7 +464,7 @@ data:
v1: The kubernetes client.
pod_name: The name of the pod to schedule.
pod_namespace: The namespace of the pod to schedule.
node_name: The name of the node to schedule the pod on.
node: The node to schedule the pod on.
gate_name: The name of the gate to remove from the pod.
Returns:
Expand All @@ -473,7 +483,7 @@ data:
'matchExpressions': [{
'key': 'kubernetes.io/hostname',
'operator': 'In',
'values': [node_name],
'values': [node['name']],
}]
}]
}
Expand All @@ -484,7 +494,7 @@ data:
v1.replace_namespaced_pod(pod_name, pod_namespace, pod)
logging.info(
'Pod %s/%s scheduled on %s', pod_namespace, pod_name, node_name
'Pod %s/%s scheduled on %s with topology %s', pod_namespace, pod_name, node['name'], node_topology_key(node)
)
except kubernetes.client.rest.ApiException as e:
logging.exception(
Expand Down Expand Up @@ -727,7 +737,7 @@ data:
for i, pod in enumerate(sorted_pods):
node = sorted_nodes[best_assignment[i]]
if not schedule_pod_on_node(
v1, pod['name'], pod['namespace'], node['name'], gate_name
v1, pod['name'], pod['namespace'], node, gate_name
):
logging.error(
'Failed to schedule pod %s on node %s. Skipping job %s',
Expand Down
34 changes: 17 additions & 17 deletions examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,22 @@ vars:
system_node_pool_disk_size_gb: 200
a3ultra_node_pool_disk_size_gb: 100

terraform_providers:
google:
source: hashicorp/google
version: 6.13.0
configuration:
project: $(vars.project_id)
region: $(vars.region)
zone: $(vars.zone)
google-beta:
source: hashicorp/google-beta
version: 6.13.0
configuration:
project: $(vars.project_id)
region: $(vars.region)
zone: $(vars.zone)

deployment_groups:
- group: primary
modules:
Expand Down Expand Up @@ -171,7 +187,7 @@ deployment_groups:
use: [a3-ultragpu-cluster]

- id: workload-manager-install
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=e0c690b
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=8c26d4a
use: [a3-ultragpu-cluster]
settings:
kueue:
Expand All @@ -194,19 +210,3 @@ deployment_groups:
node_count: 2
name: run-nvidia-smi
outputs: [instructions]

terraform_providers:
google:
source: hashicorp/google
version: 6.13.0
configuration:
project: $(vars.project_id)
region: $(vars.region)
zone: $(vars.zone)
google-beta:
source: hashicorp/google-beta
version: 6.13.0
configuration:
project: $(vars.project_id)
region: $(vars.region)
zone: $(vars.zone)
2 changes: 2 additions & 0 deletions modules/file-system/parallelstore/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -169,6 +169,8 @@ No modules.
| <a name="input_daos_agent_config"></a> [daos\_agent\_config](#input\_daos\_agent\_config) | Additional configuration to be added to daos\_config.yml | `string` | `""` | no |
| <a name="input_deployment_name"></a> [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. | `string` | n/a | yes |
| <a name="input_dfuse_environment"></a> [dfuse\_environment](#input\_dfuse\_environment) | Additional environment variables for DFuse process | `map(string)` | `{}` | no |
| <a name="input_directory_stripe"></a> [directory\_stripe](#input\_directory\_stripe) | The parallelstore stripe level for directories. | `string` | `"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"` | no |
| <a name="input_file_stripe"></a> [file\_stripe](#input\_file\_stripe) | The parallelstore stripe level for files. | `string` | `"FILE_STRIPE_LEVEL_UNSPECIFIED"` | no |
| <a name="input_import_destination_path"></a> [import\_destination\_path](#input\_import\_destination\_path) | The name of local path to import data on parallelstore instance from GCS bucket. | `string` | `null` | no |
| <a name="input_import_gcs_bucket_uri"></a> [import\_gcs\_bucket\_uri](#input\_import\_gcs\_bucket\_uri) | The name of the GCS bucket to import data from to parallelstore. | `string` | `null` | no |
| <a name="input_labels"></a> [labels](#input\_labels) | Labels to add to parallel store instance. | `map(string)` | `{}` | no |
Expand Down
12 changes: 7 additions & 5 deletions modules/file-system/parallelstore/main.tf
Original file line number Diff line number Diff line change
Expand Up @@ -51,11 +51,13 @@ resource "random_id" "resource_name_suffix" {
}

resource "google_parallelstore_instance" "instance" {
project = var.project_id
instance_id = local.id
location = var.zone
capacity_gib = var.size_gb
network = var.network_id
project = var.project_id
instance_id = local.id
location = var.zone
capacity_gib = var.size_gb
network = var.network_id
file_stripe_level = var.file_stripe
directory_stripe_level = var.directory_stripe

labels = local.labels

Expand Down
30 changes: 30 additions & 0 deletions modules/file-system/parallelstore/variables.tf
Original file line number Diff line number Diff line change
Expand Up @@ -105,3 +105,33 @@ variable "import_destination_path" {
type = string
default = null
}

variable "file_stripe" {
  description = "The parallelstore stripe level for files."
  type        = string
  default     = "FILE_STRIPE_LEVEL_UNSPECIFIED"

  validation {
    # Accept only the file stripe levels recognized by the Parallelstore API.
    condition = anytrue([
      for level in [
        "FILE_STRIPE_LEVEL_UNSPECIFIED",
        "FILE_STRIPE_LEVEL_MIN",
        "FILE_STRIPE_LEVEL_BALANCED",
        "FILE_STRIPE_LEVEL_MAX",
      ] : var.file_stripe == level
    ])
    error_message = "var.file_stripe must be set to \"FILE_STRIPE_LEVEL_UNSPECIFIED\", \"FILE_STRIPE_LEVEL_MIN\", \"FILE_STRIPE_LEVEL_BALANCED\", or \"FILE_STRIPE_LEVEL_MAX\""
  }
}

variable "directory_stripe" {
  description = "The parallelstore stripe level for directories."
  type        = string
  default     = "DIRECTORY_STRIPE_LEVEL_UNSPECIFIED"

  validation {
    # Accept only the directory stripe levels recognized by the Parallelstore API.
    condition = anytrue([
      for level in [
        "DIRECTORY_STRIPE_LEVEL_UNSPECIFIED",
        "DIRECTORY_STRIPE_LEVEL_MIN",
        "DIRECTORY_STRIPE_LEVEL_BALANCED",
        "DIRECTORY_STRIPE_LEVEL_MAX",
      ] : var.directory_stripe == level
    ])
    error_message = "var.directory_stripe must be set to \"DIRECTORY_STRIPE_LEVEL_UNSPECIFIED\", \"DIRECTORY_STRIPE_LEVEL_MIN\", \"DIRECTORY_STRIPE_LEVEL_BALANCED\", or \"DIRECTORY_STRIPE_LEVEL_MAX\""
  }
}
3 changes: 2 additions & 1 deletion pkg/modulereader/hcl_utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
package modulereader

import (
"errors"
"fmt"
"hpc-toolkit/pkg/logging"
"hpc-toolkit/pkg/sourcereader"
Expand Down Expand Up @@ -136,7 +137,7 @@ func ReadHclAttributes(file string) (map[string]cty.Value, error) {
// work around ugly <nil> in error message missing d.Subject
// https://github.com/hashicorp/hcl2/blob/fb75b3253c80b3bc7ca99c4bfa2ad6743841b1af/hcl/diagnostic.go#L76-L78
if len(diags) == 1 {
return nil, fmt.Errorf(diags[0].Detail)
return nil, errors.New(diags[0].Detail)
}
return nil, diags
}
Expand Down
60 changes: 60 additions & 0 deletions tools/cloud-build/daily-tests/builds/gke-a3-ultragpu.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---
tags:
- m.gke-job-template
- gke


timeout: 14400s  # 4hr
steps:
# Daily integration test for the gke-a3-ultragpu example blueprint.
# It appends two extra modules to the example, then hands the modified
# blueprint to the shared Ansible integration-test playbook.
- id: gke-a3-ultragpu
  name: us-central1-docker.pkg.dev/$PROJECT_ID/hpc-toolkit-repo/test-runner
  entrypoint: /bin/bash
  env:
  - "ANSIBLE_HOST_KEY_CHECKING=false"
  - "ANSIBLE_CONFIG=/workspace/tools/cloud-build/ansible.cfg"
  args:
  - -c
  - |
    set -x -e
    cd /workspace && make
    BUILD_ID_FULL=$BUILD_ID
    BUILD_ID_SHORT=$${BUILD_ID_FULL:0:6}

    EXAMPLE_BP=examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml

    # adding vm to act as remote node
    echo '  - id: remote-node' >> $${EXAMPLE_BP}
    echo '    source: modules/compute/vm-instance' >> $${EXAMPLE_BP}
    echo '    use: [gke-a3-ultra-net-0]' >> $${EXAMPLE_BP}
    echo '    settings:' >> $${EXAMPLE_BP}
    echo '      machine_type: e2-standard-2' >> $${EXAMPLE_BP}
    echo '      name_prefix: remote-node' >> $${EXAMPLE_BP}
    echo '      add_deployment_name_before_prefix: true' >> $${EXAMPLE_BP}
    # blank separator line between the appended modules
    echo '' >> $${EXAMPLE_BP}

    # adding a job template that runs nvidia-smi on the A3 Ultra pool
    echo '  - id: job_template_hostname' >> $${EXAMPLE_BP}
    echo '    source: modules/compute/gke-job-template' >> $${EXAMPLE_BP}
    echo '    use: [a3-ultragpu-pool]' >> $${EXAMPLE_BP}
    echo '    settings:' >> $${EXAMPLE_BP}
    echo '      image: nvidia/cuda:11.0.3-runtime-ubuntu20.04' >> $${EXAMPLE_BP}
    echo '      command:' >> $${EXAMPLE_BP}
    echo '      - nvidia-smi' >> $${EXAMPLE_BP}
    echo '      node_count: 1' >> $${EXAMPLE_BP}
    echo '    outputs: [instructions]' >> $${EXAMPLE_BP}

    ansible-playbook tools/cloud-build/daily-tests/ansible_playbooks/base-integration-test.yml \
      --user=sa_106486320838376751393 --extra-vars="project=${PROJECT_ID} build=$${BUILD_ID_SHORT}" \
      --extra-vars="@tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml"
39 changes: 39 additions & 0 deletions tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

---

# Variables consumed by base-integration-test.yml for the
# gke-a3-ultragpu example blueprint.
# region, zone must be defined
# in build file with --extra-vars flag!
test_name: gke-a3ultra
deployment_name: gke-a3ultra-{{ build }}
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml"
network: gke-a3-ultra-net-0
region: europe-west1
zone: europe-west1-b
# Instance name of the vm-instance module the build script appends to the
# example blueprint (name_prefix "remote-node" with deployment name prepended).
remote_node: "{{ deployment_name }}-remote-node-0"
extended_reservation: slurm-dev-gcp-a3u-gsc
static_node_count: 1
# Deployment variables forwarded on the gcluster command line.
cli_deployment_vars:
  region: "{{ region }}"
  zone: "{{ zone }}"
  static_node_count: "{{ static_node_count }}"
  extended_reservation: "{{ extended_reservation }}"
  authorized_cidr: "{{ build_ip.stdout }}/32"
  gcp_public_cidrs_access_enabled: true
custom_vars:
  project: "{{ project }}"
post_deploy_tests:
- test-validation/test-gke-job.yml

0 comments on commit 8e83ef6

Please sign in to comment.