Commit 26fafe0

Merge pull request #3148 from GoogleCloudPlatform/release-candidate

Release v1.41.0

harshthakkar01 authored Oct 25, 2024
2 parents eb00254 + 0c0f1c4 commit 26fafe0
Showing 258 changed files with 3,633 additions and 813 deletions.
4 changes: 3 additions & 1 deletion .github/workflows/pr-label-validation.yml

@@ -28,11 +28,13 @@ on:
       - ready_for_review
       - unlocked
     branches:
       - main
+      - develop
+      - release-candidate
 
 jobs:
   pr-label-validation:
-    if: github.repository == 'GoogleCloudPlatform/cluster-toolkit'
+    if: github.repository == 'GoogleCloudPlatform/cluster-toolkit' && github.event.pull_request.draft == false
     runs-on: ubuntu-latest
     permissions:
       pull-requests: read
3 changes: 1 addition & 2 deletions .github/workflows/pr-precommit.yml

@@ -19,9 +19,8 @@ name: 'Use pre-commit to validate Pull Request'
 on:
   pull_request:
     types:
-      - edited
       - opened
-      - labeled
+      - reopened
       - synchronize
     branches:
       - main
18 changes: 18 additions & 0 deletions cmd/create.go

@@ -125,9 +125,27 @@ func expandOrDie(path string) (config.Blueprint, *config.YamlCtx) {
 	// Expand the blueprint
 	checkErr(bp.Expand(), ctx)
 	validateMaybeDie(bp, *ctx)
+	v5DeprecationWarning(bp)
 
 	return bp, ctx
 }
+
+// TODO: Remove this warning when v5 deprecation is complete
+func v5DeprecationWarning(bp config.Blueprint) {
+	alreadyContainsV5 := false
+	bp.WalkModulesSafe(func(mp config.ModulePath, m *config.Module) {
+		if strings.Contains(m.Source, "schedmd-slurm-gcp-v5-controller") && !alreadyContainsV5 {
+			logging.Info(boldYellow(
+				"We have been supporting slurm-gcp v5 since July 2022 and are now deprecating it, as we've launched slurm-gcp v6 in June 2024. \n" +
+					"Toolkit blueprints using Slurm-gcp v5 will be marked “deprecated” starting October 2024 and slurm-gcp v6 will be the default deployment. \n" +
+					"However we won't begin removing slurm-gcp v5 blueprints until January 6, 2025. Beginning on January 6, 2025, the Cluster Toolkit team will cease their support for Slurm-gcp v5. \n" +
+					"While this will not directly or immediately impact running clusters, we recommend replacing any v5 clusters with Slurm-gcp v6.",
+			))
+			alreadyContainsV5 = true // This is to avoid the logging message showing repeatedly for multiple v5 controllers
+		}
+	})
+}
+
 // TODO: move to expand.go
 func validateMaybeDie(bp config.Blueprint, ctx config.YamlCtx) {
 	err := validators.Execute(bp)
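For context, the warning fires whenever a blueprint is expanded (e.g. during create) and any module source contains the v5 controller string. A module entry like the following would trigger it (module id is illustrative; the source is the community module path that the strings.Contains check matches on):

  - id: slurm_controller
    source: community/modules/scheduler/schedmd-slurm-gcp-v5-controller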
2 changes: 1 addition & 1 deletion cmd/root.go

@@ -53,7 +53,7 @@ HPC deployments on the Google Cloud Platform.`,
 				logging.Fatal("cmd.Help function failed: %s", err)
 			}
 		},
-		Version:     "v1.40.0",
+		Version:     "v1.41.0",
 		Annotations: annotation,
 	}
 )
2 changes: 1 addition & 1 deletion community/examples/AMD/hpc-amd-slurm.yaml

@@ -168,7 +168,7 @@ deployment_groups:
       # these images must match the images used by Slurm modules below because
       # we are building OpenMPI with PMI support in libraries contained in
       # Slurm installation
-      family: slurm-gcp-6-6-hpc-rocky-linux-8
+      family: slurm-gcp-6-7-hpc-rocky-linux-8
       project: schedmd-slurm-public
 
   - id: low_cost_nodeset
2 changes: 1 addition & 1 deletion community/examples/hpc-build-slurm-image.yaml

@@ -23,7 +23,7 @@ vars:
   image_build_machine_type: n2d-standard-16
   build_from_image_family: hpc-rocky-linux-8
   build_from_image_project: cloud-hpc-image-public
-  build_from_git_ref: 6.7.0
+  build_from_git_ref: 6.8.2
   built_image_family: my-custom-slurm
   built_instance_image:
     family: $(vars.built_image_family)
1 change: 0 additions & 1 deletion community/examples/hpc-slurm-ramble-gromacs.yaml

@@ -31,7 +31,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  # Example - ./modules/network/vpc
   - id: network
     source: modules/network/vpc
 
1 change: 0 additions & 1 deletion community/examples/hpc-slurm-ubuntu2004-v5-legacy.yaml

@@ -34,7 +34,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  # Example - ./modules/network/vpc
   - id: network1
     source: modules/network/vpc
 
3 changes: 1 addition & 2 deletions community/examples/hpc-slurm-ubuntu2004.yaml

@@ -24,7 +24,7 @@ vars:
   slurm_image:
     # Please refer to the following link for the latest images:
     # https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#supported-operating-systems
-    family: slurm-gcp-6-6-ubuntu-2004-lts
+    family: slurm-gcp-6-7-ubuntu-2004-lts
     project: schedmd-slurm-public
   instance_image_custom: true
 
@@ -33,7 +33,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  # Example - ./modules/network/vpc
   - id: network1
     source: modules/network/vpc
 
2 changes: 1 addition & 1 deletion community/examples/hpc-slurm6-apptainer.yaml

@@ -60,7 +60,7 @@ deployment_groups:
     settings:
       source_image_project_id: [schedmd-slurm-public]
       # see latest in https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/docs/images.md#published-image-family
-      source_image_family: slurm-gcp-6-6-hpc-rocky-linux-8
+      source_image_family: slurm-gcp-6-7-hpc-rocky-linux-8
       # You can find size of source image by using following command
       # gcloud compute images describe-from-family <source_image_family> --project schedmd-slurm-public
       disk_size: $(vars.disk_size)
1 change: 0 additions & 1 deletion community/examples/htc-slurm-v5-legacy.yaml

@@ -42,7 +42,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local or community module, prefix with ./, ../ or /
-  # Example - ./modules/network/pre-existing-vpc
   - id: network1
     source: modules/network/vpc
 
1 change: 0 additions & 1 deletion community/examples/htc-slurm.yaml

@@ -42,7 +42,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local or community module, prefix with ./, ../ or /
-  # Example - ./modules/network/pre-existing-vpc
   - id: network
     source: modules/network/vpc
 
3 changes: 2 additions & 1 deletion community/examples/tutorial-starccm-slurm.yaml

@@ -15,6 +15,8 @@
 ---
 
 blueprint_name: starccm-on-slurm
+toolkit_modules_url: github.com/GoogleCloudPlatform/cluster-toolkit
+toolkit_modules_version: v1.41.0
 
 vars:
   project_id: ## Set GCP Project ID Here ##

@@ -30,7 +32,6 @@ deployment_groups:
   modules:
   # Source is an embedded module, denoted by "modules/*" without ./, ../, /
   # as a prefix. To refer to a local module, prefix with ./, ../ or /
-  # Example - ./modules/network/vpc
   - id: network1
     source: modules/network/vpc
 
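The two new toolkit_modules_* fields appear to pin this tutorial's embedded modules to the v1.41.0 release: an embedded source such as modules/network/vpc would then be resolved against the pinned repository rather than the local checkout, roughly as if the blueprint had written the following (a sketch; the exact query parameters are an assumption):

  - id: network1
    source: github.com/GoogleCloudPlatform/cluster-toolkit//modules/network/vpc?ref=v1.41.0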
6 changes: 6 additions & 0 deletions community/examples/xpk-gke-a3-megagpu-files/config-map.yaml.tftpl

@@ -0,0 +1,6 @@
+kind: ConfigMap
+apiVersion: v1
+metadata:
+  name: ${name}
+data:
+  h100-mega-80gb-8: "${num_nodes}"
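This is a Terraform template (.tftpl); the xpk-gke-a3-megagpu.yaml blueprint below applies it through modules/management/kubectl-apply with template_vars {name: "xpk-gke-a3-megagpu-resources-configmap", num_nodes: "4"}, so the rendered manifest would presumably be:

kind: ConfigMap
apiVersion: v1
metadata:
  name: xpk-gke-a3-megagpu-resources-configmap
data:
  h100-mega-80gb-8: "4"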
73 changes: 73 additions & 0 deletions community/examples/xpk-gke-a3-megagpu-files/kueue-xpk-configuration.yaml.tftpl

@@ -0,0 +1,73 @@
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ResourceFlavor
+metadata:
+  name: 1xh100-mega-80gb-8
+spec:
+  nodeLabels:
+    cloud.google.com/gke-accelerator: nvidia-h100-mega-80gb
+---
+
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: ClusterQueue
+metadata:
+  name: cluster-queue
+spec:
+  preemption:
+    reclaimWithinCohort: Never # Don't preempt other queues in the cohort.
+    withinClusterQueue: LowerPriority
+  namespaceSelector: {} # match all.
+  resourceGroups:
+  - coveredResources: ["nvidia.com/gpu"]
+    flavors:
+    - name: 1xh100-mega-80gb-8
+      resources:
+      - name: "nvidia.com/gpu"
+        nominalQuota: ${num_chips}
+---
+apiVersion: kueue.x-k8s.io/v1beta1
+kind: LocalQueue
+metadata:
+  namespace: default
+  name: multislice-queue
+spec:
+  clusterQueue: cluster-queue
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: very-low
+value: 100
+globalDefault: false
+description: "Very Low"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: low
+value: 250
+globalDefault: false
+description: "Low"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: medium
+value: 500
+globalDefault: false
+description: "Medium"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: high
+value: 750
+globalDefault: false
+description: "High"
+---
+apiVersion: scheduling.k8s.io/v1
+kind: PriorityClass
+metadata:
+  name: very-high
+value: 1000
+globalDefault: false
+description: "Very High"
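A workload opts into this queueing setup by labeling itself with the LocalQueue name and, optionally, one of the pod priority classes defined above. A minimal sketch of a Job that would consume the nvidia.com/gpu quota (image and sizes are placeholders, not part of this commit):

apiVersion: batch/v1
kind: Job
metadata:
  generateName: demo-workload-
  labels:
    kueue.x-k8s.io/queue-name: multislice-queue  # submit through the LocalQueue above
spec:
  suspend: true  # Kueue unsuspends the Job once it is admitted
  template:
    spec:
      priorityClassName: high  # one of the PriorityClasses defined above
      restartPolicy: Never
      containers:
      - name: worker
        image: nvidia/cuda:12.4.1-base-ubuntu22.04  # placeholder image
        resources:
          limits:
            nvidia.com/gpu: 8  # one full a3-megagpu-8g node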
118 changes: 118 additions & 0 deletions community/examples/xpk-gke-a3-megagpu.yaml

@@ -0,0 +1,118 @@
+# Copyright 2024 Google LLC
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+---
+
+blueprint_name: xpk-gke-a3-megagpu
+
+vars:
+  project_id: ## Set GCP Project ID Here ##
+  deployment_name: xpk-gke-a3-megagpu
+  region: us-central1
+  zone: us-central1-c
+
+  # Cidr block containing the IP of the machine calling terraform.
+  # The following line must be updated for this example to work.
+  authorized_cidr: <your-ip-address>/32
+
+deployment_groups:
+- group: primary
+  modules:
+  - id: network1
+    source: modules/network/vpc
+    settings:
+      subnetwork_name: xpk-gke-a3-megagpu-subnet
+      secondary_ranges:
+        xpk-gke-a3-megagpu-subnet:
+        - range_name: pods
+          ip_cidr_range: 10.4.0.0/14
+        - range_name: services
+          ip_cidr_range: 10.0.32.0/20
+
+  - id: gpunets
+    source: modules/network/multivpc
+    settings:
+      network_name_prefix: $(vars.deployment_name)-gpunet
+      global_ip_address_range: 192.169.0.0/16
+      network_count: 8
+      subnetwork_cidr_suffix: 24
+
+  - id: gke_cluster
+    source: modules/scheduler/gke-cluster
+    use: [network1, gpunets]
+    settings:
+      master_authorized_networks:
+      - cidr_block: $(vars.authorized_cidr)  # Allows your machine run kubectl command. It's required for the multi-network setup.
+        display_name: "kubectl-access-network"
+      system_node_pool_machine_type: "e2-standard-32"
+    outputs: [instructions]
+
+  - id: group_placement_0
+    source: modules/compute/resource-policy
+    settings:
+      name: $(vars.deployment_name)-gp-np-0
+      group_placement_max_distance: 2
+
+  - id: group_placement_1
+    source: modules/compute/resource-policy
+    settings:
+      name: $(vars.deployment_name)-gp-np-1
+      group_placement_max_distance: 2
+
+  - id: a3_megagpu_pool_0
+    source: modules/compute/gke-node-pool
+    use: [gke_cluster, gpunets, group_placement_0]
+    settings:
+      name: a3-megagpu-pool-0
+      machine_type: a3-megagpu-8g
+      autoscaling_total_min_nodes: 2
+      initial_node_count: 2
+      zones: [$(vars.zone)]
+      host_maintenance_interval: PERIODIC
+    outputs: [instructions]
+
+  - id: a3_megagpu_pool_1
+    source: modules/compute/gke-node-pool
+    use: [gke_cluster, gpunets, group_placement_1]
+    settings:
+      name: a3-megagpu-pool-1
+      machine_type: a3-megagpu-8g
+      autoscaling_total_min_nodes: 2
+      initial_node_count: 2
+      zones: [$(vars.zone)]
+      host_maintenance_interval: PERIODIC
+    outputs: [instructions]
+
+  - id: workload_component_install
+    source: modules/management/kubectl-apply
+    use: [gke_cluster]
+    settings:
+      kueue:
+        install: true
+        config_path: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/kueue-xpk-configuration.yaml.tftpl
+        config_template_vars: {num_chips: "32"}
+      jobset:
+        install: true
+
+  - id: topology_aware_scheduler_install
+    source: community/modules/compute/gke-topology-scheduler
+    use: [gke_cluster]
+
+  - id: workload_configmap
+    source: modules/management/kubectl-apply
+    use: [gke_cluster]
+    settings:
+      apply_manifests:
+      - source: $(ghpc_stage("xpk-gke-a3-megagpu-files"))/config-map.yaml.tftpl
+        template_vars: {name: "xpk-gke-a3-megagpu-resources-configmap", num_nodes: "4"}
2 changes: 1 addition & 1 deletion community/front-end/ofe/requirements.txt

@@ -19,7 +19,7 @@ dill==0.3.6
 distlib==0.3.6
 # django-revproxy==0.11.0 released but not yet in pypi
 git+https://github.com/jazzband/django-revproxy.git@d2234005135dc0771b7c4e0bb0465664ccfa5787
-Django==4.2.15
+Django==4.2.16
 django-allauth==0.54.0
 django-extensions==3.2.3
 djangorestframework==3.15.2