Skip to content

Commit

Permalink
Update A3U template design
Browse files: browse the repository at this point in the history
  • Loading branch information
ighosh98 committed Dec 22, 2024
1 parent 66804e7 commit 6642287
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 18 deletions.
35 changes: 18 additions & 17 deletions examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -53,21 +53,22 @@ deployment_groups:
- group: primary
modules:
- id: gke-a3-ultra-net-0
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e0c690b
source: modules/network/vpc
settings:
network_name: gke-a3-ultra-net-0
network_name: $(vars.deployment_name)-net-0
subnetworks:
- subnet_name: gke-a3-ultra-sub-0
- subnet_name: $(vars.deployment_name)-sub-0
subnet_region: $(vars.region)
subnet_ip: 192.168.0.0/18
secondary_ranges:
gke-a3-ultra-sub-0:
secondary_ranges_list:
- subnetwork_name: $(vars.deployment_name)-sub-0
ranges:
- range_name: pods
ip_cidr_range: 10.4.0.0/14
- range_name: services
ip_cidr_range: 10.0.32.0/20
firewall_rules:
- name: gke-a3-ultra-internal-0
- name: $(vars.deployment_name)-internal-0
ranges: [192.168.0.0/16]
allow:
- protocol: tcp
Expand All @@ -77,16 +78,16 @@ deployment_groups:
- protocol: icmp

- id: gke-a3-ultra-net-1
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/network/vpc?ref=e0c690b
source: modules/network/vpc
settings:
network_name: gke-a3-ultra-net-1
network_name: $(vars.deployment_name)-net-1
mtu: $(vars.mtu_size)
subnetworks:
- subnet_name: gke-a3-ultra-sub-1
- subnet_name: $(vars.deployment_name)-sub-1
subnet_region: $(vars.region)
subnet_ip: 192.168.64.0/18
firewall_rules:
- name: gke-a3-ultra-internal-1
- name: $(vars.deployment_name)-internal-1
ranges: [192.168.0.0/16]
allow:
- protocol: tcp
Expand All @@ -96,20 +97,20 @@ deployment_groups:
- protocol: icmp

- id: gke-a3-ultra-rdma-net
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/network/rdma-vpc?ref=98c49fe
source: modules/network/gpu-rdma-vpc
settings:
network_name: gke-a3-ultra-rdma-net
network_name: $(vars.deployment_name)-rdma-net
mtu: $(vars.mtu_size)
network_profile: https://www.googleapis.com/compute/beta/projects/$(vars.project_id)/global/networkProfiles/$(vars.zone)-vpc-roce
network_routing_mode: REGIONAL
subnetworks_template:
name_prefix: gke-a3-ultra-rdma-sub
name_prefix: $(vars.deployment_name)-rdma-sub
count: 8
ip_range: 192.168.128.0/18
region: $(vars.region)

- id: a3-ultragpu-cluster
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/scheduler/gke-cluster?ref=e0c690b
source: modules/scheduler/gke-cluster
use: [gke-a3-ultra-net-0]
settings:
release_channel: RAPID
Expand Down Expand Up @@ -146,7 +147,7 @@ deployment_groups:
outputs: [instructions]

- id: a3-ultragpu-pool
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/compute/gke-node-pool?ref=e0c690b
source: modules/compute/gke-node-pool
use: [a3-ultragpu-cluster]
settings:
machine_type: a3-ultragpu-8g
Expand Down Expand Up @@ -183,11 +184,11 @@ deployment_groups:
outputs: [instructions]

- id: topology-aware-scheduler-install
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//community/modules/compute/gke-topology-scheduler?ref=e0c690b
source: community/modules/compute/gke-topology-scheduler
use: [a3-ultragpu-cluster]

- id: workload-manager-install
source: github.com/GoogleCloudPlatform/cluster-toolkit.git//modules/management/kubectl-apply?ref=8c26d4a
source: modules/management/kubectl-apply
use: [a3-ultragpu-cluster]
settings:
kueue:
Expand Down
2 changes: 1 addition & 1 deletion tools/cloud-build/daily-tests/tests/gke-a3-ultragpu.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ test_name: gke-a3ultra
deployment_name: gke-a3ultra-{{ build }}
workspace: /workspace
blueprint_yaml: "{{ workspace }}/examples/gke-a3-ultragpu/gke-a3-ultragpu.yaml"
network: gke-a3-ultra-net-0
network: "{{ deployment_name }}-net-0"
region: europe-west1
zone: europe-west1-b
remote_node: "{{ deployment_name }}-remote-node-0"
Expand Down

0 comments on commit 6642287

Please sign in to comment.