From 00c34d8e4a1b9058d3e2b85fa3e0e7601332dbec Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Wed, 9 Oct 2024 14:05:45 +0000 Subject: [PATCH 1/2] RDMA Support in GKE Modules --- .../compute/pbspro-execution/README.md | 16 ++--- .../README.md | 30 ++++---- community/modules/network/rdma-vpc/README.md | 4 +- community/modules/network/rdma-vpc/main.tf | 17 +++++ community/modules/network/rdma-vpc/outputs.tf | 14 ++++ .../network/rdma-vpc/vpc-submodule/README.md | 8 +-- .../chrome-remote-desktop/README.md | 16 ++--- .../schedmd-slurm-gcp-v6-controller/README.md | 66 ++++++++--------- modules/compute/gke-node-pool/README.md | 2 + modules/compute/gke-node-pool/main.tf | 1 + modules/compute/gke-node-pool/variables.tf | 14 ++++ modules/compute/vm-instance/README.md | 26 +++---- modules/scheduler/gke-cluster/README.md | 3 + modules/scheduler/gke-cluster/main.tf | 70 +++++++++++++------ .../templates/gke-network-paramset.yaml.tftpl | 2 +- modules/scheduler/gke-cluster/variables.tf | 20 ++++++ .../pre-existing-gke-cluster/README.md | 1 + .../pre-existing-gke-cluster/main.tf | 43 +++++++++--- .../templates/gke-network-paramset.yaml.tftpl | 2 +- .../pre-existing-gke-cluster/variables.tf | 6 ++ 20 files changed, 248 insertions(+), 113 deletions(-) diff --git a/community/modules/compute/pbspro-execution/README.md b/community/modules/compute/pbspro-execution/README.md index 9b7ce281fa..5e0a884453 100644 --- a/community/modules/compute/pbspro-execution/README.md +++ b/community/modules/compute/pbspro-execution/README.md @@ -87,38 +87,38 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| | [auto\_delete\_boot\_disk](#input\_auto\_delete\_boot\_disk) | Controls if boot disk should be auto-deleted when instance is deleted. | `bool` | `true` | no | -| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC.
See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC.
See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | | [deployment\_name](#input\_deployment\_name) | Cluster Toolkit deployment name. Cloud resource names will include this value. | `string` | n/a | yes | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for instances. | `number` | `200` | no | | [disk\_type](#input\_disk\_type) | Disk type for instances. | `string` | `"pd-standard"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true, instances will have public IPs on the internet. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `null` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `null` | no | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Instance Image

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"name": "hpc-centos-7-v20240712",
"project": "cloud-hpc-image-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Instance Image

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"name": "hpc-centos-7-v20240712",
"project": "cloud-hpc-image-public"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count](#input\_local\_ssd\_count) | The number of local SSDs to attach to each VM. See https://cloud.google.com/compute/docs/disks/local-ssd. | `number` | `0` | no | | [local\_ssd\_interface](#input\_local\_ssd\_interface) | Interface to be used with local SSDs. Can be either 'NVME' or 'SCSI'. No effect unless `local_ssd_count` is also set. | `string` | `"NVME"` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation | `string` | `"c2-standard-60"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | | [name\_prefix](#input\_name\_prefix) | Name prefix for PBS execution hostnames | `string` | `null` | no | -| [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface

**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `enable_public_ips` also do not apply
to network interfaces defined in this variable.

Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET", "RDMA", "IRDMA", "MRDMA"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface

**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `enable_public_ips` also do not apply
to network interfaces defined in this variable.

Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET", "RDMA", "IRDMA", "MRDMA"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network to attach the VM. | `string` | `"default"` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | +| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Describes maintenance behavior for the instance. If left blank this will default to `MIGRATE` except for when `placement_policy`, spot provisioning, or GPUs require it to be `TERMINATE` | `string` | `null` | no | | [pbs\_exec](#input\_pbs\_exec) | Root path in which to install PBS | `string` | `"/opt/pbs"` | no | | [pbs\_execution\_rpm\_url](#input\_pbs\_execution\_rpm\_url) | Path to PBS Pro Execution Host RPM file | `string` | n/a | yes | | [pbs\_home](#input\_pbs\_home) | PBS working directory | `string` | `"/var/spool/pbs"` | no | | [pbs\_server](#input\_pbs\_server) | IP address or DNS name of PBS server host | `string` | n/a | yes | -| [placement\_policy](#input\_placement\_policy) | Control where your VM instances are physically located relative to each other within a zone. |
object({
vm_count = number,
availability_domain_count = number,
collocation = string,
})
| `null` | no | +| [placement\_policy](#input\_placement\_policy) | Control where your VM instances are physically located relative to each other within a zone. |
object({
vm_count = number,
availability_domain_count = number,
collocation = string,
})
| `null` | no | | [project\_id](#input\_project\_id) | Project in which Google Cloud resources will be created | `string` | n/a | yes | | [region](#input\_region) | Default region for creating resources | `string` | n/a | yes | -| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#service_account. |
object({
email = string,
scopes = set(string)
})
|
{
"email": null,
"scopes": [
"https://www.googleapis.com/auth/devstorage.read_write",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/servicecontrol",
"https://www.googleapis.com/auth/service.management.readonly",
"https://www.googleapis.com/auth/trace.append"
]
}
| no | +| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#service_account. |
object({
email = string,
scopes = set(string)
})
|
{
"email": null,
"scopes": [
"https://www.googleapis.com/auth/devstorage.read_write",
"https://www.googleapis.com/auth/logging.write",
"https://www.googleapis.com/auth/monitoring.write",
"https://www.googleapis.com/auth/servicecontrol",
"https://www.googleapis.com/auth/service.management.readonly",
"https://www.googleapis.com/auth/trace.append"
]
}
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [startup\_script](#input\_startup\_script) | Startup script used on the instance | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to attach the VM. | `string` | `null` | no | | [tags](#input\_tags) | Network tags, provided as a list | `list(string)` | `[]` | no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [zone](#input\_zone) | Default zone for creating resources | `string` | n/a | yes | ## Outputs diff --git a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md index 76a1e0172f..cc81ec31d7 100644 --- a/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md +++ b/community/modules/compute/schedmd-slurm-gcp-v6-nodeset-dynamic/README.md @@ -86,41 +86,41 @@ modules. For support with the underlying modules, see the instructions in the | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | -| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | -| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | -| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | -| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | +| [access\_config](#input\_access\_config) | Access configurations, i.e. IPs via which the VM instance can be accessed via the Internet. |
list(object({
nat_ip = string
network_tier = string
}))
| `[]` | no | +| [additional\_disks](#input\_additional\_disks) | Configurations of additional disks to be included on the partition nodes. (do not use "disk\_type: local-ssd"; known issue being addressed) |
list(object({
disk_name = string
device_name = string
disk_size_gb = number
disk_type = string
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [additional\_networks](#input\_additional\_networks) | Additional network interface details for GCE, if any. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [disk\_auto\_delete](#input\_disk\_auto\_delete) | Whether or not the boot disk should be auto-deleted. | `bool` | `true` | no | | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of boot disk to create for the partition compute nodes. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-standard"` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | -| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | +| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true. The node group VMs will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | | [enable\_spot\_vm](#input\_enable\_spot\_vm) | Enable the partition to use spot VMs (https://cloud.google.com/spot-vms). | `bool` | `false` | no | | [feature](#input\_feature) | The node feature, used to bind nodes to the nodeset. If not set, the nodeset name will be used. | `string` | `null` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | -| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm node group VM instances.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [labels](#input\_labels) | Labels to add to partition compute instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Compute Platform machine type to use for this partition compute nodes. | `string` | `"c2-standard-60"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | -| [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | -| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.

Note: Placement groups are not supported when on\_host\_maintenance is set to
"MIGRATE" and will be deactivated regardless of the value of
enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is
set to "TERMINATE". | `string` | `"TERMINATE"` | no | +| [name](#input\_name) | Name of the nodeset. Automatically populated by the module id if not set.
If setting manually, ensure a unique value across all nodesets. | `string` | n/a | yes | +| [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy.

Note: Placement groups are not supported when on\_host\_maintenance is set to
"MIGRATE" and will be deactivated regardless of the value of
enable\_placement. To support enable\_placement, ensure on\_host\_maintenance is
set to "TERMINATE". | `string` | `"TERMINATE"` | no | | [preemptible](#input\_preemptible) | Should use preemptibles to burst. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | | [region](#input\_region) | The default region for Cloud resources. | `string` | n/a | yes | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the compute instances. | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | -| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the compute instances. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
- enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
- enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
- enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | | [slurm\_bucket\_path](#input\_slurm\_bucket\_path) | Path to the Slurm bucket. | `string` | n/a | yes | | [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Name of the Slurm cluster. | `string` | n/a | yes | -| [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | +| [spot\_instance\_config](#input\_spot\_instance\_config) | Configuration for spot VMs. |
object({
termination_action = string
})
| `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | @@ -129,6 +129,6 @@ modules. For support with the underlying modules, see the instructions in the | Name | Description | |------|-------------| | [instance\_template\_self\_link](#output\_instance\_template\_self\_link) | The URI of the template. | -| [node\_name\_prefix](#output\_node\_name\_prefix) | The prefix to be used for the node names.

Make sure that nodes are named `<node_name_prefix>-<suffix>`.
This is temporarily required for proper functioning of the nodes.
While Slurm scheduler uses "features" to bind node and nodeset,
the SlurmGCP relies on node names for this (to be switched to features as well). | +| [node\_name\_prefix](#output\_node\_name\_prefix) | The prefix to be used for the node names.

Make sure that nodes are named `<node_name_prefix>-<suffix>`.
This is temporarily required for proper functioning of the nodes.
While Slurm scheduler uses "features" to bind node and nodeset,
the SlurmGCP relies on node names for this (to be switched to features as well). | | [nodeset\_dyn](#output\_nodeset\_dyn) | Details of the nodeset. Typically used as input to `schedmd-slurm-gcp-v6-partition`. | diff --git a/community/modules/network/rdma-vpc/README.md b/community/modules/network/rdma-vpc/README.md index df9f6c3b94..d9ff0993f2 100644 --- a/community/modules/network/rdma-vpc/README.md +++ b/community/modules/network/rdma-vpc/README.md @@ -66,7 +66,7 @@ No resources. | [region](#input\_region) | The default region for Cloud resources | `string` | n/a | yes | | [secondary\_ranges](#input\_secondary\_ranges) | Secondary ranges that will be used in some of the subnets. Please see https://goo.gle/hpc-toolkit-vpc-deprecation for migration instructions. | `map(list(object({ range_name = string, ip_cidr_range = string })))` | `{}` | no | | [shared\_vpc\_host](#input\_shared\_vpc\_host) | Makes this project a Shared VPC host if 'true' (default 'false') | `bool` | `false` | no | -| [subnetworks\_template](#input\_subnetworks\_template) | Rules for creating subnetworks within the VPC |
object({
count = number
name_prefix = string
ip_range = string
region = string
private_access = optional(bool)
})
|
{
"count": 8,
"ip_range": "192.168.0.0/16",
"name_prefix": "subnet",
"region": null
}
| no | +| [subnetworks\_template](#input\_subnetworks\_template) | Rules for creating subnetworks within the VPC |
object({
count = number
name_prefix = string
ip_range = string
region = string
private_access = optional(bool)
})
|
{
"count": 8,
"ip_range": "192.168.0.0/16",
"name_prefix": "subnet",
"region": null
}
| no | ## Outputs @@ -76,5 +76,7 @@ No resources. | [network\_name](#output\_network\_name) | Name of the new VPC network | | [network\_self\_link](#output\_network\_self\_link) | Self link of the new VPC network | | [subnetwork\_interfaces](#output\_subnetwork\_interfaces) | Full list of subnetwork objects belonging to the new VPC network (compatible with vm-instance) | +| [subnetwork\_interfaces\_gke](#output\_subnetwork\_interfaces\_gke) | Full list of subnetwork objects belonging to the new VPC network (compatible with gke-node-pool) | +| [subnetwork\_name\_prefix](#output\_subnetwork\_name\_prefix) | Prefix of the RDMA subnetwork names | | [subnetworks](#output\_subnetworks) | Full list of subnetwork objects belonging to the new VPC network | diff --git a/community/modules/network/rdma-vpc/main.tf b/community/modules/network/rdma-vpc/main.tf index d2fa87603b..a166599c58 100644 --- a/community/modules/network/rdma-vpc/main.tf +++ b/community/modules/network/rdma-vpc/main.tf @@ -125,6 +125,23 @@ locals { alias_ip_range = [] } ] + + # FIX_ME(arajmane): There is a concern about this not working in a shared VPC environment. + # To unblock experimental testing, we decided to go ahead with this. 
+ output_subnets_gke = [ + for subnet in module.vpc.subnets : { + network = local.network_name + subnetwork = subnet.name + subnetwork_project = null + network_ip = "" + nic_type = coalesce(var.nic_type, try(regex("IRDMA", local.profile_name), regex("MRDMA", local.profile_name), "RDMA")) + stack_type = null + queue_count = null + access_config = [] + ipv6_access_config = [] + alias_ip_range = [] + } + ] } module "vpc" { diff --git a/community/modules/network/rdma-vpc/outputs.tf b/community/modules/network/rdma-vpc/outputs.tf index 7831625145..1c2a304fd8 100644 --- a/community/modules/network/rdma-vpc/outputs.tf +++ b/community/modules/network/rdma-vpc/outputs.tf @@ -43,3 +43,17 @@ output "subnetwork_interfaces" { value = local.output_subnets depends_on = [module.vpc] } + +# The output subnetwork_interfaces is compatible with vm-instance module but not with gke-node-pool +# See https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/99493df21cecf6a092c45298bf7a45e0343cf622/modules/compute/vm-instance/variables.tf#L220 +# So, we need a separate output that makes the network and subnetwork names available +output "subnetwork_interfaces_gke" { + description = "Full list of subnetwork objects belonging to the new VPC network (compatible with gke-node-pool)" + value = local.output_subnets_gke + depends_on = [module.vpc] +} + +output "subnetwork_name_prefix" { + description = "Prefix of the RDMA subnetwork names" + value = var.subnetworks_template.name_prefix +} diff --git a/community/modules/network/rdma-vpc/vpc-submodule/README.md b/community/modules/network/rdma-vpc/vpc-submodule/README.md index 470062baf7..5dfd55b4bb 100644 --- a/community/modules/network/rdma-vpc/vpc-submodule/README.md +++ b/community/modules/network/rdma-vpc/vpc-submodule/README.md @@ -54,10 +54,10 @@ limitations under the License. 
| [auto\_create\_subnetworks](#input\_auto\_create\_subnetworks) | When set to true, the network is created in 'auto subnet mode' and it will create a subnet for each region automatically across the 10.128.0.0/9 address range. When set to false, the network is created in 'custom subnet mode' so the user can explicitly connect subnetwork resources. | `bool` | `false` | no | | [delete\_default\_internet\_gateway\_routes](#input\_delete\_default\_internet\_gateway\_routes) | If set, ensure that all routes within the network specified whose names begin with 'default-route' and with a next hop of 'default-internet-gateway' are deleted | `bool` | `false` | no | | [description](#input\_description) | An optional description of this resource. The resource must be recreated to modify this field. | `string` | `""` | no | -| [egress\_rules](#input\_egress\_rules) | List of egress rules. This will be ignored if variable 'rules' is non-empty |
list(object({
name = string
description = optional(string, null)
disabled = optional(bool, null)
priority = optional(number, null)
destination_ranges = optional(list(string), [])
source_ranges = optional(list(string), [])
source_tags = optional(list(string))
source_service_accounts = optional(list(string))
target_tags = optional(list(string))
target_service_accounts = optional(list(string))

allow = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
deny = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
log_config = optional(object({
metadata = string
}))
}))
| `[]` | no | +| [egress\_rules](#input\_egress\_rules) | List of egress rules. This will be ignored if variable 'rules' is non-empty |
list(object({
name = string
description = optional(string, null)
disabled = optional(bool, null)
priority = optional(number, null)
destination_ranges = optional(list(string), [])
source_ranges = optional(list(string), [])
source_tags = optional(list(string))
source_service_accounts = optional(list(string))
target_tags = optional(list(string))
target_service_accounts = optional(list(string))

allow = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
deny = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
log_config = optional(object({
metadata = string
}))
}))
| `[]` | no | | [enable\_ipv6\_ula](#input\_enable\_ipv6\_ula) | Enabled IPv6 ULA, this is a permanent change and cannot be undone! (default 'false') | `bool` | `false` | no | -| [firewall\_rules](#input\_firewall\_rules) | This is DEPRECATED and available for backward compatibility. Use ingress\_rules and egress\_rules variables. List of firewall rules |
list(object({
name = string
description = optional(string, null)
direction = optional(string, "INGRESS")
disabled = optional(bool, null)
priority = optional(number, null)
ranges = optional(list(string), [])
source_tags = optional(list(string))
source_service_accounts = optional(list(string))
target_tags = optional(list(string))
target_service_accounts = optional(list(string))

allow = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
deny = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
log_config = optional(object({
metadata = string
}))
}))
| `[]` | no | -| [ingress\_rules](#input\_ingress\_rules) | List of ingress rules. This will be ignored if variable 'rules' is non-empty |
list(object({
name = string
description = optional(string, null)
disabled = optional(bool, null)
priority = optional(number, null)
destination_ranges = optional(list(string), [])
source_ranges = optional(list(string), [])
source_tags = optional(list(string))
source_service_accounts = optional(list(string))
target_tags = optional(list(string))
target_service_accounts = optional(list(string))

allow = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
deny = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
log_config = optional(object({
metadata = string
}))
}))
| `[]` | no | +| [firewall\_rules](#input\_firewall\_rules) | This is DEPRECATED and available for backward compatibility. Use ingress\_rules and egress\_rules variables. List of firewall rules |
list(object({
name = string
description = optional(string, null)
direction = optional(string, "INGRESS")
disabled = optional(bool, null)
priority = optional(number, null)
ranges = optional(list(string), [])
source_tags = optional(list(string))
source_service_accounts = optional(list(string))
target_tags = optional(list(string))
target_service_accounts = optional(list(string))

allow = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
deny = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
log_config = optional(object({
metadata = string
}))
}))
| `[]` | no | +| [ingress\_rules](#input\_ingress\_rules) | List of ingress rules. This will be ignored if variable 'rules' is non-empty |
list(object({
name = string
description = optional(string, null)
disabled = optional(bool, null)
priority = optional(number, null)
destination_ranges = optional(list(string), [])
source_ranges = optional(list(string), [])
source_tags = optional(list(string))
source_service_accounts = optional(list(string))
target_tags = optional(list(string))
target_service_accounts = optional(list(string))

allow = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
deny = optional(list(object({
protocol = string
ports = optional(list(string))
})), [])
log_config = optional(object({
metadata = string
}))
}))
| `[]` | no | | [internal\_ipv6\_range](#input\_internal\_ipv6\_range) | When enabling IPv6 ULA, optionally, specify a /48 from fd20::/20 (default null) | `string` | `null` | no | | [mtu](#input\_mtu) | The network MTU (If set to 0, meaning MTU is unset - defaults to '1460'). Recommended values: 1460 (default for historic reasons), 1500 (Internet default), or 8896 (for Jumbo packets). Allowed are all values in the range 1300 to 8896, inclusively. | `number` | `0` | no | | [network\_firewall\_policy\_enforcement\_order](#input\_network\_firewall\_policy\_enforcement\_order) | Set the order that Firewall Rules and Firewall Policies are evaluated. Valid values are `BEFORE_CLASSIC_FIREWALL` and `AFTER_CLASSIC_FIREWALL`. (default null or equivalent to `AFTER_CLASSIC_FIREWALL`) | `string` | `null` | no | @@ -68,7 +68,7 @@ limitations under the License. | [routing\_mode](#input\_routing\_mode) | The network routing mode (default 'GLOBAL') | `string` | `"GLOBAL"` | no | | [secondary\_ranges](#input\_secondary\_ranges) | Secondary ranges that will be used in some of the subnets | `map(list(object({ range_name = string, ip_cidr_range = string })))` | `{}` | no | | [shared\_vpc\_host](#input\_shared\_vpc\_host) | Makes this project a Shared VPC host if 'true' (default 'false') | `bool` | `false` | no | -| [subnets](#input\_subnets) | The list of subnets being created |
list(object({
subnet_name = string
subnet_ip = string
subnet_region = string
subnet_private_access = optional(string)
subnet_private_ipv6_access = optional(string)
subnet_flow_logs = optional(string)
subnet_flow_logs_interval = optional(string)
subnet_flow_logs_sampling = optional(string)
subnet_flow_logs_metadata = optional(string)
subnet_flow_logs_filter = optional(string)
subnet_flow_logs_metadata_fields = optional(list(string))
description = optional(string)
purpose = optional(string)
role = optional(string)
stack_type = optional(string)
ipv6_access_type = optional(string)
}))
| n/a | yes | +| [subnets](#input\_subnets) | The list of subnets being created |
list(object({
subnet_name = string
subnet_ip = string
subnet_region = string
subnet_private_access = optional(string)
subnet_private_ipv6_access = optional(string)
subnet_flow_logs = optional(string)
subnet_flow_logs_interval = optional(string)
subnet_flow_logs_sampling = optional(string)
subnet_flow_logs_metadata = optional(string)
subnet_flow_logs_filter = optional(string)
subnet_flow_logs_metadata_fields = optional(list(string))
description = optional(string)
purpose = optional(string)
role = optional(string)
stack_type = optional(string)
ipv6_access_type = optional(string)
}))
| n/a | yes | ## Outputs diff --git a/community/modules/remote-desktop/chrome-remote-desktop/README.md b/community/modules/remote-desktop/chrome-remote-desktop/README.md index ee7ee37357..f2f2f1966c 100644 --- a/community/modules/remote-desktop/chrome-remote-desktop/README.md +++ b/community/modules/remote-desktop/chrome-remote-desktop/README.md @@ -74,29 +74,29 @@ No resources. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [add\_deployment\_name\_before\_prefix](#input\_add\_deployment\_name\_before\_prefix) | If true, the names of VMs and disks will always be prefixed with `deployment_name` to enable uniqueness across deployments.
See `name_prefix` for further details on resource naming behavior. | `bool` | `false` | no | +| [add\_deployment\_name\_before\_prefix](#input\_add\_deployment\_name\_before\_prefix) | If true, the names of VMs and disks will always be prefixed with `deployment_name` to enable uniqueness across deployments.
See `name_prefix` for further details on resource naming behavior. | `bool` | `false` | no | | [auto\_delete\_boot\_disk](#input\_auto\_delete\_boot\_disk) | Controls if boot disk should be auto-deleted when instance is deleted. | `bool` | `true` | no | -| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image th
at supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-v
m-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | | [deployment\_name](#input\_deployment\_name) | Cluster Toolkit deployment name. Cloud resource names will include this value. | `string` | n/a | yes | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for instances. | `number` | `200` | no | | [disk\_type](#input\_disk\_type) | Disk type for instances. | `string` | `"pd-balanced"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | | [enable\_public\_ips](#input\_enable\_public\_ips) | If set to true, instances will have public IPs on the internet. | `bool` | `true` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. Requires virtual workstation accelerator if Nvidia Grid Drivers are required |
list(object({
type = string,
count = number
}))
|
[
{
"count": 1,
"type": "nvidia-tesla-t4-vws"
}
]
| no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. Requires virtual workstation accelerator if Nvidia Grid Drivers are required |
list(object({
type = string,
count = number
}))
|
[
{
"count": 1,
"type": "nvidia-tesla-t4-vws"
}
]
| no | | [install\_nvidia\_driver](#input\_install\_nvidia\_driver) | Installs the nvidia driver (true/false). For details, see https://cloud.google.com/compute/docs/gpus/install-drivers-gpu | `bool` | n/a | yes | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is
name="debian-12-bookworm-v20240815" and project="debian-cloud".
NOTE: uses fixed version of image to avoid NVIDIA driver compatibility issues.

An alternative image is from name="ubuntu-2204-jammy-v20240126" and project="ubuntu-os-cloud".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"name": "debian-12-bookworm-v20240815",
"project": "debian-cloud"
}
| no | +| [instance\_image](#input\_instance\_image) | Image used to build chrome remote desktop node. The default image is
name="debian-12-bookworm-v20240815" and project="debian-cloud".
NOTE: uses fixed version of image to avoid NVIDIA driver compatibility issues.

An alternative image is from name="ubuntu-2204-jammy-v20240126" and project="ubuntu-os-cloud".

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted. | `map(string)` |
{
"name": "debian-12-bookworm-v20240815",
"project": "debian-cloud"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | `{}` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation. Must be N1 family if GPU is used. | `string` | `"n1-standard-8"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | -| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | -| [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface
**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `enable_public_ips` also do not apply
to network interfaces defined in this variable.
Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET", "RDMA", "IRDMA", "MRDMA"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | +| [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface
**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `enable_public_ips` also do not apply
to network interfaces defined in this variable.
Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET", "RDMA", "IRDMA", "MRDMA"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network to attach the VM. | `string` | `"default"` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | +| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Describes maintenance behavior for the instance. If left blank this will default to `MIGRATE` except for when `placement_policy`, spot provisioning, or GPUs require it to be `TERMINATE` | `string` | `"TERMINATE"` | no | | [project\_id](#input\_project\_id) | Project in which Google Cloud resources will be created | `string` | n/a | yes | | [region](#input\_region) | Default region for creating resources | `string` | n/a | yes | -| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#service_account. |
object({
email = string,
scopes = set(string)
})
|
{
"email": null,
"scopes": [
"https://www.googleapis.com/auth/cloud-platform"
]
}
| no | +| [service\_account](#input\_service\_account) | Service account to attach to the instance. See https://www.terraform.io/docs/providers/google/r/compute_instance_template.html#service_account. |
object({
email = string,
scopes = set(string)
})
|
{
"email": null,
"scopes": [
"https://www.googleapis.com/auth/cloud-platform"
]
}
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [startup\_script](#input\_startup\_script) | Startup script used on the instance | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to attach the VM. | `string` | `null` | no | diff --git a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md index ddf940b92a..b67c388d4e 100644 --- a/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md +++ b/community/modules/scheduler/schedmd-slurm-gcp-v6-controller/README.md @@ -251,19 +251,19 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | -| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | -| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | +| [additional\_disks](#input\_additional\_disks) | List of maps of disks. |
list(object({
disk_name = string
device_name = string
disk_type = string
disk_size_gb = number
disk_labels = map(string)
auto_delete = bool
boot = bool
}))
| `[]` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Configures the network interface card and the maximum egress bandwidth for VMs.
- Setting `platform_default` respects the Google Cloud Platform API default values for networking.
- Setting `virtio_enabled` explicitly selects the VirtioNet network adapter.
- Setting `gvnic_enabled` selects the gVNIC network adapter (without Tier 1 high bandwidth).
- Setting `tier_1_enabled` selects both the gVNIC adapter and Tier 1 high bandwidth networking.
- Note: both gVNIC and Tier 1 networking require a VM image with gVNIC support as well as specific VM families and shapes.
- See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"platform_default"` | no | | [bucket\_dir](#input\_bucket\_dir) | Bucket directory for cluster files to be put into. If not specified, then one will be chosen based on slurm\_cluster\_name. | `string` | `null` | no | -| [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | +| [bucket\_name](#input\_bucket\_name) | Name of GCS bucket.
Ignored when 'create\_bucket' is true. | `string` | `null` | no | | [can\_ip\_forward](#input\_can\_ip\_forward) | Enable IP forwarding, for NAT instances for example. | `bool` | `false` | no | | [cgroup\_conf\_tpl](#input\_cgroup\_conf\_tpl) | Slurm cgroup.conf template file path. | `string` | `null` | no | -| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | -| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | +| [cloud\_parameters](#input\_cloud\_parameters) | cloud.conf options. Defaults inherited from [Slurm GCP repo](https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/terraform/slurm_cluster/modules/slurm_files/README_TF.md#input_cloud_parameters) |
object({
no_comma_params = optional(bool)
resume_rate = optional(number)
resume_timeout = optional(number)
suspend_rate = optional(number)
suspend_timeout = optional(number)
topology_plugin = optional(string)
topology_param = optional(string)
tree_width = optional(number)
})
| `{}` | no | +| [cloudsql](#input\_cloudsql) | Use this database instead of the one on the controller.
server\_ip : Address of the database server.
user : The user to access the database as.
password : The password, given the user, to access the given database. (sensitive)
db\_name : The database to access.
user\_managed\_replication : The list of location and (optional) kms\_key\_name for secret |
object({
server_ip = string
user = string
password = string # sensitive
db_name = string
user_managed_replication = optional(list(object({
location = string
kms_key_name = optional(string)
})), [])
})
| `null` | no | | [compute\_startup\_script](#input\_compute\_startup\_script) | Startup script used by the compute VMs. | `string` | `"# no-op"` | no | -| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [compute\_startup\_scripts\_timeout](#input\_compute\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in compute\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [controller\_startup\_script](#input\_controller\_startup\_script) | Startup script used by the controller VM. | `string` | `"# no-op"` | no | -| [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [controller\_startup\_scripts\_timeout](#input\_controller\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in controller\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [create\_bucket](#input\_create\_bucket) | Create GCS bucket instead of using an existing one. | `bool` | `true` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment. | `string` | n/a | yes | | [disable\_controller\_public\_ips](#input\_disable\_controller\_public\_ips) | DEPRECATED: Use `enable_controller_public_ips` instead. | `bool` | `null` | no | @@ -273,56 +273,56 @@ limitations under the License. | [disk\_labels](#input\_disk\_labels) | Labels specific to the boot disk. These will be merged with var.labels. | `map(string)` | `{}` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Boot disk size in GB. | `number` | `50` | no | | [disk\_type](#input\_disk\_type) | Boot disk type, can be either hyperdisk-balanced, pd-ssd, pd-standard, pd-balanced, or pd-extreme. | `string` | `"pd-ssd"` | no | -| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into big query.

NOTE: Requires Google Bigquery API. | `bool` | `false` | no | -| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes will be destroyed. | `bool` | `true` | no | +| [enable\_bigquery\_load](#input\_enable\_bigquery\_load) | Enables loading of cluster job usage into BigQuery.

NOTE: Requires Google BigQuery API. | `bool` | `false` | no | +| [enable\_cleanup\_compute](#input\_enable\_cleanup\_compute) | Enables automatic cleanup of compute nodes and resource policies (e.g.
placement groups) managed by this module, when cluster is destroyed.

*WARNING*: Toggling this off will impact the running workload.
Deployed compute nodes will be destroyed. | `bool` | `true` | no | | [enable\_confidential\_vm](#input\_enable\_confidential\_vm) | Enable the Confidential VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_controller\_public\_ips](#input\_enable\_controller\_public\_ips) | If set to true. The controller will have a random public IP assigned to it. Ignored if access\_config is set. | `bool` | `false` | no | | [enable\_debug\_logging](#input\_enable\_debug\_logging) | Enables debug logging mode. | `bool` | `false` | no | -| [enable\_default\_mounts](#input\_enable\_default\_mounts) | Enable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `true` | no | +| [enable\_default\_mounts](#input\_enable\_default\_mounts) | Enable default global network storage from the controller
- /usr/local/etc/slurm
- /etc/munge
- /home
- /apps
Warning: If these are disabled, the slurm etc and munge dirs must be added
manually, or some other mechanism must be used to synchronize the slurm conf
files and the munge key across the cluster. | `bool` | `true` | no | | [enable\_devel](#input\_enable\_devel) | DEPRECATED: `enable_devel` is always on. | `bool` | `null` | no | -| [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/tools/prologs-epilogs/README.md | `bool` | `null` | no | -| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | +| [enable\_external\_prolog\_epilog](#input\_enable\_external\_prolog\_epilog) | Automatically enable a script that will execute prolog and epilog scripts
shared by NFS from the controller to compute nodes. Find more details at:
https://github.com/GoogleCloudPlatform/slurm-gcp/blob/master/tools/prologs-epilogs/README.md | `bool` | `null` | no | +| [enable\_oslogin](#input\_enable\_oslogin) | Enables Google Cloud os-login for user login and authentication for VMs.
See https://cloud.google.com/compute/docs/oslogin | `bool` | `true` | no | | [enable\_shielded\_vm](#input\_enable\_shielded\_vm) | Enable the Shielded VM configuration. Note: the instance image must support option. | `bool` | `false` | no | | [enable\_slurm\_gcp\_plugins](#input\_enable\_slurm\_gcp\_plugins) | Enables calling hooks in scripts/slurm\_gcp\_plugins during cluster resume and suspend. | `any` | `false` | no | | [enable\_smt](#input\_enable\_smt) | Enables Simultaneous Multi-Threading (SMT) on instance. | `bool` | `false` | no | -| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": "beta"
}
| no | -| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | +| [endpoint\_versions](#input\_endpoint\_versions) | Version of the API to use (The compute service is the only API currently supported) |
object({
compute = string
})
|
{
"compute": "beta"
}
| no | +| [epilog\_scripts](#input\_epilog\_scripts) | List of scripts to be used for Epilog. Programs for the slurmd to execute
on every node when a user's job completes.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Epilog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | | [extra\_logging\_flags](#input\_extra\_logging\_flags) | The only available flag is `trace_api` | `map(bool)` | `{}` | no | | [gcloud\_path\_override](#input\_gcloud\_path\_override) | Directory of the gcloud executable to be used during cleanup | `string` | `""` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | -| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | -| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [instance\_image](#input\_instance\_image) | Defines the image that will be used in the Slurm controller VM instance.

Expected Fields:
name: The name of the image. Mutually exclusive with family.
family: The image family to use. Mutually exclusive with name.
project: The project where the image is hosted.

For more information on creating custom images that comply with Slurm on GCP
see the "Slurm on GCP Custom Images" section in docs/vm-images.md. | `map(string)` |
{
"family": "slurm-gcp-6-6-hpc-rocky-linux-8",
"project": "schedmd-slurm-public"
}
| no | +| [instance\_image\_custom](#input\_instance\_image\_custom) | A flag that designates that the user is aware that they are requesting
to use a custom and potentially incompatible image for this Slurm on
GCP module.

If the field is set to false, only the compatible families and project
names will be accepted. The deployment will fail with any other image
family or name. If set to true, no checks will be done.

See: https://goo.gle/hpc-slurm-images | `bool` | `false` | no | | [instance\_template](#input\_instance\_template) | DEPRECATED: Instance template can not be specified for controller. | `string` | `null` | no | | [labels](#input\_labels) | Labels, provided as a map. | `map(string)` | `{}` | no | -| [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | -| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
additional_networks = optional(list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | +| [login\_network\_storage](#input\_login\_network\_storage) | An array of network attached storage mounts to be configured on all login nodes. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
}))
| `[]` | no | +| [login\_nodes](#input\_login\_nodes) | List of slurm login instance definitions. |
list(object({
name_prefix = string
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
additional_networks = optional(list(object({
access_config = optional(list(object({
nat_ip = string
network_tier = string
})), [])
alias_ip_range = optional(list(object({
ip_cidr_range = string
subnetwork_range_name = string
})), [])
ipv6_access_config = optional(list(object({
network_tier = string
})), [])
network = optional(string)
network_ip = optional(string, "")
nic_type = optional(string)
queue_count = optional(number)
stack_type = optional(string)
subnetwork = optional(string)
subnetwork_project = optional(string)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string, "n1-standard-1")
enable_confidential_vm = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
num_instances = optional(number, 1)
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
static_ips = optional(list(string), [])
subnetwork = string
spot = optional(bool, false)
tags = optional(list(string), [])
zone = optional(string)
termination_action = optional(string)
}))
| `[]` | no | | [login\_startup\_script](#input\_login\_startup\_script) | Startup script used by the login VMs. | `string` | `"# no-op"` | no | -| [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | +| [login\_startup\_scripts\_timeout](#input\_login\_startup\_scripts\_timeout) | The timeout (seconds) applied to each script in login\_startup\_scripts. If
any script exceeds this timeout, then the instance setup process is considered
failed and handled accordingly.

NOTE: When set to 0, the timeout is considered infinite and thus disabled. | `number` | `300` | no | | [machine\_type](#input\_machine\_type) | Machine type to create. | `string` | `"c2-standard-4"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map. | `map(string)` | `{}` | no | -| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | -| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | -| [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | -| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | +| [min\_cpu\_platform](#input\_min\_cpu\_platform) | Specifies a minimum CPU platform. Applicable values are the friendly names of
CPU platforms, such as Intel Haswell or Intel Skylake. See the complete list:
https://cloud.google.com/compute/docs/instances/specify-min-cpu-platform | `string` | `null` | no | +| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured on all instances. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
}))
| `[]` | no | +| [nodeset](#input\_nodeset) | Define nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 1)
node_conf = optional(map(string), {})
nodeset_name = string
additional_disks = optional(list(object({
disk_name = optional(string)
device_name = optional(string)
disk_size_gb = optional(number)
disk_type = optional(string)
disk_labels = optional(map(string), {})
auto_delete = optional(bool, true)
boot = optional(bool, false)
})), [])
bandwidth_tier = optional(string, "platform_default")
can_ip_forward = optional(bool, false)
disable_smt = optional(bool, false)
disk_auto_delete = optional(bool, true)
disk_labels = optional(map(string), {})
disk_size_gb = optional(number)
disk_type = optional(string)
enable_confidential_vm = optional(bool, false)
enable_placement = optional(bool, false)
enable_oslogin = optional(bool, true)
enable_shielded_vm = optional(bool, false)
enable_maintenance_reservation = optional(bool, true)
gpu = optional(object({
count = number
type = string
}))
labels = optional(map(string), {})
machine_type = optional(string)
maintenance_interval = optional(string)
instance_properties_json = string
metadata = optional(map(string), {})
min_cpu_platform = optional(string)
network_tier = optional(string, "STANDARD")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
on_host_maintenance = optional(string)
preemptible = optional(bool, false)
region = optional(string)
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
shielded_instance_config = optional(object({
enable_integrity_monitoring = optional(bool, true)
enable_secure_boot = optional(bool, true)
enable_vtpm = optional(bool, true)
}))
source_image_family = optional(string)
source_image_project = optional(string)
source_image = optional(string)
subnetwork_self_link = string
additional_networks = optional(list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
})))
access_config = optional(list(object({
nat_ip = string
network_tier = string
})))
spot = optional(bool, false)
tags = optional(list(string), [])
termination_action = optional(string)
reservation_name = optional(string)
startup_script = optional(list(object({
filename = string
content = string })), [])

zone_target_shape = string
zone_policy_allow = set(string)
zone_policy_deny = set(string)
}))
| `[]` | no | +| [nodeset\_dyn](#input\_nodeset\_dyn) | Defines dynamic nodesets, as a list. |
list(object({
nodeset_name = string
nodeset_feature = string
}))
| `[]` | no | +| [nodeset\_tpu](#input\_nodeset\_tpu) | Define TPU nodesets, as a list. |
list(object({
node_count_static = optional(number, 0)
node_count_dynamic_max = optional(number, 5)
nodeset_name = string
enable_public_ip = optional(bool, false)
node_type = string
accelerator_config = optional(object({
topology = string
version = string
}), {
topology = ""
version = ""
})
tf_version = string
preemptible = optional(bool, false)
preserve_tpu = optional(bool, false)
zone = string
data_disks = optional(list(string), [])
docker_image = optional(string, "")
network_storage = optional(list(object({
server_ip = string
remote_mount = string
local_mount = string
fs_type = string
mount_options = string
client_install_runner = optional(map(string))
mount_runner = optional(map(string))
})), [])
subnetwork = string
service_account = optional(object({
email = optional(string)
scopes = optional(list(string), ["https://www.googleapis.com/auth/cloud-platform"])
}))
project_id = string
reserved = optional(string, false)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Instance availability Policy. | `string` | `"MIGRATE"` | no | -| [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
partition_name = string
partition_conf = optional(map(string), {})
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
enable_job_exclusive = optional(bool, false)
}))
| n/a | yes | +| [partitions](#input\_partitions) | Cluster partitions as a list. See module slurm\_partition. |
list(object({
partition_name = string
partition_conf = optional(map(string), {})
partition_nodeset = optional(list(string), [])
partition_nodeset_dyn = optional(list(string), [])
partition_nodeset_tpu = optional(list(string), [])
enable_job_exclusive = optional(bool, false)
}))
| n/a | yes | | [preemptible](#input\_preemptible) | Allow the instance to be preempted. | `bool` | `false` | no | | [project\_id](#input\_project\_id) | Project ID to create resources in. | `string` | n/a | yes | -| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | +| [prolog\_scripts](#input\_prolog\_scripts) | List of scripts to be used for Prolog. Programs for the slurmd to execute
whenever it is asked to run a job step from a new job allocation.
See https://slurm.schedmd.com/slurm.conf.html#OPT_Prolog. |
list(object({
filename = string
content = optional(string)
source = optional(string)
}))
| `[]` | no | | [region](#input\_region) | The default region to place resources in. | `string` | n/a | yes | -| [service\_account](#input\_service\_account) | DEPRECATED: Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string
scopes = set(string)
})
| `null` | no | +| [service\_account](#input\_service\_account) | DEPRECATED: Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to attach to the controller instance. | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the controller instance. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | -| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | -| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting.
If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to attach to the controller instance. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [shielded\_instance\_config](#input\_shielded\_instance\_config) | Shielded VM configuration for the instance. Note: not used unless
enable\_shielded\_vm is 'true'.
enable\_integrity\_monitoring : Compare the most recent boot measurements to the
integrity policy baseline and return a pair of pass/fail results depending on
whether they match or not.
enable\_secure\_boot : Verify the digital signature of all boot components, and
halt the boot process if signature verification fails.
enable\_vtpm : Use a virtualized trusted platform module, which is a
specialized computer chip you can use to encrypt objects like keys and
certificates. |
object({
enable_integrity_monitoring = bool
enable_secure_boot = bool
enable_vtpm = bool
})
|
{
"enable_integrity_monitoring": true,
"enable_secure_boot": true,
"enable_vtpm": true
}
| no | +| [slurm\_cluster\_name](#input\_slurm\_cluster\_name) | Cluster name, used for resource naming and slurm accounting.
If not provided it will default to the first 8 characters of the deployment name (removing any invalid characters). | `string` | `null` | no | | [slurm\_conf\_tpl](#input\_slurm\_conf\_tpl) | Slurm slurm.conf template file path. | `string` | `null` | no | | [slurmdbd\_conf\_tpl](#input\_slurmdbd\_conf\_tpl) | Slurm slurmdbd.conf template file path. | `string` | `null` | no | | [static\_ips](#input\_static\_ips) | List of static IPs for VM instances. | `list(string)` | `[]` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | Subnet to deploy to. | `string` | n/a | yes | | [tags](#input\_tags) | Network tag list. | `list(string)` | `[]` | no | | [universe\_domain](#input\_universe\_domain) | Domain address for alternate API universe | `string` | `"googleapis.com"` | no | -| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | +| [zone](#input\_zone) | Zone where the instances should be created. If not specified, instances will be
spread across available zones in the region. | `string` | `null` | no | ## Outputs diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index c45a3ed83e..21cb2f9daf 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -297,12 +297,14 @@ limitations under the License. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | +| [is\_gke\_sandbox](#input\_is\_gke\_sandbox) | Temporary variable to identify the GKE sandbox environment | `bool` | `false` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [local\_ssd\_count\_nvme\_block](#input\_local\_ssd\_count\_nvme\_block) | The number of local SSDs to attach to each node to back block storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | | [machine\_type](#input\_machine\_type) | The name of a Google Compute Engine machine type. | `string` | `"c2-standard-60"` | no | | [name](#input\_name) | The name of the node pool. If left blank, will default to the machine type. | `string` | `null` | no | +| [node\_version](#input\_node\_version) | Temporary variable to explicitly set the node version | `string` | `null` | no | | [placement\_policy](#input\_placement\_policy) | Group placement policy to use for the node pool's nodes. `COMPACT` is the only supported value for `type` currently. `name` is the name of the placement policy.
It is assumed that the specified policy exists. To create a placement policy refer to https://cloud.google.com/sdk/gcloud/reference/compute/resource-policies/create/group-placement.
Note: Placement policies have the [following](https://cloud.google.com/compute/docs/instances/placement-policies-overview#restrictions-compact-policies) restrictions. |
object({
type = string
name = optional(string)
})
|
{
"name": null,
"type": null
}
| no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | | [reservation\_affinity](#input\_reservation\_affinity) | Reservation resource to consume. When targeting SPECIFIC\_RESERVATION, specific\_reservations needs be specified.
Even though specific\_reservations is a list, only one reservation is allowed by the NodePool API.
It is assumed that the specified reservation exists and has available capacity.
For a shared reservation, specify the project\_id as well in which it was created.
To create a reservation refer to https://cloud.google.com/compute/docs/instances/reservations-single-project and https://cloud.google.com/compute/docs/instances/reservations-shared |
object({
consume_reservation_type = string
specific_reservations = optional(list(object({
name = string
project = optional(string)
})))
})
|
{
"consume_reservation_type": "NO_RESERVATION",
"specific_reservations": []
}
| no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 59cbe1d911..9c126dcc65 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -44,6 +44,7 @@ resource "google_container_node_pool" "node_pool" { name = var.name == null ? var.machine_type : var.name cluster = var.cluster_id node_locations = var.zones + version = var.is_gke_sandbox ? var.node_version : null node_count = var.static_node_count dynamic "autoscaling" { diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 6e24edaa02..877fae56a8 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -354,3 +354,17 @@ variable "host_maintenance_interval" { error_message = "Invalid host_maintenance_interval value. Must be PERIODIC, AS_NEEDED or the empty string" } } + +# REMOVE_ME: It's a temporary variable used in internal testing +variable "is_gke_sandbox" { + description = "Temporary variable to identify the GKE sandbox environment" + default = false + type = bool +} + +# REMOVE_ME: It's a temporary variable used in internal testing +variable "node_version" { + description = "Temporary variable to explicitly set the node version" + type = string + default = null +} diff --git a/modules/compute/vm-instance/README.md b/modules/compute/vm-instance/README.md index ce1b93f949..e5e4ffe63d 100644 --- a/modules/compute/vm-instance/README.md +++ b/modules/compute/vm-instance/README.md @@ -206,43 +206,43 @@ limitations under the License. | Name | Description | Type | Default | Required | |------|-------------|------|---------|:--------:| -| [add\_deployment\_name\_before\_prefix](#input\_add\_deployment\_name\_before\_prefix) | If true, the names of VMs and disks will always be prefixed with `deployment_name` to enable uniqueness across deployments.
See `name_prefix` for further details on resource naming behavior. | `bool` | `false` | no | -| [allocate\_ip](#input\_allocate\_ip) | If not null, allocate IPs with the given configuration. See details at
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_address |
object({
address_type = optional(string, "INTERNAL")
purpose = optional(string),
network_tier = optional(string),
ip_version = optional(string, "IPV4"),
})
| `null` | no | -| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | +| [add\_deployment\_name\_before\_prefix](#input\_add\_deployment\_name\_before\_prefix) | If true, the names of VMs and disks will always be prefixed with `deployment_name` to enable uniqueness across deployments.
See `name_prefix` for further details on resource naming behavior. | `bool` | `false` | no | +| [allocate\_ip](#input\_allocate\_ip) | If not null, allocate IPs with the given configuration. See details at
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_address |
object({
address_type = optional(string, "INTERNAL")
purpose = optional(string),
network_tier = optional(string),
ip_version = optional(string, "IPV4"),
})
| `null` | no | +| [allow\_automatic\_updates](#input\_allow\_automatic\_updates) | If false, disables automatic system package updates on the created instances. This feature is
only available on supported images (or images derived from them). For more details, see
https://cloud.google.com/compute/docs/instances/create-hpc-vm#disable_automatic_updates | `bool` | `true` | no | | [auto\_delete\_boot\_disk](#input\_auto\_delete\_boot\_disk) | Controls if boot disk should be auto-deleted when instance is deleted. | `bool` | `true` | no | | [automatic\_restart](#input\_automatic\_restart) | Specifies if the instance should be restarted if it was terminated by Compute Engine (not a user). | `bool` | `null` | no | -| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | +| [bandwidth\_tier](#input\_bandwidth\_tier) | Tier 1 bandwidth increases the maximum egress bandwidth for VMs.
Using the `tier_1_enabled` setting will enable both gVNIC and TIER\_1 higher bandwidth networking.
Using the `gvnic_enabled` setting will only enable gVNIC and will not enable TIER\_1.
Note that TIER\_1 only works with specific machine families & shapes and must be using an image that supports gVNIC. See [official docs](https://cloud.google.com/compute/docs/networking/configure-vm-with-high-bandwidth-configuration) for more details. | `string` | `"not_enabled"` | no | | [deployment\_name](#input\_deployment\_name) | Name of the deployment, will optionally be used name resources according to `name_prefix` | `string` | n/a | yes | | [disable\_public\_ips](#input\_disable\_public\_ips) | If set to true, instances will not have public IPs | `bool` | `false` | no | | [disk\_size\_gb](#input\_disk\_size\_gb) | Size of disk for instances. | `number` | `200` | no | | [disk\_type](#input\_disk\_type) | Disk type for instances. | `string` | `"pd-standard"` | no | | [enable\_oslogin](#input\_enable\_oslogin) | Enable or Disable OS Login with "ENABLE" or "DISABLE". Set to "INHERIT" to inherit project OS Login setting. | `string` | `"ENABLE"` | no | -| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | +| [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = string,
count = number
}))
| `[]` | no | | [instance\_count](#input\_instance\_count) | Number of instances | `number` | `1` | no | -| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
{
"family": "hpc-rocky-linux-8",
"project": "cloud-hpc-image-public"
}
| no | +| [instance\_image](#input\_instance\_image) | Instance Image | `map(string)` |
{
"family": "hpc-rocky-linux-8",
"project": "cloud-hpc-image-public"
}
| no | | [labels](#input\_labels) | Labels to add to the instances. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count](#input\_local\_ssd\_count) | The number of local SSDs to attach to each VM. See https://cloud.google.com/compute/docs/disks/local-ssd. | `number` | `0` | no | | [local\_ssd\_interface](#input\_local\_ssd\_interface) | Interface to be used with local SSDs. Can be either 'NVME' or 'SCSI'. No effect unless `local_ssd_count` is also set. | `string` | `"NVME"` | no | | [machine\_type](#input\_machine\_type) | Machine type to use for the instance creation | `string` | `"c2-standard-60"` | no | | [metadata](#input\_metadata) | Metadata, provided as a map | `map(string)` | `{}` | no | | [min\_cpu\_platform](#input\_min\_cpu\_platform) | The name of the minimum CPU platform that you want the instance to use. | `string` | `null` | no | -| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | -| [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface

**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `disable_public_ips` also do not apply
to network interfaces defined in this variable.

Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | +| [name\_prefix](#input\_name\_prefix) | An optional name for all VM and disk resources.
If not supplied, `deployment_name` will be used.
When `name_prefix` is supplied, and `add_deployment_name_before_prefix` is set,
then resources are named by "<`deployment_name`>-<`name_prefix`>-<#>". | `string` | `null` | no | +| [network\_interfaces](#input\_network\_interfaces) | A list of network interfaces. The options match that of the terraform
network\_interface block of google\_compute\_instance. For descriptions of the
subfields or more information see the documentation:
https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_instance#nested_network_interface

**\_NOTE:\_** If `network_interfaces` are set, `network_self_link` and
`subnetwork_self_link` will be ignored, even if they are provided through
the `use` field. `bandwidth_tier` and `disable_public_ips` also do not apply
to network interfaces defined in this variable.

Subfields:
network (string, required if subnetwork is not supplied)
subnetwork (string, required if network is not supplied)
subnetwork\_project (string, optional)
network\_ip (string, optional)
nic\_type (string, optional, choose from ["GVNIC", "VIRTIO\_NET"])
stack\_type (string, optional, choose from ["IPV4\_ONLY", "IPV4\_IPV6"])
queue\_count (number, optional)
access\_config (object, optional)
ipv6\_access\_config (object, optional)
alias\_ip\_range (list(object), optional) |
list(object({
network = string,
subnetwork = string,
subnetwork_project = string,
network_ip = string,
nic_type = string,
stack_type = string,
queue_count = number,
access_config = list(object({
nat_ip = string,
public_ptr_domain_name = string,
network_tier = string
})),
ipv6_access_config = list(object({
public_ptr_domain_name = string,
network_tier = string
})),
alias_ip_range = list(object({
ip_cidr_range = string,
subnetwork_range_name = string
}))
}))
| `[]` | no | | [network\_self\_link](#input\_network\_self\_link) | The self link of the network to attach the VM. Can use "default" for the default network. | `string` | `null` | no | -| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | +| [network\_storage](#input\_network\_storage) | An array of network attached storage mounts to be configured. |
list(object({
server_ip = string,
remote_mount = string,
local_mount = string,
fs_type = string,
mount_options = string,
client_install_runner = map(string)
mount_runner = map(string)
}))
| `[]` | no | | [on\_host\_maintenance](#input\_on\_host\_maintenance) | Describes maintenance behavior for the instance. If left blank this will default to `MIGRATE` except for when `placement_policy`, spot provisioning, or GPUs require it to be `TERMINATE` | `string` | `null` | no | -| [placement\_policy](#input\_placement\_policy) | Control where your VM instances are physically located relative to each other within a zone.
See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#nested_group_placement_policy | `any` | `null` | no | +| [placement\_policy](#input\_placement\_policy) | Control where your VM instances are physically located relative to each other within a zone.
See https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/compute_resource_policy#nested_group_placement_policy | `any` | `null` | no | | [pre\_existing\_placement\_policy](#input\_pre\_existing\_placement\_policy) | Use pre-existing placement policy within the project specified | `string` | `null` | no | | [project\_id](#input\_project\_id) | Project in which the HPC deployment will be created | `string` | n/a | yes | | [region](#input\_region) | The region to deploy to | `string` | n/a | yes | -| [service\_account](#input\_service\_account) | DEPRECATED - Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string,
scopes = set(string)
})
| `null` | no | +| [service\_account](#input\_service\_account) | DEPRECATED - Use `service_account_email` and `service_account_scopes` instead. |
object({
email = string,
scopes = set(string)
})
| `null` | no | | [service\_account\_email](#input\_service\_account\_email) | Service account e-mail address to use with the node pool | `string` | `null` | no | -| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | +| [service\_account\_scopes](#input\_service\_account\_scopes) | Scopes to use with the node pool. | `set(string)` |
[
"https://www.googleapis.com/auth/cloud-platform"
]
| no | | [spot](#input\_spot) | Provision VMs using discounted Spot pricing, allowing for preemption | `bool` | `false` | no | | [startup\_script](#input\_startup\_script) | Startup script used on the instance | `string` | `null` | no | | [subnetwork\_self\_link](#input\_subnetwork\_self\_link) | The self link of the subnetwork to attach the VM. | `string` | `null` | no | | [tags](#input\_tags) | Network tags, provided as a list | `list(string)` | `[]` | no | -| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | +| [threads\_per\_core](#input\_threads\_per\_core) | Sets the number of threads per physical core. By setting threads\_per\_core
to 2, Simultaneous Multithreading (SMT) is enabled extending the total number
of virtual cores. For example, a machine of type c2-standard-60 will have 60
virtual cores with threads\_per\_core equal to 2. With threads\_per\_core equal
to 1 (SMT turned off), only the 30 physical cores will be available on the VM.

The default value of \"0\" will turn off SMT for supported machine types, and
will fall back to GCE defaults for unsupported machine types (t2d, shared-core
instances, or instances with less than 2 vCPU).

Disabling SMT can be more performant in many HPC workloads, therefore it is
disabled by default where compatible.

null = SMT configuration will use the GCE defaults for the machine type
0 = SMT will be disabled where compatible (default)
1 = SMT will always be disabled (will fail on incompatible machine types)
2 = SMT will always be enabled (will fail on incompatible machine types) | `number` | `0` | no | | [zone](#input\_zone) | Compute Platform zone | `string` | n/a | yes | ## Outputs diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index 3a72e1149b..b39e159e39 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -159,6 +159,7 @@ limitations under the License. | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. | `bool` | `true` | no | | [enable\_private\_nodes](#input\_enable\_private\_nodes) | (Beta) Whether nodes have internal IP addresses only. | `bool` | `true` | no | | [gcp\_public\_cidrs\_access\_enabled](#input\_gcp\_public\_cidrs\_access\_enabled) | Whether the cluster master is accessible via all the Google Compute Engine Public IPs. To view this list of IP addresses look here https://cloud.google.com/compute/docs/faq#find_ip_range | `bool` | `false` | no | +| [is\_gke\_sandbox](#input\_is\_gke\_sandbox) | Temporary variable to identify the GKE sandbox environment | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | | [maintenance\_start\_time](#input\_maintenance\_start\_time) | Start time for daily maintenance operations. Specified in GMT with `HH:MM` format. | `string` | `"09:00"` | no | @@ -170,6 +171,7 @@ limitations under the License. | [pods\_ip\_range\_name](#input\_pods\_ip\_range\_name) | The name of the secondary subnet ip range to use for pods. | `string` | `"pods"` | no | | [prefix\_with\_deployment\_name](#input\_prefix\_with\_deployment\_name) | If true, cluster name will be prefixed by `deployment_name` (ex: -). | `bool` | `true` | no | | [project\_id](#input\_project\_id) | The project ID to host the cluster in. | `string` | n/a | yes | +| [rdma\_subnetwork\_name\_prefix](#input\_rdma\_subnetwork\_name\_prefix) | Prefix of the RDMA subnetwork names | `string` | `null` | no | | [region](#input\_region) | The region to host the cluster in. | `string` | n/a | yes | | [release\_channel](#input\_release\_channel) | The release channel of this cluster. Accepted values are `UNSPECIFIED`, `RAPID`, `REGULAR` and `STABLE`. | `string` | `"UNSPECIFIED"` | no | | [service\_account](#input\_service\_account) | DEPRECATED: use service\_account\_email and scopes. |
object({
email = string,
scopes = set(string)
})
| `null` | no | @@ -187,6 +189,7 @@ limitations under the License. | [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | +| [zone](#input\_zone) | Zone | `string` | `null` | no | ## Outputs diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 480d5b7d58..72514b2414 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -36,6 +36,44 @@ locals { # multi networking needs enabled Dataplane v2 derived_enable_dataplane_v2 = coalesce(var.enable_dataplane_v2, local.derived_enable_multi_networking) + + rdma_networks = [for network_info in var.additional_networks : network_info if strcontains(upper(network_info.nic_type), "RDMA")] + non_rdma_networks = [for network_info in var.additional_networks : network_info if !strcontains(upper(network_info.nic_type), "RDMA")] + apply_manifests_rdma_networks = flatten([ + for idx, network_info in local.rdma_networks : [ + { + source = "${path.module}/templates/gke-network-paramset.yaml.tftpl", + template_vars = { + name = "${var.rdma_subnetwork_name_prefix}-${idx}", + network_name = network_info.network + subnetwork_name = "${var.rdma_subnetwork_name_prefix}-${idx}", + device_mode = "RDMA" + } + }, + { + source = "${path.module}/templates/network-object.yaml.tftpl", + template_vars = { name = "${var.rdma_subnetwork_name_prefix}-${idx}" } + } + ] + ]) + + apply_manifests_non_rdma_networks = flatten([ + for idx, network_info in local.non_rdma_networks : [ + { + source = "${path.module}/templates/gke-network-paramset.yaml.tftpl", + template_vars = { + name = network_info.subnetwork + network_name = network_info.network + subnetwork_name = network_info.subnetwork + device_mode = "NetDevice" + } + }, + { + source = "${path.module}/templates/network-object.yaml.tftpl", + template_vars = { name = network_info.subnetwork } + } + ] + ]) } data 
"google_compute_default_service_account" "default_sa" { @@ -47,7 +85,7 @@ resource "google_container_cluster" "gke_cluster" { project = var.project_id name = local.name - location = var.region + location = var.is_gke_sandbox ? var.zone : var.region resource_labels = local.labels # decouple node pool lifecycle from cluster life cycle @@ -59,6 +97,10 @@ resource "google_container_cluster" "gke_cluster" { network = var.network_id subnetwork = var.subnetwork_self_link + # Note: Though the default value of VPC_NATIVE is sufficient to enable IP Aliasing, + # It makes sense to let that argument be explicit so that it remains in our consideration when upgrading the provider. + # Because, in the newer provider versions the default may change + networking_mode = "VPC_NATIVE" # Note: the existence of the "master_authorized_networks_config" block enables # the master authorized networks even if it's empty. @@ -196,9 +238,12 @@ resource "google_container_node_pool" "system_node_pools" { provider = google-beta count = var.system_node_pool_enabled ? 1 : 0 - project = var.project_id - name = var.system_node_pool_name - cluster = google_container_cluster.gke_cluster.self_link + project = var.project_id + name = var.system_node_pool_name + cluster = var.is_gke_sandbox ? google_container_cluster.gke_cluster.name : google_container_cluster.gke_cluster.self_link + version = var.min_master_version + location = var.is_gke_sandbox ? 
var.zone : null + autoscaling { total_min_node_count = var.system_node_pool_node_count.total_min_nodes total_max_node_count = var.system_node_pool_node_count.total_max_nodes @@ -338,20 +383,5 @@ module "kubectl_apply" { cluster_id = google_container_cluster.gke_cluster.id project_id = var.project_id - apply_manifests = flatten([ - for idx, network_info in var.additional_networks : [ - { - source = "${path.module}/templates/gke-network-paramset.yaml.tftpl", - template_vars = { - name = "vpc${idx + 1}", - network_name = network_info.network - subnetwork_name = network_info.subnetwork - } - }, - { - source = "${path.module}/templates/network-object.yaml.tftpl", - template_vars = { name = "vpc${idx + 1}" } - } - ] - ]) + apply_manifests = concat(local.apply_manifests_non_rdma_networks, local.apply_manifests_rdma_networks) } diff --git a/modules/scheduler/gke-cluster/templates/gke-network-paramset.yaml.tftpl b/modules/scheduler/gke-cluster/templates/gke-network-paramset.yaml.tftpl index fb7f0dba83..d376a1a760 100644 --- a/modules/scheduler/gke-cluster/templates/gke-network-paramset.yaml.tftpl +++ b/modules/scheduler/gke-cluster/templates/gke-network-paramset.yaml.tftpl @@ -6,4 +6,4 @@ metadata: spec: vpc: ${network_name} vpcSubnet: ${subnetwork_name} - deviceMode: NetDevice + deviceMode: ${device_mode} diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index e91be6b297..4088eae21c 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -327,3 +327,23 @@ variable "additional_networks" { })) })) } + +variable "rdma_subnetwork_name_prefix" { + description = "Prefix of the RDMA subnetwork names" + default = null + type = string +} + +# REMOVE_ME: It's a temporary variable used in internal testing +variable "is_gke_sandbox" { + description = "Temporary variable to identify the GKE sandbox environment" + default = false + type = bool +} + +# REMOVE_ME: It's a temporary 
variable used in internal testing +variable "zone" { + description = "Zone" + default = null + type = string +} diff --git a/modules/scheduler/pre-existing-gke-cluster/README.md b/modules/scheduler/pre-existing-gke-cluster/README.md index 519715480d..ada5676eb8 100644 --- a/modules/scheduler/pre-existing-gke-cluster/README.md +++ b/modules/scheduler/pre-existing-gke-cluster/README.md @@ -103,6 +103,7 @@ limitations under the License. | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks creates relevat network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [cluster\_name](#input\_cluster\_name) | Name of the existing cluster | `string` | n/a | yes | | [project\_id](#input\_project\_id) | Project that hosts the existing cluster | `string` | n/a | yes | +| [rdma\_subnetwork\_name\_prefix](#input\_rdma\_subnetwork\_name\_prefix) | Prefix of the RDMA subnetwork names | `string` | `null` | no | | [region](#input\_region) | Region in which to search for the cluster | `string` | n/a | yes | ## Outputs diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index 4b65ebe365..d8d3171361 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -20,26 +20,51 @@ data "google_container_cluster" "existing_gke_cluster" { location = var.region } -module "kubectl_apply" { - source = "../../management/kubectl-apply" # can point to github - - cluster_id = data.google_container_cluster.existing_gke_cluster.id - project_id = var.project_id +locals { + rdma_networks = [for network_info in var.additional_networks : network_info if strcontains(upper(network_info.nic_type), "RDMA")] + non_rdma_networks = [for network_info in var.additional_networks : network_info if !strcontains(upper(network_info.nic_type), "RDMA")] + apply_manifests_rdma_networks = flatten([ + for idx, network_info in local.rdma_networks : [ + { + source = "${path.module}/templates/gke-network-paramset.yaml.tftpl", + template_vars = { + name = "${var.rdma_subnetwork_name_prefix}-${idx}", + network_name = network_info.network + subnetwork_name = "${var.rdma_subnetwork_name_prefix}-${idx}", + device_mode = "RDMA" + } + }, + { + source = "${path.module}/templates/network-object.yaml.tftpl", + template_vars = { name = "${var.rdma_subnetwork_name_prefix}-${idx}" } + } + ] + ]) - apply_manifests = flatten([ - for idx, network_info in var.additional_networks : [ + apply_manifests_non_rdma_networks = flatten([ + for idx, network_info in 
local.non_rdma_networks : [ { source = "${path.module}/templates/gke-network-paramset.yaml.tftpl", template_vars = { - name = "vpc${idx + 1}", + name = network_info.subnetwork network_name = network_info.network subnetwork_name = network_info.subnetwork + device_mode = "NetDevice" } }, { source = "${path.module}/templates/network-object.yaml.tftpl", - template_vars = { name = "vpc${idx + 1}" } + template_vars = { name = network_info.subnetwork } } ] ]) } + +module "kubectl_apply" { + source = "../../management/kubectl-apply" # can point to github + + cluster_id = data.google_container_cluster.existing_gke_cluster.id + project_id = var.project_id + + apply_manifests = concat(local.apply_manifests_non_rdma_networks, local.apply_manifests_rdma_networks) +} diff --git a/modules/scheduler/pre-existing-gke-cluster/templates/gke-network-paramset.yaml.tftpl b/modules/scheduler/pre-existing-gke-cluster/templates/gke-network-paramset.yaml.tftpl index fb7f0dba83..d376a1a760 100644 --- a/modules/scheduler/pre-existing-gke-cluster/templates/gke-network-paramset.yaml.tftpl +++ b/modules/scheduler/pre-existing-gke-cluster/templates/gke-network-paramset.yaml.tftpl @@ -6,4 +6,4 @@ metadata: spec: vpc: ${network_name} vpcSubnet: ${subnetwork_name} - deviceMode: NetDevice + deviceMode: ${device_mode} diff --git a/modules/scheduler/pre-existing-gke-cluster/variables.tf b/modules/scheduler/pre-existing-gke-cluster/variables.tf index 67e7a24dca..9e9ed98ed3 100644 --- a/modules/scheduler/pre-existing-gke-cluster/variables.tf +++ b/modules/scheduler/pre-existing-gke-cluster/variables.tf @@ -53,3 +53,9 @@ variable "additional_networks" { })) })) } + +variable "rdma_subnetwork_name_prefix" { + description = "Prefix of the RDMA subnetwork names" + default = null + type = string +} From 5ca7d82e60599865b5d3490c6b317bb51a71d72c Mon Sep 17 00:00:00 2001 From: Atul Rajmane Date: Fri, 11 Oct 2024 13:41:50 +0000 Subject: [PATCH 2/2] Address Feedback --- community/modules/network/rdma-vpc/main.tf | 
2 +- modules/compute/gke-node-pool/README.md | 1 - modules/compute/gke-node-pool/main.tf | 2 +- modules/compute/gke-node-pool/variables.tf | 8 -------- modules/scheduler/gke-cluster/README.md | 5 +++-- modules/scheduler/gke-cluster/main.tf | 19 ++++++++++++------- modules/scheduler/gke-cluster/variables.tf | 18 +++++++++++------- .../pre-existing-gke-cluster/main.tf | 2 +- 8 files changed, 29 insertions(+), 28 deletions(-) diff --git a/community/modules/network/rdma-vpc/main.tf b/community/modules/network/rdma-vpc/main.tf index a166599c58..85f2125209 100644 --- a/community/modules/network/rdma-vpc/main.tf +++ b/community/modules/network/rdma-vpc/main.tf @@ -132,7 +132,7 @@ locals { for subnet in module.vpc.subnets : { network = local.network_name subnetwork = subnet.name - subnetwork_project = null + subnetwork_project = var.project_id network_ip = "" nic_type = coalesce(var.nic_type, try(regex("IRDMA", local.profile_name), regex("MRDMA", local.profile_name), "RDMA")) stack_type = null diff --git a/modules/compute/gke-node-pool/README.md b/modules/compute/gke-node-pool/README.md index 21cb2f9daf..d487efc5cd 100644 --- a/modules/compute/gke-node-pool/README.md +++ b/modules/compute/gke-node-pool/README.md @@ -297,7 +297,6 @@ limitations under the License. | [guest\_accelerator](#input\_guest\_accelerator) | List of the type and count of accelerator cards attached to the instance. |
list(object({
type = optional(string)
count = optional(number, 0)
gpu_driver_installation_config = optional(list(object({
gpu_driver_version = string
})))
gpu_partition_size = optional(string)
gpu_sharing_config = optional(list(object({
gpu_sharing_strategy = optional(string)
max_shared_clients_per_gpu = optional(number)
})))
}))
| `null` | no | | [host\_maintenance\_interval](#input\_host\_maintenance\_interval) | Specifies the frequency of planned maintenance events. | `string` | `""` | no | | [image\_type](#input\_image\_type) | The default image type used by NAP once a new node pool is being created. Use either COS\_CONTAINERD or UBUNTU\_CONTAINERD. | `string` | `"COS_CONTAINERD"` | no | -| [is\_gke\_sandbox](#input\_is\_gke\_sandbox) | Temporary variable to identify the GKE sandbox environment | `bool` | `false` | no | | [kubernetes\_labels](#input\_kubernetes\_labels) | Kubernetes labels to be applied to each node in the node group. Key-value pairs.
(The `kubernetes.io/` and `k8s.io/` prefixes are reserved by Kubernetes Core components and cannot be specified) | `map(string)` | `null` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [local\_ssd\_count\_ephemeral\_storage](#input\_local\_ssd\_count\_ephemeral\_storage) | The number of local SSDs to attach to each node to back ephemeral storage.
Uses NVMe interfaces. Must be supported by `machine_type`.
When set to null, default value either is [set based on machine\_type](https://cloud.google.com/compute/docs/disks/local-ssd#choose_number_local_ssds) or GKE decides about default value.
[See above](#local-ssd-storage) for more info. | `number` | `null` | no | diff --git a/modules/compute/gke-node-pool/main.tf b/modules/compute/gke-node-pool/main.tf index 9c126dcc65..356377abea 100644 --- a/modules/compute/gke-node-pool/main.tf +++ b/modules/compute/gke-node-pool/main.tf @@ -44,7 +44,7 @@ resource "google_container_node_pool" "node_pool" { name = var.name == null ? var.machine_type : var.name cluster = var.cluster_id node_locations = var.zones - version = var.is_gke_sandbox ? var.node_version : null + version = var.node_version node_count = var.static_node_count dynamic "autoscaling" { diff --git a/modules/compute/gke-node-pool/variables.tf b/modules/compute/gke-node-pool/variables.tf index 877fae56a8..37c19ca201 100644 --- a/modules/compute/gke-node-pool/variables.tf +++ b/modules/compute/gke-node-pool/variables.tf @@ -355,14 +355,6 @@ variable "host_maintenance_interval" { } } -# REMOVE_ME: It's a temporary variable used in internal testing -variable "is_gke_sandbox" { - description = "Temporary variable to identify the GKE sandbox environment" - default = false - type = bool -} - -# REMOVE_ME: It's a temporary variable used in internal testing variable "node_version" { description = "Temporary variable to explicitly set the node version" type = string diff --git a/modules/scheduler/gke-cluster/README.md b/modules/scheduler/gke-cluster/README.md index b39e159e39..f338452460 100644 --- a/modules/scheduler/gke-cluster/README.md +++ b/modules/scheduler/gke-cluster/README.md @@ -147,6 +147,8 @@ limitations under the License. | [additional\_networks](#input\_additional\_networks) | Additional network interface details for GKE, if any. Providing additional networks enables multi networking and creates relevant network objects on the cluster. |
list(object({
network = string
subnetwork = string
subnetwork_project = string
network_ip = string
nic_type = string
stack_type = string
queue_count = number
access_config = list(object({
nat_ip = string
network_tier = string
}))
ipv6_access_config = list(object({
network_tier = string
}))
alias_ip_range = list(object({
ip_cidr_range = string
subnetwork_range_name = string
}))
}))
| `[]` | no | | [authenticator\_security\_group](#input\_authenticator\_security\_group) | The name of the RBAC security group for use with Google security groups in Kubernetes RBAC. Group name must be in format gke-security-groups@yourdomain.com | `string` | `null` | no | | [autoscaling\_profile](#input\_autoscaling\_profile) | (Beta) Optimize for utilization or availability when deciding to remove nodes. Can be BALANCED or OPTIMIZE\_UTILIZATION. | `string` | `"OPTIMIZE_UTILIZATION"` | no | +| [cluster\_availability\_type](#input\_cluster\_availability\_type) | Type of cluster availability. Possible values are: {REGIONAL, MULTI\_ZONAL} | `string` | `"REGIONAL"` | no | +| [cluster\_reference\_type](#input\_cluster\_reference\_type) | How the google\_container\_node\_pool.system\_node\_pools refers to the cluster. Possible values are: {SELF\_LINK, NAME} | `string` | `"SELF_LINK"` | no | | [configure\_workload\_identity\_sa](#input\_configure\_workload\_identity\_sa) | When true, a kubernetes service account will be created and bound using workload identity to the service account used to create the cluster. | `bool` | `false` | no | | [deployment\_name](#input\_deployment\_name) | Name of the HPC deployment. Used in the GKE cluster name by default and can be configured with `prefix_with_deployment_name`. | `string` | n/a | yes | | [enable\_dataplane\_v2](#input\_enable\_dataplane\_v2) | Enables [Dataplane v2](https://cloud.google.com/kubernetes-engine/docs/concepts/dataplane-v2). This setting is immutable on clusters. If null, will default to false unless using multi-networking, in which case it will default to true | `bool` | `null` | no | @@ -159,7 +161,6 @@ limitations under the License. | [enable\_private\_ipv6\_google\_access](#input\_enable\_private\_ipv6\_google\_access) | The private IPv6 google access type for the VMs in this subnet. 
| `bool` | `true` | no | | [enable\_private\_nodes](#input\_enable\_private\_nodes) | (Beta) Whether nodes have internal IP addresses only. | `bool` | `true` | no | | [gcp\_public\_cidrs\_access\_enabled](#input\_gcp\_public\_cidrs\_access\_enabled) | Whether the cluster master is accessible via all the Google Compute Engine Public IPs. To view this list of IP addresses look here https://cloud.google.com/compute/docs/faq#find_ip_range | `bool` | `false` | no | -| [is\_gke\_sandbox](#input\_is\_gke\_sandbox) | Temporary variable to identify the GKE sandbox environment | `bool` | `false` | no | | [labels](#input\_labels) | GCE resource labels to be applied to resources. Key-value pairs. | `map(string)` | n/a | yes | | [maintenance\_exclusions](#input\_maintenance\_exclusions) | List of maintenance exclusions. A cluster can have up to three. |
list(object({
name = string
start_time = string
end_time = string
exclusion_scope = string
}))
| `[]` | no | | [maintenance\_start\_time](#input\_maintenance\_start\_time) | Start time for daily maintenance operations. Specified in GMT with `HH:MM` format. | `string` | `"09:00"` | no | @@ -189,7 +190,7 @@ limitations under the License. | [system\_node\_pool\_taints](#input\_system\_node\_pool\_taints) | Taints to be applied to the system node pool. |
list(object({
key = string
value = any
effect = string
}))
|
[
{
"effect": "NO_SCHEDULE",
"key": "components.gke.io/gke-managed-components",
"value": true
}
]
| no | | [timeout\_create](#input\_timeout\_create) | Timeout for creating a node pool | `string` | `null` | no | | [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no | -| [zone](#input\_zone) | Zone | `string` | `null` | no | +| [zone](#input\_zone) | Zone for a zonal cluster | `string` | `null` | no | ## Outputs diff --git a/modules/scheduler/gke-cluster/main.tf b/modules/scheduler/gke-cluster/main.tf index 72514b2414..2a42cd909a 100644 --- a/modules/scheduler/gke-cluster/main.tf +++ b/modules/scheduler/gke-cluster/main.tf @@ -85,7 +85,7 @@ resource "google_container_cluster" "gke_cluster" { project = var.project_id name = local.name - location = var.is_gke_sandbox ? var.zone : var.region + location = var.cluster_availability_type == "MULTI_ZONAL" ? var.zone : var.region resource_labels = local.labels # decouple node pool lifecycle from cluster life cycle @@ -97,10 +97,6 @@ resource "google_container_cluster" "gke_cluster" { network = var.network_id subnetwork = var.subnetwork_self_link - # Note: Though the default value of VPC_NATIVE is sufficient to enable IP Aliasing, - # It makes sense to let that argument be explicit so that it remains in our consideration when upgrading the provider. - # Because, in the newer provider versions the default may change - networking_mode = "VPC_NATIVE" # Note: the existence of the "master_authorized_networks_config" block enables # the master authorized networks even if it's empty. @@ -226,6 +222,15 @@ resource "google_container_cluster" "gke_cluster" { condition = !(!coalesce(var.enable_multi_networking, true) && length(var.additional_networks) > 0) error_message = "'enable_multi_networking' cannot be false when using multivpc module, which passes additional_networks." 
} + precondition { + condition = contains(["REGIONAL", "MULTI_ZONAL"], var.cluster_availability_type) + error_message = "`cluster_availability_type` must be one of {REGIONAL, MULTI_ZONAL}" + } + precondition { + condition = contains(["SELF_LINK", "NAME"], var.cluster_reference_type) + error_message = "`cluster_reference_type` must be one of {SELF_LINK, NAME}" + } + } logging_service = "logging.googleapis.com/kubernetes" @@ -240,9 +245,9 @@ resource "google_container_node_pool" "system_node_pools" { project = var.project_id name = var.system_node_pool_name - cluster = var.is_gke_sandbox ? google_container_cluster.gke_cluster.name : google_container_cluster.gke_cluster.self_link + cluster = var.cluster_reference_type == "NAME" ? google_container_cluster.gke_cluster.name : google_container_cluster.gke_cluster.self_link version = var.min_master_version - location = var.is_gke_sandbox ? var.zone : null + location = var.cluster_availability_type == "MULTI_ZONAL" ? var.zone : null autoscaling { total_min_node_count = var.system_node_pool_node_count.total_min_nodes diff --git a/modules/scheduler/gke-cluster/variables.tf b/modules/scheduler/gke-cluster/variables.tf index 4088eae21c..bbaf07bd4a 100644 --- a/modules/scheduler/gke-cluster/variables.tf +++ b/modules/scheduler/gke-cluster/variables.tf @@ -334,16 +334,20 @@ variable "rdma_subnetwork_name_prefix" { type = string } -# REMOVE_ME: It's a temporary variable used in internal testing -variable "is_gke_sandbox" { - description = "Temporary variable to identify the GKE sandbox environment" - default = false - type = bool +variable "cluster_reference_type" { + description = "How the google_container_node_pool.system_node_pools refers to the cluster. Possible values are: {SELF_LINK, NAME}" + default = "SELF_LINK" + type = string +} + +variable "cluster_availability_type" { + description = "Type of cluster availability. 
Possible values are: {REGIONAL, MULTI_ZONAL}" + default = "REGIONAL" + type = string } -# REMOVE_ME: It's a temporary variable used in internal testing variable "zone" { - description = "Zone" + description = "Zone for a zonal cluster" default = null type = string } diff --git a/modules/scheduler/pre-existing-gke-cluster/main.tf b/modules/scheduler/pre-existing-gke-cluster/main.tf index d8d3171361..926d2be100 100644 --- a/modules/scheduler/pre-existing-gke-cluster/main.tf +++ b/modules/scheduler/pre-existing-gke-cluster/main.tf @@ -61,7 +61,7 @@ locals { } module "kubectl_apply" { - source = "../../management/kubectl-apply" # can point to github + source = "../../management/kubectl-apply" cluster_id = data.google_container_cluster.existing_gke_cluster.id project_id = var.project_id