add support for enabling tcpx/o in a3 and a3mega vm, provide script for injecting rxdm sidecar and other required components into user workload #3012

Merged · 10 commits · Sep 13, 2024
7 changes: 3 additions & 4 deletions examples/gke-a3-highgpu.yaml
@@ -32,9 +32,9 @@ deployment_groups:
  - id: network1
    source: modules/network/vpc
    settings:
      subnetwork_name: gke-subnet
      subnetwork_name: gke-subnet-a3-highgpu
      secondary_ranges:
        gke-subnet:
        gke-subnet-a3-highgpu:
        - range_name: pods
          ip_cidr_range: 10.4.0.0/14
        - range_name: services
@@ -65,5 +65,4 @@
      machine_type: a3-highgpu-8g
      autoscaling_total_min_nodes: 2
      zones: [$(vars.zone)]

    # We need to do the following here after deployment: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-highgpuyaml--
    outputs: [instructions]
8 changes: 4 additions & 4 deletions examples/gke-a3-megagpu.yaml
@@ -32,9 +32,9 @@ deployment_groups:
  - id: network1
    source: modules/network/vpc
    settings:
      subnetwork_name: gke-subnet
      subnetwork_name: gke-subnet-a3-mega
      secondary_ranges:
        gke-subnet:
        gke-subnet-a3-mega:
        - range_name: pods
          ip_cidr_range: 10.4.0.0/14
        - range_name: services
@@ -64,6 +64,6 @@
    settings:
      machine_type: a3-megagpu-8g
      autoscaling_total_min_nodes: 2
      user_workload_path: gpu-direct-workload/sample-tcpxo-workload-job.yaml # gke-node-pool module-relative path to the user-provided workload
      zones: [$(vars.zone)]

    # We need to do the following here after deployment: https://github.com/GoogleCloudPlatform/cluster-toolkit/blob/main/examples/README.md#gke-a3-megagpuyaml--
    outputs: [instructions]
12 changes: 11 additions & 1 deletion modules/compute/gke-node-pool/README.md
@@ -233,17 +233,21 @@ limitations under the License.
| <a name="requirement_terraform"></a> [terraform](#requirement\_terraform) | >= 1.2 |
| <a name="requirement_google"></a> [google](#requirement\_google) | ~> 5.0 |
| <a name="requirement_google-beta"></a> [google-beta](#requirement\_google-beta) | ~> 5.0 |
| <a name="requirement_null"></a> [null](#requirement\_null) | ~> 3.0 |

## Providers

| Name | Version |
|------|---------|
| <a name="provider_google"></a> [google](#provider\_google) | ~> 5.0 |
| <a name="provider_google-beta"></a> [google-beta](#provider\_google-beta) | ~> 5.0 |
| <a name="provider_null"></a> [null](#provider\_null) | ~> 3.0 |

## Modules

No modules.
| Name | Source | Version |
|------|--------|---------|
| <a name="module_kubectl_apply"></a> [kubectl\_apply](#module\_kubectl\_apply) | ../../management/kubectl-apply | n/a |

## Resources

@@ -256,7 +260,11 @@ No modules.
| [google_project_iam_member.node_service_account_metric_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource |
| [google_project_iam_member.node_service_account_monitoring_viewer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource |
| [google_project_iam_member.node_service_account_resource_metadata_writer](https://registry.terraform.io/providers/hashicorp/google/latest/docs/resources/project_iam_member) | resource |
| [null_resource.enable_tcpx_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [null_resource.enable_tcpxo_in_workload](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [null_resource.install_dependencies](https://registry.terraform.io/providers/hashicorp/null/latest/docs/resources/resource) | resource |
| [google_compute_default_service_account.default_sa](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/compute_default_service_account) | data source |
| [google_container_cluster.gke_cluster](https://registry.terraform.io/providers/hashicorp/google/latest/docs/data-sources/container_cluster) | data source |

## Inputs

@@ -295,6 +303,7 @@ No modules.
| <a name="input_timeout_update"></a> [timeout\_update](#input\_timeout\_update) | Timeout for updating a node pool | `string` | `null` | no |
| <a name="input_total_max_nodes"></a> [total\_max\_nodes](#input\_total\_max\_nodes) | DEPRECATED: Use autoscaling\_total\_max\_nodes. | `number` | `null` | no |
| <a name="input_total_min_nodes"></a> [total\_min\_nodes](#input\_total\_min\_nodes) | DEPRECATED: Use autoscaling\_total\_min\_nodes. | `number` | `null` | no |
| <a name="input_user_workload_path"></a> [user\_workload\_path](#input\_user\_workload\_path) | The gke-node-pool module relative path to the user workload, this should point to the kubernetes job manifest<br>that user want to have the GPUDirect rxdm sidecar injected. The toolkit would apply the required changes to this<br>user workload and generate a new workload file for user to inspect and apply to the cluster. Details of the required<br>changes can be found in the [GPUDirect user guide](https://cloud.google.com/kubernetes-engine/docs/how-to/gpu-bandwidth-gpudirect-tcpx#add-gpudirect-manifests) | `string` | `null` | no |
| <a name="input_zones"></a> [zones](#input\_zones) | A list of zones to be used. Zones must be in region of cluster. If null, cluster zones will be inherited. Note `zones` not `zone`; does not work with `zone` deployment variable. | `list(string)` | `null` | no |

## Outputs
@@ -303,6 +312,7 @@ No modules.
|------|-------------|
| <a name="output_allocatable_cpu_per_node"></a> [allocatable\_cpu\_per\_node](#output\_allocatable\_cpu\_per\_node) | Number of CPUs available for scheduling pods on each node. |
| <a name="output_has_gpu"></a> [has\_gpu](#output\_has\_gpu) | Boolean value indicating whether nodes in the pool are configured with GPUs. |
| <a name="output_instructions"></a> [instructions](#output\_instructions) | Instructions for submitting the sample GPUDirect enabled job. |
| <a name="output_node_pool_name"></a> [node\_pool\_name](#output\_node\_pool\_name) | Name of the node pool. |
| <a name="output_tolerations"></a> [tolerations](#output\_tolerations) | Tolerations needed for a pod to be scheduled on this node pool. |
<!-- END OF PRE-COMMIT-TERRAFORM DOCS HOOK -->
@@ -0,0 +1,50 @@
# Copyright 2024 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: batch/v1
kind: Job
metadata:
  name: my-sample-job
spec:
  parallelism: 2
  completions: 2
  completionMode: Indexed
  template:
    spec:
      containers:
      - name: nccl-test
        image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/nccl-plugin-gpudirecttcpx-dev:v3.1.9
        imagePullPolicy: Always
        command:
        - /bin/sh
        - -c
        - |
          service ssh restart;
          sleep infinity;
        env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
        volumeMounts:
        - name: config-volume
          mountPath: /configs
        resources:
          limits:
            nvidia.com/gpu: 8
      volumes:
      - name: config-volume
        configMap:
          name: nccl-configmap
          defaultMode: 0777
      restartPolicy: Never
  backoffLimit: 0
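
Note: this sample mounts a ConfigMap named `nccl-configmap` that is not created anywhere in this change, so the pod will stay in `ContainerCreating` until one exists in the namespace. A hypothetical minimal placeholder (the key and contents below are illustrative, not part of this PR):

```yaml
apiVersion: v1
kind: ConfigMap
metadata:
  name: nccl-configmap
data:
  # Illustrative content only; the job simply mounts whatever is here at /configs.
  nccl-env.sh: |
    export NCCL_DEBUG=INFO
```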
@@ -0,0 +1,70 @@
# Copyright 2024 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

apiVersion: batch/v1
kind: Job
metadata:
  name: my-sample-job
spec:
  parallelism: 2
  completions: 2
  completionMode: Indexed
  template:
    spec:
      hostname: host1
      subdomain: nccl-host-1
      containers:
      - name: nccl-test
        image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpxo/nccl-plugin-gpudirecttcpx-dev:v1.0.3
        imagePullPolicy: Always
        command:
        - /bin/sh
        - -c
        - |
          set -ex
          chmod 755 /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh
          cat >/scripts/allgather.sh <<EOF
          #!/bin/bash
          /scripts/init_ssh.sh \${@};
          pushd /scripts;
          /scripts/gen_hostfiles.sh \${@};
          popd;
          BENCHMARK=all_gather_perf NHOSTS=2 NCCL_LIB_DIR="${LD_LIBRARY_PATH}" LD_LIBRARY_PATH="${LD_LIBRARY_PATH}" /scripts/demo-run-nccl-test-tcpxo-via-mpi.sh
          EOF
          chmod +x /scripts/allgather.sh
          service ssh restart;
          sleep infinity;
        env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
        - name: NCCL_FASTRAK_LLCM_DEVICE_DIRECTORY
          value: /dev/aperture_devices
        volumeMounts:
        - name: nvidia
          mountPath: /usr/local/nvidia/lib64
        - name: shared-memory
          mountPath: /dev/shm
        resources:
          limits:
            nvidia.com/gpu: 8
      volumes:
      - name: nvidia
        hostPath:
          path: /home/kubernetes/bin/nvidia/lib64
      - name: shared-memory
        emptyDir:
          medium: "Memory"
          sizeLimit: 1Gi
      restartPolicy: Never
  backoffLimit: 0
@@ -0,0 +1,182 @@
# Copyright 2024 "Google LLC"
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import argparse

import yaml


def main():
    parser = argparse.ArgumentParser(description="TCPX Job Manifest Generator")
    parser.add_argument("-f", "--file", required=True, help="Path to your job template YAML file")
    parser.add_argument("-r", "--rxdm", required=True, help="RxDM version")

    args = parser.parse_args()

    # Load the user-provided manifest. BaseLoader keeps every scalar as a
    # plain string, so values such as "0777" round-trip unchanged.
    with open(args.file, "r") as file:
        job_manifest = yaml.load(file, Loader=yaml.BaseLoader)

    # Update annotations
    add_annotations(job_manifest)

    # Update volumes
    add_volumes(job_manifest)

    # Update tolerations
    add_tolerations(job_manifest)

    # Add tcpx-daemon container
    add_tcpx_daemon_container(job_manifest, args.rxdm)

    # Update environment variables and volumeMounts for GPU containers
    update_gpu_containers(job_manifest)

    # Generate the new YAML file. default_style="|" forces literal block
    # scalars so the multi-line annotation values survive the dump; the
    # trailing replace strips the "|-" indicators emitted for plain strings.
    updated_job = str(yaml.dump(job_manifest, default_flow_style=False, width=1000, default_style="|", sort_keys=False)).replace("|-", "")

    new_file_name = args.file.replace(".yaml", "-tcpx.yaml")
    with open(new_file_name, "w", encoding="utf-8") as file:
        file.write(updated_job)

    # Provide instructions to the user
    print("\nA new manifest has been generated with TCPX enabled, based on the provided workload.")
    print("It can be found in the same path as the original workload file, with a name ending in \"-tcpx.yaml\".")


def add_annotations(job_manifest):
    annotations = {
        "devices.gke.io/container.tcpx-daemon": """|+
- path: /dev/nvidia0
- path: /dev/nvidia1
- path: /dev/nvidia2
- path: /dev/nvidia3
- path: /dev/nvidia4
- path: /dev/nvidia5
- path: /dev/nvidia6
- path: /dev/nvidia7
- path: /dev/nvidiactl
- path: /dev/nvidia-uvm""",
        "networking.gke.io/default-interface": "eth0",
        "networking.gke.io/interfaces": """|
[
{"interfaceName":"eth0","network":"default"},
{"interfaceName":"eth1","network":"vpc1"},
{"interfaceName":"eth2","network":"vpc2"},
{"interfaceName":"eth3","network":"vpc3"},
{"interfaceName":"eth4","network":"vpc4"}
]""",
    }

    # Create the path if it doesn't exist
    job_manifest.setdefault("spec", {}).setdefault("template", {}).setdefault("metadata", {})

    # Add/update annotations on the pod template
    pod_template_metadata = job_manifest["spec"]["template"]["metadata"]
    if "annotations" in pod_template_metadata:
        pod_template_metadata["annotations"].update(annotations)
    else:
        pod_template_metadata["annotations"] = annotations


def add_tolerations(job_manifest):
    tolerations = [
        {"key": "user-workload", "operator": "Equal", "value": """\"true\"""", "effect": "NoSchedule"},
    ]

    # Create the path if it doesn't exist
    job_manifest.setdefault("spec", {}).setdefault("template", {}).setdefault("spec", {})

    # Add tolerations
    pod_spec = job_manifest["spec"]["template"]["spec"]
    if "tolerations" in pod_spec:
        pod_spec["tolerations"].extend(tolerations)
    else:
        pod_spec["tolerations"] = tolerations


def add_volumes(job_manifest):
    volumes = [
        {"name": "libraries", "hostPath": {"path": "/home/kubernetes/bin/nvidia/lib64"}},
        {"name": "tcpx-socket", "emptyDir": {}},
        {"name": "sys", "hostPath": {"path": "/sys"}},
        {"name": "proc-sys", "hostPath": {"path": "/proc/sys"}},
    ]

    # Create the path if it doesn't exist
    job_manifest.setdefault("spec", {}).setdefault("template", {}).setdefault("spec", {})

    # Add volumes
    pod_spec = job_manifest["spec"]["template"]["spec"]
    if "volumes" in pod_spec:
        pod_spec["volumes"].extend(volumes)
    else:
        pod_spec["volumes"] = volumes


def add_tcpx_daemon_container(job_template, rxdm_version):
    tcpx_daemon_container = {
        "name": "tcpx-daemon",
        "image": f"us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:{rxdm_version}",  # Use provided RxDM version
        "imagePullPolicy": "Always",
        "command": """- /tcpgpudmarxd/build/app/tcpgpudmarxd
- --gpu_nic_preset
- a3vm
- --gpu_shmem_type
- fd
- --uds_path
- /run/tcpx
- --setup_param
- \\\"--verbose 128 2 0 \\\"""",
        "securityContext": {
            "capabilities": {"add": ["NET_ADMIN"]}
        },
        "volumeMounts": [
            {"name": "libraries", "mountPath": "/usr/local/nvidia/lib64"},
            {"name": "tcpx-socket", "mountPath": "/run/tcpx"},
            {"name": "sys", "mountPath": "/hostsysfs"},
            {"name": "proc-sys", "mountPath": "/hostprocsysfs"},
        ],
        "env": [{"name": "LD_LIBRARY_PATH", "value": "/usr/local/nvidia/lib64"}],
    }

    # Create the path if it doesn't exist
    job_template.setdefault("spec", {}).setdefault("template", {}).setdefault("spec", {})

    # Insert the sidecar as the first container in the pod
    pod_spec = job_template["spec"]["template"]["spec"]
    pod_spec.setdefault("containers", []).insert(0, tcpx_daemon_container)


def update_gpu_containers(job_manifest):
    env_vars = [{"name": "LD_LIBRARY_PATH", "value": "/usr/local/nvidia/lib64"}]
    volume_mounts = [
        {"name": "tcpx-socket", "mountPath": "/tmp"},
        {"name": "libraries", "mountPath": "/usr/local/nvidia/lib64"},
    ]

    pod_spec = job_manifest.get("spec", {}).get("template", {}).get("spec", {})
    for container in pod_spec.get("containers", []):
        # Create paths if they don't exist
        container.setdefault("env", [])
        container.setdefault("volumeMounts", [])
        # Only containers that request GPUs get the NCCL env and mounts
        if int(container.get("resources", {}).get("limits", {}).get("nvidia.com/gpu", 0)) > 0:
            container["env"].extend(env_vars)
            container["volumeMounts"].extend(volume_mounts)


if __name__ == "__main__":
    main()
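
For orientation, a hand-written sketch (not captured script output; the user container name and RxDM tag are illustrative) of the pod template this script produces for a one-container GPU job: `add_tcpx_daemon_container` prepends the sidecar, `add_annotations` adds the device and multi-network annotations, `add_tolerations` and `add_volumes` extend the pod spec, and `update_gpu_containers` adds the socket and library mounts to any container requesting `nvidia.com/gpu`. The generated `-tcpx.yaml` will look noisier because of the literal-block dump style.

```yaml
spec:
  template:
    metadata:
      annotations:
        networking.gke.io/default-interface: eth0
        # ...plus the tcpx-daemon device list and the five-interface annotation
    spec:
      tolerations:
      - key: user-workload
        operator: Equal
        value: "true"
        effect: NoSchedule
      containers:
      - name: tcpx-daemon  # injected sidecar, inserted first in the list
        image: us-docker.pkg.dev/gce-ai-infra/gpudirect-tcpx/tcpgpudmarxd-dev:v2.0.9  # illustrative tag
        securityContext:
          capabilities:
            add: ["NET_ADMIN"]
      - name: my-gpu-container  # user container that requests nvidia.com/gpu
        env:
        - name: LD_LIBRARY_PATH
          value: /usr/local/nvidia/lib64
        volumeMounts:
        - name: tcpx-socket
          mountPath: /tmp
        - name: libraries
          mountPath: /usr/local/nvidia/lib64
      volumes:
      - name: libraries
        hostPath:
          path: /home/kubernetes/bin/nvidia/lib64
      - name: tcpx-socket
        emptyDir: {}
      - name: sys
        hostPath:
          path: /sys
      - name: proc-sys
        hostPath:
          path: /proc/sys
```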