From 7e4f2a6c1cf14945ca824e8b5ecdcea50394a54e Mon Sep 17 00:00:00 2001
From: Darin Peetz
Date: Fri, 1 Nov 2024 11:15:12 -0500
Subject: [PATCH] Use the tpu_network_optimizer image instead of a raw bash
 script.

The container image contains the same script, but makes future updates
(if necessary) more straightforward: pull a new image rather than
editing the script inside the DaemonSet. The contents of the image also
allow us to mount only /sys and /proc, instead of mounting / entirely.
---
 .../v6e-network-optimization.yaml             | 142 ++----------------
 1 file changed, 14 insertions(+), 128 deletions(-)

diff --git a/scripts/network-setup/v6e-network-optimization.yaml b/scripts/network-setup/v6e-network-optimization.yaml
index a30217b5d..c559d978f 100644
--- a/scripts/network-setup/v6e-network-optimization.yaml
+++ b/scripts/network-setup/v6e-network-optimization.yaml
@@ -34,139 +34,25 @@ spec:
         effect: "NoSchedule"
       initContainers:
       - name: "tpu-network-optimization"
-        image: "ubuntu:latest"
+        image: "gke.gcr.io/tpu_network_optimizer@sha256:e5aba9c51fc0750863187bb22767b12890a16fd98dd74fd012f3510d38874dde"
         securityContext:
           privileged: true
         command:
-        - bash
-        - -c
-        - |
-          #!/bin/bash
-
-          # returns 0 (success) if it's running on a v6e VM.
-          is_v6etpu_platform() {
-            local machine_type
-            machine_type=$(curl -H "Metadata-Flavor: Google" \
-              http://169.254.169.254/computeMetadata/v1/instance/machine-type)
-
-            echo "machine_type: $machine_type"
-            # Non-v6 TPUs are exempt
-            [[ "$machine_type" == *"ct6e"* ]] || return 1
-
-            return 0
-          }
-
-          if ! is_v6etpu_platform; then
-            echo "Not a v6e TPU platform"
-            exit 0
-          fi
-
-          echo "Running on a v6e TPU platform"
-          # This must be a v6e platform. Continue with v6e-specific network tunings.
-
-          # PART 1: IRQ SPREADING. If this VM has multiple vnics, we need to make sure
-          # they're using different sets of cores for interrupt handling.
-
-          # Used to wrap around to the first core if we run out of cores. We limit
-          # ourselves to node 0, and avoid hyperbuddies.
-          node0_cores=$(echo /sys/devices/system/node/node0/cpu[0-9]* | wc -w)
-          ht_buddies=$(cat /sys/devices/system/cpu/cpu0/topology/core_cpus_list | tr ',' ' ' | wc -w)
-          total_schedulable_cores=$((node0_cores / ht_buddies))
-
-          core=0
-          for nic in $(ls -1 /sys/class/net);
-          do
-            echo "Updating interrupt cores for $nic"
-            if [[ -d "/sys/class/net/$nic/device" ]]; then
-              # ASSUMPTION: There are an equal number of TX and RX queues.
-              NUM_QUEUES=$(echo /sys/class/net/"$nic"/queues/tx* | wc -w)
-              # Helpers to figure out where to write the soft IRQ affinities. See functions
-              # gve_tx_idx_to_ntfy and gve_rx_idx_to_ntfy. Notify blocks are allocated here:
-              # https://github.com/GoogleCloudPlatform/compute-virtual-ethernet-linux/blob/1b4fe3f70e982b49507bc6fad865c23c9d22cc30/google/gve/gve_main.c#L394
-              # The bash here counts how many notify blocks there are, then identifies the
-              # base block for TX and RX in identical fashion to the GVE functions.
-              # TODO: Consider the case of if IRQ entries are not contiguous.
-              base_ntfy_block=$(ls -1 /sys/class/net/"${nic}"/device/msi_irqs | sort -n | head -n 1)
-              num_ntfy_blocks=$(ls /sys/class/net/"${nic}"/device/msi_irqs/ | wc -w)
-              tx_irq_base_directory=$((base_ntfy_block))
-              rx_irq_base_directory=$((base_ntfy_block + (num_ntfy_blocks / 2)))
-
-              for ((queue = 0; queue < $NUM_QUEUES; queue++)); do
-                echo $core > /proc/irq/$((tx_irq_base_directory + $queue))/smp_affinity_list
-                echo $core > /proc/irq/$((rx_irq_base_directory + $queue))/smp_affinity_list
-                # Also set XPS affinity for the TX queue to the same core.
-                cp /proc/irq/$((tx_irq_base_directory + $queue))/smp_affinity /sys/class/net/"$nic"/queues/tx-"$queue"/xps_cpus
-                core=$((core + 1))
-                # Wrap around to the first core if we run out of cores.
-                if [[ $core -ge $total_schedulable_cores ]]; then
-                  core=0
-                fi
-              done
-              echo "$nic cores:"
-              for ((queue = 0; queue < $NUM_QUEUES; queue++)); do
-                echo "queue $queue"
-                irq=$((tx_irq_base_directory + $queue))
-                cat /proc/irq/$irq/smp_affinity_list
-                irq=$((rx_irq_base_directory + $queue))
-                cat /proc/irq/$irq/smp_affinity_list
-              done
-            fi
-          done
-
-          # PART 2: TCP tunings.
-
-          # Disable metrics cache
-          sysctl -w net.ipv4.tcp_no_metrics_save=1
-
-          # Disable slow start after idle
-          sysctl -w net.ipv4.tcp_slow_start_after_idle=0
-
-          # Set rto_min 5ms and enable quickack
-          sysctl_rto_min_exists=$(sudo sysctl -a | grep tcp_rto_min_us)
-          if [[ -z "$sysctl_rto_min_exists" ]]; then
-            ip route show | while IFS= read -r route; do
-              if ! echo "$route" | grep -q "linkdown"; then
-                ip route change ${route/lock/} rto_min 5ms quickack 1
-              fi
-            done
-          else
-            sysctl -w net.ipv4.tcp_rto_min_us=5000
-            ip route show | while IFS= read -r route; do
-              if ! echo "$route" | grep -q "linkdown"; then
-                ip route change ${route/lock/} quickack 1
-              fi
-            done
-          fi
-
-          # Increase TCP zerocopy control memory
-          sysctl -w net.core.optmem_max=131072
-
-          # Disable Cubic Hystart Ack-Train
-          echo 2 > /sys/module/tcp_cubic/parameters/hystart_detect
-
-          # PART 3: Larger gve buffers.
-
-          echo "Enabling max rx buffer size for v6e "
-          for nic in $(ls /sys/class/net); do
-            if [[ -d "/sys/class/net/$nic/device" ]]; then
-              if ethtool --set-priv-flags "$nic" enable-max-rx-buffer-size on; then
-                echo "Max RX buffer size enabled for $nic"
-              else
-                echo "Unable to enable max RX buffer size for $nic"
-              fi
-            fi
-          done
-
-          # The script cannot return an error status.
-          exit 0
+        - /tpu_network_optimizer.sh
         volumeMounts:
-        - mountPath: /
-          name: root
+        - name: sys
+          mountPath: /sys
+        - name: proc
+          mountPath: /proc
       volumes:
-      - name: root
-        hostPath:
-          path: /
-          type: Directory
+      - name: sys
+        hostPath:
+          path: /sys
+          type: Directory
+      - name: proc
+        hostPath:
+          path: /proc
+          type: Directory
       containers:
       - image: "gke.gcr.io/pause:3.8@sha256:880e63f94b145e46f1b1082bb71b85e21f16b99b180b9996407d61240ceb9830"
         name: pause