
Jetstream support #677

Merged: 16 commits, May 24, 2024
@@ -37,6 +37,7 @@ The Locust benchmarking tool currently supports these frameworks:
- tensorrt_llm_triton
- text generation inference (tgi)
- vllm
- jetstream

## Instructions

1 change: 1 addition & 0 deletions benchmarks/benchmark/tools/locust-load-inference/main.tf
@@ -47,6 +47,7 @@ locals {
tokenizer = var.tokenizer
use_beam_search = var.use_beam_search
hugging_face_token_secret_list = local.hugging_face_token_secret == null ? [] : [local.hugging_face_token_secret]
k8s_hf_secret_list = var.k8s_hf_secret == null ? [] : [var.k8s_hf_secret]
stop_timeout = var.stop_timeout
request_type = var.request_type
})) : data]
@@ -48,6 +48,13 @@ spec:
- name: USE_BEAM_SEARCH
value: ${use_beam_search}
%{ for hugging_face_token_secret in hugging_face_token_secret_list ~}
- name: HUGGINGFACE_TOKEN
valueFrom:
secretKeyRef:
name: hf-key
key: HF_TOKEN
%{ endfor ~}
%{ for hf_token in k8s_hf_secret_list ~}
- name: HUGGINGFACE_TOKEN
valueFrom:
secretKeyRef:
@@ -0,0 +1,29 @@
credentials_config = {
fleet_host = "https://connectgateway.googleapis.com/v1/projects/PROJECT_NUMBER/locations/global/gkeMemberships/ai-tpu-benchmark"
}

project_id = "PROJECT_ID"

namespace = "default"
ksa = "benchmark-sa"
request_type = "grpc"

k8s_hf_secret = "hf-token"


# Locust service configuration
artifact_registry = "REGISTRY_LOCATION"
inference_server_service = "jetstream-http-svc:9000" # 104.196.118.117:9000
locust_runner_kubernetes_service_account = "sample-runner-sa"
output_bucket = "${PROJECT_ID}-jetstream-benchmark-output-bucket-01"
gcs_path = "PATH_TO_PROMPT_BUCKET"

# Benchmark configuration for Locust Docker accessing inference server
inference_server_framework = "jetstream"
tokenizer = "google/gemma-7b"

# Benchmark configuration for triggering single test via Locust Runner
test_duration = 60
# Increase test_users to allow more parallelism (especially when testing HPA)
test_users = 1
test_rate = 5
@@ -211,6 +211,13 @@ variable "hugging_face_secret_version" {
default = null
}

variable "k8s_hf_secret" {
description = "Name of secret in k8s for huggingface token"
type = string
nullable = true
default = null
}

variable "request_type" {
description = "The method of request used when calling the model server (http or grpc)"
type = string
145 changes: 145 additions & 0 deletions benchmarks/inference-server/jetstream/README.md
@@ -0,0 +1,145 @@
# AI on GKE Benchmarking for JetStream

Deploying and benchmarking JetStream on TPU shares many similarities with the standard GPU path, but the differences are distinct enough to warrant a separate readme. If you are familiar with deploying on GPU, much of this should be familiar. For a more detailed understanding of each step, refer to our primary benchmarking [README](https://github.com/GoogleCloudPlatform/ai-on-gke/tree/main/benchmarks).

## Pre-requisites
- Kaggle user/token
- Hugging Face user/token (see the secret-creation sketch below)
- GCS bucket with test prompts
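
If you plan to pass the Hugging Face token to the Locust pods via `k8s_hf_secret`, the token must exist as a Kubernetes secret in the cluster. A minimal sketch, once the cluster exists, assuming the secret is named `hf-token` (the value used in the sample tfvars) and stores the token under the key `HF_TOKEN` (the key name is an assumption based on the deployment template):

```
# Sketch: create the Hugging Face token secret referenced by k8s_hf_secret.
# "hf-token" matches the sample tfvars; the HF_TOKEN key is an assumption.
kubectl create secret generic hf-token \
  --from-literal=HF_TOKEN=<your-hugging-face-token>
```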

### Creating K8s infra

To create our TPU cluster, run:

```
# Stage 1 creates the cluster.
cd infra/stage-1

# Copy the sample variables and update the project ID, cluster name and other
# parameters as needed in the `terraform.tfvars` file.
cp sample-tfvars/jetstream-sample.tfvars terraform.tfvars

# Initialize the Terraform modules.
terraform init

# Run plan to see the changes that will be made.
terraform plan

# Run apply if the changes look good by confirming the prompt.
terraform apply
```
To verify that the cluster has been set up correctly, run:
```
# Get credentials using fleet membership
gcloud container fleet memberships get-credentials <cluster-name>

# Run a kubectl command to verify
kubectl get nodes
```

## Configure the cluster

To configure the cluster to run inference workloads, we need to set up Workload Identity and GCS Fuse.
```
# Stage 2 configures the cluster for running inference workloads.
cd infra/stage-2

# Copy the sample variables and update the project number and cluster name in
# the fleet_host variable "https://connectgateway.googleapis.com/v1/projects/<project-number>/locations/global/gkeMemberships/<cluster-name>"
# and the project name and bucket name parameters as needed in the
# `terraform.tfvars` file. You can specify a new bucket name in which case it
# will be created.
cp sample-tfvars/jetstream-sample.tfvars terraform.tfvars

# Initialize the Terraform modules.
terraform init

# Run plan to see the changes that will be made.
terraform plan

# Run apply if the changes look good by confirming the prompt.
terraform apply
```

### Convert Gemma model weights to maxtext weights

JetStream currently requires that models be converted to MaxText weights. This example deploys a Gemma-7b model. Much of this information is similar to the guide [here](https://cloud.google.com/kubernetes-engine/docs/tutorials/serve-gemma-tpu-jetstream#convert-checkpoints).

*SKIP IF ALREADY COMPLETED*

Create the Kaggle secret:
```
kubectl create secret generic kaggle-secret \
--from-file=kaggle.json
```

Replace `GEMMA_BUCKET_NAME` in `model-conversion/kaggle_converter.yaml` with the correct bucket name where you would like the model to be stored.

***NOTE:*** If you are using a different bucket than the ones you created, give the service account Storage Admin permissions on that bucket.
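
One way to do this substitution in place, a sketch assuming GNU sed and a hypothetical bucket name `my-gemma-bucket`:

```
# my-gemma-bucket is a placeholder; substitute your own bucket name.
sed -i "s/GEMMA_BUCKET_NAME/my-gemma-bucket/g" model-conversion/kaggle_converter.yaml
```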

Run:
```
kubectl apply -f model-conversion/kaggle_converter.yaml
```

This should take ~10 minutes to complete.
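
To watch the conversion progress, you can follow the job's logs (the job is named `data-loader-7b` in `model-conversion/kaggle_converter.yaml`):

```
kubectl get jobs
kubectl logs -f job/data-loader-7b
```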

### Deploy JetStream

Replace `GEMMA_BUCKET_NAME` in `jetstream.yaml` with the same bucket name as above.

Run:
```
kubectl apply -f jetstream.yaml
```

Verify the pod is running with:
```
kubectl get pods
```
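
If you prefer to block until the server is ready, a sketch using the `app=maxengine-server` label from `jetstream.yaml`:

```
# Wait up to 10 minutes for the JetStream server pod to become Ready.
kubectl wait --for=condition=Ready pod -l app=maxengine-server --timeout=600s
```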

Get the external IP with:

```
kubectl get services
```
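
To script the request below, one way to capture the LoadBalancer IP of `jetstream-http-svc` into a `JETSTREAM_EXTERNAL_IP` variable, a sketch that assumes the external IP has already been provisioned:

```
# Pull the external IP out of the service status.
JETSTREAM_EXTERNAL_IP=$(kubectl get service jetstream-http-svc \
  -o jsonpath='{.status.loadBalancer.ingress[0].ip}')
echo $JETSTREAM_EXTERNAL_IP
```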

You can then send a prompt with:
```
curl --request POST \
--header "Content-type: application/json" \
-s \
JETSTREAM_EXTERNAL_IP:8000/generate \
--data \
'{
"prompt": "What is a TPU?",
"max_tokens": 200
}'
```

### Deploy the benchmark

To prepare the dataset for the Locust inference benchmark, view the README.md file in:
```
cd benchmark/dataset/ShareGPT_v3_unfiltered_cleaned_split
```

To deploy the Locust inference benchmark with the above model, run:
```
cd benchmark/tools/locust-load-inference

# Copy the sample variables and update the project number and cluster name in
# the fleet_host variable "https://connectgateway.googleapis.com/v1/projects/<project-number>/locations/global/gkeMemberships/<cluster-name>"
# in the `terraform.tfvars` file.
cp sample-tfvars/jetstream-sample.tfvars terraform.tfvars

# Initialize the Terraform modules.
terraform init

# Run plan to see the changes that will be made.
terraform plan

# Run apply if the changes look good by confirming the prompt.
terraform apply
```

To further interact with the Locust inference benchmark, view the README.md file in `benchmark/tools/locust-load-inference`
63 changes: 63 additions & 0 deletions benchmarks/inference-server/jetstream/jetstream.yaml
@@ -0,0 +1,63 @@
apiVersion: apps/v1
kind: Deployment
metadata:
name: maxengine-server
spec:
replicas: 1
selector:
matchLabels:
app: maxengine-server
template:
metadata:
labels:
app: maxengine-server
spec:
serviceAccountName: benchmark-sa
nodeSelector:
cloud.google.com/gke-tpu-topology: 2x2
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
containers:
- name: maxengine-server
image: us-docker.pkg.dev/cloud-tpu-images/inference/maxengine-server:v0.2.0
args:
- model_name=gemma-7b
- tokenizer_path=assets/tokenizer.gemma
- per_device_batch_size=4
- max_prefill_predict_length=1024
- max_target_length=2048
- async_checkpointing=false
- ici_fsdp_parallelism=1
- ici_autoregressive_parallelism=-1
- ici_tensor_parallelism=1
- scan_layers=false
- weight_dtype=bfloat16
- load_parameters_path=gs://GEMMA_BUCKET_NAME/final/unscanned/gemma_7b-it/0/checkpoints/0/items
ports:
- containerPort: 9000
resources:
requests:
google.com/tpu: 4
limits:
google.com/tpu: 4
- name: jetstream-http
image: us-docker.pkg.dev/cloud-tpu-images/inference/jetstream-http:v0.2.0
ports:
- containerPort: 8000
---
apiVersion: v1
kind: Service
metadata:
name: jetstream-http-svc
spec:
selector:
app: maxengine-server
ports:
- protocol: TCP
name: http
port: 8000
targetPort: 8000
- protocol: TCP
name: grpc
port: 9000
targetPort: 9000
type: LoadBalancer
@@ -0,0 +1,33 @@
apiVersion: batch/v1
kind: Job
metadata:
name: data-loader-7b
spec:
ttlSecondsAfterFinished: 30
template:
spec:
serviceAccountName: benchmark-sa
restartPolicy: Never
containers:
- name: inference-checkpoint
image: us-docker.pkg.dev/cloud-tpu-images/inference/inference-checkpoint:v0.2.0
args:
- -b=GEMMA_BUCKET_NAME
- -m=google/gemma/maxtext/7b-it/2
volumeMounts:
- mountPath: "/kaggle/"
name: kaggle-credentials
readOnly: true
resources:
requests:
google.com/tpu: 4
limits:
google.com/tpu: 4
nodeSelector:
cloud.google.com/gke-tpu-topology: 2x2
cloud.google.com/gke-tpu-accelerator: tpu-v5-lite-podslice
volumes:
- name: kaggle-credentials
secret:
defaultMode: 0400
secretName: kaggle-secret
1 change: 1 addition & 0 deletions benchmarks/infra/stage-1/modules/gke-infra/cluster.tf
@@ -187,6 +187,7 @@ module "cluster-nodepool" {

node_config = {
machine_type = each.value.machine_type
spot = each.value.spot
shielded_instance_config = {
enable_integrity_monitoring = true
enable_secure_boot = true
1 change: 1 addition & 0 deletions benchmarks/infra/stage-1/modules/gke-infra/variables.tf
@@ -143,6 +143,7 @@ variable "nodepools" {
gke_version = optional(string),
max_node_count = optional(number, 10),
min_node_count = optional(number, 1),
spot = optional(bool, false)

guest_accelerator = optional(object({
type = optional(string),
27 changes: 27 additions & 0 deletions benchmarks/infra/stage-1/sample-tfvars/jetstream-sample.tfvars
@@ -0,0 +1,27 @@
project_id = "PROJECT_ID"
cluster_name = "ai-tpu-benchmark"
region = "us-east1"
gke_location = "us-east1-c"
prefix = "ai-tpu-benchmark"
spot_vms = true

vpc_create = {
name = "ai-benchmark"
enable_cloud_nat = true
}

cluster_options = {
enable_gcs_fuse_csi_driver = false
enable_gcp_filestore_csi_driver = false
enable_gce_persistent_disk_csi_driver = false
}

nodepools = {
nodepool-tpu = {
machine_type = "ct5lp-hightpu-4t",
spot = true,
},
nodepool-cpu = {
machine_type = "n2-standard-2",
},
}
2 changes: 2 additions & 0 deletions benchmarks/infra/stage-2/main.tf
@@ -32,4 +32,6 @@ module "gke-setup" {
secret_create = var.secret_name == null ? false : true
secret_name = var.secret_name
secret_location = var.secret_location
nvidia_dcgm_create = var.nvidia_dcgm_create
gcs_fuse_create = var.gcs_fuse_create
}
2 changes: 1 addition & 1 deletion benchmarks/infra/stage-2/modules/gke-setup/main.tf
@@ -32,7 +32,7 @@ module "gcs-fuse" {
project_id = var.project_id
bucket_name = var.bucket_name
bucket_location = var.bucket_location
google_service_account = var.google_service_account
google_service_account = module.workload-identity.0.created_resources.gsa_email
depends_on = [module.workload-identity]
}

@@ -42,6 +42,6 @@ module "gcs-fuse-bucket" {

resource "google_storage_bucket_iam_member" "bucket-iam" {
bucket = local.bucket_name
role = "roles/storage.objectAdmin"
member = data.google_service_account.gsa.member
role = "roles/storage.admin"
member = "serviceAccount:${var.google_service_account}"
}