GoogleCloudPlatform · roberthbailey · Apr 26, 2024 · Apr 23, 2024 · Apr 23, 2024 · Apr 24, 2024
diff --git a/applications/rag/variables.tf b/applications/rag/variables.tf
@@ -378,7 +378,7 @@ variable "gpu_pools" {
     max_count              = optional(number, 3)
     local_ssd_count        = optional(number, 0)
     spot                   = optional(bool, false)
-    disk_size_gb           = optional(number, 100)
+    disk_size_gb           = optional(number, 200)
     disk_type              = optional(string, "pd-standard")
     image_type             = optional(string, "COS_CONTAINERD")
     enable_gcfs            = optional(bool, false)
@@ -399,7 +399,7 @@ variable "gpu_pools" {
     autoscaling        = true
     min_count          = 1
     max_count          = 3
-    disk_size_gb       = 100
+    disk_size_gb       = 200
     disk_type          = "pd-balanced"
     enable_gcfs        = true
     accelerator_count  = 2

diff --git a/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars b/infrastructure/tfvars_tests/standard-gke-public.platform.tfvars
@@ -58,7 +58,7 @@ gpu_pools = [{
   min_count          = 2
   max_count          = 3
   accelerator_count  = 2
-  disk_size_gb       = 100
+  disk_size_gb       = 200
   enable_gcfs        = true
   logging_variant    = "DEFAULT"
   disk_type          = "pd-balanced"

diff --git a/tutorials-and-examples/hf-tgi/main.tf b/tutorials-and-examples/hf-tgi/main.tf
@@ -47,6 +47,9 @@ resource "kubernetes_service" "inference_service" {
 }
 
 resource "kubernetes_deployment" "inference_deployment" {
+  timeouts {
+    create = "30m"
+  }
   metadata {
     name      = "mistral-7b-instruct"
     namespace = var.namespace
@@ -56,7 +59,11 @@ resource "kubernetes_deployment" "inference_deployment" {
   }
 
   spec {
-    replicas = 1
+    # On an Autopilot cluster the deployment takes more than 10m to become ready.
+    # Raise the progress deadline to 30m so that the deployment controller does
+    # not mark the deployment as failed while it is still starting up.
+    progress_deadline_seconds = 1800
+    replicas                  = 1
 
     selector {
       match_labels = merge({
@@ -72,6 +79,15 @@ resource "kubernetes_deployment" "inference_deployment" {
       }
 
       spec {
+        init_container {
+          name    = "download-model"
+          image   = "google/cloud-sdk:473.0.0-alpine"
+          command = ["gsutil", "cp", "-r", "gs://vertex-model-garden-public-us/mistralai/Mistral-7B-Instruct-v0.1/", "/model-data/"]
+          volume_mount {
+            mount_path = "/model-data"
+            name       = "model-storage"
+          }
+        }
         container {
           image = "ghcr.io/huggingface/text-generation-inference:1.1.0"
           name  = "mistral-7b-instruct"
@@ -82,9 +98,11 @@ resource "kubernetes_deployment" "inference_deployment" {
             protocol       = "TCP"
           }
 
+          args = ["--model-id", "$(MODEL_ID)"]
+
           env {
             name  = "MODEL_ID"
-            value = "mistralai/Mistral-7B-Instruct-v0.1"
+            value = "/model/Mistral-7B-Instruct-v0.1"
           }
 
           env {
@@ -118,6 +136,12 @@ resource "kubernetes_deployment" "inference_deployment" {
             name       = "data"
           }
 
+          volume_mount {
+            mount_path = "/model"
+            name       = "model-storage"
+            read_only  = true # bool literal (was the string "true"); volume is filled by the download-model init container
+          }
+
           #liveness_probe {
           #http_get {
           #path = "/"
@@ -146,6 +170,11 @@ resource "kubernetes_deployment" "inference_deployment" {
           empty_dir {}
         }
 
+        volume {
+          name = "model-storage"
+          empty_dir {}
+        }
+
         node_selector = merge({
           "cloud.google.com/gke-accelerator" = "nvidia-l4"
           }, var.autopilot_cluster ? {