GoogleCloudPlatform · Bslabe123 · Sep 23, 2024 · Sep 20, 2024 · Sep 23, 2024
@@ -1,9 +1,8 @@
-FROM nvidia/cuda:12.1.0-devel-ubuntu22.04 AS dev
+FROM python:3.9.20-slim-bookworm AS dev
 
 RUN apt-get update -y \
     && apt-get install -y python3-pip git vim curl wget
 RUN pip3 install --upgrade pip
-RUN pip install packaging torch transformers
 WORKDIR /workspace
 
 # install build and runtime dependencies

@@ -15,7 +15,6 @@ pytest-forked
 pytest-asyncio
 httpx
 einops # required for MPT
-flash_attn # required for HuggingFace's llama implementation
 openai
 requests
 
@@ -26,7 +25,7 @@ ray >= 2.9
 sentencepiece  # Required for LLaMA tokenizer.
 numpy < 2.0
 torch == 2.1.1
-transformers >= 4.37.0 # Required for Qwen2
+transformers >= 4.42.0 # Required for Qwen2
 xformers == 0.0.23
 fastapi
 uvicorn[standard]

@@ -18,9 +18,6 @@ spec:
       containers:
         - name: latency-profile-generator
           image: ${artifact_registry}/latency-profile:latest
-          resources:
-            limits:
-              nvidia.com/gpu: 1
           command: ["bash", "-c", "./latency_throughput_curve.sh"]
           env:
             - name: TOKENIZER
@@ -54,7 +51,4 @@ spec:
                 secretKeyRef:
                   name: hf-token
                   key: HF_TOKEN
-%{ endfor ~}
-      nodeSelector:
-        cloud.google.com/gke-accelerator: nvidia-l4
-        iam.gke.io/gke-metadata-server-enabled: "true"
+%{ endfor ~}