Skip to content

Commit

Permalink
Add gh200 support and model (#300)
Browse files Browse the repository at this point in the history
This PR will add the initial support for GH200 nodes in KubeAI
  • Loading branch information
happytreees authored Nov 2, 2024
1 parent c287c12 commit 6082f5c
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 0 deletions.
12 changes: 12 additions & 0 deletions charts/kubeai/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ modelServers:
default: "vllm/vllm-openai:v0.6.3.post1"
cpu: "substratusai/vllm:v0.6.3.post1-cpu"
google-tpu: "substratusai/vllm:v0.6.3.post1-tpu"
gh200: "drikster80/vllm-gh200-openai:v0.6.3.post1"
OLlama:
images:
default: "ollama/ollama:latest"
Expand Down Expand Up @@ -97,6 +98,17 @@ resourceProfiles:
operator: "Equal"
value: "present"
effect: "NoSchedule"
nvidia-gpu-gh200:
imageName: "gh200"
limits:
nvidia.com/gpu: "1"
requests:
nvidia.com/gpu: "1"
tolerations:
- key: "nvidia.com/gpu"
operator: "Equal"
value: "present"
effect: "NoSchedule"
nvidia-gpu-a100-80gb:
imageName: "nvidia-gpu"
limits:
Expand Down
12 changes: 12 additions & 0 deletions charts/models/values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,18 @@ catalog:
# You can also use nvidia-gpu-a100-80gb:8
resourceProfile: nvidia-gpu-h100:8
targetRequests: 500
llama-3.1-70b-instruct-awq-int4-gh200:
enabled: false
features: [TextGeneration]
url: hf://hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
engine: VLLM
args:
- --max-model-len=16384
      - --max-num-batched-tokens=16384
- --enable-prefix-caching
- --disable-log-requests
resourceProfile: nvidia-gpu-gh200:1
targetRequests: 50
llama-3.1-405b-instruct-fp8-a100-80b:
features: [TextGeneration]
url: hf://neuralmagic/Meta-Llama-3.1-405B-Instruct-FP8
Expand Down
17 changes: 17 additions & 0 deletions manifests/models/llama-3.1-70b-instruct-awq-int4-gh200.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
---
# KubeAI Model resource serving Meta-Llama-3.1-70B-Instruct (AWQ INT4 quantized)
# on a single NVIDIA GH200 node via the vLLM engine.
apiVersion: kubeai.org/v1
kind: Model
metadata:
  name: llama-3.1-70b-instruct-awq-int4-gh200
spec:
  features: [TextGeneration]
  # NOTE(review): bare `owner:` parses as null — confirm an empty owner is intended
  # rather than omitting the field entirely.
  owner:
  url: hf://hugging-quants/Meta-Llama-3.1-70B-Instruct-AWQ-INT4
  engine: VLLM
  args:
    - --max-model-len=16384
    # Fixed: flag was `--max-num-batched-token` (singular), which vLLM's
    # argument parser rejects, crashing the server at startup.
    - --max-num-batched-tokens=16384
    - --enable-prefix-caching
    - --disable-log-requests
  targetRequests: 50
  minReplicas: 1
  resourceProfile: nvidia-gpu-gh200:1

0 comments on commit 6082f5c

Please sign in to comment.