# Cache optimized routing ("PrefixHash" load balancing, i.e. CHWBL) (#333)

Implementation of proposal: #314

* Add `.spec.loadBalancing` field to Model
* Add `PrefixHash` (i.e. "Consistent Hashing with Bounded Loads" - CHWBL) load balancing strategy
* Rename `endpoints` package to `loadbalancer`
* Rename `modelscaler` package to `modelclient`
* Refactor request parsing logic out of `modelproxy` and `messenger` and into `apiutils` as a shared library
* Add Load Balancing concepts doc
* Add benchmark showing a `34%` improvement in time per generated token using PrefixHash over LeastLoad in specific circumstances

TODO:

* File an issue for making `PrefixHash` the default strategy in the future if benchmarks look good
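For context, a minimal sketch of the CHWBL idea behind the `PrefixHash` strategy. This is not the actual Go implementation in this commit; the class name, `load_factor`, and `replicas` parameters are illustrative:

```python
import bisect
import hashlib


def _hash(value: str) -> int:
    return int.from_bytes(hashlib.md5(value.encode()).digest()[:8], "big")


class CHWBL:
    """Consistent Hashing with Bounded Loads: requests with the same key
    (e.g. a chat-thread prefix) map to the same endpoint, unless that
    endpoint is already overloaded relative to the mean load."""

    def __init__(self, load_factor: float = 1.25, replicas: int = 100):
        self.load_factor = load_factor  # c > 1; caps load at ~c * mean load
        self.replicas = replicas        # virtual nodes per endpoint
        self.ring = []                  # sorted list of (point, endpoint)
        self.loads = {}                 # endpoint -> in-flight request count

    def add(self, endpoint: str) -> None:
        self.loads[endpoint] = 0
        for i in range(self.replicas):
            bisect.insort(self.ring, (_hash(f"{endpoint}/{i}"), endpoint))

    def _bound(self) -> float:
        total = sum(self.loads.values())
        return self.load_factor * (total + 1) / len(self.loads)

    def route(self, key: str) -> str:
        # Walk clockwise from the key's point until an endpoint has capacity.
        start = bisect.bisect(self.ring, (_hash(key),))
        for offset in range(len(self.ring)):
            _, endpoint = self.ring[(start + offset) % len(self.ring)]
            if self.loads[endpoint] + 1 <= self._bound():
                self.loads[endpoint] += 1
                return endpoint
        raise RuntimeError("no endpoints available")

    def done(self, endpoint: str) -> None:
        self.loads[endpoint] -= 1
```

Requests carrying the same prefix hash to the same ring point, so they keep landing on the same replica (and its warm cache) until that replica exceeds `load_factor` times the mean load, at which point the walk spills over to the next endpoint.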
Showing 45 changed files with 2,184 additions and 714 deletions.
New file (+1 line):

```
data/ShareGPT_V3_unfiltered_cleaned_split.json
```
New file (+1 line):

```
data/*.json
```
New file, `Dockerfile` (+14 lines):

```dockerfile
FROM ubuntu:20.04

RUN apt-get update && apt-get install -y build-essential make python3 wget vim

# Install k6 binary.
ENV K6_VERSION=v0.55.0
RUN wget https://github.com/grafana/k6/releases/download/${K6_VERSION}/k6-${K6_VERSION}-linux-amd64.tar.gz && tar -zxvf k6-${K6_VERSION}-linux-amd64.tar.gz && mv k6-${K6_VERSION}-linux-amd64/k6 /usr/local/bin && rm k6-${K6_VERSION}-linux-amd64.tar.gz

WORKDIR /work

COPY ./k6.js .
COPY ./Makefile .
COPY ./data ./data
COPY ./scenarios ./scenarios
```
New file, `Makefile` (+10 lines):

```makefile
data/ShareGPT_V3_unfiltered_cleaned_split.json:
	cd data && wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json

.PHONY: data
data: data/ShareGPT_V3_unfiltered_cleaned_split.json
	cd data && python prepare-message-threads.py

run:
	ls scenarios/${SCENARIO}
	CONFIG_DIR=scenarios/${SCENARIO} DATA_DIR=data MODEL_ADDR=kubeai/openai k6 run ./k6.js
```
New file, `data/prepare-message-threads.py` (+43 lines):

```python
import json


def main():
    with open("./ShareGPT_V3_unfiltered_cleaned_split.json", "r") as f:
        data = json.load(f)

    # Select a subset: the first conversations that start with a human message.
    max_threads = 2000
    output = []
    for entry in data:
        conv = entry.get("conversations")
        if conv and conv[0]["from"] == "human" and len(conv[0]["value"]) != 0:
            # Collect only the human-authored messages from the conversation.
            total_content_length = 0
            user_messages = []
            for c in conv:
                if c["from"] == "human":
                    content = c["value"]
                    user_messages.append(content)
                    total_content_length += len(content)

            # Skip threads with too little user content...
            if total_content_length < 2500:
                continue

            # ...or too few user turns.
            if len(user_messages) < 5:
                continue

            # Keep only the user messages and drop the original conversation.
            entry["userMessages"] = user_messages
            del entry["conversations"]
            output.append(entry)

            if len(output) >= max_threads:
                break

    with open("./message-threads.json", "w") as f:
        json.dump(output, f, indent=4)


if __name__ == "__main__":
    main()
```
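A quick way to sanity-check the generated file against the filters above (hypothetical snippet, not part of this commit):

```python
import json

# Inspect the output of prepare-message-threads.py.
with open("./message-threads.json") as f:
    threads = json.load(f)

print(len(threads))                                # at most 2000 threads
first = threads[0]
print(len(first["userMessages"]))                  # >= 5 user messages per thread
print(sum(len(m) for m in first["userMessages"]))  # >= 2500 characters of user content
```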
New file, `k6.js` (+71 lines):

```javascript
import { check } from 'k6';
import { scenario } from 'k6/execution';
import http from 'k6/http';
import { Trend, Counter } from 'k6/metrics';

const model_addr = __ENV.MODEL_ADDR;
const config_dir = __ENV.CONFIG_DIR;
const data_dir = __ENV.DATA_DIR;

const timePerToken = new Trend('time_per_token', true);
const tokens = new Counter('tokens');
const new_tokens = new Counter('new_tokens');
const input_tokens = new Counter('input_tokens');

const k6Options = JSON.parse(open(`${config_dir}/k6.json`));
const baseRequest = JSON.parse(open(`${config_dir}/base-request.json`));
const messageThreads = JSON.parse(open(`${data_dir}/message-threads.json`));

export const options = k6Options;

export default function run() {
  const headers = { 'Content-Type': 'application/json' };
  const msgThread = messageThreads[scenario.iterationInTest % messageThreads.length];
  // Deep-copy the base request so each iteration starts from a clean payload.
  const payload = JSON.parse(JSON.stringify(baseRequest));

  // Iterate over all the messages in the thread, appending the completions to the same payload.
  for (let i = 0; i < msgThread["userMessages"].length; i++) {
    payload.messages.push({
      "role": "user",
      "content": msgThread["userMessages"][i]
    });

    const res = http.post(`http://${model_addr}/v1/chat/completions`, JSON.stringify(payload), {
      headers,
    });
    // Abandon the thread on client errors (e.g. context length exceeded).
    if (res.status >= 400 && res.status < 500) {
      return;
    }

    check(res, {
      'Post status is 200': (res) => res.status === 200,
    });
    const duration = res.timings.duration;

    if (res.status === 200) {
      const body = res.json();

      const completion_tokens = body.usage.completion_tokens;
      const prompt_tokens = body.usage.prompt_tokens;
      const latency_ms_per_token = duration / completion_tokens;

      new_tokens.add(completion_tokens);
      input_tokens.add(prompt_tokens);
      timePerToken.add(latency_ms_per_token);
      tokens.add(completion_tokens + prompt_tokens);

      // Append the assistant reply so the next request shares a growing prefix.
      const msg0 = body.choices[0].message;
      payload.messages.push({
        "role": msg0.role,
        "content": msg0.content
      });
    } else {
      console.log(`Error Status: ${res.status}`);
      console.log(`Response: ${res.body}`);
    }
  }
}
```
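The script expects `${config_dir}/base-request.json` to hold an OpenAI-style chat-completions body with an empty `messages` array (the scenario config files are not shown in this view). A single message thread can be replayed outside of k6 with a sketch like the following; the address, model name, and `max_tokens` value are assumptions for illustration:

```python
import json
import urllib.request

MODEL_ADDR = "localhost:8000"  # assumption: a port-forward to the KubeAI OpenAI-compatible service

# Assumed shape of base-request.json; model name and max_tokens are illustrative.
payload = {
    "model": "llama-3.1-8b-instruct-fp8-l4",
    "max_tokens": 10,
    "messages": [],
}

with open("data/message-threads.json") as f:
    thread = json.load(f)[0]

for user_message in thread["userMessages"]:
    # Append the next user turn, mirroring the k6 script's multi-turn loop.
    payload["messages"].append({"role": "user", "content": user_message})
    req = urllib.request.Request(
        f"http://{MODEL_ADDR}/v1/chat/completions",
        data=json.dumps(payload).encode(),
        headers={"Content-Type": "application/json"},
    )
    with urllib.request.urlopen(req) as resp:
        body = json.load(resp)
    # Feed the assistant reply back in so successive requests share a growing prefix.
    msg = body["choices"][0]["message"]
    payload["messages"].append({"role": msg["role"], "content": msg["content"]})
    print(body["usage"]["completion_tokens"], "completion tokens")
```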
New file, `benchmarks/chat/scenarios/least-load-vs-prefix-hash/README.md` (131 additions, 0 deletions):
# Results

Under specific conditions:

* Restricted GPU memory
* Low `max_tokens` to be generated
* Chat threads with decently long user messages

prefix hashing showed a `34%` decrease in average time per token:

`712.11ms (LeastLoad) --> 469.34ms (PrefixHash)`

((712.11 − 469.34) / 712.11 ≈ 0.34.) The likely explanation: routing requests that share a conversation prefix to the same replica raises the prefix-cache hit rate, which matters most when cache space is tight and each response is short.
## Steps taken

```bash
gcloud container clusters create-auto cluster-1 \
    --location=us-central1
skaffold run -f ./skaffold.yaml --tail --port-forward --profile kubeai-only-gke --default-repo us-central1-docker.pkg.dev/substratus-dev

cd ./benchmarks/chat
make data
export IMG=us-central1-docker.pkg.dev/substratus-dev/default/kubeai-benchmark-chat:v0.0.2
docker build -t $IMG . && docker push $IMG

kubectl apply -f ./scenarios/least-load-vs-prefix-hash/model.yaml
kubectl apply -f ./scenarios/least-load-vs-prefix-hash/pod.yaml

# Run 2x (to ensure both cases start with a preloaded cache)
kubectl exec -it chat-benchmark -- SCENARIO=least-load-vs-prefix-hash make run

kubectl patch model llama-3.1-8b-instruct-fp8-l4 --type='merge' -p '{"spec": {"loadBalancing": {"strategy": "PrefixHash"}}}'
kubectl exec -it chat-benchmark -- SCENARIO=least-load-vs-prefix-hash make run
```

## Next Steps

* Rerun with increased replicas (e.g. 10 instead of 2)
## Benchmark Output

### LeastLoad

```
execution: local
   script: ./k6.js
   output: -

scenarios: (100.00%) 1 scenario, 80 max VUs, 10m30s max duration (incl. graceful stop):
         * chat: 1000 iterations shared among 80 VUs (maxDuration: 10m0s, gracefulStop: 30s)

✓ Post status is 200

checks.........................: 100.00% 7341 out of 7341
data_received..................: 4.7 MB  7.9 kB/s
data_sent......................: 25 MB   42 kB/s
http_req_blocked...............: avg=161.4µs  min=2.83µs   med=5.8µs    max=16.67ms  p(90)=8.06µs   p(95)=10.19µs
http_req_connecting............: avg=55.73µs  min=0s       med=0s       max=8.41ms   p(90)=0s       p(95)=0s
http_req_duration..............: avg=6.31s    min=165.25ms med=6.66s    max=11.65s   p(90)=8.55s    p(95)=9.07s
  { expected_response:true }...: avg=6.31s    min=165.25ms med=6.66s    max=11.65s   p(90)=8.55s    p(95)=9.07s
✓ http_req_failed................: 0.00%   0 out of 7341
http_req_receiving.............: avg=84.64µs  min=29.4µs   med=74.05µs  max=732.69µs p(90)=129.94µs p(95)=154.19µs
http_req_sending...............: avg=68µs     min=12.1µs   med=32.3µs   max=1.38ms   p(90)=144.04µs p(95)=173.19µs
http_req_tls_handshaking.......: avg=0s       min=0s       med=0s       max=0s       p(90)=0s       p(95)=0s
http_req_waiting...............: avg=6.31s    min=165.04ms med=6.66s    max=11.65s   p(90)=8.55s    p(95)=9.07s
http_reqs......................: 7341    12.422953/s
input_tokens...................: 4990223 8444.803735/s
iteration_duration.............: avg=46.39s   min=6.73s    med=41.26s   max=4m13s    p(90)=1m8s     p(95)=1m28s
iterations.....................: 1000    1.69227/s
new_tokens.....................: 68062   115.179268/s
time_per_token.................: avg=712.11ms min=39.56ms  med=703.28ms max=2.69s    p(90)=928.58ms p(95)=1.09s
tokens.........................: 5058285 8559.983003/s
vus............................: 1       min=0  max=80
vus_max........................: 80      min=21 max=80

running (09m50.9s), 00/80 VUs, 1000 complete and 0 interrupted iterations
chat ✓ [======================================] 80 VUs  09m50.9s/10m0s  1000/1000 shared iters
```

### PrefixHash

```
execution: local
   script: ./k6.js
   output: -

scenarios: (100.00%) 1 scenario, 80 max VUs, 10m30s max duration (incl. graceful stop):
         * chat: 1000 iterations shared among 80 VUs (maxDuration: 10m0s, gracefulStop: 30s)

✓ Post status is 200

checks.........................: 100.00% 7341 out of 7341
data_received..................: 4.7 MB  12 kB/s
data_sent......................: 25 MB   65 kB/s
http_req_blocked...............: avg=268.24µs min=2.94µs   med=5.76µs   max=28.19ms  p(90)=8.17µs   p(95)=10.41µs
http_req_connecting............: avg=136.33µs min=0s       med=0s       max=17.7ms   p(90)=0s       p(95)=0s
http_req_duration..............: avg=4.08s    min=151.9ms  med=2.45s    max=12.32s   p(90)=9.63s    p(95)=10.26s
  { expected_response:true }...: avg=4.08s    min=151.9ms  med=2.45s    max=12.32s   p(90)=9.63s    p(95)=10.26s
✓ http_req_failed................: 0.00%   0 out of 7341
http_req_receiving.............: avg=81.81µs  min=28.68µs  med=72.08µs  max=786.09µs p(90)=125.04µs p(95)=148.6µs
http_req_sending...............: avg=63.61µs  min=11.85µs  med=31.65µs  max=1.59ms   p(90)=136.85µs p(95)=161.88µs
http_req_tls_handshaking.......: avg=0s       min=0s       med=0s       max=0s       p(90)=0s       p(95)=0s
http_req_waiting...............: avg=4.08s    min=151.81ms med=2.45s    max=12.32s   p(90)=9.63s    p(95)=10.26s
http_reqs......................: 7341    19.230625/s
input_tokens...................: 4990576 13073.409349/s
iteration_duration.............: avg=29.98s   min=2.37s    med=20.29s   max=2m53s    p(90)=1m1s     p(95)=1m18s
iterations.....................: 1000    2.619619/s
new_tokens.....................: 68218   178.705191/s
time_per_token.................: avg=469.34ms min=44.2ms   med=257.72ms max=3.86s    p(90)=1s       p(95)=1.1s
tokens.........................: 5058794 13252.11454/s
vus............................: 3       min=0  max=80
vus_max........................: 80      min=19 max=80

running (06m21.7s), 00/80 VUs, 1000 complete and 0 interrupted iterations
chat ✓ [======================================] 80 VUs  06m21.7s/10m0s  1000/1000 shared iters
```