add python openai multithread system test and fix hang in completeFunc #21

Merged · 25 commits · Nov 13, 2023

Commits (the diff below shows changes from all commits):
747ea89
add python openai multithread system test
samos123 Nov 11, 2023
c33fecb
update tests
samos123 Nov 11, 2023
18ee353
increase to 1000 requests to get to 2+ replicas
samos123 Nov 11, 2023
7633c22
change back to 500 requests since 1k caused error
samos123 Nov 11, 2023
c6587f3
allow rerunning test without tearing down kind cluster
samos123 Nov 12, 2023
65e10b0
add helm repo if not exist
samos123 Nov 12, 2023
ffb0436
add configurable timeout
samos123 Nov 12, 2023
a4fc117
add more info to logs in handler
samos123 Nov 12, 2023
29a316e
switch fmt to log
samos123 Nov 12, 2023
da8df8d
Track all active requests in queue rather than just pending in queue
nstogner Nov 12, 2023
3d21eb5
remove port-forward from tests
samos123 Nov 12, 2023
9c8ca6a
reuse client across threads
samos123 Nov 12, 2023
df85ae3
increase MaxConns for DefaultTransport
samos123 Nov 12, 2023
d01d63d
add RetryAble ReverseProxy
samos123 Nov 12, 2023
e9a5c3c
add log to queue.completeFunc
samos123 Nov 13, 2023
0e26c2b
completeFunc fix sending to queue.completed
samos123 Nov 13, 2023
3ff4f2f
add logging statement to handler.go
samos123 Nov 13, 2023
9384ad2
update tests
samos123 Nov 13, 2023
a3fa49b
create client per thread
samos123 Nov 13, 2023
6a77205
try to fix integration tests
samos123 Nov 13, 2023
9c2ec25
Add in-progress counter to queue
nstogner Nov 13, 2023
61b2b22
Move inProgress increment
nstogner Nov 13, 2023
03e7134
remove retry logic
samos123 Nov 13, 2023
55ebe94
revert integration_test.go changes
samos123 Nov 13, 2023
bbcb0c9
better revert
samos123 Nov 13, 2023
1 change: 1 addition & 0 deletions .gitignore
@@ -1 +1,2 @@
 bin
+.venv
2 changes: 1 addition & 1 deletion autoscaler.go
@@ -32,7 +32,7 @@ type Autoscaler struct {
 func (a *Autoscaler) Start() {
 	for range time.Tick(a.Interval) {
 		log.Println("Calculating scales for all")
-		for deploymentName, waitCount := range a.FIFO.WaitCounts() {
+		for deploymentName, waitCount := range a.FIFO.TotalCounts() {
 			avg := a.getMovingAvgQueueSize(deploymentName)
 			avg.Next(float64(waitCount))
 			flt := avg.Calculate()
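
The switch from WaitCounts() to TotalCounts() is the autoscaler half of this PR's fix: items leave the queue's list the moment they are dequeued, so a wait-only count drops to zero as soon as the dispatcher drains the queue, and the moving average would drive replicas down while requests are still being served. Counting everything enqueued-but-not-completed keeps in-flight work visible. A minimal illustration of the difference (not code from this PR):

package main

import "fmt"

func main() {
	// One replica serving at its concurrency limit, queue just drained.
	waiting := 0    // nothing left waiting in the queue
	inProgress := 4 // four requests still being served

	fmt.Println("wait count:", waiting)             // 0: looks idle, would scale down
	fmt.Println("total count:", waiting+inProgress) // 4: in-flight work stays visible
}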
2 changes: 2 additions & 0 deletions deploy/proxy_service.yaml
@@ -9,3 +9,5 @@ spec:
   - protocol: TCP
     port: 80
     targetPort: 8080
+    nodePort: 30080
+  type: NodePort
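
Exposing the proxy as a NodePort on 30080 pairs with the kind extraPortMappings added in tests/system-test-kind.sh below: the test cluster maps container port 30080 to 127.0.0.1:30080 on the host, so the system test can reach the proxy at a stable local address and the kubectl port-forward step can be removed.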
9 changes: 5 additions & 4 deletions handler.go
@@ -48,17 +48,18 @@ func (h *Handler) ServeHTTP(w http.ResponseWriter, r *http.Request) {

 	h.Deployments.AtLeastOne(deploy)

-	log.Println("Entering queue")
+	log.Println("Entering queue", id)
 	complete := h.FIFO.EnqueueAndWait(r.Context(), deploy, id)
-	log.Println("Admitted into queue")
+	log.Println("Admitted into queue", id)
 	defer complete()

-	log.Println("Waiting for IPs")
+	log.Println("Waiting for IPs", id)
 	host := h.Endpoints.GetHost(r.Context(), deploy)
-	log.Printf("Got host: %v", host)
+	log.Printf("Got host: %v, id: %v\n", host, id)

 	// TODO: Avoid creating new reverse proxies for each request.
 	// TODO: Consider implementing a round robin scheme.
+	log.Printf("Proxying request to host %v: %v\n", host, id)
 	newReverseProxy(host).ServeHTTP(w, proxyRequest)
 }
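
The handler changes are purely diagnostic: the request id is threaded through every log line (enqueue, admission, host lookup, proxying), so one request's lifecycle can be followed through the interleaved logs of concurrent requests. That visibility is what makes a stall like the completeFunc hang fixed in pkg/queue/queue.go below traceable.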
11 changes: 7 additions & 4 deletions integration_test.go
@@ -3,6 +3,7 @@ package main
 import (
 	"bytes"
 	"fmt"
+	"log"
 	"net/http"
 	"net/http/httptest"
 	"net/url"
@@ -32,6 +33,7 @@ func TestIntegration(t *testing.T) {

 	backendRequests := &atomic.Int32{}
 	testBackend := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
+		log.Println("Serving request from testBackend")
 		backendRequests.Add(1)
 		<-backendComplete
 		w.WriteHeader(200)
@@ -55,22 +57,23 @@ func TestIntegration(t *testing.T) {
 	// Send request number 1
 	var wg sync.WaitGroup
 	sendRequests(t, &wg, modelName, 1)

 	requireDeploymentReplicas(t, deploy, 1)
 	require.Equal(t, int32(1), backendRequests.Load(), "ensure the request made its way to the backend")
 	completeRequests(backendComplete, 1)

 	// Ensure the deployment scaled scaled past 1.
-	// 1/3 should be admitted
-	// 2/3 should remain in queue --> replicas should equal 2
-	sendRequests(t, &wg, modelName, 3)
+	// 1/2 should be admitted
+	// 1/2 should remain in queue --> replicas should equal 2
+	sendRequests(t, &wg, modelName, 2)
 	requireDeploymentReplicas(t, deploy, 2)

 	// Make sure deployment will not be scaled past default max (3).
 	sendRequests(t, &wg, modelName, 2)
 	requireDeploymentReplicas(t, deploy, 3)

-	completeRequests(backendComplete, 5)
+	// Have the mock backend respond to the remaining 4 requests.
+	completeRequests(backendComplete, 4)

 	// Ensure scale-down.
 	requireDeploymentReplicas(t, deploy, 0)
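
The arithmetic follows from the smaller batch: the test now sends 1 + 2 + 2 = 5 requests instead of 1 + 3 + 2 = 6, and with the first request already completed, 4 remain for the mock backend, hence completeRequests(backendComplete, 4).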
5 changes: 5 additions & 0 deletions main.go
@@ -102,6 +102,11 @@ func run() error {
 	autoscaler.FIFO = fifo
 	go autoscaler.Start()

+	// Change the global defaults and remove limits on max conns
+	defaultTransport := http.DefaultTransport.(*http.Transport)
+	defaultTransport.MaxIdleConns = 0
+	defaultTransport.MaxIdleConnsPerHost = 0
+	defaultTransport.MaxConnsPerHost = 0
 	handler := &Handler{
 		Deployments: scaler,
 		Endpoints:   endpoints,
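
One subtlety in this hunk is worth spelling out: for http.Transport, MaxIdleConns = 0 and MaxConnsPerHost = 0 genuinely mean "no limit", but MaxIdleConnsPerHost = 0 means "use DefaultMaxIdleConnsPerHost", which is 2. So this change lifts the total and per-host connection caps while resetting the per-host idle pool to its small default. A sketch of an alternative that avoids mutating the shared global transport (illustrative only; the newTunedTransport helper is hypothetical, not part of this PR):

package main

import (
	"fmt"
	"net/http"
)

// newTunedTransport is a hypothetical helper: it clones the default
// transport and relaxes its connection limits instead of mutating the
// process-wide http.DefaultTransport.
func newTunedTransport() *http.Transport {
	t := http.DefaultTransport.(*http.Transport).Clone()
	t.MaxIdleConns = 0          // 0 = no limit across all hosts
	t.MaxConnsPerHost = 0       // 0 = no limit per host
	t.MaxIdleConnsPerHost = 100 // 0 would fall back to the default of 2
	return t
}

func main() {
	client := &http.Client{Transport: newTunedTransport()}
	fmt.Printf("%+v\n", client.Transport)
}

Mutating the global default, as the PR does, is simpler and affects every consumer of http.DefaultTransport in the process, which is presumably the intent here since the reverse proxy relies on the default transport.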
57 changes: 45 additions & 12 deletions pkg/queue/queue.go
@@ -3,8 +3,9 @@ package queue
 import (
 	"container/list"
 	"context"
-	"fmt"
+	"log"
 	"sync"
+	"sync/atomic"
 	"time"
 )
@@ -30,6 +31,14 @@ type FIFOQueue struct {

 	// completed signals when a item that has been dequeued has completed.
 	completed chan struct{}
+
+	// totalCount is the number of requests that have been enqueued
+	// but not yet completed.
+	totalCount atomic.Int64
+
+	// inProgressCount is the number of requests that have been dequeued
+	// but not yet completed.
+	inProgressCount atomic.Int64
 }

 type item struct {
@@ -46,6 +55,9 @@ type item struct {
 }

 func (q *FIFOQueue) dequeue(itm *item, inProgress bool) {
+	if inProgress {
+		q.inProgressCount.Add(1)
+	}
 	q.listMtx.Lock()
 	itm.inProgress = inProgress
 	q.list.Remove(itm.e)
@@ -60,6 +72,7 @@ func (q *FIFOQueue) dequeue(itm *item, inProgress bool) {
 // It returns a function that should be called after all work has completed.
 // The id parameter is only used for tracking/debugging purposes.
 func (q *FIFOQueue) EnqueueAndWait(ctx context.Context, id string) func() {
+	q.totalCount.Add(1)
 	itm := &item{
 		id: id,
 		dequeued: make(chan struct{}),
@@ -87,27 +100,43 @@ func (q *FIFOQueue) EnqueueAndWait(ctx context.Context, id string) func() {

 func (q *FIFOQueue) completeFunc(itm *item) func() {
 	return func() {
+		log.Println("Running completeFunc: ", itm.id)
+		q.totalCount.Add(-1)
+
+		log.Println("Locking queue.list: ", itm.id)
 		q.listMtx.Lock()
 		if !itm.closed {
+			log.Println("Closing item.dequeued: ", itm.id)
 			close(itm.dequeued)
 			itm.closed = true
 		}
+
 		inProgress := itm.inProgress
+		log.Printf("Item %v inProgress: %v\n", itm.id, inProgress)
 		q.listMtx.Unlock()

 		if inProgress {
+			q.inProgressCount.Add(-1)
+
 			// Make sure we only send a message on the completed channel if the
 			// item was counted as inProgress.
-			q.completed <- struct{}{}
+			select {
+			case q.completed <- struct{}{}:
+				log.Println("Sent completed message: ", itm.id)
+			default:
+				log.Println("Did not send completed message: ", itm.id)
+			}
 		}
+
+		log.Println("Finished completeFunc: ", itm.id)
 	}
 }

 func (q *FIFOQueue) Start() {
-	var inProgress int
 	for {
-		if inProgress >= q.GetConcurrency() {
+		if q.inProgressCount.Load() >= int64(q.GetConcurrency()) {
+			log.Println("Waiting for requests to complete")
 			<-q.completed
-			inProgress--
 			continue
 		}
@@ -121,11 +150,9 @@
 			continue
 		}

-		inProgress++
-
 		itm := e.Value.(*item)
 		q.dequeue(itm, true)
-		fmt.Println("Dequeued: ", itm.id)
+		log.Println("Dequeued: ", itm.id)

 		time.Sleep(time.Second / 100)
 	}
@@ -143,8 +170,14 @@
 	q.concurrencyMtx.Unlock()
 }

-func (q *FIFOQueue) Size() int {
-	q.listMtx.Lock()
-	defer q.listMtx.Unlock()
-	return q.list.Len()
+// TotalCount returns all requests that have made a call to EnqueueAndWait()
+// but have not yet completed.
+func (q *FIFOQueue) TotalCount() int64 {
+	return q.totalCount.Load()
+}
+
+// InProgressCount returns all requests that have been dequeued
+// but have not yet completed.
+func (q *FIFOQueue) InProgressCount() int64 {
+	return q.inProgressCount.Load()
 }
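
This file is the hang fix named in the PR title. Previously, completeFunc ended with a plain blocking send, q.completed <- struct{}{}, on an unbuffered channel, but Start() only receives from that channel while the queue sits at its concurrency limit. A request that completed at any other moment left its caller blocked in complete() forever. The fix has two parts: the in-progress count moves from a local variable inside Start() into an atomic counter that completeFunc decrements directly, and the send becomes a non-blocking select with a default branch, demoting the channel to a best-effort wake-up hint. A minimal sketch of the resulting pattern (illustrative, not the PR's code):

package main

import (
	"fmt"
	"sync/atomic"
	"time"
)

// notifier sketches the pattern: the atomic counter is the source of
// truth, and the channel only wakes a sleeping dispatcher, so senders
// never block.
type notifier struct {
	inFlight atomic.Int64
	wake     chan struct{}
}

func (n *notifier) done() {
	n.inFlight.Add(-1)
	select {
	case n.wake <- struct{}{}: // dispatcher was listening: wake it
	default: // nobody listening: drop the hint, the counter already changed
	}
}

func main() {
	n := &notifier{wake: make(chan struct{})}
	n.inFlight.Store(1)

	go n.done() // completes even if no one is receiving yet

	for n.inFlight.Load() > 0 {
		select {
		case <-n.wake:
		case <-time.After(10 * time.Millisecond): // re-check periodically
		}
	}
	fmt.Println("all requests completed")
}

Because every completion updates the counter before attempting the send, a dropped wake-up can no longer strand the goroutine running complete(), which was the hang.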
8 changes: 4 additions & 4 deletions pkg/queuemanager/manager.go
@@ -24,12 +24,12 @@ type FIFOQueueManager struct {
 	queues map[string]*queue.FIFOQueue
 }

-// WaitCounts returns the number of pending or in-progress requests for each deployment name
-func (m *FIFOQueueManager) WaitCounts() map[string]int {
+// TotalCounts returns the number of pending or in-progress requests for each deployment name
+func (m *FIFOQueueManager) TotalCounts() map[string]int64 {
 	m.mtx.Lock()
-	sizes := make(map[string]int, len(m.queues))
+	sizes := make(map[string]int64, len(m.queues))
 	for name, q := range m.queues {
-		sizes[name] = q.Size()
+		sizes[name] = q.TotalCount()
 	}
 	m.mtx.Unlock()
 	return sizes
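
TotalCounts() is what the autoscaler polls (see autoscaler.go above): per deployment, it now reports every request between EnqueueAndWait() and completion via the new FIFOQueue.TotalCount(), rather than just the queued backlog that Size() used to return.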
109 changes: 90 additions & 19 deletions tests/system-test-kind.sh
@@ -1,40 +1,111 @@
 #!/usr/bin/env bash

-set -e
+set -xe

-kind create cluster --name=substratus-test
-trap "kind delete cluster --name=substratus-test" EXIT
+DELETE_CLUSTER=${DELETE_CLUSTER:-true}
+# This is possible because of kind extraPortMappings
+HOST=127.0.0.1
+PORT=30080
+BASE_URL="http://$HOST:$PORT/v1"
+
+
+if kind get clusters | grep -q substratus-test; then
+  echo "Cluster substratus-tests already exists.. reusing it"
+else
+  kind create cluster --config - << EOF
+kind: Cluster
+apiVersion: kind.x-k8s.io/v1alpha4
+name: substratus-test
+nodes:
+- role: control-plane
+  # port forward 80 on the host to 80 on this node
+  extraPortMappings:
+  - containerPort: 30080
+    hostPort: 30080
+    listenAddress: "127.0.0.1"
+EOF
+  if [ "$DELETE_CLUSTER" = true ]; then
+    echo "Going to delete cluster substratus-test on exit"
+    trap "kind delete cluster --name=substratus-test" EXIT
+  fi
+fi

-skaffold run
+if ! kubectl get deployment proxy-controller; then
+  skaffold run
+fi

 kubectl wait --for=condition=available --timeout=30s deployment/proxy-controller

-kubectl port-forward svc/proxy-controller 8080:80 &
+if ! helm repo list | grep -q substratusai; then
+  helm repo add substratusai https://substratusai.github.io/helm/
+fi
+helm repo update
+helm upgrade --install stapi-minilm-l6-v2 substratusai/stapi -f - << EOF
+model: all-MiniLM-L6-v2
+replicaCount: 0
+deploymentAnnotations:
+  lingo.substratus.ai/models: text-embedding-ada-002
+EOF

-# need to wait for a bit for the port-forward to be ready
-sleep 5
-
-replicas=$(kubectl get deployment backend -o jsonpath='{.spec.replicas}')
+replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}')
 if [ "$replicas" -ne 0 ]; then
   echo "Expected 0 replica before sending requests, got $replicas"
   exit 1
 fi

-echo "Sending 60 requests to model named backend"
-for i in {1..60}; do
-  curl -s -o /dev/null http://localhost:8080/delay/10 \
-    -H "Content-Type: application/json" \
-    -d '{
-      "text": "Your text string goes here",
-      "model": "backend"
-    }' &
-done
+SCRIPT_DIR=$(dirname "$0")
+VENV_DIR=$SCRIPT_DIR/.venv

-sleep 10
+if [ ! -d "$VENV_DIR" ]; then
+  python3 -m venv "$VENV_DIR"
+fi
+source "$VENV_DIR/bin/activate"
+pip3 install openai==1.2.3

-replicas=$(kubectl get deployment backend -o jsonpath='{.spec.replicas}')
+# Send 60 requests in parallel to stapi backend using openai python client and threading
+python3 $SCRIPT_DIR/test_openai_embedding.py \
+  --requests 60 --timeout 300 --base-url "${BASE_URL}" \
+  --model text-embedding-ada-002

-if [ "$replicas" -ne 1 ]; then
-  echo "Expected 1 replica after sending less than 100 requests, got $replicas"
+# Ensure replicas has been scaled up to 1 after sending 60 requests
+replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}')
+if [ "$replicas" -eq 1 ]; then
+  echo "Test passed: Expected 1 replica after sending requests 60 requests"
+else
+  echo "Test failed: Expected 1 replica after sending requests 60 requests, got $replicas"
   exit 1
 fi
+
+echo "Waiting for deployment to scale down back to 0 within 2 minutes"
+for i in {1..15}; do
+  if [ "$i" -eq 15 ]; then
+    echo "Test failed: Expected 0 replica after not having requests for more than 1 minute, got $replicas"
+    exit 1
+  fi
+  replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}')
+  if [ "$replicas" -eq 0 ]; then
+    echo "Test passed: Expected 0 replica after not having requests for more than 1 minute"
+    break
+  fi
+  sleep 8
+done
+
+# Scale up again after scaling to 0 is broken right now
+# requests=500
+# echo "Send $requests requests in parallel to stapi backend using openai python client and threading"
+# python3 $SCRIPT_DIR/test_openai_embedding.py \
+#   --requests $requests --timeout 600 --base-url "${BASE_URL}" \
+#   --model text-embedding-ada-002
+#
+# replicas=$(kubectl get deployment stapi-minilm-l6-v2 -o jsonpath='{.spec.replicas}')
+# if [ "$replicas" -ge 2 ]; then
+#   echo "Test passed: Expected 2 or more replicas after sending more than $requests requests, got $replicas"
+# else
+#   echo "Test failed: Expected 2 or more replicas after sending more than $requests requests, got $replicas"
+#   exit 1
+# fi
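
The load generation itself is delegated to test_openai_embedding.py (added by this PR alongside this script), which uses the openai Python client with threads to fire 60 concurrent embedding requests through the proxy at BASE_URL. For readers following the Go codebase, a rough equivalent of that load pattern (illustrative only, not the PR's test; the URL and payload mirror the script above):

package main

import (
	"bytes"
	"fmt"
	"net/http"
	"sync"
)

func main() {
	const url = "http://127.0.0.1:30080/v1/embeddings"
	const requests = 60
	payload := []byte(`{"model": "text-embedding-ada-002", "input": "Your text string goes here"}`)

	var wg sync.WaitGroup
	for i := 0; i < requests; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			// Each goroutine posts one embedding request through the proxy.
			resp, err := http.Post(url, "application/json", bytes.NewReader(payload))
			if err != nil {
				fmt.Println("request failed:", err)
				return
			}
			resp.Body.Close()
			fmt.Println("status:", resp.StatusCode)
		}()
	}
	wg.Wait()
}

This burst of parallel connections is also the kind of workload behind the MaxConns changes to http.DefaultTransport in main.go above.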