server: tests: add health check and concurrent request example
phymbert committed Feb 20, 2024
1 parent eb65592 commit 0765d9c
Showing 3 changed files with 151 additions and 29 deletions.
22 changes: 19 additions & 3 deletions examples/server/tests/features/server.feature
@@ -1,5 +1,13 @@
 Feature: llama.cpp server
 
+  Background: The server is started and ready to accept prompts
+    When wait for the server to be started
+    Then wait for the server to be healthy
+
+  Scenario: Health endpoint
+    Given an health liveness probe
+    Then the server must be healthy
+
   Scenario Outline: run a completion request
     Given a prompt <prompt>
     When we request a completion
@@ -18,6 +26,14 @@ Feature: llama.cpp server
     Then the oai response contains completion tokens
 
     Examples: Prompts
-      | model | system_prompt | user_prompt |
-      | tinyllama-2 | You are ChatGPT. | Say hello |
-      | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
+      | model | system_prompt | user_prompt |
+      | tinyllama-2 | You are ChatGPT. | Say hello |
+      | tinyllama-2 | You are a coding assistant. | Write the fibonacci function in c++ |
+
+
+  Scenario: Health endpoint during processing with concurrent requests
+    Given 2 slow concurrent prompts
+    Then wait for all slots processing
+    Then the server is overloaded
+    When wait for all slots idle
+    Then all prompts must be predicted
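
The two health scenarios above exercise the server's /health endpoint in both of the states this commit tests: a 200 with {"status": "ok"} once the model is loaded, and a 503 with {"status": "no slot available"} when fail_on_no_slot is passed while every slot is busy. As a minimal client-side sketch of those probes (assuming the default localhost:8080 address that tests.sh uses):

# Minimal sketch of the health probes exercised by the scenarios above;
# assumes the server started by tests.sh is listening on localhost:8080.
import requests

base_url = "http://localhost:8080"

# Liveness: 200 with {"status": "ok"} once the model is loaded.
health = requests.get(f"{base_url}/health")
print(health.status_code, health.json().get("status"))

# Readiness under load: with fail_on_no_slot, the server answers 503 and
# {"status": "no slot available"} while all slots are busy.
busy = requests.get(f"{base_url}/health?fail_on_no_slot")
print(busy.status_code, busy.json().get("status"))
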
142 changes: 129 additions & 13 deletions examples/server/tests/features/steps/steps.py
@@ -1,48 +1,105 @@
+import socket
+import threading
+import time
+from contextlib import closing
+
 import openai
 import requests
-from behave import *
+from behave import step
+from behave.api.async_step import async_run_until_complete
+
+base_fqdn = 'localhost'
+base_port = 8080
+base_url = f"http://{base_fqdn}:{base_port}"
 
 openai.api_key = 'llama.cpp'
-openai.api_base = "http://localhost:8080/v1/chat"
+openai.api_base = f"{base_url}/v1/chat"
+
+slow_prompt = 'say hello ' * 10
+fast_prompt = 'Write a joke'
+
+n_slots = 2
+
+
+@step(u'wait for the server to be started')
+def step_wait_for_the_server_to_be_started(context):
+    server_started = False
+    while not server_started:
+        with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as sock:
+            result = sock.connect_ex((base_fqdn, base_port))
+            if result != 0:
+                print("server not ready: ", base_fqdn, base_port, result)
+                time.sleep(1)
+            else:
+                return 0
+
+
+@step(u'wait for the server to be healthy')
+def step_wait_for_the_server_to_be_healthy(context):
+    status_code = 500
+    while status_code != 200:
+        status_code = requests.get(f'{base_url}/health').status_code
+        if status_code != 200:
+            time.sleep(1)
 
 
-@given(u'a prompt {prompt}')
+@step(u'an health liveness probe')
+def step_an_health_liveness_probe(context):
+    response = requests.get(f'{base_url}/health')
+    context.status_code = response.status_code
+    context.response_data = response.json()
+
+
+@step(u'the server must be healthy')
+def step_server_healthy(context):
+    assert context.status_code == 200
+    assert context.response_data['status'] == 'ok'
+
+
+@step(u'the server is overloaded')
+@async_run_until_complete()
+async def step_server_overloaded(context):
+    response = requests.get(f'{base_url}/health?fail_on_no_slot')
+    assert response.status_code == 503
+    assert response.json()['status'] == 'no slot available'
+
+
+@step(u'a prompt {prompt}')
 def step_prompt(context, prompt):
     context.prompt = prompt
 
 
-@when(u'we request a completion')
+@step(u'we request a completion')
 def step_request_completion(context):
-    response = requests.post('http://localhost:8080/completion', json={
+    response = requests.post(f'{base_url}/completion', json={
         "prompt": context.prompt
     })
     status_code = response.status_code
     assert status_code == 200
     context.response_data = response.json()
 
 
-@then(u'tokens are predicted')
+@step(u'tokens are predicted')
 def step_request_completion(context):
-    assert len(context.response_data['content']) > 0
-    assert context.response_data['timings']['predicted_n'] > 0
+    prompt_predicted(context.response_data)
 
 
-@given(u'a user prompt {user_prompt}')
+@step(u'a user prompt {user_prompt}')
 def step_user_prompt(context, user_prompt):
     context.user_prompt = user_prompt
 
 
-@given(u'a system prompt {system_prompt}')
+@step(u'a system prompt {system_prompt}')
 def step_system_prompt(context, system_prompt):
     context.system_prompt = system_prompt
 
 
-@given(u'a model {model}')
+@step(u'a model {model}')
 def step_model(context, model):
     context.model = model
 
 
-@when(u'we request the oai completions endpoint')
+@step(u'we request the oai completions endpoint')
 def step_oai_completions(context):
     context.chat_completion = openai.Completion.create(
         messages=[
@@ -59,8 +116,67 @@ def step_oai_completions(context):
     )
 
 
-@then(u'the oai response contains completion tokens')
+@step(u'the oai response contains completion tokens')
 def step_oai_response_has_completion_tokens(context):
     assert len(context.chat_completion.choices) == 1
     assert len(context.chat_completion.choices[0].message) > 0
     assert context.chat_completion.usage.completion_tokens > 0
+
+
+def async_prompt(context, prompt):
+    response = requests.post(f'{base_url}/completion', json={
+        "prompt": prompt
+    })
+
+    context.async_responses.append(response)
+
+
+@step(u'{n_prompt} {prompt_type} concurrent prompts')
+def step_n_concurrent_prompts(context, n_prompt, prompt_type):
+    prompt = fast_prompt
+    if prompt_type == 'slow':
+        prompt = slow_prompt
+    context.async_responses = []
+    context.threads = []
+    for i in range(int(n_prompt)):
+        thread = threading.Thread(target=async_prompt, args=(context, prompt))
+        thread.start()
+        context.threads.append(thread)
+
+
+def wait_for_slots_processing(context, expected_slots_processing):
+    while True:
+        health = requests.get(f'{base_url}/health').json()
+        if 'slots_processing' in health:  # FIXME when #5594 is merged
+            slots_processing = health['slots_processing']
+        else:
+            slots_processing = 0
+        if slots_processing == expected_slots_processing:
+            break
+        else:
+            time.sleep(0.2)
+
+
+@step(u'wait for all slots processing')
+def step_wait_for_all_slots_processing(context):
+    wait_for_slots_processing(context, n_slots)
+
+
+@step(u'wait for all slots idle')
+def step_wait_for_all_slots_idle(context):
+    wait_for_slots_processing(context, 0)
+
+
+@step(u'all prompts must be predicted')
+def step_all_prompts_must_be_predicted(context):
+    for thread in context.threads:
+        thread.join()
+    for async_response in context.async_responses:
+        assert async_response.status_code == 200
+        response_data = async_response.json()
+        prompt_predicted(response_data)
+
+
+def prompt_predicted(response_data):
+    assert len(response_data['content']) > 0
+    assert response_data['timings']['predicted_n'] > 0
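
The concurrent-prompt steps above fan out plain threads and hand each one the behave context so it can append its response to a shared list. The same fan-out can be written with concurrent.futures, which collects results through futures instead of a shared list; the following standalone sketch is hypothetical and not part of this commit, and assumes the same localhost:8080 server:

# Hypothetical alternative to the threading-based fan-out in steps.py,
# using concurrent.futures instead of a shared list on the behave context.
# Assumes the server from tests.sh is listening on localhost:8080.
from concurrent.futures import ThreadPoolExecutor

import requests

base_url = "http://localhost:8080"
slow_prompt = "say hello " * 10

def complete(prompt):
    # Same blocking /completion call the async_prompt helper makes.
    return requests.post(f"{base_url}/completion", json={"prompt": prompt})

with ThreadPoolExecutor(max_workers=2) as pool:
    futures = [pool.submit(complete, slow_prompt) for _ in range(2)]
    responses = [future.result() for future in futures]

for response in responses:
    assert response.status_code == 200
    assert response.json()["timings"]["predicted_n"] > 0
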
16 changes: 3 additions & 13 deletions examples/server/tests/tests.sh
@@ -6,6 +6,7 @@ then
   exit 1
 fi
 
+# kill the server at the end
 cleanup() {
   pkill -P $$
 }
@@ -20,26 +21,15 @@ set -eu
 ../../../build/bin/server \
   --model "$model_path" \
   --alias tinyllama-2 \
-  --ctx-size 64 \
+  --ctx-size 1024 \
   --parallel 2 \
-  --n-predict 32 \
+  --n-predict 1024 \
   --batch-size 32 \
   --threads 4 \
   --threads-batch 4 \
   --embedding \
   --cont-batching \
   "$@" &
 
-# Wait for the server to start
-max_attempts=30
-attempts=${max_attempts}
-until curl --silent --fail "http://localhost:8080/health" | jq -r '.status' | grep ok; do
-  attempts=$(( attempts - 1));
-  [ "${attempts}" -eq 0 ] && { echo "Server did not startup" >&2; exit 1; }
-  sleep_time=$(( (max_attempts - attempts) * 2 ))
-  echo "waiting for server to be ready ${sleep_time}s..."
-  sleep ${sleep_time}
-done
-
 # Start tests
 behave
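
With the curl wait loop removed, tests.sh leaves startup waiting to the behave Background steps and simply launches behave at the end. For iterating on a single scenario against an already running server, behave can also be invoked programmatically; this is a hypothetical sketch, not part of this commit, assuming behave is installed and the working directory is examples/server/tests:

# Hypothetical helper: run only the health scenarios programmatically
# instead of the bare `behave` call at the end of tests.sh.
from behave.__main__ import main as behave_main

# -n/--name filters scenarios by name; "Health endpoint" matches both
# health scenarios in server.feature.
behave_main(["--name", "Health endpoint", "features"])
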
