feat: add amd support #114

Open · wants to merge 2 commits into master
41 changes: 41 additions & 0 deletions docker-compose-rocm-ggml.yml
@@ -0,0 +1,41 @@
version: '3.6'

services:
  llama-gpt-api-rocm-ggml:
    build:
      context: ./rocm
      dockerfile: ggml.Dockerfile
    restart: on-failure
    devices:
      - /dev/dri:/dev/dri
    volumes:
      - './models:/models'
      - './rocm:/rocm'
    ports:
      - 3001:8000
    environment:
      MODEL: '/models/${MODEL_NAME:-llama-2-7b-chat.bin}'
      MODEL_DOWNLOAD_URL: '${MODEL_DOWNLOAD_URL:-https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin}'
      N_GQA: '${N_GQA:-1}'
      USE_MLOCK: 1
    cap_add:
      - IPC_LOCK
      - SYS_RESOURCE
    command: '/bin/sh /rocm/run.sh'

  llama-gpt-ui:
    # TODO: Use this image instead of building from source after the next release
    # image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
    build:
      context: ./ui
      dockerfile: Dockerfile
    ports:
      - 3000:3000
    restart: on-failure
    environment:
      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
      - 'OPENAI_API_HOST=http://llama-gpt-api-rocm-ggml:8000'
      - 'DEFAULT_MODEL=/models/${MODEL_NAME:-llama-2-7b-chat.bin}'
      - 'NEXT_PUBLIC_DEFAULT_SYSTEM_PROMPT=${DEFAULT_SYSTEM_PROMPT:-"You are a helpful and friendly AI assistant. Respond very concisely."}'
      - 'WAIT_HOSTS=llama-gpt-api-rocm-ggml:8000'
      - 'WAIT_TIMEOUT=${WAIT_TIMEOUT:-3600}'
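
A minimal way to exercise this compose file, assuming it is run from the repository root and that the OpenAI-compatible /v1/models route served by llama_cpp.server is reachable on the published port 3001:

docker compose -f docker-compose-rocm-ggml.yml up --build
# In another shell, once the model has downloaded and the server is listening:
curl http://localhost:3001/v1/models
# Note: ROCm compute usually needs /dev/kfd in addition to /dev/dri inside the
# container; if the API container cannot see the GPU, mapping /dev/kfd alongside
# /dev/dri in the devices list above may be required.
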
41 changes: 41 additions & 0 deletions docker-compose-rocm-gguf.yml
@@ -0,0 +1,41 @@
version: '3.6'

services:
  llama-gpt-api-rocm-gguf:
    build:
      context: ./rocm
      dockerfile: gguf.Dockerfile
    restart: on-failure
    devices:
      - /dev/dri:/dev/dri
    volumes:
      - './models:/models'
      - './rocm:/rocm'
    ports:
      - 3001:8000
    environment:
      MODEL: '/models/${MODEL_NAME:-code-llama-2-7b-chat.gguf}'
      MODEL_DOWNLOAD_URL: '${MODEL_DOWNLOAD_URL:-https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q4_K_M.gguf}'
      N_GQA: '${N_GQA:-1}'
      USE_MLOCK: 1
    cap_add:
      - IPC_LOCK
      - SYS_RESOURCE
    command: '/bin/sh /rocm/run.sh'

  llama-gpt-ui:
    # TODO: Use this image instead of building from source after the next release
    # image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
    build:
      context: ./ui
      dockerfile: Dockerfile
    ports:
      - 3000:3000
    restart: on-failure
    environment:
      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
      - 'OPENAI_API_HOST=http://llama-gpt-api-rocm-gguf:8000'
      - 'DEFAULT_MODEL=/models/${MODEL_NAME:-code-llama-2-7b-chat.gguf}'
      - 'NEXT_PUBLIC_DEFAULT_SYSTEM_PROMPT=${DEFAULT_SYSTEM_PROMPT:-"You are a helpful and friendly AI assistant. Respond very concisely."}'
      - 'WAIT_HOSTS=llama-gpt-api-rocm-gguf:8000'
      - 'WAIT_TIMEOUT=${WAIT_TIMEOUT:-3600}'
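
Both compose files read MODEL_NAME and MODEL_DOWNLOAD_URL from the environment, so a different GGUF build can be dropped in without editing the YAML. A sketch of the pattern (the Hugging Face URL here is illustrative, not something validated in this PR):

MODEL_NAME=llama-2-7b-chat.Q4_K_M.gguf \
MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf \
docker compose -f docker-compose-rocm-gguf.yml up --build
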
25 changes: 25 additions & 0 deletions rocm/ggml.Dockerfile
@@ -0,0 +1,25 @@
FROM rocm/dev-ubuntu-22.04

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y git build-essential \
    python3 python3-pip gcc wget \
    rocm-dev rocm-libs rocblas hipblas \
    && mkdir -p /etc/OpenCL/vendors && echo "libamdocl64.so" > /etc/OpenCL/vendors/amd.icd

COPY . .

# Set build-related environment variables
ENV ROCM_DOCKER_ARCH=all
ENV LLAMA_HIPBLAS=1

# Install dependencies
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

# Install llama-cpp-python 0.1.78, which has GGML support (built with ROCm)
RUN CMAKE_ARGS="-DLLAMA_HIPBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78

# Run the server
CMD python3 -m llama_cpp.server
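
For debugging outside compose, the image can be built and pointed at the GPU directly. This sketch uses a hypothetical tag and assumes rocminfo is available in the rocm/dev-ubuntu-22.04 base (it normally comes with the ROCm dev packages installed above):

docker build -f rocm/ggml.Dockerfile -t llama-gpt-api-rocm-ggml ./rocm
# List the agents ROCm can see from inside the container
docker run --rm --device=/dev/kfd --device=/dev/dri llama-gpt-api-rocm-ggml rocminfo
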
25 changes: 25 additions & 0 deletions rocm/gguf.Dockerfile
@@ -0,0 +1,25 @@
FROM rocm/dev-ubuntu-22.04

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y git build-essential \
    python3 python3-pip gcc wget \
    rocm-dev rocm-libs rocblas hipblas \
    && mkdir -p /etc/OpenCL/vendors && echo "libamdocl64.so" > /etc/OpenCL/vendors/amd.icd

COPY . .

# Set build-related environment variables
ENV ROCM_DOCKER_ARCH=all
ENV LLAMA_HIPBLAS=1

# Install dependencies
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

# Install llama-cpp-python 0.1.80, which has GGUF support (built with ROCm)
RUN CMAKE_ARGS="-DLLAMA_HIPBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.80

# Run the server
CMD python3 -m llama_cpp.server
63 changes: 63 additions & 0 deletions rocm/run.sh
@@ -0,0 +1,63 @@
#!/bin/bash

# Check if the MODEL environment variable is set
if [ -z "$MODEL" ]
then
  echo "Please set the MODEL environment variable"
  exit 1
fi

# Check if the MODEL_DOWNLOAD_URL environment variable is set
if [ -z "$MODEL_DOWNLOAD_URL" ]
then
  echo "Please set the MODEL_DOWNLOAD_URL environment variable"
  exit 1
fi

# Check if the model file exists
if [ ! -f "$MODEL" ]; then
  echo "Model file not found. Downloading..."
  # Check if curl is installed
  if ! [ -x "$(command -v curl)" ]; then
    echo "curl is not installed. Installing..."
    apt-get update --yes --quiet
    apt-get install --yes --quiet curl
  fi
  # Download the model file
  curl -L -o "$MODEL" "$MODEL_DOWNLOAD_URL"
  if [ $? -ne 0 ]; then
    echo "Download failed. Trying with TLS 1.2..."
    curl -L --tlsv1.2 -o "$MODEL" "$MODEL_DOWNLOAD_URL"
  fi
else
  echo "$MODEL model found."
fi

# Build the project
make build

# Get the number of available CPU threads
n_threads=$(grep -c ^processor /proc/cpuinfo)

# Define context window
n_ctx=4096

# Offload layers to GPU
n_gpu_layers=10

# Define batch size based on total RAM
total_ram=$(grep MemTotal /proc/meminfo | awk '{print $2}')
n_batch=2096
if [ "$total_ram" -lt 8000000 ]; then
  n_batch=1024
fi

# Display configuration information
echo "Initializing server with:"
echo "Batch size: $n_batch"
echo "Number of CPU threads: $n_threads"
echo "Number of GPU layers: $n_gpu_layers"
echo "Context window: $n_ctx"

# Run the server
exec python3 -m llama_cpp.server --n_ctx $n_ctx --n_threads $n_threads --n_gpu_layers $n_gpu_layers --n_batch $n_batch
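
Once this script has the server listening (container port 8000, published as 3001 by the compose files above), a single chat request exercises the full ROCm path. This assumes the OpenAI-compatible /v1/chat/completions route exposed by llama_cpp.server and that the model field may be omitted from the request body:

curl http://localhost:3001/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"messages": [{"role": "user", "content": "Say hello in one short sentence."}]}'
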
10 changes: 10 additions & 0 deletions run.sh
@@ -9,10 +9,12 @@ fi

# Parse command line arguments for model value and check for --with-cuda flag
with_cuda=0
with_rocm=0
while [[ "$#" -gt 0 ]]; do
  case $1 in
    --model) model="$2"; shift ;;
    --with-cuda) with_cuda=1 ;;
    --with-rocm) with_rocm=1 ;;
    *) echo "Unknown parameter passed: $1"; exit 1 ;;
  esac
  shift
@@ -97,6 +99,14 @@ then
  else
    docker compose -f docker-compose-cuda-gguf.yml up --build
  fi
elif [ "$with_rocm" -eq 1 ]
then
  if [ "$model_type" = "ggml" ]
  then
    docker compose -f docker-compose-rocm-ggml.yml up --build
  else
    docker compose -f docker-compose-rocm-gguf.yml up --build
  fi
else
  if [ "$model_type" = "ggml" ]
  then
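
With the new flag wired in above, the AMD stacks can be started through the same entry point as the CUDA ones; the 7b model tag here is assumed from the script's existing --model handling and may differ:

./run.sh --model 7b --with-rocm
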