feat: add amd support #114

Open · wants to merge 2 commits into master
41 changes: 41 additions & 0 deletions docker-compose-rocm-ggml.yml
@@ -0,0 +1,41 @@
version: '3.6'

services:
  llama-gpt-api-rocm-ggml:
    build:
      context: ./rocm
      dockerfile: ggml.Dockerfile
    restart: on-failure
    devices:
      - /dev/dri:/dev/dri
    volumes:
      - './models:/models'
      - './rocm:/rocm'
    ports:
      - 3001:8000
    environment:
      MODEL: '/models/${MODEL_NAME:-llama-2-7b-chat.bin}'
      MODEL_DOWNLOAD_URL: '${MODEL_DOWNLOAD_URL:-https://huggingface.co/TheBloke/Nous-Hermes-Llama-2-7B-GGML/resolve/main/nous-hermes-llama-2-7b.ggmlv3.q4_0.bin}'
      N_GQA: '${N_GQA:-1}'
      USE_MLOCK: 1
    cap_add:
      - IPC_LOCK
      - SYS_RESOURCE
    command: '/bin/sh /rocm/run.sh'

  llama-gpt-ui:
    # TODO: Use this image instead of building from source after the next release
    # image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
    build:
      context: ./ui
      dockerfile: Dockerfile
    ports:
      - 3000:3000
    restart: on-failure
    environment:
      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
      - 'OPENAI_API_HOST=http://llama-gpt-api-rocm-ggml:8000'
      - 'DEFAULT_MODEL=/models/${MODEL_NAME:-llama-2-7b-chat.bin}'
      - 'NEXT_PUBLIC_DEFAULT_SYSTEM_PROMPT=${DEFAULT_SYSTEM_PROMPT:-"You are a helpful and friendly AI assistant. Respond very concisely."}'
      - 'WAIT_HOSTS=llama-gpt-api-rocm-ggml:8000'
      - 'WAIT_TIMEOUT=${WAIT_TIMEOUT:-3600}'
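
A minimal way to exercise this compose file, assuming it is run from the repository root and that the OpenAI-compatible /v1/models route served by llama_cpp.server is reachable on the published port 3001:

docker compose -f docker-compose-rocm-ggml.yml up --build
# In another shell, once the model has downloaded and the server is listening:
curl http://localhost:3001/v1/models
# Note: ROCm compute usually needs /dev/kfd in addition to /dev/dri inside the
# container; if the API container cannot see the GPU, mapping /dev/kfd alongside
# /dev/dri in the devices list above may be required.
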
41 changes: 41 additions & 0 deletions docker-compose-rocm-gguf.yml
@@ -0,0 +1,41 @@
version: '3.6'

services:
  llama-gpt-api-rocm-gguf:
    build:
      context: ./rocm
      dockerfile: gguf.Dockerfile
    restart: on-failure
    devices:
      - /dev/dri:/dev/dri
    volumes:
      - './models:/models'
      - './rocm:/rocm'
    ports:
      - 3001:8000
    environment:
      MODEL: '/models/${MODEL_NAME:-code-llama-2-7b-chat.gguf}'
      MODEL_DOWNLOAD_URL: '${MODEL_DOWNLOAD_URL:-https://huggingface.co/TheBloke/CodeLlama-7B-Instruct-GGUF/resolve/main/codellama-7b-instruct.Q4_K_M.gguf}'
      N_GQA: '${N_GQA:-1}'
      USE_MLOCK: 1
    cap_add:
      - IPC_LOCK
      - SYS_RESOURCE
    command: '/bin/sh /rocm/run.sh'

  llama-gpt-ui:
    # TODO: Use this image instead of building from source after the next release
    # image: 'ghcr.io/getumbrel/llama-gpt-ui:latest'
    build:
      context: ./ui
      dockerfile: Dockerfile
    ports:
      - 3000:3000
    restart: on-failure
    environment:
      - 'OPENAI_API_KEY=sk-XXXXXXXXXXXXXXXXXXXX'
      - 'OPENAI_API_HOST=http://llama-gpt-api-rocm-gguf:8000'
      - 'DEFAULT_MODEL=/models/${MODEL_NAME:-code-llama-2-7b-chat.gguf}'
      - 'NEXT_PUBLIC_DEFAULT_SYSTEM_PROMPT=${DEFAULT_SYSTEM_PROMPT:-"You are a helpful and friendly AI assistant. Respond very concisely."}'
      - 'WAIT_HOSTS=llama-gpt-api-rocm-gguf:8000'
      - 'WAIT_TIMEOUT=${WAIT_TIMEOUT:-3600}'
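
Both compose files read MODEL_NAME and MODEL_DOWNLOAD_URL from the environment, so a different GGUF build can be dropped in without editing the YAML. A sketch of the pattern (the Hugging Face URL here is illustrative, not something validated in this PR):

MODEL_NAME=llama-2-7b-chat.Q4_K_M.gguf \
MODEL_DOWNLOAD_URL=https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q4_K_M.gguf \
docker compose -f docker-compose-rocm-gguf.yml up --build
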
25 changes: 25 additions & 0 deletions rocm/ggml.Dockerfile
@@ -0,0 +1,25 @@
FROM rocm/dev-ubuntu-22.04

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y git build-essential \
    python3 python3-pip gcc wget \
    rocm-dev rocm-libs rocblas hipblas \
    && mkdir -p /etc/OpenCL/vendors && echo "libamdocl64.so" > /etc/OpenCL/vendors/amd.icd

COPY . .

# Set build-related environment variables
ENV ROCM_DOCKER_ARCH=all
ENV LLAMA_HIPBLAS=1

# Install dependencies
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

# Install llama-cpp-python 0.1.78, which has GGML support (built with ROCm)
RUN CMAKE_ARGS="-DLLAMA_HIPBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.78

# Run the server
CMD python3 -m llama_cpp.server
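
For debugging outside compose, the image can be built and pointed at the GPU directly. This sketch uses a hypothetical tag and assumes rocminfo is available in the rocm/dev-ubuntu-22.04 base (it normally comes with the ROCm dev packages installed above):

docker build -f rocm/ggml.Dockerfile -t llama-gpt-api-rocm-ggml ./rocm
# List the agents ROCm can see from inside the container
docker run --rm --device=/dev/kfd --device=/dev/dri llama-gpt-api-rocm-ggml rocminfo
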
25 changes: 25 additions & 0 deletions rocm/gguf.Dockerfile
@@ -0,0 +1,25 @@
FROM rocm/dev-ubuntu-22.04

# We need to set the host to 0.0.0.0 to allow outside access
ENV HOST 0.0.0.0

RUN apt-get update && apt-get upgrade -y \
    && apt-get install -y git build-essential \
    python3 python3-pip gcc wget \
    rocm-dev rocm-libs rocblas hipblas \
    && mkdir -p /etc/OpenCL/vendors && echo "libamdocl64.so" > /etc/OpenCL/vendors/amd.icd

COPY . .

# Set build-related environment variables
ENV ROCM_DOCKER_ARCH=all
ENV LLAMA_HIPBLAS=1

# Install dependencies
RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette pydantic-settings

# Install llama-cpp-python 0.1.80, which has GGUF support (built with ROCm)
RUN CMAKE_ARGS="-DLLAMA_HIPBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.80

# Run the server
CMD python3 -m llama_cpp.server
63 changes: 63 additions & 0 deletions rocm/run.sh
@@ -0,0 +1,63 @@
#!/bin/bash

# Check if the MODEL environment variable is set
if [ -z "$MODEL" ]
then
  echo "Please set the MODEL environment variable"
  exit 1
fi

# Check if the MODEL_DOWNLOAD_URL environment variable is set
if [ -z "$MODEL_DOWNLOAD_URL" ]
then
  echo "Please set the MODEL_DOWNLOAD_URL environment variable"
  exit 1
fi

# Check if the model file exists
if [ ! -f "$MODEL" ]; then
  echo "Model file not found. Downloading..."
  # Check if curl is installed
  if ! [ -x "$(command -v curl)" ]; then
    echo "curl is not installed. Installing..."
    apt-get update --yes --quiet
    apt-get install --yes --quiet curl
  fi
  # Download the model file
  curl -L -o "$MODEL" "$MODEL_DOWNLOAD_URL"
  if [ $? -ne 0 ]; then
    echo "Download failed. Trying with TLS 1.2..."
    curl -L --tlsv1.2 -o "$MODEL" "$MODEL_DOWNLOAD_URL"
  fi
else
  echo "$MODEL model found."
fi

# Build the project
make build

# Get the number of available CPU threads
n_threads=$(grep -c ^processor /proc/cpuinfo)

# Define context window
n_ctx=4096

# Offload layers to GPU
n_gpu_layers=10

# Define batch size based on total RAM
total_ram=$(grep MemTotal /proc/meminfo | awk '{print $2}')
n_batch=2096
if [ "$total_ram" -lt 8000000 ]; then
  n_batch=1024
fi

# Display configuration information
echo "Initializing server with:"
echo "Batch size: $n_batch"
echo "Number of CPU threads: $n_threads"
echo "Number of GPU layers: $n_gpu_layers"
echo "Context window: $n_ctx"

# Run the server
exec python3 -m llama_cpp.server --n_ctx $n_ctx --n_threads $n_threads --n_gpu_layers $n_gpu_layers --n_batch $n_batch
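
Once this script has the server listening (container port 8000, published as 3001 by the compose files above), a single chat request exercises the full ROCm path. This assumes the OpenAI-compatible /v1/chat/completions route exposed by llama_cpp.server and that the model field may be omitted from the request body:

curl http://localhost:3001/v1/chat/completions \
  -H 'Content-Type: application/json' \
  -d '{"messages": [{"role": "user", "content": "Say hello in one short sentence."}]}'
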
10 changes: 10 additions & 0 deletions run.sh
@@ -9,10 +9,12 @@ fi

# Parse command line arguments for model value and check for --with-cuda flag
with_cuda=0
with_rocm=0
while [[ "$#" -gt 0 ]]; do
  case $1 in
    --model) model="$2"; shift ;;
    --with-cuda) with_cuda=1 ;;
    --with-rocm) with_rocm=1 ;;
    *) echo "Unknown parameter passed: $1"; exit 1 ;;
  esac
  shift
@@ -97,6 +99,14 @@ then
  else
    docker compose -f docker-compose-cuda-gguf.yml up --build
  fi
elif [ "$with_rocm" -eq 1 ]
then
  if [ "$model_type" = "ggml" ]
  then
    docker compose -f docker-compose-rocm-ggml.yml up --build
  else
    docker compose -f docker-compose-rocm-gguf.yml up --build
  fi
else
  if [ "$model_type" = "ggml" ]
  then
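
With the new flag wired in above, the AMD stacks can be started through the same entry point as the CUDA ones; the 7b model tag here is assumed from the script's existing --model handling and may differ:

./run.sh --model 7b --with-rocm
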