From e97edd6c022f9e6aae7f6214add12294df918721 Mon Sep 17 00:00:00 2001 From: Nexesenex <124105151+Nexesenex@users.noreply.github.com> Date: Thu, 20 Jun 2024 00:50:04 +0200 Subject: [PATCH] Add CMake flag for pipeline parallelism for multi-GPU LCPP Default is set to 4, which is a bit too much in my opinion. Setting it to 2 saves VRAM (0.5-1%?), some compute and some electricity, at the expense of a potential performance loss (prompt processing?) that I do not notice in usage. 2 is thus my own setting, and the value this patch uses as the CMake default. --- CMakeLists.txt | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 08fca8df0f7e6..5842b6dc786da 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -26,6 +26,7 @@ set(LLAMA_GPROF OFF) set(LLAMA_SANITIZE_THREAD OFF) set(LLAMA_SANITIZE_ADDRESS OFF) set(LLAMA_SANITIZE_UNDEFINED OFF) +set(LLAMA_SCHED_MAX_COPIES "2" CACHE STRING "llama: max input copies for pipeline parallelism") # instruction set specific option(LLAMA_AVX "llama: enable AVX" ON) @@ -64,6 +65,7 @@ set(THREADS_PREFER_PTHREAD_FLAG ON) find_package(Threads REQUIRED) add_compile_definitions(LOG_DISABLE_LOGS) +add_compile_definitions(GGML_SCHED_MAX_COPIES=${LLAMA_SCHED_MAX_COPIES}) file(GLOB GGML_SOURCES_CUDA "ggml-cuda/*.cu") list(APPEND GGML_SOURCES_CUDA "ggml-cuda.cu")