demo : per-layer KV / partial offloading of KV cache #3457

Closed
wants to merge 3 commits into from
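
For orientation: this demo replaces the two monolithic KV tensors (cache.k, cache.v) with one K tensor and one V tensor per layer, so each layer's cache can be placed on CPU or GPU independently. Below is a minimal standalone sketch of that data-structure change; the Tensor struct and all numbers are illustrative placeholders, not the actual ggml types or model sizes.

#include <cstdio>
#include <vector>

// Placeholder for ggml_tensor: only the element count matters for this sketch.
struct Tensor {
    size_t n_elements;
};

int main() {
    const size_t n_embd  = 4096;   // illustrative model width
    const size_t n_ctx   = 2048;   // illustrative context length
    const size_t n_layer = 32;

    // Before: one K tensor and one V tensor spanning all layers.
    Tensor k_all = { n_embd * n_ctx * n_layer };
    Tensor v_all = { n_embd * n_ctx * n_layer };

    // After: one K and one V tensor per layer; same total size,
    // but each layer can now be placed on CPU or GPU independently.
    std::vector<Tensor> k_l, v_l;
    for (size_t il = 0; il < n_layer; ++il) {
        k_l.push_back({ n_embd * n_ctx });
        v_l.push_back({ n_embd * n_ctx });
    }

    printf("before: 2 tensors of %zu elements\n", k_all.n_elements);
    printf("after : %zu tensors of %zu elements each\n", k_l.size() + v_l.size(), n_embd * n_ctx);
    (void) v_all;
}
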
llama.cpp: 144 changes (97 additions, 47 deletions)
@@ -1,3 +1,7 @@
// TODO: move to context params
bool offload_k = true;
bool offload_v = true;

#define LLAMA_API_INTERNAL
#include "llama.h"

@@ -1036,6 +1040,9 @@ struct llama_kv_cache {
struct ggml_tensor * k = NULL;
struct ggml_tensor * v = NULL;

std::vector<struct ggml_tensor *> k_l; // per layer
std::vector<struct ggml_tensor *> v_l;

struct ggml_context * ctx = NULL;

llama_buffer buf;
@@ -1239,6 +1246,7 @@ static bool llama_kv_cache_init(
cache.cells.clear();
cache.cells.resize(n_ctx);


cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);

struct ggml_init_params params;
@@ -1248,34 +1256,44 @@

cache.ctx = ggml_init(params);

size_t vram_kv_cache = 0;

if (!cache.ctx) {
LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
return false;
}

cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
ggml_set_name(cache.k, "cache_k");
ggml_set_name(cache.v, "cache_v");
cache.k_l.reserve(n_layer);
cache.v_l.reserve(n_layer);

(void) n_gpu_layers;
#ifdef GGML_USE_CUBLAS
size_t vram_kv_cache = 0;
const int i_gpu_start = n_layer - n_gpu_layers;

if (n_gpu_layers > (int)n_layer + 1) {
ggml_cuda_assign_buffers_no_scratch(cache.v);
LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
vram_kv_cache += ggml_nbytes(cache.v);
for (uint32_t i = 0; i < n_layer; i++) {
ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, wtype, n_embd*n_ctx);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
#ifdef GGML_USE_CUBLAS
if ((int)i >= i_gpu_start) {
if (offload_k) {
ggml_cuda_assign_buffers_no_scratch(k);
vram_kv_cache += ggml_nbytes(k);
}
if (offload_v) {
ggml_cuda_assign_buffers_no_scratch(v);
vram_kv_cache += ggml_nbytes(v);
}
}
if (n_gpu_layers > (int)n_layer + 2) {
ggml_cuda_assign_buffers_no_scratch(cache.k);
LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
vram_kv_cache += ggml_nbytes(cache.k);
#endif // GGML_USE_CUBLAS
}

if (vram_kv_cache > 0) {
LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
}
#endif // GGML_USE_CUBLAS

(void) n_gpu_layers;

return true;
}
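
Note on the offload rule in llama_kv_cache_init: the old code offloaded the whole V cache only when n_gpu_layers > n_layer + 1 and the whole K cache only when n_gpu_layers > n_layer + 2, so partial offload was impossible. The new code offloads layer i whenever i >= n_layer - n_gpu_layers, gated by offload_k and offload_v. A small comparison sketch, with illustrative layer counts (not taken from the PR):

#include <cstdio>

// Old rule: whole-cache offload only when every layer (plus extras) fits on the GPU.
static bool old_offload_v(int n_gpu_layers, int n_layer) { return n_gpu_layers > n_layer + 1; }
static bool old_offload_k(int n_gpu_layers, int n_layer) { return n_gpu_layers > n_layer + 2; }

// New rule: each layer decides for itself, gated by the offload_k / offload_v flags.
static bool new_offload_layer(int il, int n_gpu_layers, int n_layer, bool flag) {
    const int i_gpu_start = n_layer - n_gpu_layers;
    return flag && il >= i_gpu_start;
}

int main() {
    const int n_layer = 32, n_gpu_layers = 20;   // a partial-offload setup

    printf("old: offload V = %d, offload K = %d\n",
           old_offload_v(n_gpu_layers, n_layer),   // 0: nothing is offloaded
           old_offload_k(n_gpu_layers, n_layer));

    int n_offloaded = 0;
    for (int il = 0; il < n_layer; ++il) {
        n_offloaded += new_offload_layer(il, n_gpu_layers, n_layer, /*offload_k=*/true);
    }
    printf("new: K cache offloaded for %d of %d layers\n", n_offloaded, n_layer); // 20 of 32
}

The practical effect is that any n_gpu_layers between 1 and n_layer now puts exactly that many layers' K/V in VRAM, instead of none.
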
@@ -2634,17 +2652,17 @@ static struct ggml_cgraph * llm_build_llama(
// offload functions set the tensor output backend to GPU
// tensors are GPU-accelerated if any input or the output has been offloaded
offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
offload_func_t offload_func_kq = llama_nop;
offload_func_t offload_func_v = llama_nop;
offload_func_t offload_func_kq = llama_nop;

#ifdef GGML_USE_CUBLAS
if (n_gpu_layers > n_layer) {
offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
}
if (n_gpu_layers > n_layer + 1) {
if (n_gpu_layers > 0 && offload_v) {
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
}
if (n_gpu_layers > n_layer + 2) {
if (n_gpu_layers > 0 && offload_k) {
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
}
#endif // GGML_USE_CUBLAS
@@ -2659,7 +2677,6 @@ static struct ggml_cgraph * llm_build_llama(

// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
offload_func_kq(KQ_mask);
ggml_set_name(KQ_mask, "KQ_mask");
ggml_allocr_alloc(lctx.alloc, KQ_mask);
if (!ggml_allocr_is_measure(lctx.alloc)) {
@@ -2680,9 +2697,12 @@
}
}

struct ggml_tensor * KQ_mask_gpu = ggml_view_tensor(ctx0, KQ_mask);
offload_func_kq(KQ_mask_gpu);
ggml_set_name(KQ_mask_gpu, "KQ_mask_gpu");

// KQ_pos - contains the positions
struct ggml_tensor * KQ_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
offload_func_kq(KQ_pos);
ggml_set_name(KQ_pos, "KQ_pos");
ggml_allocr_alloc(lctx.alloc, KQ_pos);
if (!ggml_allocr_is_measure(lctx.alloc)) {
@@ -2692,6 +2712,10 @@
}
}

struct ggml_tensor * KQ_pos_gpu = ggml_view_tensor(ctx0, KQ_pos);
offload_func_kq(KQ_pos_gpu);
ggml_set_name(KQ_pos_gpu, "KQ_pos_gpu");

// shift the entire K-cache if needed
if (do_rope_shift) {
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_ctx);
@@ -2708,13 +2732,15 @@
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * tmp =
ggml_rope_custom_inplace(ctx0,
ggml_view_3d(ctx0, kv_self.k,
ggml_view_3d(ctx0, kv_self.k_l[il],
n_embd_head, n_head_kv, n_ctx,
ggml_element_size(kv_self.k)*n_embd_head,
ggml_element_size(kv_self.k)*n_embd_gqa,
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il),
ggml_element_size(kv_self.k_l[il])*n_embd_head,
ggml_element_size(kv_self.k_l[il])*n_embd_gqa,
0),
K_shift, n_embd_head, 0, 0, freq_base, freq_scale);
offload_func_kq(tmp);
if (il >= i_gpu_start) {
offload_func_kq(tmp);
}
ggml_build_forward_expand(gf, tmp);
}
}
@@ -2723,10 +2749,23 @@
ggml_format_name(inpL, "layer_inp_%d", il);

offload_func_t offload_func = llama_nop;
offload_func_v = llama_nop;
offload_func_kq = llama_nop;

struct ggml_tensor * KQ_mask_l = KQ_mask;
struct ggml_tensor * KQ_pos_l = KQ_pos;

#ifdef GGML_USE_CUBLAS
if (il >= i_gpu_start) {
offload_func = ggml_cuda_assign_buffers_no_alloc;
if (offload_k) {
KQ_mask_l = KQ_mask_gpu;
KQ_pos_l = KQ_pos_gpu;
offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
}
if (offload_v) {
offload_func_v = ggml_cuda_assign_buffers_no_alloc;
}
}
#endif // GGML_USE_CUBLAS

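Since some layers may now stay on the CPU, the small graph inputs KQ_mask and KQ_pos exist in two flavours: the host tensor and a ggml_view_tensor copy tagged for offload (KQ_mask_gpu, KQ_pos_gpu). Each layer then picks the variant matching its placement; KQ_pos works the same way as the mask. A minimal sketch of that per-layer selection, using plain pointers instead of ggml views, with illustrative numbers:

#include <cstdio>

struct Tensor { const char * name; };

int main() {
    const int  n_layer      = 32;
    const int  n_gpu_layers = 20;
    const bool offload_k    = true;
    const int  i_gpu_start  = n_layer - n_gpu_layers;

    Tensor KQ_mask     = { "KQ_mask"     };  // host copy, used by CPU layers
    Tensor KQ_mask_gpu = { "KQ_mask_gpu" };  // offloaded view, used by GPU layers

    for (int il = 0; il < n_layer; ++il) {
        // Same selection the graph build does per layer.
        const Tensor * KQ_mask_l = (offload_k && il >= i_gpu_start) ? &KQ_mask_gpu : &KQ_mask;
        if (il == 0 || il == i_gpu_start) {
            printf("layer %2d uses %s\n", il, KQ_mask_l->name);
        }
    }
}
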
@@ -2755,13 +2794,13 @@
offload_func_kq(tmpq);
ggml_set_name(tmpq, "tmpq");

struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
struct ggml_tensor * Kcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), KQ_pos_l, n_embd_head, 0, 0, freq_base, freq_scale);
offload_func_kq(Kcur);
ggml_set_name(Kcur, "Kcur");
ggml_format_name(Kcur, "Kcur%d", il);

struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos, n_embd_head, 0, 0, freq_base, freq_scale);
struct ggml_tensor * Qcur = ggml_rope_custom(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), KQ_pos_l, n_embd_head, 0, 0, freq_base, freq_scale);
offload_func_kq(Qcur);
ggml_set_name(Qcur, "Qcur");
ggml_format_name(Qcur, "Qcur%d", il);

// store key and value to memory
{
@@ -2775,13 +2814,13 @@
offload_func_v(Vcur);
ggml_set_name(Vcur, "Vcur");

struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + kv_head));
struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k_l[il], n_tokens*n_embd_gqa, (ggml_element_size(kv_self.k_l[il])*n_embd_gqa)*(kv_head));
offload_func_kq(k);
ggml_set_name(k, "k");

struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, n_tokens, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v),
(il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + kv_head*ggml_element_size(kv_self.v));
struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v_l[il], n_tokens, n_embd_gqa,
( n_ctx)*ggml_element_size(kv_self.v_l[il]),
kv_head*ggml_element_size(kv_self.v_l[il]));
offload_func_v(v);
ggml_set_name(v, "v");

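The cache views also change shape: with a single shared tensor, the write offset for layer il had to include the layer term (il*n_ctx + kv_head), while with per-layer tensors only kv_head remains. A worked byte-offset comparison with illustrative sizes (f16 elements assumed):

#include <cstddef>
#include <cstdio>

int main() {
    const size_t elt        = 2;     // bytes per f16 element
    const size_t n_embd_gqa = 4096;  // illustrative
    const size_t n_ctx      = 2048;
    const size_t il         = 5;     // layer index
    const size_t kv_head    = 100;   // first free cell in the cache

    // Old layout: one K tensor for all layers, so the view into layer il
    // starts at (il*n_ctx + kv_head) rows into the shared buffer.
    const size_t off_old = elt * n_embd_gqa * (il * n_ctx + kv_head);

    // New layout: kv_self.k_l[il] holds only layer il, so the view starts
    // at kv_head rows into that layer's own buffer.
    const size_t off_new = elt * n_embd_gqa * kv_head;

    printf("old offset: %zu bytes\n", off_old);  // layer term included
    printf("new offset: %zu bytes\n", off_new);  // layer term gone
}

The same simplification shows up in the K and V read views further down, whose last offset argument drops from a per-layer term to 0.
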
@@ -2795,11 +2834,11 @@
ggml_set_name(Q, "Q");

struct ggml_tensor * K =
ggml_view_3d(ctx0, kv_self.k,
ggml_view_3d(ctx0, kv_self.k_l[il],
n_embd_head, n_kv, n_head_kv,
ggml_element_size(kv_self.k)*n_embd_gqa,
ggml_element_size(kv_self.k)*n_embd_head,
ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
ggml_element_size(kv_self.k_l[il])*n_embd_gqa,
ggml_element_size(kv_self.k_l[il])*n_embd_head,
0);
offload_func_kq(K);
ggml_set_name(K, "K");

@@ -2815,9 +2854,9 @@
ggml_set_name(KQ_scaled, "KQ_scaled");

// KQ_masked = mask_past(KQ_scaled)
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask_l);
offload_func_kq(KQ_masked);
ggml_set_name(KQ_masked, "KQ_masked");
ggml_format_name(KQ_masked, "KQ_masked%d", il);

// KQ = soft_max(KQ_masked)
struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
@@ -2826,11 +2865,11 @@

// split cached V into n_head heads
struct ggml_tensor * V =
ggml_view_3d(ctx0, kv_self.v,
ggml_view_3d(ctx0, kv_self.v_l[il],
n_kv, n_embd_head, n_head_kv,
ggml_element_size(kv_self.v)*n_ctx,
ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
ggml_element_size(kv_self.v_l[il])*n_ctx,
ggml_element_size(kv_self.v_l[il])*n_ctx*n_embd_head,
0);
offload_func_v(V);
ggml_set_name(V, "V");

@@ -6872,7 +6911,14 @@ struct llama_context * llama_new_context_with_model(
}

{
const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
// const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
size_t memory_size = 0;
for (auto & k : ctx->kv_self.k_l) {
memory_size += ggml_nbytes(k);
}
for (auto & v : ctx->kv_self.v_l) {
memory_size += ggml_nbytes(v);
}
LLAMA_LOG_INFO("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}

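The kv self size log line now sums ggml_nbytes over the per-layer tensors; the total is the same as with the old single-tensor layout. A quick back-of-the-envelope check with illustrative 7B-style numbers and an f16 cache:

#include <cstdio>

int main() {
    const double elt        = 2.0;    // bytes per f16 element
    const double n_embd_gqa = 4096;   // illustrative: no GQA, so n_embd_gqa == n_embd
    const double n_ctx      = 2048;
    const double n_layer    = 32;

    // Per layer: one K tensor and one V tensor of n_embd_gqa * n_ctx elements each.
    const double per_layer  = 2.0 * n_embd_gqa * n_ctx * elt;
    const double total      = per_layer * n_layer;

    printf("kv self size = %.2f MB\n", total / 1024.0 / 1024.0);  // 1024.00 MB
}
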
@@ -6946,8 +6992,12 @@ struct llama_context * llama_new_context_with_model(
}

size_t kv_vram_size = 0;
add_tensor(ctx->kv_self.k, kv_vram_size);
add_tensor(ctx->kv_self.v, kv_vram_size);
for (auto & k : ctx->kv_self.k_l) {
add_tensor(k, kv_vram_size);
}
for (auto & v : ctx->kv_self.v_l) {
add_tensor(v, kv_vram_size);
}

size_t ctx_vram_size = alloc_size + kv_vram_size;
size_t total_vram_size = model_vram_size + ctx_vram_size;