llama : greatly reduce output buffer memory usage #6122

Merged Mar 26, 2024 (26 commits)

Commits (the file diffs further below show the changes from 14 of the 26 commits):
1fd1918 llama : greatly reduce logits memory usage (compilade, Mar 15, 2024)
98914c0 llama : more compact state saving and reloading (compilade, Mar 15, 2024)
705d393 llama : fix lctx.n_outputs not being set before building graph (compilade, Mar 16, 2024)
25981fc perplexity : adapt to the logits API changes (compilade, Mar 17, 2024)
17b45c9 perplexity : fix Winogrande, use correct logits for second choice start (compilade, Mar 17, 2024)
d0129e8 perplexity : normalize spaces and punctuation in Winogrande sentences (compilade, Mar 17, 2024)
487f89e llama : fix embedding conditions (compilade, Mar 17, 2024)
408fcb0 llama : fix llama_get_embeddings_ith when the resulting id is 0 (compilade, Mar 17, 2024)
e19cb3a llama : fix wrong n_outputs in llama_set_inputs (compilade, Mar 17, 2024)
a57fa7f llama : fix not-skipping outputs of non-causal models (compilade, Mar 18, 2024)
711b0bc llama : fix running a batch with n_outputs == 0 (compilade, Mar 18, 2024)
d100502 llama : keep same graph topology even when n_outputs == 0 (compilade, Mar 18, 2024)
99c37cc ggml : saner ggml_can_repeat with empty tensors (compilade, Mar 18, 2024)
6bf7f3f ggml : do not multi-thread ops returning empty tensors (compilade, Mar 18, 2024)
09bb15a ggml : make ggml_is_empty public and work with views (compilade, Mar 19, 2024)
4551e7e llama : use a vector for ctx->output_ids (compilade, Mar 19, 2024)
8b826c5 ggml : skip empty tensors in all backends (compilade, Mar 19, 2024)
d04cfaf llama : fix llama_output_reserve nullptr deref when new_size is 0 (compilade, Mar 19, 2024)
8f70dcb perplexity : make Winogrande work as it does on master (compilade, Mar 19, 2024)
615a3a4 llama : clearer error messages for invalid logits or embeddings ids (compilade, Mar 19, 2024)
7d8d6b5 llama : handle errors from llama_output_reserve at call sites (compilade, Mar 21, 2024)
5f33a67 perplexity : make hellaswag and multiple-choice outputs identical to … (compilade, Mar 21, 2024)
ffa9abd Merge branch 'master' into compilade/smaller-output-buffer (compilade, Mar 25, 2024)
e9095ac llama : allow loading state saved with a different ctx size (compilade, Mar 26, 2024)
5027d81 llama : minor (ggerganov, Mar 26, 2024)
20248e8 readme : update recent API changes, and warn about Vulkan (compilade, Mar 26, 2024)
Files changed
examples/imatrix/imatrix.cpp (1 addition, 0 deletions)

@@ -403,6 +403,7 @@ static bool compute_imatrix(llama_context * ctx, const gpt_params & params, bool
             tokens[batch_start] = llama_token_bos(llama_get_model(ctx));
         }
 
+        // TODO: use batch.logits to save computations instead of relying on logits_all == true
         if (llama_decode(ctx, llama_batch_get_one(tokens.data() + batch_start, batch_size, j * n_batch, 0))) {
            fprintf(stderr, "%s : failed to eval\n", __func__);
            return false;
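The TODO above is left unimplemented in this PR (imatrix still runs with logits_all). A hypothetical sketch of what it hints at — building an explicit batch and flagging per token whether its logits are needed — reusing batch_start, batch_size, j and n_batch from the hunk; the loop body is illustrative only, not proposed code:

llama_batch batch = llama_batch_init(batch_size, 0, 1);
batch.n_tokens = batch_size;
for (int k = 0; k < batch_size; ++k) {
    batch.token   [k] = tokens[batch_start + k];
    batch.pos     [k] = j * n_batch + k;
    batch.n_seq_id[k] = 1;
    batch.seq_id  [k][0] = 0;
    // a real implementation would set this to true only for positions whose logits are read
    batch.logits  [k] = true;
}

const bool decode_ok = llama_decode(ctx, batch) == 0;
llama_batch_free(batch);
if (!decode_ok) {
    fprintf(stderr, "%s : failed to eval\n", __func__);
    return false;
}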
examples/parallel/parallel.cpp (0 additions, 1 deletion)

@@ -132,7 +132,6 @@ int main(int argc, char ** argv) {
     llama_context * ctx = NULL;
 
     // load the target model
-    params.logits_all = true;
     std::tie(model, ctx) = llama_init_from_gpt_params(params);
 
     // load the prompts from an external file if there are any
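Dropping params.logits_all here (and in examples/speculative/speculative.cpp below) works because these examples already mark, per token, whether they need logits when they fill their batches; with this PR those flags alone decide which output rows are computed and stored. The marking goes through the common.h helper, roughly like this (identifiers are placeholders, not a quote from parallel.cpp):

// the final `true` requests logits for this token; tokens added with `false` get no output row
llama_batch_add(batch, new_token_id, pos, { seq_id }, /*logits =*/ true);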
examples/perplexity/perplexity.cpp (118 additions, 71 deletions)

(Large diff not rendered here.)

examples/server/server.cpp (2 additions, 1 deletion)

@@ -744,7 +744,8 @@ struct server_context {
         {
             const int32_t n_batch = llama_n_batch(ctx);
 
-            batch = llama_batch_init(n_batch, 0, params.n_parallel);
+            // only a single seq_id per token is needed
+            batch = llama_batch_init(n_batch, 0, 1);
         }
 
         metrics.init();
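For context, the third parameter of llama_batch_init is the maximum number of sequence ids that can be attached to a single token, and it sizes the per-token seq_id arrays that the batch allocates; since the server only ever attaches one sequence id per token, sizing it for params.n_parallel wasted memory. A sketch of the call — the signature is from llama.h, the comments are mine, not from the diff:

// struct llama_batch llama_batch_init(int32_t n_tokens, int32_t embd, int32_t n_seq_max);
// embd == 0 means the batch carries token ids rather than embeddings
llama_batch batch = llama_batch_init(n_batch, 0, /*n_seq_max =*/ 1);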
examples/speculative/speculative.cpp (0 additions, 1 deletion)

@@ -65,7 +65,6 @@ int main(int argc, char ** argv) {
     llama_context * ctx_dft = NULL;
 
     // load the target model
-    params.logits_all = true;
     std::tie(model_tgt, ctx_tgt) = llama_init_from_gpt_params(params);
 
     // load the draft model
ggml.c (14 additions, 2 deletions)

@@ -2542,6 +2542,12 @@ static inline bool ggml_is_padded_1d(const struct ggml_tensor * tensor) {
         tensor->nb[3] == tensor->nb[2]*tensor->ne[2];
 }
 
+static inline bool ggml_is_empty(const struct ggml_tensor * tensor) {
+    // the stride of the last dimension is zero if any of the previous dimensions has no elements
+    // but to be sure, the number of elements in the last dimension must also be checked
+    return tensor->nb[GGML_MAX_DIMS - 1] == 0 || tensor->ne[GGML_MAX_DIMS - 1] == 0;
+}
+
 bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
@@ -2556,7 +2562,7 @@ bool ggml_are_same_shape(const struct ggml_tensor * t0, const struct ggml_tensor
 static inline bool ggml_can_repeat(const struct ggml_tensor * t0, const struct ggml_tensor * t1) {
     static_assert(GGML_MAX_DIMS == 4, "GGML_MAX_DIMS is not 4 - update this function");
 
-    return
+    return ggml_is_empty(t0) ? ggml_is_empty(t1) :
         (t1->ne[0]%t0->ne[0] == 0) &&
         (t1->ne[1]%t0->ne[1] == 0) &&
         (t1->ne[2]%t0->ne[2] == 0) &&
@@ -16047,7 +16053,7 @@ static void ggml_compute_forward_cross_entropy_loss_back(
 static void ggml_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     GGML_ASSERT(params);
 
-    if (tensor->op == GGML_OP_NONE) {
+    if (tensor->op == GGML_OP_NONE || ggml_is_empty(tensor)) {
         return;
     }
 
@@ -17929,6 +17935,12 @@ static void ggml_graph_compute_perf_stats_node(struct ggml_tensor * node, const
 static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads, int n_cur_threads) {
     int n_tasks = 0;
 
+    if (ggml_is_empty(node)) {
+        // no need to multi-thread a no-op
+        n_tasks = 1;
+        return n_tasks;
+    }
+
     switch (node->op) {
         case GGML_OP_CPY:
         case GGML_OP_DUP:
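Together, these ggml.c hunks give zero-element ("empty") tensors a well-defined meaning: an empty tensor can only repeat into another empty tensor, and the CPU backend treats such nodes as no-ops both in ggml_compute_forward and in task scheduling. This is what lets llama.cpp keep the same graph topology even when a batch requests no outputs (n_outputs == 0). A small stand-alone illustration of what counts as empty, assuming only the public ggml.h context API (not part of the PR):

#include "ggml.h"
#include <cstdio>

int main() {
    ggml_init_params ip = { /*mem_size  =*/ 16u*1024u*1024u,
                            /*mem_buffer=*/ nullptr,
                            /*no_alloc  =*/ false };
    ggml_context * ctx = ggml_init(ip);

    // a "logits" tensor for a batch that requested zero outputs:
    // one dimension has no elements, so the strides that follow it collapse to 0 as well
    ggml_tensor * logits = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 32000, 0);

    printf("nelements = %lld\n", (long long) ggml_nelements(logits)); // prints 0

    // with the hunks above, the CPU backend treats such a node as a no-op:
    // ggml_compute_forward() returns early and ggml_get_n_tasks() reports a single task

    ggml_free(ctx);
    return 0;
}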