Don't double count the sample time #2107

Merged · 1 commit · Jul 5, 2023
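The underlying issue: llama_sample_softmax() starts its own timer and, when given a non-null ctx, adds its elapsed time to ctx->t_sample_us. A sampler that started its own timer before invoking it therefore attributed the softmax work to t_sample_us twice. The fix is to start the caller's timer only after the nested, self-timing call returns. Below is a minimal standalone sketch of the pattern, not the actual llama.cpp internals; names like timing_ctx and softmax_step are illustrative.

    // Minimal sketch of the double-counting bug being fixed. The callee
    // already attributes its own runtime to the shared counter, so a caller
    // that starts its timer before the call attributes that runtime twice.
    #include <chrono>
    #include <cstdint>
    #include <cstdio>
    #include <thread>

    static int64_t time_us() {
        using namespace std::chrono;
        return duration_cast<microseconds>(steady_clock::now().time_since_epoch()).count();
    }

    struct timing_ctx {
        int64_t t_sample_us = 0; // total time attributed to sampling
    };

    // Callee: times itself, like llama_sample_softmax(ctx, ...) does.
    static void softmax_step(timing_ctx * ctx) {
        const int64_t t0 = time_us();
        std::this_thread::sleep_for(std::chrono::milliseconds(10)); // stand-in for real work
        if (ctx) {
            ctx->t_sample_us += time_us() - t0;
        }
    }

    // Before the fix: the timer starts before the callee, so the callee's
    // ~10 ms is added here a second time (~20 ms recorded for 10 ms of work).
    static void sampler_buggy(timing_ctx * ctx) {
        const int64_t t0 = time_us();
        softmax_step(ctx);
        ctx->t_sample_us += time_us() - t0;
    }

    // After the fix: the timer starts after the callee returns, so only the
    // caller's own work (none in this sketch) is added; total stays ~10 ms.
    static void sampler_fixed(timing_ctx * ctx) {
        softmax_step(ctx);
        const int64_t t0 = time_us();
        ctx->t_sample_us += time_us() - t0;
    }

    int main() {
        timing_ctx a, b;
        sampler_buggy(&a);
        sampler_fixed(&b);
        printf("buggy: %lld us, fixed: %lld us\n",
               (long long) a.t_sample_us, (long long) b.t_sample_us);
    }

On the buggy path the recorded total comes out near double the real work; on the fixed path each interval is attributed exactly once.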
llama.cpp (20 changes: 9 additions & 11 deletions)
@@ -1896,10 +1896,10 @@ void llama_sample_top_p(struct llama_context * ctx, llama_token_data_array * can
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(ctx, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     // Compute the cumulative probabilities
     float cum_sum = 0.0f;
     size_t last_idx = candidates->size;
@@ -1928,9 +1928,8 @@ void llama_sample_tail_free(struct llama_context * ctx, llama_token_data_array *
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     llama_sample_softmax(nullptr, candidates);
+    const int64_t t_start_sample_us = ggml_time_us();
 
     // Compute the first and second derivatives
     std::vector<float> first_derivatives(candidates->size - 1);
@@ -1982,11 +1981,11 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
         return;
     }
 
-    const int64_t t_start_sample_us = ggml_time_us();
-
     // Compute the softmax of logits and calculate entropy
     llama_sample_softmax(nullptr, candidates);
 
+    const int64_t t_start_sample_us = ggml_time_us();
+
     float entropy = 0.0f;
     for (size_t i = 0; i < candidates->size; ++i) {
         entropy += -candidates->data[i].p * logf(candidates->data[i].p);
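The three hunks above are the same one-line move in llama_sample_top_p, llama_sample_tail_free, and llama_sample_typical. One subtlety worth noting: the latter two pass nullptr to llama_sample_softmax, so their softmax calls never accumulated time into ctx in the first place; for them the move means the timer now consistently covers only the sampler's own work rather than sometimes including the softmax pass.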
@@ -2155,13 +2154,11 @@ llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-        ctx->n_sample++;
     }
     return X;
 }
 
 llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, float * mu) {
-    assert(ctx);
     int64_t t_start_sample_us;
     t_start_sample_us = ggml_time_us();
 
@@ -2176,13 +2173,14 @@ llama_token llama_sample_token_mirostat_v2(struct llama_context * ctx, llama_tok
         candidates->size = 1;
     }
 
-    if (ctx) {
-        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    }
 
     // Normalize the probabilities of the remaining words
     llama_sample_softmax(ctx, candidates);
 
     // Sample the next word X from the remaining words
+    if (ctx) {
+        ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
+    }
+
     llama_token X = llama_sample_token(ctx, candidates);
     t_start_sample_us = ggml_time_us();
 
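The two mirostat hunks apply the same no-double-counting rule to the remaining counters: llama_sample_token() already increments ctx->n_sample and times itself, so the extra ctx->n_sample++ in llama_sample_token_mirostat counted each sampled token twice, and in llama_sample_token_mirostat_v2 the t_sample_us accumulation is moved so that it no longer brackets the self-timing llama_sample_softmax call. Dropping assert(ctx) there matches the null-tolerant if (ctx) guards the function already uses.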