
feat: upgrade llama.cpp #645

Merged 5 commits on Oct 27, 2023
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,8 +1,12 @@
# v0.5.0 [Unreleased]

## Notice
* llama.cpp backend (CPU, Metal) now requires a re-download of the GGUF model due to upstream format changes: https://github.com/TabbyML/tabby/pull/645, https://github.com/ggerganov/llama.cpp/pull/3252

## Features

## Fixes and Improvements

* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
* add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637

2 changes: 1 addition & 1 deletion crates/llama-cpp-bindings/llama.cpp
Submodule llama.cpp updated 139 files
18 changes: 3 additions & 15 deletions crates/llama-cpp-bindings/src/engine.cc
@@ -21,7 +21,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
model_(std::move(model)),
ctx_(std::move(ctx)) {
batch_ = llama_batch_init(N_BATCH, 0);
Member Author

The previous use of the batch API in Tabby triggers a segmentation fault with the updated llama.cpp version. As a workaround, roll back to `llama_batch_get_one` (see the sketch below); we will revisit this when integrating continuous batching support.
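To make the workaround concrete, here is a minimal sketch of the `llama_batch_get_one` decode path. It is not the exact Tabby engine code: the function name `decode_and_sample` and the `tokens`, `n_tokens`, and `n_past` parameters are illustrative placeholders, while the llama.cpp calls themselves are the ones used in this diff.

```cpp
// Illustrative sketch only, not the exact Tabby engine code: decode a chunk of
// prompt tokens with llama_batch_get_one instead of a manually filled
// llama_batch. The function name and parameter names are placeholders; ctx is
// assumed to be a valid llama_context*.
#include <stdexcept>

#include "llama.h"

llama_token decode_and_sample(llama_context* ctx, llama_token* tokens,
                              int32_t n_tokens, int32_t n_past) {
  // Drop stale KV-cache entries from position n_past onward before re-decoding.
  llama_kv_cache_tokens_rm(ctx, n_past, -1);

  // llama_batch_get_one builds a single-sequence batch; with the default
  // context settings only the last token's logits are computed.
  if (llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, /*seq_id=*/0))) {
    throw std::runtime_error("Failed to eval");
  }

  // The last token's logits therefore sit at output index 0, which is why the
  // sampling code in this file switches from `batch_.n_tokens - 1` to `0`.
  const float* logits = llama_get_logits_ith(ctx, 0);
  const int n_vocab = llama_n_vocab(llama_get_model(ctx));

  // Greedy sampling: pick the token id with the highest logit.
  llama_token best = 0;
  for (int i = 1; i < n_vocab; ++i) {
    if (logits[i] > logits[best]) best = i;
  }
  return best;
}
```

Requesting logits only for the last token keeps the greedy sampler unchanged while avoiding the manual `llama_batch` bookkeeping that triggered the crash.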

}

void start(rust::Slice<const uint32_t> input_token_ids) override {
@@ -46,14 +45,14 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
}

uint32_t eos_token() const override {
return llama_token_eos(ctx_.get());
return llama_token_eos(llama_get_model(ctx_.get()));
}

private:
uint32_t sample() const {
auto* ctx = ctx_.get();

auto logits = llama_get_logits_ith(ctx, batch_.n_tokens - 1);
auto logits = llama_get_logits_ith(ctx, 0);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));

// Greedy sampling (always select the highest logit).
@@ -65,18 +64,9 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
n_past_ = 0;
}

batch_.n_tokens = size;
for (size_t i = 0; i < size; ++i) {
batch_.token[i] = data[i];
batch_.pos[i] = n_past_ + i;
batch_.seq_id[i] = 0;
batch_.logits[i] = false;
}
batch_.logits[size - 1] = true;

auto* ctx = ctx_.get();
llama_kv_cache_tokens_rm(ctx, n_past_, -1);
if (llama_decode(ctx, batch_)) {
if (llama_decode(ctx, llama_batch_get_one(data, size, n_past_, 0))) {
throw std::runtime_error("Failed to eval");
}

@@ -86,8 +76,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
size_t n_past_;
owned<llama_model> model_;
owned<llama_context> ctx_;

llama_batch batch_;
};

static int g_llama_cpp_log_level = 0;
4 changes: 4 additions & 0 deletions crates/tabby-common/src/path.rs
@@ -89,4 +89,8 @@ impl ModelDir {
pub fn ggml_q8_0_file(&self) -> String {
self.path_string("ggml/q8_0.gguf")
}

pub fn ggml_q8_0_v2_file(&self) -> String {
Member Author

The updated llama.cpp requires re-converting all StarCoder models, so the file path is updated to keep forward compatibility.

self.path_string("ggml/q8_0.v2.gguf")
}
}
2 changes: 1 addition & 1 deletion crates/tabby-download/src/lib.rs
@@ -48,7 +48,7 @@ impl Downloader {
let files = vec![
("tabby.json", true),
("tokenizer.json", true),
("ggml/q8_0.gguf", true),
("ggml/q8_0.v2.gguf", true),
];
self.download_files(&files).await
}
2 changes: 1 addition & 1 deletion crates/tabby/src/serve/engine.rs
@@ -82,7 +82,7 @@ fn create_ctranslate2_engine(

fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
.model_path(model_dir.ggml_q8_0_file())
.model_path(model_dir.ggml_q8_0_v2_file())
.tokenizer_path(model_dir.tokenizer_file())
.use_gpu(device.ggml_use_gpu())
.build()