diff --git a/CHANGELOG.md b/CHANGELOG.md
index 744449f2dafb..ead851f841f8 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,12 @@
 # v0.5.0 [Unreleased]
 
+## Notice
+* llama.cpp backend (CPU, Metal) now requires a redownload of gguf model due to upstream format changes: https://github.com/TabbyML/tabby/pull/645 https://github.com/ggerganov/llama.cpp/pull/3252
+
 ## Features
 
 ## Fixes and Improvements
+
 * Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
 * add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637
 
diff --git a/crates/llama-cpp-bindings/llama.cpp b/crates/llama-cpp-bindings/llama.cpp
index 6ed7dce31afd..5cc49e631f09 160000
--- a/crates/llama-cpp-bindings/llama.cpp
+++ b/crates/llama-cpp-bindings/llama.cpp
@@ -1 +1 @@
-Subproject commit 6ed7dce31afdf4d5a11ed8bfd0f993dcb8df39c0
+Subproject commit 5cc49e631f0902f33b10b7703b4d174fd635ccd9
diff --git a/crates/llama-cpp-bindings/src/engine.cc b/crates/llama-cpp-bindings/src/engine.cc
index 7f3f2986cd2a..e5d10c26821b 100644
--- a/crates/llama-cpp-bindings/src/engine.cc
+++ b/crates/llama-cpp-bindings/src/engine.cc
@@ -21,7 +21,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
   TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
     model_(std::move(model)),
     ctx_(std::move(ctx)) {
-      batch_ = llama_batch_init(N_BATCH, 0);
   }
 
   void start(rust::Slice<const uint32_t> input_token_ids) override {
@@ -46,14 +45,14 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
   }
 
   uint32_t eos_token() const override {
-    return llama_token_eos(ctx_.get());
+    return llama_token_eos(llama_get_model(ctx_.get()));
   }
 
 private:
  uint32_t sample() const {
    auto* ctx = ctx_.get();
 
-    auto logits = llama_get_logits_ith(ctx, batch_.n_tokens - 1);
+    auto logits = llama_get_logits_ith(ctx, 0);
    auto n_vocab = llama_n_vocab(llama_get_model(ctx));
 
    // Greedy sampling (always select the highest logit).
@@ -65,18 +64,9 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
       n_past_ = 0;
     }
 
-    batch_.n_tokens = size;
-    for (size_t i = 0; i < size; ++i) {
-      batch_.token[i] = data[i];
-      batch_.pos[i] = n_past_ + i;
-      batch_.seq_id[i] = 0;
-      batch_.logits[i] = false;
-    }
-    batch_.logits[size - 1] = true;
-
     auto* ctx = ctx_.get();
     llama_kv_cache_tokens_rm(ctx, n_past_, -1);
-    if (llama_decode(ctx, batch_)) {
+    if (llama_decode(ctx, llama_batch_get_one(data, size, n_past_, 0))) {
       throw std::runtime_error("Failed to eval");
     }
 
@@ -86,8 +76,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
   size_t n_past_;
   owned<llama_model> model_;
   owned<llama_context> ctx_;
-
-  llama_batch batch_;
 };
 
 static int g_llama_cpp_log_level = 0;
diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs
index 9dc0ec0fc8f9..17717a40f114 100644
--- a/crates/tabby-common/src/path.rs
+++ b/crates/tabby-common/src/path.rs
@@ -89,4 +89,8 @@ impl ModelDir {
     pub fn ggml_q8_0_file(&self) -> String {
         self.path_string("ggml/q8_0.gguf")
     }
+
+    pub fn ggml_q8_0_v2_file(&self) -> String {
+        self.path_string("ggml/q8_0.v2.gguf")
+    }
 }
diff --git a/crates/tabby-download/src/lib.rs b/crates/tabby-download/src/lib.rs
index 4ce9a8abfa27..16cf31a8aa4a 100644
--- a/crates/tabby-download/src/lib.rs
+++ b/crates/tabby-download/src/lib.rs
@@ -48,7 +48,7 @@ impl Downloader {
         let files = vec![
             ("tabby.json", true),
             ("tokenizer.json", true),
-            ("ggml/q8_0.gguf", true),
+            ("ggml/q8_0.v2.gguf", true),
         ];
         self.download_files(&files).await
     }
diff --git a/crates/tabby/src/serve/engine.rs b/crates/tabby/src/serve/engine.rs
index 8675bf32b097..b2e14ae08ce0 100644
--- a/crates/tabby/src/serve/engine.rs
+++ b/crates/tabby/src/serve/engine.rs
@@ -82,7 +82,7 @@ fn create_ctranslate2_engine(
 
 fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
     let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
-        .model_path(model_dir.ggml_q8_0_file())
+        .model_path(model_dir.ggml_q8_0_v2_file())
         .tokenizer_path(model_dir.tokenizer_file())
         .use_gpu(device.ggml_use_gpu())
         .build()
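The engine.cc hunks above drop the long-lived, hand-filled `llama_batch` in favor of `llama_batch_get_one()`, which wraps an existing token span as a single-sequence batch for `llama_decode`. A minimal sketch of that decode-and-sample pattern, assuming the llama.cpp API at the pinned submodule commit; the `decode_and_sample` helper and its surrounding setup are illustrative, not code from this PR:

```cpp
#include <stdexcept>
#include <vector>

#include "llama.h"

// Illustrative helper (not from the PR): decode `tokens` starting at position
// `n_past` and return the greedily sampled next token.
static llama_token decode_and_sample(llama_context* ctx,
                                     std::vector<llama_token>& tokens,
                                     int n_past) {
  // Wrap the token span as a single-sequence batch: positions start at
  // n_past, sequence id 0, logits requested only for the last token.
  llama_batch batch = llama_batch_get_one(tokens.data(),
                                          static_cast<int32_t>(tokens.size()),
                                          n_past, 0);
  if (llama_decode(ctx, batch)) {
    throw std::runtime_error("Failed to eval");
  }

  // The last token's logits are read back at index 0, matching the
  // llama_get_logits_ith(ctx, 0) call in the diff.
  const float* logits = llama_get_logits_ith(ctx, 0);
  const int n_vocab = llama_n_vocab(llama_get_model(ctx));

  // Greedy sampling: pick the highest logit.
  llama_token best = 0;
  for (int i = 1; i < n_vocab; ++i) {
    if (logits[i] > logits[best]) best = i;
  }
  return best;
}
```

Because `llama_batch_get_one()` only requests logits for the final token of the span, the sampling code no longer indexes by `batch_.n_tokens - 1` and instead reads index 0, which is why the separate `llama_batch` member and its `llama_batch_init`/manual population could be removed.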