
feat: upgrade llama.cpp #645

Merged 5 commits on Oct 27, 2023
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -1,8 +1,12 @@
# v0.5.0 [Unreleased]

## Notice
* llama.cpp backend (CPU, Metal) now requires a re-download of the GGUF model due to upstream format changes: https://github.com/TabbyML/tabby/pull/645, https://github.com/ggerganov/llama.cpp/pull/3252

## Features

## Fixes and Improvements

* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
* add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637

2 changes: 1 addition & 1 deletion crates/llama-cpp-bindings/llama.cpp
Submodule llama.cpp updated 139 files
18 changes: 3 additions & 15 deletions crates/llama-cpp-bindings/src/engine.cc
@@ -21,7 +21,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
model_(std::move(model)),
ctx_(std::move(ctx)) {
batch_ = llama_batch_init(N_BATCH, 0);
Member Author

The previous use of the batch API in Tabby triggers a segmentation fault with the updated llama.cpp version. As a workaround, roll back to `llama_batch_get_one` (see the sketch below); we will revisit this when integrating continuous batching support.
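To make the workaround concrete, here is a minimal sketch of the `llama_batch_get_one` decode path. It is not the exact Tabby engine code: the function name `decode_and_sample` and the `tokens`, `n_tokens`, and `n_past` parameters are illustrative placeholders, while the llama.cpp calls themselves are the ones used in this diff.

```cpp
// Illustrative sketch only, not the exact Tabby engine code: decode a chunk of
// prompt tokens with llama_batch_get_one instead of a manually filled
// llama_batch. The function name and parameter names are placeholders; ctx is
// assumed to be a valid llama_context*.
#include <stdexcept>

#include "llama.h"

llama_token decode_and_sample(llama_context* ctx, llama_token* tokens,
                              int32_t n_tokens, int32_t n_past) {
  // Drop stale KV-cache entries from position n_past onward before re-decoding.
  llama_kv_cache_tokens_rm(ctx, n_past, -1);

  // llama_batch_get_one builds a single-sequence batch; with the default
  // context settings only the last token's logits are computed.
  if (llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, /*seq_id=*/0))) {
    throw std::runtime_error("Failed to eval");
  }

  // The last token's logits therefore sit at output index 0, which is why the
  // sampling code in this file switches from `batch_.n_tokens - 1` to `0`.
  const float* logits = llama_get_logits_ith(ctx, 0);
  const int n_vocab = llama_n_vocab(llama_get_model(ctx));

  // Greedy sampling: pick the token id with the highest logit.
  llama_token best = 0;
  for (int i = 1; i < n_vocab; ++i) {
    if (logits[i] > logits[best]) best = i;
  }
  return best;
}
```

Requesting logits only for the last token keeps the greedy sampler unchanged while avoiding the manual `llama_batch` bookkeeping that triggered the crash.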

}

void start(rust::Slice<const uint32_t> input_token_ids) override {
@@ -46,14 +45,14 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
}

uint32_t eos_token() const override {
return llama_token_eos(ctx_.get());
return llama_token_eos(llama_get_model(ctx_.get()));
}

private:
uint32_t sample() const {
auto* ctx = ctx_.get();

auto logits = llama_get_logits_ith(ctx, batch_.n_tokens - 1);
auto logits = llama_get_logits_ith(ctx, 0);
auto n_vocab = llama_n_vocab(llama_get_model(ctx));

// Greedy sampling (always select the highest logit).
@@ -65,18 +64,9 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
n_past_ = 0;
}

batch_.n_tokens = size;
for (size_t i = 0; i < size; ++i) {
batch_.token[i] = data[i];
batch_.pos[i] = n_past_ + i;
batch_.seq_id[i] = 0;
batch_.logits[i] = false;
}
batch_.logits[size - 1] = true;

auto* ctx = ctx_.get();
llama_kv_cache_tokens_rm(ctx, n_past_, -1);
if (llama_decode(ctx, batch_)) {
if (llama_decode(ctx, llama_batch_get_one(data, size, n_past_, 0))) {
throw std::runtime_error("Failed to eval");
}

@@ -86,8 +76,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
size_t n_past_;
owned<llama_model> model_;
owned<llama_context> ctx_;

llama_batch batch_;
};

static int g_llama_cpp_log_level = 0;
4 changes: 4 additions & 0 deletions crates/tabby-common/src/path.rs
@@ -89,4 +89,8 @@ impl ModelDir {
pub fn ggml_q8_0_file(&self) -> String {
self.path_string("ggml/q8_0.gguf")
}

pub fn ggml_q8_0_v2_file(&self) -> String {
Member Author

The updated llama.cpp requires re-converting all StarCoder models, so the file path is updated to keep forward compatibility.

self.path_string("ggml/q8_0.v2.gguf")
}
}
2 changes: 1 addition & 1 deletion crates/tabby-download/src/lib.rs
@@ -48,7 +48,7 @@ impl Downloader {
let files = vec![
("tabby.json", true),
("tokenizer.json", true),
("ggml/q8_0.gguf", true),
("ggml/q8_0.v2.gguf", true),
];
self.download_files(&files).await
}
2 changes: 1 addition & 1 deletion crates/tabby/src/serve/engine.rs
@@ -82,7 +82,7 @@ fn create_ctranslate2_engine(

fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
.model_path(model_dir.ggml_q8_0_file())
.model_path(model_dir.ggml_q8_0_v2_file())
.tokenizer_path(model_dir.tokenizer_file())
.use_gpu(device.ggml_use_gpu())
.build()