WIP: [MPT] Support MPT-7b-instruct model #460

Closed · wants to merge 114 commits

Commits (114)
9ce118c
add file for mpt-7b-instruct model
Jun 6, 2023
f922562
update build.py for mpt
Jun 6, 2023
14eb7e1
update utils
Jun 6, 2023
255ec66
add MPTConfig
Jun 6, 2023
eb09dc2
add get_model function like implementation for t5
Jun 6, 2023
5b21113
add MPTMLP, get Linear from Llama for a moment
Jun 6, 2023
a7371b8
add low-precision layer norm, need to correct it further
Jun 6, 2023
2502c2d
MPTBlock was implemented
Jun 6, 2023
cabd1a8
update MPTConfig by dtype
Jun 7, 2023
adb4e39
draft for attentions layers of MPT. some updates
Jun 7, 2023
bd7edfa
_reset_is_causal and attn_bias_shape methods were added. __init__ of …
Jun 7, 2023
feccf59
MPTForCausalLM was implemented on Relax, some TODOs were added
Jun 8, 2023
b9e5adb
rearrange from einops was replaced by relax ops
Jun 8, 2023
f86c101
reimplement scaled_multihead_dot_product_attention by relax
Jun 8, 2023
81f8712
replace torch.finfo
Jun 8, 2023
86e849b
finish scaled_multihead_dot_product_attention, some TODOs are still t…
Jun 8, 2023
0df2254
replace torch from flash_attn_fn
Jun 8, 2023
0c1c737
replace torch from triton_flash_attn_fn
Jun 9, 2023
893c51d
update MPTModel forward, replace all torch operations
Jun 9, 2023
db4aada
implement masked_fill by relax, replace torch masked_fill by it. remo…
Jun 9, 2023
dbf143a
fix max on dynamic values
Jun 9, 2023
1fab0ba
implement build_attn_bias with dependencies
Jun 9, 2023
6a40c7b
transfer of code for the sake of convenience
Jun 9, 2023
01d4b07
_attn_bias of MPTModel was implemented
Jun 9, 2023
b256f38
_apply_prefix_mask of MPTModel was implemented on relax
Jun 9, 2023
7d5b1d2
_apply_sequence_id of MPTModel was implemented on relax
Jun 9, 2023
f7b604f
fix layer norm
Jun 9, 2023
9a8a331
remove device
Jun 9, 2023
589af32
unroll flash_attn implementation using sources
Jun 9, 2023
4f5ef3f
slicings were reimplemented from python style to relax. corresponding…
Jun 10, 2023
1b58d3a
add draft for create_decoding_func. Fix two TODOs related to rank
Jun 12, 2023
3bbd62c
replace handmade masked_filled by relax op implemented in mlc-relax
Jun 12, 2023
cc3b128
replace torch logical_or by relax op implemented in mlc-relax
Jun 12, 2023
ce46093
replace torch logical_not by relax op implemented in mlc-relax
Jun 13, 2023
9daf6ac
fix TODO with index_select
Jun 13, 2023
8294423
small fixes
Jun 13, 2023
498bdb8
remove backwards
Jun 13, 2023
0f5f64f
zone different types of flash attention implementation
Jun 13, 2023
03426f6
commented flash attention implementations with types flash and triton
Jun 13, 2023
2b8bbbf
some fixes
Jun 14, 2023
b1e34bf
fix dtype in Linear layers
Jun 14, 2023
10c7cd6
fix dtype in layer norm
Jun 14, 2023
fcd8d7d
fix config using
Jun 14, 2023
30ce7b5
small fixes
Jun 14, 2023
983f523
tir.Cast was replaced by relax.op.astype
Jun 14, 2023
62f4a39
update downcast workaround for lplayernorm
Jun 14, 2023
7ad3f4f
more torch group were replaced by relax ops
Jun 14, 2023
aa187bb
correct rearrange
Jun 15, 2023
fb1138b
switch on model_path in get_model method. need to redo due to update …
Jun 15, 2023
4860be4
small fixes
Jun 15, 2023
b888dd4
replace matmul by linear for weight transposition from the box
Jun 15, 2023
be73cd9
fixes
Jun 15, 2023
f173728
check decode only for mpt models
Jun 15, 2023
9885b54
add desc for mpt-7b-instruct
Jun 16, 2023
29800ea
upstream weights mapping
Jun 16, 2023
5693108
fix assert check
Jun 16, 2023
a16753b
add custom f_convert_pname_fwd
Jun 16, 2023
2c091f6
once more update of f_convert_pname_fwd
Jun 16, 2023
fb0fc91
skip bias from weights
Jun 16, 2023
0b46f6f
add f_convert_param_bkwd
Jun 16, 2023
ffed1d5
try to fix bfloat16
Jun 16, 2023
7b61387
file structure for mpt model was refactored
Jun 19, 2023
fcdc915
add script to convert model from bfloat16 to float16
Jun 19, 2023
4126a8e
fix f_convert_param_bkwd
Jun 19, 2023
8467179
clean code for conversion script, add desc
Jun 19, 2023
896cdc9
workaround for lookup func
Jun 19, 2023
eb8ea82
add dummy create_kv_cache_func method to support mlc_chat_cli
Jun 20, 2023
4b8f4e8
debug log
Jun 20, 2023
fa6db2d
add create_softmax_func for MPT
Jun 20, 2023
8a72a9e
update transform
Jun 20, 2023
20ab52d
remove debug log
Jun 20, 2023
7499a90
add conversation for MPT
Jun 20, 2023
be4977f
remove kv_cache_func
Jun 21, 2023
9e58577
remove kv_cache from the list of funcs for mpt
Jun 21, 2023
89b216c
cast logits to float32 before softmax with temperature
Jun 21, 2023
e7d1ef8
debug log: check contiguous weight
Jun 21, 2023
bee5ffb
debug log for logits
Jun 21, 2023
c96d074
update readme and prepare_inputs_for_generation based on mpt model sp…
Jun 22, 2023
6b6fdf1
flash attn implementation was transferred to outside mpt model implem…
Jun 27, 2023
0be3602
correct import after rename file
Jun 28, 2023
51ce1ad
set temperature to zero by default for mpt model
Jul 3, 2023
a870ba1
transfer attention_mask preprocessing to decoder forward. update prep…
Jul 4, 2023
f44c4db
prepare_inputs_for_generation was fully transferred to forward pass
Jul 4, 2023
2188310
remove excess methods
Jul 4, 2023
d56c11d
remove excess methods once more
Jul 4, 2023
e811c34
init kv_cache only if need
Jun 21, 2023
e7c9350
do not use kv_cache during Forward if it is empty
Jun 21, 2023
399bce4
remove input_data from Decode_step due to it is not used and recalcul…
Jul 4, 2023
991f965
test fixes
Jul 4, 2023
f940301
unroll method from generate in README
Jul 4, 2023
6a54257
remove test logs. add PrintLogits method
Jul 5, 2023
6755d86
print logits after copy to cpu
Jul 5, 2023
aea1d83
print shape together with logits
Jul 6, 2023
9cc2494
test log
Jul 6, 2023
8880ed5
print decode input
Jul 6, 2023
8abc8c2
test log in prefill step
Jul 6, 2023
1b1458f
print intermediate tensors in topology to catch nan generation
Jul 6, 2023
bff07c3
print intermediate tensors in topology to catch nan generation: attn_…
Jul 6, 2023
69c344c
revert some debug logs
Jul 6, 2023
e116601
artificial error
Jul 6, 2023
c66a5a2
reimplement all remaining tir funcs to relax
Jul 6, 2023
63ae7c9
print only 10 values from logits
Jul 6, 2023
e68f4fb
revert test logits transform
Jul 6, 2023
0ed398b
remove debug logs and workaround
Jul 6, 2023
24b9a9c
return correct output
Jul 6, 2023
fb7870e
continue debug
Jul 6, 2023
418aadb
remove unneccessary parts from mpt topology. calculate query-key matm…
Jul 6, 2023
6ee82c7
create comparator
Jul 24, 2023
16392c9
update README
Jul 24, 2023
ddcea18
update mpt model file: fix layernorm, remove some TODOs, remove exces…
Jul 24, 2023
7b52b99
remove debug prints
Jul 24, 2023
a857583
update PrintNDArray method
Jul 24, 2023
81f92a5
support mlc-llm chat using with or without kv cache
Aug 25, 2023
bb00d1d
strong refactor based on vc/dev of mpt-like relax model to support us…
Aug 25, 2023
build.py: 28 changes (27 additions, 1 deletion)
@@ -11,7 +11,7 @@

import mlc_llm
from mlc_llm import utils
from mlc_llm.relax_model import gpt_bigcode, gpt_neox, llama, moss, rwkv
from mlc_llm.relax_model import gpt_bigcode, gpt_neox, llama, moss, rwkv, mpt


def _parse_args():
@@ -57,6 +57,12 @@ def _parse_args():
default=1,
help="Whether to use previously pickled IRModule and skip trace.",
)
args.add_argument(
"--use-kv-cache",
action="store_false",
default=True,
help="Forcely replace use_cache hyperparameter in model config",
)
args.add_argument("--debug-dump", action="store_true", default=False)
args.add_argument("--debug-load-script", action="store_true", default=False)
args.add_argument(
@@ -274,6 +280,20 @@ def mod_transform_before_build(
"get_metadata",
"reset_kv_cache",
]
elif ARGS.model.startswith("mpt-"):
if ARGS.use_kv_cache:
model_names = [
"decode",
"create_kv_cache",
"softmax_with_temperature",
"get_metadata",
]
else:
model_names = [
"decode",
"softmax_with_temperature",
"get_metadata",
]
else:
model_names = [
"prefill",
@@ -337,6 +357,10 @@ def dump_default_mlc_chat_config(args):
config["shift_fill_factor"] = 0.3
config["tokenizer_files"] = utils.get_tokenizer_files(params_path)

# TODO(vchernov): create a mechanism that produces a model-specific default config and covers this case
if args.model_category == "mpt":
config["temperature"] = 0.0

dump_path = os.path.join(params_path, "mlc-chat-config.json")
with open(dump_path, "w", encoding="utf-8") as outfile:
json.dump(config, outfile, indent=4)
@@ -407,6 +431,8 @@ def main():
mod, params = moss.get_model(ARGS, config)
elif ARGS.model_category == "rwkv":
mod, params = rwkv.get_model(ARGS, config)
elif ARGS.model_category == "mpt":
mod, params = mpt.get_model(ARGS, config)
else:
raise ValueError(f"Model {ARGS.model} not supported")
mod = mod_transform_before_build(mod, params, ARGS)
cpp/conv_templates.cc: 20 changes (20 additions, 0 deletions)
@@ -295,6 +295,25 @@ Conversation CodeGPT() {
return conv;
}

Conversation MPT() {
Conversation conv;
conv.name = "mpt";
conv.system = "";
conv.roles = {"", ""};
conv.messages = {};
conv.separator_style = SeparatorStyle::kSepRoleMsg;
conv.offset = 0;
conv.seps = {"\n"};
conv.role_msg_sep = "";
conv.role_empty_sep = "";
// TODO(mlc-team): add eos to mlc-chat-config
// and remove eos from stop token setting.
conv.stop_tokens = {0};
conv.stop_str = "<|endoftext|>";
conv.add_bos = false;
return conv;
}

} // namespace

using ConvFactory = Conversation (*)();
@@ -312,6 +331,7 @@ Conversation Conversation::FromTemplate(const std::string& name) {
{"moss", MOSS},
{"LM", VanillaLM},
{"code_gpt", CodeGPT},
{"mpt", MPT},
};
auto it = factory.find(name);
if (it == factory.end()) {
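For orientation (not part of the PR diff), a minimal sketch of how the template registered above could be retrieved at runtime; it assumes only the Conversation API visible in this diff, and the field values in the comments come from the MPT() factory added here.

// Minimal sketch (not PR code): fetch the MPT conversation template registered above.
Conversation conv = Conversation::FromTemplate("mpt");
// Per the MPT() factory: conv.name == "mpt", conv.stop_str == "<|endoftext|>",
// conv.add_bos == false, and token id 0 is the only stop token.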
cpp/llm_chat.cc: 116 changes (97 additions, 19 deletions)
@@ -128,7 +128,7 @@ class LLMChat {
friend class LLMChatModule;

public:
explicit LLMChat(DLDevice device) : device_(device) {}
explicit LLMChat(DLDevice device) : device_(device), debug_index_(0) {}

/*!
* \return Text describing runtime stats.
@@ -289,8 +289,11 @@ class LLMChat {
<< "Cannot find env function vm.builtin.attention_kv_cache_array_popn";
fkvcache_array_popn_ = *fkvcache_array_popn;

// Step 4. KV cache creation.
kv_cache_ = vm_->GetFunction("create_kv_cache")();
// Step 4. KV cache creation, if needed.
auto kv_cache_func = vm_->GetFunction("create_kv_cache");
if (kv_cache_func.defined()) {
kv_cache_ = kv_cache_func();
}

// Step 5. KV cache reset.
reset_kv_cache_func_ = vm_->GetFunction("reset_kv_cache");
@@ -508,6 +511,9 @@
}

std::vector<int32_t> prompt_tokens = this->GetInputTokens();
if (kv_cache_.empty()) {
full_output_ids_.insert(full_output_ids_.end(), prompt_tokens.begin(), prompt_tokens.end());
}
int64_t token_len = static_cast<int64_t>(prompt_tokens.size());
if (token_len == 0) return;

@@ -527,14 +533,18 @@
}

void DecodeStep() {
ICHECK(!output_ids_.empty());
int32_t last_token = output_ids_.back();
tvm::runtime::NDArray input_data = GetInputTokenNDArray({last_token});
std::vector<int32_t> input_tokens;
if (kv_cache_.empty()) {
ICHECK(!full_output_ids_.empty());
input_tokens = full_output_ids_;
} else {
ICHECK(!output_ids_.empty());
input_tokens = {output_ids_.back()};
}

auto tstart = std::chrono::high_resolution_clock::now();

NDArray logits_on_device = this->Forward({last_token}, total_seq_len_ + 1);
total_seq_len_ += 1;
NDArray logits_on_device = this->Forward(input_tokens, ++total_seq_len_);

int32_t next_token = this->SampleTokenFromLogits(logits_on_device, temperature_, top_p_);

@@ -588,12 +598,7 @@
auto decoding_end = std::chrono::high_resolution_clock::now();

// print first few logits for eyeballs
std::ostringstream os;
for (int i = 0; i < 10; ++i) {
if (i != 0) os << ", ";
os << static_cast<float*>(logits_on_cpu_->data)[i];
}
LOG(INFO) << "logits[:10] =[" << os.str() << "]";
PrintNDArray(logits_on_cpu_, 10, "Logits");

double encoding_ms = static_cast<double>((decoding_start - encoding_start).count()) / 1e6;
double decoding_ms = static_cast<double>((decoding_end - decoding_start).count()) / 1e6;
@@ -602,6 +607,62 @@
<< "decoding-time=" << decoding_ms << "ms.";
}

NDArray getArrayToPrint(NDArray array) const {
ICHECK(array->data != nullptr) << "Array data is nullptr";
// Check that the data is on CPU and copy it if needed
if (array->device.device_type != kDLCPU) {
NDArray array_cpu;
array_cpu = array.CopyTo(DLDevice{kDLCPU, 0});
TVMSynchronize(device_.device_type, device_.device_id, nullptr);
return array_cpu;
} else {
return array;
}
}

void PrintNDArray(NDArray array, int64_t num = -1, std::string tensor_tag = "Tensor", bool to_save = false) {
NDArray array_cpu = getArrayToPrint(array);

size_t ndim = array_cpu->ndim;
int64_t numel = 1;
// Print shape and calculate numel
std::ostringstream os_shape;
for (size_t i = 0; i < ndim; ++i) {
if (i != 0) os_shape << ", ";
numel *= array_cpu->shape[i];
os_shape << array_cpu->shape[i];
}

std::string num_tag = std::to_string(num);
if (num == -1 || num >= numel) {
num = numel;
num_tag = "";
}
// TODO(vchernov): switch back to LOG(INFO) after testing
std::cout << tensor_tag << " shape = [" << os_shape.str() << "]" << std::endl;
// LOG(INFO) << tensor_tag << " shape = [" << os_shape.str() << "]";

// Print specified number of values from tensor
std::ostringstream os;
const float* p_data = static_cast<float*>(array_cpu->data);
for (int64_t i = 0; i < num; ++i) {
if (i != 0) os << ", ";
os << p_data[i];
}
// TODO(vchernov): switch back to LOG(INFO) after testing
std::cout << tensor_tag << "[:" << num_tag << "] = [" << os.str() << "]" << std::endl;
// LOG(INFO) << tensor_tag << "[:" << num_tag << "] = [" << os.str() << "]";

// Save to binary file
if (to_save) {
std::string file_name = "tensor_" + std::to_string(debug_index_++) + ".bin";
std::cout << tensor_tag << " is saved in " << file_name << std::endl;
std::ofstream fs(file_name, std::ios::out | std::ios::binary | std::ios::app);
fs.write(reinterpret_cast<const char*>(p_data), 4 * numel);
fs.close();
}
}

private:
picojson::value SerializeConfigToJSONValue() const {
picojson::object config;
@@ -656,6 +717,9 @@

if (!stop_triggered_) {
output_ids_.push_back(next_token);
if (kv_cache_.empty()) {
full_output_ids_.push_back(next_token);
}
appeared_token_ids_.insert(next_token);
}

@@ -699,10 +763,16 @@
ret = prefill_func_(input_data, ShapeTuple({cur_pos}), kv_cache_, params_);
} else {
// running decode function when prefill is not available
for (int i = 0; i < input_tokens.size(); ++i) {
NDArray input_data = this->GetInputTokenNDArray({input_tokens[i]});
int64_t pos = cur_pos + i + 1 - input_tokens.size();
ret = decode_func_(input_data, ShapeTuple({pos}), kv_cache_, params_);
if (kv_cache_.empty()) {
// Without kv_cache, the full sequence of tokens is used
NDArray input_data = this->GetInputTokenNDArray(input_tokens);
ret = decode_func_(input_data, params_);
} else {
for (int i = 0; i < input_tokens.size(); ++i) {
NDArray input_data = this->GetInputTokenNDArray({input_tokens[i]});
int64_t pos = cur_pos + i + 1 - input_tokens.size();
ret = decode_func_(input_data, ShapeTuple({pos}), kv_cache_, params_);
}
}
}
return Downcast<NDArray>(ret[0]);
@@ -763,7 +833,10 @@
// Clear kv cache
void ResetKVCache() { reset_kv_cache_func_(kv_cache_); }

void ProcessSystemPrompts() { this->PrefillStep(/*inp=*/"", /*append_conversation=*/false); }
void ProcessSystemPrompts() {
full_output_ids_.clear();
this->PrefillStep(/*inp=*/"", /*append_conversation=*/false);
}

// Utils
static double GetRandomNumber() {
Expand All @@ -783,6 +856,7 @@ class LLMChat {
ICHECK(logits_on_cpu_.defined()) << "logits_on_cpu_ is not defined";
ICHECK_EQ(logits_on_cpu_->ndim, 3) << "logits_on_cpu_ should be 3D";
ICHECK_EQ(logits_on_cpu_->shape[0], 1) << "logits_on_cpu_ should be 1 batch";

return fsample_topp_from_prob_(logits_on_cpu_, top_p_, GetRandomNumber());
}

Expand Down Expand Up @@ -816,6 +890,8 @@ class LLMChat {
double top_p_{0.95};
// output ids till now (refresh after encoding step)
std::vector<int32_t> output_ids_;
// output ids till now (sys and client prompt + generated by decoder)
std::vector<int32_t> full_output_ids_;
// appeared token ids till now (refresh after encoding step)
std::unordered_set<int32_t> appeared_token_ids_;
// output message till now (refresh after encoding step)
@@ -866,6 +942,8 @@
Array<ObjectRef> kv_cache_;
// Temp logits on cpu
NDArray logits_on_cpu_{nullptr};
// Debug index
int32_t debug_index_;
};

/*!
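As a usage note (the call site below is hypothetical, not taken from the PR), the debug helper added above can be invoked with the signature shown in this diff to print the first few logits and optionally dump the tensor to disk.

// Hypothetical call site for the PrintNDArray helper added above:
// prints the tensor shape, the first 10 values, and appends the raw
// float data to a file named tensor_<debug_index_>.bin.
PrintNDArray(logits_on_cpu_, /*num=*/10, /*tensor_tag=*/"Logits", /*to_save=*/true);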
mlc_llm/dispatch/dispatch_tir_operator.py: 3 changes (3 additions, 0 deletions)
@@ -19,6 +19,9 @@ def __init__(self, model: str):
elif model == "rwkv":
lookup = None

elif model == "mpt":
lookup = None

else:
raise ValueError(f"Model {model} not supported")
self.lookup = lookup
mlc_llm/relax_model/__init__.py: 1 change (1 addition, 0 deletions)
@@ -1 +1,2 @@
from . import llama
from .mpt import mpt