fix baichuan2-7b, deepseek-vl and xcomposer2d5-4bit

InternLM · Aug 6, 2024 · bf33b6d
1 parent 69f90ae
commit bf33b6d
Show file tree

Hide file tree

Showing 4 changed files with 11 additions and 6 deletions.
diff --git a/lmdeploy/turbomind/deploy/policy.py b/lmdeploy/turbomind/deploy/policy.py
@@ -24,11 +24,13 @@ def unpack_awq_gemm(x: torch.Tensor) -> torch.Tensor:
     return torch.stack(ys, dim=-1).view(*x.shape[:-1], -1)
 
 
-def process_awq_gemm(x: torch.Tensor, *args):
+def process_awq_gemm(x: torch.Tensor, kind: str):
     x = x.cuda()
     if x.dtype == torch.int32:
         x = unpack_awq_gemm(x)
-    return x.t()
+    if kind in ['qweight', 'qzeros', 'scales']:
+        x = x.t()
+    return x
 
 
 def process_gptq(x: torch.Tensor, kind: str):
@@ -39,7 +41,9 @@ def process_gptq(x: torch.Tensor, kind: str):
             x = torch.stack(xs, dim=1).view(-1, x.size(-1))
         else:  # 'qzeros' (k/g,n/8)
             x = torch.stack(xs, dim=-1).view(x.size(0), -1) + 1
-    return x.t()
+    if kind in ['qweight', 'qzeros', 'scales']:
+        x = x.t()
+    return x
 
 
 def get_input_policy(model_format):

diff --git a/lmdeploy/turbomind/deploy/source_model/baichuan.py b/lmdeploy/turbomind/deploy/source_model/baichuan.py
@@ -14,7 +14,7 @@ def _attn(self, i: int, kind: str):
         q, k, v, o = (None, ) * 4
         pack_key = f'model.layers.{i}.self_attn.W_pack.{kind}'
         qkv = self.transform(self.params.get(pack_key), kind)
-        if qkv:
+        if qkv is not None:
             q, k, v = torch.split(qkv, qkv.shape[0] // 3, dim=0)
         o = self.params.get(f'model.layers.{i}.self_attn.o_proj.{kind}')
         o = self.transform(o, kind)

diff --git a/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py b/lmdeploy/turbomind/deploy/source_model/deepseek_vl.py
@@ -9,6 +9,7 @@
 class DeepSeekVLReader(LlamaReader):
     """DeepSeekVL model reader."""
 
+    attn_layer_prefix = 'language_model.model.layers'
     attn_layer_patten = r'language_model.model.layers.([0-9]+).'
     tok_embeddings_key = 'language_model.model.embed_tokens.weight'
     norm_weight_key = 'language_model.model.norm.weight'

diff --git a/src/turbomind/kernels/gemm/tune/stopping_criterion.cc b/src/turbomind/kernels/gemm/tune/stopping_criterion.cc
@@ -9,8 +9,8 @@ class Optimistic: public StoppingCriterion {
 public:
     Optimistic(int min_iter, int max_iter, float max_ms)
     {
-        min_iter_ = min_iter ? min_iter > 0 : 1;
-        max_iter_ = max_iter ? max_iter > 0 : std::numeric_limits<int>::max();
+        min_iter_ = std::max(min_iter, 1);
+        max_iter_ = max_iter > 0 ? max_iter : std::numeric_limits<int>::max();
         max_ms_   = max_ms > 0 ? max_ms : std::numeric_limits<float>::infinity();
     }
     bool should_stop(const Stats& stats) override