From 8cd0a286b46b6f946f499827f22ac0d37bb4d1f5 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 26 Jan 2024 08:29:09 -0500 Subject: [PATCH 01/41] mamba : begin working on support for Mamba SSM --- convert-hf-to-gguf.py | 11 ++ ggml.c | 185 ++++++++++++++++++++++++++++++++- ggml.h | 19 ++++ gguf-py/gguf/constants.py | 30 ++++++ gguf-py/gguf/tensor_mapping.py | 45 +++++++- llama.cpp | 183 ++++++++++++++++++++++++++++++++ 6 files changed, 469 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index ffdba74441e19..28e865e5c6698 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1844,6 +1844,17 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 +@Model.register("MambaForCausalLM") +class MambaModel(Model): + model_arch = gguf.MODEL_ARCH.MAMBA + + def set_gguf_parameters(self): + self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_file_type(self.ftype) + + ###### CONVERSION LOGIC ###### diff --git a/ggml.c b/ggml.c index f29b9f13fbcaf..597bf319ed15a 100644 --- a/ggml.c +++ b/ggml.c @@ -1577,6 +1577,7 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } +inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } @@ -1778,6 +1779,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "DIV", "SQR", "SQRT", + "EXP", "LOG", "SUM", "SUM_ROWS", @@ -1811,6 +1813,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "DIAG_MASK_ZERO", "SOFT_MAX", "SOFT_MAX_BACK", + "SOFT_PLUS", "ROPE", "ROPE_BACK", "ALIBI", @@ -1850,7 +1853,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1864,6 +1867,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "x/y", "x^2", "√x", + "e^x", // or should this be "exp(x)"? 
"log(x)", "Σx", "Σx_k", @@ -1897,6 +1901,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "diag_mask_zero(x)", "soft_max(x)", "soft_max_back(x)", + "soft_plus(x)", "rope(x)", "rope_back(x)", "alibi(x)", @@ -1936,7 +1941,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 72, "GGML_OP_COUNT != 72"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -3796,6 +3801,39 @@ struct ggml_tensor * ggml_sqrt_inplace( return ggml_sqrt_impl(ctx, a, true); } +// ggml_exp + +static struct ggml_tensor * ggml_exp_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + bool is_node = false; + + if (!inplace && (a->grad)) { + is_node = true; + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_EXP; + result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_exp( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_exp_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_exp_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_exp_impl(ctx, a, true); +} + // ggml_log static struct ggml_tensor * ggml_log_impl( @@ -5291,6 +5329,42 @@ struct ggml_tensor * ggml_soft_max_back_inplace( return ggml_soft_max_back_impl(ctx, a, b, true); } +// ggml_soft_plus + +struct ggml_tensor * ggml_soft_plus_impl( + struct ggml_context * ctx, + struct ggml_tensor * a, + bool inplace) { + + // TODO: does `a` need to be contiguous? + + bool is_node = false; + + if (a->grad) { + is_node = true; // TODO : implement backward pass + } + + struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); + + result->op = GGML_OP_SOFT_PLUS; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = a; + + return result; +} + +struct ggml_tensor * ggml_soft_plus( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_plus_impl(ctx, a, false); +} + +struct ggml_tensor * ggml_soft_plus_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a) { + return ggml_soft_plus_impl(ctx, a, true); +} + // ggml_rope static struct ggml_tensor * ggml_rope_impl( @@ -8593,6 +8667,49 @@ static void ggml_compute_forward_sqrt( } } +// ggml_compute_forward_exp + +static void ggml_compute_forward_exp_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int n = ggml_nrows(src0); + const int nc = src0->ne[0]; + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < n; i++) { + ggml_vec_exp_f32(nc, + (float *) ((char *) dst->data + i*( dst->nb[1])), + (float *) ((char *) src0->data + i*(src0->nb[1]))); + } +} + +static void ggml_compute_forward_exp( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_exp_f32(params, src0, dst); + } break; + case GGML_TYPE_F16: // TODO: use ggml_table_exp_f16 + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_log static void ggml_compute_forward_log_f32( @@ -12052,6 +12169,48 @@ static void ggml_compute_forward_soft_max_back( } } +static void ggml_compute_forward_soft_plus_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_are_same_shape(src0, dst)); + + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); + + GGML_ASSERT( dst->nb[0] == sizeof(float)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + + for (int i = 0; i < nr; ++i) { + float * x = (float *) ((char *) dst->data + i*( dst->nb[1])); + float * y = (float *) ((char *) src0->data + i*(src0->nb[1])); + for (int j = 0; j < nc; ++j) { + x[j] = logf(1.0f + expf(y[i])); + } + } +} + +static void ggml_compute_forward_soft_plus( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_soft_plus_f32(params, src0, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_alibi static void ggml_compute_forward_alibi_f32( @@ -15447,6 +15606,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_sqrt(params, tensor); } break; + case GGML_OP_EXP: + { + ggml_compute_forward_exp(params, tensor->src[0], tensor); + } break; case GGML_OP_LOG: { ggml_compute_forward_log(params, tensor); @@ -15571,6 +15734,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_soft_max_back(params, tensor); } break; + case GGML_OP_SOFT_PLUS: + { + ggml_compute_forward_soft_plus(params, tensor->src[0], tensor); + } case GGML_OP_ROPE: { ggml_compute_forward_rope(params, tensor); @@ -16123,6 +16290,10 @@ static 
void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor zero_table); } } break; + case GGML_OP_EXP: + { + GGML_ASSERT(false); // TODO: implement + } break; case GGML_OP_LOG: { if (src0->grad) { @@ -16501,6 +16672,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; + case GGML_OP_SOFT_PLUS: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_ROPE: { // necessary for llama @@ -17243,6 +17418,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_SUB: case GGML_OP_SQR: case GGML_OP_SQRT: + case GGML_OP_EXP: case GGML_OP_LOG: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: @@ -17343,6 +17519,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); } break; + case GGML_OP_SOFT_PLUS: + { + n_tasks = 1; //TODO + } break; case GGML_OP_CONV_TRANSPOSE_1D: { n_tasks = n_threads; @@ -17715,6 +17895,7 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } } break; case GGML_OP_SOFT_MAX: + case GGML_OP_SOFT_PLUS: case GGML_OP_ROPE: { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; diff --git a/ggml.h b/ggml.h index 0a6d3c051fe72..efb62c5983f44 100644 --- a/ggml.h +++ b/ggml.h @@ -410,6 +410,7 @@ extern "C" { GGML_OP_DIV, GGML_OP_SQR, GGML_OP_SQRT, + GGML_OP_EXP, GGML_OP_LOG, GGML_OP_SUM, GGML_OP_SUM_ROWS, @@ -443,6 +444,7 @@ extern "C" { GGML_OP_DIAG_MASK_ZERO, GGML_OP_SOFT_MAX, GGML_OP_SOFT_MAX_BACK, + GGML_OP_SOFT_PLUS, GGML_OP_ROPE, GGML_OP_ROPE_BACK, GGML_OP_ALIBI, @@ -932,6 +934,14 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_exp( + struct ggml_context * ctx, + struct ggml_tensor * a); + + GGML_API struct ggml_tensor * ggml_exp_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + GGML_API struct ggml_tensor * ggml_log( struct ggml_context * ctx, struct ggml_tensor * a); @@ -1420,6 +1430,15 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); + GGML_API struct ggml_tensor * ggml_soft_plus( + struct ggml_context * ctx, + struct ggml_tensor * a); + + // in-place, returns view(a) + GGML_API struct ggml_tensor * ggml_soft_plus_inplace( + struct ggml_context * ctx, + struct ggml_tensor * a); + // rotary position embedding // if mode & 1 == 1, skip n_past elements (DEPRECATED) // if mode & 2 == 1, GPT-NeoX style diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a62139811ef36..a281083830ce8 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -113,6 +113,7 @@ class MODEL_ARCH(IntEnum): MINICPM = auto() GEMMA = auto() STARCODER2 = auto() + MAMBA = auto() class MODEL_TENSOR(IntEnum): @@ -144,6 +145,13 @@ class MODEL_TENSOR(IntEnum): ATTN_Q_NORM = auto() ATTN_K_NORM = auto() LAYER_OUT_NORM = auto() + SSM_IN = auto() + SSM_CONV1D = auto() + SSM_X = auto() + SSM_DT = auto() + SSM_A = auto() + SSM_D = auto() + SSM_OUT = auto() MODEL_ARCH_NAMES: dict[MODEL_ARCH, str] = { @@ -171,6 +179,7 @@ class MODEL_TENSOR(IntEnum): MODEL_ARCH.MINICPM: "minicpm", MODEL_ARCH.GEMMA: "gemma", MODEL_ARCH.STARCODER2: "starcoder2", + MODEL_ARCH.MAMBA: "mamba", } TENSOR_NAMES: dict[MODEL_TENSOR, str] = { @@ -202,6 +211,14 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", + # FIXME: NAMES 
FOR MAMBA ARE NOT FINAL + MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", + MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", + MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", + MODEL_TENSOR.SSM_DT: "blk.{bid}.ssm_dt", + MODEL_TENSOR.SSM_A: "blk.{bid}.ssm_a", + MODEL_TENSOR.SSM_D: "blk.{bid}.ssm_d", + MODEL_TENSOR.SSM_OUT: "blk.{bid}.ssm_out", } MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = { @@ -543,6 +560,19 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN, MODEL_TENSOR.FFN_UP, ], + MODEL_ARCH.MAMBA: [ + MODEL_TENSOR.TOKEN_EMBD, + MODEL_TENSOR.OUTPUT_NORM, + MODEL_TENSOR.OUTPUT, + MODEL_TENSOR.ATTN_NORM, + MODEL_TENSOR.SSM_IN, + MODEL_TENSOR.SSM_CONV1D, + MODEL_TENSOR.SSM_X, + MODEL_TENSOR.SSM_DT, + MODEL_TENSOR.SSM_A, + MODEL_TENSOR.SSM_D, + MODEL_TENSOR.SSM_OUT, + ], # TODO } diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index db2ec9704a441..d24d10dcb6cff 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -20,6 +20,8 @@ class TensorNameMap: "wte", # gpt2 "transformer.embd.wte", # phi2 "model.tok_embeddings", # internlm2 + "model.embedding", # mamba + "backbone.embedding", # mamba ), # Token type embeddings @@ -44,7 +46,7 @@ class TensorNameMap: # Output MODEL_TENSOR.OUTPUT: ( "embed_out", # gptneox - "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen + "lm_head", # gpt2 mpt falcon llama-hf baichuan qwen mamba "output", # llama-pth bloom internlm2 "word_embeddings_for_head", # persimmon "lm_head.linear", # phi2 @@ -61,6 +63,8 @@ class TensorNameMap: "language_model.encoder.final_layernorm", # persimmon "model.final_layernorm", # persimmon "lm_head.ln", # phi2 + "model.norm_f", # mamba + "backbone.norm_f", # mamba ), # Rope frequencies @@ -86,6 +90,8 @@ class TensorNameMap: "transformer.h.{bid}.ln", # phi2 "model.layers.layers.{bid}.norm", # plamo "model.layers.{bid}.attention_norm", # internlm2 + "model.layers.{bid}.norm", # mamba + "backbone.layers.{bid}.mixer.norm", # mamba ), # Attention norm 2 @@ -282,7 +288,42 @@ class TensorNameMap: MODEL_TENSOR.LAYER_OUT_NORM: ( "encoder.layer.{bid}.output.LayerNorm", # bert "encoder.layers.{bid}.norm2", # nomic-bert - ) + ), + + MODEL_TENSOR.SSM_IN: ( + "model.layers.{bid}.in_proj", + "backbone.layers.{bid}.mixer.in_proj", + ), + + MODEL_TENSOR.SSM_CONV1D: ( + "model.layers.{bid}.conv1d", + "backbone.layers.{bid}.mixer.conv1d", + ), + + MODEL_TENSOR.SSM_X: ( + "model.layers.{bid}.x_proj", + "backbone.layers.{bid}.mixer.x_proj", + ), + + MODEL_TENSOR.SSM_DT: ( + "model.layers.{bid}.dt_proj", + "backbone.layers.{bid}.mixer.dt_proj", + ), + + MODEL_TENSOR.SSM_A: ( + "model.layers.{bid}.A_log", + "backbone.layers.{bid}.mixer.A_log", + ), + + MODEL_TENSOR.SSM_D: ( + "model.layers.{bid}.D", + "backbone.layers.{bid}.mixer.D", + ), + + MODEL_TENSOR.SSM_OUT: ( + "model.layers.{bid}.out_proj", + "backbone.layers.{bid}.mixer.out_proj", + ), } mapping: dict[str, tuple[MODEL_TENSOR, str]] diff --git a/llama.cpp b/llama.cpp index c1f015791e826..2dc1cc1b33c98 100644 --- a/llama.cpp +++ b/llama.cpp @@ -213,6 +213,7 @@ enum llm_arch { LLM_ARCH_MINICPM, LLM_ARCH_GEMMA, LLM_ARCH_STARCODER2, + LLM_ARCH_MAMBA, LLM_ARCH_UNKNOWN, }; @@ -241,6 +242,7 @@ static const std::map LLM_ARCH_NAMES = { { LLM_ARCH_MINICPM, "minicpm" }, { LLM_ARCH_GEMMA, "gemma" }, { LLM_ARCH_STARCODER2, "starcoder2" }, + { LLM_ARCH_MAMBA, "mamba" }, { LLM_ARCH_UNKNOWN, "(unknown)" }, }; @@ -399,6 +401,15 @@ enum llm_tensor { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, + // TODO: maybe use longer names? 
+ // TODO: can the in_proj and/or the out_proj instead re-use some of the above types? + LLM_TENSOR_SSM_IN, + LLM_TENSOR_SSM_CONV1D, + LLM_TENSOR_SSM_X, + LLM_TENSOR_SSM_DT, + LLM_TENSOR_SSM_A, + LLM_TENSOR_SSM_D, + LLM_TENSOR_SSM_OUT, }; static const std::map> LLM_TENSOR_NAMES = { @@ -801,6 +812,22 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" }, }, }, + { + LLM_ARCH_MAMBA, + { + { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, + { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, + { LLM_TENSOR_OUTPUT, "output" }, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in"}, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d"}, + { LLM_TENSOR_SSM_X, "blk.%d.ssm_x"}, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt"}, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a"}, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d"}, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out"}, + }, + }, { LLM_ARCH_UNKNOWN, { @@ -1737,6 +1764,22 @@ struct llama_layer { struct ggml_tensor * ffn_down_b; // b2 struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_act; + + + // mamba proj + struct ggml_tensor * ssm_in; + struct ggml_tensor * ssm_x; + struct ggml_tensor * ssm_dt; + struct ggml_tensor * ssm_out; + + // mamba + struct ggml_tensor * ssm_conv1d; + struct ggml_tensor * ssm_a; + struct ggml_tensor * ssm_d; + + // mamba bias + struct ggml_tensor * ssm_conv1d_b; + struct ggml_tensor * ssm_dt_b; }; struct llama_kv_cell { @@ -3376,6 +3419,29 @@ static void llm_load_hparams( default: model.type = e_model::MODEL_UNKNOWN; } } break; + case LLM_ARCH_MAMBA: + { + switch (hparams.n_layer) { + case 24: + switch (hparams.n_embd) { + case 768: model.type = e_model::MODEL_SMALL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 48: + switch (hparams.n_embd) { + case 1024: model.type = e_model::MODEL_MEDIUM; break; + case 1536: model.type = e_model::MODEL_LARGE; break; + case 2048: model.type = e_model::MODEL_XL; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + case 64: + switch (hparams.n_embd) { + case 2560: model.type = e_model::MODEL_3B; break; + default: model.type = e_model::MODEL_UNKNOWN; + } break; + default: model.type = e_model::MODEL_UNKNOWN; + } + } default: (void)0; } @@ -4596,6 +4662,36 @@ static bool llm_load_tensors( layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP , "bias", i), { n_ff}); } } break; + case LLM_ARCH_MAMBA: + { + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + + // output + { + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + } + // TODO: MAMBA + + for (int i = 0; i < n_layer; ++i) { + ggml_context * ctx_layer = ctx_for_layer(i); + ggml_context * ctx_split = ctx_for_layer_split(i); + + auto & layer = model.layers[i]; + + // norm + layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); + + // TODO: D, in_proj, conv1d, x_proj, dt_proj, A_log, out_proj + // TODO: what's the difference between ctx_layer and ctx_split? + // A: It seems that ctx_split is for matrices (2d???) while ctx_layer is for other things (like 1D bias and norms, probably.) 
+ + // out_proj + layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {2*n_embd, n_embd}); + + + } + } default: throw std::runtime_error("unknown architecture"); } @@ -7779,6 +7875,92 @@ struct llm_build_context { return gf; } + + struct ggml_cgraph * build_mamba() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + // d_model + const int64_t n_embd = hparams.n_embd; + const int64_t d_state = 16; + const int64_t d_conv = 4; + // expand = 2 + // d_inner = expand * d_model + const int64_t d_inner = 2 * n_embd; // FIXME: this is wrong + + struct ggml_tensor * cur; + struct ggml_tensor * inpL; + + // TODO: give it the right size + struct ggml_tensor * state; + + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + cb(inpL, "inp_embd", -1); + + for (int il = 0; il < n_layer; ++il) { + // FIXME: init attn_norm + // norm + cur = llm_build_norm(ctx0, inpL, hparams, + model.layers[il].attn_norm, NULL, + LLM_NORM_RMS, cb, il); + // TODO: that's probably the wrong name. + cb(cur, "attn_norm", il); + + // conv + { + // [] * [] = [2*n_embd] + struct ggml_tensor * xz = ggml_mul_mat(ctx0, cur, model.layers[il].ssm_in); + // split the above in two + struct ggml_tensor * x = ggml_view_1d(ctx0, xz, d_inner, 0); + struct ggml_tensor * z = ggml_view_1d(ctx0, xz, d_inner, d_inner); + + + // FIXME: this is wrong + cur = ggml_conv_1d(ctx0, cur, model.layers[il].ssm_conv1d, 1, d_conv - 1, 1); + + cur = ggml_add(ctx0, cur, model.layers[il].ssm_conv1d_b); + + // TODO: there's some SiLU in there (but no ffn? or is the conv an ffn?) + cur = ggml_silu(ctx0, cur); + } + + // ssm + { + + // TODO: use ggml_soft_plus here + + } + + // TODO: there's some SiLU again towards the end. Can the `llm_build_ffn` helper be used? + // Maybe the best way is to implement it, _then_ check if that helper would do the same thing. 
+ // discretize + { + } + + // residual + cur = ggml_add(ctx0, cur, inpL); + cb(cur, "l_out", il); + + // input for next layer + inpL = cur; + } + + // the last step of each layer already makes these equivalent + // cur = inpL; + + // final rmsnorm + cur = llm_build_norm(ctx0, cur, hparams, + model.output_norm, NULL, + LLM_NORM_RMS, cb, -1); + cb(cur, "result_norm", -1); + + // lm_head + cur = ggml_mul_mat(ctx0, model.output, cur); + cb(cur, "result_output", -1); + + ggml_build_forward_expand(gf, cur); + + return gf; + } }; static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector & ids) { @@ -12321,6 +12503,7 @@ enum llama_rope_type llama_rope_type(const struct llama_model * model) { case LLM_ARCH_MPT: case LLM_ARCH_REFACT: case LLM_ARCH_BLOOM: + case LLM_ARCH_MAMBA: return LLAMA_ROPE_TYPE_NONE; // use what we call a normal RoPE, operating on pairs of consecutive head values From 5a69a262a13d5672961a7edf82e56e55acd67d17 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 27 Jan 2024 11:41:20 -0500 Subject: [PATCH 02/41] mamba : begin figuring out how to (ab)use the kv cache for Mamba --- convert-hf-to-gguf.py | 6 ++- llama.cpp | 107 +++++++++++++++++++++++++++++------------- 2 files changed, 79 insertions(+), 34 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 28e865e5c6698..fab409b084a42 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1849,9 +1849,13 @@ class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA def set_gguf_parameters(self): + d_model = self.hparams["d_model"] self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_embedding_length(self.hparams["d_model"]) + self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_head_count(2 * d_model) # d_inner + self.gguf_writer.add_key_length(4) # d_conv + self.gguf_writer.add_value_length(16) # d_state self.gguf_writer.add_file_type(self.ftype) diff --git a/llama.cpp b/llama.cpp index 2dc1cc1b33c98..c46f669e36125 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1765,7 +1765,7 @@ struct llama_layer { struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_act; - + // mamba proj struct ggml_tensor * ssm_in; struct ggml_tensor * ssm_x; @@ -2067,6 +2067,14 @@ static bool llama_kv_cache_init( const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const int64_t n_layer = hparams.n_layer; + if (model.arch == LLM_ARCH_MAMBA) { + // only one slot is needed for Mamba + n_ctx = 1; + // it's probably best to keep as much precision as possible for the states + ktype = GGML_TYPE_F32; + vtype = GGML_TYPE_F32; + } + cache.has_shift = false; cache.head = 0; @@ -2151,6 +2159,12 @@ static bool llama_kv_cache_find_slot( const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; + // for Mamba and/or other model archs that only ever use one slot + if (n_ctx == 1) { + // hopefully no one actually uses a context size of 1 on Transformer-based models + return true; + } + if (n_tokens > n_ctx) { LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); return false; @@ -4665,13 +4679,19 @@ static bool llm_load_tensors( case LLM_ARCH_MAMBA: { model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - + + const int64_t d_conv = hparams.n_embd_head_k; + const int64_t d_state = hparams.n_embd_head_v; + const int64_t d_inner = hparams.n_head; + // FIXME: ceiling instead of floor + const int64_t dt_rank = 
n_embd / 16; + GGML_ASSERT(2 * n_embd == d_inner); + // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } - // TODO: MAMBA for (int i = 0; i < n_layer; ++i) { ggml_context * ctx_layer = ctx_for_layer(i); @@ -4679,19 +4699,30 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; + // TODO: what's the difference between ctx_layer and ctx_split? + // A: It seems that ctx_split is for matrices (2d???) while ctx_layer is for other things (like 1D bias and norms, probably.) + // norm layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); - // TODO: D, in_proj, conv1d, x_proj, dt_proj, A_log, out_proj - // TODO: what's the difference between ctx_layer and ctx_split? - // A: It seems that ctx_split is for matrices (2d???) while ctx_layer is for other things (like 1D bias and norms, probably.) + layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}); - // out_proj - layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {2*n_embd, n_embd}); + layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, 1, d_inner}); + layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}); + + layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}); + + layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}); + layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}); - + // FIXME: maybe no suffix for these + layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, "weight", i), {d_state, d_inner}); + layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, "weight", i), {d_inner}); + + // out_proj + layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}); } - } + } break; default: throw std::runtime_error("unknown architecture"); } @@ -5272,7 +5303,7 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), - n_kv (worst_case ? n_ctx : kv_self.n), + n_kv (worst_case ? kv_self.size : kv_self.n), kv_head (worst_case ? 
n_ctx - n_tokens : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), pooling_type (cparams.pooling_type), @@ -7876,28 +7907,30 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_mamba() { + struct ggml_cgraph * build_mamba(bool use_conv) { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - // d_model - const int64_t n_embd = hparams.n_embd; - const int64_t d_state = 16; - const int64_t d_conv = 4; - // expand = 2 - // d_inner = expand * d_model - const int64_t d_inner = 2 * n_embd; // FIXME: this is wrong + GGML_ASSERT(use_conv == false); // TODO: implement + + const int64_t d_model = hparams.n_embd; + const int64_t d_inner = hparams.n_head; + GGML_ASSERT(2 * d_model == d_inner); + const int64_t d_conv = hparams.n_embd_head_k; + const int64_t d_state = hparams.n_embd_head_v; + const int64_t dt_rank = d_model / 16; struct ggml_tensor * cur; struct ggml_tensor * inpL; - // TODO: give it the right size - struct ggml_tensor * state; - - inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb); + // {n_embd, batch} + inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); for (int il = 0; il < n_layer; ++il) { - // FIXME: init attn_norm + // (ab)using the kv cache to store the state + ggml_tensor * conv_state = kv_self.k_l[il]; // {d_conv, d_inner} + ggml_tensor * ssm_state = kv_self.v_l[il]; // {d_state, d_inner} + // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, @@ -7905,15 +7938,19 @@ struct llm_build_context { // TODO: that's probably the wrong name. cb(cur, "attn_norm", il); + // {n_embd, batch} * {n_embd, 2*d_inner} = {batch, 2*d_inner} + struct ggml_tensor * xz = ggml_mul_mat(ctx0, cur, model.layers[il].ssm_in); + // split the above in two + struct ggml_tensor * x = ggml_view_1d(ctx0, xz, d_inner, 0); + struct ggml_tensor * z = ggml_view_1d(ctx0, xz, d_inner, d_inner); + + // FIXME: figure out when to transpose // conv { - // [] * [] = [2*n_embd] - struct ggml_tensor * xz = ggml_mul_mat(ctx0, cur, model.layers[il].ssm_in); - // split the above in two - struct ggml_tensor * x = ggml_view_1d(ctx0, xz, d_inner, 0); - struct ggml_tensor * z = ggml_view_1d(ctx0, xz, d_inner, d_inner); + // TODO: figure out how to do a row-wise dot product + // TODO: use the kv-cache to store the state + kv_self.k_l[il]; - // FIXME: this is wrong cur = ggml_conv_1d(ctx0, cur, model.layers[il].ssm_conv1d, 1, d_conv - 1, 1); @@ -7925,9 +7962,9 @@ struct llm_build_context { // ssm { - + // TODO: use ggml_soft_plus here - + } // TODO: there's some SiLU again towards the end. Can the `llm_build_ffn` helper be used? 
@@ -8111,6 +8148,10 @@ static struct ggml_cgraph * llama_build_graph( { result = llm.build_starcoder2(); } break; + case LLM_ARCH_MAMBA: + { + result = llm.build_mamba(/* use_conv =*/ batch.n_tokens > 1); + } break; default: GGML_ASSERT(false); } @@ -8366,7 +8407,7 @@ static int llama_decode_internal( // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min(cparams.n_ctx, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); //kv_self.n = llama_kv_cache_cell_max(kv_self); //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); From f680364bd81572ba798a25f78f53f9d61d80d493 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 28 Jan 2024 15:36:42 -0500 Subject: [PATCH 03/41] mamba : recurrent inference almost works, but incoherent --- convert-hf-to-gguf.py | 46 +++++++++++++++- ggml.c | 4 +- llama.cpp | 119 ++++++++++++++++++++++++++++-------------- 3 files changed, 128 insertions(+), 41 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index fab409b084a42..a9e921fdf349b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1851,13 +1851,57 @@ class MambaModel(Model): def set_gguf_parameters(self): d_model = self.hparams["d_model"] self.gguf_writer.add_name(self.dir_model.name) + self.gguf_writer.add_context_length(128) # arbitrary value; it shouldn't be important for Mamba self.gguf_writer.add_embedding_length(d_model) - self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading self.gguf_writer.add_head_count(2 * d_model) # d_inner + self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_layer_norm_rms_eps(1e-5) self.gguf_writer.add_key_length(4) # d_conv self.gguf_writer.add_value_length(16) # d_state self.gguf_writer.add_file_type(self.ftype) + def write_tensors(self): + block_count = self.hparams["n_layer"] + tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + for name, data_torch in self.get_tensors(): + old_dtype = data_torch.dtype + + # convert any unsupported data types to float32 + if data_torch.dtype not in (torch.float16, torch.float32): + data_torch = data_torch.to(torch.float32) + + # map tensor names + new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias")) + if new_name is None: + print(f"Can not map tensor {name!r}") + sys.exit() + + if name.endswith(".A_log"): + print("A_log --> A ==> " + new_name) + data_torch = -torch.exp(data_torch) + + data = data_torch.squeeze().numpy() + + n_dims = len(data.shape) + data_dtype = data.dtype + + # if f32 desired, convert any float16 to float32 + if self.ftype == 0 and data_dtype == np.float16: + data = data.astype(np.float32) + + # TODO: Why cant we use these float16 as-is? 
There should be not reason to store float16 as float32 + if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1: + data = data.astype(np.float32) + + # if f16 desired, convert big float32 2-dim weight tensors to float16 + if self.ftype == 1 and data_dtype == np.float32 and new_name.removesuffix(".weight").endswith((".ssm_in", ".ssm_out", "token_embd", "output")) and n_dims == 2: + data = data.astype(np.float16) + + print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}") + + self.gguf_writer.add_tensor(new_name, data) + ###### CONVERSION LOGIC ###### diff --git a/ggml.c b/ggml.c index 597bf319ed15a..e760a1c781a4e 100644 --- a/ggml.c +++ b/ggml.c @@ -5331,7 +5331,7 @@ struct ggml_tensor * ggml_soft_max_back_inplace( // ggml_soft_plus -struct ggml_tensor * ggml_soft_plus_impl( +static struct ggml_tensor * ggml_soft_plus_impl( struct ggml_context * ctx, struct ggml_tensor * a, bool inplace) { @@ -15737,7 +15737,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm case GGML_OP_SOFT_PLUS: { ggml_compute_forward_soft_plus(params, tensor->src[0], tensor); - } + } break; case GGML_OP_ROPE: { ggml_compute_forward_rope(params, tensor); diff --git a/llama.cpp b/llama.cpp index c46f669e36125..23848e16adfea 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1765,7 +1765,6 @@ struct llama_layer { struct ggml_tensor * ffn_up_b; // b3 struct ggml_tensor * ffn_act; - // mamba proj struct ggml_tensor * ssm_in; struct ggml_tensor * ssm_x; @@ -3435,6 +3434,7 @@ static void llm_load_hparams( } break; case LLM_ARCH_MAMBA: { + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); switch (hparams.n_layer) { case 24: switch (hparams.n_embd) { @@ -3455,7 +3455,7 @@ static void llm_load_hparams( } break; default: model.type = e_model::MODEL_UNKNOWN; } - } + } break; default: (void)0; } @@ -3939,7 +3939,10 @@ static bool llm_load_tensors( const int64_t n_vocab_type = hparams.n_vocab_type; const int64_t n_ff = hparams.n_ff; - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + // Mamba uses these in its own way + if (model.arch != LLM_ARCH_MAMBA) { + GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); + } ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); @@ -4678,19 +4681,21 @@ static bool llm_load_tensors( } break; case LLM_ARCH_MAMBA: { - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); - const int64_t d_conv = hparams.n_embd_head_k; const int64_t d_state = hparams.n_embd_head_v; const int64_t d_inner = hparams.n_head; // FIXME: ceiling instead of floor const int64_t dt_rank = n_embd / 16; GGML_ASSERT(2 * n_embd == d_inner); + // round up the vocab size to the next multiple of 8 + const int64_t rounded_vocab = (n_vocab + 7) & -8; + + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, rounded_vocab}); // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, rounded_vocab}); } for (int i = 0; i < n_layer; ++i) { @@ -4707,7 +4712,7 @@ static bool llm_load_tensors( layer.ssm_in = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_IN, "weight", i), {n_embd, 2*d_inner}); - layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, 1, 
d_inner}); + layer.ssm_conv1d = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_CONV1D, "weight", i), {d_conv, d_inner}); layer.ssm_conv1d_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_CONV1D, "bias", i), {d_inner}); layer.ssm_x = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_X, "weight", i), {d_inner, dt_rank + 2*d_state}); @@ -4715,9 +4720,9 @@ static bool llm_load_tensors( layer.ssm_dt = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_DT, "weight", i), {dt_rank, d_inner}); layer.ssm_dt_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_DT, "bias", i), {d_inner}); - // FIXME: maybe no suffix for these - layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, "weight", i), {d_state, d_inner}); - layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, "weight", i), {d_inner}); + // no "weight" suffix for these + layer.ssm_a = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_A, i), {d_state, d_inner}); + layer.ssm_d = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_SSM_D, i), {d_inner}); // out_proj layer.ssm_out = ml.create_tensor(ctx_split, tn(LLM_TENSOR_SSM_OUT, "weight", i), {d_inner, n_embd}); @@ -7907,16 +7912,18 @@ struct llm_build_context { return gf; } - struct ggml_cgraph * build_mamba(bool use_conv) { + struct ggml_cgraph * build_mamba() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + const bool use_conv = batch.n_tokens > 1; GGML_ASSERT(use_conv == false); // TODO: implement - const int64_t d_model = hparams.n_embd; - const int64_t d_inner = hparams.n_head; + // hopefully the compiler does constant folding + const int64_t d_model = n_embd; + const int64_t d_inner = n_head; GGML_ASSERT(2 * d_model == d_inner); - const int64_t d_conv = hparams.n_embd_head_k; - const int64_t d_state = hparams.n_embd_head_v; + const int64_t d_conv = n_embd_head_k; + const int64_t d_state = n_embd_head_v; const int64_t dt_rank = d_model / 16; struct ggml_tensor * cur; @@ -7928,8 +7935,10 @@ struct llm_build_context { for (int il = 0; il < n_layer; ++il) { // (ab)using the kv cache to store the state - ggml_tensor * conv_state = kv_self.k_l[il]; // {d_conv, d_inner} - ggml_tensor * ssm_state = kv_self.v_l[il]; // {d_state, d_inner} + // NOTE: the conv_state is transposed to ease shifting it. + // if you figured out a way to shift it without transposing it like this, go ahead and fix this. 
+ ggml_tensor * conv_state = kv_self.k_l[il]; // {d_inner, d_conv} + ggml_tensor * ssm_state = ggml_reshape_2d(ctx0, kv_self.v_l[il], d_state, d_inner); // norm cur = llm_build_norm(ctx0, inpL, hparams, @@ -7941,36 +7950,73 @@ struct llm_build_context { // {n_embd, batch} * {n_embd, 2*d_inner} = {batch, 2*d_inner} struct ggml_tensor * xz = ggml_mul_mat(ctx0, cur, model.layers[il].ssm_in); // split the above in two + // assuming it's contiguous + // FIXME: handle batches of more than 1 token struct ggml_tensor * x = ggml_view_1d(ctx0, xz, d_inner, 0); - struct ggml_tensor * z = ggml_view_1d(ctx0, xz, d_inner, d_inner); + struct ggml_tensor * z = ggml_view_1d(ctx0, xz, d_inner, ggml_element_size(xz)*d_inner); + + cur = x; - // FIXME: figure out when to transpose // conv { - // TODO: figure out how to do a row-wise dot product - // TODO: use the kv-cache to store the state - kv_self.k_l[il]; + // shift conv state left + conv_state = ggml_set_1d(ctx0, conv_state, ggml_view_1d(ctx0, conv_state, (d_conv - 1)*d_inner, ggml_element_size(conv_state)*d_inner), 0); + + // update last column + conv_state = ggml_set_1d(ctx0, conv_state, x, ggml_element_size(conv_state)*(d_conv - 1)*d_inner); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state, ggml_view_tensor(ctx0, kv_self.k_l[il]))); - // FIXME: this is wrong - cur = ggml_conv_1d(ctx0, cur, model.layers[il].ssm_conv1d, 1, d_conv - 1, 1); + // rearrange and sum + conv_state = ggml_reshape_2d(ctx0, conv_state, d_inner, d_conv); + // TODO: find a way to directly shift a 2d conv_state, avoiding the need to transpose here. + conv_state = ggml_cont(ctx0, ggml_transpose(ctx0, conv_state)); - cur = ggml_add(ctx0, cur, model.layers[il].ssm_conv1d_b); + // --> {1, d_inner} + x = ggml_sum_rows(ctx0, ggml_mul(ctx0, conv_state, model.layers[il].ssm_conv1d)); + x = ggml_transpose(ctx0, x); - // TODO: there's some SiLU in there (but no ffn? or is the conv an ffn?) - cur = ggml_silu(ctx0, cur); + // bias + x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); + + x = ggml_silu(ctx0, x); } // ssm { + // {2*n_embd, batch} * {2*n_embd, dt_rank + 2*d_state} = {batch, dt_rank + 2*d_state} + struct ggml_tensor * x_db = ggml_mul_mat(ctx0, x, model.layers[il].ssm_x); + // FIXME: handle batches of more than 1 token + struct ggml_tensor * dt = ggml_view_1d(ctx0, x_db, dt_rank, 0); + struct ggml_tensor * B = ggml_view_1d(ctx0, x_db, d_state, ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_1d(ctx0, x_db, d_state, ggml_element_size(x_db)*(dt_rank+d_state)); - // TODO: use ggml_soft_plus here + // {dt_rank} * {dt_rank, d_inner} = {1, d_inner} + dt = ggml_mul_mat(ctx0, dt, model.layers[il].ssm_dt); + dt = ggml_add(ctx0, dt, ggml_transpose(ctx0, model.layers[il].ssm_dt_b)); + dt = ggml_soft_plus(ctx0, dt); - } + // => {d_state, d_inner} + struct ggml_tensor * dA = ggml_exp(ctx0, ggml_mul(ctx0, model.layers[il].ssm_a, dt)); - // TODO: there's some SiLU again towards the end. Can the `llm_build_ffn` helper be used? - // Maybe the best way is to implement it, _then_ check if that helper would do the same thing. 
- // discretize - { + // => {d_state, d_inner} + struct ggml_tensor * dB = ggml_out_prod(ctx0, B, ggml_transpose(ctx0, dt)); + + // => {d_state, d_inner} + cur = ggml_mul(ctx0, dB, ggml_transpose(ctx0, x)); + + ssm_state = ggml_add(ctx0, ggml_mul(ctx0, ssm_state, dA), cur); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_state, ggml_view_tensor(ctx0, kv_self.v_l[il]))); + + // row-wise dot product ("dn,n->d") + // {d_state, d_inner} * {d_state} => {d_inner, 1} + struct ggml_tensor * y = ggml_mul_mat(ctx0, ssm_state, C); + y = ggml_add(ctx0, y, ggml_mul(ctx0, model.layers[il].ssm_d, x)); + y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); + + // {d_inner, n_embd} * {d_inner, 1} = {n_embd, 1} + cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y); } // residual @@ -7981,11 +8027,8 @@ struct llm_build_context { inpL = cur; } - // the last step of each layer already makes these equivalent - // cur = inpL; - // final rmsnorm - cur = llm_build_norm(ctx0, cur, hparams, + cur = llm_build_norm(ctx0, inpL, hparams, model.output_norm, NULL, LLM_NORM_RMS, cb, -1); cb(cur, "result_norm", -1); @@ -8150,7 +8193,7 @@ static struct ggml_cgraph * llama_build_graph( } break; case LLM_ARCH_MAMBA: { - result = llm.build_mamba(/* use_conv =*/ batch.n_tokens > 1); + result = llm.build_mamba(); } break; default: GGML_ASSERT(false); From 54d3e48601bcc4d82f0c8ed5916f7b87b2ca2142 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 28 Jan 2024 16:20:03 -0500 Subject: [PATCH 04/41] mamba : recurrent inference WORKS!!! --- ggml.c | 4 +--- llama.cpp | 1 - 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/ggml.c b/ggml.c index e760a1c781a4e..dcc70d0109a69 100644 --- a/ggml.c +++ b/ggml.c @@ -5336,8 +5336,6 @@ static struct ggml_tensor * ggml_soft_plus_impl( struct ggml_tensor * a, bool inplace) { - // TODO: does `a` need to be contiguous? - bool is_node = false; if (a->grad) { @@ -12190,7 +12188,7 @@ static void ggml_compute_forward_soft_plus_f32( float * x = (float *) ((char *) dst->data + i*( dst->nb[1])); float * y = (float *) ((char *) src0->data + i*(src0->nb[1])); for (int j = 0; j < nc; ++j) { - x[j] = logf(1.0f + expf(y[i])); + x[j] = logf(1.0f + expf(y[j])); } } } diff --git a/llama.cpp b/llama.cpp index 23848e16adfea..fceee631776e5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7944,7 +7944,6 @@ struct llm_build_context { cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, LLM_NORM_RMS, cb, il); - // TODO: that's probably the wrong name. 
cb(cur, "attn_norm", il); // {n_embd, batch} * {n_embd, 2*d_inner} = {batch, 2*d_inner} From 74eea856bf07f1ca075a8bac88c93728d3267687 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 29 Jan 2024 08:27:09 -0500 Subject: [PATCH 05/41] convert : optionally use d_conv and d_state from config.json for Mamba --- convert-hf-to-gguf.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index a9e921fdf349b..8e20600f84c73 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1850,15 +1850,19 @@ class MambaModel(Model): def set_gguf_parameters(self): d_model = self.hparams["d_model"] + d_inner = self.hparams.get("d_inner", 2 * d_model) + # Fail early for models which don't have a block expansion factor of 2 + assert d_inner == 2 * d_model + self.gguf_writer.add_name(self.dir_model.name) - self.gguf_writer.add_context_length(128) # arbitrary value; it shouldn't be important for Mamba + self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(2 * d_model) # d_inner + self.gguf_writer.add_head_count(d_inner) self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_layer_norm_rms_eps(1e-5) - self.gguf_writer.add_key_length(4) # d_conv - self.gguf_writer.add_value_length(16) # d_state + self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) + self.gguf_writer.add_key_length(self.hparams.get("d_conv", 4)) + self.gguf_writer.add_value_length(self.hparams.get("d_state", 16)) self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): From 9e77061a3b4d719ecabe68a930aa102f478a34fd Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 29 Jan 2024 10:21:19 -0500 Subject: [PATCH 06/41] mamba : refactor recurrent conv, resulting in 20% perf increase It's still slower than I'd like, but I did not really optimize `ggml_exp` yet. I also refactored `ggml_exp` to work with tensors with more than 2 dimensions. --- ggml.c | 17 ++++++++++------- llama.cpp | 28 +++++++++++++--------------- 2 files changed, 23 insertions(+), 22 deletions(-) diff --git a/ggml.c b/ggml.c index dcc70d0109a69..a5b337d966440 100644 --- a/ggml.c +++ b/ggml.c @@ -8678,16 +8678,19 @@ static void ggml_compute_forward_exp_f32( return; } - const int n = ggml_nrows(src0); - const int nc = src0->ne[0]; - GGML_ASSERT( dst->nb[0] == sizeof(float)); GGML_ASSERT(src0->nb[0] == sizeof(float)); - for (int i = 0; i < n; i++) { - ggml_vec_exp_f32(nc, - (float *) ((char *) dst->data + i*( dst->nb[1])), - (float *) ((char *) src0->data + i*(src0->nb[1]))); + GGML_TENSOR_UNARY_OP_LOCALS + + for (int64_t i3 = 0; i3 < ne03; i3++) { + for (int64_t i2 = 0; i2 < ne02; i2++) { + for (int64_t i1 = 0; i1 < ne01; i1++) { + float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); + float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); + ggml_vec_exp_f32(ne00, dst_row, src_row); + } + } } } diff --git a/llama.cpp b/llama.cpp index fceee631776e5..048bd8e509029 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7929,15 +7929,14 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; + // NOTE: not sure what's the difference between the sequence length and the batch size in the paper. 
// {n_embd, batch} inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); for (int il = 0; il < n_layer; ++il) { // (ab)using the kv cache to store the state - // NOTE: the conv_state is transposed to ease shifting it. - // if you figured out a way to shift it without transposing it like this, go ahead and fix this. - ggml_tensor * conv_state = kv_self.k_l[il]; // {d_inner, d_conv} + ggml_tensor * conv_state = ggml_reshape_2d(ctx0, kv_self.k_l[il], d_conv, d_inner); ggml_tensor * ssm_state = ggml_reshape_2d(ctx0, kv_self.v_l[il], d_state, d_inner); // norm @@ -7946,33 +7945,32 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); - // {n_embd, batch} * {n_embd, 2*d_inner} = {batch, 2*d_inner} - struct ggml_tensor * xz = ggml_mul_mat(ctx0, cur, model.layers[il].ssm_in); + // {n_embd, 2*d_inner} * {n_embd, batch} = {2*d_inner, batch} + struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur); // split the above in two // assuming it's contiguous - // FIXME: handle batches of more than 1 token - struct ggml_tensor * x = ggml_view_1d(ctx0, xz, d_inner, 0); - struct ggml_tensor * z = ggml_view_1d(ctx0, xz, d_inner, ggml_element_size(xz)*d_inner); + // {d_inner, batch} + struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); + struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner); cur = x; // conv { // shift conv state left - conv_state = ggml_set_1d(ctx0, conv_state, ggml_view_1d(ctx0, conv_state, (d_conv - 1)*d_inner, ggml_element_size(conv_state)*d_inner), 0); + conv_state = ggml_set_2d(ctx0, conv_state, ggml_view_2d(ctx0, conv_state, (d_conv - 1), d_inner, conv_state->nb[1], ggml_element_size(conv_state)*1), conv_state->nb[1], 0); // update last column - conv_state = ggml_set_1d(ctx0, conv_state, x, ggml_element_size(conv_state)*(d_conv - 1)*d_inner); + // x here is {d_inner, 1} (a row), but should be {1, d_inner} (a column) + conv_state = ggml_set_2d(ctx0, conv_state, ggml_cont(ctx0, ggml_transpose(ctx0, x)), conv_state->nb[1], ggml_element_size(conv_state)*(d_conv - 1)); ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state, ggml_view_tensor(ctx0, kv_self.k_l[il]))); // rearrange and sum - conv_state = ggml_reshape_2d(ctx0, conv_state, d_inner, d_conv); - // TODO: find a way to directly shift a 2d conv_state, avoiding the need to transpose here. - conv_state = ggml_cont(ctx0, ggml_transpose(ctx0, conv_state)); - - // --> {1, d_inner} + // no need to rearrange the conv_state, since it's already in the right shape + // => {1, d_inner} x = ggml_sum_rows(ctx0, ggml_mul(ctx0, conv_state, model.layers[il].ssm_conv1d)); + // => {d_inner, 1} x = ggml_transpose(ctx0, x); // bias From 3f7233b62e04056ff8d59e8f6dc816b292ec3bf0 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 29 Jan 2024 13:33:27 -0500 Subject: [PATCH 07/41] ggml : parallelize ggml_exp This results in 8% faster token generation for Mamba-130M. 
--- ggml.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/ggml.c b/ggml.c index a5b337d966440..8f351d823b730 100644 --- a/ggml.c +++ b/ggml.c @@ -8671,27 +8671,32 @@ static void ggml_compute_forward_exp_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); + GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); GGML_ASSERT(ggml_are_same_shape(src0, dst)); if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); + const int ith = params->ith; + const int nth = params->nth; - GGML_TENSOR_UNARY_OP_LOCALS + const int nc = src0->ne[0]; + const int nr = ggml_nrows(src0); - for (int64_t i3 = 0; i3 < ne03; i3++) { - for (int64_t i2 = 0; i2 < ne02; i2++) { - for (int64_t i1 = 0; i1 < ne01; i1++) { - float * src_row = (float *) ((char *) src0->data + i1*nb01 + i2*nb02 + i3*nb03); - float * dst_row = (float *) ((char *) dst->data + i1*nb1 + i2*nb2 + i3*nb3); - ggml_vec_exp_f32(ne00, dst_row, src_row); - } - } - } + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + + for (int i1 = ir0; i1 < ir1; i1++) { + ggml_vec_exp_f32(nc, + (float *) ((char *) dst->data + i1*( dst->nb[1])), + (float *) ((char *) src0->data + i1*(src0->nb[1]))); + }; } static void ggml_compute_forward_exp( @@ -17413,13 +17418,13 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_ACC: + case GGML_OP_EXP: { n_tasks = n_threads; } break; case GGML_OP_SUB: case GGML_OP_SQR: case GGML_OP_SQRT: - case GGML_OP_EXP: case GGML_OP_LOG: case GGML_OP_SUM: case GGML_OP_SUM_ROWS: From e9cc45ecae696e8f1fa15b8f355b9c2e1f984f80 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 30 Jan 2024 21:48:04 -0500 Subject: [PATCH 08/41] mamba : simplify the conv step with a self-overlapping view Turns out the conv_state can be made smaller by one column. Note that this breaks existing GGUFs of Mamba, because the key_value_length field is tied to the conv_state size. Convolution with a self-overlapping view is cool! And it's much simpler than what I initially thought would be necessary to make the convolution step work with more than 1 token at a time. Next step is to make the SSM step work on batches of tokens too, and thus I need to figure out a way to make a parallel selective scan which will keep the ssm_state small and won't make it bigger by a factor of (n_layer * batch_size). * llama : fix Mamba KV self size wrongly displaying as f16 instead of f32 Relatedly, I also tried to see if other types than f32 worked for the states, but they don't, because of the operators used. It's probably better anyway to keep lots of precision there, since the states are small anyway. 
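To illustrate what the self-overlapping view ends up computing for a single channel, here is a standalone C sketch (made-up sizes and weights, plain arrays instead of ggml tensors; not the actual graph code): the channel's row in conv_x holds the (d_conv - 1) previous conv_state columns followed by the new tokens, and each token's window starts one element after the previous one.

    #include <stdio.h>

    #define D_CONV 4
    #define N_TOK  3

    int main(void) {
        // one channel's row in conv_x: (d_conv - 1) conv_state columns, then the new tokens
        float row[D_CONV - 1 + N_TOK] = { 0.1f, 0.2f, 0.3f,   // previous conv_state
                                          1.0f, 2.0f, 3.0f }; // x for this batch
        float w[D_CONV] = { 0.25f, 0.25f, 0.25f, 0.25f };     // this channel's conv1d weights

        for (int t = 0; t < N_TOK; t++) {
            float y = 0.0f;
            for (int k = 0; k < D_CONV; k++) {
                y += w[k] * row[t + k]; // window t overlaps window t-1 by (d_conv - 1) elements
            }
            printf("token %d: y = %f\n", t, y);
        }
        // row[N_TOK] .. row[N_TOK + D_CONV - 2] is the conv_state kept for the next batch
        return 0;
    }
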
--- convert-hf-to-gguf.py | 6 ++- llama.cpp | 104 ++++++++++++++++++++++++------------------ 2 files changed, 63 insertions(+), 47 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 8e20600f84c73..e49b2f4f63519 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1858,10 +1858,12 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(d_inner) + self.gguf_writer.add_head_count(d_inner) # the number of rows in conv_state and ssm_state self.gguf_writer.add_block_count(self.hparams["n_layer"]) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) - self.gguf_writer.add_key_length(self.hparams.get("d_conv", 4)) + # NOTE: (ab)using the KV cache metadata to store dimensions for conv_state and ssm_state + # Since the first column of the conv_state is shifted out each time, it's not actually needed + self.gguf_writer.add_key_length(self.hparams.get("d_conv", 4) - 1) self.gguf_writer.add_value_length(self.hparams.get("d_state", 16)) self.gguf_writer.add_file_type(self.ftype) diff --git a/llama.cpp b/llama.cpp index 048bd8e509029..cf41d69beed7a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2069,9 +2069,6 @@ static bool llama_kv_cache_init( if (model.arch == LLM_ARCH_MAMBA) { // only one slot is needed for Mamba n_ctx = 1; - // it's probably best to keep as much precision as possible for the states - ktype = GGML_TYPE_F32; - vtype = GGML_TYPE_F32; } cache.has_shift = false; @@ -4681,7 +4678,7 @@ static bool llm_load_tensors( } break; case LLM_ARCH_MAMBA: { - const int64_t d_conv = hparams.n_embd_head_k; + const int64_t d_conv = hparams.n_embd_head_k + 1; const int64_t d_state = hparams.n_embd_head_v; const int64_t d_inner = hparams.n_head; // FIXME: ceiling instead of floor @@ -7915,28 +7912,27 @@ struct llm_build_context { struct ggml_cgraph * build_mamba() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - const bool use_conv = batch.n_tokens > 1; - GGML_ASSERT(use_conv == false); // TODO: implement + const int32_t n_tok = batch.n_tokens; // hopefully the compiler does constant folding const int64_t d_model = n_embd; const int64_t d_inner = n_head; GGML_ASSERT(2 * d_model == d_inner); - const int64_t d_conv = n_embd_head_k; + const int64_t d_conv = n_embd_head_k + 1; const int64_t d_state = n_embd_head_v; const int64_t dt_rank = d_model / 16; struct ggml_tensor * cur; struct ggml_tensor * inpL; - // NOTE: not sure what's the difference between the sequence length and the batch size in the paper. 
- // {n_embd, batch} + // {n_embd, n_tok} inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); for (int il = 0; il < n_layer; ++il) { // (ab)using the kv cache to store the state - ggml_tensor * conv_state = ggml_reshape_2d(ctx0, kv_self.k_l[il], d_conv, d_inner); + // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed + ggml_tensor * conv_state = ggml_reshape_2d(ctx0, kv_self.k_l[il], d_conv - 1, d_inner); ggml_tensor * ssm_state = ggml_reshape_2d(ctx0, kv_self.v_l[il], d_state, d_inner); // norm @@ -7945,33 +7941,43 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); cb(cur, "attn_norm", il); - // {n_embd, 2*d_inner} * {n_embd, batch} = {2*d_inner, batch} + // {n_embd, 2*d_inner} * {n_embd, n_tok} => {2*d_inner, n_tok} struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur); // split the above in two - // assuming it's contiguous - // {d_inner, batch} + // => {d_inner, n_tok} struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner); - cur = x; - // conv { - // shift conv state left - conv_state = ggml_set_2d(ctx0, conv_state, ggml_view_2d(ctx0, conv_state, (d_conv - 1), d_inner, conv_state->nb[1], ggml_element_size(conv_state)*1), conv_state->nb[1], 0); - - // update last column - // x here is {d_inner, 1} (a row), but should be {1, d_inner} (a column) - conv_state = ggml_set_2d(ctx0, conv_state, ggml_cont(ctx0, ggml_transpose(ctx0, x)), conv_state->nb[1], ggml_element_size(conv_state)*(d_conv - 1)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_state, ggml_view_tensor(ctx0, kv_self.k_l[il]))); - - // rearrange and sum - // no need to rearrange the conv_state, since it's already in the right shape - // => {1, d_inner} - x = ggml_sum_rows(ctx0, ggml_mul(ctx0, conv_state, model.layers[il].ssm_conv1d)); - // => {d_inner, 1} - x = ggml_transpose(ctx0, x); + // concat last (d_conv - 1) columns of conv_state, and x + + // The following tensor is too big in order to avoid an assertion error when making an overlapping view. + // TODO: in ggml_new_tensor_impl, handle overlapping data range in data size calculation + // This could then be a tensor with ne[] = {(d_conv-1)+n_tok, d_inner} + // which is around (d_conv-1) times as small as its current size. 
+ struct ggml_tensor * conv_x = ggml_new_tensor_1d(ctx0, conv_state->type, d_conv*d_inner*n_tok); + const size_t conv_x_nb1 = (d_conv - 1 + n_tok) * ggml_element_size(conv_x); + + conv_x = ggml_set_2d(ctx0, conv_x, conv_state, conv_x_nb1, 0); + // unfortunately, making x contiguous is necessary because ggml_set expects nb0 == sizeof(float) + conv_x = ggml_set_2d(ctx0, conv_x, ggml_cont(ctx0, ggml_transpose(ctx0, x)), conv_x_nb1, (d_conv - 1)*ggml_element_size(conv_x)); + + // store last (d_conv - 1) columns of conv_x back into the KV cache for the next conv_state + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_2d(ctx0, conv_x, d_conv - 1, d_inner, conv_x_nb1, n_tok*ggml_element_size(conv_x)), + ggml_view_tensor(ctx0, kv_self.k_l[il]))); + + // prepare convolution for all tokens in the batch with a self-overlapping view + // {(d_conv-1)+n_tok, d_inner} => {d_conv, d_inner, n_tok} + conv_x = ggml_view_3d(ctx0, conv_x, d_conv, d_inner, n_tok, conv_x_nb1, -(d_conv - 1)*d_inner*ggml_element_size(conv_x), 0); + + // perform convolution + // => {1, d_inner, n_tok} + x = ggml_sum_rows(ctx0, ggml_mul(ctx0, conv_x, model.layers[il].ssm_conv1d)); + // => {d_inner, n_tok, 1} + x = ggml_permute(ctx0, x, 2, 0, 1, 3); // bias x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); @@ -7981,23 +7987,24 @@ struct llm_build_context { // ssm { - // {2*n_embd, batch} * {2*n_embd, dt_rank + 2*d_state} = {batch, dt_rank + 2*d_state} - struct ggml_tensor * x_db = ggml_mul_mat(ctx0, x, model.layers[il].ssm_x); - // FIXME: handle batches of more than 1 token - struct ggml_tensor * dt = ggml_view_1d(ctx0, x_db, dt_rank, 0); - struct ggml_tensor * B = ggml_view_1d(ctx0, x_db, d_state, ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_1d(ctx0, x_db, d_state, ggml_element_size(x_db)*(dt_rank+d_state)); - - // {dt_rank} * {dt_rank, d_inner} = {1, d_inner} - dt = ggml_mul_mat(ctx0, dt, model.layers[il].ssm_dt); - dt = ggml_add(ctx0, dt, ggml_transpose(ctx0, model.layers[il].ssm_dt_b)); + // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tok} => {dt_rank + 2*d_state, n_tok} + struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); + // split + struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, x_db->ne[1], x_db->nb[1], 0); + struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, x_db->ne[1], x_db->nb[1], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, x_db->ne[1], x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); + + // {dt_rank, d_inner} * {dt_rank, n_tok} => {d_inner, n_tok} + dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); + dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); dt = ggml_soft_plus(ctx0, dt); + // FIXME: support batches with more than 1 token // => {d_state, d_inner} - struct ggml_tensor * dA = ggml_exp(ctx0, ggml_mul(ctx0, model.layers[il].ssm_a, dt)); + struct ggml_tensor * dA = ggml_exp(ctx0, ggml_mul(ctx0, model.layers[il].ssm_a, ggml_transpose(ctx0, dt))); // => {d_state, d_inner} - struct ggml_tensor * dB = ggml_out_prod(ctx0, B, ggml_transpose(ctx0, dt)); + struct ggml_tensor * dB = ggml_out_prod(ctx0, B, dt); // => {d_state, d_inner} cur = ggml_mul(ctx0, dB, ggml_transpose(ctx0, x)); @@ -8012,7 +8019,7 @@ struct llm_build_context { y = ggml_add(ctx0, y, ggml_mul(ctx0, model.layers[il].ssm_d, x)); y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); - // {d_inner, n_embd} * {d_inner, 1} = {n_embd, 1} + // {d_inner, n_embd} * {d_inner, 1} => {n_embd, 1} cur = ggml_mul_mat(ctx0, 
model.layers[il].ssm_out, y); } @@ -12327,8 +12334,15 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; - const ggml_type type_k = params.type_k; - const ggml_type type_v = params.type_v; + ggml_type type_k = params.type_k; + ggml_type type_v = params.type_v; + + // Mamba (mis)uses the KV cache to store its states + if (model->arch == LLM_ARCH_MAMBA) { + // it's probably best to keep as much precision as possible for the states + type_k = GGML_TYPE_F32; // required by ggml_set for Mamba's conv_state + type_v = GGML_TYPE_F32; // required by ggml_mul for Mamba's ssm_state + } GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0); From 81b57bb37599fea3a2c2806ac37d4fcf39bc5383 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 31 Jan 2024 08:47:53 -0500 Subject: [PATCH 09/41] mamba : fix self-overlapping view depth stride --- llama.cpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/llama.cpp b/llama.cpp index cf41d69beed7a..f064969d25d59 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7960,7 +7960,7 @@ struct llm_build_context { const size_t conv_x_nb1 = (d_conv - 1 + n_tok) * ggml_element_size(conv_x); conv_x = ggml_set_2d(ctx0, conv_x, conv_state, conv_x_nb1, 0); - // unfortunately, making x contiguous is necessary because ggml_set expects nb0 == sizeof(float) + // making x contiguous is necessary because ggml_set expects it conv_x = ggml_set_2d(ctx0, conv_x, ggml_cont(ctx0, ggml_transpose(ctx0, x)), conv_x_nb1, (d_conv - 1)*ggml_element_size(conv_x)); // store last (d_conv - 1) columns of conv_x back into the KV cache for the next conv_state @@ -7969,9 +7969,10 @@ struct llm_build_context { ggml_view_2d(ctx0, conv_x, d_conv - 1, d_inner, conv_x_nb1, n_tok*ggml_element_size(conv_x)), ggml_view_tensor(ctx0, kv_self.k_l[il]))); - // prepare convolution for all tokens in the batch with a self-overlapping view + // prepare convolution for all tokens in the batch with a self-overlapping view, + // shifting by one column each ... depth? ... with a window of d_conv columns. // {(d_conv-1)+n_tok, d_inner} => {d_conv, d_inner, n_tok} - conv_x = ggml_view_3d(ctx0, conv_x, d_conv, d_inner, n_tok, conv_x_nb1, -(d_conv - 1)*d_inner*ggml_element_size(conv_x), 0); + conv_x = ggml_view_3d(ctx0, conv_x, d_conv, d_inner, n_tok, conv_x_nb1, 1*ggml_element_size(conv_x), 0); // perform convolution // => {1, d_inner, n_tok} From ffc116f5ec225c142ae0d96c649700d810318e80 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 31 Jan 2024 20:45:04 -0500 Subject: [PATCH 10/41] mamba : handle batches of more than 1 token This means running Mamba no longer crashes when using the default settings! And probably also slightly faster prompt processing. Both batched and non-batched processing yield the same output. Previously, the state was not cleared when starting a sequence. Next step is to make the KV cache API work as expected for Mamba models. * ggml: add ggml_ssm_scan to help with parallel selective scan If the selective scan was implemented without a custom operator, there would be waaay too many nodes in the graph. For example, for Mamba-130M, with a batch size of 512 (the default), a naive selective scan could add at least 24*512=12288 nodes, which is more than LLAMA_MAX_NODES (8192), and that's only for the smallest Mamba model. So it's much cleaner with a custom operator. Not sure about the name, though. 
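For reference, here is a standalone sketch (not part of the patch) of the recurrence that ggml_ssm_scan evaluates inside a single graph node; the function name and the flat float layout are illustrative only, while the real operator walks ggml_tensor strides:

    #include <stddef.h>

    // s[t] = s[t-1] * dA[t] + dB_x[t], element-wise over the d_state*d_inner state,
    // iterated over the n_tok tokens of the batch.
    static void ssm_scan_ref(float * dst,        // out: {d_state*d_inner, n_tok}
                             const float * s0,   // in:  {d_state*d_inner} initial state
                             const float * dA,   // in:  {d_state*d_inner, n_tok}
                             const float * dB_x, // in:  {d_state*d_inner, n_tok}
                             size_t n_el, size_t n_tok) {
        for (size_t t = 0; t < n_tok; ++t) {
            const float * prev = (t == 0) ? s0 : dst + (t - 1)*n_el;
            for (size_t i = 0; i < n_el; ++i) {
                dst[t*n_el + i] = prev[i]*dA[t*n_el + i] + dB_x[t*n_el + i];
            }
        }
    }

Expressed with existing ggml ops, each step of that loop would cost at least one graph node per token per layer, which is where the 24*512 estimate above comes from.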
--- ggml.c | 124 +++++++++++++++++++++++++++++++++++++++++++++++++++++- ggml.h | 7 +++ llama.cpp | 86 +++++++++++++++++++++++++++++-------- 3 files changed, 197 insertions(+), 20 deletions(-) diff --git a/ggml.c b/ggml.c index 8f351d823b730..177d8b3c34ecb 100644 --- a/ggml.c +++ b/ggml.c @@ -1831,6 +1831,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", + "SSM_SCAN", "WIN_PART", "WIN_UNPART", "GET_REL_POS", @@ -1853,7 +1854,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); +static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1919,6 +1920,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", + "ssm_scan(x)", "win_part(x)", "win_unpart(x)", "get_rel_pos(x)", @@ -1941,7 +1943,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); +static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -6149,6 +6151,40 @@ struct ggml_tensor * ggml_flash_attn_back( return result; } +// ggml_ssm_scan + +struct ggml_tensor * ggml_ssm_scan( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * dA, + struct ggml_tensor * dB_x) { + GGML_ASSERT(ggml_are_same_shape(dA, dB_x)); + + GGML_ASSERT( s->nb[0] == ggml_type_size( s->type)); + GGML_ASSERT( dA->nb[0] == ggml_type_size( dA->type)); + GGML_ASSERT(dB_x->nb[0] == ggml_type_size(dB_x->type)); + + GGML_ASSERT(s->ne[0] == dA->ne[0]); + GGML_ASSERT(s->ne[1] == dA->ne[1]); + GGML_ASSERT(s->ne[2] == 1 && s->ne[3] == 1); // the ssm_state should be 2D + + bool is_node = false; + + if (s->grad || dA->grad || dB_x->grad) { + is_node = true; + } + + struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, dA->ne[0], dA->ne[1], dA->ne[2]); + + result->op = GGML_OP_SSM_SCAN; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = s; + result->src[1] = dA; + result->src[2] = dB_x; + + return result; +} + // ggml_win_part struct ggml_tensor * ggml_win_part( @@ -14755,6 +14791,78 @@ static void ggml_compute_forward_flash_attn_back( } } +// ggml_compute_forward_ssm_scan + +static void ggml_compute_forward_ssm_scan_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int64_t nc = src1->ne[0]; + const int64_t n_b = src1->ne[2]; // number of batches + const int64_t nr0 = ggml_nrows(src0); + + GGML_ASSERT(nc*n_b*nr0 == ggml_nelements(src1)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + GGML_ASSERT(src2->nb[0] == sizeof(float)); + + // rows per thread + const int dr = (nr0 + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr0); + + // first batch + for (int i1 = ir0; i1 < ir1; i1++) { + float * dest = (float *) ((char *) dst->data + i1*( dst->nb[1])); + float * s = (float *) ((char *) src0->data + i1*(src0->nb[1])); + float * dA = (float *) ((char *) src1->data + i1*(src1->nb[1])); + float * dB_x = (float *) ((char *) src2->data + i1*(src2->nb[1])); + ggml_vec_mul_f32(nc, dest, s, dA); + ggml_vec_add_f32(nc, dest, dest, dB_x); + } + + // rest of batches, state comes from dest + for (int i2 = 1; i2 < n_b; i2++) { + for (int i1 = ir0; i1 < ir1; i1++) { + float * dest = (float *) ((char *) dst->data + i1*( dst->nb[1]) + i2 *( dst->nb[2])); + float * s = (float *) ((char *) dst->data + i1*( dst->nb[1]) + (i2-1)*( dst->nb[2])); + float * dA = (float *) ((char *) src1->data + i1*(src1->nb[1]) + i2 *(src1->nb[2])); + float * dB_x = (float *) ((char *) src2->data + i1*(src2->nb[1]) + i2 *(src2->nb[2])); + ggml_vec_mul_f32(nc, dest, s, dA); + ggml_vec_add_f32(nc, dest, dest, dB_x); + } + } +} + +static void ggml_compute_forward_ssm_scan( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_ssm_scan_f32(params, src0, src1, src2, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_win_part static void ggml_compute_forward_win_part_f32( @@ -15814,6 +15922,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm bool masked = t != 0; ggml_compute_forward_flash_attn_back(params, masked, tensor); } break; + case GGML_OP_SSM_SCAN: + { + ggml_compute_forward_ssm_scan(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + } break; case GGML_OP_WIN_PART: { ggml_compute_forward_win_part(params, tensor); @@ -16868,6 +16980,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // not supported } break; + case GGML_OP_SSM_SCAN: + { + GGML_ASSERT(false); // TODO: not implemented + } break; case GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_UNARY: @@ -17570,6 +17686,10 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; + case GGML_OP_SSM_SCAN: + { + n_tasks = n_threads; + } break; case 
GGML_OP_WIN_PART: case GGML_OP_WIN_UNPART: case GGML_OP_GET_REL_POS: diff --git a/ggml.h b/ggml.h index efb62c5983f44..0a40f87625666 100644 --- a/ggml.h +++ b/ggml.h @@ -462,6 +462,7 @@ extern "C" { GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, GGML_OP_FLASH_ATTN_BACK, + GGML_OP_SSM_SCAN, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, GGML_OP_GET_REL_POS, @@ -1720,6 +1721,12 @@ extern "C" { struct ggml_tensor * c0, struct ggml_tensor * c1); + GGML_API struct ggml_tensor * ggml_ssm_scan( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * dA, + struct ggml_tensor * dB_x); + // partition into non-overlapping windows with padding if needed // example: // a: 768 64 64 1 diff --git a/llama.cpp b/llama.cpp index f064969d25d59..bbf16e8f43387 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7935,6 +7935,13 @@ struct llm_build_context { ggml_tensor * conv_state = ggml_reshape_2d(ctx0, kv_self.k_l[il], d_conv - 1, d_inner); ggml_tensor * ssm_state = ggml_reshape_2d(ctx0, kv_self.v_l[il], d_state, d_inner); + // reset the states when starting a new sequence + // TODO: ensure kv_self clearing is handled + if (!batch.pos || batch.pos[0] == 0) { + conv_state = ggml_scale(ctx0, conv_state, 0); + ssm_state = ggml_scale(ctx0, ssm_state, 0); + } + // norm cur = llm_build_norm(ctx0, inpL, hparams, model.layers[il].attn_norm, NULL, @@ -7991,36 +7998,79 @@ struct llm_build_context { // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tok} => {dt_rank + 2*d_state, n_tok} struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); // split - struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, x_db->ne[1], x_db->nb[1], 0); - struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, x_db->ne[1], x_db->nb[1], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, x_db->ne[1], x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); + struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tok, x_db->nb[1], 0); + struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tok, x_db->nb[1], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tok, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); // {dt_rank, d_inner} * {dt_rank, n_tok} => {d_inner, n_tok} dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); dt = ggml_soft_plus(ctx0, dt); - // FIXME: support batches with more than 1 token - // => {d_state, d_inner} - struct ggml_tensor * dA = ggml_exp(ctx0, ggml_mul(ctx0, model.layers[il].ssm_a, ggml_transpose(ctx0, dt))); - - // => {d_state, d_inner} - struct ggml_tensor * dB = ggml_out_prod(ctx0, B, dt); + struct ggml_tensor * dA; + struct ggml_tensor * dB; + if (n_tok == 1) { + // => {d_state, d_inner} + dA = ggml_exp(ctx0, ggml_mul(ctx0, model.layers[il].ssm_a, ggml_transpose(ctx0, dt))); - // => {d_state, d_inner} - cur = ggml_mul(ctx0, dB, ggml_transpose(ctx0, x)); + // {d_state} * {d_inner} => {d_state, d_inner} + dB = ggml_out_prod(ctx0, B, dt); + } else { + // {d_state, d_inner} * {d_inner, n_tok} => {d_state, d_inner, n_tok} * {1, d_inner, n_tok} + // => {d_state, d_inner, n_tok} + // Trying to do the equivalent of + // dA = torch.exp(rearrange(dt, "b d -> b d 1") * A) # (batch, dim, dstate) + struct ggml_tensor * A = model.layers[il].ssm_a; + dA = ggml_exp(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, A, ggml_new_tensor_3d(ctx0, A->type, d_state, d_inner, n_tok)), + // {d_inner, n_tok} => {1, d_inner, n_tok} + ggml_permute(ctx0, dt, 1, 2, 
0, 3)) + ); + + // {d_state, 1, n_tok} * {d_inner, 1, n_tok} => {d_state, d_inner, n_tok} + dB = ggml_out_prod(ctx0, + // {d_state, n_tok} => {d_state, 1, n_tok} + ggml_permute(ctx0, B, 0, 2, 1, 3), + // {d_state, n_tok} => {d_state, 1, n_tok} + ggml_permute(ctx0, dt, 0, 2, 1, 3)); + } - ssm_state = ggml_add(ctx0, ggml_mul(ctx0, ssm_state, dA), cur); + // {d_state, d_inner, n_tok} * {1, d_inner, n_tok} => {d_state, d_inner, n_tok} + cur = ggml_mul(ctx0, dB, ggml_permute(ctx0, x, 1, 2, 0, 3)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_state, ggml_view_tensor(ctx0, kv_self.v_l[il]))); + // The selective scan seems inherently sequential... + // To avoid making (n_layer * n_tok) graph nodes, let's use a custom operator. + // When n_tok == 1, it's equivalent to the following: + // ssm_state = ggml_add(ctx0, ggml_mul(ctx0, ssm_state, dA), cur); + // When n_tok is bigger, it's the same thing, but iterated n_tok times, + // with the correct dA and cur for each token. + // The resulting states are layered on the ne[2] dimension. + // => {d_state, d_inner, n_tok} + ssm_state = ggml_ssm_scan(ctx0, ssm_state, dA, cur); - // row-wise dot product ("dn,n->d") - // {d_state, d_inner} * {d_state} => {d_inner, 1} - struct ggml_tensor * y = ggml_mul_mat(ctx0, ssm_state, C); - y = ggml_add(ctx0, y, ggml_mul(ctx0, model.layers[il].ssm_d, x)); + // only store last state + ggml_build_forward_expand(gf, + ggml_cpy(ctx0, + ggml_view_2d(ctx0, ssm_state, d_state, d_inner, ssm_state->nb[1], (n_tok-1)*ssm_state->nb[2]), + ggml_view_tensor(ctx0, kv_self.v_l[il]))); + + struct ggml_tensor * y; + if (n_tok == 1) { + // row-wise dot product ("dn,n->d") + // {d_state, d_inner} * {d_state, 1} => {d_inner, 1} + y = ggml_mul_mat(ctx0, ssm_state, C); + } else { + // {d_state, d_inner, n_tok} * {d_state, n_tok} => {d_inner, 1, n_tok} + y = ggml_mul_mat(ctx0, ssm_state, ggml_permute(ctx0, C, 0, 2, 1, 3)); + // => {d_inner, n_tok} + y = ggml_permute(ctx0, y, 0, 2, 1, 3); + } + // {d_inner, n_tok} * {d_inner} => {d_inner, n_tok} + y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); - // {d_inner, n_embd} * {d_inner, 1} => {n_embd, 1} + // {d_inner, n_embd} * {d_inner, n_tok} => {n_embd, n_tok} cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y); } From 78a853b7885d2edb48651f7eaec484fa2b6f2697 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 1 Feb 2024 21:16:40 -0500 Subject: [PATCH 11/41] ggml : in ggml_ssm_scan, merge multiple rows in the same vec operation This will help with performance on CPU if ggml_vec_mul_f32 and ggml_vec_add_f32 are ever optimized with SIMD. 
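A minimal standalone illustration of the change (the helpers below are simplified stand-ins for ggml_vec_mul_f32/ggml_vec_add_f32, and the flat layout assumes the rows handled by one thread are contiguous, which the new asserts in this patch enforce):

    static void vec_mul_f32(int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]*y[i]; }
    static void vec_add_f32(int n, float * z, const float * x, const float * y) { for (int i = 0; i < n; ++i) z[i] = x[i]+y[i]; }

    // Before: one pair of vector calls per row of length nc.
    static void scan_step_per_row(int nc, int nr, float * dst, const float * s, const float * dA, const float * dB_x) {
        for (int i1 = 0; i1 < nr; ++i1) {
            vec_mul_f32(nc, dst + i1*nc, s + i1*nc, dA + i1*nc);
            vec_add_f32(nc, dst + i1*nc, dst + i1*nc, dB_x + i1*nc);
        }
    }

    // After: since the nr rows of this thread are contiguous, a single pair of
    // calls covers nc*nr elements, giving a SIMD implementation longer spans.
    static void scan_step_merged(int nc, int nr, float * dst, const float * s, const float * dA, const float * dB_x) {
        vec_mul_f32(nc*nr, dst, s, dA);
        vec_add_f32(nc*nr, dst, dst, dB_x);
    }

Both versions produce identical results; only the number of calls changes.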
--- ggml.c | 45 ++++++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/ggml.c b/ggml.c index 177d8b3c34ecb..b132bec68eaec 100644 --- a/ggml.c +++ b/ggml.c @@ -14807,13 +14807,17 @@ static void ggml_compute_forward_ssm_scan_f32( const int nth = params->nth; const int64_t nc = src1->ne[0]; - const int64_t n_b = src1->ne[2]; // number of batches + const int64_t n_t = src1->ne[2]; // number of tokens in the batch const int64_t nr0 = ggml_nrows(src0); - GGML_ASSERT(nc*n_b*nr0 == ggml_nelements(src1)); + GGML_ASSERT(nc*n_t*nr0 == ggml_nelements(src1)); GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(src1->nb[0] == sizeof(float)); GGML_ASSERT(src2->nb[0] == sizeof(float)); + // allow merging multiple rows in the same vec operation + GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); + GGML_ASSERT(src1->nb[1] == src1->ne[0]*sizeof(float)); + GGML_ASSERT(src2->nb[1] == src2->ne[0]*sizeof(float)); // rows per thread const int dr = (nr0 + nth - 1)/nth; @@ -14821,27 +14825,26 @@ static void ggml_compute_forward_ssm_scan_f32( // row range for this thread const int ir0 = dr*ith; const int ir1 = MIN(ir0 + dr, nr0); + const int ir = ir1 - ir0; // first batch - for (int i1 = ir0; i1 < ir1; i1++) { - float * dest = (float *) ((char *) dst->data + i1*( dst->nb[1])); - float * s = (float *) ((char *) src0->data + i1*(src0->nb[1])); - float * dA = (float *) ((char *) src1->data + i1*(src1->nb[1])); - float * dB_x = (float *) ((char *) src2->data + i1*(src2->nb[1])); - ggml_vec_mul_f32(nc, dest, s, dA); - ggml_vec_add_f32(nc, dest, dest, dB_x); - } - - // rest of batches, state comes from dest - for (int i2 = 1; i2 < n_b; i2++) { - for (int i1 = ir0; i1 < ir1; i1++) { - float * dest = (float *) ((char *) dst->data + i1*( dst->nb[1]) + i2 *( dst->nb[2])); - float * s = (float *) ((char *) dst->data + i1*( dst->nb[1]) + (i2-1)*( dst->nb[2])); - float * dA = (float *) ((char *) src1->data + i1*(src1->nb[1]) + i2 *(src1->nb[2])); - float * dB_x = (float *) ((char *) src2->data + i1*(src2->nb[1]) + i2 *(src2->nb[2])); - ggml_vec_mul_f32(nc, dest, s, dA); - ggml_vec_add_f32(nc, dest, dest, dB_x); - } + { + float * dest = (float *) ((char *) dst->data + ir0*( dst->nb[1])); + float * s = (float *) ((char *) src0->data + ir0*(src0->nb[1])); + float * dA = (float *) ((char *) src1->data + ir0*(src1->nb[1])); + float * dB_x = (float *) ((char *) src2->data + ir0*(src2->nb[1])); + ggml_vec_mul_f32(nc*ir, dest, s, dA); + ggml_vec_add_f32(nc*ir, dest, dest, dB_x); + } + + // compute state for rest of tokens, previous state comes from dest + for (int i2 = 1; i2 < n_t; i2++) { + float * dest = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + i2 *( dst->nb[2])); + float * s = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + (i2-1)*( dst->nb[2])); + float * dA = (float *) ((char *) src1->data + ir0*(src1->nb[1]) + i2 *(src1->nb[2])); + float * dB_x = (float *) ((char *) src2->data + ir0*(src2->nb[1]) + i2 *(src2->nb[2])); + ggml_vec_mul_f32(nc*ir, dest, s, dA); + ggml_vec_add_f32(nc*ir, dest, dest, dB_x); } } From 5816ae687ea1c2f9add7c582d283e80cc5d089ba Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 1 Feb 2024 21:22:28 -0500 Subject: [PATCH 12/41] mamba : very basic quantization support Mostly works, but there is currently no difference between the variants of a k-quant (e.g. Q4_K_S and Q4_K_M are the same). Most of the SSM-specific weights can be kept in f32 without affecting the size that much, since they are relatively small. 
(the linear projection weights are responsible for most of Mamba's size) Too much quantization seems to make the state degrade quite fast, and the model begins to output gibberish. It seems to affect bigger models to a lesser extent than small models, but I'm not sure by how much. Experimentation will be needed to figure out which weights are more important for the _M (and _L?) variants of k-quants for Mamba. * convert : fix wrong name for layer norm weight of official Mamba models I was using Q-bert/Mamba-* models before, which have a slightly different naming scheme for the weights. (they start with "model.layers" instead of "backbone.layers") --- gguf-py/gguf/tensor_mapping.py | 2 +- llama.cpp | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index d24d10dcb6cff..85af29549de6c 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -91,7 +91,7 @@ class TensorNameMap: "model.layers.layers.{bid}.norm", # plamo "model.layers.{bid}.attention_norm", # internlm2 "model.layers.{bid}.norm", # mamba - "backbone.layers.{bid}.mixer.norm", # mamba + "backbone.layers.{bid}.norm", # mamba ), # Attention norm 2 diff --git a/llama.cpp b/llama.cpp index bbf16e8f43387..9dba8eeb28f0c 100644 --- a/llama.cpp +++ b/llama.cpp @@ -11718,6 +11718,12 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight"); quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_TOKEN_TYPES, "weight"); + // do not quantize Mamba's small yet 2D weights + // NOTE: can't use LLM_TN here because the layer number is not known + quantize &= name.find("ssm_conv1d.weight") == std::string::npos; + quantize &= name.find("ssm_x.weight") == std::string::npos; + quantize &= name.find("ssm_dt.weight") == std::string::npos; + enum ggml_type new_type; void * new_data; size_t new_size; From a3f4a1c7dc9fc10082d5290b49505bc3d3db239c Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 3 Feb 2024 17:49:36 -0500 Subject: [PATCH 13/41] mamba : fuse more steps of the SSM scan in the ggml_ssm_scan operator This increases performance on CPU by around 30% for prompt processing, and by around 20% for text generation. However, it also makes the ggml_exp and ggml_soft_plus operators unused. Whether or not they should be kept will be decided later.
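The per-row math that the fused operator now performs directly on floats looks roughly like this (standalone sketch, not the patch itself; the real code in ggml_compute_forward_ssm_scan_f32 additionally walks tensor strides and splits rows across threads):

    #include <math.h>

    // One token, one row of d_inner; dt, A, B and x are as in the Mamba SSM step:
    //   dt' = softplus(dt)                      (was a separate ggml_soft_plus node)
    //   s   = s * exp(dt' * A) + B * (x * dt')  (dA and dB*x were separate ggml_exp /
    //                                            ggml_out_prod / ggml_mul nodes before)
    static void ssm_scan_row(int d_state, float * s, const float * A, const float * B, float x, float dt) {
        const float dt_soft_plus = log1pf(expf(dt));
        const float x_dt = x * dt_soft_plus;
        for (int i0 = 0; i0 < d_state; ++i0) {
            s[i0] = s[i0]*expf(dt_soft_plus*A[i0]) + B[i0]*x_dt;
        }
    }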
--- ggml.c | 113 +++++++++++++++++++++++++++++++++++++----------------- ggml.h | 6 ++- llama.cpp | 49 +++-------------------- 3 files changed, 87 insertions(+), 81 deletions(-) diff --git a/ggml.c b/ggml.c index b132bec68eaec..90dcddbb71df3 100644 --- a/ggml.c +++ b/ggml.c @@ -6156,31 +6156,45 @@ struct ggml_tensor * ggml_flash_attn_back( struct ggml_tensor * ggml_ssm_scan( struct ggml_context * ctx, struct ggml_tensor * s, - struct ggml_tensor * dA, - struct ggml_tensor * dB_x) { - GGML_ASSERT(ggml_are_same_shape(dA, dB_x)); + struct ggml_tensor * x, + struct ggml_tensor * dt, + struct ggml_tensor * A, + struct ggml_tensor * B) { + GGML_ASSERT(ggml_is_contiguous(s)); + GGML_ASSERT(ggml_is_contiguous(x)); + GGML_ASSERT(ggml_is_contiguous(dt)); + GGML_ASSERT(ggml_is_contiguous(A)); + GGML_ASSERT(B->nb[0] == ggml_type_size(B->type)); + ggml_are_same_shape(x, dt); + GGML_ASSERT(s->ne[2] == 1 && s->ne[3] == 1); // the ssm_state should be 2D - GGML_ASSERT( s->nb[0] == ggml_type_size( s->type)); - GGML_ASSERT( dA->nb[0] == ggml_type_size( dA->type)); - GGML_ASSERT(dB_x->nb[0] == ggml_type_size(dB_x->type)); + { + const int64_t d_state = s->ne[0]; + const int64_t d_inner = s->ne[1]; + const int64_t n_tok = x->ne[1]; - GGML_ASSERT(s->ne[0] == dA->ne[0]); - GGML_ASSERT(s->ne[1] == dA->ne[1]); - GGML_ASSERT(s->ne[2] == 1 && s->ne[3] == 1); // the ssm_state should be 2D + GGML_ASSERT(x->ne[0] == d_inner); + GGML_ASSERT(A->ne[0] == d_state); + GGML_ASSERT(A->ne[1] == d_inner); + GGML_ASSERT(B->ne[0] == d_state); + GGML_ASSERT(B->ne[1] == n_tok); + } bool is_node = false; - if (s->grad || dA->grad || dB_x->grad) { + if (s->grad || x->grad || dt->grad || A->grad || B->grad) { is_node = true; } - struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, dA->ne[0], dA->ne[1], dA->ne[2]); + struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, s->ne[0], s->ne[1], x->ne[1]); result->op = GGML_OP_SSM_SCAN; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; result->src[0] = s; - result->src[1] = dA; - result->src[2] = dB_x; + result->src[1] = x; + result->src[2] = dt; + result->src[3] = A; + result->src[4] = B; return result; } @@ -14795,9 +14809,11 @@ static void ggml_compute_forward_flash_attn_back( static void ggml_compute_forward_ssm_scan_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, + const struct ggml_tensor * src0, // s + const struct ggml_tensor * src1, // x + const struct ggml_tensor * src2, // dt + const struct ggml_tensor * src3, // A + const struct ggml_tensor * src4, // B struct ggml_tensor * dst) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; @@ -14806,18 +14822,19 @@ static void ggml_compute_forward_ssm_scan_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t nc = src1->ne[0]; - const int64_t n_t = src1->ne[2]; // number of tokens in the batch + const int64_t nc = src0->ne[0]; + const int64_t n_t = src1->ne[1]; // number of tokens in the batch const int64_t nr0 = ggml_nrows(src0); - GGML_ASSERT(nc*n_t*nr0 == ggml_nelements(src1)); + GGML_ASSERT(nc*n_t*nr0 == ggml_nelements(dst)); GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(src1->nb[0] == sizeof(float)); GGML_ASSERT(src2->nb[0] == sizeof(float)); + GGML_ASSERT(src3->nb[0] == sizeof(float)); + GGML_ASSERT(src4->nb[0] == sizeof(float)); // allow merging multiple rows in the same vec operation GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); - GGML_ASSERT(src1->nb[1] == src1->ne[0]*sizeof(float)); - GGML_ASSERT(src2->nb[1] == src2->ne[0]*sizeof(float)); + GGML_ASSERT(src3->nb[1] == src3->ne[0]*sizeof(float)); // rows per thread const int dr = (nr0 + nth - 1)/nth; @@ -14829,22 +14846,44 @@ static void ggml_compute_forward_ssm_scan_f32( // first batch { - float * dest = (float *) ((char *) dst->data + ir0*( dst->nb[1])); - float * s = (float *) ((char *) src0->data + ir0*(src0->nb[1])); - float * dA = (float *) ((char *) src1->data + ir0*(src1->nb[1])); - float * dB_x = (float *) ((char *) src2->data + ir0*(src2->nb[1])); - ggml_vec_mul_f32(nc*ir, dest, s, dA); - ggml_vec_add_f32(nc*ir, dest, dest, dB_x); + float * dest = (float *) ((char *) dst->data + ir0*( dst->nb[1])); // {d_state, d_inner, n_tok} + float * s = (float *) ((char *) src0->data + ir0*(src0->nb[1])); // {d_state, d_inner} + float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0])); // {d_inner, n_tok} + float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0])); // {d_inner, n_tok} + float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + float * B = (float *) ((char *) src4->data); // {d_state, n_tok} + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + float dt_soft_plus = log1pf(expf(dt[i1])); + float x_dt = x[i1] * dt_soft_plus; + // d_state + for (int i0 = 0; i0 < nc; ++i0) { + int i = i0 + i1*nc; + // ssm_state * dA + dB * x + dest[i] = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + } + } } // compute state for rest of tokens, previous state comes from dest - for (int i2 = 1; i2 < n_t; i2++) { - float * dest = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + i2 *( dst->nb[2])); - float * s = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + (i2-1)*( dst->nb[2])); - float * dA = (float *) ((char *) src1->data + ir0*(src1->nb[1]) + i2 *(src1->nb[2])); - float * dB_x = (float *) ((char *) src2->data + ir0*(src2->nb[1]) + i2 
*(src2->nb[2])); - ggml_vec_mul_f32(nc*ir, dest, s, dA); - ggml_vec_add_f32(nc*ir, dest, dest, dB_x); + for (int i2 = 1; i2 < n_t; ++i2) { + float * dest = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + i2 *( dst->nb[2])); // {d_state, d_inner, n_tok} + float * s = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + (i2-1)*( dst->nb[2])); // {d_state, d_inner, n_tok} + float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2 *(src1->nb[1])); // {d_inner, n_tok} + float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2 *(src2->nb[1])); // {d_inner, n_tok} + float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tok} + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + float dt_soft_plus = log1pf(expf(dt[i1])); + float x_dt = x[i1] * dt_soft_plus; + // d_state + for (int i0 = 0; i0 < nc; ++i0) { + int i = i0 + i1*nc; + // ssm_state * dA + dB * x + dest[i] = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + } + } } } @@ -14853,11 +14892,13 @@ static void ggml_compute_forward_ssm_scan( const struct ggml_tensor * src0, const struct ggml_tensor * src1, const struct ggml_tensor * src2, + const struct ggml_tensor * src3, + const struct ggml_tensor * src4, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_ssm_scan_f32(params, src0, src1, src2, dst); + ggml_compute_forward_ssm_scan_f32(params, src0, src1, src2, src3, src4, dst); } break; default: { @@ -15927,7 +15968,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SSM_SCAN: { - ggml_compute_forward_ssm_scan(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor); + ggml_compute_forward_ssm_scan(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); } break; case GGML_OP_WIN_PART: { diff --git a/ggml.h b/ggml.h index 0a40f87625666..3a4c9201aee19 100644 --- a/ggml.h +++ b/ggml.h @@ -1724,8 +1724,10 @@ extern "C" { GGML_API struct ggml_tensor * ggml_ssm_scan( struct ggml_context * ctx, struct ggml_tensor * s, - struct ggml_tensor * dA, - struct ggml_tensor * dB_x); + struct ggml_tensor * x, + struct ggml_tensor * dt, + struct ggml_tensor * A, + struct ggml_tensor * B); // partition into non-overlapping windows with padding if needed // example: diff --git a/llama.cpp b/llama.cpp index 9dba8eeb28f0c..466f8bc0c541a 100644 --- a/llama.cpp +++ b/llama.cpp @@ -7999,55 +7999,18 @@ struct llm_build_context { struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); // split struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tok, x_db->nb[1], 0); - struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tok, x_db->nb[1], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tok, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); + struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tok, x_db->nb[1], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tok, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); // {dt_rank, d_inner} * {dt_rank, n_tok} => {d_inner, n_tok} dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); - dt = ggml_soft_plus(ctx0, dt); - struct ggml_tensor * dA; - struct ggml_tensor * dB; - if (n_tok == 1) { - // => {d_state, d_inner} - dA = ggml_exp(ctx0, 
ggml_mul(ctx0, model.layers[il].ssm_a, ggml_transpose(ctx0, dt))); - - // {d_state} * {d_inner} => {d_state, d_inner} - dB = ggml_out_prod(ctx0, B, dt); - } else { - // {d_state, d_inner} * {d_inner, n_tok} => {d_state, d_inner, n_tok} * {1, d_inner, n_tok} - // => {d_state, d_inner, n_tok} - // Trying to do the equivalent of - // dA = torch.exp(rearrange(dt, "b d -> b d 1") * A) # (batch, dim, dstate) - struct ggml_tensor * A = model.layers[il].ssm_a; - dA = ggml_exp(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, A, ggml_new_tensor_3d(ctx0, A->type, d_state, d_inner, n_tok)), - // {d_inner, n_tok} => {1, d_inner, n_tok} - ggml_permute(ctx0, dt, 1, 2, 0, 3)) - ); - - // {d_state, 1, n_tok} * {d_inner, 1, n_tok} => {d_state, d_inner, n_tok} - dB = ggml_out_prod(ctx0, - // {d_state, n_tok} => {d_state, 1, n_tok} - ggml_permute(ctx0, B, 0, 2, 1, 3), - // {d_state, n_tok} => {d_state, 1, n_tok} - ggml_permute(ctx0, dt, 0, 2, 1, 3)); - } - - // {d_state, d_inner, n_tok} * {1, d_inner, n_tok} => {d_state, d_inner, n_tok} - cur = ggml_mul(ctx0, dB, ggml_permute(ctx0, x, 1, 2, 0, 3)); - - // The selective scan seems inherently sequential... - // To avoid making (n_layer * n_tok) graph nodes, let's use a custom operator. - // When n_tok == 1, it's equivalent to the following: - // ssm_state = ggml_add(ctx0, ggml_mul(ctx0, ssm_state, dA), cur); - // When n_tok is bigger, it's the same thing, but iterated n_tok times, - // with the correct dA and cur for each token. - // The resulting states are layered on the ne[2] dimension. + // Custom operator to implement some of the optimizations + // described in the Annex D of the Mamba paper. + // TODO: maybe also optimize step 4 of the Speed section of Annex D (the mul_mat with C) // => {d_state, d_inner, n_tok} - ssm_state = ggml_ssm_scan(ctx0, ssm_state, dA, cur); + ssm_state = ggml_ssm_scan(ctx0, ssm_state, x, dt, model.layers[il].ssm_a, B); // only store last state ggml_build_forward_expand(gf, From 9f55809f7211bc58510ba501cfd681e9607cfb6a Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 4 Feb 2024 09:00:42 -0500 Subject: [PATCH 14/41] convert : for Mamba, also consider the "MambaLMHeadModel" arch name It's the name of the class of the official implementation, though they don't use it (yet) in the "architectures" field of config.json --- convert-hf-to-gguf.py | 2 +- gguf-py/gguf/constants.py | 1 - llama.cpp | 2 -- 3 files changed, 1 insertion(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index e49b2f4f63519..42b0fb66e745b 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1844,7 +1844,7 @@ class StarCoder2Model(Model): model_arch = gguf.MODEL_ARCH.STARCODER2 -@Model.register("MambaForCausalLM") +@Model.register("MambaForCausalLM", "MambaLMHeadModel") class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index a281083830ce8..651323a1eed55 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -211,7 +211,6 @@ class MODEL_TENSOR(IntEnum): MODEL_TENSOR.FFN_DOWN_EXP: "blk.{bid}.ffn_down.{xid}", MODEL_TENSOR.FFN_UP_EXP: "blk.{bid}.ffn_up.{xid}", MODEL_TENSOR.LAYER_OUT_NORM: "blk.{bid}.layer_output_norm", - # FIXME: NAMES FOR MAMBA ARE NOT FINAL MODEL_TENSOR.SSM_IN: "blk.{bid}.ssm_in", MODEL_TENSOR.SSM_CONV1D: "blk.{bid}.ssm_conv1d", MODEL_TENSOR.SSM_X: "blk.{bid}.ssm_x", diff --git a/llama.cpp b/llama.cpp index 466f8bc0c541a..37ac7425d4b04 100644 --- a/llama.cpp +++ b/llama.cpp @@ -401,8 +401,6 @@ enum 
llm_tensor { LLM_TENSOR_ATTN_Q_NORM, LLM_TENSOR_ATTN_K_NORM, LLM_TENSOR_LAYER_OUT_NORM, - // TODO: maybe use longer names? - // TODO: can the in_proj and/or the out_proj instead re-use some of the above types? LLM_TENSOR_SSM_IN, LLM_TENSOR_SSM_CONV1D, LLM_TENSOR_SSM_X, From cd0f33f281b6f7e20850bac97717205cd4e51d30 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 4 Feb 2024 09:49:23 -0500 Subject: [PATCH 15/41] mamba : fix vocab size problems with official models The perplexity was waaaay too high for models with a non-round vocab size. Not sure why, but it needed to be fixed in the metadata. Note that this breaks existing GGUF-converted Mamba models, but **only if** the vocab size was not already rounded. --- convert-hf-to-gguf.py | 7 +++++++ llama.cpp | 6 ++---- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 42b0fb66e745b..93d714a409a08 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1848,6 +1848,13 @@ class StarCoder2Model(Model): class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA + def set_vocab(self): + vocab_size = self.hparams["vocab_size"]; + # Round vocab size to next multiple of 8 + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8); + self.hparams["vocab_size"] = ((vocab_size + (pad_vocab - 1)) // pad_vocab) * pad_vocab + return self._set_vocab_gpt2() + def set_gguf_parameters(self): d_model = self.hparams["d_model"] d_inner = self.hparams.get("d_inner", 2 * d_model) diff --git a/llama.cpp b/llama.cpp index 37ac7425d4b04..0a6ff773613b2 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4682,15 +4682,13 @@ static bool llm_load_tensors( // FIXME: ceiling instead of floor const int64_t dt_rank = n_embd / 16; GGML_ASSERT(2 * n_embd == d_inner); - // round up the vocab size to the next multiple of 8 - const int64_t rounded_vocab = (n_vocab + 7) & -8; - model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, rounded_vocab}); + model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); // output { model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, rounded_vocab}); + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); } for (int i = 0; i < n_layer; ++i) { From de92f15634ff40194af210dbf7afb38b8fbbc657 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 4 Feb 2024 17:08:54 -0500 Subject: [PATCH 16/41] ggml : remove ggml_exp and ggml_soft_plus They did not exist anyway outside of this branch, and since ggml_ssm_scan fused operations together, they are unused. It's always possible to bring them back if needed.
--- ggml.c | 191 +-------------------------------------------------------- ggml.h | 19 ------ 2 files changed, 2 insertions(+), 208 deletions(-) diff --git a/ggml.c b/ggml.c index 90dcddbb71df3..a7f55016e4ec9 100644 --- a/ggml.c +++ b/ggml.c @@ -1577,7 +1577,6 @@ inline static void ggml_vec_scale_f32(const int n, float * y, const float v) { inline static void ggml_vec_norm_f32 (const int n, float * s, const float * x) { ggml_vec_dot_f32(n, s, 0, x, 0, x, 0, 1); *s = sqrtf(*s); } inline static void ggml_vec_sqr_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = x[i]*x[i]; } inline static void ggml_vec_sqrt_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = sqrtf(x[i]); } -inline static void ggml_vec_exp_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = expf(x[i]); } inline static void ggml_vec_log_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = logf(x[i]); } inline static void ggml_vec_abs_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = fabsf(x[i]); } inline static void ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); } @@ -1779,7 +1778,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "DIV", "SQR", "SQRT", - "EXP", "LOG", "SUM", "SUM_ROWS", @@ -1813,7 +1811,6 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "DIAG_MASK_ZERO", "SOFT_MAX", "SOFT_MAX_BACK", - "SOFT_PLUS", "ROPE", "ROPE_BACK", "ALIBI", @@ -1854,7 +1851,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1868,7 +1865,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "x/y", "x^2", "√x", - "e^x", // or should this be "exp(x)"? "log(x)", "Σx", "Σx_k", @@ -1902,7 +1898,6 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "diag_mask_zero(x)", "soft_max(x)", "soft_max_back(x)", - "soft_plus(x)", "rope(x)", "rope_back(x)", "alibi(x)", @@ -1943,7 +1938,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 75, "GGML_OP_COUNT != 75"); +static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -3803,39 +3798,6 @@ struct ggml_tensor * ggml_sqrt_inplace( return ggml_sqrt_impl(ctx, a, true); } -// ggml_exp - -static struct ggml_tensor * ggml_exp_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - bool is_node = false; - - if (!inplace && (a->grad)) { - is_node = true; - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_EXP; - result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - - return result; -} - -struct ggml_tensor * ggml_exp( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_exp_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_exp_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_exp_impl(ctx, a, true); -} - // ggml_log static struct ggml_tensor * ggml_log_impl( @@ -5331,40 +5293,6 @@ struct ggml_tensor * ggml_soft_max_back_inplace( return ggml_soft_max_back_impl(ctx, a, b, true); } -// ggml_soft_plus - -static struct ggml_tensor * ggml_soft_plus_impl( - struct ggml_context * ctx, - struct ggml_tensor * a, - bool inplace) { - - bool is_node = false; - - if (a->grad) { - is_node = true; // TODO : implement backward pass - } - - struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a); - - result->op = GGML_OP_SOFT_PLUS; - result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL; - result->src[0] = a; - - return result; -} - -struct ggml_tensor * ggml_soft_plus( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_soft_plus_impl(ctx, a, false); -} - -struct ggml_tensor * ggml_soft_plus_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a) { - return ggml_soft_plus_impl(ctx, a, true); -} - // ggml_rope static struct ggml_tensor * ggml_rope_impl( @@ -8715,57 +8643,6 @@ static void ggml_compute_forward_sqrt( } } -// ggml_compute_forward_exp - -static void ggml_compute_forward_exp_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(ggml_is_contiguous_except_dim_1(src0)); - GGML_ASSERT(ggml_is_contiguous_except_dim_1(dst)); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - const int ith = params->ith; - const int nth = params->nth; - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - // rows per thread - const int dr = (nr + nth - 1)/nth; - - // row range for this thread - const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr); - - for (int i1 = ir0; i1 < ir1; i1++) { - ggml_vec_exp_f32(nc, - (float *) ((char *) dst->data + i1*( dst->nb[1])), - (float *) ((char *) src0->data + i1*(src0->nb[1]))); - }; -} - -static void ggml_compute_forward_exp( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_exp_f32(params, src0, dst); - } break; - case GGML_TYPE_F16: // TODO: use ggml_table_exp_f16 - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_log static void ggml_compute_forward_log_f32( @@ -12225,48 +12102,6 @@ static void ggml_compute_forward_soft_max_back( } } -static void ggml_compute_forward_soft_plus_f32( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - GGML_ASSERT(params->ith == 0); - GGML_ASSERT(ggml_are_same_shape(src0, dst)); - - if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { - return; - } - - const int nc = src0->ne[0]; - const int nr = ggml_nrows(src0); - - GGML_ASSERT( dst->nb[0] == sizeof(float)); - GGML_ASSERT(src0->nb[0] == sizeof(float)); - - for (int i = 0; i < nr; ++i) { - float * x = (float *) ((char *) dst->data + i*( dst->nb[1])); - float * y = (float *) ((char *) src0->data + i*(src0->nb[1])); - for (int j = 0; 
j < nc; ++j) { - x[j] = logf(1.0f + expf(y[j])); - } - } -} - -static void ggml_compute_forward_soft_plus( - const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - struct ggml_tensor * dst) { - switch (src0->type) { - case GGML_TYPE_F32: - { - ggml_compute_forward_soft_plus_f32(params, src0, dst); - } break; - default: - { - GGML_ASSERT(false); - } break; - } -} - // ggml_compute_forward_alibi static void ggml_compute_forward_alibi_f32( @@ -15764,10 +15599,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_sqrt(params, tensor); } break; - case GGML_OP_EXP: - { - ggml_compute_forward_exp(params, tensor->src[0], tensor); - } break; case GGML_OP_LOG: { ggml_compute_forward_log(params, tensor); @@ -15892,10 +15723,6 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm { ggml_compute_forward_soft_max_back(params, tensor); } break; - case GGML_OP_SOFT_PLUS: - { - ggml_compute_forward_soft_plus(params, tensor->src[0], tensor); - } break; case GGML_OP_ROPE: { ggml_compute_forward_rope(params, tensor); @@ -16452,10 +16279,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor zero_table); } } break; - case GGML_OP_EXP: - { - GGML_ASSERT(false); // TODO: implement - } break; case GGML_OP_LOG: { if (src0->grad) { @@ -16834,10 +16657,6 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // TODO: not implemented } break; - case GGML_OP_SOFT_PLUS: - { - GGML_ASSERT(false); // TODO: not implemented - } break; case GGML_OP_ROPE: { // necessary for llama @@ -17578,7 +17397,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { case GGML_OP_ADD: case GGML_OP_ADD1: case GGML_OP_ACC: - case GGML_OP_EXP: { n_tasks = n_threads; } break; @@ -17685,10 +17503,6 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = MIN(n_threads, ggml_nrows(node->src[0])); } break; - case GGML_OP_SOFT_PLUS: - { - n_tasks = 1; //TODO - } break; case GGML_OP_CONV_TRANSPOSE_1D: { n_tasks = n_threads; @@ -18065,7 +17879,6 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa } } break; case GGML_OP_SOFT_MAX: - case GGML_OP_SOFT_PLUS: case GGML_OP_ROPE: { cur = ggml_type_size(GGML_TYPE_F32) * node->ne[0] * n_tasks; diff --git a/ggml.h b/ggml.h index 3a4c9201aee19..092b86e487acc 100644 --- a/ggml.h +++ b/ggml.h @@ -410,7 +410,6 @@ extern "C" { GGML_OP_DIV, GGML_OP_SQR, GGML_OP_SQRT, - GGML_OP_EXP, GGML_OP_LOG, GGML_OP_SUM, GGML_OP_SUM_ROWS, @@ -444,7 +443,6 @@ extern "C" { GGML_OP_DIAG_MASK_ZERO, GGML_OP_SOFT_MAX, GGML_OP_SOFT_MAX_BACK, - GGML_OP_SOFT_PLUS, GGML_OP_ROPE, GGML_OP_ROPE_BACK, GGML_OP_ALIBI, @@ -935,14 +933,6 @@ extern "C" { struct ggml_context * ctx, struct ggml_tensor * a); - GGML_API struct ggml_tensor * ggml_exp( - struct ggml_context * ctx, - struct ggml_tensor * a); - - GGML_API struct ggml_tensor * ggml_exp_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - GGML_API struct ggml_tensor * ggml_log( struct ggml_context * ctx, struct ggml_tensor * a); @@ -1431,15 +1421,6 @@ extern "C" { struct ggml_tensor * a, struct ggml_tensor * b); - GGML_API struct ggml_tensor * ggml_soft_plus( - struct ggml_context * ctx, - struct ggml_tensor * a); - - // in-place, returns view(a) - GGML_API struct ggml_tensor * ggml_soft_plus_inplace( - struct ggml_context * ctx, - struct ggml_tensor * a); - // rotary position embedding // if mode & 1 == 1, 
skip n_past elements (DEPRECATED) // if mode & 2 == 1, GPT-NeoX style From 766db753c85be85137c3fb8ef25481232003f838 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 4 Feb 2024 18:25:14 -0500 Subject: [PATCH 17/41] mamba : remove some useless comments No code change. --- llama.cpp | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 0a6ff773613b2..d320d727ff920 100644 --- a/llama.cpp +++ b/llama.cpp @@ -4697,9 +4697,6 @@ static bool llm_load_tensors( auto & layer = model.layers[i]; - // TODO: what's the difference between ctx_layer and ctx_split? - // A: It seems that ctx_split is for matrices (2d???) while ctx_layer is for other things (like 1D bias and norms, probably.) - // norm layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}); @@ -7910,7 +7907,6 @@ struct llm_build_context { const int32_t n_tok = batch.n_tokens; - // hopefully the compiler does constant folding const int64_t d_model = n_embd; const int64_t d_inner = n_head; GGML_ASSERT(2 * d_model == d_inner); @@ -7957,8 +7953,8 @@ struct llm_build_context { // The following tensor is too big in order to avoid an assertion error when making an overlapping view. // TODO: in ggml_new_tensor_impl, handle overlapping data range in data size calculation - // This could then be a tensor with ne[] = {(d_conv-1)+n_tok, d_inner} - // which is around (d_conv-1) times as small as its current size. + // This could then be a tensor with ne[] = {(d_conv-1)+n_tok, d_inner}, + // but the size difference is not that big (d_conv is usually 4). struct ggml_tensor * conv_x = ggml_new_tensor_1d(ctx0, conv_state->type, d_conv*d_inner*n_tok); const size_t conv_x_nb1 = (d_conv - 1 + n_tok) * ggml_element_size(conv_x); From c52fb3c2de3f55345fc97bc9b7d1410b9bf344b5 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 4 Feb 2024 20:41:07 -0500 Subject: [PATCH 18/41] convert : fix flake8 linter errors --- convert-hf-to-gguf.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 93d714a409a08..de7bf431f4a69 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1849,9 +1849,9 @@ class MambaModel(Model): model_arch = gguf.MODEL_ARCH.MAMBA def set_vocab(self): - vocab_size = self.hparams["vocab_size"]; + vocab_size = self.hparams["vocab_size"] # Round vocab size to next multiple of 8 - pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8); + pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) self.hparams["vocab_size"] = ((vocab_size + (pad_vocab - 1)) // pad_vocab) * pad_vocab return self._set_vocab_gpt2() From 6ff34da092ed22debf6a74b8891022128e8b4aac Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 5 Feb 2024 10:13:55 -0500 Subject: [PATCH 19/41] mamba : apply suggestions from code review * mamba : remove unnecessary branch for row-wise ssm_state and C multiplication It was previously done to avoid permuting when only one token is processed at a time (like when generating text), but permuting is cheap, and dynamically changing the compute graph is not future-proof.
* ggml : in ggml_ssm_scan, use more appropriate asserts * ggml : rename the destination pointer in ggml_compute_forward_ssm_scan_f32 --- ggml.c | 13 +++++++------ llama.cpp | 15 ++++----------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/ggml.c b/ggml.c index a7f55016e4ec9..2ab47216e5809 100644 --- a/ggml.c +++ b/ggml.c @@ -6093,8 +6093,8 @@ struct ggml_tensor * ggml_ssm_scan( GGML_ASSERT(ggml_is_contiguous(dt)); GGML_ASSERT(ggml_is_contiguous(A)); GGML_ASSERT(B->nb[0] == ggml_type_size(B->type)); - ggml_are_same_shape(x, dt); - GGML_ASSERT(s->ne[2] == 1 && s->ne[3] == 1); // the ssm_state should be 2D + GGML_ASSERT(ggml_are_same_shape(x, dt)); + GGML_ASSERT(ggml_is_matrix(s)); // the ssm_state should be 2D { const int64_t d_state = s->ne[0]; @@ -6111,6 +6111,7 @@ struct ggml_tensor * ggml_ssm_scan( bool is_node = false; if (s->grad || x->grad || dt->grad || A->grad || B->grad) { + GGML_ASSERT(false); // TODO: implement is_node = true; } @@ -14681,7 +14682,7 @@ static void ggml_compute_forward_ssm_scan_f32( // first batch { - float * dest = (float *) ((char *) dst->data + ir0*( dst->nb[1])); // {d_state, d_inner, n_tok} + float * pdst = (float *) ((char *) dst->data + ir0*( dst->nb[1])); // {d_state, d_inner, n_tok} float * s = (float *) ((char *) src0->data + ir0*(src0->nb[1])); // {d_state, d_inner} float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0])); // {d_inner, n_tok} float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0])); // {d_inner, n_tok} @@ -14695,14 +14696,14 @@ static void ggml_compute_forward_ssm_scan_f32( for (int i0 = 0; i0 < nc; ++i0) { int i = i0 + i1*nc; // ssm_state * dA + dB * x - dest[i] = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + pdst[i] = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); } } } // compute state for rest of tokens, previous state comes from dest for (int i2 = 1; i2 < n_t; ++i2) { - float * dest = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + i2 *( dst->nb[2])); // {d_state, d_inner, n_tok} + float * pdst = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + i2 *( dst->nb[2])); // {d_state, d_inner, n_tok} float * s = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + (i2-1)*( dst->nb[2])); // {d_state, d_inner, n_tok} float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2 *(src1->nb[1])); // {d_inner, n_tok} float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2 *(src2->nb[1])); // {d_inner, n_tok} @@ -14716,7 +14717,7 @@ static void ggml_compute_forward_ssm_scan_f32( for (int i0 = 0; i0 < nc; ++i0) { int i = i0 + i1*nc; // ssm_state * dA + dB * x - dest[i] = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + pdst[i] = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); } } } diff --git a/llama.cpp b/llama.cpp index d320d727ff920..fc1dd024e2d27 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8010,17 +8010,10 @@ struct llm_build_context { ggml_view_2d(ctx0, ssm_state, d_state, d_inner, ssm_state->nb[1], (n_tok-1)*ssm_state->nb[2]), ggml_view_tensor(ctx0, kv_self.v_l[il]))); - struct ggml_tensor * y; - if (n_tok == 1) { - // row-wise dot product ("dn,n->d") - // {d_state, d_inner} * {d_state, 1} => {d_inner, 1} - y = ggml_mul_mat(ctx0, ssm_state, C); - } else { - // {d_state, d_inner, n_tok} * {d_state, n_tok} => {d_inner, 1, n_tok} - y = ggml_mul_mat(ctx0, ssm_state, ggml_permute(ctx0, C, 0, 2, 1, 3)); - // => {d_inner, n_tok} - y = ggml_permute(ctx0, y, 0, 2, 1, 3); - } + // {d_state, d_inner, n_tok} * {d_state, n_tok} => {d_inner, 1, n_tok} + struct ggml_tensor * y = 
ggml_mul_mat(ctx0, ssm_state, ggml_permute(ctx0, C, 0, 2, 1, 3)); + // => {d_inner, n_tok} + y = ggml_permute(ctx0, y, 0, 2, 1, 3); // {d_inner, n_tok} * {d_inner} => {d_inner, n_tok} y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); From 8a43ffcfa1f1298b201149383d115edd17266e9e Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Tue, 13 Feb 2024 19:06:18 -0500 Subject: [PATCH 20/41] mamba : multiple sequences, but one at a time This is a step towards making this Mamba implementation usable with the server example (the way the system prompt is kept when clearing the client slots will need to be changed before this can work, though). The KV cache size for this kind of model is tied to the maximum number of sequences kept at any single time. For now, this number is obtained from n_parallel (plus one, to have an extra sequence to dedicate to the system prompt), but there might be a better way to do this which won't also make the main example use 2 cells even if only 1 is really used. (for this specific case, --parallel 0 helps) Simultaneous sequence processing will probably require changes to ggml_ssm_scan, and possibly a new operator for the conv step. * mamba : support llama_kv_cache_seq_cp This (mis)uses the logic around K shifts, because tokens in a state can't be shifted anyway, and because inp_K_shift has the right shape and type. Using ggml_get_rows is a nice way to do copies, but copy chains can't work. Fortunately, copy chains don't really seem to be used in the examples. Each KV cell is dedicated to the sequence ID corresponding to its own index. * mamba : use a state mask It's cleaner than the previous heuristic of checking for the pos of the first token in the batch. inp_KQ_mask could not be re-used for this, because it has the wrong shape and because it seems more suited to the next step of simultaneous sequence processing (helping with the problem of remembering which token belongs to which sequence(s)/state(s)). * llama : replace the usage of n_ctx with kv_self.size in many places * mamba : use n_tokens directly instead of n_tok --- common/common.cpp | 1 + ggml.c | 26 ++-- llama.cpp | 314 +++++++++++++++++++++++++++++++++++----------- llama.h | 1 + 4 files changed, 257 insertions(+), 85 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index dbe7e9229b770..b7c43cab29087 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1295,6 +1295,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param cparams.n_ctx = params.n_ctx; cparams.n_batch = params.n_batch; + cparams.n_parallel = params.n_parallel; cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch == -1 ? 
params.n_threads : params.n_threads_batch; cparams.seed = params.seed; diff --git a/ggml.c b/ggml.c index 2ab47216e5809..09e77bb8c1f06 100644 --- a/ggml.c +++ b/ggml.c @@ -6097,15 +6097,15 @@ struct ggml_tensor * ggml_ssm_scan( GGML_ASSERT(ggml_is_matrix(s)); // the ssm_state should be 2D { - const int64_t d_state = s->ne[0]; - const int64_t d_inner = s->ne[1]; - const int64_t n_tok = x->ne[1]; + const int64_t d_state = s->ne[0]; + const int64_t d_inner = s->ne[1]; + const int64_t n_tokens = x->ne[1]; GGML_ASSERT(x->ne[0] == d_inner); GGML_ASSERT(A->ne[0] == d_state); GGML_ASSERT(A->ne[1] == d_inner); GGML_ASSERT(B->ne[0] == d_state); - GGML_ASSERT(B->ne[1] == n_tok); + GGML_ASSERT(B->ne[1] == n_tokens); } bool is_node = false; @@ -14682,12 +14682,12 @@ static void ggml_compute_forward_ssm_scan_f32( // first batch { - float * pdst = (float *) ((char *) dst->data + ir0*( dst->nb[1])); // {d_state, d_inner, n_tok} + float * pdst = (float *) ((char *) dst->data + ir0*( dst->nb[1])); // {d_state, d_inner, n_tokens} float * s = (float *) ((char *) src0->data + ir0*(src0->nb[1])); // {d_state, d_inner} - float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0])); // {d_inner, n_tok} - float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0])); // {d_inner, n_tok} + float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0])); // {d_inner, n_tokens} + float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0])); // {d_inner, n_tokens} float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - float * B = (float *) ((char *) src4->data); // {d_state, n_tok} + float * B = (float *) ((char *) src4->data); // {d_state, n_tokens} // d_inner for (int i1 = 0; i1 < ir; ++i1) { float dt_soft_plus = log1pf(expf(dt[i1])); @@ -14703,12 +14703,12 @@ static void ggml_compute_forward_ssm_scan_f32( // compute state for rest of tokens, previous state comes from dest for (int i2 = 1; i2 < n_t; ++i2) { - float * pdst = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + i2 *( dst->nb[2])); // {d_state, d_inner, n_tok} - float * s = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + (i2-1)*( dst->nb[2])); // {d_state, d_inner, n_tok} - float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2 *(src1->nb[1])); // {d_inner, n_tok} - float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2 *(src2->nb[1])); // {d_inner, n_tok} + float * pdst = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + i2 *( dst->nb[2])); // {d_state, d_inner, n_tokens} + float * s = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + (i2-1)*( dst->nb[2])); // {d_state, d_inner, n_tokens} + float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2 *(src1->nb[1])); // {d_inner, n_tokens} + float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2 *(src2->nb[1])); // {d_inner, n_tokens} float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tok} + float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens} // d_inner for (int i1 = 0; i1 < ir; ++i1) { float dt_soft_plus = log1pf(expf(dt[i1])); diff --git a/llama.cpp b/llama.cpp index fc1dd024e2d27..6908093df3356 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1802,6 +1802,8 @@ struct llama_kv_cell { struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; + // with Mamba, a slot can hold the state for more than one past token + bool unlimited = false; // Note: 
The value of head isn't only used to optimize searching // for a free KV slot. llama_decode_internal also uses it, so it @@ -2036,11 +2038,12 @@ struct llama_context { struct ggml_tensor * inp_tokens; // I32 [n_batch] struct ggml_tensor * inp_embd; // F32 [n_embd, n_batch] struct ggml_tensor * inp_pos; // I32 [n_batch] - struct ggml_tensor * inp_KQ_mask; // F32 [n_ctx, n_batch] - struct ggml_tensor * inp_KQ_pos; // F32 [n_ctx] - struct ggml_tensor * inp_K_shift; // I32 [n_ctx] + struct ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch] + struct ggml_tensor * inp_KQ_pos; // F32 [kv_size] + struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] + struct ggml_tensor * inp_s_mask; // F32 [kv_size] (only used by constant state models like Mamba) #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -2056,7 +2059,7 @@ static bool llama_kv_cache_init( const llama_model & model, ggml_type type_k, ggml_type type_v, - uint32_t n_ctx, + uint32_t kv_size, bool offload) { const struct llama_hparams & hparams = model.hparams; @@ -2064,22 +2067,26 @@ static bool llama_kv_cache_init( const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); const int64_t n_layer = hparams.n_layer; - if (model.arch == LLM_ARCH_MAMBA) { - // only one slot is needed for Mamba - n_ctx = 1; - } - cache.has_shift = false; + // for now, only Mamba can hold state for more than one past token per slot + cache.unlimited = model.arch == LLM_ARCH_MAMBA; + cache.head = 0; - cache.size = n_ctx; + cache.size = kv_size; cache.used = 0; cache.type_k = type_k; cache.type_v = type_v; cache.cells.clear(); - cache.cells.resize(n_ctx); + cache.cells.resize(kv_size); + + if (cache.unlimited) { + for (uint32_t i = 0; i < cache.size; ++i) { + cache.cells[i].delta = i; + } + } // else, delta is already initialized to zero #ifdef GGML_USE_CLBLAST offload = false; @@ -2118,8 +2125,8 @@ static bool llama_kv_cache_init( for (int i = 0; i < (int) n_layer; i++) { struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front(); - ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx); - ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx); + ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*kv_size); + ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*kv_size); ggml_format_name(k, "cache_k_l%d", i); ggml_format_name(v, "cache_v_l%d", i); cache.k_l.push_back(k); @@ -2153,11 +2160,51 @@ static bool llama_kv_cache_find_slot( const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; - // for Mamba and/or other model archs that only ever use one slot - if (n_ctx == 1) { - // hopefully no one actually uses a context size of 1 on Transformer-based models - return true; + if (cache.unlimited) { + // For unlimited context architectures (like Mamba), + // each KV cache cell can store the state for a whole sequence. 
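The bookkeeping implemented just below boils down to taking the minimum and maximum seq_id seen in the batch. A small illustration (hypothetical helper, not actual llama.cpp code), assuming one seq_id per token for simplicity:

    // For the recurrent ("unlimited") cache, cell index == seq_id, so the slot
    // used by a batch is simply the contiguous range [min seq_id, max seq_id].
    static void recurrent_slot_range(int n_tokens, const int32_t * seq_ids,
                                     uint32_t kv_size,
                                     uint32_t * head, uint32_t * used, uint32_t * n) {
        *head = kv_size - 1;  // will become the smallest seq_id in the batch
        *used = 0;            // will become the biggest seq_id in the batch
        for (int i = 0; i < n_tokens; ++i) {
            const uint32_t s = (uint32_t) seq_ids[i];
            if (s < *head) { *head = s; }
            if (s > *used) { *used = s; }
        }
        *n = *used - *head + 1;
    }

For example, a batch touching only sequences 1 and 3 yields head = 1, used = 3, n = 3: cell 2 falls inside the window handed to the graph even though it is not updated.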
+ + // starting point to find the minimum seq_id used in the batch + cache.head = cache.size - 1; + // likewise, to find the max seq_id in the batch + cache.used = 0; + for (uint32_t i = 0; i < n_tokens; ++i) { + for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) { + llama_seq_id seq_id = batch.seq_id[i][j]; + // make sure it's a valid seq_id + if ((uint32_t)seq_id < cache.size) { + // the number of "used" cells is simply the biggest seq_id + if (cache.used < (uint32_t)seq_id) { + cache.used = seq_id; + } + // the "head" is the smallest seq_id + if (cache.head > (uint32_t)seq_id) { + cache.head = seq_id; + } + // Assuming the tokens are in-order + if (batch.pos[i] != cache.cells[seq_id].pos + 1) { + // What should happen when the pos backtracks? + // Clearing the state mid-batch would require special-casing which isn't done. + LLAMA_LOG_ERROR("%s: non-consecutive token position %d after %d for sequence %d\n", + __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id); + return false; + } + cache.cells[seq_id].pos = batch.pos[i]; + // NOTE: seq_ids are not inserted here, because they are handled when the graph is built. + } else { + // too big seq_id + // TODO: would it be possible to resize the KV cache size instead? + LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d\n", __func__, seq_id, cache.size); + return false; + } + } + } + + cache.n = cache.used - cache.head + 1; + // sanity check (max >= min) + return cache.used >= cache.head; } + // otherwise, one cell per token. if (n_tokens > n_ctx) { LLAMA_LOG_ERROR("%s: n_tokens=%d > n_ctx=%d\n", __func__, n_tokens, n_ctx); @@ -2238,6 +2285,13 @@ static void llama_kv_cache_seq_rm( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + if (cache.unlimited) { + // can only remove whole sequences for models like Mamba + GGML_ASSERT(p0 == 0); + GGML_ASSERT((uint32_t)seq_id < cache.size); + GGML_ASSERT(cache.cells[seq_id].pos < p1); + } + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { if (seq_id < 0) { @@ -2270,6 +2324,26 @@ static void llama_kv_cache_seq_cp( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + if (cache.unlimited) { + if ((uint32_t)seq_id_dst < cache.size && (uint32_t)seq_id_src < cache.size) { + // intent to "copy from" (does not support copy chains) + cache.cells[seq_id_dst].delta = seq_id_src; + // NOTE: a sequence can't have multiple sources, but can have multiple destinations. + // For compatibility with the other KV cache API functions, + // the seq_id(s) of a slot suggests an intent to "copy to" those id(s), + // so that when a sequence is copied, it can initially be found from the source cell. 
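At the API level this turns llama_kv_cache_seq_cp into pure bookkeeping for the recurrent cache. A usage-level illustration (the call is the existing public API; the annotations describe what this patch makes it record, e.g. when sharing a system-prompt state):

    // copy the whole of sequence 0 into sequence 3
    llama_kv_cache_seq_cp(ctx, 0, 3, -1, -1);
    // for the "unlimited" cache no state data moves here; instead it records:
    //   cells[3].delta = 0;        // "cell 3 copies from cell 0"
    //   cells[0].seq_id += {3};    // the copy can be found from the source cell
    //   cells[3].seq_id += {3};    // keeps the destination from being cleared
    //   cells[3].pos = cells[0].pos;
    //   has_shift = true;          // (src != dst) repurposed as a "needs copy" flag
    // the actual state bytes are copied later, when the K-shift graph runs.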
+ cache.cells[seq_id_src].seq_id.insert(seq_id_dst); + // prevent the destination from getting cleared + cache.cells[seq_id_dst].seq_id.insert(seq_id_dst); + // repurposed as a "need copy" flag + // (shifting can't be done anyway for this kind of KV cache) + cache.has_shift = seq_id_src != seq_id_dst; + // NOTE: this is not correct for sequence swaps (which aren't a thing in the KV cache API yet) + cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos; + } + return; + } + cache.head = 0; for (uint32_t i = 0; i < cache.size; ++i) { @@ -2309,6 +2383,10 @@ static void llama_kv_cache_seq_add( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + if (cache.unlimited) { + GGML_ASSERT(false); // not supported + } + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.has_shift = true; @@ -2342,6 +2420,10 @@ static void llama_kv_cache_seq_div( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + if (cache.unlimited) { + GGML_ASSERT(false); // not supported + } + for (uint32_t i = 0; i < cache.size; ++i) { if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) { cache.has_shift = true; @@ -4943,6 +5025,8 @@ static void llm_build_kv_store( const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa(); const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + GGML_ASSERT(kv.size == n_ctx); + // compute the transposed [n_tokens, n_embd] V matrix struct ggml_tensor * v_cur_t = ggml_transpose(ctx, ggml_reshape_2d(ctx, v_cur, n_embd_v_gqa, n_tokens)); //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed @@ -5152,6 +5236,8 @@ static struct ggml_tensor * llm_build_kqv( cb(kq, "kq_soft_max_ext", il); } + GGML_ASSERT(kv.size == n_ctx); + // split cached v into n_head heads struct ggml_tensor * v = ggml_view_3d(ctx, kv.v_l[il], @@ -5298,8 +5384,8 @@ struct llm_build_context { norm_eps (hparams.f_norm_eps), norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), - n_kv (worst_case ? kv_self.size : kv_self.n), - kv_head (worst_case ? n_ctx - n_tokens : kv_self.head), + n_kv (worst_case ? kv_self.size : kv_self.n), + kv_head (worst_case ? (kv_self.unlimited ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -5328,6 +5414,22 @@ struct llm_build_context { struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + // TODO: do this in a another graph with a dedicated input tensor + if (kv_self.unlimited) { + for (int il = 0; il < n_layer; ++il) { + ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], n_embd_k_gqa, kv_self.size); + ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], n_embd_v_gqa, kv_self.size); + + conv_states = ggml_get_rows(ctx0, conv_states, lctx.inp_K_shift); + ssm_states = ggml_get_rows(ctx0, ssm_states, lctx.inp_K_shift); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_states, kv_self.k_l[il])); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_states, kv_self.v_l[il])); + } + + return gf; + } + for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = // we rotate only the first n_rot dimensions @@ -7905,8 +8007,6 @@ struct llm_build_context { struct ggml_cgraph * build_mamba() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - const int32_t n_tok = batch.n_tokens; - const int64_t d_model = n_embd; const int64_t d_inner = n_head; GGML_ASSERT(2 * d_model == d_inner); @@ -7917,22 +8017,34 @@ struct llm_build_context { struct ggml_tensor * cur; struct ggml_tensor * inpL; - // {n_embd, n_tok} + GGML_ASSERT(kv_self.used - kv_self.head + 1 == 1); // TODO: support more than one sequence per batch + + // {n_embd, n_tokens} inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); for (int il = 0; il < n_layer; ++il) { // (ab)using the kv cache to store the state // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed - ggml_tensor * conv_state = ggml_reshape_2d(ctx0, kv_self.k_l[il], d_conv - 1, d_inner); - ggml_tensor * ssm_state = ggml_reshape_2d(ctx0, kv_self.v_l[il], d_state, d_inner); + ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], (d_conv-1)*(d_inner), kv_self.size); + ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], (d_state)*(d_inner), kv_self.size); - // reset the states when starting a new sequence - // TODO: ensure kv_self clearing is handled - if (!batch.pos || batch.pos[0] == 0) { - conv_state = ggml_scale(ctx0, conv_state, 0); - ssm_state = ggml_scale(ctx0, ssm_state, 0); - } + { + ggml_tensor * state_mask = ggml_view_2d(ctx0, lctx.inp_s_mask, 1, n_kv, lctx.inp_s_mask->nb[0], 0); + // clear states of sequences which are starting at the beginning of this batch + conv_states = ggml_mul(ctx0, + ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]), + state_mask); + ssm_states = ggml_mul(ctx0, + ggml_view_2d(ctx0, ssm_states, ssm_states->ne[0], n_kv, ssm_states->nb[1], kv_head*ssm_states->nb[1]), + state_mask); + } + + // TODO: support more than one sequence per batch (these could then use ggml_reshape_3d) + ggml_tensor * conv_state = ggml_view_2d(ctx0, conv_states, d_conv - 1, d_inner, + (d_conv - 1)*ggml_element_size(conv_states), 0); + ggml_tensor * ssm_state = ggml_view_2d(ctx0, ssm_states, d_state, d_inner, + (d_state)*ggml_element_size(ssm_states), 0); // norm cur = llm_build_norm(ctx0, inpL, hparams, @@ -7940,10 +8052,10 @@ struct llm_build_context { LLM_NORM_RMS, cb, il); 
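For the recurrent cache, the "K shift" graph above is really a per-cell state copy: ggml_get_rows gathers, for every cell i, row delta[i] of the reshaped state tensor, and the result is then written back over the cache. A scalar sketch of that effect, assuming row-major {row_size, kv_size} state tensors (helper name is mine):

    #include <string.h>

    // row i of the result is row delta[i] of the source; with delta[i] == i by
    // default this is a no-op copy, and after llama_kv_cache_seq_cp(src, dst)
    // it moves the state of cell `src` into cell `dst`.
    static void copy_states_by_rows(int kv_size, int row_size,
                                    const float * src_states,  // [kv_size][row_size]
                                    const int32_t * delta,     // inp_K_shift contents
                                    float * dst_states) {      // separate buffer, written back afterwards
        for (int i = 0; i < kv_size; ++i) {
            memcpy(&dst_states[(size_t) i*row_size],
                   &src_states[(size_t) delta[i]*row_size],
                   (size_t) row_size * sizeof(float));
        }
    }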
cb(cur, "attn_norm", il); - // {n_embd, 2*d_inner} * {n_embd, n_tok} => {2*d_inner, n_tok} + // {n_embd, 2*d_inner} * {n_embd, n_tokens} => {2*d_inner, n_tokens} struct ggml_tensor * xz = ggml_mul_mat(ctx0, model.layers[il].ssm_in, cur); // split the above in two - // => {d_inner, n_tok} + // => {d_inner, n_tokens} struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner); @@ -7953,10 +8065,10 @@ struct llm_build_context { // The following tensor is too big in order to avoid an assertion error when making an overlapping view. // TODO: in ggml_new_tensor_impl, handle overlapping data range in data size calculation - // This could then be a tensor with ne[] = {(d_conv-1)+n_tok, d_inner}, + // This could then be a tensor with ne[] = {(d_conv-1)+n_tokens, d_inner}, // but the size difference is not that big (d_conv is usually 4). - struct ggml_tensor * conv_x = ggml_new_tensor_1d(ctx0, conv_state->type, d_conv*d_inner*n_tok); - const size_t conv_x_nb1 = (d_conv - 1 + n_tok) * ggml_element_size(conv_x); + struct ggml_tensor * conv_x = ggml_new_tensor_1d(ctx0, conv_state->type, d_conv*d_inner*n_tokens); + const size_t conv_x_nb1 = (d_conv - 1 + n_tokens) * ggml_element_size(conv_x); conv_x = ggml_set_2d(ctx0, conv_x, conv_state, conv_x_nb1, 0); // making x contiguous is necessary because ggml_set expects it @@ -7965,18 +8077,18 @@ struct llm_build_context { // store last (d_conv - 1) columns of conv_x back into the KV cache for the next conv_state ggml_build_forward_expand(gf, ggml_cpy(ctx0, - ggml_view_2d(ctx0, conv_x, d_conv - 1, d_inner, conv_x_nb1, n_tok*ggml_element_size(conv_x)), - ggml_view_tensor(ctx0, kv_self.k_l[il]))); + ggml_view_2d(ctx0, conv_x, d_conv - 1, d_inner, conv_x_nb1, n_tokens*ggml_element_size(conv_x)), + ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner), kv_self.head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_x)))); // prepare convolution for all tokens in the batch with a self-overlapping view, // shifting by one column each ... depth? ... with a window of d_conv columns. 
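The self-overlapping view, ggml_mul and ggml_sum_rows together amount to a depthwise causal convolution of width d_conv over the cached (d_conv-1) columns followed by the n_tok new columns. A scalar reference of that computation (illustrative only; layouts follow the shape comments in the patch, and the conv bias plus SiLU are applied separately afterwards, as in the code):

    // conv_x: [d_inner][(d_conv-1)+n_tok]  (old state columns, then new x columns)
    // w     : [d_inner][d_conv]            (ssm_conv1d, one filter per channel)
    // out   : [n_tok][d_inner]
    static void depthwise_causal_conv(int d_conv, int d_inner, int n_tok,
                                      const float * conv_x, const float * w, float * out) {
        const int row = (d_conv - 1) + n_tok;
        for (int d = 0; d < d_inner; ++d) {
            for (int t = 0; t < n_tok; ++t) {
                float sum = 0.0f;
                for (int k = 0; k < d_conv; ++k) {
                    sum += w[d*d_conv + k] * conv_x[d*row + t + k];
                }
                out[t*d_inner + d] = sum;
            }
        }
    }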
- // {(d_conv-1)+n_tok, d_inner} => {d_conv, d_inner, n_tok} - conv_x = ggml_view_3d(ctx0, conv_x, d_conv, d_inner, n_tok, conv_x_nb1, 1*ggml_element_size(conv_x), 0); + // {(d_conv-1)+n_tokens, d_inner} => {d_conv, d_inner, n_tokens} + conv_x = ggml_view_3d(ctx0, conv_x, d_conv, d_inner, n_tokens, conv_x_nb1, 1*ggml_element_size(conv_x), 0); // perform convolution - // => {1, d_inner, n_tok} + // => {1, d_inner, n_tokens} x = ggml_sum_rows(ctx0, ggml_mul(ctx0, conv_x, model.layers[il].ssm_conv1d)); - // => {d_inner, n_tok, 1} + // => {d_inner, n_tokens, 1} x = ggml_permute(ctx0, x, 2, 0, 1, 3); // bias @@ -7987,38 +8099,38 @@ struct llm_build_context { // ssm { - // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tok} => {dt_rank + 2*d_state, n_tok} + // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens} struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); // split - struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tok, x_db->nb[1], 0); - struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tok, x_db->nb[1], ggml_element_size(x_db)*dt_rank); - struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tok, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); + struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0); + struct ggml_tensor * B = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*dt_rank); + struct ggml_tensor * C = ggml_view_2d(ctx0, x_db, d_state, n_tokens, x_db->nb[1], ggml_element_size(x_db)*(dt_rank+d_state)); - // {dt_rank, d_inner} * {dt_rank, n_tok} => {d_inner, n_tok} + // {dt_rank, d_inner} * {dt_rank, n_tokens} => {d_inner, n_tokens} dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); // Custom operator to implement some of the optimizations // described in the Annex D of the Mamba paper. 
// TODO: maybe also optimize step 4 of the Speed section of Annex D (the mul_mat with C) - // => {d_state, d_inner, n_tok} + // => {d_state, d_inner, n_tokens} ssm_state = ggml_ssm_scan(ctx0, ssm_state, x, dt, model.layers[il].ssm_a, B); // only store last state ggml_build_forward_expand(gf, ggml_cpy(ctx0, - ggml_view_2d(ctx0, ssm_state, d_state, d_inner, ssm_state->nb[1], (n_tok-1)*ssm_state->nb[2]), - ggml_view_tensor(ctx0, kv_self.v_l[il]))); + ggml_view_2d(ctx0, ssm_state, d_state, d_inner, ssm_state->nb[1], (n_tokens-1)*ssm_state->nb[2]), + ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner, kv_self.head*d_state*d_inner*ggml_element_size(ssm_state)))); - // {d_state, d_inner, n_tok} * {d_state, n_tok} => {d_inner, 1, n_tok} + // {d_state, d_inner, n_tokens} * {d_state, n_tokens} => {d_inner, 1, n_tokens} struct ggml_tensor * y = ggml_mul_mat(ctx0, ssm_state, ggml_permute(ctx0, C, 0, 2, 1, 3)); - // => {d_inner, n_tok} + // => {d_inner, n_tokens} y = ggml_permute(ctx0, y, 0, 2, 1, 3); - // {d_inner, n_tok} * {d_inner} => {d_inner, n_tok} + // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens} y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); - // {d_inner, n_embd} * {d_inner, n_tok} => {n_embd, n_tok} + // {d_inner, n_embd} * {d_inner, n_tokens} => {n_embd, n_tokens} cur = ggml_mul_mat(ctx0, model.layers[il].ssm_out, y); } @@ -8208,15 +8320,13 @@ static struct ggml_cgraph * llama_build_graph( } static void llama_set_k_shift(llama_context & lctx) { - const auto & cparams = lctx.cparams; - - const int64_t n_ctx = cparams.n_ctx; + const int64_t kv_size = lctx.kv_self.size; assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer)); int32_t * data = (int32_t *) lctx.inp_K_shift->data; - for (int i = 0; i < n_ctx; ++i) { + for (int i = 0; i < kv_size; ++i) { data[i] = lctx.kv_self.cells[i].delta; } } @@ -8257,6 +8367,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { float * data = (float *) lctx.inp_KQ_mask->data; + // For Transformers, use only the previous KV cells + // of the correct sequence for each token of the batch. + // It's assumed that if a token in the batch has multiple sequences, they are equivalent. for (int h = 0; h < 1; ++h) { for (int j = 0; j < n_tokens; ++j) { const llama_pos pos = batch.pos[j]; @@ -8274,6 +8387,13 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } } + // For Mamba (and other constant-time-and-size architectures), + // update the correct state(s)/sequence(s) for each token of the batch. + // Source and destination states are both the same for the sake of implementation simplicity. + // It would be more complex if they were sometimes the same and somtimes not. 
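The inp_s_mask filled a bit further down is consumed in the graph by a broadcasted ggml_mul against the per-cell state rows, so a 0.0f entry zeroes (resets) the state of a sequence that starts in this batch while a 1.0f entry leaves it untouched. A scalar sketch of that multiply (names are mine):

    // mask  : [n_kv], 1.0f = keep the state, 0.0f = sequence restarts here
    // states: [n_kv][row_size] view of the conv or ssm states of the active cells
    static void apply_state_mask(int n_kv, int row_size,
                                 const float * mask, float * states) {
        for (int i = 0; i < n_kv; ++i) {
            for (int j = 0; j < row_size; ++j) {
                states[i*row_size + j] *= mask[i];
            }
        }
    }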
+ // (with Transformers, source KV cells are never the destination, + // which is also simpler, but more memory hungry) + // TODO: implement } if (hparams.need_kq_pos) { @@ -8330,6 +8450,43 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } } + + if (kv_self.unlimited) { + const uint32_t kv_size = kv_self.size; + const uint32_t n_kv = kv_self.n; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); + float * data = (float *) lctx.inp_s_mask->data; + + // states which are not affected by the current batch are left untouched + for (uint32_t i = 0; i < n_kv; ++i) { + llama_seq_id seq_id = i + lctx.kv_self.head; + llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id]; + bool has_self_seq = kv_cell.has_seq_id(seq_id); + + data[i] = (float) has_self_seq; + + // ensure current sequences will be kept + if (!has_self_seq) { + kv_cell.seq_id.insert(seq_id); + } + } + // remove extraneous seq_ids when state copies are made + { + for (uint32_t i = 0; i < kv_size; ++i) { + llama_kv_cell & kv_cell = lctx.kv_self.cells[i]; + uint32_t n_seqs = kv_cell.seq_id.size(); + bool has_self_seq = kv_cell.has_seq_id(i); + + if (has_self_seq && n_seqs > 1) { + kv_cell.seq_id.clear(); + kv_cell.seq_id.insert(i); + } else if (!has_self_seq && n_seqs > 0) { + kv_cell.seq_id.clear(); + } + } + } + } } static void llama_graph_compute( @@ -8450,13 +8607,15 @@ static int llama_decode_internal( return 1; } - // a heuristic, to avoid attending the full cache if it is not yet utilized - // after enough generations, the benefit from this heuristic disappears - // if we start defragmenting the cache, the benefit from this will be more important - kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); - //kv_self.n = llama_kv_cache_cell_max(kv_self); + if (!kv_self.unlimited) { + // a heuristic, to avoid attending the full cache if it is not yet utilized + // after enough generations, the benefit from this heuristic disappears + // if we start defragmenting the cache, the benefit from this will be more important + kv_self.n = std::min(kv_self.size, std::max(32u, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32))); + //kv_self.n = llama_kv_cache_cell_max(kv_self); - //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head); + } ggml_backend_sched_reset(lctx.sched); ggml_backend_sched_set_eval_callback(lctx.sched, lctx.cparams.cb_eval, lctx.cparams.cb_eval_user_data); @@ -8817,7 +8976,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed - if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { + if ((lctx.kv_self.unlimited || lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) && lctx.kv_self.has_shift) { llama_set_k_shift(lctx); { @@ -8832,7 +8991,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { kv_self.has_shift = false; for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = 0; + kv_self.cells[i].delta = kv_self.unlimited ? 
i : 0; } } } @@ -12122,6 +12281,7 @@ struct llama_context_params llama_context_default_params() { /*.seed =*/ LLAMA_DEFAULT_SEED, /*.n_ctx =*/ 512, /*.n_batch =*/ 512, + /*.n_parallel =*/ 1, /*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default /*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS, /*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED, @@ -12283,6 +12443,7 @@ struct llama_context * llama_new_context_with_model( auto & cparams = ctx->cparams; cparams.n_batch = params.n_batch; + // TODO: maybe add n_parallel here too cparams.n_threads = params.n_threads; cparams.n_threads_batch = params.n_threads_batch; cparams.yarn_ext_factor = params.yarn_ext_factor; @@ -12339,14 +12500,19 @@ struct llama_context * llama_new_context_with_model( ctx->rng = std::mt19937(params.seed); ctx->logits_all = params.logits_all; + uint32_t kv_size = cparams.n_ctx; ggml_type type_k = params.type_k; ggml_type type_v = params.type_v; - // Mamba (mis)uses the KV cache to store its states + // Mamba only needs a constant number of KV cache slots per sequence if (model->arch == LLM_ARCH_MAMBA) { + // Mamba needs as many slots as there are distinct sequences processed at the same time + // The extra slot allows dedicating a sequence id to the system prompt + // TODO: find a better way to get the max number of parallel sequences + kv_size = params.n_parallel + 1; // it's probably best to keep as much precision as possible for the states type_k = GGML_TYPE_F32; // required by ggml_set for Mamba's conv_state - type_v = GGML_TYPE_F32; // required by ggml_mul for Mamba's ssm_state + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_state } GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); @@ -12447,7 +12613,7 @@ struct llama_context * llama_new_context_with_model( } ctx->backends.push_back(ctx->backend_cpu); - if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) { + if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, kv_size, cparams.offload_kqv)) { LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__); llama_free(ctx); return nullptr; @@ -12481,7 +12647,7 @@ struct llama_context * llama_new_context_with_model( // graph inputs { ggml_init_params init_params = { - /* .mem_size */ ggml_tensor_overhead()*8, + /* .mem_size */ ggml_tensor_overhead()*(8 + ctx->kv_self.unlimited), /* .mem_buffer */ nullptr, /* .no_alloc */ true, }; @@ -12490,11 +12656,13 @@ struct llama_context * llama_new_context_with_model( ctx->inp_tokens = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); ctx->inp_embd = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, hparams.n_embd, cparams.n_batch); ctx->inp_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx, cparams.n_batch); - ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_ctx); - ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_ctx); + ctx->inp_KQ_mask = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, kv_size, cparams.n_batch); + ctx->inp_KQ_pos = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); + ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, 
cparams.n_batch); + if (ctx->kv_self.unlimited) + ctx->inp_s_mask = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); ggml_set_name(ctx->inp_tokens, "inp_tokens"); ggml_set_name(ctx->inp_embd, "inp_embd"); @@ -12504,6 +12672,8 @@ struct llama_context * llama_new_context_with_model( ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); ggml_set_name(ctx->inp_mean, "inp_mean"); ggml_set_name(ctx->inp_cls, "inp_cls"); + if (ctx->kv_self.unlimited) + ggml_set_name(ctx->inp_s_mask, "inp_s_mask"); ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__, diff --git a/llama.h b/llama.h index 70da4cb3f0ff6..bbf738988413b 100644 --- a/llama.h +++ b/llama.h @@ -235,6 +235,7 @@ extern "C" { uint32_t seed; // RNG seed, -1 for random uint32_t n_ctx; // text context, 0 = from model uint32_t n_batch; // prompt processing maximum batch size + uint32_t n_parallel; // number of parallel sequences uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing From e73eaa7b4f998b886b6569a6c7d469f16fe76ba4 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 14 Feb 2024 13:43:14 -0500 Subject: [PATCH 21/41] mamba : in comments, properly refer to KV cells instead of slots --- llama.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/llama.cpp b/llama.cpp index 6908093df3356..3360eb3e65295 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1802,7 +1802,7 @@ struct llama_kv_cell { struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; - // with Mamba, a slot can hold the state for more than one past token + // with Mamba, a cell can hold the state for more than one past token bool unlimited = false; // Note: The value of head isn't only used to optimize searching @@ -2069,7 +2069,7 @@ static bool llama_kv_cache_init( cache.has_shift = false; - // for now, only Mamba can hold state for more than one past token per slot + // for now, only Mamba can hold state for more than one past token per cell cache.unlimited = model.arch == LLM_ARCH_MAMBA; cache.head = 0; @@ -2330,7 +2330,7 @@ static void llama_kv_cache_seq_cp( cache.cells[seq_id_dst].delta = seq_id_src; // NOTE: a sequence can't have multiple sources, but can have multiple destinations. // For compatibility with the other KV cache API functions, - // the seq_id(s) of a slot suggests an intent to "copy to" those id(s), + // the seq_id(s) of a cell suggests an intent to "copy to" those id(s), // so that when a sequence is copied, it can initially be found from the source cell. 
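As a rough sense of scale for these per-sequence cells: assuming the usual Mamba-3B hyper-parameters, which are not spelled out in the patch (d_model = 2560, d_inner = 2*d_model = 5120, d_state = 16, d_conv = 4, n_layer = 64), each sequence keeps per layer (d_conv-1)*d_inner = 15360 floats of conv state plus d_state*d_inner = 81920 floats of ssm state, about 97k floats per layer, or roughly 24 MiB of F32 state per sequence over 64 layers. So kv_size = n_parallel + 1 cells costs on the order of 24 MiB per parallel sequence, independent of context length.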
cache.cells[seq_id_src].seq_id.insert(seq_id_dst); // prevent the destination from getting cleared @@ -12504,10 +12504,10 @@ struct llama_context * llama_new_context_with_model( ggml_type type_k = params.type_k; ggml_type type_v = params.type_v; - // Mamba only needs a constant number of KV cache slots per sequence + // Mamba only needs a constant number of KV cache cells per sequence if (model->arch == LLM_ARCH_MAMBA) { - // Mamba needs as many slots as there are distinct sequences processed at the same time - // The extra slot allows dedicating a sequence id to the system prompt + // Mamba needs as many KV cells as there are sequences kept at any time + // The extra cell allows dedicating a sequence id to the system prompt // TODO: find a better way to get the max number of parallel sequences kv_size = params.n_parallel + 1; // it's probably best to keep as much precision as possible for the states From de50c549c4da0c60d3582f82660d3c534b4811ac Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 17 Feb 2024 20:30:29 -0500 Subject: [PATCH 22/41] mamba : reduce memory usage of ggml_ssm_scan From 290.37 MiB to 140.68 MiB of CPU compute buffer size with Mamba 3B with a batch size of 512. The result tensor of ggml_ssm_scan was previously a big part of the CPU compute buffer size. To make it smaller, it does not contain the intermediate ssm states anymore. Both y and the last ssm state are combined in the result tensor, because it seems only a single tensor can be returned by an operator with the way the graph is built. --- ggml.c | 84 +++++++++++++++++++++++++++++++++++-------------------- ggml.h | 3 +- llama.cpp | 33 ++++++++++------------ 3 files changed, 70 insertions(+), 50 deletions(-) diff --git a/ggml.c b/ggml.c index 09e77bb8c1f06..3fa290f6f8175 100644 --- a/ggml.c +++ b/ggml.c @@ -6087,14 +6087,15 @@ struct ggml_tensor * ggml_ssm_scan( struct ggml_tensor * x, struct ggml_tensor * dt, struct ggml_tensor * A, - struct ggml_tensor * B) { + struct ggml_tensor * B, + struct ggml_tensor * C) { GGML_ASSERT(ggml_is_contiguous(s)); GGML_ASSERT(ggml_is_contiguous(x)); GGML_ASSERT(ggml_is_contiguous(dt)); GGML_ASSERT(ggml_is_contiguous(A)); GGML_ASSERT(B->nb[0] == ggml_type_size(B->type)); + GGML_ASSERT(C->nb[0] == ggml_type_size(C->type)); GGML_ASSERT(ggml_are_same_shape(x, dt)); - GGML_ASSERT(ggml_is_matrix(s)); // the ssm_state should be 2D { const int64_t d_state = s->ne[0]; @@ -6106,6 +6107,8 @@ struct ggml_tensor * ggml_ssm_scan( GGML_ASSERT(A->ne[1] == d_inner); GGML_ASSERT(B->ne[0] == d_state); GGML_ASSERT(B->ne[1] == n_tokens); + GGML_ASSERT(C->ne[0] == d_state); + GGML_ASSERT(C->ne[1] == n_tokens); } bool is_node = false; @@ -6115,7 +6118,8 @@ struct ggml_tensor * ggml_ssm_scan( is_node = true; } - struct ggml_tensor * result = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, s->ne[0], s->ne[1], x->ne[1]); + // 2-in-1 concatenated y and ssm_states, {d_inner, n_tokens} with {d_state, d_inner, n_kv} + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, ggml_nelements(x) + ggml_nelements(s)); result->op = GGML_OP_SSM_SCAN; result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; @@ -6124,6 +6128,7 @@ struct ggml_tensor * ggml_ssm_scan( result->src[2] = dt; result->src[3] = A; result->src[4] = B; + result->src[5] = C; return result; } @@ -14650,6 +14655,7 @@ static void ggml_compute_forward_ssm_scan_f32( const struct ggml_tensor * src2, // dt const struct ggml_tensor * src3, // A const struct ggml_tensor * src4, // B + const struct ggml_tensor * src5, // C struct ggml_tensor * dst) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; @@ -14658,67 +14664,84 @@ static void ggml_compute_forward_ssm_scan_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t nc = src0->ne[0]; + const int64_t nc = src0->ne[0]; // d_state + const int64_t nr = src0->ne[1]; // d_inner const int64_t n_t = src1->ne[1]; // number of tokens in the batch - const int64_t nr0 = ggml_nrows(src0); - GGML_ASSERT(nc*n_t*nr0 == ggml_nelements(dst)); + GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst)); GGML_ASSERT(src0->nb[0] == sizeof(float)); GGML_ASSERT(src1->nb[0] == sizeof(float)); GGML_ASSERT(src2->nb[0] == sizeof(float)); GGML_ASSERT(src3->nb[0] == sizeof(float)); GGML_ASSERT(src4->nb[0] == sizeof(float)); - // allow merging multiple rows in the same vec operation + GGML_ASSERT(src5->nb[0] == sizeof(float)); + // required for the dot product between s and C GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); - GGML_ASSERT(src3->nb[1] == src3->ne[0]*sizeof(float)); + // required to get correct offset for state destination + GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float)); // rows per thread - const int dr = (nr0 + nth - 1)/nth; + const int dr = (nr + nth - 1)/nth; // row range for this thread const int ir0 = dr*ith; - const int ir1 = MIN(ir0 + dr, nr0); + const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - // first batch + // first token in the batch { - float * pdst = (float *) ((char *) dst->data + ir0*( dst->nb[1])); // {d_state, d_inner, n_tokens} - float * s = (float *) ((char *) src0->data + ir0*(src0->nb[1])); // {d_state, d_inner} - float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0])); // {d_inner, n_tokens} - float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0])); // {d_inner, n_tokens} - float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - float * B = (float *) ((char *) src4->data); // {d_state, n_tokens} + float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0])); // {d_inner, n_tokens} + float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + src1->nb[2]); // {d_state, d_inner, n_kv} + float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1])); // {d_state, d_inner, n_kv} + float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0])); // {d_inner, n_tokens} + float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0])); // {d_inner, n_tokens} + float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + float * B = (float *) ((char *) src4->data); // {d_state, n_tokens} + float * C = (float *) ((char *) src5->data); // {d_state, n_tokens} // d_inner for (int i1 = 0; i1 < ir; ++i1) { float dt_soft_plus = log1pf(expf(dt[i1])); float x_dt = x[i1] * dt_soft_plus; + float sumf = 0.0f; // d_state for (int i0 = 0; i0 < nc; ++i0) { int i = i0 + i1*nc; - // ssm_state * dA + dB * x - pdst[i] = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + // state = prev_state * dA + dB * x + float state = s0[i]*(expf(dt_soft_plus 
* A[i])) + (B[i0] * x_dt); + // y = rowwise_dotprod(state, C) + sumf += state*C[i0]; + // FIXME: handle simultaneous sequences + s[i] = state; } + y[i1] = sumf; } } - // compute state for rest of tokens, previous state comes from dest + // rest of the batch, state comes from previous one which was stored in destination for (int i2 = 1; i2 < n_t; ++i2) { - float * pdst = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + i2 *( dst->nb[2])); // {d_state, d_inner, n_tokens} - float * s = (float *) ((char *) dst->data + ir0*( dst->nb[1]) + (i2-1)*( dst->nb[2])); // {d_state, d_inner, n_tokens} - float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2 *(src1->nb[1])); // {d_inner, n_tokens} - float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2 *(src2->nb[1])); // {d_inner, n_tokens} - float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens} + float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} + float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + src1->nb[2]); // {d_state, d_inner, n_kv} + float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} + float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens} + float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens} + float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens} // d_inner for (int i1 = 0; i1 < ir; ++i1) { float dt_soft_plus = log1pf(expf(dt[i1])); float x_dt = x[i1] * dt_soft_plus; + float sumf = 0.0f; // d_state for (int i0 = 0; i0 < nc; ++i0) { int i = i0 + i1*nc; - // ssm_state * dA + dB * x - pdst[i] = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + // state = prev_state * dA + dB * x + float state = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + // y = rowwise_dotprod(state, C) + sumf += state*C[i0]; + // FIXME: handle simultaneous sequences + s[i] = state; } + y[i1] = sumf; } } } @@ -14730,11 +14753,12 @@ static void ggml_compute_forward_ssm_scan( const struct ggml_tensor * src2, const struct ggml_tensor * src3, const struct ggml_tensor * src4, + const struct ggml_tensor * src5, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_ssm_scan_f32(params, src0, src1, src2, src3, src4, dst); + ggml_compute_forward_ssm_scan_f32(params, src0, src1, src2, src3, src4, src5, dst); } break; default: { @@ -15796,7 +15820,7 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SSM_SCAN: { - ggml_compute_forward_ssm_scan(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor); + ggml_compute_forward_ssm_scan(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor->src[5], tensor); } break; case GGML_OP_WIN_PART: { diff --git a/ggml.h b/ggml.h index 092b86e487acc..fdf251911d41c 100644 --- a/ggml.h +++ b/ggml.h @@ -1708,7 +1708,8 @@ extern "C" { struct ggml_tensor * x, struct ggml_tensor * dt, struct ggml_tensor * A, - struct ggml_tensor * B); + struct ggml_tensor * B, + struct ggml_tensor * C); // partition into non-overlapping windows with padding if needed // example: diff --git a/llama.cpp b/llama.cpp 
index 3360eb3e65295..2613340ccbc04 100644 --- a/llama.cpp +++ b/llama.cpp @@ -8029,9 +8029,9 @@ struct llm_build_context { ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], (d_conv-1)*(d_inner), kv_self.size); ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], (d_state)*(d_inner), kv_self.size); + // clear states of sequences which are starting at the beginning of this batch { ggml_tensor * state_mask = ggml_view_2d(ctx0, lctx.inp_s_mask, 1, n_kv, lctx.inp_s_mask->nb[0], 0); - // clear states of sequences which are starting at the beginning of this batch conv_states = ggml_mul(ctx0, ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]), state_mask); @@ -8040,11 +8040,8 @@ struct llm_build_context { state_mask); } - // TODO: support more than one sequence per batch (these could then use ggml_reshape_3d) - ggml_tensor * conv_state = ggml_view_2d(ctx0, conv_states, d_conv - 1, d_inner, - (d_conv - 1)*ggml_element_size(conv_states), 0); - ggml_tensor * ssm_state = ggml_view_2d(ctx0, ssm_states, d_state, d_inner, - (d_state)*ggml_element_size(ssm_states), 0); + struct ggml_tensor * conv_state = ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv); + struct ggml_tensor * ssm_state = ggml_reshape_3d(ctx0, ssm_states, d_state, d_inner, n_kv); // norm cur = llm_build_norm(ctx0, inpL, hparams, @@ -8099,7 +8096,7 @@ struct llm_build_context { // ssm { - // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens} + // {d_inner, dt_rank + 2*d_state} * {d_inner, n_tokens} => {dt_rank + 2*d_state, n_tokens} struct ggml_tensor * x_db = ggml_mul_mat(ctx0, model.layers[il].ssm_x, x); // split struct ggml_tensor * dt = ggml_view_2d(ctx0, x_db, dt_rank, n_tokens, x_db->nb[1], 0); @@ -8110,22 +8107,20 @@ struct llm_build_context { dt = ggml_mul_mat(ctx0, model.layers[il].ssm_dt, dt); dt = ggml_add(ctx0, dt, model.layers[il].ssm_dt_b); - // Custom operator to implement some of the optimizations - // described in the Annex D of the Mamba paper. - // TODO: maybe also optimize step 4 of the Speed section of Annex D (the mul_mat with C) - // => {d_state, d_inner, n_tokens} - ssm_state = ggml_ssm_scan(ctx0, ssm_state, x, dt, model.layers[il].ssm_a, B); + // Custom operator to optimize the parallel associative scan + // as described in the Annex D of the Mamba paper. + // => {d_inner, n_tokens} and {d_state, d_inner, n_kv} combined, + // because only a single tensor can be returned. 
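Because a ggml operator can return only a single tensor, y and the updated states are packed back-to-back into one 1-D result and carved out again with views. A small sketch of the layout, in elements (helper name is mine; it mirrors the offsets used by the views in the hunk just below):

    // result layout, in elements:
    //   [ 0 .. d_inner*n_tokens )                        y          {d_inner, n_tokens}
    //   [ d_inner*n_tokens .. + d_state*d_inner*n_kv )   ssm states {d_state, d_inner, n_kv}
    static void split_ssm_scan_result(const float * result, int d_inner, int n_tokens,
                                      const float ** y, const float ** states) {
        *y      = result;
        *states = result + d_inner*n_tokens;
    }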
+ struct ggml_tensor * y_ssm_states = ggml_ssm_scan(ctx0, ssm_state, x, dt, model.layers[il].ssm_a, B, C); - // only store last state + // store last states (the second part of y_ssm_states) ggml_build_forward_expand(gf, ggml_cpy(ctx0, - ggml_view_2d(ctx0, ssm_state, d_state, d_inner, ssm_state->nb[1], (n_tokens-1)*ssm_state->nb[2]), - ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner, kv_self.head*d_state*d_inner*ggml_element_size(ssm_state)))); + ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*ggml_element_size(y_ssm_states)), + ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_self.head*d_state*d_inner*ggml_element_size(ssm_state)))); + + struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0); - // {d_state, d_inner, n_tokens} * {d_state, n_tokens} => {d_inner, 1, n_tokens} - struct ggml_tensor * y = ggml_mul_mat(ctx0, ssm_state, ggml_permute(ctx0, C, 0, 2, 1, 3)); - // => {d_inner, n_tokens} - y = ggml_permute(ctx0, y, 0, 2, 1, 3); // {d_inner, n_tokens} * {d_inner} => {d_inner, n_tokens} y = ggml_add(ctx0, y, ggml_mul(ctx0, x, model.layers[il].ssm_d)); y = ggml_mul(ctx0, y, ggml_silu(ctx0, z)); From 9473ec2147f06f1c9cdd44c0caac5811493abc68 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 18 Feb 2024 20:57:30 -0500 Subject: [PATCH 23/41] mamba : simultaneous sequence processing A batch can now contain tokens from multiple sequences. This is necessary for at least the parallel example, the server example, and the HellaSwag test in the perplexity example. However, for this to be useful, uses of llama_kv_cache_seq_rm/cp will need to be changed to work on whole sequences. * ggml : add ggml_ssm_conv as a new operator for the conv step of Mamba This operator makes it possible to use and update the correct states for each token of the batch in the same way as ggml_ssm_scan. Other solutions which use existing operators would need loops which would add too many nodes to the graph (at least the ones I thought of). Using this operator further reduces the size of the CPU compute buffer from 140.68 MiB to 103.20 MiB with Mamba 3B with a batch size of 512. And (at least on CPU), it's a bit faster than before. Note that "ggml_ssm_conv" is probably not the most appropriate name, and it could be changed if a better one is found. * llama : add inp_s_seq as a new input tensor The most convenient implementation to select the correct state (for Mamba) for each token is to directly get the correct index from a tensor. This is why inp_s_seq is storing int32_t and not floats. The other, less convenient way to select the correct state would be to have inp_KQ_mask contain 1.0f for each state used by a token and 0.0f otherwise. This complicates quickly fetching the first used state of a token, and is also less efficient because a whole row of the mask would always need to be read for each token. Using indexes makes it easy to stop searching when there are no more sequences for a token, and the first sequence assigned is always very quickly available (it's the first element of each row). 
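The inp_s_seq tensor described above is an I32 matrix of shape {n_kv, n_tokens}: one row per token, listing which states that token belongs to. Hypothetical contents (the values below are mine, purely for illustration):

    // n_kv = 4 cells, batch of 3 tokens; entry 0 of a row is the state the token
    // reads from and writes to, later entries are extra states that receive a
    // copy of the result, and a negative value ends the list early.
    static const int32_t inp_s_seq_example[3][4] = {
        { 1, -1, -1, -1 },  // token of sequence 1 only
        { 1, -1, -1, -1 },  // token of sequence 1 only
        { 0,  2,  3, -1 },  // shared prompt token: state 0 is updated, then copied to 2 and 3
    };

Both ggml_ssm_conv and ggml_ssm_scan read row t of this matrix to pick the source/destination state for token t and to duplicate the updated state into the other listed cells.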
--- ggml.c | 292 +++++++++++++++++++++++++++++++++++++++++++++--------- ggml.h | 11 +- llama.cpp | 138 +++++++++++++++----------- 3 files changed, 331 insertions(+), 110 deletions(-) diff --git a/ggml.c b/ggml.c index 3fa290f6f8175..981a2302a41c1 100644 --- a/ggml.c +++ b/ggml.c @@ -1828,6 +1828,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "FLASH_ATTN", "FLASH_FF", "FLASH_ATTN_BACK", + "SSM_CONV", "SSM_SCAN", "WIN_PART", "WIN_UNPART", @@ -1851,7 +1852,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { "CROSS_ENTROPY_LOSS_BACK", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "none", @@ -1915,6 +1916,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "flash_attn(x)", "flash_ff(x)", "flash_attn_back(x)", + "ssm_conv(x)", "ssm_scan(x)", "win_part(x)", "win_unpart(x)", @@ -1938,7 +1940,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { "cross_entropy_loss_back(x,y)", }; -static_assert(GGML_OP_COUNT == 73, "GGML_OP_COUNT != 73"); +static_assert(GGML_OP_COUNT == 74, "GGML_OP_COUNT != 74"); static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); @@ -6079,6 +6081,51 @@ struct ggml_tensor * ggml_flash_attn_back( return result; } +// ggml_ssm_conv + +struct ggml_tensor * ggml_ssm_conv( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * x, + struct ggml_tensor * c, + struct ggml_tensor * sq) { + GGML_ASSERT(ggml_is_3d(s)); + GGML_ASSERT(ggml_is_matrix(x)); + GGML_ASSERT(ggml_is_matrix(c)); + GGML_ASSERT(ggml_is_matrix(sq)); + GGML_ASSERT(sq->type == GGML_TYPE_I32); + + const int64_t d_conv = c->ne[0]; + const int64_t d_inner = c->ne[1]; + const int64_t n_tokens = x->ne[1]; + const int64_t n_kv = s->ne[2]; + + GGML_ASSERT( s->ne[0] == d_conv - 1); + GGML_ASSERT( s->ne[1] == d_inner); + GGML_ASSERT( x->ne[0] == d_inner); + GGML_ASSERT(sq->ne[0] == n_kv); + GGML_ASSERT(sq->ne[1] == n_tokens); + + bool is_node = false; + + if (s->grad || x->grad || c->grad || sq->grad) { + GGML_ASSERT(false); // TODO: implement + is_node = true; + } + + // 2-in-1 concatenated x and conv_states, {d_inner, n_tokens} with {d_conv, d_inner, n_kv} + struct ggml_tensor * result = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, (d_inner*n_tokens) + (d_conv*d_inner*n_kv)); + + result->op = GGML_OP_SSM_CONV; + result->grad = is_node ? 
ggml_dup_tensor(ctx, result) : NULL; + result->src[0] = s; + result->src[1] = x; + result->src[2] = c; + result->src[3] = sq; + + return result; +} + // ggml_ssm_scan struct ggml_tensor * ggml_ssm_scan( @@ -6088,11 +6135,13 @@ struct ggml_tensor * ggml_ssm_scan( struct ggml_tensor * dt, struct ggml_tensor * A, struct ggml_tensor * B, - struct ggml_tensor * C) { + struct ggml_tensor * C, + struct ggml_tensor * sq) { GGML_ASSERT(ggml_is_contiguous(s)); GGML_ASSERT(ggml_is_contiguous(x)); GGML_ASSERT(ggml_is_contiguous(dt)); GGML_ASSERT(ggml_is_contiguous(A)); + GGML_ASSERT(sq->type == GGML_TYPE_I32); GGML_ASSERT(B->nb[0] == ggml_type_size(B->type)); GGML_ASSERT(C->nb[0] == ggml_type_size(C->type)); GGML_ASSERT(ggml_are_same_shape(x, dt)); @@ -6113,7 +6162,7 @@ struct ggml_tensor * ggml_ssm_scan( bool is_node = false; - if (s->grad || x->grad || dt->grad || A->grad || B->grad) { + if (s->grad || x->grad || dt->grad || A->grad || B->grad || C->grad || sq->grad) { GGML_ASSERT(false); // TODO: implement is_node = true; } @@ -6129,6 +6178,7 @@ struct ggml_tensor * ggml_ssm_scan( result->src[3] = A; result->src[4] = B; result->src[5] = C; + result->src[6] = sq; return result; } @@ -14646,6 +14696,135 @@ static void ggml_compute_forward_flash_attn_back( } } +// ggml_compute_forward_ssm_conv + +static void ggml_compute_forward_ssm_conv_f32( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, // conv_state + const struct ggml_tensor * src1, // x + const struct ggml_tensor * src2, // conv1d.weight + const struct ggml_tensor * src3, // state_seq + struct ggml_tensor * dst) { + if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { + return; + } + + const int ith = params->ith; + const int nth = params->nth; + + const int nc = src2->ne[0]; // d_conv + const int nr = src0->ne[1]; // d_inner + const int n_t = src1->ne[1]; // n_tokens + const int n_kv = src0->ne[2]; // max number of sequences in the batch + + GGML_ASSERT((nr*n_t) + (nc*nr*n_kv) == ggml_nelements(dst)); + GGML_ASSERT(src0->nb[0] == sizeof(float)); + GGML_ASSERT(src1->nb[0] == sizeof(float)); + GGML_ASSERT(src2->nb[0] == sizeof(float)); + GGML_ASSERT(src3->nb[0] == sizeof(int32_t)); + GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); + // for use with the destination state offset between sequences + GGML_ASSERT(src2->nb[2] == src2->ne[1]*src2->ne[0]*sizeof(float)); + + // rows per thread + const int dr = (nr + nth - 1)/nth; + + // row range for this thread + const int ir0 = dr*ith; + const int ir1 = MIN(ir0 + dr, nr); + const int ir = ir1 - ir0; + + if (n_kv > 1) { + // multiple sequences means it's hard to know when it's the first time a state is read, + // so copy them all over to the destination, just to be sure. 
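Per token and per channel, the work done below is just a width-d_conv shift register followed by a dot product with that channel's filter. A single-sequence scalar sketch (helper name is mine; the cache itself only persists the last d_conv-1 columns, since the newest column arrives with the token):

    // state: [d_conv] sliding window of the most recent inputs for one channel
    // w    : [d_conv] conv1d weights for that channel
    static float conv_step_one_channel(int d_conv, float * state, const float * w, float x_new) {
        for (int k = 0; k < d_conv - 1; ++k) {
            state[k] = state[k + 1];   // shift the window left by one column
        }
        state[d_conv - 1] = x_new;     // append the incoming value
        float sum = 0.0f;
        for (int k = 0; k < d_conv; ++k) {
            sum += state[k] * w[k];    // row-wise dot product with the filter
        }
        return sum;
    }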
+ for (int i3 = 0; i3 < n_kv; ++i3) { + float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); + float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + i3*(src2->nb[2]) + nr*n_t*sizeof(float)); + // can't use memcpy because of d_conv vs d_conv - 1 + for (int i1 = 0; i1 < ir; ++i1) { + for (int i0 = 0; i0 < nc - 1; ++i0) { + // copy s0 to last (d_conv - 1) columns of s + s[1 + i0 + i1*nc] = s0[i0 + i1*(nc - 1)]; + } + } + } + } + + for (int i2 = 0; i2 < n_t; ++i2) { + int32_t * sq = (int32_t *) ((char *) src3->data + i2*(src3->nb[1])); // {n_kv, n_tokens} + float * x = (float *) ((char *) dst->data + ir0*sizeof(float) + i2*(nr*sizeof(float))); // {d_inner, n_tokens} + float * s = (float *) ((char *) dst->data + ir0*(src2->nb[1]) + sq[0]*(src2->nb[2]) + nr*n_t*sizeof(float)); // {d_conv, d_inner, n_kv} + float * s0; // {d_conv - 1, d_inner, n_kv} + float * x0 = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} + float * c = (float *) ((char *) src2->data + ir0*(src2->nb[1])); // {d_conv, d_inner} + int ne0s0; + + GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv); + + // avoid needing to copy the state for the first token + if (i2 == 0) { + s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_conv - 1, d_inner, n_kv} + ne0s0 = src0->ne[0]; + } else { + // the source is the last (d_conv - 1) columns of the destination + s0 = s + 1; + ne0s0 = nc; + } + + // d_inner + for (int i1 = 0; i1 < ir; ++i1) { + // shift state left + for (int i0 = 0; i0 < nc - 1; ++i0) { + s[i0 + i1*nc] = s0[i0 + i1*ne0s0]; + } + // insert x on the last column + s[(nc - 1) + i1*nc] = x0[i1]; + } + + // handle copies when there are multiple output states + for (int i3 = 1; i3 < n_kv; ++i3) { + int32_t seq = sq[i3]; + if (0 <= seq && seq < n_kv) { + float * s1 = s + (seq - sq[0])*nc*nr; + memcpy(s1, s, nc*ir*sizeof(float)); + } else { + // stop at negative or too big seq_ids + break; + } + } + + // it seems a little faster when this is separate from the state shift + for (int i1 = 0; i1 < ir; ++i1) { + // rowwise dot product + float sumf = 0.0f; + for (int i0 = 0; i0 < nc; ++i0) { + int i = i0 + i1*nc; + sumf += s[i] * c[i]; + } + x[i1] = sumf; + } + } +} + +static void ggml_compute_forward_ssm_conv( + const struct ggml_compute_params * params, + const struct ggml_tensor * src0, + const struct ggml_tensor * src1, + const struct ggml_tensor * src2, + const struct ggml_tensor * src3, + struct ggml_tensor * dst) { + switch (src0->type) { + case GGML_TYPE_F32: + { + ggml_compute_forward_ssm_conv_f32(params, src0, src1, src2, src3, dst); + } break; + default: + { + GGML_ASSERT(false); + } break; + } +} + // ggml_compute_forward_ssm_scan static void ggml_compute_forward_ssm_scan_f32( @@ -14656,6 +14835,7 @@ static void ggml_compute_forward_ssm_scan_f32( const struct ggml_tensor * src3, // A const struct ggml_tensor * src4, // B const struct ggml_tensor * src5, // C + const struct ggml_tensor * src6, // sq struct ggml_tensor * dst) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; @@ -14664,9 +14844,10 @@ static void ggml_compute_forward_ssm_scan_f32( const int ith = params->ith; const int nth = params->nth; - const int64_t nc = src0->ne[0]; // d_state - const int64_t nr = src0->ne[1]; // d_inner - const int64_t n_t = src1->ne[1]; // number of tokens in the batch + const int64_t nc = src0->ne[0]; // d_state + const int64_t nr = src0->ne[1]; // d_inner + const int64_t n_t = 
src1->ne[1]; // number of tokens in the batch + const int64_t n_kv = src0->ne[2]; // max number of sequences in the batch GGML_ASSERT(ggml_nelements(src1) + ggml_nelements(src0) == ggml_nelements(dst)); GGML_ASSERT(src0->nb[0] == sizeof(float)); @@ -14675,9 +14856,11 @@ static void ggml_compute_forward_ssm_scan_f32( GGML_ASSERT(src3->nb[0] == sizeof(float)); GGML_ASSERT(src4->nb[0] == sizeof(float)); GGML_ASSERT(src5->nb[0] == sizeof(float)); - // required for the dot product between s and C + // required for the dot product between s and C, and when copying the states GGML_ASSERT(src0->nb[1] == src0->ne[0]*sizeof(float)); - // required to get correct offset for state destination + // required for per-sequence offsets for states + GGML_ASSERT(src0->nb[2] == src0->ne[0]*src0->ne[1]*sizeof(float)); + // required to get correct offset for state destination (i.e. src1->nb[2]) GGML_ASSERT(src1->nb[2] == src1->ne[0]*src1->ne[1]*sizeof(float)); // rows per thread @@ -14688,44 +14871,37 @@ static void ggml_compute_forward_ssm_scan_f32( const int ir1 = MIN(ir0 + dr, nr); const int ir = ir1 - ir0; - // first token in the batch - { - float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0])); // {d_inner, n_tokens} - float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + src1->nb[2]); // {d_state, d_inner, n_kv} - float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1])); // {d_state, d_inner, n_kv} - float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0])); // {d_inner, n_tokens} - float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0])); // {d_inner, n_tokens} - float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - float * B = (float *) ((char *) src4->data); // {d_state, n_tokens} - float * C = (float *) ((char *) src5->data); // {d_state, n_tokens} - // d_inner - for (int i1 = 0; i1 < ir; ++i1) { - float dt_soft_plus = log1pf(expf(dt[i1])); - float x_dt = x[i1] * dt_soft_plus; - float sumf = 0.0f; - // d_state - for (int i0 = 0; i0 < nc; ++i0) { - int i = i0 + i1*nc; - // state = prev_state * dA + dB * x - float state = s0[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); - // y = rowwise_dotprod(state, C) - sumf += state*C[i0]; - // FIXME: handle simultaneous sequences - s[i] = state; - } - y[i1] = sumf; + if (n_kv > 1) { + // it's hard to know if the source states have already been copied + // when there are multiple, so copy them already. 
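        // The per-token loop further below then evaluates the discretized selective-scan
        // recurrence for each inner channel i1 and state dimension i0:
        //   dt'      = softplus(dt[i1]) = log(1 + exp(dt[i1]))
        //   s[i0,i1] = s[i0,i1] * exp(dt' * A[i0,i1]) + B[i0] * (x[i1] * dt')
        //   y[i1]    = sum_i0 s[i0,i1] * C[i0]
        // i.e. state = prev_state * dA + dB * x, followed by a row-wise dot product with C.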
+ for (int i3 = 0; i3 < n_kv; ++i3) { + float * s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + i3*(src0->nb[2])); + float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + i3*(src0->nb[2]) + src1->nb[2]); + memcpy(s, s0, nc*ir*sizeof(float)); } } - // rest of the batch, state comes from previous one which was stored in destination - for (int i2 = 1; i2 < n_t; ++i2) { - float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} - float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + src1->nb[2]); // {d_state, d_inner, n_kv} - float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} - float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens} - float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} - float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens} - float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens} + for (int i2 = 0; i2 < n_t; ++i2) { + int32_t * sq = (int32_t *) ((char *) src6->data + i2*(src6->nb[1])); // {n_kv, n_tokens} + float * y = (float *) ((char *) dst->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} + float * s = (float *) ((char *) dst->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2]) + src1->nb[2]); // {d_state, d_inner, n_kv} + float * s0; + float * x = (float *) ((char *) src1->data + ir0*(src1->nb[0]) + i2*(src1->nb[1])); // {d_inner, n_tokens} + float * dt = (float *) ((char *) src2->data + ir0*(src2->nb[0]) + i2*(src2->nb[1])); // {d_inner, n_tokens} + float * A = (float *) ((char *) src3->data + ir0*(src3->nb[1])); // {d_state, d_inner} + float * B = (float *) ((char *) src4->data + i2*(src4->nb[1])); // {d_state, n_tokens} + float * C = (float *) ((char *) src5->data + i2*(src5->nb[1])); // {d_state, n_tokens} + + GGML_ASSERT(0 <= sq[0] && sq[0] < n_kv); + + // avoid needing to copy the state for the first token + if (i2 == 0) { + s0 = (float *) ((char *) src0->data + ir0*(src0->nb[1]) + sq[0]*(src0->nb[2])); // {d_state, d_inner, n_kv} + } else { + // otherwise the source is the same as the destination + s0 = s; + } + // d_inner for (int i1 = 0; i1 < ir; ++i1) { float dt_soft_plus = log1pf(expf(dt[i1])); @@ -14735,14 +14911,25 @@ static void ggml_compute_forward_ssm_scan_f32( for (int i0 = 0; i0 < nc; ++i0) { int i = i0 + i1*nc; // state = prev_state * dA + dB * x - float state = s[i]*(expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); + float state = (s0[i] * expf(dt_soft_plus * A[i])) + (B[i0] * x_dt); // y = rowwise_dotprod(state, C) - sumf += state*C[i0]; - // FIXME: handle simultaneous sequences + sumf += state * C[i0]; s[i] = state; } y[i1] = sumf; } + + // handle copies when there are multiple output states + for (int i3 = 1; i3 < n_kv; ++i3) { + int32_t seq = sq[i3]; + if (0 <= seq && seq < n_kv) { + float * s1 = s + (seq - sq[0])*nc*nr; + memcpy(s1, s, nc*ir*sizeof(float)); + } else { + // stop at negative or too big seq_ids + break; + } + } } } @@ -14754,11 +14941,12 @@ static void ggml_compute_forward_ssm_scan( const struct ggml_tensor * src3, const struct ggml_tensor * src4, const struct ggml_tensor * src5, + const struct ggml_tensor * src6, struct ggml_tensor * dst) { switch (src0->type) { case GGML_TYPE_F32: { - ggml_compute_forward_ssm_scan_f32(params, src0, src1, src2, src3, src4, src5, dst); + ggml_compute_forward_ssm_scan_f32(params, src0, src1, src2, src3, src4, 
src5, src6, dst); } break; default: { @@ -15818,9 +16006,13 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm bool masked = t != 0; ggml_compute_forward_flash_attn_back(params, masked, tensor); } break; + case GGML_OP_SSM_CONV: + { + ggml_compute_forward_ssm_conv(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor); + } break; case GGML_OP_SSM_SCAN: { - ggml_compute_forward_ssm_scan(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor->src[5], tensor); + ggml_compute_forward_ssm_scan(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor->src[5], tensor->src[6], tensor); } break; case GGML_OP_WIN_PART: { @@ -16868,6 +17060,7 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor { GGML_ASSERT(false); // not supported } break; + case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: { GGML_ASSERT(false); // TODO: not implemented @@ -17569,6 +17762,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { { n_tasks = n_threads; } break; + case GGML_OP_SSM_CONV: case GGML_OP_SSM_SCAN: { n_tasks = n_threads; diff --git a/ggml.h b/ggml.h index fdf251911d41c..6d5cf76962f5d 100644 --- a/ggml.h +++ b/ggml.h @@ -460,6 +460,7 @@ extern "C" { GGML_OP_FLASH_ATTN, GGML_OP_FLASH_FF, GGML_OP_FLASH_ATTN_BACK, + GGML_OP_SSM_CONV, GGML_OP_SSM_SCAN, GGML_OP_WIN_PART, GGML_OP_WIN_UNPART, @@ -1702,6 +1703,13 @@ extern "C" { struct ggml_tensor * c0, struct ggml_tensor * c1); + GGML_API struct ggml_tensor * ggml_ssm_conv( + struct ggml_context * ctx, + struct ggml_tensor * s, + struct ggml_tensor * x, + struct ggml_tensor * c, + struct ggml_tensor * sq); + GGML_API struct ggml_tensor * ggml_ssm_scan( struct ggml_context * ctx, struct ggml_tensor * s, @@ -1709,7 +1717,8 @@ extern "C" { struct ggml_tensor * dt, struct ggml_tensor * A, struct ggml_tensor * B, - struct ggml_tensor * C); + struct ggml_tensor * C, + struct ggml_tensor * sq); // partition into non-overlapping windows with padding if needed // example: diff --git a/llama.cpp b/llama.cpp index 2613340ccbc04..ad03226745f26 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2044,6 +2044,7 @@ struct llama_context { struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] struct ggml_tensor * inp_s_mask; // F32 [kv_size] (only used by constant state models like Mamba) + struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] #ifdef GGML_USE_MPI ggml_mpi_context * ctx_mpi = NULL; @@ -4761,8 +4762,9 @@ static bool llm_load_tensors( const int64_t d_conv = hparams.n_embd_head_k + 1; const int64_t d_state = hparams.n_embd_head_v; const int64_t d_inner = hparams.n_head; - // FIXME: ceiling instead of floor - const int64_t dt_rank = n_embd / 16; + // TODO: allow loading dt_rank from the model config + // ceiling division + const int64_t dt_rank = (n_embd / 16) + (n_embd % 16 > 0); GGML_ASSERT(2 * n_embd == d_inner); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -8012,13 +8014,12 @@ struct llm_build_context { GGML_ASSERT(2 * d_model == d_inner); const int64_t d_conv = n_embd_head_k + 1; const int64_t d_state = n_embd_head_v; - const int64_t dt_rank = d_model / 16; + // ceiling division + const int64_t dt_rank = (d_model / 16) + (d_model % 16 > 0); struct ggml_tensor * cur; struct ggml_tensor * inpL; - GGML_ASSERT(kv_self.used - kv_self.head + 1 == 1); // TODO: support more than one sequence 
per batch - // {n_embd, n_tokens} inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); @@ -8040,8 +8041,8 @@ struct llm_build_context { state_mask); } - struct ggml_tensor * conv_state = ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv); - struct ggml_tensor * ssm_state = ggml_reshape_3d(ctx0, ssm_states, d_state, d_inner, n_kv); + conv_states = ggml_reshape_3d(ctx0, conv_states, d_conv - 1, d_inner, n_kv); + ssm_states = ggml_reshape_3d(ctx0, ssm_states, d_state, d_inner, n_kv); // norm cur = llm_build_norm(ctx0, inpL, hparams, @@ -8056,37 +8057,31 @@ struct llm_build_context { struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner); + struct ggml_tensor * state_seq = ggml_view_2d(ctx0, lctx.inp_s_seq, n_kv, n_tokens, n_kv*ggml_element_size(lctx.inp_s_seq), 0); + // conv { - // concat last (d_conv - 1) columns of conv_state, and x - - // The following tensor is too big in order to avoid an assertion error when making an overlapping view. - // TODO: in ggml_new_tensor_impl, handle overlapping data range in data size calculation - // This could then be a tensor with ne[] = {(d_conv-1)+n_tokens, d_inner}, - // but the size difference is not that big (d_conv is usually 4). - struct ggml_tensor * conv_x = ggml_new_tensor_1d(ctx0, conv_state->type, d_conv*d_inner*n_tokens); - const size_t conv_x_nb1 = (d_conv - 1 + n_tokens) * ggml_element_size(conv_x); - - conv_x = ggml_set_2d(ctx0, conv_x, conv_state, conv_x_nb1, 0); - // making x contiguous is necessary because ggml_set expects it - conv_x = ggml_set_2d(ctx0, conv_x, ggml_cont(ctx0, ggml_transpose(ctx0, x)), conv_x_nb1, (d_conv - 1)*ggml_element_size(conv_x)); - - // store last (d_conv - 1) columns of conv_x back into the KV cache for the next conv_state + // Custom operator which is needed only to ease simultaneous sequence processing. + // For a single sequence, the equivalent is to concatenate the columns of conv_states and x, + // then make a self-overlapping view of that over d_conv columns at each stride in the 3rd dimension, + // then element-wise multiply that with the conv1d weigth, + // then sum the elements of each row, + // (the last two steps are a dot product over rows (also doable with mul_mat)) + // then permute away the ne[0] dimension, + // and then you're left with the resulting x tensor. + // The new conv_states is the last (d_conv - 1) columns + // of the last 3rd dimensional "layer" of the self-overlapping view. + // For simultaneous sequences, it's more complicated. + struct ggml_tensor * x_conv = ggml_ssm_conv(ctx0, conv_states, x, model.layers[il].ssm_conv1d, state_seq); + + // store last (d_conv - 1) columns of the conv_state part of x_conv back into the KV cache ggml_build_forward_expand(gf, ggml_cpy(ctx0, - ggml_view_2d(ctx0, conv_x, d_conv - 1, d_inner, conv_x_nb1, n_tokens*ggml_element_size(conv_x)), - ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner), kv_self.head*(d_conv - 1)*(d_inner)*ggml_element_size(conv_x)))); - - // prepare convolution for all tokens in the batch with a self-overlapping view, - // shifting by one column each ... depth? ... with a window of d_conv columns. 
- // {(d_conv-1)+n_tokens, d_inner} => {d_conv, d_inner, n_tokens} - conv_x = ggml_view_3d(ctx0, conv_x, d_conv, d_inner, n_tokens, conv_x_nb1, 1*ggml_element_size(conv_x), 0); + ggml_view_2d(ctx0, x_conv, d_conv - 1, d_inner*n_kv, d_conv*ggml_element_size(x_conv), (1+d_inner*n_tokens)*ggml_element_size(x_conv)), + ggml_view_1d(ctx0, kv_self.k_l[il], (d_conv - 1)*(d_inner)*(n_kv), kv_self.head*(d_conv - 1)*(d_inner)*ggml_element_size(x_conv)))); - // perform convolution - // => {1, d_inner, n_tokens} - x = ggml_sum_rows(ctx0, ggml_mul(ctx0, conv_x, model.layers[il].ssm_conv1d)); - // => {d_inner, n_tokens, 1} - x = ggml_permute(ctx0, x, 2, 0, 1, 3); + // extract x from x_conv + x = ggml_view_2d(ctx0, x_conv, d_inner, n_tokens, d_inner*ggml_element_size(x_conv), 0); // bias x = ggml_add(ctx0, x, model.layers[il].ssm_conv1d_b); @@ -8111,13 +8106,13 @@ struct llm_build_context { // as described in the Annex D of the Mamba paper. // => {d_inner, n_tokens} and {d_state, d_inner, n_kv} combined, // because only a single tensor can be returned. - struct ggml_tensor * y_ssm_states = ggml_ssm_scan(ctx0, ssm_state, x, dt, model.layers[il].ssm_a, B, C); + struct ggml_tensor * y_ssm_states = ggml_ssm_scan(ctx0, ssm_states, x, dt, model.layers[il].ssm_a, B, C, state_seq); // store last states (the second part of y_ssm_states) ggml_build_forward_expand(gf, ggml_cpy(ctx0, ggml_view_1d(ctx0, y_ssm_states, d_state*d_inner*n_kv, d_inner*n_tokens*ggml_element_size(y_ssm_states)), - ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_self.head*d_state*d_inner*ggml_element_size(ssm_state)))); + ggml_view_1d(ctx0, kv_self.v_l[il], d_state*d_inner*n_kv, kv_self.head*d_state*d_inner*ggml_element_size(ssm_states)))); struct ggml_tensor * y = ggml_view_2d(ctx0, y_ssm_states, d_inner, n_tokens, d_inner*ggml_element_size(y_ssm_states), 0); @@ -8362,7 +8357,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { float * data = (float *) lctx.inp_KQ_mask->data; - // For Transformers, use only the previous KV cells + // For Transformers, use only the previous KV cells (or all, when non-causal) // of the correct sequence for each token of the batch. // It's assumed that if a token in the batch has multiple sequences, they are equivalent. for (int h = 0; h < 1; ++h) { @@ -8382,13 +8377,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } } - // For Mamba (and other constant-time-and-size architectures), - // update the correct state(s)/sequence(s) for each token of the batch. - // Source and destination states are both the same for the sake of implementation simplicity. - // It would be more complex if they were sometimes the same and somtimes not. 
- // (with Transformers, source KV cells are never the destination, - // which is also simpler, but more memory hungry) - // TODO: implement } if (hparams.need_kq_pos) { @@ -8447,28 +8435,54 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } if (kv_self.unlimited) { - const uint32_t kv_size = kv_self.size; - const uint32_t n_kv = kv_self.n; + const int64_t kv_size = kv_self.size; + const int64_t n_kv = kv_self.n; - GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); - float * data = (float *) lctx.inp_s_mask->data; + { + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); + float * data = (float *) lctx.inp_s_mask->data; - // states which are not affected by the current batch are left untouched - for (uint32_t i = 0; i < n_kv; ++i) { - llama_seq_id seq_id = i + lctx.kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id]; - bool has_self_seq = kv_cell.has_seq_id(seq_id); + // states which are not affected by the current batch are left untouched + for (int i = 0; i < n_kv; ++i) { + llama_seq_id seq_id = i + lctx.kv_self.head; + llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id]; + bool has_self_seq = kv_cell.has_seq_id(seq_id); + + data[i] = (float) has_self_seq; + + // ensure current sequences will be kept + if (!has_self_seq) { + kv_cell.seq_id.insert(seq_id); + } + } + } + // For Mamba (and other constant-time-and-size architectures), + // update the correct state(s)/sequence(s) for each token of the batch. + // Like with the KQ_mask, if a token in the batch has multiple sequences, + // they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv). + { + const int64_t n_tokens = batch.n_tokens; + + GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_seq->buffer)); + int32_t * data = (int32_t *) lctx.inp_s_seq->data; - data[i] = (float) has_self_seq; + for (int j = 0; j < n_tokens; ++j) { + const int32_t n_seq = batch.n_seq_id[j]; + GGML_ASSERT(0 < n_seq); // a token should be part of at least 1 sequence - // ensure current sequences will be kept - if (!has_self_seq) { - kv_cell.seq_id.insert(seq_id); + for (int i = 0; i < n_kv; ++i) { + if (i < n_seq) { + // for this type of model, the head is the minimum seq_id of the batch + data[j*n_kv + i] = batch.seq_id[j][i] - kv_self.head; + } else { + data[j*n_kv + i] = -1; + } + } } } // remove extraneous seq_ids when state copies are made { - for (uint32_t i = 0; i < kv_size; ++i) { + for (int i = 0; i < kv_size; ++i) { llama_kv_cell & kv_cell = lctx.kv_self.cells[i]; uint32_t n_seqs = kv_cell.seq_id.size(); bool has_self_seq = kv_cell.has_seq_id(i); @@ -12642,7 +12656,7 @@ struct llama_context * llama_new_context_with_model( // graph inputs { ggml_init_params init_params = { - /* .mem_size */ ggml_tensor_overhead()*(8 + ctx->kv_self.unlimited), + /* .mem_size */ ggml_tensor_overhead()*(8 + 2*(ctx->kv_self.unlimited)), /* .mem_buffer */ nullptr, /* .no_alloc */ true, }; @@ -12656,8 +12670,10 @@ struct llama_context * llama_new_context_with_model( ctx->inp_K_shift = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - if (ctx->kv_self.unlimited) + if (ctx->kv_self.unlimited) { ctx->inp_s_mask = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); + ctx->inp_s_seq = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_I32, kv_size, cparams.n_batch); + } 
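        // inp_s_seq layout (I32 [kv_size, n_batch]): column j lists, for token j, the target
        // state cells as (seq_id - kv_self.head), padded with -1 after batch.n_seq_id[j] entries;
        // e.g. a token belonging only to seq 2, in a batch whose smallest seq_id is 1, gets a
        // column starting with { 1, -1, ... }.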
ggml_set_name(ctx->inp_tokens, "inp_tokens"); ggml_set_name(ctx->inp_embd, "inp_embd"); @@ -12667,8 +12683,10 @@ struct llama_context * llama_new_context_with_model( ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); ggml_set_name(ctx->inp_mean, "inp_mean"); ggml_set_name(ctx->inp_cls, "inp_cls"); - if (ctx->kv_self.unlimited) + if (ctx->kv_self.unlimited) { ggml_set_name(ctx->inp_s_mask, "inp_s_mask"); + ggml_set_name(ctx->inp_s_seq, "inp_s_seq"); + } ctx->buf_input = ggml_backend_alloc_ctx_tensors_from_buft(ctx->ctx_input, llama_default_buffer_type_cpu(true)); LLAMA_LOG_INFO("%s: %10s input buffer size = %8.2f MiB\n", __func__, From 3dcf79824d33806b3462aa4b55ebef5477aa1a81 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 25 Feb 2024 09:51:49 -0500 Subject: [PATCH 24/41] mamba : support llama_kv_cache_seq_cp copy chains * mamba : support shifting and dividing the kv cache pos --- llama.cpp | 69 +++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 35 deletions(-) diff --git a/llama.cpp b/llama.cpp index ad03226745f26..7281972b39775 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2184,18 +2184,17 @@ static bool llama_kv_cache_find_slot( } // Assuming the tokens are in-order if (batch.pos[i] != cache.cells[seq_id].pos + 1) { - // What should happen when the pos backtracks? + // What should happen when the pos backtracks or skips a value? // Clearing the state mid-batch would require special-casing which isn't done. - LLAMA_LOG_ERROR("%s: non-consecutive token position %d after %d for sequence %d\n", + LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n", __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id); - return false; } cache.cells[seq_id].pos = batch.pos[i]; - // NOTE: seq_ids are not inserted here, because they are handled when the graph is built. + // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set } else { // too big seq_id // TODO: would it be possible to resize the KV cache size instead? - LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d\n", __func__, seq_id, cache.size); + LLAMA_LOG_ERROR("%s: seq_id=%d >= kv_size=%d Try using a bigger --parallel value\n", __func__, seq_id, cache.size); return false; } } @@ -2326,24 +2325,26 @@ static void llama_kv_cache_seq_cp( if (p1 < 0) p1 = std::numeric_limits::max(); if (cache.unlimited) { - if ((uint32_t)seq_id_dst < cache.size && (uint32_t)seq_id_src < cache.size) { - // intent to "copy from" (does not support copy chains) + if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { + seq_id_src = cache.cells[seq_id_src].delta; + GGML_ASSERT((uint32_t) seq_id_src < cache.size); + // intent to "copy from" + // supports copy chains thanks to taking the source of the source cache.cells[seq_id_dst].delta = seq_id_src; - // NOTE: a sequence can't have multiple sources, but can have multiple destinations. - // For compatibility with the other KV cache API functions, - // the seq_id(s) of a cell suggests an intent to "copy to" those id(s), - // so that when a sequence is copied, it can initially be found from the source cell. 
- cache.cells[seq_id_src].seq_id.insert(seq_id_dst); - // prevent the destination from getting cleared - cache.cells[seq_id_dst].seq_id.insert(seq_id_dst); + + // prevent the destination from getting cleared if the source is not empty + if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) { + cache.cells[seq_id_dst].seq_id.insert(seq_id_dst); + } // repurposed as a "need copy" flag // (shifting can't be done anyway for this kind of KV cache) - cache.has_shift = seq_id_src != seq_id_dst; - // NOTE: this is not correct for sequence swaps (which aren't a thing in the KV cache API yet) + cache.has_shift = true; + cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos; } return; } + // otherwise, this is the KV cache of a Transformer-like model cache.head = 0; @@ -2385,7 +2386,14 @@ static void llama_kv_cache_seq_add( if (p1 < 0) p1 = std::numeric_limits::max(); if (cache.unlimited) { - GGML_ASSERT(false); // not supported + // for Mamba-like models, only the pos needs to be shifted + if (0 <= seq_id && seq_id < (int64_t) cache.size) { + llama_kv_cell & cell = cache.cells[seq_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos += delta; + } + } + return; } for (uint32_t i = 0; i < cache.size; ++i) { @@ -2422,7 +2430,14 @@ static void llama_kv_cache_seq_div( if (p1 < 0) p1 = std::numeric_limits::max(); if (cache.unlimited) { - GGML_ASSERT(false); // not supported + // for Mamba-like models, only the pos needs to be changed + if (0 <= seq_id && seq_id < (int64_t) cache.size) { + llama_kv_cell & cell = cache.cells[seq_id]; + if (cell.has_seq_id(seq_id) && p0 <= cell.pos && cell.pos < p1) { + cell.pos /= d; + } + } + return; } for (uint32_t i = 0; i < cache.size; ++i) { @@ -8435,7 +8450,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } if (kv_self.unlimited) { - const int64_t kv_size = kv_self.size; const int64_t n_kv = kv_self.n; { @@ -8451,7 +8465,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { data[i] = (float) has_self_seq; // ensure current sequences will be kept - if (!has_self_seq) { + if (!has_self_seq && kv_cell.pos >= 0) { kv_cell.seq_id.insert(seq_id); } } @@ -8480,21 +8494,6 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } } - // remove extraneous seq_ids when state copies are made - { - for (int i = 0; i < kv_size; ++i) { - llama_kv_cell & kv_cell = lctx.kv_self.cells[i]; - uint32_t n_seqs = kv_cell.seq_id.size(); - bool has_self_seq = kv_cell.has_seq_id(i); - - if (has_self_seq && n_seqs > 1) { - kv_cell.seq_id.clear(); - kv_cell.seq_id.insert(i); - } else if (!has_self_seq && n_seqs > 0) { - kv_cell.seq_id.clear(); - } - } - } } } From 34e2fca8ebcb1ad11b840fb336579a801b68f9e6 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 25 Feb 2024 09:59:53 -0500 Subject: [PATCH 25/41] mamba : make the server and parallel examples work with whole sequences A seq_id is dedicated to the system prompt in both cases. 
* llama : make llama_kv_cache_seq_rm return whether it succeeded or not --- examples/parallel/parallel.cpp | 20 ++++++++++------ examples/server/server.cpp | 43 ++++++++++++++++++++++++---------- llama.cpp | 38 ++++++++++++++++++++---------- llama.h | 2 +- 4 files changed, 70 insertions(+), 33 deletions(-) diff --git a/examples/parallel/parallel.cpp b/examples/parallel/parallel.cpp index 7d11fcd593080..a2ef0fb039c3f 100644 --- a/examples/parallel/parallel.cpp +++ b/examples/parallel/parallel.cpp @@ -107,6 +107,9 @@ int main(int argc, char ** argv) { // number of simultaneous "clients" to simulate const int32_t n_clients = params.n_parallel; + // dedicate one sequence to the system prompt + params.n_parallel += 1; + // requests to simulate const int32_t n_seq = params.n_sequences; @@ -196,8 +199,8 @@ int main(int argc, char ** argv) { } // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < n_clients; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, n_tokens_system); + for (int32_t i = 1; i <= n_clients; ++i) { + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } LOG_TEE("\n"); @@ -221,15 +224,17 @@ int main(int argc, char ** argv) { client.i_batch = batch.n_tokens; - llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id }, true); + llama_batch_add(batch, client.sampled, n_tokens_system + client.n_prompt + client.n_decoded, { client.id + 1 }, true); client.n_decoded += 1; } if (batch.n_tokens == 0) { // all sequences have ended - clear the entire KV cache - for (int i = 0; i < n_clients; ++i) { - llama_kv_cache_seq_rm(ctx, i, n_tokens_system, -1); + for (int i = 1; i <= n_clients; ++i) { + llama_kv_cache_seq_rm(ctx, i, -1, -1); + // but keep the system prompt + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } LOG_TEE("%s: clearing the KV cache\n", __func__); @@ -255,7 +260,7 @@ int main(int argc, char ** argv) { tokens_prompt = ::llama_tokenize(ctx, client.prompt, false); for (size_t i = 0; i < tokens_prompt.size(); ++i) { - llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id }, false); + llama_batch_add(batch, tokens_prompt[i], i + n_tokens_system, { client.id + 1 }, false); } // extract the logits only for the last token @@ -366,7 +371,8 @@ int main(int argc, char ** argv) { } // delete only the generated part of the sequence, i.e. 
keep the system prompt in the cache - llama_kv_cache_seq_rm(ctx, client.id, n_tokens_system, -1); + llama_kv_cache_seq_rm(ctx, client.id + 1, -1, -1); + llama_kv_cache_seq_cp(ctx, 0, client.id + 1, -1, -1); const auto t_main_end = ggml_time_us(); diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 0ca388f47db7b..e4bf8d06f3efa 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -377,12 +377,16 @@ struct llama_server_context return false; } - if (params.n_ctx < 2048) { // request larger context for the image embedding + if (params.n_ctx != 0 && params.n_ctx < 2048) { // request larger context for the image embedding params.n_ctx = 2048; } } + // dedicate one sequence to the system prompt + params.n_parallel += 1; + std::tie(model, ctx) = llama_init_from_gpt_params(params); + params.n_parallel -= 1; // but be sneaky about it if (model == nullptr) { LOG_ERROR("unable to load model", {{"model", params.model}}); @@ -862,9 +866,9 @@ struct llama_server_context } // assign the system KV cache to all parallel sequences - for (int32_t i = 1; i < params.n_parallel; ++i) + for (int32_t i = 1; i <= params.n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, system_tokens.size()); + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } } @@ -1351,7 +1355,7 @@ struct llama_server_context std::vector append_tokens = tokenize(json_prompt, false); // has next image for (int i = 0; i < (int) append_tokens.size(); ++i) { - llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id }, true); + llama_batch_add(batch, append_tokens[i], system_tokens.size() + slot.n_past, { slot.id + 1 }, true); slot.n_past += 1; } } @@ -1587,8 +1591,8 @@ struct llama_server_context {"n_system_tokens", system_tokens.size()}, {"n_cache_tokens", slot.cache_tokens.size()} }); - llama_kv_cache_seq_rm (ctx, slot.id, n_keep , n_keep + n_discard); - llama_kv_cache_seq_add(ctx, slot.id, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); + llama_kv_cache_seq_rm (ctx, slot.id + 1, n_keep , n_keep + n_discard); + llama_kv_cache_seq_add(ctx, slot.id + 1, n_keep + n_discard, system_tokens.size() + slot.n_past, -n_discard); for (size_t i = n_keep + n_discard; i < slot.cache_tokens.size(); i++) { @@ -1640,7 +1644,7 @@ struct llama_server_context // TODO: we always have to take into account the "system_tokens" // this is not great and needs to be improved somehow - llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id }, true); + llama_batch_add(batch, slot.sampled, system_tokens.size() + slot_npast, { slot.id + 1 }, true); slot.n_past += 1; } @@ -1810,13 +1814,28 @@ struct llama_server_context } } + // keep only the common part int p0 = (int) system_tokens.size() + slot.n_past; LOG_INFO("kv cache rm [p0, end)", { { "slot_id", slot.id }, { "task_id", slot.task_id }, { "p0", p0 } }); - llama_kv_cache_seq_rm(ctx, slot.id, p0, -1); + if (!llama_kv_cache_seq_rm(ctx, slot.id + 1, p0, -1)) { + // could not partially delete (likely using a non-Transformer model) + // TODO: logging + llama_kv_cache_seq_rm(ctx, slot.id + 1, -1, -1); + llama_kv_cache_seq_cp(ctx, 0, slot.id + 1, -1, -1); + + // there is no common part left (except for the system prompt) + // TODO: maybe find a way to refactor this to reuse the !cache_prompt case above + slot.n_past = 0; + slot.n_past_se = 0; + slot.ga_i = 0; + slot.n_prompt_tokens_processed = slot.n_prompt_tokens; + // TODO: is the system prompt ever in the sampling context? 
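                    // (sequence 0 is dedicated to the shared system prompt, which is why every
                    // slot uses KV sequence slot.id + 1 and why the copy above restores from it)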
+ llama_sampling_reset(slot.ctx_sampling); + } LOG_VERBOSE("prompt ingested", { {"n_past", slot.n_past}, @@ -1845,7 +1864,7 @@ struct llama_server_context ga_i += ga_w/ga_n; } } - llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, {slot.id }, false); + llama_batch_add(batch, prefix_tokens[slot.n_past], system_tokens.size() + slot_npast, { slot.id + 1 }, false); slot_npast++; } @@ -1899,9 +1918,9 @@ struct llama_server_context LOG_TEE("div: [%6d, %6d] / %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w, slot.ga_n, (slot.ga_i + ib * bd) / slot.ga_n, (slot.ga_i + ib * bd + slot.ga_w) / slot.ga_n); LOG_TEE("shift: [%6d, %6d] + %6d -> [%6d, %6d]\n", slot.ga_i + ib * bd + slot.ga_w, slot.n_past_se + ib * bd, dd, slot.ga_i + ib * bd + slot.ga_w + dd, slot.n_past_se + ib * bd + dd); - llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i, slot.n_past_se, ib * bd); - llama_kv_cache_seq_div(ctx, slot.id, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); - llama_kv_cache_seq_add(ctx, slot.id, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); + llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i, slot.n_past_se, ib * bd); + llama_kv_cache_seq_div(ctx, slot.id + 1, slot.ga_i + ib * bd, slot.ga_i + ib * bd + slot.ga_w,slot.ga_n); + llama_kv_cache_seq_add(ctx, slot.id + 1, slot.ga_i + ib * bd + slot.ga_w,slot.n_past_se + ib * bd, dd); slot.n_past_se -= bd; diff --git a/llama.cpp b/llama.cpp index 7281972b39775..c3b0d311f99fc 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2275,7 +2275,7 @@ static void llama_kv_cache_clear(struct llama_kv_cache & cache) { cache.used = 0; } -static void llama_kv_cache_seq_rm( +static bool llama_kv_cache_seq_rm( struct llama_kv_cache & cache, llama_seq_id seq_id, llama_pos p0, @@ -2285,11 +2285,23 @@ static void llama_kv_cache_seq_rm( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); + // models like Mamba can't have a state partially erased if (cache.unlimited) { - // can only remove whole sequences for models like Mamba - GGML_ASSERT(p0 == 0); - GGML_ASSERT((uint32_t)seq_id < cache.size); - GGML_ASSERT(cache.cells[seq_id].pos < p1); + if (seq_id >= (int64_t) cache.size) { + // could be fatal + return false; + } + if (0 <= seq_id) { + // partial intersection is invalid + if ((0 < p0 && p0 <= cache.cells[seq_id].pos) || (0 < p1 && p1 <= cache.cells[seq_id].pos)) { + return false; + } + } else { + // seq_id is negative, then the range should include everything or nothing + if (p0 != p1 && (p0 != 0 || p1 != std::numeric_limits::max())) { + return false; + } + } } for (uint32_t i = 0; i < cache.size; ++i) { @@ -2313,6 +2325,8 @@ static void llama_kv_cache_seq_rm( // If we freed up a slot, set head to it so searching can start there. 
if (new_head != cache.size && new_head < cache.head) cache.head = new_head; + + return true; } static void llama_kv_cache_seq_cp( @@ -12514,13 +12528,11 @@ struct llama_context * llama_new_context_with_model( // Mamba only needs a constant number of KV cache cells per sequence if (model->arch == LLM_ARCH_MAMBA) { - // Mamba needs as many KV cells as there are sequences kept at any time - // The extra cell allows dedicating a sequence id to the system prompt - // TODO: find a better way to get the max number of parallel sequences - kv_size = params.n_parallel + 1; + // Mamba needs at least as many KV cells as there are sequences kept at any time + kv_size = std::max((uint32_t) 1, params.n_parallel); // it's probably best to keep as much precision as possible for the states - type_k = GGML_TYPE_F32; // required by ggml_set for Mamba's conv_state - type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_state + type_k = GGML_TYPE_F32; // required by ggml_ssm_conv for Mamba's conv_states + type_v = GGML_TYPE_F32; // required by ggml_ssm_scan for Mamba's ssm_states } GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0); @@ -13039,8 +13051,8 @@ void llama_kv_cache_clear(struct llama_context * ctx) { llama_kv_cache_clear(ctx->kv_self); } -void llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { - llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); +bool llama_kv_cache_seq_rm(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1) { + return llama_kv_cache_seq_rm(ctx->kv_self, seq_id, p0, p1); } void llama_kv_cache_seq_cp(struct llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1) { diff --git a/llama.h b/llama.h index bbf738988413b..a4675d4c3256a 100644 --- a/llama.h +++ b/llama.h @@ -503,7 +503,7 @@ extern "C" { // seq_id < 0 : match any sequence // p0 < 0 : [0, p1] // p1 < 0 : [p0, inf) - LLAMA_API void llama_kv_cache_seq_rm( + LLAMA_API bool llama_kv_cache_seq_rm( struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, From 79d636cc7e6d466e67cc230facbd5815bb08f5b7 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 25 Feb 2024 17:26:31 -0500 Subject: [PATCH 26/41] mamba : dedicate an input tensor for state copy indices This is cleaner and makes it easier to adapt when/if token positions (and by extension, inp_K_shift) are no longer integers. 
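At its core, the state-copy pass introduced here is a row gather: each KV cell i is overwritten with the state currently held by cell src[i] (its copy source), after which src is reset to the identity. In the patch this is expressed per layer with ggml_get_rows over kv_self.k_l/v_l driven by the new inp_s_copy tensor; the following plain-C sketch (an illustration only, with made-up helper names) shows the same operation on a flat float buffer:

#include <stdint.h>
#include <string.h>

// new_states[i] = states[src[i]] for every cell, done out-of-place through a scratch buffer.
// row_size is the per-cell state size, e.g. (d_conv - 1)*d_inner or d_state*d_inner floats.
static void gather_states(float * states, const int32_t * src, float * scratch,
                          int n_cells, int row_size) {
    for (int i = 0; i < n_cells; ++i) {
        memcpy(scratch + (size_t) i*row_size, states + (size_t) src[i]*row_size,
               (size_t) row_size*sizeof(float));
    }
    memcpy(states, scratch, (size_t) n_cells*row_size*sizeof(float));
}

The out-of-place copy roughly mirrors why the graph version uses ggml_get_rows followed by ggml_cpy: a cell may be both a copy source and a copy destination in the same pass.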
--- llama.cpp | 122 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 91 insertions(+), 31 deletions(-) diff --git a/llama.cpp b/llama.cpp index c3b0d311f99fc..b3964810c5891 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1782,6 +1782,7 @@ struct llama_layer { struct llama_kv_cell { llama_pos pos = -1; llama_pos delta = 0; + int32_t src = 0; // used by recurrent state models to copy states std::set seq_id; @@ -1802,6 +1803,7 @@ struct llama_kv_cell { struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; + bool do_copy = false; // with Mamba, a cell can hold the state for more than one past token bool unlimited = false; @@ -2043,7 +2045,8 @@ struct llama_context { struct ggml_tensor * inp_K_shift; // I32 [kv_size] struct ggml_tensor * inp_mean; // F32 [n_batch, n_batch] struct ggml_tensor * inp_cls; // I32 [n_batch] - struct ggml_tensor * inp_s_mask; // F32 [kv_size] (only used by constant state models like Mamba) + struct ggml_tensor * inp_s_copy; // I32 [kv_size] + struct ggml_tensor * inp_s_mask; // F32 [kv_size] struct ggml_tensor * inp_s_seq; // I32 [kv_size, n_batch] #ifdef GGML_USE_MPI @@ -2085,9 +2088,9 @@ static bool llama_kv_cache_init( if (cache.unlimited) { for (uint32_t i = 0; i < cache.size; ++i) { - cache.cells[i].delta = i; + cache.cells[i].src = i; } - } // else, delta is already initialized to zero + } #ifdef GGML_USE_CLBLAST offload = false; @@ -2340,19 +2343,20 @@ static void llama_kv_cache_seq_cp( if (cache.unlimited) { if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { - seq_id_src = cache.cells[seq_id_src].delta; + seq_id_src = cache.cells[seq_id_src].src; GGML_ASSERT((uint32_t) seq_id_src < cache.size); // intent to "copy from" // supports copy chains thanks to taking the source of the source - cache.cells[seq_id_dst].delta = seq_id_src; + cache.cells[seq_id_dst].src = seq_id_src; - // prevent the destination from getting cleared if the source is not empty + // preserve the "keep or clear" status of the copied sequence if (cache.cells[seq_id_src].has_seq_id(seq_id_src)) { cache.cells[seq_id_dst].seq_id.insert(seq_id_dst); + } else { + cache.cells[seq_id_dst].seq_id.erase(seq_id_dst); } - // repurposed as a "need copy" flag - // (shifting can't be done anyway for this kind of KV cache) - cache.has_shift = true; + + cache.do_copy = true; cache.cells[seq_id_dst].pos = cache.cells[seq_id_src].pos; } @@ -5445,21 +5449,7 @@ struct llm_build_context { struct ggml_cgraph * build_k_shift() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); - // TODO: do this in a another graph with a dedicated input tensor - if (kv_self.unlimited) { - for (int il = 0; il < n_layer; ++il) { - ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], n_embd_k_gqa, kv_self.size); - ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], n_embd_v_gqa, kv_self.size); - - conv_states = ggml_get_rows(ctx0, conv_states, lctx.inp_K_shift); - ssm_states = ggml_get_rows(ctx0, ssm_states, lctx.inp_K_shift); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_states, kv_self.k_l[il])); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_states, kv_self.v_l[il])); - } - - return gf; - } + GGML_ASSERT(kv_self.size == n_ctx); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * tmp = @@ -5479,6 +5469,25 @@ struct llm_build_context { return gf; } + struct ggml_cgraph * build_s_copy() { + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + + for (int il = 0; il < 
n_layer; ++il) { + ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], n_embd_k_gqa, kv_self.size); + ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], n_embd_v_gqa, kv_self.size); + + conv_states = ggml_get_rows(ctx0, conv_states, lctx.inp_s_copy); + ssm_states = ggml_get_rows(ctx0, ssm_states, lctx.inp_s_copy); + + // TODO: name the intermediate tensors with cb() + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, conv_states, kv_self.k_l[il])); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, ssm_states, kv_self.v_l[il])); + } + + return gf; + } + struct ggml_cgraph * build_defrag(const std::vector & ids) { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); @@ -8211,6 +8220,23 @@ static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) { return result; } +static struct ggml_cgraph * llama_build_graph_s_copy(llama_context & lctx) { + llama_batch dummy; + dummy.n_tokens = 0; + + llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { }; + + struct llm_build_context llm(lctx, dummy, cb, false); + + llm.init(); + + struct ggml_cgraph * result = llm.build_s_copy(); + + llm.free(); + + return result; +} + static struct ggml_cgraph * llama_build_graph( llama_context & lctx, const llama_batch & batch, @@ -8350,6 +8376,18 @@ static void llama_set_k_shift(llama_context & lctx) { } } +static void llama_set_s_copy(llama_context & lctx) { + const int64_t kv_size = lctx.kv_self.size; + + assert(ggml_backend_buffer_is_host(lctx.inp_s_copy->buffer)); + + int32_t * data = (int32_t *) lctx.inp_s_copy->data; + + for (int i = 0; i < kv_size; ++i) { + data[i] = lctx.kv_self.cells[i].src; + } +} + static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // // set input data @@ -8464,7 +8502,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } if (kv_self.unlimited) { - const int64_t n_kv = kv_self.n; + const int64_t n_kv = kv_self.n; { GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_s_mask->buffer)); @@ -8472,9 +8510,9 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { // states which are not affected by the current batch are left untouched for (int i = 0; i < n_kv; ++i) { - llama_seq_id seq_id = i + lctx.kv_self.head; - llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id]; - bool has_self_seq = kv_cell.has_seq_id(seq_id); + llama_seq_id seq_id = i + lctx.kv_self.head; + llama_kv_cell & kv_cell = lctx.kv_self.cells[seq_id]; + bool has_self_seq = kv_cell.has_seq_id(seq_id); data[i] = (float) has_self_seq; @@ -8998,7 +9036,7 @@ static void llama_kv_cache_defrag_internal(struct llama_context & lctx) { static void llama_kv_cache_update_internal(struct llama_context & lctx) { // apply K-shift if needed - if ((lctx.kv_self.unlimited || lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE) && lctx.kv_self.has_shift) { + if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) { llama_set_k_shift(lctx); { @@ -9013,7 +9051,27 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { kv_self.has_shift = false; for (uint32_t i = 0; i < kv_self.size; ++i) { - kv_self.cells[i].delta = kv_self.unlimited ? 
i : 0; + kv_self.cells[i].delta = 0; + } + } + } + + if (lctx.kv_self.unlimited && lctx.kv_self.do_copy) { + llama_set_s_copy(lctx); + + { + ggml_cgraph * gf = llama_build_graph_s_copy(lctx); + + llama_graph_compute(lctx, gf, lctx.cparams.n_threads); + } + + { + auto & kv_self = lctx.kv_self; + + kv_self.do_copy = false; + + for (uint32_t i = 0; i < kv_self.size; ++i) { + kv_self.cells[i].src = i; } } } @@ -12667,7 +12725,7 @@ struct llama_context * llama_new_context_with_model( // graph inputs { ggml_init_params init_params = { - /* .mem_size */ ggml_tensor_overhead()*(8 + 2*(ctx->kv_self.unlimited)), + /* .mem_size */ ggml_tensor_overhead()*(8 + 3*(ctx->kv_self.unlimited)), /* .mem_buffer */ nullptr, /* .no_alloc */ true, }; @@ -12682,6 +12740,7 @@ struct llama_context * llama_new_context_with_model( ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); if (ctx->kv_self.unlimited) { + ctx->inp_s_copy = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); ctx->inp_s_mask = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); ctx->inp_s_seq = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_I32, kv_size, cparams.n_batch); } @@ -12695,6 +12754,7 @@ struct llama_context * llama_new_context_with_model( ggml_set_name(ctx->inp_mean, "inp_mean"); ggml_set_name(ctx->inp_cls, "inp_cls"); if (ctx->kv_self.unlimited) { + ggml_set_name(ctx->inp_s_copy, "inp_s_copy"); ggml_set_name(ctx->inp_s_mask, "inp_s_mask"); ggml_set_name(ctx->inp_s_seq, "inp_s_seq"); } From 8f605cfe0d6473e57a3d88ae00046e9696b1b49c Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 26 Feb 2024 20:25:23 -0500 Subject: [PATCH 27/41] mamba : adapt perplexity, batched, and batched-bench examples * perplexity : limit the max number of sequences This adapts to what the loaded model can provide. * llama : add llama_n_max_seq to get the upper limit for seq_ids Used by the perplexity example. * batched : pass n_parallel to the model's context params This should have been there already, but it wasn't. * batched-bench : reserve sequences to support Mamba * batched-bench : fix tokens being put in wrong sequences Generation quality isn't what's measured in there anyway, but at least using the correct sequences avoids using non-consecutive token positions. --- examples/batched-bench/batched-bench.cpp | 13 ++++++++----- examples/batched/batched.cpp | 3 ++- examples/perplexity/perplexity.cpp | 9 ++++++--- llama.cpp | 4 ++++ llama.h | 1 + 5 files changed, 21 insertions(+), 9 deletions(-) diff --git a/examples/batched-bench/batched-bench.cpp b/examples/batched-bench/batched-bench.cpp index 19aff18aefde7..dff6c68ec2e69 100644 --- a/examples/batched-bench/batched-bench.cpp +++ b/examples/batched-bench/batched-bench.cpp @@ -105,6 +105,9 @@ int main(int argc, char ** argv) { ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; + // ensure enough sequences are available + ctx_params.n_parallel = *std::max_element(n_pl.begin(), n_pl.end()); + llama_context * ctx = llama_new_context_with_model(model, ctx_params); if (ctx == NULL) { @@ -174,10 +177,10 @@ int main(int argc, char ** argv) { llama_batch_clear(batch); - const int n_tokens = is_pp_shared ? 
pp : pl*pp; - - for (int i = 0; i < n_tokens; ++i) { - llama_batch_add(batch, 0, i, { 0 }, false); + for (int i = 0; i < pp; ++i) { + for (int j = 0; j < (is_pp_shared ? 1 : pl); ++j) { + llama_batch_add(batch, 0, i, { j }, false); + } } batch.logits[batch.n_tokens - 1] = true; @@ -192,7 +195,7 @@ int main(int argc, char ** argv) { if (is_pp_shared) { for (int32_t i = 1; i < pl; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, pp); + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } } diff --git a/examples/batched/batched.cpp b/examples/batched/batched.cpp index 9be7eb56bcd8a..dde4d5a068e24 100644 --- a/examples/batched/batched.cpp +++ b/examples/batched/batched.cpp @@ -80,6 +80,7 @@ int main(int argc, char ** argv) { ctx_params.seed = 1234; ctx_params.n_ctx = n_kv_req; ctx_params.n_batch = std::max(n_len, n_parallel); + ctx_params.n_parallel = n_parallel; ctx_params.n_threads = params.n_threads; ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch; @@ -132,7 +133,7 @@ int main(int argc, char ** argv) { // assign the system KV cache to all parallel sequences // this way, the parallel sequences will "reuse" the prompt tokens without having to copy them for (int32_t i = 1; i < n_parallel; ++i) { - llama_kv_cache_seq_cp(ctx, 0, i, 0, batch.n_tokens); + llama_kv_cache_seq_cp(ctx, 0, i, -1, -1); } if (n_parallel > 1) { diff --git a/examples/perplexity/perplexity.cpp b/examples/perplexity/perplexity.cpp index 9ec989389cfad..52789ee631234 100644 --- a/examples/perplexity/perplexity.cpp +++ b/examples/perplexity/perplexity.cpp @@ -809,7 +809,7 @@ static void hellaswag_score(llama_context * ctx, const gpt_params & params) { const int n_batch = params.n_batch; const int max_tasks_per_batch = 32; - const int max_seq = 4*max_tasks_per_batch; + const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); @@ -1086,7 +1086,7 @@ static void winogrande_score(llama_context * ctx, const gpt_params & params) { const int n_batch = params.n_batch; const int max_tasks_per_batch = 128; - const int max_seq = 2*max_tasks_per_batch; + const int max_seq = std::min(2*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); @@ -1438,7 +1438,7 @@ static void multiple_choice_score(llama_context * ctx, const gpt_params & params const int n_batch = params.n_batch; const int max_tasks_per_batch = 32; - const int max_seq = 4*max_tasks_per_batch; + const int max_seq = std::min(4*max_tasks_per_batch, (int) llama_n_max_seq(ctx)); llama_batch batch = llama_batch_init(n_ctx, 0, max_seq); @@ -1815,6 +1815,9 @@ int main(int argc, char ** argv) { llama_model * model; llama_context * ctx; + // ensure there's at least enough seq_ids for HellaSwag + params.n_parallel = std::max(4, params.n_parallel); + // load the model and apply lora adapter, if any std::tie(model, ctx) = llama_init_from_gpt_params(params); if (model == NULL) { diff --git a/llama.cpp b/llama.cpp index b3964810c5891..f437059b26012 100644 --- a/llama.cpp +++ b/llama.cpp @@ -12844,6 +12844,10 @@ uint32_t llama_n_batch(const struct llama_context * ctx) { return ctx->cparams.n_batch; } +uint32_t llama_n_max_seq(const struct llama_context * ctx) { + return ctx->kv_self.size; +} + enum llama_vocab_type llama_vocab_type(const struct llama_model * model) { return model->vocab.type; } diff --git a/llama.h b/llama.h index a4675d4c3256a..f0aca1fe5410a 100644 --- a/llama.h +++ b/llama.h @@ -377,6 +377,7 @@ extern "C" 
{ LLAMA_API uint32_t llama_n_ctx (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_max_seq (const struct llama_context * ctx); LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model); LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); From 206e8ee2b2c540e67f6631989fc677bcc69e589a Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Wed, 28 Feb 2024 10:58:17 -0500 Subject: [PATCH 28/41] mamba : stop abusing attention metadata This breaks existing converted-to-GGUF Mamba models, but will allow supporting mixed architectures like MambaFormer without needing to break Mamba models. This will also allow changing the size of Mamba's states without having to reconvert models in the future. (e.g. using something else than d_conv - 1 columns for the conv_states will not require breaking existing converted Mamba models again) * gguf-py : add new KV metadata key-value pairs for Mamba * llama : add new metadata key-value pairs for Mamba * llama : guard against divisions by zero when n_head is 0 * mamba : rename "unlimited" KV cache property to "recurrent" --- convert-hf-to-gguf.py | 17 +++-- gguf-py/gguf/constants.py | 12 ++++ gguf-py/gguf/gguf_writer.py | 12 ++++ llama.cpp | 136 ++++++++++++++++++++++++------------ 4 files changed, 128 insertions(+), 49 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index de7bf431f4a69..bed830ce6e4c1 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1857,7 +1857,14 @@ def set_vocab(self): def set_gguf_parameters(self): d_model = self.hparams["d_model"] + d_conv = self.hparams.get("d_conv", 4) d_inner = self.hparams.get("d_inner", 2 * d_model) + d_state = self.hparams.get("d_state", 16) + # ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 + dt_rank = self.hparams.get("dt_rank", -(d_model // -16)) + # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model @@ -1865,13 +1872,13 @@ def set_gguf_parameters(self): self.gguf_writer.add_context_length(2**20) # arbitrary value; for those who use the default self.gguf_writer.add_embedding_length(d_model) self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading - self.gguf_writer.add_head_count(d_inner) # the number of rows in conv_state and ssm_state + self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.hparams["n_layer"]) + self.gguf_writer.add_ssm_conv_kernel_size(d_conv) + self.gguf_writer.add_ssm_inner_length(d_inner) + self.gguf_writer.add_ssm_state_length(d_state) + self.gguf_writer.add_ssm_dt_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) - # NOTE: (ab)using the KV cache metadata to store dimensions for conv_state and ssm_state - # Since the first column of the conv_state is shifted out each time, it's not actually needed - self.gguf_writer.add_key_length(self.hparams.get("d_conv", 4) - 1) - self.gguf_writer.add_value_length(self.hparams.get("d_state", 16)) self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 651323a1eed55..8030023f30648 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py 
@@ -61,6 +61,12 @@ class Rope: SCALING_ORIG_CTX_LEN = "{arch}.rope.scaling.original_context_length" SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" + class SSM: + CONV_KERNEL_SIZE = "{arch}.ssm.d_conv" + INNER_LENGTH = "{arch}.ssm.d_inner" + STATE_LENGTH = "{arch}.ssm.d_state" + DT_RANK = "{arch}.ssm.dt_rank" + class Tokenizer: MODEL = "tokenizer.ggml.model" LIST = "tokenizer.ggml.tokens" @@ -763,6 +769,12 @@ def get_type(val: Any) -> GGUFValueType: KEY_ROPE_SCALING_ORIG_CTX_LEN = Keys.Rope.SCALING_ORIG_CTX_LEN KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED +# SSM +KEY_SSM_CONV_KERNEL_SIZE = Keys.SSM.CONV_KERNEL_SIZE +KEY_SSM_INNER_LENGTH = Keys.SSM.INNER_LENGTH +KEY_SSM_STATE_LENGTH = Keys.SSM.STATE_LENGTH +KEY_SSM_DT_RANK = Keys.SSM.DT_RANK + # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL KEY_TOKENIZER_LIST = Keys.Tokenizer.LIST diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 8011608323c45..146358e6971cf 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -382,6 +382,18 @@ def add_rope_scaling_orig_ctx_len(self, value: int) -> None: def add_rope_scaling_finetuned(self, value: bool) -> None: self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) + def add_ssm_conv_kernel_size(self, value: int) -> None: + self.add_uint32(Keys.SSM.CONV_KERNEL_SIZE.format(arch=self.arch), value) + + def add_ssm_inner_length(self, value: int) -> None: + self.add_uint32(Keys.SSM.INNER_LENGTH.format(arch=self.arch), value) + + def add_ssm_state_length(self, value: int) -> None: + self.add_uint32(Keys.SSM.STATE_LENGTH.format(arch=self.arch), value) + + def add_ssm_dt_rank(self, value: int) -> None: + self.add_uint32(Keys.SSM.DT_RANK.format(arch=self.arch), value) + def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) diff --git a/llama.cpp b/llama.cpp index f437059b26012..eb1f02e426d1e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -286,6 +286,11 @@ enum llm_kv { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, + LLM_KV_SSM_D_INNER, + LLM_KV_SSM_D_CONV, + LLM_KV_SSM_D_STATE, + LLM_KV_SSM_DT_RANK, + LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, LLM_KV_TOKENIZER_TOKEN_TYPE, @@ -344,6 +349,11 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, + { LLM_KV_SSM_D_CONV, "%s.ssm.d_conv" }, + { LLM_KV_SSM_D_INNER, "%s.ssm.d_inner"}, + { LLM_KV_SSM_D_STATE, "%s.ssm.d_state"}, + { LLM_KV_SSM_DT_RANK, "%s.ssm.dt_rank"}, + { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, { LLM_KV_TOKENIZER_TOKEN_TYPE, "tokenizer.ggml.token_type" }, @@ -1638,6 +1648,12 @@ struct llama_hparams { float rope_freq_scale_train; uint32_t n_yarn_orig_ctx; + // for State Space Models + uint32_t ssm_d_conv = 0; + uint32_t ssm_d_inner = 0; + uint32_t ssm_d_state = 0; + uint32_t ssm_dt_rank = 0; + float f_clamp_kqv = 0.0f; float f_max_alibi_bias = 0.0f; @@ -1666,6 +1682,11 @@ struct llama_hparams { if (this->rope_finetuned != other.rope_finetuned) return true; if (this->n_yarn_orig_ctx != other.n_yarn_orig_ctx) return true; + if (this->ssm_d_conv != other.ssm_d_conv) return true; + if (this->ssm_d_inner != other.ssm_d_inner) return true; + if (this->ssm_d_state != other.ssm_d_state) return true; + if (this->ssm_dt_rank != other.ssm_dt_rank) return true; + const float EPSILON = 1e-9f; if 
(!is_float_close(this->f_norm_eps, other.f_norm_eps, EPSILON)) return true; @@ -1677,6 +1698,9 @@ struct llama_hparams { } uint32_t n_gqa() const { + if (n_head_kv == 0) { + return 0; + } return n_head/n_head_kv; } @@ -1687,6 +1711,18 @@ struct llama_hparams { uint32_t n_embd_v_gqa() const { // dimension of value embeddings across all k-v heads return n_embd_head_v * n_head_kv; } + + uint32_t n_embd_k_s() const { // dimension of the recurrent convolution state embeddings + // corresponds to Mamba's conv_states size + // TODO: maybe support other convolution strides than 1 + // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed + return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; + } + + uint32_t n_embd_v_s() const { // dimension of the ssm scan state embeddings + // corresponds to Mamba's ssm_states size + return ssm_d_state * ssm_d_inner; + } }; struct llama_cparams { @@ -1804,8 +1840,8 @@ struct llama_kv_cache { bool has_shift = false; bool do_defrag = false; bool do_copy = false; - // with Mamba, a cell can hold the state for more than one past token - bool unlimited = false; + // with recurrent state models, a cell can hold the state for more than one past token + bool recurrent = false; // Note: The value of head isn't only used to optimize searching // for a free KV slot. llama_decode_internal also uses it, so it @@ -2067,14 +2103,21 @@ static bool llama_kv_cache_init( bool offload) { const struct llama_hparams & hparams = model.hparams; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); const int64_t n_layer = hparams.n_layer; cache.has_shift = false; - // for now, only Mamba can hold state for more than one past token per cell - cache.unlimited = model.arch == LLM_ARCH_MAMBA; + // TODO: find a nicer way to add other recurrent model architectures + cache.recurrent = model.arch == LLM_ARCH_MAMBA; + + // TODO: support mixed reccurent Transformer architectues + // NOTE: (!a || b) is a logical implication (a -> b) + GGML_ASSERT(!cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_s()); + GGML_ASSERT(!cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_s()); + GGML_ASSERT( cache.recurrent || n_embd_k_gqa == hparams.n_embd_k_gqa()); + GGML_ASSERT( cache.recurrent || n_embd_v_gqa == hparams.n_embd_v_gqa()); cache.head = 0; cache.size = kv_size; @@ -2086,7 +2129,8 @@ static bool llama_kv_cache_init( cache.cells.clear(); cache.cells.resize(kv_size); - if (cache.unlimited) { + if (cache.recurrent) { + // init state copy sources for (uint32_t i = 0; i < cache.size; ++i) { cache.cells[i].src = i; } @@ -2164,8 +2208,8 @@ static bool llama_kv_cache_find_slot( const uint32_t n_ctx = cache.size; const uint32_t n_tokens = batch.n_tokens; - if (cache.unlimited) { - // For unlimited context architectures (like Mamba), + if (cache.recurrent) { + // For recurrent state architectures (like Mamba), // each KV cache cell can store the state for a whole sequence. 
// starting point to find the minimum seq_id used in the batch @@ -2289,7 +2333,7 @@ static bool llama_kv_cache_seq_rm( if (p1 < 0) p1 = std::numeric_limits::max(); // models like Mamba can't have a state partially erased - if (cache.unlimited) { + if (cache.recurrent) { if (seq_id >= (int64_t) cache.size) { // could be fatal return false; @@ -2341,7 +2385,7 @@ static void llama_kv_cache_seq_cp( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); - if (cache.unlimited) { + if (cache.recurrent) { if ((uint32_t) seq_id_dst < cache.size && (uint32_t) seq_id_src < cache.size) { seq_id_src = cache.cells[seq_id_src].src; GGML_ASSERT((uint32_t) seq_id_src < cache.size); @@ -2403,7 +2447,7 @@ static void llama_kv_cache_seq_add( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); - if (cache.unlimited) { + if (cache.recurrent) { // for Mamba-like models, only the pos needs to be shifted if (0 <= seq_id && seq_id < (int64_t) cache.size) { llama_kv_cell & cell = cache.cells[seq_id]; @@ -2447,7 +2491,7 @@ static void llama_kv_cache_seq_div( if (p0 < 0) p0 = 0; if (p1 < 0) p1 = std::numeric_limits::max(); - if (cache.unlimited) { + if (cache.recurrent) { // for Mamba-like models, only the pos needs to be changed if (0 <= seq_id && seq_id < (int64_t) cache.size) { llama_kv_cell & cell = cache.cells[seq_id]; @@ -3277,7 +3321,7 @@ static void llm_load_hparams( // sanity check for n_rot (optional) { - hparams.n_rot = hparams.n_embd / hparams.n_head; + hparams.n_rot = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head; ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false); @@ -3290,10 +3334,10 @@ static void llm_load_hparams( // gpt-j n_rot = rotary_dim } - hparams.n_embd_head_k = hparams.n_embd / hparams.n_head; + hparams.n_embd_head_k = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head; ml.get_key(LLM_KV_ATTENTION_KEY_LENGTH, hparams.n_embd_head_k, false); - hparams.n_embd_head_v = hparams.n_embd / hparams.n_head; + hparams.n_embd_head_v = (hparams.n_head == 0) ? 0 : hparams.n_embd / hparams.n_head; ml.get_key(LLM_KV_ATTENTION_VALUE_LENGTH, hparams.n_embd_head_v, false); // arch-specific KVs @@ -3545,7 +3589,13 @@ static void llm_load_hparams( } break; case LLM_ARCH_MAMBA: { + ml.get_key(LLM_KV_SSM_D_CONV, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_D_INNER, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_D_STATE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_DT_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); + switch (hparams.n_layer) { case 24: switch (hparams.n_embd) { @@ -3886,6 +3936,10 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) { LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train); LLAMA_LOG_INFO("%s: n_yarn_orig_ctx = %u\n", __func__, hparams.n_yarn_orig_ctx); LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? 
"yes" : "unknown"); + LLAMA_LOG_INFO("%s: ssm_d_conv = %u\n", __func__, hparams.ssm_d_conv); + LLAMA_LOG_INFO("%s: ssm_d_inner = %u\n", __func__, hparams.ssm_d_inner); + LLAMA_LOG_INFO("%s: ssm_d_state = %u\n", __func__, hparams.ssm_d_state); + LLAMA_LOG_INFO("%s: ssm_dt_rank = %u\n", __func__, hparams.ssm_dt_rank); LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type)); LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str()); if (ml.n_elements >= 1e12) { @@ -4050,10 +4104,7 @@ static bool llm_load_tensors( const int64_t n_vocab_type = hparams.n_vocab_type; const int64_t n_ff = hparams.n_ff; - // Mamba uses these in its own way - if (model.arch != LLM_ARCH_MAMBA) { - GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); - } + GGML_ASSERT(n_embd_gqa == n_embd_k_gqa); ggml_context * ctx_input = ctx_map.at(model.buft_input.buft); ggml_context * ctx_output = ctx_map.at(model.buft_output.buft); @@ -4792,12 +4843,11 @@ static bool llm_load_tensors( } break; case LLM_ARCH_MAMBA: { - const int64_t d_conv = hparams.n_embd_head_k + 1; - const int64_t d_state = hparams.n_embd_head_v; - const int64_t d_inner = hparams.n_head; - // TODO: allow loading dt_rank from the model config - // ceiling division - const int64_t dt_rank = (n_embd / 16) + (n_embd % 16 > 0); + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; + // only an expansion factor of 2 is supported for now GGML_ASSERT(2 * n_embd == d_inner); model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); @@ -5420,7 +5470,7 @@ struct llm_build_context { norm_rms_eps (hparams.f_norm_rms_eps), n_tokens (batch.n_tokens), n_kv (worst_case ? kv_self.size : kv_self.n), - kv_head (worst_case ? (kv_self.unlimited ? 0 : kv_self.size - n_tokens) : kv_self.head), + kv_head (worst_case ? (kv_self.recurrent ? 
0 : kv_self.size - n_tokens) : kv_self.head), n_orig_ctx (cparams.n_yarn_orig_ctx), pooling_type (cparams.pooling_type), rope_type (hparams.rope_type), @@ -5473,8 +5523,8 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); for (int il = 0; il < n_layer; ++il) { - ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], n_embd_k_gqa, kv_self.size); - ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], n_embd_v_gqa, kv_self.size); + ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); + ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); conv_states = ggml_get_rows(ctx0, conv_states, lctx.inp_s_copy); ssm_states = ggml_get_rows(ctx0, ssm_states, lctx.inp_s_copy); @@ -8048,12 +8098,11 @@ struct llm_build_context { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); const int64_t d_model = n_embd; - const int64_t d_inner = n_head; + const int64_t d_conv = hparams.ssm_d_conv; + const int64_t d_inner = hparams.ssm_d_inner; GGML_ASSERT(2 * d_model == d_inner); - const int64_t d_conv = n_embd_head_k + 1; - const int64_t d_state = n_embd_head_v; - // ceiling division - const int64_t dt_rank = (d_model / 16) + (d_model % 16 > 0); + const int64_t d_state = hparams.ssm_d_state; + const int64_t dt_rank = hparams.ssm_dt_rank; struct ggml_tensor * cur; struct ggml_tensor * inpL; @@ -8063,10 +8112,9 @@ struct llm_build_context { cb(inpL, "inp_embd", -1); for (int il = 0; il < n_layer; ++il) { - // (ab)using the kv cache to store the state - // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed - ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], (d_conv-1)*(d_inner), kv_self.size); - ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], (d_state)*(d_inner), kv_self.size); + // (ab)using the KV cache to store the states + ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); + ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); // clear states of sequences which are starting at the beginning of this batch { @@ -8501,7 +8549,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } - if (kv_self.unlimited) { + if (kv_self.recurrent) { const int64_t n_kv = kv_self.n; { @@ -8667,7 +8715,7 @@ static int llama_decode_internal( return 1; } - if (!kv_self.unlimited) { + if (!kv_self.recurrent) { // a heuristic, to avoid attending the full cache if it is not yet utilized // after enough generations, the benefit from this heuristic disappears // if we start defragmenting the cache, the benefit from this will be more important @@ -9056,7 +9104,7 @@ static void llama_kv_cache_update_internal(struct llama_context & lctx) { } } - if (lctx.kv_self.unlimited && lctx.kv_self.do_copy) { + if (lctx.kv_self.recurrent && lctx.kv_self.do_copy) { llama_set_s_copy(lctx); { @@ -12725,7 +12773,7 @@ struct llama_context * llama_new_context_with_model( // graph inputs { ggml_init_params init_params = { - /* .mem_size */ ggml_tensor_overhead()*(8 + 3*(ctx->kv_self.unlimited)), + /* .mem_size */ ggml_tensor_overhead()*(8 + 3*(ctx->kv_self.recurrent)), /* .mem_buffer */ nullptr, /* .no_alloc */ true, }; @@ -12739,7 +12787,7 @@ struct llama_context * llama_new_context_with_model( ctx->inp_K_shift = 
ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); ctx->inp_mean = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_F32, cparams.n_batch, cparams.n_batch); ctx->inp_cls = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, cparams.n_batch); - if (ctx->kv_self.unlimited) { + if (ctx->kv_self.recurrent) { ctx->inp_s_copy = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_I32, kv_size); ctx->inp_s_mask = ggml_new_tensor_1d(ctx->ctx_input, GGML_TYPE_F32, kv_size); ctx->inp_s_seq = ggml_new_tensor_2d(ctx->ctx_input, GGML_TYPE_I32, kv_size, cparams.n_batch); @@ -12753,7 +12801,7 @@ struct llama_context * llama_new_context_with_model( ggml_set_name(ctx->inp_K_shift, "inp_K_shift"); ggml_set_name(ctx->inp_mean, "inp_mean"); ggml_set_name(ctx->inp_cls, "inp_cls"); - if (ctx->kv_self.unlimited) { + if (ctx->kv_self.recurrent) { ggml_set_name(ctx->inp_s_copy, "inp_s_copy"); ggml_set_name(ctx->inp_s_mask, "inp_s_mask"); ggml_set_name(ctx->inp_s_seq, "inp_s_seq"); From 1af1000f1053a2bc8864cb87acc0d3da62dcd950 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 2 Mar 2024 11:12:30 -0500 Subject: [PATCH 29/41] mamba : more correctly update the "used" field of the KV cache --- llama.cpp | 31 +++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/llama.cpp b/llama.cpp index eb1f02e426d1e..fe24f509628e6 100644 --- a/llama.cpp +++ b/llama.cpp @@ -2212,22 +2212,19 @@ static bool llama_kv_cache_find_slot( // For recurrent state architectures (like Mamba), // each KV cache cell can store the state for a whole sequence. - // starting point to find the minimum seq_id used in the batch - cache.head = cache.size - 1; - // likewise, to find the max seq_id in the batch - cache.used = 0; + llama_seq_id min = cache.size - 1; + llama_seq_id max = 0; + for (uint32_t i = 0; i < n_tokens; ++i) { for (int32_t j = 0; j < batch.n_seq_id[i]; ++j) { llama_seq_id seq_id = batch.seq_id[i][j]; // make sure it's a valid seq_id - if ((uint32_t)seq_id < cache.size) { - // the number of "used" cells is simply the biggest seq_id - if (cache.used < (uint32_t)seq_id) { - cache.used = seq_id; + if ((uint32_t) seq_id < cache.size) { + if (seq_id > max) { + max = seq_id; } - // the "head" is the smallest seq_id - if (cache.head > (uint32_t)seq_id) { - cache.head = seq_id; + if (seq_id < min) { + min = seq_id; } // Assuming the tokens are in-order if (batch.pos[i] != cache.cells[seq_id].pos + 1) { @@ -2236,6 +2233,9 @@ static bool llama_kv_cache_find_slot( LLAMA_LOG_WARN("%s: non-consecutive token position %d after %d for sequence %d\n", __func__, batch.pos[i], cache.cells[seq_id].pos, seq_id); } + if (cache.cells[seq_id].pos < 0 && 0 <= batch.pos[i]) { + cache.used += 1; + } cache.cells[seq_id].pos = batch.pos[i]; // NOTE: seq_ids are not inserted here; they are handled when the input tensors are set } else { @@ -2247,9 +2247,12 @@ static bool llama_kv_cache_find_slot( } } - cache.n = cache.used - cache.head + 1; - // sanity check (max >= min) - return cache.used >= cache.head; + // allow getting the range of used cells, from head to head + n + cache.head = min; + cache.n = max - min + 1; + + // sanity check + return max >= min; } // otherwise, one cell per token. From d52dd501f084ce9a8c1d883dedcf14eeda0ac5a4 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 2 Mar 2024 21:39:28 -0500 Subject: [PATCH 30/41] ggml : in ggml_ssm_scan, use a threshold for soft_plus This is how the official Mamba implementation does it, and it's also what torch.nn.Softplus does. 
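For reference, a minimal sketch of the thresholded softplus (in Python, mirroring the 20.0 threshold used in the patched C code; torch.nn.Softplus documents the same default). Above the threshold, log1p(exp(x)) equals x within float32 precision, and taking the linear branch also avoids overflow in exp() for very large inputs:

    import math

    def soft_plus(x: float, threshold: float = 20.0) -> float:
        # above the threshold, log(1 + e^x) ~= x (exact in float32), so exp(x) is not needed
        return x if x > threshold else math.log1p(math.exp(x))

    for v in (-5.0, 0.0, 19.0, 25.0, 100.0):
        print(v, soft_plus(v))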
--- ggml.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/ggml.c b/ggml.c index 981a2302a41c1..9b5d0302bf48a 100644 --- a/ggml.c +++ b/ggml.c @@ -14904,7 +14904,8 @@ static void ggml_compute_forward_ssm_scan_f32( // d_inner for (int i1 = 0; i1 < ir; ++i1) { - float dt_soft_plus = log1pf(expf(dt[i1])); + // ref: https://github.com/state-spaces/mamba/blob/34076d664838588a3c97727b263478ab9f621a07/mamba_ssm/ops/triton/selective_state_update.py#L78 + float dt_soft_plus = dt[i1] <= 20.0f ? log1pf(expf(dt[i1])) : dt[i1]; float x_dt = x[i1] * dt_soft_plus; float sumf = 0.0f; // d_state From b83fbc92873d7f5d0a703372eb4e15c451974ad2 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sat, 2 Mar 2024 23:39:19 -0500 Subject: [PATCH 31/41] convert : for Mamba, fallback to internal NeoX tokenizer The resulting models are exactly the same as if the tokenizer.json and tokenizer_config.json of GPT-NeoX were there. --- convert-hf-to-gguf.py | 29 +++++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index bed830ce6e4c1..a288d73f8081e 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1852,8 +1852,33 @@ def set_vocab(self): vocab_size = self.hparams["vocab_size"] # Round vocab size to next multiple of 8 pad_vocab = self.hparams.get("pad_vocab_size_multiple", 8) - self.hparams["vocab_size"] = ((vocab_size + (pad_vocab - 1)) // pad_vocab) * pad_vocab - return self._set_vocab_gpt2() + # pad using ceiling division + # ref: https://stackoverflow.com/a/17511341/22827863 + vocab_size = -(vocab_size // -pad_vocab) * pad_vocab + self.hparams["vocab_size"] = vocab_size + + if (self.dir_model / "tokenizer.json").is_file(): + self._set_vocab_gpt2() + else: + # Use the GPT-NeoX tokenizer when no tokenizer files are present + tokenizer_path = Path(sys.path[0]) / "models" / "ggml-vocab-gpt-neox.gguf" + print(f"Using tokenizer from '{os.path.relpath(tokenizer_path, os.getcwd())}'") + neox_reader = gguf.GGUFReader(tokenizer_path, "r") + + field = neox_reader.get_field(gguf.Keys.Tokenizer.MODEL) + self.gguf_writer.add_tokenizer_model(bytes(field.parts[-1])) + field = neox_reader.get_field(gguf.Keys.Tokenizer.LIST) + self.gguf_writer.add_token_list([bytes(field.parts[i]) for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.TOKEN_TYPE) + self.gguf_writer.add_token_types([field.parts[i].tolist()[0] for i in field.data][:vocab_size]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.MERGES) + self.gguf_writer.add_token_merges([bytes(field.parts[i]) for i in field.data]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.BOS_ID) + self.gguf_writer.add_bos_token_id(field.parts[-1].tolist()[0]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.EOS_ID) + self.gguf_writer.add_eos_token_id(field.parts[-1].tolist()[0]) + field = neox_reader.get_field(gguf.Keys.Tokenizer.UNK_ID) + self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) def set_gguf_parameters(self): d_model = self.hparams["d_model"] From eefb794bd7ee9086c4c1c300acbb80ac75d214e6 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Sun, 3 Mar 2024 13:55:48 -0500 Subject: [PATCH 32/41] mamba : support state saving and restoring --- llama.cpp | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index fe24f509628e6..9b17bf347eca8 100644 --- a/llama.cpp +++ b/llama.cpp @@ -13344,8 +13344,8 @@ static void 
llama_copy_state_data_internal(struct llama_context * ctx, llama_dat const auto & hparams = ctx->model.hparams; const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); const size_t kv_buf_size = kv_self.total_size(); const uint32_t kv_head = llama_kv_cache_cell_max(kv_self); @@ -13366,6 +13366,17 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size()); data_ctx->write(tmp_buf.data(), tmp_buf.size()); + if (kv_self.recurrent) { + // v is contiguous for recurrent models + // TODO: use other tensors for state models than k and v + const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); + + tmp_buf.resize(v_size); + ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), 0, tmp_buf.size()); + data_ctx->write(tmp_buf.data(), tmp_buf.size()); + continue; + } + // v is not contiguous, copy row by row const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); @@ -13456,8 +13467,8 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { const auto & hparams = ctx->model.hparams; const uint32_t n_layer = hparams.n_layer; - const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa(); - const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa(); + const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa() + hparams.n_embd_k_s(); + const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa() + hparams.n_embd_v_s(); size_t kv_buf_size; uint32_t kv_head; @@ -13478,6 +13489,16 @@ size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) { ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size); inp += k_size; + if (kv_self.recurrent) { + // v is contiguous for recurrent models + // TODO: use other tensors for state models than k and v + const size_t v_size = ggml_row_size(kv_self.v_l[il]->type, n_embd_v_gqa*kv_head); + + ggml_backend_tensor_set(kv_self.v_l[il], inp, 0, v_size); + inp += v_size; + continue; + } + // v is not contiguous, copy row by row const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head); const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, kv_size); From 2a99d1b2437375988fc00eb71c43776909e0da32 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 4 Mar 2024 10:10:50 -0500 Subject: [PATCH 33/41] ggml : implicitly pass src tensors through dst for Mamba-related ops --- ggml.c | 47 +++++++++++++++++++---------------------------- 1 file changed, 19 insertions(+), 28 deletions(-) diff --git a/ggml.c b/ggml.c index 9b5d0302bf48a..919f84fa93e61 100644 --- a/ggml.c +++ b/ggml.c @@ -14700,15 +14700,16 @@ static void ggml_compute_forward_flash_attn_back( static void ggml_compute_forward_ssm_conv_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, // conv_state - const struct ggml_tensor * src1, // x - const struct ggml_tensor * src2, // conv1d.weight - const struct ggml_tensor * src3, // state_seq struct ggml_tensor * dst) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } + const struct ggml_tensor * src0 = dst->src[0]; // conv_state + const struct ggml_tensor * src1 = dst->src[1]; // x + const 
struct ggml_tensor * src2 = dst->src[2]; // conv1d.weight + const struct ggml_tensor * src3 = dst->src[3]; // state_seq + const int ith = params->ith; const int nth = params->nth; @@ -14808,15 +14809,11 @@ static void ggml_compute_forward_ssm_conv_f32( static void ggml_compute_forward_ssm_conv( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - const struct ggml_tensor * src3, struct ggml_tensor * dst) { - switch (src0->type) { + switch (dst->src[0]->type) { case GGML_TYPE_F32: { - ggml_compute_forward_ssm_conv_f32(params, src0, src1, src2, src3, dst); + ggml_compute_forward_ssm_conv_f32(params, dst); } break; default: { @@ -14829,18 +14826,19 @@ static void ggml_compute_forward_ssm_conv( static void ggml_compute_forward_ssm_scan_f32( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, // s - const struct ggml_tensor * src1, // x - const struct ggml_tensor * src2, // dt - const struct ggml_tensor * src3, // A - const struct ggml_tensor * src4, // B - const struct ggml_tensor * src5, // C - const struct ggml_tensor * src6, // sq struct ggml_tensor * dst) { if (params->type == GGML_TASK_TYPE_INIT || params->type == GGML_TASK_TYPE_FINALIZE) { return; } + const struct ggml_tensor * src0 = dst->src[0]; // s + const struct ggml_tensor * src1 = dst->src[1]; // x + const struct ggml_tensor * src2 = dst->src[2]; // dt + const struct ggml_tensor * src3 = dst->src[3]; // A + const struct ggml_tensor * src4 = dst->src[4]; // B + const struct ggml_tensor * src5 = dst->src[5]; // C + const struct ggml_tensor * src6 = dst->src[6]; // sq + const int ith = params->ith; const int nth = params->nth; @@ -14936,18 +14934,11 @@ static void ggml_compute_forward_ssm_scan_f32( static void ggml_compute_forward_ssm_scan( const struct ggml_compute_params * params, - const struct ggml_tensor * src0, - const struct ggml_tensor * src1, - const struct ggml_tensor * src2, - const struct ggml_tensor * src3, - const struct ggml_tensor * src4, - const struct ggml_tensor * src5, - const struct ggml_tensor * src6, struct ggml_tensor * dst) { - switch (src0->type) { + switch (dst->src[0]->type) { case GGML_TYPE_F32: { - ggml_compute_forward_ssm_scan_f32(params, src0, src1, src2, src3, src4, src5, src6, dst); + ggml_compute_forward_ssm_scan_f32(params, dst); } break; default: { @@ -16009,11 +16000,11 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm } break; case GGML_OP_SSM_CONV: { - ggml_compute_forward_ssm_conv(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor); + ggml_compute_forward_ssm_conv(params, tensor); } break; case GGML_OP_SSM_SCAN: { - ggml_compute_forward_ssm_scan(params, tensor->src[0], tensor->src[1], tensor->src[2], tensor->src[3], tensor->src[4], tensor->src[5], tensor->src[6], tensor); + ggml_compute_forward_ssm_scan(params, tensor); } break; case GGML_OP_WIN_PART: { From 93fd4b8d5bc4a190adc2cae897eaf518e38a6fc3 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Mon, 4 Mar 2024 15:57:40 -0500 Subject: [PATCH 34/41] mamba : clarify some comments --- llama.cpp | 6 +++--- llama.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/llama.cpp b/llama.cpp index 9b17bf347eca8..74a802fd4d72e 100644 --- a/llama.cpp +++ b/llama.cpp @@ -1712,14 +1712,14 @@ struct llama_hparams { return n_embd_head_v * n_head_kv; } - uint32_t n_embd_k_s() const { // dimension of the recurrent convolution state embeddings + 
uint32_t n_embd_k_s() const { // dimension of the rolling state embeddings // corresponds to Mamba's conv_states size // TODO: maybe support other convolution strides than 1 // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner; } - uint32_t n_embd_v_s() const { // dimension of the ssm scan state embeddings + uint32_t n_embd_v_s() const { // dimension of the recurrent state embeddings // corresponds to Mamba's ssm_states size return ssm_d_state * ssm_d_inner; } @@ -8573,7 +8573,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) { } } } - // For Mamba (and other constant-time-and-size architectures), + // For Mamba (and other recurrent architectures), // update the correct state(s)/sequence(s) for each token of the batch. // Like with the KQ_mask, if a token in the batch has multiple sequences, // they are assumed to be equivalent (not here, but in ggml_ssm_scan and ggml_ssm_conv). diff --git a/llama.h b/llama.h index f0aca1fe5410a..ee804a658b4d8 100644 --- a/llama.h +++ b/llama.h @@ -235,7 +235,7 @@ extern "C" { uint32_t seed; // RNG seed, -1 for random uint32_t n_ctx; // text context, 0 = from model uint32_t n_batch; // prompt processing maximum batch size - uint32_t n_parallel; // number of parallel sequences + uint32_t n_parallel; // number of parallel sequences (i.e. distinct states for recurrent models) uint32_t n_threads; // number of threads to use for generation uint32_t n_threads_batch; // number of threads to use for batch processing From 7cd5a1f986e9821b14bf40f04d24153ca2339185 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 7 Mar 2024 13:52:58 -0500 Subject: [PATCH 35/41] server : fix cache_tokens not getting correctly resized Otherwise, when the "we have to evaluate at least 1 token" special case was triggered, an extra token was kept in cache_tokens even if it was removed from the KV cache. For Mamba, this caused useless prompt reprocessing when the previous request triggered the above case. --- examples/server/server.cpp | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/examples/server/server.cpp b/examples/server/server.cpp index a920f2d92498d..23c5189a5ba8c 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -1797,9 +1797,6 @@ struct server_context { // reuse any previously computed tokens that are common with the new prompt slot.n_past = common_part(slot.cache_tokens, prompt_tokens); - // remove the non-common part from the cache - slot.cache_tokens.resize(slot.n_past); - // push the prompt into the sampling context (do not apply grammar) for (int i = 0; i < slot.n_past; ++i) { llama_sampling_accept(slot.ctx_sampling, ctx, slot.cache_tokens[i], false); @@ -1846,11 +1843,13 @@ struct server_context { slot.n_past = 0; slot.n_past_se = 0; slot.ga_i = 0; - slot.cache_tokens.clear(); // TODO: is the system prompt ever in the sampling context? 
llama_sampling_reset(slot.ctx_sampling); } + // remove the non-common part from the cache + slot.cache_tokens.resize(slot.n_past); + LOG_INFO("kv cache rm [p0, end)", { { "id_slot", slot.id }, { "id_task", slot.id_task }, From d8024a486b3deffcd93c525034a0687adbb9ad04 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 7 Mar 2024 20:28:42 -0500 Subject: [PATCH 36/41] convert-hf : support new metadata keys for Mamba For the models available at https://huggingface.co/collections/state-spaces/transformers-compatible-mamba-65e7b40ab87e5297e45ae406 --- convert-hf-to-gguf.py | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index b8c2bf0795f3d..d526e3157e614 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1884,14 +1884,15 @@ def set_vocab(self): self.gguf_writer.add_unk_token_id(field.parts[-1].tolist()[0]) def set_gguf_parameters(self): - d_model = self.hparams["d_model"] - d_conv = self.hparams.get("d_conv", 4) - d_inner = self.hparams.get("d_inner", 2 * d_model) - d_state = self.hparams.get("d_state", 16) + d_model = self.find_hparam(["hidden_size", "d_model"]) + d_conv = self.find_hparam(["conv_kernel", "d_conv"], optional=True) or 4 + d_inner = self.find_hparam(["intermediate_size", "d_inner"], optional=True) or 2 * d_model + d_state = self.find_hparam(["state_size", "d_state"], optional=True) or 16 # ceiling division # ref: https://stackoverflow.com/a/17511341/22827863 # ref: https://github.com/state-spaces/mamba/blob/ce59daea3a090d011d6476c6e5b97f6d58ddad8b/mamba_ssm/modules/mamba_simple.py#L58 - dt_rank = self.hparams.get("dt_rank", -(d_model // -16)) + dt_rank = self.find_hparam(["time_step_rank", "dt_rank"], optional=True) or -(d_model // -16) + rms_norm_eps = self.find_hparam(["layer_norm_epsilon", "rms_norm_eps"], optional=True) or 1e-5 # Fail early for models which don't have a block expansion factor of 2 assert d_inner == 2 * d_model @@ -1906,7 +1907,7 @@ def set_gguf_parameters(self): self.gguf_writer.add_ssm_inner_length(d_inner) self.gguf_writer.add_ssm_state_length(d_state) self.gguf_writer.add_ssm_dt_rank(dt_rank) - self.gguf_writer.add_layer_norm_rms_eps(self.hparams.get("rms_norm_eps", 1e-5)) + self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) self.gguf_writer.add_file_type(self.ftype) def write_tensors(self): From 17e4d6c96af7c20d8f3963f6e2465ee2753b3f6d Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 7 Mar 2024 21:32:48 -0500 Subject: [PATCH 37/41] mamba : rename metadata to be more similar to transformers library This breaks existing converted-to-GGUF models, but the metadata names are more "standard". 
mamba : support mamba-*-hf models These models share their token_embd.weight with their output.weight --- convert-hf-to-gguf.py | 8 ++++---- gguf-py/gguf/constants.py | 16 ++++++++-------- gguf-py/gguf/gguf_writer.py | 16 ++++++++-------- gguf-py/gguf/tensor_mapping.py | 7 ++++--- llama.cpp | 35 ++++++++++++++++++++-------------- 5 files changed, 45 insertions(+), 37 deletions(-) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index d526e3157e614..3318be35c73fb 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1903,10 +1903,10 @@ def set_gguf_parameters(self): self.gguf_writer.add_feed_forward_length(0) # unused, but seemingly required when loading self.gguf_writer.add_head_count(0) # unused, but seemingly required when loading self.gguf_writer.add_block_count(self.hparams["n_layer"]) - self.gguf_writer.add_ssm_conv_kernel_size(d_conv) - self.gguf_writer.add_ssm_inner_length(d_inner) - self.gguf_writer.add_ssm_state_length(d_state) - self.gguf_writer.add_ssm_dt_rank(dt_rank) + self.gguf_writer.add_ssm_conv_kernel(d_conv) + self.gguf_writer.add_ssm_inner_size(d_inner) + self.gguf_writer.add_ssm_state_size(d_state) + self.gguf_writer.add_ssm_time_step_rank(dt_rank) self.gguf_writer.add_layer_norm_rms_eps(rms_norm_eps) self.gguf_writer.add_file_type(self.ftype) diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py index 8030023f30648..b23badb1019c1 100644 --- a/gguf-py/gguf/constants.py +++ b/gguf-py/gguf/constants.py @@ -62,10 +62,10 @@ class Rope: SCALING_FINETUNED = "{arch}.rope.scaling.finetuned" class SSM: - CONV_KERNEL_SIZE = "{arch}.ssm.d_conv" - INNER_LENGTH = "{arch}.ssm.d_inner" - STATE_LENGTH = "{arch}.ssm.d_state" - DT_RANK = "{arch}.ssm.dt_rank" + CONV_KERNEL = "{arch}.ssm.conv_kernel" + INNER_SIZE = "{arch}.ssm.inner_size" + STATE_SIZE = "{arch}.ssm.state_size" + TIME_STEP_RANK = "{arch}.ssm.time_step_rank" class Tokenizer: MODEL = "tokenizer.ggml.model" @@ -770,10 +770,10 @@ def get_type(val: Any) -> GGUFValueType: KEY_ROPE_SCALING_FINETUNED = Keys.Rope.SCALING_FINETUNED # SSM -KEY_SSM_CONV_KERNEL_SIZE = Keys.SSM.CONV_KERNEL_SIZE -KEY_SSM_INNER_LENGTH = Keys.SSM.INNER_LENGTH -KEY_SSM_STATE_LENGTH = Keys.SSM.STATE_LENGTH -KEY_SSM_DT_RANK = Keys.SSM.DT_RANK +KEY_SSM_CONV_KERNEL = Keys.SSM.CONV_KERNEL +KEY_SSM_INNER_SIZE = Keys.SSM.INNER_SIZE +KEY_SSM_STATE_SIZE = Keys.SSM.STATE_SIZE +KEY_SSM_TIME_STEP_RANK = Keys.SSM.TIME_STEP_RANK # tokenization KEY_TOKENIZER_MODEL = Keys.Tokenizer.MODEL diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py index 146358e6971cf..e49c5db6866a2 100644 --- a/gguf-py/gguf/gguf_writer.py +++ b/gguf-py/gguf/gguf_writer.py @@ -382,17 +382,17 @@ def add_rope_scaling_orig_ctx_len(self, value: int) -> None: def add_rope_scaling_finetuned(self, value: bool) -> None: self.add_bool(Keys.Rope.SCALING_FINETUNED.format(arch=self.arch), value) - def add_ssm_conv_kernel_size(self, value: int) -> None: - self.add_uint32(Keys.SSM.CONV_KERNEL_SIZE.format(arch=self.arch), value) + def add_ssm_conv_kernel(self, value: int) -> None: + self.add_uint32(Keys.SSM.CONV_KERNEL.format(arch=self.arch), value) - def add_ssm_inner_length(self, value: int) -> None: - self.add_uint32(Keys.SSM.INNER_LENGTH.format(arch=self.arch), value) + def add_ssm_inner_size(self, value: int) -> None: + self.add_uint32(Keys.SSM.INNER_SIZE.format(arch=self.arch), value) - def add_ssm_state_length(self, value: int) -> None: - self.add_uint32(Keys.SSM.STATE_LENGTH.format(arch=self.arch), value) + def add_ssm_state_size(self, value: int) -> 
None: + self.add_uint32(Keys.SSM.STATE_SIZE.format(arch=self.arch), value) - def add_ssm_dt_rank(self, value: int) -> None: - self.add_uint32(Keys.SSM.DT_RANK.format(arch=self.arch), value) + def add_ssm_time_step_rank(self, value: int) -> None: + self.add_uint32(Keys.SSM.TIME_STEP_RANK.format(arch=self.arch), value) def add_tokenizer_model(self, model: str) -> None: self.add_string(Keys.Tokenizer.MODEL, model) diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py index 85af29549de6c..ed89955d8970f 100644 --- a/gguf-py/gguf/tensor_mapping.py +++ b/gguf-py/gguf/tensor_mapping.py @@ -20,8 +20,9 @@ class TensorNameMap: "wte", # gpt2 "transformer.embd.wte", # phi2 "model.tok_embeddings", # internlm2 - "model.embedding", # mamba + "model.embedding", # mamba-qbert "backbone.embedding", # mamba + "backbone.embeddings", # mamba-hf ), # Token type embeddings @@ -63,7 +64,7 @@ class TensorNameMap: "language_model.encoder.final_layernorm", # persimmon "model.final_layernorm", # persimmon "lm_head.ln", # phi2 - "model.norm_f", # mamba + "model.norm_f", # mamba-qbert "backbone.norm_f", # mamba ), @@ -90,7 +91,7 @@ class TensorNameMap: "transformer.h.{bid}.ln", # phi2 "model.layers.layers.{bid}.norm", # plamo "model.layers.{bid}.attention_norm", # internlm2 - "model.layers.{bid}.norm", # mamba + "model.layers.{bid}.norm", # mamba-qbert "backbone.layers.{bid}.norm", # mamba ), diff --git a/llama.cpp b/llama.cpp index 5c5b7a119d7b4..a54ce43e98818 100644 --- a/llama.cpp +++ b/llama.cpp @@ -286,10 +286,10 @@ enum llm_kv { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, LLM_KV_ROPE_SCALING_FINETUNED, - LLM_KV_SSM_D_INNER, - LLM_KV_SSM_D_CONV, - LLM_KV_SSM_D_STATE, - LLM_KV_SSM_DT_RANK, + LLM_KV_SSM_INNER_SIZE, + LLM_KV_SSM_CONV_KERNEL, + LLM_KV_SSM_STATE_SIZE, + LLM_KV_SSM_TIME_STEP_RANK, LLM_KV_TOKENIZER_MODEL, LLM_KV_TOKENIZER_LIST, @@ -349,10 +349,10 @@ static const std::map LLM_KV_NAMES = { { LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, "%s.rope.scaling.original_context_length" }, { LLM_KV_ROPE_SCALING_FINETUNED, "%s.rope.scaling.finetuned" }, - { LLM_KV_SSM_D_CONV, "%s.ssm.d_conv" }, - { LLM_KV_SSM_D_INNER, "%s.ssm.d_inner"}, - { LLM_KV_SSM_D_STATE, "%s.ssm.d_state"}, - { LLM_KV_SSM_DT_RANK, "%s.ssm.dt_rank"}, + { LLM_KV_SSM_CONV_KERNEL, "%s.ssm.conv_kernel" }, + { LLM_KV_SSM_INNER_SIZE, "%s.ssm.inner_size" }, + { LLM_KV_SSM_STATE_SIZE, "%s.ssm.state_size" }, + { LLM_KV_SSM_TIME_STEP_RANK, "%s.ssm.time_step_rank" }, { LLM_KV_TOKENIZER_MODEL, "tokenizer.ggml.model" }, { LLM_KV_TOKENIZER_LIST, "tokenizer.ggml.tokens" }, @@ -3599,10 +3599,10 @@ static void llm_load_hparams( } break; case LLM_ARCH_MAMBA: { - ml.get_key(LLM_KV_SSM_D_CONV, hparams.ssm_d_conv); - ml.get_key(LLM_KV_SSM_D_INNER, hparams.ssm_d_inner); - ml.get_key(LLM_KV_SSM_D_STATE, hparams.ssm_d_state); - ml.get_key(LLM_KV_SSM_DT_RANK, hparams.ssm_dt_rank); + ml.get_key(LLM_KV_SSM_CONV_KERNEL, hparams.ssm_d_conv); + ml.get_key(LLM_KV_SSM_INNER_SIZE, hparams.ssm_d_inner); + ml.get_key(LLM_KV_SSM_STATE_SIZE, hparams.ssm_d_state); + ml.get_key(LLM_KV_SSM_TIME_STEP_RANK, hparams.ssm_dt_rank); ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps); @@ -4864,8 +4864,15 @@ static bool llm_load_tensors( // output { - model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); - model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}); + model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}); + + model.output = 
ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, false); + // if output is NULL, init from the input tok embed, duplicated to allow offloading + if (model.output == NULL) { + model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}); + ml.n_created--; // artificial tensor + ml.size_data += ggml_nbytes(model.output); + } } for (int i = 0; i < n_layer; ++i) { From 1c8ea5584368d60aaa483668f00585aa27919a5f Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Thu, 7 Mar 2024 22:29:45 -0500 Subject: [PATCH 38/41] mamba : add missing spaces This is purely a formatting change. --- llama.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llama.cpp b/llama.cpp index a54ce43e98818..dea44a4946db5 100644 --- a/llama.cpp +++ b/llama.cpp @@ -826,14 +826,14 @@ static const std::map> LLM_TENSOR_NA { LLM_TENSOR_TOKEN_EMBD, "token_embd" }, { LLM_TENSOR_OUTPUT_NORM, "output_norm" }, { LLM_TENSOR_OUTPUT, "output" }, - { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm"}, - { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in"}, - { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d"}, - { LLM_TENSOR_SSM_X, "blk.%d.ssm_x"}, - { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt"}, - { LLM_TENSOR_SSM_A, "blk.%d.ssm_a"}, - { LLM_TENSOR_SSM_D, "blk.%d.ssm_d"}, - { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out"}, + { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" }, + { LLM_TENSOR_SSM_IN, "blk.%d.ssm_in" }, + { LLM_TENSOR_SSM_CONV1D, "blk.%d.ssm_conv1d" }, + { LLM_TENSOR_SSM_X, "blk.%d.ssm_x" }, + { LLM_TENSOR_SSM_DT, "blk.%d.ssm_dt" }, + { LLM_TENSOR_SSM_A, "blk.%d.ssm_a" }, + { LLM_TENSOR_SSM_D, "blk.%d.ssm_d" }, + { LLM_TENSOR_SSM_OUT, "blk.%d.ssm_out" }, }, }, { From d0d32dced9dd206c645241fd44487980301d33b4 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 8 Mar 2024 10:06:33 -0500 Subject: [PATCH 39/41] convert-hf : omit output.weight when identical with token_embd.weight Only for Mamba for now, but it might be relevant for other models eventually. Most Mamba models actually share these two tensors, albeit implicitly. 
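As a rough illustration (hypothetical helper, not part of the change below), the sharing can be explicit (both names point at the same storage) or implicit (two separate tensors that happen to hold identical values), which is why a value comparison such as torch.equal is needed during conversion:

    import torch

    def embd_and_output_are_shared(tok_embd: torch.Tensor, output: torch.Tensor) -> bool:
        # explicitly tied weights: same underlying storage
        if tok_embd.data_ptr() == output.data_ptr():
            return True
        # implicitly shared weights: separate tensors with identical contents
        return tok_embd.shape == output.shape and torch.equal(tok_embd, output)

When the check passes, output.weight can be omitted from the GGUF file, since the llama.cpp loader already falls back to token_embd.weight when output is missing.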
--- convert-hf-to-gguf.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py index 3318be35c73fb..5eee320163d29 100755 --- a/convert-hf-to-gguf.py +++ b/convert-hf-to-gguf.py @@ -1913,6 +1913,11 @@ def set_gguf_parameters(self): def write_tensors(self): block_count = self.hparams["n_layer"] tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count) + + tok_embd = None + tok_embd_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.TOKEN_EMBD] + ".weight" + output_name = gguf.TENSOR_NAMES[gguf.MODEL_TENSOR.OUTPUT] + ".weight" + for name, data_torch in self.get_tensors(): old_dtype = data_torch.dtype @@ -1930,6 +1935,14 @@ def write_tensors(self): print("A_log --> A ==> " + new_name) data_torch = -torch.exp(data_torch) + # assuming token_embd.weight is seen before output.weight + if tok_embd is not None and new_name == output_name: + if torch.equal(tok_embd, data_torch): + print(f"{output_name} is equivalent to {tok_embd_name}, omitting") + continue + if new_name == tok_embd_name: + tok_embd = data_torch + data = data_torch.squeeze().numpy() n_dims = len(data.shape) From 3e5685f7eac32522ee41d61e6ff26fd1f390e734 Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 8 Mar 2024 11:03:37 -0500 Subject: [PATCH 40/41] readme : add Mamba to supported models, and add recent API changes --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f754022de894d..d7dba73e62267 100644 --- a/README.md +++ b/README.md @@ -10,6 +10,7 @@ Inference of Meta's [LLaMA](https://arxiv.org/abs/2302.13971) model (and others) ### Recent API changes +- [2024 Mar 8] `llama_kv_cache_seq_rm()` returns a `bool` instead of `void`, and new `llama_n_max_seq()` returns the upper limit of acceptable `seq_id` in batches (relevant when dealing with multiple sequences) https://github.com/ggerganov/llama.cpp/pull/5328 - [2024 Mar 4] Embeddings API updated https://github.com/ggerganov/llama.cpp/pull/5796 - [2024 Mar 3] `struct llama_context_params` https://github.com/ggerganov/llama.cpp/pull/5849 @@ -110,6 +111,7 @@ Typically finetunes of the base models below are supported as well. - [x] [InternLM2](https://huggingface.co/models?search=internlm2) - [x] [CodeShell](https://github.com/WisdomShell/codeshell) - [x] [Gemma](https://ai.google.dev/gemma) +- [x] [Mamba](https://github.com/state-spaces/mamba) **Multimodal models:** From 39579d3ceb3e5a47b18e82988cbbc9ea3c348f5b Mon Sep 17 00:00:00 2001 From: Francis Couture-Harpin Date: Fri, 8 Mar 2024 12:24:11 -0500 Subject: [PATCH 41/41] mamba : move state_seq and state_mask views outside layer loop A few tensors were also missing `struct` in front of `ggml_tensor`. 
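The motivation is ordinary loop-invariant hoisting: state_seq and state_mask do not depend on the layer index, so building them once avoids adding n_layer copies of the same view nodes to the compute graph. A toy sketch of the pattern (hypothetical names, not the ggml API):

    def build_graph(n_layer, make_view, build_layer):
        # built once: identical for every layer
        state_mask = make_view("s_mask")
        state_seq = make_view("s_seq")
        # reused inside the loop instead of being re-created per layer
        return [build_layer(il, state_mask, state_seq) for il in range(n_layer)]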
--- llama.cpp | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/llama.cpp b/llama.cpp index dea44a4946db5..c475ea0f71c44 100644 --- a/llama.cpp +++ b/llama.cpp @@ -5540,9 +5540,11 @@ struct llm_build_context { struct ggml_cgraph * build_s_copy() { struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false); + GGML_ASSERT(kv_self.recurrent); + for (int il = 0; il < n_layer; ++il) { - ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); - ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); + struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); + struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); conv_states = ggml_get_rows(ctx0, conv_states, lctx.inp_s_copy); ssm_states = ggml_get_rows(ctx0, ssm_states, lctx.inp_s_copy); @@ -8171,14 +8173,16 @@ struct llm_build_context { inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, lctx.inp_tokens, lctx.inp_embd, cb); cb(inpL, "inp_embd", -1); + struct ggml_tensor * state_mask = ggml_view_2d(ctx0, lctx.inp_s_mask, 1, n_kv, lctx.inp_s_mask->nb[0], 0); + struct ggml_tensor * state_seq = ggml_view_2d(ctx0, lctx.inp_s_seq, n_kv, n_tokens, n_kv*ggml_element_size(lctx.inp_s_seq), 0); + for (int il = 0; il < n_layer; ++il) { // (ab)using the KV cache to store the states - ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); - ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); + struct ggml_tensor * conv_states = ggml_reshape_2d(ctx0, kv_self.k_l[il], hparams.n_embd_k_s(), kv_self.size); + struct ggml_tensor * ssm_states = ggml_reshape_2d(ctx0, kv_self.v_l[il], hparams.n_embd_v_s(), kv_self.size); // clear states of sequences which are starting at the beginning of this batch { - ggml_tensor * state_mask = ggml_view_2d(ctx0, lctx.inp_s_mask, 1, n_kv, lctx.inp_s_mask->nb[0], 0); conv_states = ggml_mul(ctx0, ggml_view_2d(ctx0, conv_states, conv_states->ne[0], n_kv, conv_states->nb[1], kv_head*conv_states->nb[1]), state_mask); @@ -8203,8 +8207,6 @@ struct llm_build_context { struct ggml_tensor * x = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], 0); struct ggml_tensor * z = ggml_view_2d(ctx0, xz, d_inner, xz->ne[1], xz->nb[1], ggml_element_size(xz)*d_inner); - struct ggml_tensor * state_seq = ggml_view_2d(ctx0, lctx.inp_s_seq, n_kv, n_tokens, n_kv*ggml_element_size(lctx.inp_s_seq), 0); - // conv { // Custom operator which is needed only to ease simultaneous sequence processing.