llamafile : improve moe prompt eval speed on cpu
This change introduces a llamafile_mixmul() API that allows tinyBLAS to
speed up "Mixture of Experts" models. On my Threadripper, Mixtral 8x7b
F16 weights now process prompts 2x faster. I am also seeing a 60 percent
improvement with Mixtral 8x22b Q4_0. Support is also provided for Q8_0,
since it too is handled by tinyBLAS. MoE models spend most of their time
in MUL_MAT_ID rather than MUL_MAT, which is why llamafile_sgemm() was not
able to help them before. The new code works by decomposing the mixmul
operation into fast 2D llamafile_sgemm() calls. This change also adds
BF16 support to tinyBLAS.
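
As a rough illustration of the decomposition described above, the following
self-contained C++ sketch groups tokens by the expert they were routed to and
runs one dense 2D matmul per expert, which is the shape of work a fast 2D GEMM
such as llamafile_sgemm() handles well. The helper names (gemm_nt, moe_matmul)
and the shapes are illustrative assumptions, not code from this commit.

// Minimal sketch of the per-expert decomposition (illustrative only).
#include <cstdio>
#include <vector>

// C[m x n] = A[m x k] * B^T, with B stored row-major as [n x k].
// Stand-in for a fast 2D GEMM kernel.
static void gemm_nt(const float *A, const float *B, float *C, int m, int n, int k) {
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j) {
            float acc = 0.0f;
            for (int p = 0; p < k; ++p) acc += A[i*k + p] * B[j*k + p];
            C[i*n + j] = acc;
        }
}

// experts[e]: weight matrix of expert e, shape [n_out x n_in], row-major.
// x: n_tok input rows of length n_in; route[t]: expert chosen for token t.
// y: n_tok output rows of length n_out.
static void moe_matmul(const std::vector<std::vector<float>> &experts,
                       const std::vector<float> &x, const std::vector<int> &route,
                       std::vector<float> &y, int n_tok, int n_in, int n_out) {
    for (int e = 0; e < (int) experts.size(); ++e) {
        // gather the tokens routed to expert e into a contiguous block
        std::vector<int> rows;
        for (int t = 0; t < n_tok; ++t)
            if (route[t] == e) rows.push_back(t);
        if (rows.empty()) continue;
        std::vector<float> xb(rows.size() * n_in), yb(rows.size() * n_out);
        for (size_t r = 0; r < rows.size(); ++r)
            for (int i = 0; i < n_in; ++i) xb[r*n_in + i] = x[rows[r]*n_in + i];
        // one fast 2D matmul covers every token assigned to this expert
        gemm_nt(xb.data(), experts[e].data(), yb.data(),
                (int) rows.size(), n_out, n_in);
        // scatter the block back to the per-token output rows
        for (size_t r = 0; r < rows.size(); ++r)
            for (int o = 0; o < n_out; ++o) y[rows[r]*n_out + o] = yb[r*n_out + o];
    }
}

int main() {
    const int n_tok = 4, n_in = 3, n_out = 2;
    std::vector<std::vector<float>> experts = {
        {1, 0, 0,  0, 1, 0},   // expert 0
        {0, 0, 1,  1, 1, 1},   // expert 1
    };
    std::vector<float> x = {1, 2, 3,  4, 5, 6,  7, 8, 9,  1, 1, 1};
    std::vector<int> route = {0, 1, 0, 1};
    std::vector<float> y(n_tok * n_out);
    moe_matmul(experts, x, route, y, n_tok, n_in, n_out);
    for (int t = 0; t < n_tok; ++t)
        std::printf("token %d -> [%g, %g]\n", t, y[t*n_out], y[t*n_out + 1]);
}
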
jart committed Jun 28, 2024
1 parent 8748d8a commit 2dd5d1f
Showing 5 changed files with 714 additions and 118 deletions.
6 changes: 3 additions & 3 deletions common/common.cpp
@@ -78,7 +78,7 @@ using json = nlohmann::ordered_json;
 //
 
 int32_t cpu_get_num_physical_cores() {
-#ifdef __linux__
+#if defined(__linux__) || defined(__COSMOPOLITAN__)
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@@ -113,7 +113,7 @@ int32_t cpu_get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
 #include <pthread.h>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
@@ -167,7 +167,7 @@ static int cpu_count_math_cpus(int n_cpu) {
  * Returns number of CPUs on system that are useful for math.
  */
 int32_t cpu_get_num_math() {
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
     int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
     if (n_cpu < 1) {
         return cpu_get_num_physical_cores();

15 changes: 15 additions & 0 deletions ggml/include/ggml.h
@@ -650,6 +650,21 @@ extern "C" {
         enum ggml_cgraph_eval_order order;
     };
 
+    struct ggml_compute_state_shared;
+
+    struct ggml_compute_params {
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+
+        struct ggml_compute_state_shared * shared;
+    };
+
+    void ggml_barrier(struct ggml_compute_state_shared * shared);
+
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
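
The declarations added above expose ggml's per-thread compute parameters (the
ith/nth pair plus the shared work buffer) and ggml_barrier to code outside
ggml.c, so an external kernel such as the new mixmul path can split work across
ggml's threads and synchronize between phases. Below is a rough, self-contained
analogue of that pattern using std::thread and std::barrier; it only mirrors
the idea and is not the ggml API itself.

// Sketch of static work splitting by (ith, nth) plus a barrier (C++20).
#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int nth = 4;             // number of threads (ggml's params->nth)
    const int n_rows = 10;
    std::vector<int> out(n_rows, 0);
    std::barrier<> sync(nth);      // stand-in for ggml_barrier(shared)

    auto worker = [&](int ith) {   // ith = thread index (ggml's params->ith)
        // phase 1: each thread fills a contiguous slice of the output
        const int r0 = n_rows *  ith      / nth;
        const int r1 = n_rows * (ith + 1) / nth;
        for (int r = r0; r < r1; ++r) out[r] = r * r;
        sync.arrive_and_wait();    // wait until every slice is written
        // phase 2: one thread consumes the shared result
        if (ith == 0) {
            long sum = 0;
            for (int v : out) sum += v;
            std::printf("sum of squares 0..9 = %ld\n", sum); // prints 285
        }
    };

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) threads.emplace_back(worker, ith);
    for (auto &t : threads) t.join();
    return 0;
}
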
29 changes: 15 additions & 14 deletions ggml/src/ggml.c
Expand Up @@ -1754,17 +1754,6 @@ struct ggml_compute_state {
struct ggml_compute_state_shared * shared;
};

struct ggml_compute_params {
// ith = thread index, nth = number of threads
int ith, nth;

// work buffer for all threads
size_t wsize;
void * wdata;

struct ggml_compute_state_shared * shared;
};

//
// fundamental operations
//
@@ -2857,15 +2846,15 @@ inline static void ggml_critical_section_start(void) {
 }
 
 #ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+void ggml_barrier(struct ggml_compute_state_shared * shared) {
     if (shared->n_threads == 1) {
         return;
     }
 
     #pragma omp barrier
 }
 #else
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+void ggml_barrier(struct ggml_compute_state_shared * shared) {
     if (shared->n_threads == 1) {
         return;
     }
@@ -12306,11 +12295,16 @@ static void ggml_compute_forward_mul_mat_id(
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * ids = dst->src[2];
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+#if GGML_USE_LLAMAFILE
+    if (llamafile_mixmul(params, src0, src1, ids, dst))
+        return;
+#endif
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -18536,6 +18530,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur = 0;
                     const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
+#if GGML_USE_LLAMAFILE
+                    const struct ggml_tensor * src2 = node->src[2];
+#endif
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
@@ -18544,6 +18541,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur += GGML_PAD(cur, sizeof(int64_t)); // align
                     cur += n_as * sizeof(int64_t); // matrix_row_counts
                     cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+#if GGML_USE_LLAMAFILE
+                    size_t cur2 = llamafile_mixmul_needs(src0, src1, src2);
+                    cur = cur > cur2 ? cur : cur2;
+#endif
                 } break;
             case GGML_OP_OUT_PROD:
                 {
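
The planning change above sizes the shared work buffer as the maximum of the
existing MUL_MAT_ID scratch estimate and whatever llamafile_mixmul_needs()
reports, so a single wdata allocation is large enough for whichever code path
runs later. A minimal sketch of that plan/execute contract follows; the op_desc
struct and both *_needs() formulas are made-up placeholders, not the real
sizing logic.

// Sketch of "plan with the max of both estimates, then allocate once".
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct op_desc { size_t rows, cols, type_size; };   // hypothetical op summary

// stand-in for the default scratch estimate computed during graph planning
static size_t default_needs(const op_desc &op) {
    return op.rows * sizeof(long) + op.rows * op.cols * op.type_size;
}

// stand-in for a fast path's own requirement (a llamafile_mixmul_needs() analogue)
static size_t fast_path_needs(const op_desc &op) {
    return 2 * op.rows * op.cols * op.type_size;
}

int main() {
    std::vector<op_desc> graph = {{64, 4096, 2}, {8, 14336, 2}};
    size_t wsize = 0;
    for (const auto &op : graph) {
        size_t cur  = default_needs(op);
        size_t cur2 = fast_path_needs(op);
        cur = cur > cur2 ? cur : cur2;       // same max as in the diff above
        wsize = std::max(wsize, cur);
    }
    // one work buffer is big enough for either code path on every node
    std::vector<unsigned char> wdata(wsize);
    std::printf("planned work buffer: %zu bytes\n", wdata.size());
    return 0;
}
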
