llamafile : improve moe prompt eval speed on cpu
This change introduces a llamafile_mixmul() API that allows tinyBLAS to
speed up "Mixture of Experts" models. On my Threadripper, Mixtral 8x7b
F16 weights now process prompts 2x faster. I am also seeing a 60 percent
improvement with Mixtral 8x22b Q4_0. Support is also provided for Q8_0,
since it too is handled by tinyBLAS. MoE models spend most of their time
in MUL_MAT_ID rather than MUL_MAT, which is why llamafile_sgemm() was not
able to help them before. The new code works by decomposing the mixmul
operation into fast 2D llamafile_sgemm() calls. This change also adds
BF16 support to tinyBLAS.
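
As a rough illustration of the decomposition described above, the following
self-contained C++ sketch groups tokens by the expert they were routed to and
runs one dense 2D matmul per expert, which is the shape of work a fast 2D GEMM
such as llamafile_sgemm() handles well. The helper names (gemm_nt, moe_matmul)
and the shapes are illustrative assumptions, not code from this commit.

// Minimal sketch of the per-expert decomposition (illustrative only).
#include <cstdio>
#include <vector>

// C[m x n] = A[m x k] * B^T, with B stored row-major as [n x k].
// Stand-in for a fast 2D GEMM kernel.
static void gemm_nt(const float *A, const float *B, float *C, int m, int n, int k) {
    for (int i = 0; i < m; ++i)
        for (int j = 0; j < n; ++j) {
            float acc = 0.0f;
            for (int p = 0; p < k; ++p) acc += A[i*k + p] * B[j*k + p];
            C[i*n + j] = acc;
        }
}

// experts[e]: weight matrix of expert e, shape [n_out x n_in], row-major.
// x: n_tok input rows of length n_in; route[t]: expert chosen for token t.
// y: n_tok output rows of length n_out.
static void moe_matmul(const std::vector<std::vector<float>> &experts,
                       const std::vector<float> &x, const std::vector<int> &route,
                       std::vector<float> &y, int n_tok, int n_in, int n_out) {
    for (int e = 0; e < (int) experts.size(); ++e) {
        // gather the tokens routed to expert e into a contiguous block
        std::vector<int> rows;
        for (int t = 0; t < n_tok; ++t)
            if (route[t] == e) rows.push_back(t);
        if (rows.empty()) continue;
        std::vector<float> xb(rows.size() * n_in), yb(rows.size() * n_out);
        for (size_t r = 0; r < rows.size(); ++r)
            for (int i = 0; i < n_in; ++i) xb[r*n_in + i] = x[rows[r]*n_in + i];
        // one fast 2D matmul covers every token assigned to this expert
        gemm_nt(xb.data(), experts[e].data(), yb.data(),
                (int) rows.size(), n_out, n_in);
        // scatter the block back to the per-token output rows
        for (size_t r = 0; r < rows.size(); ++r)
            for (int o = 0; o < n_out; ++o) y[rows[r]*n_out + o] = yb[r*n_out + o];
    }
}

int main() {
    const int n_tok = 4, n_in = 3, n_out = 2;
    std::vector<std::vector<float>> experts = {
        {1, 0, 0,  0, 1, 0},   // expert 0
        {0, 0, 1,  1, 1, 1},   // expert 1
    };
    std::vector<float> x = {1, 2, 3,  4, 5, 6,  7, 8, 9,  1, 1, 1};
    std::vector<int> route = {0, 1, 0, 1};
    std::vector<float> y(n_tok * n_out);
    moe_matmul(experts, x, route, y, n_tok, n_in, n_out);
    for (int t = 0; t < n_tok; ++t)
        std::printf("token %d -> [%g, %g]\n", t, y[t*n_out], y[t*n_out + 1]);
}
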
jart committed Jun 28, 2024
1 parent 8748d8a commit 2dd5d1f
Showing 5 changed files with 714 additions and 118 deletions.
6 changes: 3 additions & 3 deletions common/common.cpp
@@ -78,7 +78,7 @@ using json = nlohmann::ordered_json;
 //
 
 int32_t cpu_get_num_physical_cores() {
-#ifdef __linux__
+#if defined(__linux__) || defined(__COSMOPOLITAN__)
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
@@ -113,7 +113,7 @@ int32_t cpu_get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
 #include <pthread.h>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
@@ -167,7 +167,7 @@ static int cpu_count_math_cpus(int n_cpu) {
  * Returns number of CPUs on system that are useful for math.
  */
 int32_t cpu_get_num_math() {
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
     int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
     if (n_cpu < 1) {
         return cpu_get_num_physical_cores();

15 changes: 15 additions & 0 deletions ggml/include/ggml.h
@@ -650,6 +650,21 @@ extern "C" {
         enum ggml_cgraph_eval_order order;
     };
 
+    struct ggml_compute_state_shared;
+
+    struct ggml_compute_params {
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+
+        struct ggml_compute_state_shared * shared;
+    };
+
+    void ggml_barrier(struct ggml_compute_state_shared * shared);
+
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
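
The declarations added above expose ggml's per-thread compute parameters (the
ith/nth pair plus the shared work buffer) and ggml_barrier to code outside
ggml.c, so an external kernel such as the new mixmul path can split work across
ggml's threads and synchronize between phases. Below is a rough, self-contained
analogue of that pattern using std::thread and std::barrier; it only mirrors
the idea and is not the ggml API itself.

// Sketch of static work splitting by (ith, nth) plus a barrier (C++20).
#include <barrier>
#include <cstdio>
#include <thread>
#include <vector>

int main() {
    const int nth = 4;             // number of threads (ggml's params->nth)
    const int n_rows = 10;
    std::vector<int> out(n_rows, 0);
    std::barrier<> sync(nth);      // stand-in for ggml_barrier(shared)

    auto worker = [&](int ith) {   // ith = thread index (ggml's params->ith)
        // phase 1: each thread fills a contiguous slice of the output
        const int r0 = n_rows *  ith      / nth;
        const int r1 = n_rows * (ith + 1) / nth;
        for (int r = r0; r < r1; ++r) out[r] = r * r;
        sync.arrive_and_wait();    // wait until every slice is written
        // phase 2: one thread consumes the shared result
        if (ith == 0) {
            long sum = 0;
            for (int v : out) sum += v;
            std::printf("sum of squares 0..9 = %ld\n", sum); // prints 285
        }
    };

    std::vector<std::thread> threads;
    for (int ith = 0; ith < nth; ++ith) threads.emplace_back(worker, ith);
    for (auto &t : threads) t.join();
    return 0;
}
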
29 changes: 15 additions & 14 deletions ggml/src/ggml.c
Expand Up @@ -1754,17 +1754,6 @@ struct ggml_compute_state {
struct ggml_compute_state_shared * shared;
};

struct ggml_compute_params {
// ith = thread index, nth = number of threads
int ith, nth;

// work buffer for all threads
size_t wsize;
void * wdata;

struct ggml_compute_state_shared * shared;
};

//
// fundamental operations
//
@@ -2857,15 +2846,15 @@ inline static void ggml_critical_section_start(void) {
 }
 
 #ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+void ggml_barrier(struct ggml_compute_state_shared * shared) {
     if (shared->n_threads == 1) {
         return;
     }
 
     #pragma omp barrier
 }
 #else
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+void ggml_barrier(struct ggml_compute_state_shared * shared) {
     if (shared->n_threads == 1) {
         return;
     }
@@ -12306,11 +12295,16 @@ static void ggml_compute_forward_mul_mat_id(
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * ids = dst->src[2];
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+#if GGML_USE_LLAMAFILE
+    if (llamafile_mixmul(params, src0, src1, ids, dst))
+        return;
+#endif
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
@@ -18536,6 +18530,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur = 0;
                     const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
+#if GGML_USE_LLAMAFILE
+                    const struct ggml_tensor * src2 = node->src[2];
+#endif
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
@@ -18544,6 +18541,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threa
                     cur += GGML_PAD(cur, sizeof(int64_t)); // align
                     cur += n_as * sizeof(int64_t); // matrix_row_counts
                     cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+#if GGML_USE_LLAMAFILE
+                    size_t cur2 = llamafile_mixmul_needs(src0, src1, src2);
+                    cur = cur > cur2 ? cur : cur2;
+#endif
                 } break;
             case GGML_OP_OUT_PROD:
                 {
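
The planning change above sizes the shared work buffer as the maximum of the
existing MUL_MAT_ID scratch estimate and whatever llamafile_mixmul_needs()
reports, so a single wdata allocation is large enough for whichever code path
runs later. A minimal sketch of that plan/execute contract follows; the op_desc
struct and both *_needs() formulas are made-up placeholders, not the real
sizing logic.

// Sketch of "plan with the max of both estimates, then allocate once".
#include <algorithm>
#include <cstddef>
#include <cstdio>
#include <vector>

struct op_desc { size_t rows, cols, type_size; };   // hypothetical op summary

// stand-in for the default scratch estimate computed during graph planning
static size_t default_needs(const op_desc &op) {
    return op.rows * sizeof(long) + op.rows * op.cols * op.type_size;
}

// stand-in for a fast path's own requirement (a llamafile_mixmul_needs() analogue)
static size_t fast_path_needs(const op_desc &op) {
    return 2 * op.rows * op.cols * op.type_size;
}

int main() {
    std::vector<op_desc> graph = {{64, 4096, 2}, {8, 14336, 2}};
    size_t wsize = 0;
    for (const auto &op : graph) {
        size_t cur  = default_needs(op);
        size_t cur2 = fast_path_needs(op);
        cur = cur > cur2 ? cur : cur2;       // same max as in the diff above
        wsize = std::max(wsize, cur);
    }
    // one work buffer is big enough for either code path on every node
    std::vector<unsigned char> wdata(wsize);
    std::printf("planned work buffer: %zu bytes\n", wdata.size());
    return 0;
}
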
