llamafile : improve moe prompt eval speed on cpu #6840

Open · wants to merge 1 commit into master
6 changes: 3 additions & 3 deletions common/common.cpp
@@ -78,7 +78,7 @@ using json = nlohmann::ordered_json;
 //
 
 int32_t cpu_get_num_physical_cores() {
-#ifdef __linux__
+#if defined(__linux__) || defined(__COSMOPOLITAN__)
     // enumerate the set of thread siblings, num entries is num cores
     std::unordered_set<std::string> siblings;
     for (uint32_t cpu=0; cpu < UINT32_MAX; ++cpu) {
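
For context, a minimal sketch (plain C, not the PR's C++) of what this branch does: each physical core shows up once per unique thread_siblings mask in sysfs, so counting distinct masks counts cores. Cosmopolitan binaries see the same /sys interface when running on Linux, which is why the guard widens.

#include <stdio.h>
#include <string.h>

#define MAX_CORES 1024

int count_physical_cores(void) {
    static char seen[MAX_CORES][64]; /* sibling masks already counted */
    int n_seen = 0;
    for (unsigned cpu = 0; cpu < MAX_CORES; ++cpu) {
        char path[128];
        snprintf(path, sizeof(path),
                 "/sys/devices/system/cpu/cpu%u/topology/thread_siblings", cpu);
        FILE * f = fopen(path, "r");
        if (!f) {
            break; /* cpuN missing: no more CPUs to enumerate */
        }
        char mask[64] = {0};
        if (fgets(mask, sizeof(mask), f)) {
            int dup = 0;
            for (int i = 0; i < n_seen; ++i) {
                if (strcmp(seen[i], mask) == 0) { dup = 1; break; }
            }
            if (!dup && n_seen < MAX_CORES) {
                strcpy(seen[n_seen++], mask);
            }
        }
        fclose(f);
    }
    return n_seen; /* 0 means sysfs was unavailable */
}

int main(void) {
    printf("physical cores: %d\n", count_physical_cores());
    return 0;
}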
@@ -113,7 +113,7 @@ int32_t cpu_get_num_physical_cores() {
     return n_threads > 0 ? (n_threads <= 4 ? n_threads : n_threads / 2) : 4;
 }
 
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
 #include <pthread.h>
 
 static void cpuid(unsigned leaf, unsigned subleaf,
@@ -167,7 +167,7 @@ static int cpu_count_math_cpus(int n_cpu) {
  * Returns number of CPUs on system that are useful for math.
  */
 int32_t cpu_get_num_math() {
-#if defined(__x86_64__) && defined(__linux__) && !defined(__ANDROID__)
+#if defined(__x86_64__) && (defined(__linux__) || defined(__COSMOPOLITAN__)) && !defined(__ANDROID__)
     int n_cpu = sysconf(_SC_NPROCESSORS_ONLN);
     if (n_cpu < 1) {
         return cpu_get_num_physical_cores();
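
A hedged sketch of the probing that cpu_count_math_cpus() builds on: on hybrid x86 parts, CPUID leaf 0x1A reports the type of the core the calling thread runs on (0x40 = performance, 0x20 = efficiency). This uses GCC/Clang's <cpuid.h> rather than the PR's own cpuid helper, and omits the per-CPU affinity pinning the real code needs to probe every core in turn.

#include <cpuid.h>
#include <stdio.h>

int main(void) {
    unsigned eax = 0, ebx = 0, ecx = 0, edx = 0;
    if (!__get_cpuid_count(0x1a, 0, &eax, &ebx, &ecx, &edx) || eax == 0) {
        printf("hybrid core-type leaf not supported\n");
        return 0;
    }
    unsigned core_type = (eax >> 24) & 0xff; /* EAX[31:24] per Intel SDM */
    printf("this core: %s\n",
           core_type == 0x40 ? "performance (useful for math)"
         : core_type == 0x20 ? "efficiency"
         : "unknown");
    return 0;
}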
15 changes: 15 additions & 0 deletions ggml/include/ggml.h
@@ -650,6 +650,21 @@ extern "C" {
         enum ggml_cgraph_eval_order order;
     };
 
+    struct ggml_compute_state_shared;
+
+    struct ggml_compute_params {
+        // ith = thread index, nth = number of threads
+        int ith, nth;
+
+        // work buffer for all threads
+        size_t wsize;
+        void * wdata;
+
+        struct ggml_compute_state_shared * shared;
+    };
+
+    void ggml_barrier(struct ggml_compute_state_shared * shared);
+
     // scratch buffer
     struct ggml_scratch {
         size_t offs;
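
A sketch, assuming only the declarations above, of how an external kernel (llamafile's mixmul is the consumer this PR has in mind) can split work across ggml's thread pool now that ggml_compute_params and ggml_barrier() are visible outside ggml.c. my_rowwise_op is illustrative, not part of either project, and must be called from threads already running in ggml's pool.

#include "ggml.h"

static void my_rowwise_op(const struct ggml_compute_params * params,
                          float * dst, const float * src,
                          int nrows, int ncols) {
    const int ith = params->ith; /* this thread's index      */
    const int nth = params->nth; /* number of worker threads */

    /* phase 1: each thread handles rows ith, ith+nth, ith+2*nth, ... */
    for (int r = ith; r < nrows; r += nth) {
        for (int c = 0; c < ncols; ++c) {
            dst[r * ncols + c] = src[r * ncols + c] * 0.5f;
        }
    }

    /* every row must be written before any thread reads across rows */
    ggml_barrier(params->shared);

    /* phase 2: thread 0 reduces over the full matrix */
    if (ith == 0) {
        float sum = 0.0f;
        for (int i = 0; i < nrows * ncols; ++i) {
            sum += dst[i];
        }
        dst[0] = sum;
    }
}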
29 changes: 15 additions & 14 deletions ggml/src/ggml.c
@@ -1754,17 +1754,6 @@ struct ggml_compute_state {
     struct ggml_compute_state_shared * shared;
 };
 
-struct ggml_compute_params {
-    // ith = thread index, nth = number of threads
-    int ith, nth;
-
-    // work buffer for all threads
-    size_t wsize;
-    void * wdata;
-
-    struct ggml_compute_state_shared * shared;
-};
-
 //
 // fundamental operations
 //
@@ -2857,15 +2846,15 @@ inline static void ggml_critical_section_start(void) {
 }
 
 #ifdef GGML_USE_OPENMP
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+void ggml_barrier(struct ggml_compute_state_shared * shared) {
     if (shared->n_threads == 1) {
         return;
     }
 
 #pragma omp barrier
 }
 #else
-static void ggml_barrier(struct ggml_compute_state_shared * shared) {
+void ggml_barrier(struct ggml_compute_state_shared * shared) {
     if (shared->n_threads == 1) {
         return;
     }
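
The change above only drops the static qualifier so the symbol has external linkage. For readers unfamiliar with the non-OpenMP path, here is a hedged sketch (field and function names are illustrative, not ggml's) of the generation-counter pattern such a barrier uses: the last thread to arrive resets the arrival count and bumps a generation number that the waiting threads spin on.

#include <stdatomic.h>

struct barrier_state {
    int        n_threads;
    atomic_int n_arrived;   /* threads that reached the barrier      */
    atomic_int generation;  /* bumped once per completed barrier use */
};

static void barrier_wait(struct barrier_state * s) {
    if (s->n_threads == 1) {
        return;
    }
    int gen = atomic_load(&s->generation);
    if (atomic_fetch_add(&s->n_arrived, 1) == s->n_threads - 1) {
        /* last one in: reset for reuse, then release everyone */
        atomic_store(&s->n_arrived, 0);
        atomic_fetch_add(&s->generation, 1);
    } else {
        while (atomic_load(&s->generation) == gen) {
            /* spin until the generation advances */
        }
    }
}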
@@ -12306,11 +12295,16 @@ static void ggml_compute_forward_mul_mat_id(
     const struct ggml_tensor * src1 = dst->src[1];
     const struct ggml_tensor * ids = dst->src[2];
 
-    GGML_TENSOR_BINARY_OP_LOCALS
+#if GGML_USE_LLAMAFILE
+    if (llamafile_mixmul(params, src0, src1, ids, dst))
+        return;
+#endif
 
     const int ith = params->ith;
     const int nth = params->nth;
 
+    GGML_TENSOR_BINARY_OP_LOCALS
+
     const enum ggml_type type = src0->type;
 
     const bool src1_cont = ggml_is_contiguous(src1);
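
The shape of this change, reduced to a self-contained toy (names hypothetical): the specialized kernel's boolean return decides whether the generic path runs at all, so unsupported types and shapes fall through safely.

#include <stdbool.h>
#include <stddef.h>

static bool fast_scale(float * x, size_t n, float k) {
    if (n % 8 != 0) {
        return false; /* shape this kernel doesn't handle: decline */
    }
    for (size_t i = 0; i < n; ++i) {
        x[i] *= k;    /* stand-in for the optimized loop */
    }
    return true;      /* handled: caller must return immediately */
}

static void scale(float * x, size_t n, float k) {
    if (fast_scale(x, n, k)) {
        return;       /* fast path took it, as llamafile_mixmul does above */
    }
    for (size_t i = 0; i < n; ++i) {
        x[i] *= k;    /* generic fallback */
    }
}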
@@ -18536,6 +18530,9 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     cur = 0;
                     const struct ggml_tensor * src0 = node->src[0];
                     const struct ggml_tensor * src1 = node->src[1];
+#if GGML_USE_LLAMAFILE
+                    const struct ggml_tensor * src2 = node->src[2];
+#endif
                     const enum ggml_type vec_dot_type = type_traits[src0->type].vec_dot_type;
                     if (src1->type != vec_dot_type) {
                         cur += ggml_row_size(vec_dot_type, ggml_nelements(src1));
@@ -18544,6 +18541,10 @@ struct ggml_cplan ggml_graph_plan(const struct ggml_cgraph * cgraph, int n_threads) {
                     cur += GGML_PAD(cur, sizeof(int64_t)); // align
                     cur += n_as * sizeof(int64_t); // matrix_row_counts
                     cur += n_as * src1->ne[2] * sizeof(int64_t); // matrix_rows
+#if GGML_USE_LLAMAFILE
+                    size_t cur2 = llamafile_mixmul_needs(src0, src1, src2);
+                    cur = cur > cur2 ? cur : cur2;
+#endif
                 } break;
             case GGML_OP_OUT_PROD:
                 {
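
A toy restatement of the sizing rule added to ggml_graph_plan() above: the generic mul_mat_id path and llamafile's mixmul share one work buffer per node, so the plan must reserve the larger of the two requirements. The byte counts below are invented for illustration.

#include <stdio.h>

static size_t max_size(size_t a, size_t b) {
    return a > b ? a : b;
}

int main(void) {
    size_t cur  = 4096;  /* generic path: quantized src1 + row bookkeeping */
    size_t cur2 = 16384; /* pretend llamafile_mixmul_needs() asked for more */
    cur = max_size(cur, cur2);
    printf("work buffer: %zu bytes\n", cur); /* reserves 16384 */
    return 0;
}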