ggml : update mul_mat_id to use the same tensor for all the experts #6387

Merged · 36 commits · Apr 3, 2024

Changes from 1 commit

Commits (36):
0c7e21d  ggml : update mul_mat_id to use the same tensor for all the experts  (slaren, Mar 29, 2024)
9c9fe60  update cuda  (slaren, Mar 29, 2024)
2479900  minor  (slaren, Mar 29, 2024)
93db37e  update metal  (slaren, Mar 29, 2024)
325e5ef  update test-backend-ops  (slaren, Mar 29, 2024)
26c09ad  fix cuda  (slaren, Mar 29, 2024)
2abb6c7  Update ggml-metal.m  (slaren, Mar 30, 2024)
6203d72  update convert.py  (slaren, Mar 30, 2024)
4a5d50e  update convert-hf-to-gguf.py  (slaren, Mar 31, 2024)
3b3298a  update convert.py for mixtral hf models  (slaren, Mar 31, 2024)
8c2f7b8  Update convert-hf-to-gguf.py  (slaren, Mar 31, 2024)
4531b02  cuda : support non-pow-2 number of experts  (slaren, Apr 1, 2024)
6886fdb  allow quantize to work for split and merged experts models in the sam…  (slaren, Apr 1, 2024)
deea200  cleanup + disable mmap automatically with split tensors models  (slaren, Apr 1, 2024)
b4a6206  update imatrix  (slaren, Apr 2, 2024)
8f84ca3  test-backend-ops : test qwen argsort  (slaren, Apr 2, 2024)
5de4a5d  update grok model loading  (slaren, Apr 2, 2024)
6875369  llama : add merged experts tensors to the grok tensor map  (slaren, Apr 2, 2024)
6f33852  minor  (slaren, Apr 2, 2024)
68d21de  gguf : bump version  (slaren, Apr 2, 2024)
f27cbf3  fix quantizing of merged experts  (slaren, Apr 2, 2024)
d08a1f4  convert-hf-to-gguf.py : update grok (untested)  (slaren, Apr 2, 2024)
9530398  make linter happy  (slaren, Apr 2, 2024)
f421b32  cuda/argsort : use shared memory instead of pool memory  (slaren, Apr 2, 2024)
c704c77  convert : fix grok tensor names  (ggerganov, Apr 2, 2024)
fe62909  metal : add support for non-pow-2 argsort  (slaren, Apr 2, 2024)
31adc93  llama : more loader cleanup, better error checking  (slaren, Apr 2, 2024)
86f3666  cuda : fix warning  (slaren, Apr 2, 2024)
a1343ae  llama : still use mmap for loading old models, but copy the data to a…  (slaren, Apr 2, 2024)
19dafaf  add review note  (slaren, Apr 3, 2024)
3779b98  llama : remove ffn tensor counting + add sanity check  (ggerganov, Apr 3, 2024)
e810899  convert : fix handling of n_experts == None  (ggerganov, Apr 3, 2024)
fc719b6  imatrix : fix ncall counters  (ggerganov, Apr 3, 2024)
822caa4  llama : produce error if imatrix size does not match  (ggerganov, Apr 3, 2024)
a054283  quantize : terminate on errors + trace logs  (ggerganov, Apr 3, 2024)
716e960  metal : pad shared memory to 16 bytes  (ggerganov, Apr 3, 2024)
Commit 2479900a1ca1b1dff3b5d2b7ff2dcd1ef29b012d ("minor")
slaren committed Mar 29, 2024
ggml.c (2 changes: 1 addition & 1 deletion)

@@ -10989,7 +10989,7 @@ static void ggml_compute_forward_mul_mat_id(
 
     // row groups
     const int id = ggml_get_op_params_i32(dst, 0);
-    const int n_as = src0->ne[2]; //ggml_get_op_params_i32(dst, 1);
+    const int n_as = src0->ne[2];
 
     char * wdata_src1_end = (src1->type == vec_dot_type) ?
             (char *) params->wdata :
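
For context, a minimal standalone sketch of the idea behind this hunk (not code from the PR; the toy sizes and the ffn_up_exps name are assumptions for illustration): once all experts are stacked along dim 2 of a single 3D tensor, the expert count is implied by the shape of src0, so it no longer needs to be carried as an op param.

    #include "ggml.h"
    #include <stdio.h>

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // toy dimensions, chosen only for the example
        const int64_t n_embd   = 8;
        const int64_t n_ff     = 16;
        const int64_t n_expert = 4;

        // one tensor holding the weights of all experts: ne = [n_embd, n_ff, n_expert]
        struct ggml_tensor * ffn_up_exps =
            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd, n_ff, n_expert);

        // as in the hunk above: the expert count falls out of the shape
        const int n_as = (int) ffn_up_exps->ne[2];
        printf("n_as = %d\n", n_as); // prints: n_as = 4

        ggml_free(ctx);
        return 0;
    }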
llama.cpp (21 changes: 6 additions & 15 deletions)

@@ -1868,10 +1868,6 @@ struct llama_layer {
     struct ggml_tensor * ffn_down_exps;//[LLAMA_MAX_EXPERTS];
     struct ggml_tensor * ffn_up_exps ;//[LLAMA_MAX_EXPERTS];
 
-    struct ggml_tensor * ffn_gate_exp[LLAMA_MAX_EXPERTS];
-    struct ggml_tensor * ffn_down_exp[LLAMA_MAX_EXPERTS];
-    struct ggml_tensor * ffn_up_exp [LLAMA_MAX_EXPERTS];
-
     // ff bias
     struct ggml_tensor * ffn_down_b; // b2
     struct ggml_tensor * ffn_up_b; // b3
@@ -4477,21 +4473,16 @@ static bool llm_load_tensors(
 
             // MoE branch
             for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-                // hack
                 // individual tensors as views
-                layer.ffn_gate_exp[x] = ggml_view_2d(ctx_split, layer.ffn_gate_exps, n_embd, n_ff, layer.ffn_gate_exps->nb[1], layer.ffn_gate_exps->nb[2]*x);
-                layer.ffn_down_exp[x] = ggml_view_2d(ctx_split, layer.ffn_down_exps, n_ff, n_embd, layer.ffn_down_exps->nb[1], layer.ffn_down_exps->nb[2]*x);
-                layer.ffn_up_exp[x]   = ggml_view_2d(ctx_split, layer.ffn_up_exps,   n_embd, n_ff, layer.ffn_up_exps->nb[1],   layer.ffn_up_exps->nb[2]*x);
+                ggml_tensor * ffn_gate_exp = ggml_view_2d(ctx_split, layer.ffn_gate_exps, n_embd, n_ff, layer.ffn_gate_exps->nb[1], layer.ffn_gate_exps->nb[2]*x);
+                ggml_tensor * ffn_down_exp = ggml_view_2d(ctx_split, layer.ffn_down_exps, n_ff, n_embd, layer.ffn_down_exps->nb[1], layer.ffn_down_exps->nb[2]*x);
+                ggml_tensor * ffn_up_exp   = ggml_view_2d(ctx_split, layer.ffn_up_exps,   n_embd, n_ff, layer.ffn_up_exps->nb[1],   layer.ffn_up_exps->nb[2]*x);
 
-                ggml_set_name(layer.ffn_gate_exp[x], tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x).c_str());
-                ggml_set_name(layer.ffn_down_exp[x], tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x).c_str());
-                ggml_set_name(layer.ffn_up_exp[x],   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x).c_str());
+                ggml_set_name(ffn_gate_exp, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x).c_str());
+                ggml_set_name(ffn_down_exp, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x).c_str());
+                ggml_set_name(ffn_up_exp,   tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x).c_str());
 
                 ml.n_created += 3; // hack
-
-                //layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
-                //layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
-                //layer.ffn_up_exp[x]   = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP,   "weight", i, x), {n_embd, n_ff});
             }
         }
     }
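
A hypothetical, self-contained sketch of the view pattern used in the hunk above (the naming scheme in snprintf is an assumption for illustration, not the tn(...) helper from llama.cpp): each expert's 2D weight matrix is a zero-copy view into the merged 3D tensor, offset by x * nb[2] bytes along dim 2.

    #include "ggml.h"
    #include <stdio.h>

    // carve a named 2D view out of the merged tensor for every expert slice
    static void name_expert_views(struct ggml_context * ctx, struct ggml_tensor * exps) {
        for (int64_t x = 0; x < exps->ne[2]; ++x) {
            // one expert's matrix: shape [ne0, ne1], row stride nb[1] unchanged,
            // starting nb[2]*x bytes into the merged tensor
            struct ggml_tensor * exp_x = ggml_view_2d(ctx, exps,
                exps->ne[0], exps->ne[1], exps->nb[1], exps->nb[2]*x);

            char name[GGML_MAX_NAME];
            snprintf(name, sizeof(name), "ffn_up.%d.weight", (int) x); // assumed naming
            ggml_set_name(exp_x, name);
        }
    }

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        // merged tensor with toy sizes: 4 experts of shape [8, 16]
        struct ggml_tensor * exps = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, 8, 16, 4);
        name_expert_views(ctx, exps);

        ggml_free(ctx);
        return 0;
    }

Because the per-expert tensors are views rather than separate allocations, the loader can keep addressing experts by their individual names without duplicating weight data; the ml.n_created += 3 line (still marked "// hack" in the diff) only adjusts the loader's created-tensor bookkeeping to account for them.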