ggml : update mul_mat_id to use the same tensor for all the experts #6387

Merged: 36 commits, Apr 3, 2024

Changes from all commits
0c7e21d
ggml : update mul_mat_id to use the same tensor for all the experts
slaren Mar 29, 2024
9c9fe60
update cuda
slaren Mar 29, 2024
2479900
minor
slaren Mar 29, 2024
93db37e
update metal
slaren Mar 29, 2024
325e5ef
update test-backend-ops
slaren Mar 29, 2024
26c09ad
fix cuda
slaren Mar 29, 2024
2abb6c7
Update ggml-metal.m
slaren Mar 30, 2024
6203d72
update convert.py
slaren Mar 30, 2024
4a5d50e
update convert-hf-to-gguf.py
slaren Mar 31, 2024
3b3298a
update convert.py for mixtral hf models
slaren Mar 31, 2024
8c2f7b8
Update convert-hf-to-gguf.py
slaren Mar 31, 2024
4531b02
cuda : support non-pow-2 number of experts
slaren Apr 1, 2024
6886fdb
allow quantize to work for split and merged experts models in the sam…
slaren Apr 1, 2024
deea200
cleanup + disable mmap automatically with split tensors models
slaren Apr 1, 2024
b4a6206
update imatrix
slaren Apr 2, 2024
8f84ca3
test-backend-ops : test qwen argsort
slaren Apr 2, 2024
5de4a5d
update grok model loading
slaren Apr 2, 2024
6875369
llama : add merged experts tensors to the grok tensor map
slaren Apr 2, 2024
6f33852
minor
slaren Apr 2, 2024
68d21de
gguf : bump version
slaren Apr 2, 2024
f27cbf3
fix quantizing of merged experts
slaren Apr 2, 2024
d08a1f4
convert-hf-to-gguf.py : update grok (untested)
slaren Apr 2, 2024
9530398
make linter happy
slaren Apr 2, 2024
f421b32
cuda/argsort : use shared memory instead of pool memory
slaren Apr 2, 2024
c704c77
convert : fix grok tensor names
ggerganov Apr 2, 2024
fe62909
metal : add support for non-pow-2 argsort
slaren Apr 2, 2024
31adc93
llama : more loader cleanup, better error checking
slaren Apr 2, 2024
86f3666
cuda : fix warning
slaren Apr 2, 2024
a1343ae
llama : still use mmap for loading old models, but copy the data to a…
slaren Apr 2, 2024
19dafaf
add review note
slaren Apr 3, 2024
3779b98
llama : remove ffn tensor counting + add sanity check
ggerganov Apr 3, 2024
e810899
convert : fix handling of n_experts == None
ggerganov Apr 3, 2024
fc719b6
imatrix : fix ncall counters
ggerganov Apr 3, 2024
822caa4
llama : produce error if imatrix size does not match
ggerganov Apr 3, 2024
a054283
quantize : terminate on errors + trace logs
ggerganov Apr 3, 2024
716e960
metal : pad shared memory to 16 bytes
ggerganov Apr 3, 2024
136 changes: 135 additions & 1 deletion convert-hf-to-gguf.py
@@ -1216,6 +1216,8 @@ def write_tensors(self):
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_head = self.hparams.get("num_attention_heads")
n_kv_head = self.hparams.get("num_key_value_heads")
n_experts = self.hparams.get("num_local_experts")
experts = dict()
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
@@ -1236,6 +1238,49 @@

data = data.squeeze()

# process the experts separately
if name.find("block_sparse_moe.experts") != -1:
experts[name] = data
if len(experts) >= n_experts:
# merge the experts into a single 3d tensor
for bid in range(block_count):
for wid in range(1, 4):
full = True
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
if ename not in experts:
full = False
break
if not full:
continue

datas = []
for xid in range(n_experts):
ename = f"model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
datas.append(experts[ename])
del experts[ename]

data = np.stack(datas, axis=0)
data_dtype = data.dtype

if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)

if self.ftype == 1 and data_dtype == np.float32:
data = data.astype(np.float16)

merged_name = f"layers.{bid}.feed_forward.experts.w{wid}.weight"

new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()

print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")

self.gguf_writer.add_tensor(new_name, data)
continue

# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
@@ -1249,7 +1294,7 @@ def write_tensors(self):
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)

# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
# 1d tensors need to be converted to float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)

@@ -1261,6 +1306,9 @@

self.gguf_writer.add_tensor(new_name, data)

if len(experts) > 0:
raise ValueError(f"Unprocessed experts: {experts.keys()}")


@Model.register("GrokForCausalLM")
class GrokModel(Model):
@@ -1276,6 +1324,92 @@ def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_name("Grok")

def write_tensors(self):
block_count = self.hparams.get("n_layers", self.hparams.get("num_hidden_layers", self.hparams.get("n_layer")))
tensor_map = gguf.get_tensor_name_map(self.model_arch, block_count)
n_experts = self.hparams.get("num_local_experts")
experts = dict()
for name, data_torch in self.get_tensors():
# we don't need these
if name.endswith((".attention.masked_bias", ".attention.bias", ".attention.rotary_emb.inv_freq")):
continue

old_dtype = data_torch.dtype

# convert any unsupported data types to float32
if data_torch.dtype not in (torch.float16, torch.float32):
data_torch = data_torch.to(torch.float32)

data = data_torch.squeeze().numpy()

# process the experts separately
if name.find(".moe.") != -1:
experts[name] = data
if len(experts) >= n_experts:
# merge the experts into a single 3d tensor
for bid in range(block_count):
for wid in ["linear", "linear_1", "linear_v"]:
full = True
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
if ename not in experts:
full = False
break
if not full:
continue

datas = []
for xid in range(n_experts):
ename = f"transformer.decoder_layer.{bid}.moe.{xid}.{wid}.weight"
datas.append(experts[ename])
del experts[ename]

data = np.stack(datas, axis=0)
data_dtype = data.dtype

if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)

if self.ftype == 1 and data_dtype == np.float32:
data = data.astype(np.float16)

merged_name = f"transformer.decoder_layer.{bid}.moe.{wid}.weight"

new_name = tensor_map.get_name(merged_name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()

print(f"{new_name}, n_dims = {len(data.shape)}, shape = {data.shape} --> {data.dtype}")

self.gguf_writer.add_tensor(new_name, data)
continue

# map tensor names
new_name = tensor_map.get_name(name, try_suffixes=(".weight", ".bias"))
if new_name is None:
print(f"Can not map tensor {name!r}")
sys.exit()

n_dims = len(data.shape)
data_dtype = data.dtype

# if f32 desired, convert any float16 to float32
if self.ftype == 0 and data_dtype == np.float16:
data = data.astype(np.float32)

# TODO: Why cant we use these float16 as-is? There should be not reason to store float16 as float32
if self.ftype == 1 and data_dtype == np.float16 and n_dims == 1:
data = data.astype(np.float32)

# if f16 desired, convert any float32 2-dim weight tensors to float16
if self.ftype == 1 and data_dtype == np.float32 and name.endswith(".weight") and n_dims == 2:
data = data.astype(np.float16)

print(f"{new_name}, n_dims = {n_dims}, {old_dtype} --> {data.dtype}")

self.gguf_writer.add_tensor(new_name, data)


@Model.register("MiniCPMForCausalLM")
class MiniCPMModel(Model):
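
Both write_tensors overrides above follow the same merge pattern: buffer each per-expert 2-D weight as it streams in, and once all n_experts weights of one projection of one block are present, stack them into a 3-D tensor and emit it under the merged name. A condensed, standalone sketch of that pattern (the helper name and format strings are illustrative, not converter API):

from __future__ import annotations
import numpy as np

def merge_experts(expert_weights: dict, n_experts: int,
                  name_fmt: str, merged_fmt: str, bid: int, wid):
    """Return (merged_name, stacked_tensor) once all experts of one projection are seen."""
    names = [name_fmt.format(bid=bid, xid=xid, wid=wid) for xid in range(n_experts)]
    if not all(n in expert_weights for n in names):
        return None                                   # still waiting for some experts
    stacked = np.stack([expert_weights.pop(n) for n in names], axis=0)
    return merged_fmt.format(bid=bid, wid=wid), stacked

# e.g. for Mixtral-style HF checkpoints:
# name_fmt   = "model.layers.{bid}.block_sparse_moe.experts.{xid}.w{wid}.weight"
# merged_fmt = "layers.{bid}.feed_forward.experts.w{wid}.weight"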
25 changes: 25 additions & 0 deletions convert.py
@@ -828,6 +828,15 @@ def load() -> Tensor:
return LazyTensor(load, s, lazy_tensor.data_type, 'part ' + lazy_tensor.description)


def pack_experts_lazy(lazy_tensors: list[LazyTensor]) -> LazyTensor:
def load() -> Tensor:
tensors = [lazy_tensor.load() for lazy_tensor in lazy_tensors]
return UnquantizedTensor(np.array([tensor.ndarray for tensor in tensors]))
s = lazy_tensors[0].shape.copy()
s.insert(0, len(lazy_tensors))
return LazyTensor(load, s, lazy_tensors[0].data_type, 'pack_experts ' + ' | '.join(lt.description for lt in lazy_tensors))


# Functionality that simulates `torch.load` but where individual tensors are
# only loaded into memory on demand, not all at once.
# PyTorch can't do this natively as of time of writing:
@@ -1246,6 +1255,22 @@ def convert_model_names(model: LazyModel, params: Params, skip_unknown: bool) ->

tmp = model

# merge experts into one tensor
if params.n_experts and params.n_experts > 0:
for i_l in range(params.n_layer):
for w in range(1, 4):
experts = []
for e in range(params.n_experts):
if f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight" in model:
experts.append(model[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"])
del tmp[f"layers.{i_l}.feed_forward.experts.{e}.w{w}.weight"]
elif f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight" in model:
experts.append(model[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"])
del tmp[f"model.layers.{i_l}.block_sparse_moe.experts.{e}.w{w}.weight"]
else:
raise ValueError(f"Expert tensor not found: layers.{i_l}.feed_forward.experts.{e}.w{w}.weight")
tmp[f"layers.{i_l}.feed_forward.experts.w{w}.weight"] = pack_experts_lazy(experts)

# HF models permut or pack some of the tensors, so we need to undo that
for i in itertools.count():
if f"model.layers.{i}.self_attn.q_proj.weight" in model:
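
pack_experts_lazy keeps the conversion lazy: it records the per-expert loaders and only stacks them into the merged ndarray when the merged tensor is actually materialized for writing. A small illustrative sketch of that idea (toy Lazy class, not convert.py's LazyTensor/UnquantizedTensor types):

from dataclasses import dataclass
from typing import Callable, List
import numpy as np

@dataclass
class Lazy:
    load: Callable[[], np.ndarray]   # deferred loader, nothing is read until called
    shape: List[int]

def pack_lazy(parts: List[Lazy]) -> Lazy:
    # stacking is also deferred: the per-expert tensors are only materialized
    # when the merged tensor is actually written out
    return Lazy(load=lambda: np.stack([p.load() for p in parts], axis=0),
                shape=[len(parts), *parts[0].shape])

# usage: merged = pack_lazy([Lazy(lambda: np.zeros((4, 8)), [4, 8]) for _ in range(8)])
#        merged.load().shape  ->  (8, 4, 8)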
43 changes: 23 additions & 20 deletions examples/imatrix/imatrix.cpp
@@ -98,35 +98,38 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *

const float * data = is_host ? (const float *) src1->data : m_src1_data.data();

// this has been adapted to the new format of storing merged experts in a single 3d tensor
// ref: https://github.com/ggerganov/llama.cpp/pull/6387
if (t->op == GGML_OP_MUL_MAT_ID) {
const int idx = ((int32_t *) t->op_params)[0];
const int n_as = ((int32_t *) t->op_params)[1];
const ggml_tensor * ids = t->src[2];
const int n_as = src0->ne[2];

// the top-k selected expert ids are stored in the src0 tensor
// for simplicity, always copy src0 to host, because it is small
// take into account that src0 is not contiguous!
GGML_ASSERT(src0->ne[1] == src1->ne[1]);
GGML_ASSERT(n_as*ggml_nrows(src0)*sizeof(int) == GGML_PAD(ggml_nbytes(src0), n_as*sizeof(int)));
m_ids.resize(ggml_nbytes(src0)/sizeof(int));
ggml_backend_tensor_get(src0, m_ids.data(), 0, ggml_nbytes(src0));
// the top-k selected expert ids are stored in the ids tensor
// for simplicity, always copy ids to host, because it is small
// take into account that ids is not contiguous!
GGML_ASSERT(ids->ne[1] == src1->ne[1]);
GGML_ASSERT(n_as*ggml_nrows(ids)*sizeof(int) == GGML_PAD(ggml_nbytes(ids), n_as*sizeof(int)));
m_ids.resize(ggml_nbytes(ids)/sizeof(int));
ggml_backend_tensor_get(ids, m_ids.data(), 0, ggml_nbytes(ids));

auto & e = m_stats[wname];

++e.ncall;
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed by replacing the line above with:
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;

// loop over all possible experts, regardless if they are used or not in the batch
// this is necessary to guarantee equal number of "ncall" for each tensor
for (int ex = 0; ex < n_as; ++ex) {
src0 = t->src[2 + ex];
wname = filter_tensor_name(src0->name);
auto& e = m_stats[wname];
size_t e_start = ex*src1->ne[0];
if (e.values.empty()) {
e.values.resize(src1->ne[0], 0);
e.values.resize(src1->ne[0]*n_as, 0);
}
else if (e.values.size() != (size_t)src1->ne[0]) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]);
else if (e.values.size() != (size_t)src1->ne[0]*n_as) {
fprintf(stderr, "Oops: inconsistent size for %s (%d vs %d)\n", wname.c_str(), (int)e.values.size(), (int)src1->ne[0]*n_as);
exit(1); //GGML_ASSERT(false);
}
// NOTE: since we select top-k experts, the number of calls for the expert tensors will be k times larger
// using the following line, we can correct for that if needed
//if (idx == t->src[0]->ne[0] - 1) ++e.ncall;
++e.ncall;
if (m_params.verbosity > 1) {
printf("%s[%d]: %32s, %s, %5d x %5d, %d\n", __func__, m_last_call, wname.c_str(), ggml_op_name(t->op), (int)src1->ne[0], (int)src1->ne[1], (int)src1->type);
}
@@ -136,7 +139,7 @@ bool IMatrixCollector::collect_imatrix(struct ggml_tensor * t, bool ask, void *
if (excur != ex) continue;
const float * x = data + row * src1->ne[0];
for (int j = 0; j < (int)src1->ne[0]; ++j) {
e.values[j] += x[j]*x[j];
e.values[e_start + j] += x[j]*x[j];
}
}
if (e.ncall > m_last_call) {
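
With experts merged into one tensor there is now a single imatrix entry per projection, sized src1->ne[0] * n_as, and each expert owns the slice starting at e_start = ex * src1->ne[0]; a row's x[j]*x[j] contributions are accumulated into the slice of the expert that was routed to that row. A NumPy sketch of that accumulation (illustrative shapes, k = 1 routing assumed):

import numpy as np

n_as, n_embd, n_tokens = 8, 16, 5
values = np.zeros(n_embd * n_as, dtype=np.float32)    # one slice of n_embd floats per expert
ids = np.random.randint(0, n_as, size=n_tokens)       # selected expert per token row
x = np.random.rand(n_tokens, n_embd).astype(np.float32)

for ex in range(n_as):                                # loop over all experts, used or not
    e_start = ex * n_embd                             # offset of this expert's slice
    for row in range(n_tokens):
        if ids[row] != ex:
            continue
        values[e_start:e_start + n_embd] += x[row] * x[row]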
16 changes: 10 additions & 6 deletions examples/quantize/quantize.cpp
@@ -116,44 +116,48 @@ static void load_imatrix(const std::string & imatrix_file, std::unordered_map<st
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) {
printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
return;
exit(1);
}
int n_entries;
in.read((char *)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
return;
exit(1);
}
for (int i = 0; i < n_entries; ++i) {
int len; in.read((char *)&len, sizeof(len));
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
return;
exit(1);
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
auto & e = imatrix_data[std::move(name)];
auto & e = imatrix_data[name];
int ncall;
in.read((char *)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
printf("%s: failed reading number of values for entry %d\n", __func__, i);
imatrix_data = {};
return;
exit(1);
}
e.resize(nval);
in.read((char *)e.data(), nval*sizeof(float));
if (in.fail()) {
printf("%s: failed reading data for entry %d\n", __func__, i);
imatrix_data = {};
return;
exit(1);
}
if (ncall > 0) {
for (auto& v : e) v /= ncall;
}

if (getenv("LLAMA_TRACE")) {
printf("%s: loaded data (size = %6d, ncall = %6d) for '%s'\n", __func__, int(e.size()), ncall, name.c_str());
}
}
printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
}
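
For reference, the binary layout that load_imatrix() parses above is: an int32 entry count, then per entry an int32 name length, the name bytes, an int32 ncall, an int32 nval, and nval float32 values, which the loader divides by ncall. A Python sketch of a reader for that layout, derived from the reads shown (little-endian byte order is an assumption):

import struct

def read_imatrix(path):
    data = {}
    with open(path, "rb") as f:
        (n_entries,) = struct.unpack("<i", f.read(4))
        for _ in range(n_entries):
            (name_len,) = struct.unpack("<i", f.read(4))
            name = f.read(name_len).decode("utf-8")
            (ncall,) = struct.unpack("<i", f.read(4))
            (nval,) = struct.unpack("<i", f.read(4))
            vals = list(struct.unpack(f"<{nval}f", f.read(4 * nval)))
            if ncall > 0:
                vals = [v / ncall for v in vals]      # average over calls, as in the loader
            data[name] = vals
    return data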