Skip to content
This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Commit

Permalink
fix #369 - clean up tensor name interface
Browse files Browse the repository at this point in the history
  • Loading branch information
philpax committed Jul 16, 2023
1 parent 3062a08 commit dfede25
Show file tree
Hide file tree
Showing 5 changed files with 90 additions and 98 deletions.
8 changes: 1 addition & 7 deletions crates/ggml/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ pub const QNT_VERSION_FACTOR: u32 = sys::GGML_QNT_VERSION_FACTOR;
pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE;

/// The maximum length of a `ggml` tensor-name.
pub const MAX_NAME_LENGTH: u32 = sys::GGML_MAX_NAME;
pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize;

#[derive(Debug, Copy, Clone, PartialEq, Eq, Default)]
/// The type of a value in `ggml`.
Expand Down Expand Up @@ -479,9 +479,3 @@ pub fn cpu_has_blas() -> bool {
/// Returns `true` if GPU BLAS support is available, as reported by
/// `sys::ggml_cpu_has_gpublas`.
pub fn cpu_has_gpublas() -> bool {
    // SAFETY: zero-argument FFI query; it only reads build/runtime capability state.
    let has_gpublas = unsafe { sys::ggml_cpu_has_gpublas() };
    has_gpublas != 0
}

/// Sets the name of a tensor.
/// Sets the name of a tensor by delegating to `sys::ggml_set_name`.
///
/// # Panics
///
/// Panics if `name` contains an interior NUL byte, since such a string
/// cannot be represented as the C string `ggml` expects.
pub fn set_tensor_name(tensor: &Tensor, name: &str) {
    let c_name = std::ffi::CString::new(name)
        .expect("tensor name must not contain interior NUL bytes");
    // SAFETY: `c_name` is a valid NUL-terminated string that lives for the
    // duration of the call; the tensor pointer comes from a live `Tensor`.
    unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) };
}
25 changes: 10 additions & 15 deletions crates/ggml/src/tensor.rs
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
use std::{os::raw::c_void, ptr::NonNull, sync::Weak};

use crate::{accelerator::Backend, context::ContextInner, i64_to_usize, sys, Type};

const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize;
use crate::{
accelerator::Backend, context::ContextInner, i64_to_usize, sys, Type, MAX_NAME_LENGTH,
};

/// Tensors are owned by the context. A tensor is alive as long as the
/// underlying context it was created with is alive.
Expand All @@ -21,30 +21,25 @@ impl Tensor {
///
/// # Safety
///
/// The name must be a valid UTF-8 string and must not be longer than `MAX_NAME_LENGTH` characters.
/// The name must be a valid UTF-8 string and must not be longer than [`MAX_NAME_LENGTH`] bytes.
pub fn set_name(mut self, name: &str) -> Tensor {
assert!(
name.len() <= MAX_NAME_LENGTH,
"Name '{}' is too long, max length is {} characters",
name,
"Tensor name must be less than {} bytes",
MAX_NAME_LENGTH
);

let bytes = name.as_bytes();
let mut array = [0i8; MAX_NAME_LENGTH];
array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::<Vec<_>>());

self.with_alive_ctx_mut(|t| unsafe { t.ptr.as_mut().name = array });
let c_name = std::ffi::CString::new(name).unwrap();
self.with_alive_ctx_mut(|t| unsafe { sys::ggml_set_name(t.ptr.as_ptr(), c_name.as_ptr()) });
self
}

/// Gets the name of the tensor
pub fn name(&self) -> String {
self.with_alive_ctx(|| {
let name = unsafe { self.ptr.as_ref().name };
let mut name = name.iter().map(|&x| x as u8).collect::<Vec<_>>();
name.retain(|&x| x != 0);
String::from_utf8(name).unwrap()
let name_ptr = unsafe { sys::ggml_get_name(self.ptr.as_ptr()) };
let name = unsafe { std::ffi::CStr::from_ptr(name_ptr) };
name.to_string_lossy().into_owned()
})
}

Expand Down
5 changes: 3 additions & 2 deletions crates/llm-base/src/inference_session.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,8 +219,9 @@ impl InferenceSession {
// Build a graph
self.ctx0.recreate();
let ctx0 = &mut self.ctx0;
let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len());
ggml::set_tensor_name(&embd, "embd");
let mut embd = ctx0
.new_tensor_1d(ggml::Type::I32, input_tokens.len())
.set_name("embd");

let bc = BuildContext {
ctx0: RefCell::new(ctx0),
Expand Down
9 changes: 4 additions & 5 deletions crates/llm-base/src/loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ use crate::{
pub use ggml::{format::FormatMagic, ContainerType};
use ggml::{
format::{LoadError as FormatLoadError, PartialHyperparameters, TensorLoadInfo},
Context,
Context, MAX_NAME_LENGTH,
};
use memmap2::Mmap;
use thiserror::Error;
Expand Down Expand Up @@ -752,10 +752,9 @@ impl<'a> FileContext<'a> {
}
}

// The tensor name is truncated to its maximum length.
let max_name_length: usize = ggml::MAX_NAME_LENGTH.try_into().unwrap();
let tensor_name = if name.len() >= max_name_length {
&name[name.len() - max_name_length..]
// The tensor name is truncated to its maximum length.
let tensor_name = if name.len() >= MAX_NAME_LENGTH {
&name[name.len() - MAX_NAME_LENGTH..]
} else {
name
};
Expand Down
141 changes: 72 additions & 69 deletions crates/models/llama/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -167,30 +167,32 @@ impl KnownModel for Llama {

// self-attention
// compute Q and K and RoPE them
let q_current = ctx0.op_rope_inplace(
&ctx0.op_reshape_3d(
&ctx0.op_mul_mat(&self.layers[il].wq, &current),
n_embd / n_head,
n_head,
input_len,
),
session_len,
n_rot,
0,
);
ggml::set_tensor_name(&q_current, "Qcur");
let k_current = ctx0.op_rope_inplace(
&ctx0.op_reshape_3d(
&ctx0.op_mul_mat(&self.layers[il].wk, &current),
n_embd / n_head,
n_head,
input_len,
),
session_len,
n_rot,
0,
);
ggml::set_tensor_name(&k_current, "Kcur");
let q_current = ctx0
.op_rope_inplace(
&ctx0.op_reshape_3d(
&ctx0.op_mul_mat(&self.layers[il].wq, &current),
n_embd / n_head,
n_head,
input_len,
),
session_len,
n_rot,
0,
)
.set_name("Qcur");
let k_current = ctx0
.op_rope_inplace(
&ctx0.op_reshape_3d(
&ctx0.op_mul_mat(&self.layers[il].wk, &current),
n_embd / n_head,
n_head,
input_len,
),
session_len,
n_rot,
0,
)
.set_name("Kcur");

// store key and value to memory
// compute the transposed [N, n_embd] V matrix
Expand Down Expand Up @@ -218,67 +220,68 @@ impl KnownModel for Llama {
gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k));
gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v));

let q = ctx0.op_permute(&q_current, (0, 2, 1, 3));
ggml::set_tensor_name(&q, "Q");

let k = ctx0.op_permute(
&ctx0.op_reshape_3d(
&ctx0.op_view_1d(
builder.memory_k,
(session_len + input_len) * n_embd,
il * ctx_size * builder.memory_k.element_size() * n_embd,
let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)).set_name("Q");

let k = ctx0
.op_permute(
&ctx0.op_reshape_3d(
&ctx0.op_view_1d(
builder.memory_k,
(session_len + input_len) * n_embd,
il * ctx_size * builder.memory_k.element_size() * n_embd,
),
n_embd / n_head,
n_head,
session_len + input_len,
),
n_embd / n_head,
n_head,
session_len + input_len,
),
(0, 2, 1, 3),
);
ggml::set_tensor_name(&k, "K");
(0, 2, 1, 3),
)
.set_name("K");

// K * Q
let k_q = ctx0.op_mul_mat(&k, &q);
ggml::set_tensor_name(&k_q, "KQ");
let k_q = ctx0.op_mul_mat(&k, &q).set_name("KQ");

// KQ_scaled = KQ / sqrt(n_embd/n_head)
let kq_scale = ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt()));
ggml::set_tensor_name(&kq_scale, "1/sqrt(n_embd/n_head)");
let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale);
ggml::set_tensor_name(&k_q_scaled, "KQ_scaled");
let kq_scale = ctx0
.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt()))
.set_name("1/sqrt(n_embd/n_head)");
let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale).set_name("KQ_scaled");

// KQ_masked = mask_past(KQ_scaled)
let k_q_masked = ctx0.op_diag_mask_inf_inplace(&k_q_scaled, session_len);
ggml::set_tensor_name(&k_q_masked, "KQ_masked");
let k_q_masked = ctx0
.op_diag_mask_inf_inplace(&k_q_scaled, session_len)
.set_name("KQ_masked");

// KQ = soft_max(KQ_masked)
let k_q_soft_max = ctx0.op_soft_max_inplace(&k_q_masked);
ggml::set_tensor_name(&k_q_soft_max, "KQ_soft_max");
let k_q_soft_max = ctx0
.op_soft_max_inplace(&k_q_masked)
.set_name("KQ_soft_max");

// split cached V into n_head heads
let v = ctx0.op_view_3d(
builder.memory_v,
(session_len + input_len, n_embd / n_head, n_head),
(
ctx_size * builder.memory_v.element_size(),
ctx_size * builder.memory_v.element_size() * n_embd / n_head,
),
il * ctx_size * builder.memory_v.element_size() * n_embd,
);
ggml::set_tensor_name(&v, "V");
let v = ctx0
.op_view_3d(
builder.memory_v,
(session_len + input_len, n_embd / n_head, n_head),
(
ctx_size * builder.memory_v.element_size(),
ctx_size * builder.memory_v.element_size() * n_embd / n_head,
),
il * ctx_size * builder.memory_v.element_size() * n_embd,
)
.set_name("V");

let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max);
ggml::set_tensor_name(&k_q_v, "KQV");
let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max).set_name("KQV");

// KQV_merged = KQV.permute(0, 2, 1, 3)
let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3));
ggml::set_tensor_name(&k_q_v_merged, "KQV_merged");
let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)).set_name("KQV_merged");

// cur = KQV_merged.contiguous().view(n_embd, N)
current = ctx0.op_cpy(
&k_q_v_merged,
&ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len),
);
ggml::set_tensor_name(&current, "KQV_merged_contiguous");
current = ctx0
.op_cpy(
&k_q_v_merged,
&ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len),
)
.set_name("KQV_merged_contiguous");

// projection (no bias)
current = ctx0.op_mul_mat(&self.layers[il].wo, &current);
Expand Down

0 comments on commit dfede25

Please sign in to comment.