diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs index 277c02c0..597447f7 100644 --- a/crates/ggml/src/lib.rs +++ b/crates/ggml/src/lib.rs @@ -125,7 +125,7 @@ pub const QNT_VERSION_FACTOR: u32 = sys::GGML_QNT_VERSION_FACTOR; pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE; /// The maximum length of a `ggml` tensor-name. -pub const MAX_NAME_LENGTH: u32 = sys::GGML_MAX_NAME; +pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize; #[derive(Debug, Copy, Clone, PartialEq, Eq, Default)] /// The type of a value in `ggml`. @@ -479,9 +479,3 @@ pub fn cpu_has_blas() -> bool { pub fn cpu_has_gpublas() -> bool { unsafe { sys::ggml_cpu_has_gpublas() != 0 } } - -/// Sets the name of a tensor. -pub fn set_tensor_name(tensor: &Tensor, name: &str) { - let c_name = std::ffi::CString::new(name).unwrap(); - unsafe { sys::ggml_set_name(tensor.ptr.as_ptr(), c_name.as_ptr()) }; -} diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs index 34c2ab2d..09366e71 100644 --- a/crates/ggml/src/tensor.rs +++ b/crates/ggml/src/tensor.rs @@ -1,8 +1,8 @@ use std::{os::raw::c_void, ptr::NonNull, sync::Weak}; -use crate::{accelerator::Backend, context::ContextInner, i64_to_usize, sys, Type}; - -const MAX_NAME_LENGTH: usize = crate::MAX_NAME_LENGTH as usize; +use crate::{ + accelerator::Backend, context::ContextInner, i64_to_usize, sys, Type, MAX_NAME_LENGTH, +}; /// Tensors are owned by the context. A tensor is alive as long as the /// underlying context it was created with is alive. @@ -21,30 +21,25 @@ impl Tensor { /// /// # Safety /// - /// The name must be a valid UTF-8 string and must not be longer than `MAX_NAME_LENGTH` characters. + /// The name must be a valid UTF-8 string and must not be longer than [`MAX_NAME_LENGTH`] bytes. 
pub fn set_name(mut self, name: &str) -> Tensor { assert!( name.len() <= MAX_NAME_LENGTH, - "Name '{}' is too long, max length is {} characters", - name, + "Tensor name must be less than {} bytes", MAX_NAME_LENGTH ); - let bytes = name.as_bytes(); - let mut array = [0i8; MAX_NAME_LENGTH]; - array[..bytes.len()].copy_from_slice(&bytes.iter().map(|&x| x as i8).collect::<Vec<_>>()); - - self.with_alive_ctx_mut(|t| unsafe { t.ptr.as_mut().name = array }); + let c_name = std::ffi::CString::new(name).unwrap(); + self.with_alive_ctx_mut(|t| unsafe { sys::ggml_set_name(t.ptr.as_ptr(), c_name.as_ptr()) }); self } /// Gets the name of the tensor pub fn name(&self) -> String { self.with_alive_ctx(|| { - let name = unsafe { self.ptr.as_ref().name }; - let mut name = name.iter().map(|&x| x as u8).collect::<Vec<_>>(); - name.retain(|&x| x != 0); - String::from_utf8(name).unwrap() + let name_ptr = unsafe { sys::ggml_get_name(self.ptr.as_ptr()) }; + let name = unsafe { std::ffi::CStr::from_ptr(name_ptr) }; + name.to_string_lossy().into_owned() }) } diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs index 820c337f..42314f41 100644 --- a/crates/llm-base/src/inference_session.rs +++ b/crates/llm-base/src/inference_session.rs @@ -219,8 +219,9 @@ impl InferenceSession { // Build a graph self.ctx0.recreate(); let ctx0 = &mut self.ctx0; - let mut embd = ctx0.new_tensor_1d(ggml::Type::I32, input_tokens.len()); - ggml::set_tensor_name(&embd, "embd"); + let mut embd = ctx0 + .new_tensor_1d(ggml::Type::I32, input_tokens.len()) + .set_name("embd"); let bc = BuildContext { ctx0: RefCell::new(ctx0), diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs index 2ebb645d..c98ab048 100644 --- a/crates/llm-base/src/loader.rs +++ b/crates/llm-base/src/loader.rs @@ -14,7 +14,7 @@ use crate::{ pub use ggml::{format::FormatMagic, ContainerType}; use ggml::{ format::{LoadError as FormatLoadError, PartialHyperparameters, TensorLoadInfo}, - Context, + 
Context, MAX_NAME_LENGTH, }; use memmap2::Mmap; use thiserror::Error; @@ -752,10 +752,9 @@ impl<'a> FileContext<'a> { } } - // The tensor name is truncated to it's maximum length. - let max_name_length: usize = ggml::MAX_NAME_LENGTH.try_into().unwrap(); - let tensor_name = if name.len() >= max_name_length { - &name[name.len() - max_name_length..] + // The tensor name is truncated to its maximum length. + let tensor_name = if name.len() >= MAX_NAME_LENGTH { + &name[name.len() - MAX_NAME_LENGTH..] } else { name }; diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs index ac43a175..0b4d185b 100644 --- a/crates/models/llama/src/lib.rs +++ b/crates/models/llama/src/lib.rs @@ -167,30 +167,32 @@ impl KnownModel for Llama { // self-attention // compute Q and K and RoPE them - let q_current = ctx0.op_rope_inplace( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].wq, &current), - n_embd / n_head, - n_head, - input_len, - ), - session_len, - n_rot, - 0, - ); - ggml::set_tensor_name(&q_current, "Qcur"); - let k_current = ctx0.op_rope_inplace( - &ctx0.op_reshape_3d( - &ctx0.op_mul_mat(&self.layers[il].wk, &current), - n_embd / n_head, - n_head, - input_len, - ), - session_len, - n_rot, - 0, - ); - ggml::set_tensor_name(&k_current, "Kcur"); + let q_current = ctx0 + .op_rope_inplace( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wq, &current), + n_embd / n_head, + n_head, + input_len, + ), + session_len, + n_rot, + 0, + ) + .set_name("Qcur"); + let k_current = ctx0 + .op_rope_inplace( + &ctx0.op_reshape_3d( + &ctx0.op_mul_mat(&self.layers[il].wk, &current), + n_embd / n_head, + n_head, + input_len, + ), + session_len, + n_rot, + 0, + ) + .set_name("Kcur"); // store key and value to memory // compute the transposed [N, n_embd] V matrix @@ -218,67 +220,68 @@ impl KnownModel for Llama { gf.build_forward_expand(&ctx0.op_cpy(&k_current, &k)); gf.build_forward_expand(&ctx0.op_cpy(&v_current, &v)); - let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)); - 
ggml::set_tensor_name(&q, "Q"); - - let k = ctx0.op_permute( - &ctx0.op_reshape_3d( - &ctx0.op_view_1d( - builder.memory_k, - (session_len + input_len) * n_embd, - il * ctx_size * builder.memory_k.element_size() * n_embd, + let q = ctx0.op_permute(&q_current, (0, 2, 1, 3)).set_name("Q"); + + let k = ctx0 + .op_permute( + &ctx0.op_reshape_3d( + &ctx0.op_view_1d( + builder.memory_k, + (session_len + input_len) * n_embd, + il * ctx_size * builder.memory_k.element_size() * n_embd, + ), + n_embd / n_head, + n_head, + session_len + input_len, ), - n_embd / n_head, - n_head, - session_len + input_len, - ), - (0, 2, 1, 3), - ); - ggml::set_tensor_name(&k, "K"); + (0, 2, 1, 3), + ) + .set_name("K"); // K * Q - let k_q = ctx0.op_mul_mat(&k, &q); - ggml::set_tensor_name(&k_q, "KQ"); + let k_q = ctx0.op_mul_mat(&k, &q).set_name("KQ"); // KQ_scaled = KQ / sqrt(n_embd/n_head) - let kq_scale = ctx0.new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())); - ggml::set_tensor_name(&kq_scale, "1/sqrt(n_embd/n_head)"); - let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale); - ggml::set_tensor_name(&k_q_scaled, "KQ_scaled"); + let kq_scale = ctx0 + .new_f32(1.0 / ((n_embd as f32 / n_head as f32).sqrt())) + .set_name("1/sqrt(n_embd/n_head)"); + let k_q_scaled = ctx0.op_scale_inplace(&k_q, &kq_scale).set_name("KQ_scaled"); // KQ_masked = mask_past(KQ_scaled) - let k_q_masked = ctx0.op_diag_mask_inf_inplace(&k_q_scaled, session_len); - ggml::set_tensor_name(&k_q_masked, "KQ_masked"); + let k_q_masked = ctx0 + .op_diag_mask_inf_inplace(&k_q_scaled, session_len) + .set_name("KQ_masked"); // KQ = soft_max(KQ_masked) - let k_q_soft_max = ctx0.op_soft_max_inplace(&k_q_masked); - ggml::set_tensor_name(&k_q_soft_max, "KQ_soft_max"); + let k_q_soft_max = ctx0 + .op_soft_max_inplace(&k_q_masked) + .set_name("KQ_soft_max"); // split cached V into n_head heads - let v = ctx0.op_view_3d( - builder.memory_v, - (session_len + input_len, n_embd / n_head, n_head), - ( - ctx_size * 
builder.memory_v.element_size(), - ctx_size * builder.memory_v.element_size() * n_embd / n_head, - ), - il * ctx_size * builder.memory_v.element_size() * n_embd, - ); - ggml::set_tensor_name(&v, "V"); + let v = ctx0 + .op_view_3d( + builder.memory_v, + (session_len + input_len, n_embd / n_head, n_head), + ( + ctx_size * builder.memory_v.element_size(), + ctx_size * builder.memory_v.element_size() * n_embd / n_head, + ), + il * ctx_size * builder.memory_v.element_size() * n_embd, + ) + .set_name("V"); - let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max); - ggml::set_tensor_name(&k_q_v, "KQV"); + let k_q_v = ctx0.op_mul_mat(&v, &k_q_soft_max).set_name("KQV"); // KQV_merged = KQV.permute(0, 2, 1, 3) - let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)); - ggml::set_tensor_name(&k_q_v_merged, "KQV_merged"); + let k_q_v_merged = ctx0.op_permute(&k_q_v, (0, 2, 1, 3)).set_name("KQV_merged"); // cur = KQV_merged.contiguous().view(n_embd, N) - current = ctx0.op_cpy( - &k_q_v_merged, - &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), - ); - ggml::set_tensor_name(&current, "KQV_merged_contiguous"); + current = ctx0 + .op_cpy( + &k_q_v_merged, + &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, input_len), + ) + .set_name("KQV_merged_contiguous"); // projection (no bias) current = ctx0.op_mul_mat(&self.layers[il].wo, &current);