From d279371c19d8bdc387e78ec62b672409f5320fcc Mon Sep 17 00:00:00 2001
From: Juho Peltonen
Date: Fri, 7 Apr 2023 01:08:47 +0300
Subject: [PATCH 1/2] Reserve more eval memory and use ggml scratch buffers

---
 ggml/src/lib.rs     | 39 +++++++++++++++++++++++++++++++++++++++
 llama-rs/src/lib.rs | 32 +++++++++++++++++++++++++++++++-
 2 files changed, 70 insertions(+), 1 deletion(-)

diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs
index 22c9eee8..a54b72ca 100644
--- a/ggml/src/lib.rs
+++ b/ggml/src/lib.rs
@@ -303,6 +303,26 @@ impl Context {
     pub fn used_mem(&self) -> usize {
         unsafe { ggml_sys::ggml_used_mem(self.ptr.as_ptr()) }
     }
+
+    /// Set scratch buffer
+    pub fn use_scratch(&self, scratch_buffer: Option<&mut Buffer>) {
+        let (size, data) = if let Some(buffer) = scratch_buffer {
+            (buffer.data.len(), buffer.data.as_ptr() as *mut c_void)
+        } else {
+            (0, std::ptr::null_mut())
+        };
+        // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API
+        unsafe {
+            ggml_sys::ggml_set_scratch(
+                self.ptr.as_ptr(),
+                ggml_sys::ggml_scratch {
+                    offs: 0,
+                    size,
+                    data,
+                },
+            );
+        }
+    }
 }
 
 impl Drop for Context {
@@ -315,6 +335,25 @@
     }
 }
 
+/// Pre-allocated buffer
+pub struct Buffer {
+    data: Vec<u8>,
+}
+
+impl Buffer {
+    /// Creates new buffer
+    pub fn new(size: usize) -> Self {
+        let mut data: Vec<u8> = Vec::with_capacity(size);
+        // SAFETY: contents are left uninitialized. Don't use them.
+        #[allow(clippy::uninit_vec)]
+        unsafe {
+            data.set_len(size)
+        };
+
+        Buffer { data }
+    }
+}
+
 /// Tensors are owned by the context. A tensor is alive as long as the
 /// underlying context it was created with is alive.
 pub struct Tensor {
diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index d5ef2a23..8636a86e 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -27,6 +27,8 @@ mod util;
 /// The end of text token.
 pub const EOT_TOKEN_ID: TokenId = 2; // Hardcoded (for now?)
 
+const SCRATCH_SIZE: usize = 512 * 1024 * 1024; // 512MB
+
 /// The hyperparameters of the model.
 #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Deserialize)]
 pub struct Hyperparameters {
@@ -103,6 +105,9 @@ pub struct InferenceSession {
 
     /// The logits that were last predicted by the network. Zeroed out otherwise.
     last_logits: Vec<f32>,
+
+    /// Scratch buffers
+    scratch: [ggml::Buffer; 2],
 }
 impl InferenceSession {
     fn repetition_penalty_tokens(&self) -> &[TokenId] {
@@ -128,10 +133,18 @@ impl Clone for InferenceSession {
             mem_per_token: self.mem_per_token,
             tokens: self.tokens.clone(),
             last_logits: self.last_logits.clone(),
+            scratch: inference_session_scratch_buffers(),
         }
     }
 }
 
+fn inference_session_scratch_buffers() -> [ggml::Buffer; 2] {
+    [
+        ggml::Buffer::new(SCRATCH_SIZE),
+        ggml::Buffer::new(SCRATCH_SIZE),
+    ]
+}
+
 #[derive(serde::Serialize, Clone, PartialEq)]
 /// A serializable snapshot of the inference process. Can be saved to disk.
 // Keep in sync with [InferenceSession] and [InferenceSnapshot]
@@ -1116,6 +1129,7 @@ impl Model {
             mem_per_token: 0,
             tokens: vec![],
             last_logits: vec![0.0; n_vocab],
+            scratch: inference_session_scratch_buffers(),
         }
     }
 
@@ -1150,7 +1164,15 @@ impl Model {
         // For the first run, we need to guess a maximum buffer size so we can measure
         // the actual memory consumption of the temporary ggml context.
-        let mut buf_size = 1024 * 1024 * 1024;
+        let mut buf_size = 1024
+            * 1024
+            * if n_layer >= 80 {
+                1536
+            } else if n_layer >= 60 {
+                1280
+            } else {
+                1024
+            };
         if session.mem_per_token > 0 && session.mem_per_token * n > buf_size {
             // add 10% to account for ggml object overhead
             buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize;
         }
@@ -1189,6 +1211,8 @@ impl Model {
             let input_self_attention = input_layer.share();
             let mut current: ggml::Tensor;
 
+            ctx0.use_scratch(Some(&mut session.scratch[0]));
+
             // norm
             {
                 current = ctx0.op_rms_norm(&input_layer);
@@ -1312,6 +1336,8 @@ impl Model {
                 current = ctx0.op_mul_mat(&self.layers[il].wo, &current);
             }
 
+            ctx0.use_scratch(Some(&mut session.scratch[1]));
+
             let input_feed_forward = ctx0.op_add(&current, &input_self_attention);
 
             // feed-forward network
@@ -1345,6 +1371,8 @@ impl Model {
             input_layer = current;
         }
 
+        ctx0.use_scratch(Some(&mut session.scratch[0]));
+
         // Used at the end to optionally extract the embeddings.
         let embeddings_tensor;
 
@@ -1362,6 +1390,8 @@ impl Model {
             input_layer = ctx0.op_mul_mat(&self.output, &input_layer);
         }
 
+        ctx0.use_scratch(None);
+
         // logits -> probs
         // inpL = ctx0.op_soft_max(&inpL);
 

From c48ab9f85de6ed9e38fba221d50ed797a8c913bb Mon Sep 17 00:00:00 2001
From: Philpax
Date: Thu, 13 Apr 2023 02:37:52 +0200
Subject: [PATCH 2/2] refactor: improve docs + minor safety stuff

---
 ggml/src/lib.rs     | 26 +++++++++++++++++---------
 llama-rs/src/lib.rs | 20 +++++++++++++++-----
 2 files changed, 32 insertions(+), 14 deletions(-)

diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs
index 94f44131..cbfd37db 100644
--- a/ggml/src/lib.rs
+++ b/ggml/src/lib.rs
@@ -379,8 +379,10 @@ impl Context {
         unsafe { ggml_sys::ggml_used_mem(self.ptr.as_ptr()) }
     }
 
-    /// Set scratch buffer
-    pub fn use_scratch(&self, scratch_buffer: Option<&mut Buffer>) {
+    /// Sets the scratch buffer to be used by this [Context].
+    ///
+    /// If `scratch_buffer` is `None`, the scratch buffer will be disabled.
+    pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a mut Buffer>) {
         let (size, data) = if let Some(buffer) = scratch_buffer {
             (buffer.data.len(), buffer.data.as_ptr() as *mut c_void)
         } else {
@@ -410,22 +412,28 @@ impl Drop for Context {
     }
 }
 
-/// Pre-allocated buffer
+/// A buffer of memory that can be used as a scratch buffer for a [Context].
+///
+/// See [Context::use_scratch].
 pub struct Buffer {
-    data: Vec<u8>,
+    data: Box<[u8]>,
 }
 
 impl Buffer {
-    /// Creates new buffer
+    /// Creates a new buffer of the specified size.
     pub fn new(size: usize) -> Self {
         let mut data: Vec<u8> = Vec::with_capacity(size);
-        // SAFETY: contents are left uninitialized. Don't use them.
+
+        // SAFETY: The contents are intentionally uninitialized, as they will be passed to
+        // the ggml C API which will fill them with data.
         #[allow(clippy::uninit_vec)]
         unsafe {
-            data.set_len(size)
-        };
+            data.set_len(size);
+        }
 
-        Buffer { data }
+        Buffer {
+            data: data.into_boxed_slice(),
+        }
     }
 }
 
diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index aa04cc7d..1a1159ec 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -27,7 +27,11 @@ mod util;
 /// The end of text token.
 pub const EOT_TOKEN_ID: TokenId = 2; // Hardcoded (for now?)
 
-const SCRATCH_SIZE: usize = 512 * 1024 * 1024; // 512MB
+// The size of a scratch buffer used for inference. This is used for temporary
+// storage of intermediate results during inference.
+//
+// The specific value was copied from `llama.cpp`.
+const SCRATCH_SIZE: usize = 512 * 1024 * 1024;
 
 /// The hyperparameters of the model.
 #[derive(Debug, Default, PartialEq, Eq, PartialOrd, Ord, Deserialize)]
@@ -106,7 +110,10 @@ pub struct InferenceSession {
     /// The logits that were last predicted by the network. Zeroed out otherwise.
     last_logits: Vec<f32>,
 
-    /// Scratch buffers
+    /// Scratch buffers used during inference.
+    ///
+    /// The number of scratch buffers was copied from `llama.cpp`.
+    /// There is no specific reason for this number, but one is insufficient.
     scratch: [ggml::Buffer; 2],
 }
 impl InferenceSession {
@@ -1171,15 +1178,18 @@ impl Model {
         // For the first run, we need to guess a maximum buffer size so we can measure
         // the actual memory consumption of the temporary ggml context.
-        let mut buf_size = 1024
-            * 1024
-            * if n_layer >= 80 {
+        //
+        // These numbers are from `llama.cpp`, and could potentially be more efficient.
+        let mut buf_size = {
+            let buf_size_mb = if n_layer >= 80 {
                 1536
             } else if n_layer >= 60 {
                 1280
             } else {
                 1024
             };
+            buf_size_mb * 1024 * 1024
+        };
         if session.mem_per_token > 0 && session.mem_per_token * n > buf_size {
             // add 10% to account for ggml object overhead
             buf_size = (1.1f64 * session.mem_per_token as f64 * n as f64) as usize;
         }
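
Note (not part of the patches): the overall call pattern that the hunks above add to `Model::evaluate` is sketched here for reviewers. This is an illustrative outline only, not compilable on its own; `ctx0`, `session`, and `n_layer` are the locals of `evaluate`, and the elided comments stand in for the existing graph-building code.

// Per layer, the self-attention tensors are allocated out of the first
// scratch buffer and the feed-forward network out of the second.
for il in 0..n_layer {
    ctx0.use_scratch(Some(&mut session.scratch[0]));
    // ... norm + self-attention ops for layer `il` ...

    ctx0.use_scratch(Some(&mut session.scratch[1]));
    // ... feed-forward ops for layer `il` ...
}

// The final norm and the output projection reuse the first scratch buffer.
ctx0.use_scratch(Some(&mut session.scratch[0]));
// ... op_rms_norm / op_mul / op_mul_mat(&self.output, ...) ...

// The scratch buffer is disabled again before the logits are read out, so the
// remaining tensors are allocated from the context's own memory.
ctx0.use_scratch(None);

Alternating between two buffers (rather than reusing one) presumably keeps the outputs of one stage valid while the next stage's temporaries overwrite the other buffer, which is what allows the per-eval context to stay small.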