diff --git a/binaries/generate-ggml-bindings/src/main.rs b/binaries/generate-ggml-bindings/src/main.rs
index 39acbb86..ad73ddba 100644
--- a/binaries/generate-ggml-bindings/src/main.rs
+++ b/binaries/generate-ggml-bindings/src/main.rs
@@ -27,6 +27,8 @@ fn generate_main(ggml_path: &Path, src_path: &Path) {
         .allowlist_file(r".*ggml.h")
         .header(ggml_path.join("k_quants.h").to_string_lossy())
         .allowlist_file(r".*k_quants.h")
+        .header(ggml_path.join("ggml-alloc.h").to_string_lossy())
+        .allowlist_file(r".*ggml-alloc.h")
         // Suppress some warnings
         .raw_line("#![allow(non_upper_case_globals)]")
         .raw_line("#![allow(non_camel_case_types)]")
@@ -88,6 +90,9 @@ fn generate_metal(ggml_path: &Path, src_path: &Path) {
     generate_extra("metal", ggml_path, src_path, |b| {
         b.header(ggml_path.join("ggml-metal.h").to_string_lossy())
             .allowlist_file(r".*ggml-metal\.h")
+            .raw_line("use super::ggml_tensor;")
+            .raw_line("use super::ggml_log_callback;")
+            .raw_line("use super::ggml_cgraph;")
     });
 }
 
diff --git a/binaries/llm-test/configs/mpt.json b/binaries/llm-test/configs/mpt.json
index 57a8bc89..c5d9d8d0 100644
--- a/binaries/llm-test/configs/mpt.json
+++ b/binaries/llm-test/configs/mpt.json
@@ -6,7 +6,7 @@
         {
             "Inference": {
                 "input": "When a llama rides a crab, ",
-                "output": "When a llama rides a crab,  the llama is called the \"crab rider\".\nThe crabs are very popular in South America, especially Brazil. They have been used as transportation for many years and they can carry up to five people at once!",
+                "output": "When a llama rides a crab,  the llama is called the \"crab rider\"\nThe Llamas are an animal that can be found in The Maze. They have no special abilities, but they do drop Llamaskin and occasionally some other items when killed by players or monsters alike (see below). It's unknown if there was ever any sort of breeding system for these animals as it seems to only exist on this one world so far; however their existence has been confirmed through player reports from multiple worlds where people claim having seen them before being able see anything else about what happened after seeing just 1-2 at most per game session which makes me believe",
                 "maximum_token_count": 128
             }
         },
diff --git a/crates/ggml/src/accelerator/metal.rs b/crates/ggml/src/accelerator/metal.rs
index 8fced466..a15e39f1 100644
--- a/crates/ggml/src/accelerator/metal.rs
+++ b/crates/ggml/src/accelerator/metal.rs
@@ -14,8 +14,8 @@ pub struct MetalContext {
 
 impl MetalContext {
     /// Create a new Metal context
-    pub fn new(n_threads: usize) -> Self {
-        let raw = unsafe { metal::ggml_metal_init(n_threads.try_into().unwrap()) };
+    pub fn new() -> Self {
+        let raw = unsafe { metal::ggml_metal_init(1) };
 
         MetalContext {
             contexts: vec![],
@@ -83,19 +83,14 @@ impl MetalContext {
         unsafe {
             metal::ggml_metal_graph_compute(
                 self.ptr.as_ptr(),
-                graph.inner as *mut ggml_sys::ggml_cgraph as *mut metal::ggml_cgraph,
+                graph.inner as *mut ggml_sys::ggml_cgraph,
             );
         }
     }
 
     /// Reads a tensor from Metal
     pub fn get_tensor(&self, tensor: &Tensor) {
-        unsafe {
-            metal::ggml_metal_get_tensor(
-                self.ptr.as_ptr(),
-                tensor.ptr.as_ptr() as *mut metal::ggml_tensor,
-            )
-        }
+        unsafe { metal::ggml_metal_get_tensor(self.ptr.as_ptr(), tensor.ptr.as_ptr()) }
     }
 }
 
diff --git a/crates/ggml/src/accelerator/mod.rs b/crates/ggml/src/accelerator/mod.rs
index 2e1cef17..731de9bc 100644
--- a/crates/ggml/src/accelerator/mod.rs
+++ b/crates/ggml/src/accelerator/mod.rs
@@ -71,6 +71,7 @@ pub fn initialize(device: i32) {
         //TODO: Make this configurable
         sys::cuda::ggml_init_cublas();
         sys::cuda::ggml_cuda_set_main_device(device);
+        sys::cuda::ggml_cuda_set_mul_mat_q(true);
         let split = 1.0f32;
         sys::cuda::ggml_cuda_set_tensor_split(&split as *const f32);
     }
diff --git a/crates/ggml/src/context.rs b/crates/ggml/src/context.rs
index 4c97086d..96f81b4f 100644
--- a/crates/ggml/src/context.rs
+++ b/crates/ggml/src/context.rs
@@ -21,7 +21,7 @@ pub struct Context {
     /// allocated tensors. Tensors are owned by the object, so a [`Tensor`]
     /// contains a `Weak` reference underneath and doesn't let you do anything
     /// with it if the underlying context has been deallocated.
-    inner: Arc<ContextInner>,
+    pub inner: Arc<ContextInner>,
 
     /// The storage for this context. This is stored so that the buffer can be dropped when the context is dropped.
     storage: Option<ContextStorage>,
@@ -31,7 +31,7 @@ pub struct Context {
 }
 
 /// Contains state shared between a context and its tensors
-pub(crate) struct ContextInner {
+pub struct ContextInner {
     pub ptr: NonNull<sys::ggml_context>,
 
     /// Offloaded tensors. Used to free them when the context is dropped.
@@ -73,7 +73,12 @@ impl ContextInner {
 /// Controls how the context uses memory.
 pub enum ContextStorage {
     /// Use the provided buffer as memory.
-    Buffer(Buffer),
+    Buffer {
+        /// The buffer to use as memory.
+        buffer: Buffer,
+        /// Whether to allocate tensors into this buffer.
+        allocate: bool,
+    },
     /// Use the provided memory mapped file as memory.
     Mmap(Mmap),
     /// Allocate `mem_size` bytes of memory.
@@ -94,7 +99,10 @@ impl ContextStorage {
     /// Returns the `Buffer` if this is a `Buffer` variant.
     pub fn as_buffer(&self) -> Option<&Buffer> {
         match self {
-            Self::Buffer(v) => Some(v),
+            Self::Buffer {
+                buffer: v,
+                allocate: _,
+            } => Some(v),
             _ => None,
         }
     }
@@ -115,7 +123,16 @@ impl PartialEq for ContextStorage {
     fn eq(&self, other: &Self) -> bool {
         use ContextStorage::*;
         match (self, other) {
-            (Buffer(l0), Buffer(r0)) => l0 == r0,
+            (
+                Buffer {
+                    buffer: l0,
+                    allocate: l1,
+                },
+                Buffer {
+                    buffer: r0,
+                    allocate: r1,
+                },
+            ) => l0 == r0 && l1 == r1,
             (Mmap(l0), Mmap(r0)) => l0.as_ptr() == r0.as_ptr(),
             (Allocate { mem_size: l }, Allocate { mem_size: r }) => l == r,
             _ => false,
@@ -130,10 +147,10 @@ impl Context {
     /// Creates a new [Context] with the given storage.
     pub fn new(storage: ContextStorage) -> Self {
         let init_params = match &storage {
-            ContextStorage::Buffer(buffer) => sys::ggml_init_params {
+            ContextStorage::Buffer { buffer, allocate } => sys::ggml_init_params {
                 mem_size: buffer.size(),
                 mem_buffer: buffer.data,
-                no_alloc: false,
+                no_alloc: !allocate,
             },
             ContextStorage::Mmap(mmap) => sys::ggml_init_params {
                 mem_size: mmap.len(),
@@ -160,8 +177,8 @@ impl Context {
 
     /// Creates a new [Context] with the specified buffer.
     /// The buffer will be used by GGML.
-    pub fn new_with_buffer(buffer: Buffer) -> Self {
-        Self::new(ContextStorage::Buffer(buffer))
+    pub fn new_with_buffer(buffer: Buffer, allocate: bool) -> Self {
+        Self::new(ContextStorage::Buffer { buffer, allocate })
     }
 
     /// Creates a new [Context] with the specified memory mapped file.
@@ -206,28 +223,6 @@ impl Context {
         unsafe { sys::ggml_used_mem(self.as_ptr()) }
     }
 
-    /// Sets the scratch buffer to be used by this [Context].
-    ///
-    /// If `scratch_buffer` is `None`, the scratch buffer will be disabled.
-    pub fn use_scratch<'a>(&'a self, scratch_buffer: Option<&'a Buffer>) {
-        let (size, data) = if let Some(buffer) = scratch_buffer {
-            (buffer.size(), buffer.data)
-        } else {
-            (0, std::ptr::null_mut())
-        };
-        // SAFETY: this just passes (most likely uninitialized) memory buffer to the ggml C API
-        unsafe {
-            sys::ggml_set_scratch(
-                self.as_ptr(),
-                sys::ggml_scratch {
-                    offs: 0,
-                    size,
-                    data,
-                },
-            );
-        }
-    }
-
     /// Creates a new 1D tensor.
     pub fn new_tensor_1d(&self, typ: Type, ne0: usize) -> Tensor {
         let raw = unsafe { sys::ggml_new_tensor_1d(self.as_ptr(), typ.into(), usize_to_i64(ne0)) };
@@ -294,7 +289,7 @@ impl Context {
 
     /// Creates a new tensor with the values of `a`, but normalized.
     pub fn op_norm(&self, a: &Tensor) -> Tensor {
-        let tensor = unsafe { sys::ggml_norm(self.as_ptr(), a.ptr.as_ptr()) };
+        let tensor = unsafe { sys::ggml_norm(self.as_ptr(), a.ptr.as_ptr(), crate::DEFAULT_EPS) };
         self.new_tensor_raw(tensor)
     }
 
diff --git a/crates/ggml/src/lib.rs b/crates/ggml/src/lib.rs
index 8b6910eb..26bcc548 100644
--- a/crates/ggml/src/lib.rs
+++ b/crates/ggml/src/lib.rs
@@ -10,6 +10,8 @@
 use std::{
     alloc::Layout,
     os::raw::{c_int, c_void},
+    ptr::NonNull,
+    sync::Arc,
 };
 
 mod context;
@@ -129,7 +131,13 @@ pub const OBJECT_SIZE: usize = sys::GGML_OBJECT_SIZE;
 pub const MAX_NAME_LENGTH: usize = sys::GGML_MAX_NAME as usize;
 
 /// Default epsilon to use for RMS computation.
-pub const DEFAULT_EPS: f32 = sys::llama::LLAMA_DEFAULT_RMS_EPS as f32;
+pub const DEFAULT_EPS: f32 = 0.000005;
+
+/// Maximum number of nodes in a `ggml` graph.
+pub const MAX_NODES: usize = sys::GGML_MAX_NODES as usize;
+
+/// Alignment used for the Tensors in a `ggml` graph.
+pub const TENSOR_ALIGNMENT: usize = 32;
 
 /// Value overrides to use for RoPE.
 ///
@@ -280,10 +288,8 @@ impl Type {
     }
 }
 
-/// A buffer of memory that can be used as a scratch buffer for a [Context].
-///
-/// See [Context::use_scratch].
-#[derive(PartialEq, Eq)]
+/// A buffer of memory that can be used as a buffer for a [Context] or [GraphAllocator].
+#[derive(PartialEq, Eq, Debug)]
 pub struct Buffer {
     data: *mut c_void,
     layout: Layout,
@@ -304,10 +310,27 @@ impl Buffer {
         }
     }
 
+    /// Creates a new buffer of the specified size, without aligning it.
+    pub fn new_unaligned(size: usize) -> Self {
+        let layout = Layout::from_size_align(size, 1).unwrap();
+
+        unsafe {
+            Buffer {
+                data: std::alloc::alloc(layout).cast(),
+                layout,
+            }
+        }
+    }
+
     /// Returns the size of the buffer in bytes
     pub fn size(&self) -> usize {
         self.layout.size()
     }
+
+    /// Returns a pointer to the data in this buffer.
+    pub fn data(&mut self) -> *mut c_void {
+        self.data
+    }
 }
 
 impl Drop for Buffer {
@@ -333,6 +356,37 @@ impl ComputationGraph {
     pub fn build_forward_expand(&mut self, tensor: &Tensor) {
         unsafe { sys::ggml_build_forward_expand(self.inner, tensor.ptr.as_ptr()) }
     }
+
+    /// Returns the leafs in this graph.
+    pub fn leafs(&self, context: &Context) -> Vec<Tensor> {
+        let mut wrapped_leafs: Vec<Tensor> = vec![];
+        unsafe {
+            for leaf in self.inner.as_ref().unwrap().leafs {
+                if !leaf.is_null() {
+                    wrapped_leafs.push(Tensor {
+                        ptr: NonNull::new(leaf).expect("Should not be null"),
+                        inner: Arc::downgrade(&context.inner),
+                    })
+                }
+            }
+            wrapped_leafs
+        }
+    }
+    /// Returns the nodes in this graph.
+    pub fn nodes(&self, context: &Context) -> Vec<Tensor> {
+        let mut wrapped_nodes: Vec<Tensor> = vec![];
+        unsafe {
+            for leaf in self.inner.as_ref().unwrap().leafs {
+                if !leaf.is_null() {
+                    wrapped_nodes.push(Tensor {
+                        ptr: NonNull::new(leaf).expect("Should not be null"),
+                        inner: Arc::downgrade(&context.inner),
+                    })
+                }
+            }
+            wrapped_nodes
+        }
+    }
 }
 
 /// A `ggml` execution plan. Contains the information needed to execute a computation graph.
@@ -350,30 +404,79 @@ impl GraphExecutionPlan {
         }
     }
 
-    /// Creates a [Type::I8] work buffer with size `plan.work_size` for this [GraphExecutionPlan] in the given [Context].
-    fn create_work_buffer(&mut self, context: &Context) -> Tensor {
-        context.new_tensor_1d(Type::I8, self.inner.work_size)
-    }
+    /// Execute this [GraphExecutionPlan] in the given [Context].
+    pub fn execute(&mut self, buffer: &mut Vec<u8>) {
+        if self.inner.work_size > 0 {
+            buffer.resize(self.inner.work_size, 0);
+            self.inner.work_data = buffer.as_mut_ptr().cast();
+        }
 
-    /// Assign a work buffer to this [GraphExecutionPlan].
-    fn assign_work_buffer(&mut self, buffer: &mut Tensor) {
-        assert!(
-            buffer.get_type() == Type::I8,
-            "Work buffer must be of type i8"
-        );
         unsafe {
-            self.inner.work_data = buffer.data().cast();
+            sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
         }
     }
+}
 
-    /// Execute this [GraphExecutionPlan] in the given [Context].
-    pub fn execute(&mut self, context: &Context) {
-        let mut work_buffer = self.create_work_buffer(context);
-        self.assign_work_buffer(&mut work_buffer);
+#[derive(PartialEq, Eq, Debug)]
+/// Acts as a RAII-guard over a `sys::ggml_allocr`, allocating via
+/// `ggml_allocr_new` and dropping via `ggml_allocr_free`.
+/// Used to allocate the memory used by a computational graph.
+pub struct GraphAllocator {
+    /// The underlying `sys::ggml_allocr` pointer.
+    pub ptr: *mut sys::ggml_allocr,
+    /// The buffer used by this allocator.
+    pub buffer: Buffer,
+}
 
-        unsafe {
-            sys::ggml_graph_compute(self.inner_graph, &mut self.inner);
-        }
+impl GraphAllocator {
+    /// Create a new allocator with the specified buffer.
+    pub fn new(buffer: Buffer, tensor_alignment: usize) -> Self {
+        let ptr = unsafe { sys::ggml_allocr_new(buffer.data, buffer.size(), tensor_alignment) };
+        Self { ptr, buffer }
+    }
+
+    /// Create a new allocator to measure a computational graph.
+    pub fn new_measurement(tensor_alignment: usize) -> Self {
+        let ptr = unsafe { sys::ggml_allocr_new_measure(tensor_alignment) };
+        let buffer = Buffer::new(tensor_alignment);
+        Self { ptr, buffer }
+    }
+
+    /// Allocates a computational graph in the allocator and returns the size in bytes.
+    pub fn allocate_graph(&self, graph: &ComputationGraph) -> usize {
+        unsafe { sys::ggml_allocr_alloc_graph(self.ptr, graph.inner) }
+    }
+
+    /// Resets the allocator for a new forward pass.
+    pub fn reset(&self) {
+        unsafe { sys::ggml_allocr_reset(self.ptr) }
+    }
+
+    /// Returns true if the allocator is in measuring mode.
+    pub fn in_measuring_mode(&self) -> bool {
+        unsafe { sys::ggml_allocr_is_measure(self.ptr) }
+    }
+
+    /// Allocates memory for a given tensor in the allocator.
+    pub fn allocate(&self, tensor: &Tensor) {
+        unsafe { sys::ggml_allocr_alloc(self.ptr, tensor.ptr.as_ptr()) }
+    }
+
+    /// Switches the buffer used by the allocator.
+    pub fn resize_buffer(&mut self, graph_size: usize, tensor_alignment: usize) {
+        // Free the old allocator
+        unsafe { sys::ggml_allocr_free(self.ptr) }
+        //Resize the buffer
+        self.buffer = Buffer::new_unaligned(graph_size);
+        // Create a new allocator with the new buffer
+        self.ptr =
+            unsafe { sys::ggml_allocr_new(self.buffer.data, self.buffer.size(), tensor_alignment) };
+    }
+}
+
+impl Drop for GraphAllocator {
+    fn drop(&mut self) {
+        unsafe { sys::ggml_allocr_free(self.ptr) }
     }
 }
 
@@ -496,3 +599,8 @@ pub fn cpu_has_gpublas() -> bool {
 pub fn graph_overhead() -> usize {
     unsafe { sys::ggml_graph_overhead() }
 }
+
+/// Returns the tensor overhead in bytes.
+pub fn tensor_overhead() -> usize {
+    unsafe { sys::ggml_tensor_overhead() }
+}
diff --git a/crates/ggml/src/tensor.rs b/crates/ggml/src/tensor.rs
index 33d7114c..ee5354c2 100644
--- a/crates/ggml/src/tensor.rs
+++ b/crates/ggml/src/tensor.rs
@@ -52,6 +52,11 @@ impl Tensor {
         })
     }
 
+    /// Returns true if the 'extra' field of this tensor is set. e.g. by ggml-cuda
+    pub fn has_extras(&self) -> bool {
+        self.with_alive_ctx(|| unsafe { !self.ptr.as_ref().extra.is_null() })
+    }
+
     /// Sets the tensor's acceleration backend and moves the tensor's data to the new backend.
     pub fn transfer_to(mut self, backend: Backend) -> Tensor {
         self.with_alive_ctx_mut(|t| {
@@ -88,7 +93,7 @@ impl Tensor {
         self.with_alive_ctx(|| {
             #[cfg(feature = "cublas")]
             unsafe {
-                sys::cuda::ggml_cuda_assign_buffers(self.ptr.as_ptr());
+                sys::cuda::ggml_cuda_assign_buffers_no_alloc(self.ptr.as_ptr());
             }
         })
     }
@@ -111,6 +116,18 @@ impl Tensor {
         })
     }
 
+    /// If ggml-sys is compiled with CUDA support, this function will set the tensor's scratch offset.
+    /// If not, this is a no-op.
+    #[allow(unused_variables)]
+    pub fn assign_scratch_offset(&self, offset: usize) {
+        self.with_alive_ctx(|| {
+            #[cfg(feature = "cublas")]
+            unsafe {
+                sys::cuda::ggml_cuda_assign_scratch_offset(self.ptr.as_ptr(), offset);
+            }
+        })
+    }
+
     /// Creates a shared copy of this tensor pointer.
     pub fn share(&self) -> Self {
         Tensor {
diff --git a/crates/ggml/sys/build.rs b/crates/ggml/sys/build.rs
index b2921d47..ba7e876b 100644
--- a/crates/ggml/sys/build.rs
+++ b/crates/ggml/sys/build.rs
@@ -12,8 +12,13 @@ fn main() {
     let mut builder = cc::Build::new();
 
     let build = builder
-        .files(["llama-cpp/ggml.c", "llama-cpp/k_quants.c"])
+        .files([
+            "llama-cpp/ggml.c",
+            "llama-cpp/k_quants.c",
+            "llama-cpp/ggml-alloc.c",
+        ])
         .define("GGML_USE_K_QUANTS", None)
+        .define("QK_K", Some("256"))
         .includes(["llama-cpp"]);
 
     // This is a very basic heuristic for applying compile flags.
@@ -87,6 +92,10 @@ fn main() {
         _ => {}
     }
 
+    if compiler.is_like_gnu() && target_os == "linux" {
+        build.define("_GNU_SOURCE", None);
+    }
+
     if is_release {
         build.define("NDEBUG", None);
     }
diff --git a/crates/ggml/sys/llama-cpp b/crates/ggml/sys/llama-cpp
index 1a941869..da040034 160000
--- a/crates/ggml/sys/llama-cpp
+++ b/crates/ggml/sys/llama-cpp
@@ -1 +1 @@
-Subproject commit 1a941869cbef8e9cc351a6c6987e4ae3b0f021f7
+Subproject commit da0400344be12074e67dcabc565140289cf7efaa
diff --git a/crates/ggml/sys/src/cuda.rs b/crates/ggml/sys/src/cuda.rs
index a9ae1a8d..5208b66e 100644
--- a/crates/ggml/sys/src/cuda.rs
+++ b/crates/ggml/sys/src/cuda.rs
@@ -3,15 +3,17 @@
 use super::ggml_compute_params;
 use super::ggml_tensor;
 
+pub const GGML_CUDA_NAME: &[u8; 5usize] = b"CUDA\0";
+pub const GGML_CUBLAS_NAME: &[u8; 7usize] = b"cuBLAS\0";
 pub const GGML_CUDA_MAX_DEVICES: u32 = 16;
 extern "C" {
     pub fn ggml_init_cublas();
 }
 extern "C" {
-    pub fn ggml_cuda_set_tensor_split(tensor_split: *const f32);
+    pub fn ggml_cuda_host_malloc(size: usize) -> *mut ::std::os::raw::c_void;
 }
 extern "C" {
-    pub fn ggml_cuda_mul(src0: *const ggml_tensor, src1: *const ggml_tensor, dst: *mut ggml_tensor);
+    pub fn ggml_cuda_host_free(ptr: *mut ::std::os::raw::c_void);
 }
 extern "C" {
     pub fn ggml_cuda_can_mul_mat(
@@ -21,26 +23,7 @@ extern "C" {
     ) -> bool;
 }
 extern "C" {
-    pub fn ggml_cuda_mul_mat_get_wsize(
-        src0: *const ggml_tensor,
-        src1: *const ggml_tensor,
-        dst: *mut ggml_tensor,
-    ) -> usize;
-}
-extern "C" {
-    pub fn ggml_cuda_mul_mat(
-        src0: *const ggml_tensor,
-        src1: *const ggml_tensor,
-        dst: *mut ggml_tensor,
-        wdata: *mut ::std::os::raw::c_void,
-        wsize: usize,
-    );
-}
-extern "C" {
-    pub fn ggml_cuda_host_malloc(size: usize) -> *mut ::std::os::raw::c_void;
-}
-extern "C" {
-    pub fn ggml_cuda_host_free(ptr: *mut ::std::os::raw::c_void);
+    pub fn ggml_cuda_set_tensor_split(tensor_split: *const f32);
 }
 extern "C" {
     pub fn ggml_cuda_transform_tensor(data: *mut ::std::os::raw::c_void, tensor: *mut ggml_tensor);
@@ -57,6 +40,12 @@ extern "C" {
 extern "C" {
     pub fn ggml_cuda_assign_buffers_force_inplace(tensor: *mut ggml_tensor);
 }
+extern "C" {
+    pub fn ggml_cuda_assign_buffers_no_alloc(tensor: *mut ggml_tensor);
+}
+extern "C" {
+    pub fn ggml_cuda_assign_scratch_offset(tensor: *mut ggml_tensor, offset: usize);
+}
 extern "C" {
     pub fn ggml_cuda_set_main_device(main_device: ::std::os::raw::c_int);
 }
@@ -75,3 +64,13 @@ extern "C" {
         tensor: *mut ggml_tensor,
     ) -> bool;
 }
+extern "C" {
+    pub fn ggml_cuda_get_device_count() -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn ggml_cuda_get_device_description(
+        device: ::std::os::raw::c_int,
+        description: *mut ::std::os::raw::c_char,
+        description_size: usize,
+    );
+}
diff --git a/crates/ggml/sys/src/lib.rs b/crates/ggml/sys/src/lib.rs
index 77b47802..71b34251 100644
--- a/crates/ggml/sys/src/lib.rs
+++ b/crates/ggml/sys/src/lib.rs
@@ -22,12 +22,17 @@ pub const GGML_MAX_NODES: u32 = 4096;
 pub const GGML_MAX_PARAMS: u32 = 256;
 pub const GGML_MAX_CONTEXTS: u32 = 64;
 pub const GGML_MAX_SRC: u32 = 6;
-pub const GGML_MAX_NAME: u32 = 48;
+pub const GGML_MAX_NAME: u32 = 64;
 pub const GGML_MAX_OP_PARAMS: u32 = 32;
 pub const GGML_DEFAULT_N_THREADS: u32 = 4;
+pub const GGML_MEM_ALIGN: u32 = 16;
 pub const GGML_EXIT_SUCCESS: u32 = 0;
 pub const GGML_EXIT_ABORTED: u32 = 1;
+pub const GGUF_MAGIC: u32 = 1179993927;
+pub const GGUF_VERSION: u32 = 2;
+pub const GGUF_DEFAULT_ALIGNMENT: u32 = 32;
 pub const GGML_GRAPH_HASHTABLE_SIZE: u32 = 8273;
+pub const GGML_N_TASKS_MAX: i32 = -1;
 pub const QK_K: u32 = 256;
 pub const K_SCALE_SIZE: u32 = 12;
 pub type ggml_fp16_t = u16;
@@ -103,49 +108,58 @@ pub const ggml_op_GGML_OP_MEAN: ggml_op = 13;
 pub const ggml_op_GGML_OP_ARGMAX: ggml_op = 14;
 pub const ggml_op_GGML_OP_REPEAT: ggml_op = 15;
 pub const ggml_op_GGML_OP_REPEAT_BACK: ggml_op = 16;
-pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 17;
-pub const ggml_op_GGML_OP_NORM: ggml_op = 18;
-pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 19;
-pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 20;
-pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 21;
-pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 22;
-pub const ggml_op_GGML_OP_SCALE: ggml_op = 23;
-pub const ggml_op_GGML_OP_SET: ggml_op = 24;
-pub const ggml_op_GGML_OP_CPY: ggml_op = 25;
-pub const ggml_op_GGML_OP_CONT: ggml_op = 26;
-pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 27;
-pub const ggml_op_GGML_OP_VIEW: ggml_op = 28;
-pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 29;
-pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 30;
-pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 31;
-pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 32;
-pub const ggml_op_GGML_OP_DIAG: ggml_op = 33;
-pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 34;
-pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 35;
-pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 36;
-pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 37;
-pub const ggml_op_GGML_OP_ROPE: ggml_op = 38;
-pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 39;
-pub const ggml_op_GGML_OP_ALIBI: ggml_op = 40;
-pub const ggml_op_GGML_OP_CLAMP: ggml_op = 41;
-pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 42;
-pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 43;
-pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 44;
-pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 45;
-pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 46;
-pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 47;
-pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 48;
-pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 49;
-pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 50;
-pub const ggml_op_GGML_OP_UNARY: ggml_op = 51;
-pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 52;
-pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 53;
-pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 54;
-pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 55;
-pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 56;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 57;
-pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 58;
-pub const ggml_op_GGML_OP_COUNT: ggml_op = 59;
+pub const ggml_op_GGML_OP_CONCAT: ggml_op = 17;
+pub const ggml_op_GGML_OP_SILU_BACK: ggml_op = 18;
+pub const ggml_op_GGML_OP_NORM: ggml_op = 19;
+pub const ggml_op_GGML_OP_RMS_NORM: ggml_op = 20;
+pub const ggml_op_GGML_OP_RMS_NORM_BACK: ggml_op = 21;
+pub const ggml_op_GGML_OP_GROUP_NORM: ggml_op = 22;
+pub const ggml_op_GGML_OP_MUL_MAT: ggml_op = 23;
+pub const ggml_op_GGML_OP_OUT_PROD: ggml_op = 24;
+pub const ggml_op_GGML_OP_SCALE: ggml_op = 25;
+pub const ggml_op_GGML_OP_SET: ggml_op = 26;
+pub const ggml_op_GGML_OP_CPY: ggml_op = 27;
+pub const ggml_op_GGML_OP_CONT: ggml_op = 28;
+pub const ggml_op_GGML_OP_RESHAPE: ggml_op = 29;
+pub const ggml_op_GGML_OP_VIEW: ggml_op = 30;
+pub const ggml_op_GGML_OP_PERMUTE: ggml_op = 31;
+pub const ggml_op_GGML_OP_TRANSPOSE: ggml_op = 32;
+pub const ggml_op_GGML_OP_GET_ROWS: ggml_op = 33;
+pub const ggml_op_GGML_OP_GET_ROWS_BACK: ggml_op = 34;
+pub const ggml_op_GGML_OP_DIAG: ggml_op = 35;
+pub const ggml_op_GGML_OP_DIAG_MASK_INF: ggml_op = 36;
+pub const ggml_op_GGML_OP_DIAG_MASK_ZERO: ggml_op = 37;
+pub const ggml_op_GGML_OP_SOFT_MAX: ggml_op = 38;
+pub const ggml_op_GGML_OP_SOFT_MAX_BACK: ggml_op = 39;
+pub const ggml_op_GGML_OP_ROPE: ggml_op = 40;
+pub const ggml_op_GGML_OP_ROPE_BACK: ggml_op = 41;
+pub const ggml_op_GGML_OP_ALIBI: ggml_op = 42;
+pub const ggml_op_GGML_OP_CLAMP: ggml_op = 43;
+pub const ggml_op_GGML_OP_CONV_1D: ggml_op = 44;
+pub const ggml_op_GGML_OP_CONV_2D: ggml_op = 45;
+pub const ggml_op_GGML_OP_CONV_TRANSPOSE_2D: ggml_op = 46;
+pub const ggml_op_GGML_OP_POOL_1D: ggml_op = 47;
+pub const ggml_op_GGML_OP_POOL_2D: ggml_op = 48;
+pub const ggml_op_GGML_OP_UPSCALE: ggml_op = 49;
+pub const ggml_op_GGML_OP_FLASH_ATTN: ggml_op = 50;
+pub const ggml_op_GGML_OP_FLASH_FF: ggml_op = 51;
+pub const ggml_op_GGML_OP_FLASH_ATTN_BACK: ggml_op = 52;
+pub const ggml_op_GGML_OP_WIN_PART: ggml_op = 53;
+pub const ggml_op_GGML_OP_WIN_UNPART: ggml_op = 54;
+pub const ggml_op_GGML_OP_GET_REL_POS: ggml_op = 55;
+pub const ggml_op_GGML_OP_ADD_REL_POS: ggml_op = 56;
+pub const ggml_op_GGML_OP_UNARY: ggml_op = 57;
+pub const ggml_op_GGML_OP_MAP_UNARY: ggml_op = 58;
+pub const ggml_op_GGML_OP_MAP_BINARY: ggml_op = 59;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1_F32: ggml_op = 60;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2_F32: ggml_op = 61;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3_F32: ggml_op = 62;
+pub const ggml_op_GGML_OP_MAP_CUSTOM1: ggml_op = 63;
+pub const ggml_op_GGML_OP_MAP_CUSTOM2: ggml_op = 64;
+pub const ggml_op_GGML_OP_MAP_CUSTOM3: ggml_op = 65;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS: ggml_op = 66;
+pub const ggml_op_GGML_OP_CROSS_ENTROPY_LOSS_BACK: ggml_op = 67;
+pub const ggml_op_GGML_OP_COUNT: ggml_op = 68;
 pub type ggml_op = ::std::os::raw::c_uint;
 pub const ggml_unary_op_GGML_UNARY_OP_ABS: ggml_unary_op = 0;
 pub const ggml_unary_op_GGML_UNARY_OP_SGN: ggml_unary_op = 1;
@@ -157,11 +171,15 @@ pub const ggml_unary_op_GGML_UNARY_OP_RELU: ggml_unary_op = 6;
 pub const ggml_unary_op_GGML_UNARY_OP_GELU: ggml_unary_op = 7;
 pub const ggml_unary_op_GGML_UNARY_OP_GELU_QUICK: ggml_unary_op = 8;
 pub const ggml_unary_op_GGML_UNARY_OP_SILU: ggml_unary_op = 9;
-pub type ggml_unary_op = ::std::os::raw::c_int;
+pub type ggml_unary_op = ::std::os::raw::c_uint;
 pub const ggml_object_type_GGML_OBJECT_TENSOR: ggml_object_type = 0;
 pub const ggml_object_type_GGML_OBJECT_GRAPH: ggml_object_type = 1;
 pub const ggml_object_type_GGML_OBJECT_WORK_BUFFER: ggml_object_type = 2;
-pub type ggml_object_type = ::std::os::raw::c_int;
+pub type ggml_object_type = ::std::os::raw::c_uint;
+pub const ggml_log_level_GGML_LOG_LEVEL_ERROR: ggml_log_level = 2;
+pub const ggml_log_level_GGML_LOG_LEVEL_WARN: ggml_log_level = 3;
+pub const ggml_log_level_GGML_LOG_LEVEL_INFO: ggml_log_level = 4;
+pub type ggml_log_level = ::std::os::raw::c_uint;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_object {
@@ -253,8 +271,10 @@ pub struct ggml_tensor {
     pub perf_runs: ::std::os::raw::c_int,
     pub perf_cycles: i64,
     pub perf_time_us: i64,
+    pub view_src: *mut ggml_tensor,
+    pub view_offs: usize,
     pub data: *mut ::std::os::raw::c_void,
-    pub name: [::std::os::raw::c_char; 48usize],
+    pub name: [::std::os::raw::c_char; 64usize],
     pub extra: *mut ::std::os::raw::c_void,
     pub padding: [::std::os::raw::c_char; 4usize],
 }
@@ -264,7 +284,7 @@ fn bindgen_test_layout_ggml_tensor() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_tensor>(),
-        272usize,
+        304usize,
         concat!("Size of: ", stringify!(ggml_tensor))
     );
     assert_eq!(
@@ -403,8 +423,28 @@ fn bindgen_test_layout_ggml_tensor() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).view_src) as usize - ptr as usize },
         200usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(view_src)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).view_offs) as usize - ptr as usize },
+        208usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_tensor),
+            "::",
+            stringify!(view_offs)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).data) as usize - ptr as usize },
+        216usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -414,7 +454,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).name) as usize - ptr as usize },
-        208usize,
+        224usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -424,7 +464,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).extra) as usize - ptr as usize },
-        256usize,
+        288usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -434,7 +474,7 @@ fn bindgen_test_layout_ggml_tensor() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).padding) as usize - ptr as usize },
-        264usize,
+        296usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_tensor),
@@ -443,7 +483,7 @@ fn bindgen_test_layout_ggml_tensor() {
         )
     );
 }
-pub const GGML_TENSOR_SIZE: usize = 272;
+pub const GGML_TENSOR_SIZE: usize = 304;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_cplan {
@@ -867,6 +907,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_nbytes(tensor: *const ggml_tensor) -> usize;
 }
+extern "C" {
+    pub fn ggml_nbytes_pad(tensor: *const ggml_tensor) -> usize;
+}
 extern "C" {
     pub fn ggml_nbytes_split(
         tensor: *const ggml_tensor,
@@ -909,6 +952,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_is_permuted(tensor: *const ggml_tensor) -> bool;
 }
+extern "C" {
+    pub fn ggml_are_same_shape(t0: *const ggml_tensor, t1: *const ggml_tensor) -> bool;
+}
 extern "C" {
     pub fn ggml_tensor_overhead() -> usize;
 }
@@ -991,7 +1037,7 @@ extern "C" {
     pub fn ggml_dup_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *const ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_view_tensor(ctx: *mut ggml_context, src: *mut ggml_tensor) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_get_tensor(
@@ -1187,6 +1233,13 @@ extern "C" {
         b: *mut ggml_tensor,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_concat(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_abs(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
 }
@@ -1256,10 +1309,14 @@ extern "C" {
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_norm_inplace(ctx: *mut ggml_context, a: *mut ggml_tensor) -> *mut ggml_tensor;
+    pub fn ggml_norm_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        eps: f32,
+    ) -> *mut ggml_tensor;
 }
 extern "C" {
     pub fn ggml_rms_norm(ctx: *mut ggml_context, a: *mut ggml_tensor, eps: f32)
@@ -1272,11 +1329,26 @@ extern "C" {
         eps: f32,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_group_norm(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        n_groups: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_group_norm_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        n_groups: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_rms_norm_back(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
         b: *mut ggml_tensor,
+        eps: f32,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1591,6 +1663,16 @@ extern "C" {
         freq_scale: f32,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_rope_xpos_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        n_past: ::std::os::raw::c_int,
+        n_dims: ::std::os::raw::c_int,
+        base: f32,
+        down: bool,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_rope_back(
         ctx: *mut ggml_context,
@@ -1599,6 +1681,10 @@ extern "C" {
         n_dims: ::std::os::raw::c_int,
         mode: ::std::os::raw::c_int,
         n_ctx: ::std::os::raw::c_int,
+        freq_base: f32,
+        freq_scale: f32,
+        xpos_base: f32,
+        xpos_down: bool,
     ) -> *mut ggml_tensor;
 }
 extern "C" {
@@ -1628,6 +1714,15 @@ extern "C" {
         d0: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_conv_1d_ph(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        s: ::std::os::raw::c_int,
+        d: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_conv_2d(
         ctx: *mut ggml_context,
@@ -1642,18 +1737,31 @@ extern "C" {
     ) -> *mut ggml_tensor;
 }
 extern "C" {
-    pub fn ggml_conv_1d_ph(
+    pub fn ggml_conv_2d_sk_p0(
         ctx: *mut ggml_context,
         a: *mut ggml_tensor,
         b: *mut ggml_tensor,
-        s: ::std::os::raw::c_int,
-        d: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_conv_2d_s1_ph(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_conv_transpose_2d_p0(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        stride: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
 pub const ggml_op_pool_GGML_OP_POOL_MAX: ggml_op_pool = 0;
 pub const ggml_op_pool_GGML_OP_POOL_AVG: ggml_op_pool = 1;
 pub const ggml_op_pool_GGML_OP_POOL_COUNT: ggml_op_pool = 2;
-pub type ggml_op_pool = ::std::os::raw::c_int;
+pub type ggml_op_pool = ::std::os::raw::c_uint;
 extern "C" {
     pub fn ggml_pool_1d(
         ctx: *mut ggml_context,
@@ -1677,6 +1785,13 @@ extern "C" {
         p1: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_upscale(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        scale_factor: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_flash_attn(
         ctx: *mut ggml_context,
@@ -1722,6 +1837,44 @@ extern "C" {
         w: ::std::os::raw::c_int,
     ) -> *mut ggml_tensor;
 }
+extern "C" {
+    pub fn ggml_unary(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_unary_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        op: ggml_unary_op,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_get_rel_pos(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        qh: ::std::os::raw::c_int,
+        kh: ::std::os::raw::c_int,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_add_rel_pos(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        pw: *mut ggml_tensor,
+        ph: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_add_rel_pos_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        pw: *mut ggml_tensor,
+        ph: *mut ggml_tensor,
+    ) -> *mut ggml_tensor;
+}
 pub type ggml_unary_op_f32_t = ::std::option::Option<
     unsafe extern "C" fn(arg1: ::std::os::raw::c_int, arg2: *mut f32, arg3: *const f32),
 >;
@@ -1750,20 +1903,6 @@ pub type ggml_custom3_op_f32_t = ::std::option::Option<
         arg4: *const ggml_tensor,
     ),
 >;
-extern "C" {
-    pub fn ggml_unary(
-        ctx: *mut ggml_context,
-        a: *mut ggml_tensor,
-        op: ggml_unary_op,
-    ) -> *mut ggml_tensor;
-}
-extern "C" {
-    pub fn ggml_unary_inplace(
-        ctx: *mut ggml_context,
-        a: *mut ggml_tensor,
-        op: ggml_unary_op,
-    ) -> *mut ggml_tensor;
-}
 extern "C" {
     pub fn ggml_map_unary_f32(
         ctx: *mut ggml_context,
@@ -1842,6 +1981,96 @@ extern "C" {
         fun: ggml_custom3_op_f32_t,
     ) -> *mut ggml_tensor;
 }
+pub type ggml_custom1_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+pub type ggml_custom2_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        b: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+pub type ggml_custom3_op_t = ::std::option::Option<
+    unsafe extern "C" fn(
+        dst: *mut ggml_tensor,
+        a: *const ggml_tensor,
+        b: *const ggml_tensor,
+        c: *const ggml_tensor,
+        ith: ::std::os::raw::c_int,
+        nth: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ),
+>;
+extern "C" {
+    pub fn ggml_map_custom1(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        fun: ggml_custom1_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom1_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        fun: ggml_custom1_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom2(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        fun: ggml_custom2_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom2_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        fun: ggml_custom2_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom3(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        c: *mut ggml_tensor,
+        fun: ggml_custom3_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
+extern "C" {
+    pub fn ggml_map_custom3_inplace(
+        ctx: *mut ggml_context,
+        a: *mut ggml_tensor,
+        b: *mut ggml_tensor,
+        c: *mut ggml_tensor,
+        fun: ggml_custom3_op_t,
+        n_tasks: ::std::os::raw::c_int,
+        userdata: *mut ::std::os::raw::c_void,
+    ) -> *mut ggml_tensor;
+}
 extern "C" {
     pub fn ggml_cross_entropy_loss(
         ctx: *mut ggml_context,
@@ -1863,6 +2092,14 @@ extern "C" {
 extern "C" {
     pub fn ggml_build_forward_expand(cgraph: *mut ggml_cgraph, tensor: *mut ggml_tensor);
 }
+extern "C" {
+    pub fn ggml_build_backward_expand(
+        ctx: *mut ggml_context,
+        gf: *mut ggml_cgraph,
+        gb: *mut ggml_cgraph,
+        keep: bool,
+    );
+}
 extern "C" {
     pub fn ggml_build_forward(tensor: *mut ggml_tensor) -> ggml_cgraph;
 }
@@ -1952,6 +2189,15 @@ pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_STEP: ggml_opt_result = -126;
 pub const ggml_opt_result_GGML_LINESEARCH_MAXIMUM_ITERATIONS: ggml_opt_result = -125;
 pub const ggml_opt_result_GGML_LINESEARCH_INVALID_PARAMETERS: ggml_opt_result = -124;
 pub type ggml_opt_result = ::std::os::raw::c_int;
+pub type ggml_opt_callback =
+    ::std::option::Option<unsafe extern "C" fn(data: *mut ::std::os::raw::c_void, sched: *mut f32)>;
+pub type ggml_log_callback = ::std::option::Option<
+    unsafe extern "C" fn(
+        level: ggml_log_level,
+        text: *const ::std::os::raw::c_char,
+        user_data: *mut ::std::os::raw::c_void,
+    ),
+>;
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_opt_params {
@@ -1971,12 +2217,14 @@ pub struct ggml_opt_params__bindgen_ty_1 {
     pub n_iter: ::std::os::raw::c_int,
     pub sched: f32,
     pub decay: f32,
+    pub decay_min_ndim: ::std::os::raw::c_int,
     pub alpha: f32,
     pub beta1: f32,
     pub beta2: f32,
     pub eps: f32,
     pub eps_f: f32,
     pub eps_g: f32,
+    pub gclip: f32,
 }
 #[test]
 fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
@@ -1985,7 +2233,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_params__bindgen_ty_1>(),
-        36usize,
+        44usize,
         concat!("Size of: ", stringify!(ggml_opt_params__bindgen_ty_1))
     );
     assert_eq!(
@@ -2024,8 +2272,18 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).decay_min_ndim) as usize - ptr as usize },
         12usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params__bindgen_ty_1),
+            "::",
+            stringify!(decay_min_ndim)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).alpha) as usize - ptr as usize },
+        16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2035,7 +2293,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).beta1) as usize - ptr as usize },
-        16usize,
+        20usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2045,7 +2303,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).beta2) as usize - ptr as usize },
-        20usize,
+        24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2055,7 +2313,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps) as usize - ptr as usize },
-        24usize,
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2065,7 +2323,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps_f) as usize - ptr as usize },
-        28usize,
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2075,7 +2333,7 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).eps_g) as usize - ptr as usize },
-        32usize,
+        36usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params__bindgen_ty_1),
@@ -2083,6 +2341,16 @@ fn bindgen_test_layout_ggml_opt_params__bindgen_ty_1() {
             stringify!(eps_g)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).gclip) as usize - ptr as usize },
+        40usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_params__bindgen_ty_1),
+            "::",
+            stringify!(gclip)
+        )
+    );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
@@ -2209,7 +2477,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_params>(),
-        96usize,
+        104usize,
         concat!("Size of: ", stringify!(ggml_opt_params))
     );
     assert_eq!(
@@ -2299,7 +2567,7 @@ fn bindgen_test_layout_ggml_opt_params() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).lbfgs) as usize - ptr as usize },
-        60usize,
+        68usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_params),
@@ -2316,19 +2584,16 @@ pub struct ggml_opt_context {
     pub iter: ::std::os::raw::c_int,
     pub nx: i64,
     pub just_initialized: bool,
+    pub loss_before: f32,
+    pub loss_after: f32,
     pub adam: ggml_opt_context__bindgen_ty_1,
     pub lbfgs: ggml_opt_context__bindgen_ty_2,
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_opt_context__bindgen_ty_1 {
-    pub x: *mut ggml_tensor,
-    pub g1: *mut ggml_tensor,
-    pub g2: *mut ggml_tensor,
     pub m: *mut ggml_tensor,
     pub v: *mut ggml_tensor,
-    pub mh: *mut ggml_tensor,
-    pub vh: *mut ggml_tensor,
     pub pf: *mut ggml_tensor,
     pub fx_best: f32,
     pub fx_prev: f32,
@@ -2341,7 +2606,7 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_context__bindgen_ty_1>(),
-        80usize,
+        40usize,
         concat!("Size of: ", stringify!(ggml_opt_context__bindgen_ty_1))
     );
     assert_eq!(
@@ -2350,113 +2615,63 @@ fn bindgen_test_layout_ggml_opt_context__bindgen_ty_1() {
         concat!("Alignment of ", stringify!(ggml_opt_context__bindgen_ty_1))
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).x) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize },
         0usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(x)
+            stringify!(m)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).g1) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize },
         8usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(g1)
+            stringify!(v)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).g2) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).pf) as usize - ptr as usize },
         16usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(g2)
+            stringify!(pf)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).m) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).fx_best) as usize - ptr as usize },
         24usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(m)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).v) as usize - ptr as usize },
-        32usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(v)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).mh) as usize - ptr as usize },
-        40usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(mh)
+            stringify!(fx_best)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).vh) as usize - ptr as usize },
-        48usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).fx_prev) as usize - ptr as usize },
+        28usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(vh)
+            stringify!(fx_prev)
         )
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).pf) as usize - ptr as usize },
-        56usize,
+        unsafe { ::std::ptr::addr_of!((*ptr).n_no_improvement) as usize - ptr as usize },
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context__bindgen_ty_1),
             "::",
-            stringify!(pf)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).fx_best) as usize - ptr as usize },
-        64usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(fx_best)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).fx_prev) as usize - ptr as usize },
-        68usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(fx_prev)
-        )
-    );
-    assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).n_no_improvement) as usize - ptr as usize },
-        72usize,
-        concat!(
-            "Offset of field: ",
-            stringify!(ggml_opt_context__bindgen_ty_1),
-            "::",
-            stringify!(n_no_improvement)
+            stringify!(n_no_improvement)
         )
     );
 }
@@ -2662,7 +2877,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_opt_context>(),
-        312usize,
+        288usize,
         concat!("Size of: ", stringify!(ggml_opt_context))
     );
     assert_eq!(
@@ -2692,7 +2907,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).iter) as usize - ptr as usize },
-        104usize,
+        112usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -2702,7 +2917,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).nx) as usize - ptr as usize },
-        112usize,
+        120usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -2712,7 +2927,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).just_initialized) as usize - ptr as usize },
-        120usize,
+        128usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -2720,9 +2935,29 @@ fn bindgen_test_layout_ggml_opt_context() {
             stringify!(just_initialized)
         )
     );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).loss_before) as usize - ptr as usize },
+        132usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_context),
+            "::",
+            stringify!(loss_before)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).loss_after) as usize - ptr as usize },
+        136usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_opt_context),
+            "::",
+            stringify!(loss_after)
+        )
+    );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).adam) as usize - ptr as usize },
-        128usize,
+        144usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -2732,7 +2967,7 @@ fn bindgen_test_layout_ggml_opt_context() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).lbfgs) as usize - ptr as usize },
-        208usize,
+        184usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_opt_context),
@@ -2773,6 +3008,8 @@ extern "C" {
         f: *mut ggml_tensor,
         gf: *mut ggml_cgraph,
         gb: *mut ggml_cgraph,
+        callback: ggml_opt_callback,
+        callback_data: *mut ::std::os::raw::c_void,
     ) -> ggml_opt_result;
 }
 extern "C" {
@@ -2830,6 +3067,282 @@ extern "C" {
         hist: *mut i64,
     ) -> usize;
 }
+pub const gguf_type_GGUF_TYPE_UINT8: gguf_type = 0;
+pub const gguf_type_GGUF_TYPE_INT8: gguf_type = 1;
+pub const gguf_type_GGUF_TYPE_UINT16: gguf_type = 2;
+pub const gguf_type_GGUF_TYPE_INT16: gguf_type = 3;
+pub const gguf_type_GGUF_TYPE_UINT32: gguf_type = 4;
+pub const gguf_type_GGUF_TYPE_INT32: gguf_type = 5;
+pub const gguf_type_GGUF_TYPE_FLOAT32: gguf_type = 6;
+pub const gguf_type_GGUF_TYPE_BOOL: gguf_type = 7;
+pub const gguf_type_GGUF_TYPE_STRING: gguf_type = 8;
+pub const gguf_type_GGUF_TYPE_ARRAY: gguf_type = 9;
+pub const gguf_type_GGUF_TYPE_UINT64: gguf_type = 10;
+pub const gguf_type_GGUF_TYPE_INT64: gguf_type = 11;
+pub const gguf_type_GGUF_TYPE_FLOAT64: gguf_type = 12;
+pub const gguf_type_GGUF_TYPE_COUNT: gguf_type = 13;
+pub type gguf_type = ::std::os::raw::c_uint;
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct gguf_context {
+    _unused: [u8; 0],
+}
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct gguf_init_params {
+    pub no_alloc: bool,
+    pub ctx: *mut *mut ggml_context,
+}
+#[test]
+fn bindgen_test_layout_gguf_init_params() {
+    const UNINIT: ::std::mem::MaybeUninit<gguf_init_params> = ::std::mem::MaybeUninit::uninit();
+    let ptr = UNINIT.as_ptr();
+    assert_eq!(
+        ::std::mem::size_of::<gguf_init_params>(),
+        16usize,
+        concat!("Size of: ", stringify!(gguf_init_params))
+    );
+    assert_eq!(
+        ::std::mem::align_of::<gguf_init_params>(),
+        8usize,
+        concat!("Alignment of ", stringify!(gguf_init_params))
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).no_alloc) as usize - ptr as usize },
+        0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(gguf_init_params),
+            "::",
+            stringify!(no_alloc)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).ctx) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(gguf_init_params),
+            "::",
+            stringify!(ctx)
+        )
+    );
+}
+extern "C" {
+    pub fn gguf_init_empty() -> *mut gguf_context;
+}
+extern "C" {
+    pub fn gguf_init_from_file(
+        fname: *const ::std::os::raw::c_char,
+        params: gguf_init_params,
+    ) -> *mut gguf_context;
+}
+extern "C" {
+    pub fn gguf_free(ctx: *mut gguf_context);
+}
+extern "C" {
+    pub fn gguf_type_name(type_: gguf_type) -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_get_version(ctx: *const gguf_context) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_get_alignment(ctx: *const gguf_context) -> usize;
+}
+extern "C" {
+    pub fn gguf_get_data_offset(ctx: *const gguf_context) -> usize;
+}
+extern "C" {
+    pub fn gguf_get_data(ctx: *const gguf_context) -> *mut ::std::os::raw::c_void;
+}
+extern "C" {
+    pub fn gguf_get_n_kv(ctx: *const gguf_context) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_find_key(
+        ctx: *const gguf_context,
+        key: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_get_key(
+        ctx: *const gguf_context,
+        i: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_get_kv_type(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> gguf_type;
+}
+extern "C" {
+    pub fn gguf_get_arr_type(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> gguf_type;
+}
+extern "C" {
+    pub fn gguf_get_val_u8(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> u8;
+}
+extern "C" {
+    pub fn gguf_get_val_i8(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> i8;
+}
+extern "C" {
+    pub fn gguf_get_val_u16(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> u16;
+}
+extern "C" {
+    pub fn gguf_get_val_i16(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> i16;
+}
+extern "C" {
+    pub fn gguf_get_val_u32(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> u32;
+}
+extern "C" {
+    pub fn gguf_get_val_i32(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> i32;
+}
+extern "C" {
+    pub fn gguf_get_val_f32(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> f32;
+}
+extern "C" {
+    pub fn gguf_get_val_u64(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> u64;
+}
+extern "C" {
+    pub fn gguf_get_val_i64(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> i64;
+}
+extern "C" {
+    pub fn gguf_get_val_f64(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> f64;
+}
+extern "C" {
+    pub fn gguf_get_val_bool(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> bool;
+}
+extern "C" {
+    pub fn gguf_get_val_str(
+        ctx: *const gguf_context,
+        i: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_get_arr_n(
+        ctx: *const gguf_context,
+        i: ::std::os::raw::c_int,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_get_arr_data(
+        ctx: *const gguf_context,
+        i: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_void;
+}
+extern "C" {
+    pub fn gguf_get_arr_str(
+        ctx: *const gguf_context,
+        key_id: ::std::os::raw::c_int,
+        i: ::std::os::raw::c_int,
+    ) -> *const ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_get_n_tensors(ctx: *const gguf_context) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_find_tensor(
+        ctx: *const gguf_context,
+        name: *const ::std::os::raw::c_char,
+    ) -> ::std::os::raw::c_int;
+}
+extern "C" {
+    pub fn gguf_get_tensor_offset(ctx: *const gguf_context, i: ::std::os::raw::c_int) -> usize;
+}
+extern "C" {
+    pub fn gguf_get_tensor_name(
+        ctx: *const gguf_context,
+        i: ::std::os::raw::c_int,
+    ) -> *mut ::std::os::raw::c_char;
+}
+extern "C" {
+    pub fn gguf_set_val_u8(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u8);
+}
+extern "C" {
+    pub fn gguf_set_val_i8(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i8);
+}
+extern "C" {
+    pub fn gguf_set_val_u16(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u16);
+}
+extern "C" {
+    pub fn gguf_set_val_i16(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i16);
+}
+extern "C" {
+    pub fn gguf_set_val_u32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u32);
+}
+extern "C" {
+    pub fn gguf_set_val_i32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i32);
+}
+extern "C" {
+    pub fn gguf_set_val_f32(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: f32);
+}
+extern "C" {
+    pub fn gguf_set_val_u64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: u64);
+}
+extern "C" {
+    pub fn gguf_set_val_i64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: i64);
+}
+extern "C" {
+    pub fn gguf_set_val_f64(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: f64);
+}
+extern "C" {
+    pub fn gguf_set_val_bool(ctx: *mut gguf_context, key: *const ::std::os::raw::c_char, val: bool);
+}
+extern "C" {
+    pub fn gguf_set_val_str(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        val: *const ::std::os::raw::c_char,
+    );
+}
+extern "C" {
+    pub fn gguf_set_arr_data(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        type_: gguf_type,
+        data: *const ::std::os::raw::c_void,
+        n: ::std::os::raw::c_int,
+    );
+}
+extern "C" {
+    pub fn gguf_set_arr_str(
+        ctx: *mut gguf_context,
+        key: *const ::std::os::raw::c_char,
+        data: *mut *const ::std::os::raw::c_char,
+        n: ::std::os::raw::c_int,
+    );
+}
+extern "C" {
+    pub fn gguf_set_kv(ctx: *mut gguf_context, src: *mut gguf_context);
+}
+extern "C" {
+    pub fn gguf_add_tensor(ctx: *mut gguf_context, tensor: *const ggml_tensor);
+}
+extern "C" {
+    pub fn gguf_set_tensor_type(
+        ctx: *mut gguf_context,
+        name: *const ::std::os::raw::c_char,
+        type_: ggml_type,
+    );
+}
+extern "C" {
+    pub fn gguf_set_tensor_data(
+        ctx: *mut gguf_context,
+        name: *const ::std::os::raw::c_char,
+        data: *const ::std::os::raw::c_void,
+        size: usize,
+    );
+}
+extern "C" {
+    pub fn gguf_write_to_file(
+        ctx: *const gguf_context,
+        fname: *const ::std::os::raw::c_char,
+        only_meta: bool,
+    );
+}
+extern "C" {
+    pub fn gguf_get_meta_size(ctx: *const gguf_context) -> usize;
+}
+extern "C" {
+    pub fn gguf_get_meta_data(ctx: *const gguf_context, data: *mut ::std::os::raw::c_void);
+}
 extern "C" {
     pub fn ggml_cpu_has_avx() -> ::std::os::raw::c_int;
 }
@@ -2854,6 +3367,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_cpu_has_arm_fma() -> ::std::os::raw::c_int;
 }
+extern "C" {
+    pub fn ggml_cpu_has_metal() -> ::std::os::raw::c_int;
+}
 extern "C" {
     pub fn ggml_cpu_has_f16c() -> ::std::os::raw::c_int;
 }
@@ -2878,6 +3394,9 @@ extern "C" {
 extern "C" {
     pub fn ggml_cpu_has_sse3() -> ::std::os::raw::c_int;
 }
+extern "C" {
+    pub fn ggml_cpu_has_ssse3() -> ::std::os::raw::c_int;
+}
 extern "C" {
     pub fn ggml_cpu_has_vsx() -> ::std::os::raw::c_int;
 }
@@ -2898,6 +3417,10 @@ pub type ggml_vec_dot_t = ::std::option::Option<
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
 pub struct ggml_type_traits_t {
+    pub type_name: *const ::std::os::raw::c_char,
+    pub blck_size: ::std::os::raw::c_int,
+    pub type_size: usize,
+    pub is_quantized: bool,
     pub to_float: ggml_to_float_t,
     pub from_float: ggml_from_float_t,
     pub from_float_reference: ggml_from_float_t,
@@ -2910,7 +3433,7 @@ fn bindgen_test_layout_ggml_type_traits_t() {
     let ptr = UNINIT.as_ptr();
     assert_eq!(
         ::std::mem::size_of::<ggml_type_traits_t>(),
-        40usize,
+        72usize,
         concat!("Size of: ", stringify!(ggml_type_traits_t))
     );
     assert_eq!(
@@ -2919,8 +3442,48 @@ fn bindgen_test_layout_ggml_type_traits_t() {
         concat!("Alignment of ", stringify!(ggml_type_traits_t))
     );
     assert_eq!(
-        unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize },
+        unsafe { ::std::ptr::addr_of!((*ptr).type_name) as usize - ptr as usize },
         0usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(type_name)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).blck_size) as usize - ptr as usize },
+        8usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(blck_size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).type_size) as usize - ptr as usize },
+        16usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(type_size)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).is_quantized) as usize - ptr as usize },
+        24usize,
+        concat!(
+            "Offset of field: ",
+            stringify!(ggml_type_traits_t),
+            "::",
+            stringify!(is_quantized)
+        )
+    );
+    assert_eq!(
+        unsafe { ::std::ptr::addr_of!((*ptr).to_float) as usize - ptr as usize },
+        32usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_type_traits_t),
@@ -2930,7 +3493,7 @@ fn bindgen_test_layout_ggml_type_traits_t() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).from_float) as usize - ptr as usize },
-        8usize,
+        40usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_type_traits_t),
@@ -2940,7 +3503,7 @@ fn bindgen_test_layout_ggml_type_traits_t() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).from_float_reference) as usize - ptr as usize },
-        16usize,
+        48usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_type_traits_t),
@@ -2950,7 +3513,7 @@ fn bindgen_test_layout_ggml_type_traits_t() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).vec_dot) as usize - ptr as usize },
-        24usize,
+        56usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_type_traits_t),
@@ -2960,7 +3523,7 @@ fn bindgen_test_layout_ggml_type_traits_t() {
     );
     assert_eq!(
         unsafe { ::std::ptr::addr_of!((*ptr).vec_dot_type) as usize - ptr as usize },
-        32usize,
+        64usize,
         concat!(
             "Offset of field: ",
             stringify!(ggml_type_traits_t),
@@ -2970,7 +3533,7 @@ fn bindgen_test_layout_ggml_type_traits_t() {
     );
 }
 extern "C" {
-    pub fn ggml_internal_get_type_traits(i: ggml_type) -> ggml_type_traits_t;
+    pub fn ggml_internal_get_type_traits(type_: ggml_type) -> ggml_type_traits_t;
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
@@ -3513,3 +4076,40 @@ extern "C" {
         hist: *mut i64,
     ) -> usize;
 }
+#[repr(C)]
+#[derive(Debug, Copy, Clone)]
+pub struct ggml_allocr {
+    _unused: [u8; 0],
+}
+extern "C" {
+    pub fn ggml_allocr_new(
+        data: *mut ::std::os::raw::c_void,
+        size: usize,
+        alignment: usize,
+    ) -> *mut ggml_allocr;
+}
+extern "C" {
+    pub fn ggml_allocr_new_measure(alignment: usize) -> *mut ggml_allocr;
+}
+extern "C" {
+    pub fn ggml_allocr_set_parse_seq(
+        alloc: *mut ggml_allocr,
+        list: *const ::std::os::raw::c_int,
+        n: ::std::os::raw::c_int,
+    );
+}
+extern "C" {
+    pub fn ggml_allocr_free(alloc: *mut ggml_allocr);
+}
+extern "C" {
+    pub fn ggml_allocr_is_measure(alloc: *mut ggml_allocr) -> bool;
+}
+extern "C" {
+    pub fn ggml_allocr_reset(alloc: *mut ggml_allocr);
+}
+extern "C" {
+    pub fn ggml_allocr_alloc(alloc: *mut ggml_allocr, tensor: *mut ggml_tensor);
+}
+extern "C" {
+    pub fn ggml_allocr_alloc_graph(alloc: *mut ggml_allocr, graph: *mut ggml_cgraph) -> usize;
+}
diff --git a/crates/ggml/sys/src/llama.rs b/crates/ggml/sys/src/llama.rs
index a8aa42ef..d3552cd9 100644
--- a/crates/ggml/sys/src/llama.rs
+++ b/crates/ggml/sys/src/llama.rs
@@ -1,18 +1,10 @@
 /* automatically generated by rust-bindgen 0.65.1 */
 
 pub const LLAMA_MAX_DEVICES: u32 = 1;
-pub const LLAMA_FILE_MAGIC_GGJT: u32 = 1734830708;
-pub const LLAMA_FILE_MAGIC_GGLA: u32 = 1734831201;
-pub const LLAMA_FILE_MAGIC_GGMF: u32 = 1734831462;
-pub const LLAMA_FILE_MAGIC_GGML: u32 = 1734831468;
+pub const LLAMA_DEFAULT_SEED: u32 = 4294967295;
 pub const LLAMA_FILE_MAGIC_GGSN: u32 = 1734833006;
-pub const LLAMA_FILE_VERSION: u32 = 3;
-pub const LLAMA_FILE_MAGIC: u32 = 1734830708;
-pub const LLAMA_FILE_MAGIC_UNVERSIONED: u32 = 1734831468;
 pub const LLAMA_SESSION_MAGIC: u32 = 1734833006;
 pub const LLAMA_SESSION_VERSION: u32 = 1;
-pub const LLAMA_DEFAULT_SEED: u32 = 4294967295;
-pub const LLAMA_DEFAULT_RMS_EPS: f64 = 0.000005;
 pub const LLAMA_FTYPE_ALL_F32: llama_ftype = 0;
 pub const LLAMA_FTYPE_MOSTLY_F16: llama_ftype = 1;
 pub const LLAMA_FTYPE_MOSTLY_Q4_0: llama_ftype = 2;
@@ -30,4 +22,5 @@ pub const LLAMA_FTYPE_MOSTLY_Q4_K_M: llama_ftype = 15;
 pub const LLAMA_FTYPE_MOSTLY_Q5_K_S: llama_ftype = 16;
 pub const LLAMA_FTYPE_MOSTLY_Q5_K_M: llama_ftype = 17;
 pub const LLAMA_FTYPE_MOSTLY_Q6_K: llama_ftype = 18;
-pub type llama_ftype = ::std::os::raw::c_int;
+pub const LLAMA_FTYPE_GUESSED: llama_ftype = 1024;
+pub type llama_ftype = ::std::os::raw::c_uint;
diff --git a/crates/ggml/sys/src/metal.rs b/crates/ggml/sys/src/metal.rs
index bbd16034..e2d7c621 100644
--- a/crates/ggml/sys/src/metal.rs
+++ b/crates/ggml/sys/src/metal.rs
@@ -1,15 +1,16 @@
 /* automatically generated by rust-bindgen 0.65.1 */
 
+use super::ggml_tensor;
+use super::ggml_log_callback;
+use super::ggml_cgraph;
+
 pub const GGML_METAL_MAX_BUFFERS: u32 = 16;
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct ggml_tensor {
-    _unused: [u8; 0],
-}
-#[repr(C)]
-#[derive(Debug, Copy, Clone)]
-pub struct ggml_cgraph {
-    _unused: [u8; 0],
+pub const GGML_METAL_MAX_COMMAND_BUFFERS: u32 = 32;
+extern "C" {
+    pub fn ggml_metal_log_set_callback(
+        log_callback: ggml_log_callback,
+        user_data: *mut ::std::os::raw::c_void,
+    );
 }
 #[repr(C)]
 #[derive(Debug, Copy, Clone)]
@@ -22,6 +23,12 @@ extern "C" {
 extern "C" {
     pub fn ggml_metal_free(ctx: *mut ggml_metal_context);
 }
+extern "C" {
+    pub fn ggml_metal_host_malloc(n: usize) -> *mut ::std::os::raw::c_void;
+}
+extern "C" {
+    pub fn ggml_metal_host_free(data: *mut ::std::os::raw::c_void);
+}
 extern "C" {
     pub fn ggml_metal_set_n_cb(ctx: *mut ggml_metal_context, n_cb: ::std::os::raw::c_int);
 }
@@ -41,10 +48,17 @@ extern "C" {
     pub fn ggml_metal_get_tensor(ctx: *mut ggml_metal_context, t: *mut ggml_tensor);
 }
 extern "C" {
-    pub fn ggml_metal_graph_find_concurrency(ctx: *mut ggml_metal_context, gf: *mut ggml_cgraph);
+    pub fn ggml_metal_graph_find_concurrency(
+        ctx: *mut ggml_metal_context,
+        gf: *mut ggml_cgraph,
+        check_mem: bool,
+    );
+}
+extern "C" {
+    pub fn ggml_metal_if_optimized(ctx: *mut ggml_metal_context) -> ::std::os::raw::c_int;
 }
 extern "C" {
-    pub fn ggml_metal_if_optimized(ctx: *mut ggml_metal_context) -> bool;
+    pub fn ggml_metal_get_concur_list(ctx: *mut ggml_metal_context) -> *mut ::std::os::raw::c_int;
 }
 extern "C" {
     pub fn ggml_metal_graph_compute(ctx: *mut ggml_metal_context, gf: *mut ggml_cgraph);
diff --git a/crates/llm-base/src/inference_session.rs b/crates/llm-base/src/inference_session.rs
index 493513e6..4a8bfec1 100644
--- a/crates/llm-base/src/inference_session.rs
+++ b/crates/llm-base/src/inference_session.rs
@@ -1,4 +1,4 @@
-use ggml::{Buffer, ComputationGraph, Context, GraphExecutionPlan, Tensor};
+use ggml::{Buffer, ComputationGraph, Context, GraphAllocator, GraphExecutionPlan, Tensor};
 use serde::Serialize;
 use std::{cell::RefCell, fmt::Display, sync::Arc};
 use thiserror::Error;
@@ -12,21 +12,6 @@ use crate::{
     TokenId, TokenUtf8Buffer, TokenizationError,
 };
 
-// The size of a scratch buffer used for inference. This is used for temporary
-// storage of intermediate results during inference.
-//
-// The specific value was copied from `llama.cpp`.
-const SCRATCH_SIZE: usize = 512 * 1024 * 1024;
-
-type ScratchBuffers = [ggml::Buffer; 2];
-
-fn scratch_buffers() -> ScratchBuffers {
-    [
-        ggml::Buffer::new(SCRATCH_SIZE),
-        ggml::Buffer::new(SCRATCH_SIZE),
-    ]
-}
-
 /// Result of graph building
 pub struct GraphOutputs {
     /// The output containing the model's result
@@ -34,6 +19,9 @@ pub struct GraphOutputs {
 
     /// The output containing embeddings
     pub embedding_result: Tensor,
+
+    /// The length of the output
+    pub output_length: usize,
 }
 
 /// An inference session represents the state of the text generation. This holds
@@ -66,7 +54,7 @@ pub struct InferenceSession {
 
     /// How many tokens have been fed into the model's working memory so far.
     #[doc(hidden)]
-    pub n_past: usize,
+    n_past: usize,
 
     /// How much memory is required per token for the temporary context used
     /// during inference.
@@ -90,21 +78,32 @@ pub struct InferenceSession {
 
     n_embd: usize,
 
-    scratch: ScratchBuffers,
+    /// Allocator used by this session
+    allocator: GraphAllocator,
+
+    ///Context size of this session
+    context_size: usize,
+
+    /// Work buffer for graph planing
+    work_buffer: Vec<u8>,
+
+    /// If the session can use the gpu
+    use_gpu: bool,
 }
 
 pub struct BuildContext<'session> {
     //FIXME: Borrowing issue, dont know how to fix it
     pub ctx0: RefCell<&'session mut Context>,
+    pub allocator: RefCell<&'session GraphAllocator>,
     pub embd: &'session Tensor,
     pub memory_k: &'session Tensor,
     pub memory_v: &'session Tensor,
-    pub scratch: &'session ScratchBuffers,
+    pub n_past: usize,
 }
 
 impl<'session> BuildContext<'session> {
-    pub fn get_scratch(&self, idx: usize) -> Option<&Buffer> {
-        Some(&self.scratch[idx])
+    pub fn input_length(&self) -> usize {
+        self.embd.nelements()
     }
 }
 
@@ -124,7 +123,7 @@ impl InferenceSession {
             ..
         } = *params;
 
-        let context_byte_size = {
+        let cache_byte_size = {
             let mut size = 0;
             size += mulf!(
                 context_size,
@@ -138,53 +137,48 @@ impl InferenceSession {
                 n_embd,
                 ggml::type_sizef(config.memory_v_type.into())
             ); // memory_v
-            size += (5 + 10 * n_layer) * 256; // object overhead
+            size += 2 * 1024 * 1024; // overhead
 
             size
         };
 
+        log::info!(
+            "Allocating {:.2} MB for KV-memory",
+            cache_byte_size / (1024 * 1024)
+        );
+
         if use_gpu {
             ggml::accelerator::initialize(0);
-            ggml::accelerator::set_scratch_size(config.n_batch * 1024 * 1024);
+            ggml::accelerator::set_scratch_size(0);
         }
 
         // TODO: revisit this with `Rc`, maybe? We should be able to prove that the session
         // context is only accessed from one thread at a time, but I've already spent enough
         // time on this as-is.
         #[allow(clippy::arc_with_non_send_sync)]
-        let session_ctx = Arc::new(ggml::Context::new_with_allocate(context_byte_size));
+        let session_ctx = Arc::new(ggml::Context::new_with_allocate(cache_byte_size));
 
         // Initialize key + value memory tensors
         let n_mem = n_layer * context_size;
         let n_elements = n_embd * n_mem;
         let (memory_k, memory_v) = kv_memory(&session_ctx, &config, use_gpu, n_elements);
 
-        let scratch = scratch_buffers();
-
-        // Allocate buffer for storing intermediate values during evaluation (ctx0 backing)
-        // For the first run, we need to guess a maximum buffer size so we can measure
-        // the actual memory consumption of the temporary ggml context.
-        //
-        // These numbers are from `llama.cpp`, and could potentially be more efficient.
-        let buf_size = {
-            let buf_size_mb = if n_layer >= 80 {
-                1536
-            } else if n_layer >= 60 {
-                1280
-            } else {
-                1024
-            };
-            buf_size_mb * 1024 * 1024 + ggml::graph_overhead()
-        };
-
+        // Allocate buffer for storing tensor and graph structs
+        let buf_size = ggml::graph_overhead() + (ggml::tensor_overhead() * ggml::MAX_NODES);
         let eval = Buffer::new(buf_size);
-        let ctx0 = ggml::Context::new_with_buffer(eval);
+        log::info!(
+            "Allocating {:.2} MB for eval-context",
+            buf_size / (1024 * 1024)
+        );
+
+        let ctx0 = ggml::Context::new_with_buffer(eval, false);
 
+        let allocator = GraphAllocator::new_measurement(ggml::TENSOR_ALIGNMENT);
         // Set up Metal support
         #[cfg(feature = "metal")]
         let metal_context = {
             if use_gpu {
-                let mut metal_context = MetalContext::new(config.n_threads);
+                let mut metal_context = MetalContext::new();
                 metal_context.add_scratch_buffer(ctx0.storage().as_buffer().unwrap());
 
                 for buf in scratch.iter() {
@@ -199,7 +193,7 @@ impl InferenceSession {
 
         InferenceSession {
             _session_ctx: session_ctx,
-            _memory_size: context_byte_size,
+            _memory_size: cache_byte_size,
             config,
             memory_k,
             memory_v,
@@ -212,7 +206,10 @@ impl InferenceSession {
             metal_context,
             ctx0,
             n_embd,
-            scratch,
+            allocator,
+            context_size,
+            work_buffer: vec![0],
+            use_gpu,
         }
     }
 
@@ -224,24 +221,98 @@ impl InferenceSession {
         builder: F,
     ) -> GraphOutputs
     where
-        F: FnOnce(BuildContext) -> (ComputationGraph, GraphOutputs),
+        F: Fn(BuildContext) -> (ComputationGraph, GraphOutputs),
     {
-        // Build a graph
+        // Check if we need to allocate the graph
+        if self.allocator.in_measuring_mode() {
+            // Build a graph
+            self.ctx0.recreate();
+            let ctx0 = &mut self.ctx0;
+
+            // If we are in measuring mode, we need to build a "worst case" graph, meaning the input has either `batch_size` or `context_size` tokens.
+            let max_n_tokens = self.config.n_batch.min(self.context_size);
+            // We assume the history is full
+            let max_n_past = self.context_size - max_n_tokens;
+            let embd = ctx0
+                .new_tensor_1d(ggml::Type::I32, max_n_tokens)
+                .set_name("embd");
+
+            self.allocator.allocate(&embd);
+
+            let bc = BuildContext {
+                ctx0: RefCell::new(ctx0),
+                allocator: RefCell::new(&self.allocator),
+                embd: &embd,
+                memory_k: &self.memory_k,
+                memory_v: &self.memory_v,
+                n_past: max_n_past,
+            };
+
+            let (mut worst_case_graph, built_result) = builder(bc);
+            // Expand the graph
+            worst_case_graph.build_forward_expand(&built_result.result);
+
+            // Allocate the graph
+            let graph_size =
+                self.allocator.allocate_graph(&worst_case_graph) + ggml::TENSOR_ALIGNMENT;
+            log::info!("Allocating {:.2} MB for graph", graph_size / (1024 * 1024));
+            // Pre-allocate the buffer for future use
+            self.allocator
+                .resize_buffer(graph_size, ggml::TENSOR_ALIGNMENT);
+
+            if self.use_gpu {
+                ggml::accelerator::set_scratch_size(graph_size);
+            }
+        }
+
+        // Reset the context and allocator
         self.ctx0.recreate();
+        self.allocator.reset();
         let ctx0 = &mut self.ctx0;
+
         let mut embd = ctx0
             .new_tensor_1d(ggml::Type::I32, input_tokens.len())
             .set_name("embd");
 
+        self.allocator.allocate(&embd);
+
         let bc = BuildContext {
             ctx0: RefCell::new(ctx0),
+            allocator: RefCell::new(&self.allocator),
             embd: &embd,
             memory_k: &self.memory_k,
             memory_v: &self.memory_v,
-            scratch: &mut self.scratch,
+            n_past: self.n_past,
         };
+
         let (mut built_gf, built_result) = builder(bc);
 
+        // Build the graph
+        built_gf.build_forward_expand(&built_result.result);
+
+        // Allocate the graph
+        self.allocator.allocate_graph(&built_gf);
+
+        #[cfg(feature = "cublas")]
+        {
+            for mut leaf in built_gf.leafs(&ctx0) {
+                if leaf.backend() == ggml::accelerator::Backend::Gpu && !leaf.has_extras() {
+                    unsafe {
+                        let offset = leaf.data().offset_from(self.allocator.buffer.data()) as usize;
+                        leaf.assign_scratch_offset(offset);
+                    }
+                }
+            }
+
+            for mut node in built_gf.nodes(&ctx0) {
+                if node.backend() == ggml::accelerator::Backend::Gpu && !node.has_extras() {
+                    unsafe {
+                        let offset = node.data().offset_from(self.allocator.buffer.data()) as usize;
+                        node.assign_scratch_offset(offset);
+                    }
+                }
+            }
+        }
         // Do Metal'y stuff
         #[cfg(feature = "metal")]
         {
@@ -253,9 +324,6 @@ impl InferenceSession {
         // Write input tokens
         unsafe { embd.write_data(bytemuck::cast_slice(input_tokens)) };
 
-        // Compute the graph
-        built_gf.build_forward_expand(&built_result.result);
-
         #[cfg(feature = "metal")]
         {
             // FIXME can only process one token at a time currently
@@ -276,7 +344,7 @@ impl InferenceSession {
         #[cfg(not(feature = "metal"))]
         {
             let mut plan = GraphExecutionPlan::new(&mut built_gf, self.config.n_threads);
-            plan.execute(ctx0);
+            plan.execute(&mut self.work_buffer);
         }
 
         // Adjust the required memory per token if we didn't know that already
@@ -291,6 +359,7 @@ impl InferenceSession {
         GraphOutputs {
             result: built_result.result.share(),
             embedding_result: built_result.embedding_result.share(),
+            output_length: input_tokens.len(),
         }
     }
 
diff --git a/crates/llm-base/src/loader.rs b/crates/llm-base/src/loader.rs
index d95ed348..2e80495c 100644
--- a/crates/llm-base/src/loader.rs
+++ b/crates/llm-base/src/loader.rs
@@ -32,13 +32,13 @@ pub struct FileType {
 impl From<FileType> for i32 {
     fn from(value: FileType) -> Self {
         (value.quantization_version * ggml::QNT_VERSION_FACTOR) as i32
-            + ggml::sys::llama::llama_ftype::from(value.format)
+            + ggml::sys::llama::llama_ftype::from(value.format) as i32
     }
 }
-impl TryFrom<i32> for FileType {
+impl TryFrom<u32> for FileType {
     type Error = ();
 
-    fn try_from(value: i32) -> Result<Self, Self::Error> {
+    fn try_from(value: u32) -> Result<Self, Self::Error> {
         let format = FileTypeFormat::try_from(
             ((value as u32) % ggml::QNT_VERSION_FACTOR) as ggml::sys::llama::llama_ftype,
         )?;
@@ -252,7 +252,7 @@ pub enum LoadError {
     #[error("unsupported ftype: {0}")]
     /// The `ftype` hyperparameter had an invalid value. This usually means that the format used
     /// by this file is unrecognized by this version of `llm`.
-    UnsupportedFileType(i32),
+    UnsupportedFileType(u32),
     #[error("invalid magic number {magic} for {path:?}")]
     /// An invalid magic number was encountered during the loading process.
     InvalidMagic {
diff --git a/crates/llm-base/src/lora.rs b/crates/llm-base/src/lora.rs
index c6d1d8a2..f433931e 100644
--- a/crates/llm-base/src/lora.rs
+++ b/crates/llm-base/src/lora.rs
@@ -128,8 +128,9 @@ impl LoraAdapter {
         gf.build_forward_expand(&output);
 
         //TODO: maybe pass the model's thread count to this context
+        let mut work_buffer = vec![0u8];
         let mut plan = GraphExecutionPlan::new(&mut gf, 8);
-        plan.execute(&patch_context);
+        plan.execute(&mut work_buffer);
 
         // Overwrite the original tensor.
         // The `output` and the `target_tensor` are not from the same context,
diff --git a/crates/llm-base/src/util.rs b/crates/llm-base/src/util.rs
index e63522a2..70fe2994 100644
--- a/crates/llm-base/src/util.rs
+++ b/crates/llm-base/src/util.rs
@@ -28,7 +28,7 @@ use crate::{FileType, LoadError};
 
 /// Read the filetype from a reader.
 pub fn read_filetype(reader: &mut dyn BufRead) -> Result<FileType, LoadError> {
-    let ftype = read_i32(reader)?;
+    let ftype = read_u32(reader)?;
     FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))
 }
 
diff --git a/crates/models/bert/src/lib.rs b/crates/models/bert/src/lib.rs
index 857ffcbc..9a8daf6e 100644
--- a/crates/models/bert/src/lib.rs
+++ b/crates/models/bert/src/lib.rs
@@ -1,13 +1,13 @@
 //! An implementation of [LLaMA](https://huggingface.co/docs/transformers/model_doc/llama) for the `llm` ecosystem.
 #![deny(missing_docs)]
 
-use std::{error::Error, sync::Arc};
+use std::error::Error;
 
 use llm_base::{
     ggml,
     model::{common, HyperparametersWriteError},
     util, FileType, GraphOutputs, InferenceSession, InferenceSessionConfig, KnownModel, LoadError,
-    ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer,
+    ModelContext, ModelParameters, OutputRequest, Regex, TensorLoader, TokenId, Tokenizer,
 };
 
 /// The BERT model.
@@ -29,7 +29,7 @@ pub struct Bert {
     layers: Vec<Layer>,
 
     // must be kept alive for the model
-    context: Arc<ggml::Context>,
+    context: ModelContext,
 }
 
 unsafe impl Send for Bert {}
@@ -158,7 +158,7 @@ impl KnownModel for Bert {
             params,
             tokenizer,
             layers,
-            context: Arc::new(context),
+            context,
         })
     }
 
@@ -181,7 +181,6 @@ impl KnownModel for Bert {
         output_request: &mut OutputRequest,
     ) {
         let input_len = input_tokens.len();
-        let _session_len = session.n_past;
         let _ctx_size = self.params.context_size;
 
         let Hyperparameters {
@@ -198,7 +197,7 @@ impl KnownModel for Bert {
 
         let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
             let mut ctx0 = builder.ctx0.borrow_mut();
-            let gf = ggml::ComputationGraph::new();
+            let gf = ctx0.create_compute_graph();
 
             let embd = builder.embd;
 
@@ -356,6 +355,7 @@ impl KnownModel for Bert {
                 GraphOutputs {
                     result: input_layer.share(),
                     embedding_result: input_layer.share(),
+                    output_length: input_len,
                 },
             )
         });
diff --git a/crates/models/bloom/src/lib.rs b/crates/models/bloom/src/lib.rs
index efa1f338..fb26ff3d 100644
--- a/crates/models/bloom/src/lib.rs
+++ b/crates/models/bloom/src/lib.rs
@@ -119,8 +119,6 @@ impl KnownModel for Bloom {
         input_tokens: &[TokenId],
         output_request: &mut OutputRequest,
     ) {
-        let input_len = input_tokens.len();
-        let session_len = session.n_past;
         let ctx_size = self.params.context_size;
 
         let Hyperparameters {
@@ -133,6 +131,8 @@ impl KnownModel for Bloom {
         } = self.hyperparameters;
 
         let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
+            let session_len = builder.n_past;
+            let input_len = builder.input_length();
             let ctx0 = builder.ctx0.borrow();
             let (memory_k_size, memory_v_size) = (
                 builder.memory_k.element_size(),
@@ -331,14 +331,25 @@ impl KnownModel for Bloom {
                 GraphOutputs {
                     result: input_layer,
                     embedding_result: embeddings_tensor,
+                    output_length: input_len,
                 },
             )
         });
 
         // finish evaluation
-        common::read_last_token(session, &outputs.result, n_vocab, input_len);
-        common::extract_logits(output_request, &outputs.result, n_vocab, input_len);
-        common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len);
+        common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length);
+        common::extract_logits(
+            output_request,
+            &outputs.result,
+            n_vocab,
+            outputs.output_length,
+        );
+        common::extract_embeddings(
+            output_request,
+            &outputs.embedding_result,
+            n_embd,
+            outputs.output_length,
+        );
     }
 
     fn hyperparameters(&self) -> &Self::Hyperparameters {
diff --git a/crates/models/falcon/src/lib.rs b/crates/models/falcon/src/lib.rs
index 0322e2f2..f9f6c5d7 100644
--- a/crates/models/falcon/src/lib.rs
+++ b/crates/models/falcon/src/lib.rs
@@ -156,8 +156,6 @@ impl KnownModel for Falcon {
         input_tokens: &[TokenId],
         output_request: &mut OutputRequest,
     ) {
-        let input_len = input_tokens.len();
-        let session_len = session.n_past;
         let ctx_size = self.params.context_size;
 
         let Hyperparameters {
@@ -170,9 +168,12 @@ impl KnownModel for Falcon {
         } = self.hyperparameters;
 
         let head_dim = n_embd / n_head;
-        let n = input_len;
 
         let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
+            let input_len = builder.input_length();
+            let n = input_len;
+            let session_len = builder.n_past;
+
             let mut ctx0 = builder.ctx0.borrow_mut();
             let embd = builder.embd;
             let mut input_layer = ctx0.op_get_rows(&self.tok_embeddings, embd);
@@ -192,7 +193,6 @@ impl KnownModel for Falcon {
 
             for il in 0..n_layer {
                 // attention uses first scratch buffer
-                ctx0.use_scratch(builder.get_scratch(0));
                 ctx0.set_offloading(self.params.should_offload(il));
 
                 // self-attention
@@ -319,9 +319,6 @@ impl KnownModel for Falcon {
                 // projection
                 current = ctx0.op_mul_mat(&self.layers[il].wo, &current);
 
-                // feed forward uses second scratch buffer
-                ctx0.use_scratch(builder.get_scratch(1));
-
                 let inp_ff = layernorm_output.share();
                 let attn_out =
                     ctx0.op_cpy(&current, &ctx0.new_tensor_2d(ggml::Type::F32, n_embd, n));
@@ -336,8 +333,6 @@ impl KnownModel for Falcon {
                 input_layer = current.share();
             }
 
-            ctx0.use_scratch(builder.get_scratch(0));
-
             // norm
             input_layer = ctx0.op_norm(&input_layer);
 
@@ -349,7 +344,6 @@ impl KnownModel for Falcon {
             let embeddings_tensor: ggml::Tensor = input_layer.share();
 
             ctx0.set_offloading(false);
-            ctx0.use_scratch(None);
 
             // lm_head
             input_layer = ctx0.op_mul_mat(&self.lm_head, &input_layer);
@@ -359,14 +353,25 @@ impl KnownModel for Falcon {
                 GraphOutputs {
                     result: input_layer,
                     embedding_result: embeddings_tensor,
+                    output_length: n,
                 },
             )
         });
 
         // finish evaluation
-        common::read_last_token(session, &outputs.result, n_vocab, input_len);
-        common::extract_logits(output_request, &outputs.result, n_vocab, input_len);
-        common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len);
+        common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length);
+        common::extract_logits(
+            output_request,
+            &outputs.result,
+            n_vocab,
+            outputs.output_length,
+        );
+        common::extract_embeddings(
+            output_request,
+            &outputs.embedding_result,
+            n_embd,
+            outputs.output_length,
+        );
     }
 
     fn hyperparameters(&self) -> &Self::Hyperparameters {
diff --git a/crates/models/gpt2/src/lib.rs b/crates/models/gpt2/src/lib.rs
index b4434ad5..d06eb1ec 100644
--- a/crates/models/gpt2/src/lib.rs
+++ b/crates/models/gpt2/src/lib.rs
@@ -141,8 +141,6 @@ impl KnownModel for Gpt2 {
         input_tokens: &[TokenId],
         output_request: &mut OutputRequest,
     ) {
-        let input_len = input_tokens.len();
-        let session_len = session.n_past;
         let ctx_size = self.params.context_size;
 
         let Hyperparameters {
@@ -154,6 +152,8 @@ impl KnownModel for Gpt2 {
         } = self.hyperparameters;
 
         let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
+            let input_len = builder.input_length();
+            let session_len = builder.n_past;
             let mut ctx0 = builder.ctx0.borrow_mut();
             let (memory_k_size, memory_v_size) = (
                 builder.memory_k.element_size(),
@@ -174,7 +174,7 @@ impl KnownModel for Gpt2 {
             let mut gf = ctx0.create_compute_graph();
             for il in 0..n_layer {
                 ctx0.set_offloading(self.params.should_offload(il));
-                ctx0.use_scratch(builder.get_scratch(0));
+
                 // norm
                 let mut current = ctx0.op_norm(&input_layer);
                 current = ctx0.op_add(
@@ -281,8 +281,6 @@ impl KnownModel for Gpt2 {
                 // feed-forward
                 let ff_in = current.share();
 
-                ctx0.use_scratch(builder.get_scratch(1));
-
                 // feed-forward normalization
                 current = ctx0.op_norm(&ff_in);
                 current = ctx0.op_add(
@@ -305,13 +303,10 @@ impl KnownModel for Gpt2 {
                 input_layer = ctx0.op_add(&current, &ff_in);
             }
 
-            ctx0.use_scratch(builder.get_scratch(0));
-
             // normalization
             input_layer = ctx0.op_norm(&input_layer);
             input_layer = ctx0.op_add(&ctx0.op_mul(&input_layer, &self.ln_f_g), &self.ln_f_b);
 
-            ctx0.use_scratch(None);
             ctx0.set_offloading(false);
 
             let embeddings_tensor: ggml::Tensor = input_layer.share();
@@ -324,14 +319,25 @@ impl KnownModel for Gpt2 {
                 GraphOutputs {
                     result: input_layer,
                     embedding_result: embeddings_tensor,
+                    output_length: input_len,
                 },
             )
         });
 
         // finish evaluation
-        common::read_last_token(session, &outputs.result, n_vocab, input_len);
-        common::extract_logits(output_request, &outputs.result, n_vocab, input_len);
-        common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len);
+        common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length);
+        common::extract_logits(
+            output_request,
+            &outputs.result,
+            n_vocab,
+            outputs.output_length,
+        );
+        common::extract_embeddings(
+            output_request,
+            &outputs.embedding_result,
+            n_embd,
+            outputs.output_length,
+        );
     }
 
     fn hyperparameters(&self) -> &Self::Hyperparameters {
diff --git a/crates/models/gptj/src/lib.rs b/crates/models/gptj/src/lib.rs
index c013625a..b4ee3d82 100644
--- a/crates/models/gptj/src/lib.rs
+++ b/crates/models/gptj/src/lib.rs
@@ -137,8 +137,6 @@ impl KnownModel for GptJ {
         input_tokens: &[TokenId],
         output_request: &mut OutputRequest,
     ) {
-        let input_len = input_tokens.len();
-        let session_len = session.n_past;
         let ctx_size = self.params.context_size;
 
         let Hyperparameters {
@@ -151,6 +149,9 @@ impl KnownModel for GptJ {
         } = self.hyperparameters;
 
         let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
+            let input_len = builder.input_length();
+            let session_len = builder.n_past;
+
             let mut ctx0 = builder.ctx0.borrow_mut();
             let (memory_k_size, memory_v_size) = (
                 builder.memory_k.element_size(),
@@ -300,14 +301,25 @@ impl KnownModel for GptJ {
                 GraphOutputs {
                     result: input_layer,
                     embedding_result: embeddings_tensor,
+                    output_length: input_len,
                 },
             )
         });
 
         // finish evaluation
-        common::read_last_token(session, &outputs.result, n_vocab, input_len);
-        common::extract_logits(output_request, &outputs.result, n_vocab, input_len);
-        common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len);
+        common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length);
+        common::extract_logits(
+            output_request,
+            &outputs.result,
+            n_vocab,
+            outputs.output_length,
+        );
+        common::extract_embeddings(
+            output_request,
+            &outputs.embedding_result,
+            n_embd,
+            outputs.output_length,
+        );
     }
 
     fn hyperparameters(&self) -> &Self::Hyperparameters {
diff --git a/crates/models/gptneox/src/lib.rs b/crates/models/gptneox/src/lib.rs
index 9075eb01..e355fe22 100644
--- a/crates/models/gptneox/src/lib.rs
+++ b/crates/models/gptneox/src/lib.rs
@@ -159,8 +159,6 @@ impl KnownModel for GptNeoX {
         input_tokens: &[TokenId],
         output_request: &mut OutputRequest,
     ) {
-        let n = input_tokens.len();
-        let n_past = session.n_past;
         let n_ctx = self.params.context_size;
 
         let Hyperparameters {
@@ -174,6 +172,9 @@ impl KnownModel for GptNeoX {
         } = self.hyperparameters;
 
         let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
+            let n = builder.input_length();
+            let n_past = builder.n_past;
+
             let mut ctx0 = builder.ctx0.borrow_mut();
             let embd = builder.embd;
             let mut input_layer = ctx0.op_get_rows(&self.wte, embd);
@@ -186,8 +187,6 @@ impl KnownModel for GptNeoX {
 
             for il in 0..n_layer {
                 ctx0.set_offloading(self.params.should_offload(il));
-                // attention uses first scratch buffer
-                ctx0.use_scratch(builder.get_scratch(0));
 
                 // self-attention
                 let mut current = ctx0.op_norm(&input_layer);
@@ -301,9 +300,6 @@ impl KnownModel for GptNeoX {
                 current = ctx0.op_mul_mat(&self.layers[il].c_attn_proj_w, &current);
                 current = ctx0.op_add(&current, &self.layers[il].c_attn_proj_b);
 
-                // use the second scratch for the feed forward
-                ctx0.use_scratch(builder.get_scratch(1));
-
                 let feedforward_input: Tensor;
                 if !use_parallel_residual {
                     feedforward_input = ctx0.op_add(&current, &input_layer);
@@ -326,9 +322,6 @@ impl KnownModel for GptNeoX {
                 }
             }
 
-            // use the first scratch for the norm
-            ctx0.use_scratch(builder.get_scratch(0));
-
             // normalize the output
             input_layer = ctx0.op_norm(&input_layer);
             // inpL = ln_f_g*inpL + ln_f_b
@@ -336,8 +329,6 @@ impl KnownModel for GptNeoX {
 
             let embeddings_tensor: ggml::Tensor = input_layer.share();
 
-            // Disable the scratchbuffer
-            ctx0.use_scratch(None);
             ctx0.set_offloading(false);
             // apply language model head
             input_layer = ctx0.op_mul_mat(&self.lmh_g, &input_layer);
@@ -347,14 +338,25 @@ impl KnownModel for GptNeoX {
                 GraphOutputs {
                     result: input_layer,
                     embedding_result: embeddings_tensor,
+                    output_length: n,
                 },
             )
         });
 
         // finish evaluation
-        common::read_last_token(session, &outputs.result, n_vocab, n);
-        common::extract_logits(output_request, &outputs.result, n_vocab, n);
-        common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n);
+        common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length);
+        common::extract_logits(
+            output_request,
+            &outputs.result,
+            n_vocab,
+            outputs.output_length,
+        );
+        common::extract_embeddings(
+            output_request,
+            &outputs.embedding_result,
+            n_embd,
+            outputs.output_length,
+        );
     }
 
     fn hyperparameters(&self) -> &Self::Hyperparameters {
diff --git a/crates/models/llama/src/lib.rs b/crates/models/llama/src/lib.rs
index a70f315f..69ab5aa8 100644
--- a/crates/models/llama/src/lib.rs
+++ b/crates/models/llama/src/lib.rs
@@ -147,8 +147,6 @@ impl KnownModel for Llama {
         input_tokens: &[TokenId],
         output_request: &mut OutputRequest,
     ) {
-        let input_len = input_tokens.len();
-        let session_len = session.n_past;
         let ctx_size = self.params.context_size;
 
         let Hyperparameters {
@@ -164,7 +162,11 @@ impl KnownModel for Llama {
         let n_embd_gqa = n_embd / (n_head / n_head_kv);
 
         let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
+            let session_len = builder.n_past;
+            let input_len = builder.input_length();
+
             let mut ctx0 = builder.ctx0.borrow_mut();
+
             let embd = builder.embd;
 
             let mut input_layer = ctx0.op_get_rows(&self.wte, embd);
@@ -177,8 +179,6 @@ impl KnownModel for Llama {
                 let input_self_attention = input_layer.share();
                 let mut current: ggml::Tensor;
 
-                ctx0.use_scratch(builder.get_scratch(0));
-
                 // norm
                 current = ctx0.op_rms_norm(&input_layer);
 
@@ -309,8 +309,6 @@ impl KnownModel for Llama {
                 // projection (no bias)
                 current = ctx0.op_mul_mat(&self.layers[il].wo, &current);
 
-                ctx0.use_scratch(builder.get_scratch(1));
-
                 let input_feed_forward = ctx0.op_add(&current, &input_self_attention);
 
                 // feed-forward network
@@ -337,8 +335,6 @@ impl KnownModel for Llama {
                 input_layer = current;
             }
 
-            ctx0.use_scratch(builder.get_scratch(0));
-
             // norm
             input_layer = ctx0.op_rms_norm(&input_layer);
 
@@ -351,20 +347,30 @@ impl KnownModel for Llama {
             // lm_head
             input_layer = ctx0.op_mul_mat(&self.output, &input_layer);
 
-            ctx0.use_scratch(None);
             (
                 gf,
                 GraphOutputs {
                     result: input_layer,
                     embedding_result,
+                    output_length: input_len,
                 },
             )
         });
 
         // finish evaluation
-        common::read_last_token(session, &outputs.result, n_vocab, input_len);
-        common::extract_logits(output_request, &outputs.result, n_vocab, input_len);
-        common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, input_len);
+        common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length);
+        common::extract_logits(
+            output_request,
+            &outputs.result,
+            n_vocab,
+            outputs.output_length,
+        );
+        common::extract_embeddings(
+            output_request,
+            &outputs.embedding_result,
+            n_embd,
+            outputs.output_length,
+        );
     }
 
     fn hyperparameters(&self) -> &Self::Hyperparameters {
diff --git a/crates/models/mpt/src/lib.rs b/crates/models/mpt/src/lib.rs
index 3d22efff..1e52d2d0 100644
--- a/crates/models/mpt/src/lib.rs
+++ b/crates/models/mpt/src/lib.rs
@@ -96,8 +96,6 @@ impl KnownModel for Mpt {
         input_tokens: &[TokenId],
         output_request: &mut OutputRequest,
     ) {
-        let n = input_tokens.len();
-        let session_len = session.n_past;
         let ctx_size = self.params.context_size;
 
         let Hyperparameters {
@@ -110,6 +108,8 @@ impl KnownModel for Mpt {
         } = self.hyperparameters;
 
         let outputs = session.compute(self.context.clone(), input_tokens, |builder| {
+            let n = builder.input_length();
+            let session_len = builder.n_past;
             let ctx0 = builder.ctx0.borrow();
             let (memory_k_size, memory_v_size) = (
                 builder.memory_k.element_size(),
@@ -123,9 +123,6 @@ impl KnownModel for Mpt {
 
             let mut gf = ctx0.create_compute_graph();
             for il in 0..n_layer {
-                // attention uses first scratch buffer
-                ctx0.use_scratch(builder.get_scratch(0));
-
                 let mut current = ctx0.op_norm(&input_layer);
                 current = ctx0.op_mul(&current, &self.layers[il].norm_1_weight);
 
@@ -213,9 +210,6 @@ impl KnownModel for Mpt {
 
                 input_layer = ctx0.op_add(&input_layer, &current);
 
-                // feed forward uses second scratch buffer
-                ctx0.use_scratch(builder.get_scratch(1));
-
                 current = ctx0.op_norm(&input_layer);
                 current = ctx0.op_mul(&current, &self.layers[il].norm_2_weight);
 
@@ -229,17 +223,12 @@ impl KnownModel for Mpt {
                 input_layer = ctx0.op_add(&input_layer, &current);
             }
 
-            //use scratch buffer 0 for the rest
-            ctx0.use_scratch(builder.get_scratch(0));
-
             // norm
             input_layer = ctx0.op_norm(&input_layer);
             input_layer = ctx0.op_mul(&input_layer, &self.norm);
 
             let embeddings_tensor: ggml::Tensor = input_layer.share();
 
-            // disable scratch buffer for last layer
-            ctx0.use_scratch(None);
             // output embedding weight tied to input embedding
             input_layer = ctx0.op_mul_mat(&self.wte, &input_layer);
 
@@ -248,14 +237,25 @@ impl KnownModel for Mpt {
                 GraphOutputs {
                     result: input_layer,
                     embedding_result: embeddings_tensor,
+                    output_length: n,
                 },
             )
         });
 
         // finish evaluation
-        common::read_last_token(session, &outputs.result, n_vocab, n);
-        common::extract_logits(output_request, &outputs.result, n_vocab, n);
-        common::extract_embeddings(output_request, &outputs.embedding_result, n_embd, n);
+        common::read_last_token(session, &outputs.result, n_vocab, outputs.output_length);
+        common::extract_logits(
+            output_request,
+            &outputs.result,
+            n_vocab,
+            outputs.output_length,
+        );
+        common::extract_embeddings(
+            output_request,
+            &outputs.embedding_result,
+            n_embd,
+            outputs.output_length,
+        );
     }
 
     fn hyperparameters(&self) -> &Self::Hyperparameters {