diff --git a/.vscode/settings.json b/.vscode/settings.json
index ddda313c..ba494b2e 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,3 @@
 {
-    "rust-analyzer.cargo.features": ["convert"]
+    "rust-analyzer.cargo.features": ["convert", "quantize"]
 }
diff --git a/Cargo.lock b/Cargo.lock
index eff66fa1..58b2c4ba 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -295,6 +295,12 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "crunchy"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
+
 [[package]]
 name = "csv"
 version = "1.2.1"
@@ -440,10 +446,11 @@ dependencies = [
 ]
 
 [[package]]
-name = "ggml-loader"
+name = "ggml-format"
 version = "0.1.0"
 dependencies = [
  "ggml",
+ "rand",
  "thiserror",
 ]
 
@@ -466,6 +473,15 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
+[[package]]
+name = "half"
+version = "2.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02b4af3693f1b705df946e9fe5631932443781d0aabb423b62fcd4d73f6d2fd0"
+dependencies = [
+ "crunchy",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.7.2"
@@ -614,7 +630,8 @@ version = "0.1.0"
 dependencies = [
  "bytemuck",
  "ggml",
- "ggml-loader",
+ "ggml-format",
+ "half",
  "memmap2",
  "partial_sort",
  "protobuf",
diff --git a/Cargo.toml b/Cargo.toml
index f579b1c6..20eca429 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 members = [
     "ggml-sys",
     "ggml",
-    "ggml-loader",
+    "ggml-format",
     "llama-rs",
     "llama-cli",
     "generate-ggml-bindings"
diff --git a/ggml-loader/Cargo.toml b/ggml-format/Cargo.toml
similarity index 79%
rename from ggml-loader/Cargo.toml
rename to ggml-format/Cargo.toml
index 2d088758..91daca22 100644
--- a/ggml-loader/Cargo.toml
+++ b/ggml-format/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "ggml-loader"
+name = "ggml-format"
 version = "0.1.0"
 edition = "2021"
 
@@ -8,3 +8,6 @@ edition = "2021"
 [dependencies]
 ggml = { path = "../ggml" }
 thiserror = "1.0"
+
+[dev-dependencies]
+rand = "0.8"
diff --git a/ggml-format/src/lib.rs b/ggml-format/src/lib.rs
new file mode 100644
index 00000000..b26aa0f2
--- /dev/null
+++ b/ggml-format/src/lib.rs
@@ -0,0 +1,45 @@
+#![deny(missing_docs)]
+//! A reader and writer for the `ggml` model format.
+//!
+//! The reader supports the GGML, GGMF and GGJT container formats, but
+//! only single-part models.
+//!
+//! The writer supports the GGJT container format only.
+
+/// Utilities for reading and writing.
+pub mod util;
+
+mod loader;
+mod saver;
+#[cfg(test)]
+mod tests;
+
+pub use loader::{
+    data_size, load_model, LoadError, LoadHandler, PartialHyperparameters, TensorInfo,
+};
+pub use saver::{save_model, SaveError, SaveHandler, TensorData};
+
+/// The type of a tensor element.
+pub type ElementType = ggml::Type;
+
+#[derive(Debug, PartialEq, Clone, Copy)]
+/// The format of the file containing the model.
+pub enum ContainerType {
+    /// `GGML`: the legacy format, and the oldest ggml tensor file format.
+    Ggml,
+    /// `GGMF`: also a legacy format. Introduces versioning. Newer than GGML, older than GGJT.
+    Ggmf,
+    /// `GGJT`: mmap-able format.
+    Ggjt,
+}
+impl ContainerType {
+    /// Does this container type support mmap?
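+    ///
+    /// Only `GGJT` aligns tensor data to 32-byte boundaries, so it is the only
+    /// container format that can be safely mapped straight into memory.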
+    pub fn support_mmap(&self) -> bool {
+        match self {
+            ContainerType::Ggml => false,
+            ContainerType::Ggmf => false,
+            ContainerType::Ggjt => true,
+        }
+    }
+}
diff --git a/ggml-format/src/loader.rs b/ggml-format/src/loader.rs
new file mode 100644
index 00000000..ffc99c9b
--- /dev/null
+++ b/ggml-format/src/loader.rs
@@ -0,0 +1,240 @@
+use std::{
+    error::Error,
+    io::{BufRead, Seek, SeekFrom},
+};
+
+use crate::{
+    util::{has_data_left, read_bytes_with_len, read_f32, read_i32, read_u32},
+    ContainerType, ElementType,
+};
+
+#[derive(Debug, thiserror::Error)]
+/// Errors that can occur while loading a model.
+pub enum LoadError<E: Error> {
+    #[error("invalid file magic number: {0}")]
+    /// The file magic number is invalid.
+    InvalidMagic(u32),
+    #[error("invalid ggml format: format={0:?} version={1}")]
+    /// An unsupported format version was found.
+    InvalidFormatVersion(ContainerType, u32),
+    #[error("non-specific I/O error")]
+    /// A non-specific I/O error.
+    Io(#[from] std::io::Error),
+    #[error("could not convert bytes to a UTF-8 string")]
+    /// One of the strings encountered was not valid UTF-8.
+    InvalidUtf8(#[from] std::string::FromUtf8Error),
+    #[error("invalid integer conversion")]
+    /// One of the integers encountered could not be converted to a more appropriate type.
+    InvalidIntegerConversion(#[from] std::num::TryFromIntError),
+    #[error("implementation error")]
+    /// An error `E` was returned by the implementation of the loader.
+    ImplementationError(#[source] E),
+    #[error("unsupported tensor type {ftype} for tensor {tensor_name}")]
+    /// One of the tensors encountered had an unsupported data type.
+    UnsupportedElementType {
+        /// The name of the tensor.
+        tensor_name: String,
+        /// The format type that was encountered.
+        ftype: i32,
+    },
+    #[error("invariant broken: {0}")]
+    /// An invariant was broken.
+    InvariantBroken(String),
+}
+
+#[derive(Debug, Clone)]
+/// Information about a tensor that is read.
+pub struct TensorInfo {
+    /// The name of the tensor.
+    pub name: String,
+    /// The number of dimensions in the tensor.
+    pub n_dims: usize,
+    /// The dimensions of the tensor.
+    pub dims: [usize; 2],
+    /// The number of elements in the tensor.
+    pub n_elements: usize,
+    /// The type of the elements in the tensor.
+    pub element_type: ElementType,
+    /// The offset of the tensor's data from the start of the file.
+    pub start_offset: u64,
+}
+impl TensorInfo {
+    /// Get the dimensions of the tensor.
+    pub fn dims(&self) -> &[usize] {
+        &self.dims[0..self.n_dims]
+    }
+
+    /// Calculate the size of the tensor's values in bytes.
+    pub fn calc_size(&self) -> usize {
+        data_size(self.element_type, self.dims().iter().product())
+    }
+
+    /// Reads the tensor's data from the given reader in an owned fashion.
+    ///
+    /// The behaviour is undefined if the reader does not correspond to this info.
+    ///
+    /// Do not use this if loading with `mmap`.
+    pub fn read_data<R: BufRead + Seek>(&self, reader: &mut R) -> std::io::Result<Vec<u8>> {
+        let n_bytes = self.n_elements * ggml::type_size(self.element_type);
+        let mut data = vec![0; n_bytes];
+        reader.seek(SeekFrom::Start(self.start_offset))?;
+        reader.read_exact(&mut data)?;
+        Ok(data)
+    }
+}
+
+/// Returns the size occupied by a tensor's data in bytes given the element type and number of elements.
+pub fn data_size(element_type: ElementType, n_elements: usize) -> usize {
+    (ggml::type_size(element_type) * n_elements) / ggml::blck_size(element_type)
+}
+
+#[derive(Debug, Clone)]
+/// Information present within the hyperparameters that is required to continue loading the model.
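+///
+/// The hyperparameters themselves are model-specific, so the loader only
+/// extracts what it needs to keep going (the vocabulary length) and leaves the
+/// rest to the [`LoadHandler`] implementation.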
+pub struct PartialHyperparameters {
+    /// The number of vocabulary tokens.
+    pub n_vocab: usize,
+}
+
+/// A handler for loading a model.
+pub trait LoadHandler<E: Error> {
+    /// Called when the container type is read.
+    fn container_type(&mut self, container_type: ContainerType) -> Result<(), E>;
+    /// Called when a vocabulary token is read.
+    fn vocabulary_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> Result<(), E>;
+    /// Called when the hyperparameters need to be read.
+    /// You must read the hyperparameters for your model here.
+    fn read_hyperparameters(
+        &mut self,
+        reader: &mut dyn BufRead,
+    ) -> Result<PartialHyperparameters, E>;
+    /// Called when a new tensor is found.
+    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), E>;
+}
+
+/// Load a model from a `reader` with the `handler`, which will be called when certain events occur.
+pub fn load_model<E: Error, R: BufRead + Seek>(
+    reader: &mut R,
+    handler: &mut impl LoadHandler<E>,
+) -> Result<(), LoadError<E>> {
+    // Verify magic
+    let container_type: ContainerType = match read_u32(reader)? {
+        ggml::FILE_MAGIC_GGMF => ContainerType::Ggmf,
+        ggml::FILE_MAGIC_GGJT => ContainerType::Ggjt,
+        ggml::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml,
+        magic => return Err(LoadError::InvalidMagic(magic)),
+    };
+    handler
+        .container_type(container_type)
+        .map_err(LoadError::ImplementationError)?;
+
+    // Load format version
+    match container_type {
+        ContainerType::Ggmf | ContainerType::Ggjt => {
+            let _version: u32 = match read_u32(reader)? {
+                ggml::FORMAT_VERSION => ggml::FORMAT_VERSION,
+                version => return Err(LoadError::InvalidFormatVersion(container_type, version)),
+            };
+        }
+        ContainerType::Ggml => {}
+    }
+
+    // Load hyperparameters
+    let hparams = handler
+        .read_hyperparameters(reader)
+        .map_err(LoadError::ImplementationError)?;
+    let n_vocab = hparams.n_vocab;
+
+    // Load vocabulary
+    for i in 0..n_vocab {
+        let len = read_u32(reader)?.try_into()?;
+        let token = read_bytes_with_len(reader, len)?;
+        let token_score = match container_type {
+            ContainerType::Ggmf | ContainerType::Ggjt => read_f32(reader)?,
+            ContainerType::Ggml => {
+                // Legacy model, set empty score
+                0.
+            }
+        };
+        handler
+            .vocabulary_token(i, token, token_score)
+            .map_err(LoadError::ImplementationError)?;
+    }
+
+    // Load tensor data
+    match container_type {
+        ContainerType::Ggmf | ContainerType::Ggml => load_weights(reader, handler, false),
+        ContainerType::Ggjt => load_weights(reader, handler, true),
+    }
+}
+
+/// # Params
+///
+/// `align`
+/// align to 32 bytes before reading tensor weights
+fn load_weights<E: Error, R: BufRead + Seek>(
+    reader: &mut R,
+    handler: &mut impl LoadHandler<E>,
+    align: bool,
+) -> Result<(), LoadError<E>> {
+    while has_data_left(reader)? {
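+        // Tensors are stored back-to-back, so each iteration reads one
+        // tensor's header, name, and data until the reader is exhausted.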
+        // Load tensor header
+        let n_dims: usize = read_i32(reader)?.try_into()?;
+        let name_len = read_i32(reader)?;
+        let ftype = read_i32(reader)?;
+
+        let mut n_elements: usize = 1;
+        let mut dims = [1usize, 1];
+        let ne_len = dims.len();
+        if n_dims > ne_len {
+            return Err(LoadError::InvariantBroken(format!("{n_dims} <= {ne_len}")));
+        }
+
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..n_dims {
+            let dim: usize = read_i32(reader)?.try_into()?;
+            dims[i] = dim;
+            n_elements *= dim;
+        }
+
+        // Load tensor name
+        let name = String::from_utf8(read_bytes_with_len(reader, name_len.try_into()?)?)?;
+        let ftype = ggml::Type::try_from(ftype).map_err(|_| LoadError::UnsupportedElementType {
+            tensor_name: name.clone(),
+            ftype,
+        })?;
+
+        // Sanity check
+        match ftype {
+            ElementType::Q4_0 | ElementType::Q4_1 => {
+                if dims[0] % 64 != 0 {
+                    return Err(LoadError::InvariantBroken(format!("{dims:?}[0] % 64 == 0")));
+                }
+            }
+            _ => {}
+        }
+
+        // Load tensor weights
+        let offset_curr = reader.stream_position()?;
+        let offset_aligned: u64 = if align {
+            (offset_curr + 31) & !31
+        } else {
+            offset_curr
+        };
+
+        let tensor_info = TensorInfo {
+            name,
+            dims,
+            n_dims,
+            n_elements,
+            element_type: ftype,
+            start_offset: offset_aligned,
+        };
+        let n_bytes = tensor_info.calc_size();
+        handler
+            .tensor_buffer(tensor_info)
+            .map_err(LoadError::ImplementationError)?;
+        reader.seek(SeekFrom::Start(offset_aligned + n_bytes as u64))?;
+    }
+
+    Ok(())
+}
diff --git a/ggml-format/src/saver.rs b/ggml-format/src/saver.rs
new file mode 100644
index 00000000..565032a3
--- /dev/null
+++ b/ggml-format/src/saver.rs
@@ -0,0 +1,119 @@
+use std::{
+    error::Error,
+    io::{Seek, Write},
+};
+
+use crate::{util, ElementType};
+
+#[derive(Debug, thiserror::Error)]
+/// Errors that can occur while writing a model.
+pub enum SaveError<E: Error> {
+    #[error("non-specific I/O error")]
+    /// A non-specific I/O error.
+    Io(#[from] std::io::Error),
+    #[error("invalid integer conversion")]
+    /// One of the integers encountered could not be converted to a more appropriate type.
+    InvalidIntegerConversion(#[from] std::num::TryFromIntError),
+    #[error("implementation error")]
+    /// An error `E` was returned by the implementation of the saver.
+    ImplementationError(#[source] E),
+    #[error("invariant broken: {0}")]
+    /// An invariant was broken.
+    InvariantBroken(String),
+}
+
+/// A handler for saving a model.
+pub trait SaveHandler<E: Error> {
+    /// Called when the hyperparameters are to be written.
+    /// You must write the hyperparameters to the given writer.
+    fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), E>;
+
+    /// Called when a tensor is to be written.
+    /// You must return data for the tensor to be saved.
+    fn tensor_data(&mut self, tensor_name: &str) -> Result<TensorData, E>;
+}
+
+/// Information about a tensor that is to be saved.
+#[derive(Clone, PartialEq, Debug)]
+pub struct TensorData {
+    /// The number of dimensions in the tensor.
+    pub n_dims: usize,
+    /// The dimensions of the tensor.
+    pub dims: [usize; 2],
+    /// The type of the elements in the tensor.
+    pub element_type: ElementType,
+    /// The data to save to disk.
+    // TODO: This can be done more efficiently by borrowing the data, but
+    // I wanted to avoid the lifetime parameter for now, especially as
+    // the naive solution would borrow `TensorData` for the lifetime of the
+    // handler, which is obviously not ideal if you're trying to transcode
+    // an existing file tensor-by-tensor.
+    pub data: Vec<u8>,
+}
+
+/// Saves a model to the given writer.
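+///
+/// Tensors are written in the order given in `tensor_names`, with the
+/// `handler` asked for each tensor's data by name.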
+///
+/// Only GGJT is supported.
+pub fn save_model<E: Error, W: Write + Seek>(
+    writer: &mut W,
+    handler: &mut dyn SaveHandler<E>,
+    vocabulary: &[(Vec<u8>, f32)],
+    tensor_names: &[String],
+) -> Result<(), SaveError<E>> {
+    // Write header and hyperparameters
+    util::write_u32(writer, ggml::FILE_MAGIC_GGJT)?;
+    util::write_u32(writer, ggml::FORMAT_VERSION)?;
+    handler
+        .write_hyperparameters(writer)
+        .map_err(SaveError::ImplementationError)?;
+
+    // Write vocabulary
+    for (token, score) in vocabulary {
+        util::write_u32(writer, token.len().try_into()?)?;
+        writer.write_all(token)?;
+        util::write_f32(writer, *score)?;
+    }
+
+    // Write tensors
+    for name in tensor_names {
+        let TensorData {
+            n_dims,
+            dims,
+            element_type,
+            data,
+        } = handler
+            .tensor_data(name)
+            .map_err(SaveError::ImplementationError)?;
+
+        match element_type {
+            ElementType::Q4_0 | ElementType::Q4_1 => {
+                if dims[0] % 64 != 0 {
+                    return Err(SaveError::InvariantBroken(format!("{dims:?}[0] % 64 == 0")));
+                }
+            }
+            _ => {}
+        }
+
+        // Write tensor header
+        util::write_i32(writer, n_dims.try_into()?)?;
+        util::write_i32(writer, name.len().try_into()?)?;
+        util::write_i32(writer, element_type.into())?;
+        for &dim in &dims[0..n_dims] {
+            util::write_i32(writer, dim.try_into()?)?;
+        }
+
+        // Write tensor name
+        writer.write_all(name.as_bytes())?;
+
+        // Align to nearest 32 bytes
+        let offset_curr = writer.stream_position()?;
+        let offset_aligned = (offset_curr + 31) & !31;
+        let padding = usize::try_from(offset_aligned - offset_curr)?;
+        writer.write_all(&vec![0; padding])?;
+
+        // Write tensor data
+        writer.write_all(&data)?;
+    }
+
+    Ok(())
+}
diff --git a/ggml-format/src/tests.rs b/ggml-format/src/tests.rs
new file mode 100644
index 00000000..91d925bb
--- /dev/null
+++ b/ggml-format/src/tests.rs
@@ -0,0 +1,176 @@
+use std::{
+    collections::BTreeMap,
+    error::Error,
+    io::{BufRead, Write},
+};
+
+use crate::*;
+use rand::{distributions::Uniform, prelude::*};
+
+#[derive(Debug)]
+struct DummyError;
+impl std::fmt::Display for DummyError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(&self, f)
+    }
+}
+impl Error for DummyError {}
+
+#[test]
+fn can_roundtrip_loader_and_saver() {
+    let vocabulary = vec![
+        ("blazingly".as_bytes().to_vec(), 0.1),
+        ("fast".as_bytes().to_vec(), 0.2),
+        ("memory".as_bytes().to_vec(), 0.3),
+        ("efficient".as_bytes().to_vec(), 0.4),
+    ];
+
+    let mut rng = rand::thread_rng();
+    let element_type = ggml::Type::F16;
+    let model = Model {
+        hyperparameters: Hyperparameters {
+            some_hyperparameter: random(),
+            some_other_hyperparameter: random(),
+            vocabulary_size: vocabulary.len().try_into().unwrap(),
+        },
+        vocabulary,
+        tensors: (0..10)
+            .map(|i| {
+                let n_dims = Uniform::from(1..3).sample(&mut rng);
+                let dims = (0..n_dims)
+                    .map(|_| Uniform::from(1..10).sample(&mut rng))
+                    .chain(std::iter::repeat(1).take(2 - n_dims))
+                    .collect::<Vec<_>>();
+
+                let n_elements = dims.iter().product::<usize>();
+                let data = (0..data_size(element_type, n_elements))
+                    .map(|_| random())
+                    .collect::<Vec<_>>();
+
+                (
+                    format!("tensor_{}", i),
+                    TensorData {
+                        n_dims,
+                        dims: dims.try_into().unwrap(),
+                        element_type,
+                        data,
+                    },
+                )
+            })
+            .collect(),
+    };
+
+    // Save the model.
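+    // An in-memory cursor stands in for a file, so the roundtrip test does
+    // not touch the filesystem.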
+    let mut buffer = Vec::new();
+    let mut cursor = std::io::Cursor::new(&mut buffer);
+    let mut save_handler = MockSaveHandler { model: &model };
+    save_model(
+        &mut cursor,
+        &mut save_handler,
+        &model.vocabulary,
+        &model.tensors.keys().cloned().collect::<Vec<_>>(),
+    )
+    .unwrap();
+
+    // Load the model and confirm that it is the same as the original.
+    let mut cursor = std::io::Cursor::new(&buffer);
+    let mut load_handler = MockLoadHandler {
+        data: &buffer,
+        loaded_model: Model::default(),
+    };
+    load_model(&mut cursor, &mut load_handler).unwrap();
+    assert_eq!(load_handler.loaded_model, model);
+}
+
+#[derive(Default, PartialEq, Debug)]
+struct Hyperparameters {
+    some_hyperparameter: u32,
+    some_other_hyperparameter: u32,
+    vocabulary_size: u32,
+}
+impl Hyperparameters {
+    fn read(reader: &mut dyn BufRead) -> Result<Self, std::io::Error> {
+        Ok(Self {
+            some_hyperparameter: util::read_u32(reader)?,
+            some_other_hyperparameter: util::read_u32(reader)?,
+            vocabulary_size: util::read_u32(reader)?,
+        })
+    }
+
+    fn write(&self, writer: &mut dyn Write) -> Result<(), std::io::Error> {
+        util::write_u32(writer, self.some_hyperparameter)?;
+        util::write_u32(writer, self.some_other_hyperparameter)?;
+        util::write_u32(writer, self.vocabulary_size)?;
+        Ok(())
+    }
+}
+
+#[derive(Default, PartialEq, Debug)]
+struct Model {
+    hyperparameters: Hyperparameters,
+    vocabulary: Vec<(Vec<u8>, f32)>,
+    tensors: BTreeMap<String, TensorData>,
+}
+
+struct MockSaveHandler<'a> {
+    model: &'a Model,
+}
+impl SaveHandler<DummyError> for MockSaveHandler<'_> {
+    fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), DummyError> {
+        self.model.hyperparameters.write(writer).unwrap();
+        Ok(())
+    }
+
+    fn tensor_data(&mut self, tensor_name: &str) -> Result<TensorData, DummyError> {
+        self.model
+            .tensors
+            .get(tensor_name)
+            .cloned()
+            .ok_or(DummyError)
+    }
+}
+
+struct MockLoadHandler<'a> {
+    data: &'a [u8],
+    loaded_model: Model,
+}
+impl LoadHandler<DummyError> for MockLoadHandler<'_> {
+    fn container_type(&mut self, container_type: ContainerType) -> Result<(), DummyError> {
+        assert_eq!(container_type, ContainerType::Ggjt);
+        Ok(())
+    }
+
+    fn vocabulary_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> Result<(), DummyError> {
+        assert_eq!(i, self.loaded_model.vocabulary.len());
+        self.loaded_model.vocabulary.push((token, score));
+        Ok(())
+    }
+
+    fn read_hyperparameters(
+        &mut self,
+        reader: &mut dyn BufRead,
+    ) -> Result<PartialHyperparameters, DummyError> {
+        self.loaded_model.hyperparameters = Hyperparameters::read(reader).unwrap();
+        Ok(PartialHyperparameters {
+            n_vocab: self
+                .loaded_model
+                .hyperparameters
+                .vocabulary_size
+                .try_into()
+                .unwrap(),
+        })
+    }
+
+    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), DummyError> {
+        let data = TensorData {
+            n_dims: info.n_dims,
+            dims: info.dims,
+            element_type: info.element_type,
+            data: info
+                .read_data(&mut std::io::Cursor::new(self.data))
+                .unwrap(),
+        };
+        self.loaded_model.tensors.insert(info.name, data);
+        Ok(())
+    }
+}
diff --git a/ggml-format/src/util.rs b/ggml-format/src/util.rs
new file mode 100644
index 00000000..ac215feb
--- /dev/null
+++ b/ggml-format/src/util.rs
@@ -0,0 +1,55 @@
+pub use std::fs::File;
+pub use std::io::{BufRead, BufReader, BufWriter, Read, Seek, SeekFrom, Write};
+
+/// Read a fixed-size array of bytes from a reader.
+pub fn read_bytes<const N: usize>(reader: &mut dyn BufRead) -> Result<[u8; N], std::io::Error> {
+    let mut bytes = [0u8; N];
+    reader.read_exact(&mut bytes)?;
+    Ok(bytes)
+}
+
+/// Read an `i32` from a reader.
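+///
+/// All integers and floats in the format are stored little-endian. For
+/// example, reading from an in-memory buffer:
+///
+/// ```
+/// # use ggml_format::util::read_i32;
+/// let mut reader = std::io::Cursor::new(vec![42u8, 0, 0, 0]);
+/// assert_eq!(read_i32(&mut reader).unwrap(), 42);
+/// ```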
+pub fn read_i32(reader: &mut dyn BufRead) -> Result<i32, std::io::Error> {
+    Ok(i32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+/// Read a `u32` from a reader.
+pub fn read_u32(reader: &mut dyn BufRead) -> Result<u32, std::io::Error> {
+    Ok(u32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+/// Read an `f32` from a reader.
+pub fn read_f32(reader: &mut dyn BufRead) -> Result<f32, std::io::Error> {
+    Ok(f32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+/// Read a variable-length array of bytes from a reader.
+pub fn read_bytes_with_len(
+    reader: &mut dyn BufRead,
+    len: usize,
+) -> Result<Vec<u8>, std::io::Error> {
+    let mut bytes = vec![0u8; len];
+    reader.read_exact(&mut bytes)?;
+    Ok(bytes)
+}
+
+/// Write an `i32` to a writer.
+pub fn write_i32(writer: &mut dyn Write, value: i32) -> Result<(), std::io::Error> {
+    writer.write_all(&value.to_le_bytes())
+}
+
+/// Write a `u32` to a writer.
+pub fn write_u32(writer: &mut dyn Write, value: u32) -> Result<(), std::io::Error> {
+    writer.write_all(&value.to_le_bytes())
+}
+
+/// Write an `f32` to a writer.
+pub fn write_f32(writer: &mut dyn Write, value: f32) -> Result<(), std::io::Error> {
+    writer.write_all(&value.to_le_bytes())
+}
+
+// NOTE: Implementation from #![feature(buf_read_has_data_left)]
+/// Check if there is any data left in the reader.
+pub fn has_data_left(reader: &mut impl BufRead) -> Result<bool, std::io::Error> {
+    reader.fill_buf().map(|b| !b.is_empty())
+}
diff --git a/ggml-loader/src/lib.rs b/ggml-loader/src/lib.rs
deleted file mode 100644
index 47239f08..00000000
--- a/ggml-loader/src/lib.rs
+++ /dev/null
@@ -1,244 +0,0 @@
-//! standalone model loader
-//!
-//! Only the hyperparameter is llama-specific. Everything else can be reused for other LLM.
-#![allow(clippy::nonminimal_bool)]
-
-pub mod util;
-
-use std::ops::ControlFlow;
-use util::*;
-
-pub type ElementType = ggml::Type;
-
-/// the format of the file containing the model
-#[derive(Debug, PartialEq, Clone, Copy)]
-#[allow(clippy::upper_case_acronyms)]
-pub enum ContainerType {
-    /// legacy format, oldest ggml tensor file format
-    GGML,
-    /// also legacy format, newer than GGML, older than GGJT
-    GGMF,
-    /// mmap-able format
-    GGJT,
-}
-impl ContainerType {
-    pub fn support_mmap(&self) -> bool {
-        match self {
-            ContainerType::GGML => false,
-            ContainerType::GGMF => false,
-            ContainerType::GGJT => true,
-        }
-    }
-}
-
-#[derive(Debug, thiserror::Error)]
-pub enum LoadError<T> {
-    #[error("invalid file magic number: {0}")]
-    InvalidMagic(u32),
-
-    #[error("invalid ggml format: version={0}")]
-    InvalidFormatVersion(u32),
-
-    #[error("{0}")]
-    Io(#[from] std::io::Error),
-
-    #[error("{0}")]
-    FailedCast(#[from] std::num::TryFromIntError),
-
-    /// return `ControlFlow::Break` from any of the `cb_*` function to trigger this error
-    #[error("user requested interrupt: {0}")]
-    UserInterrupted(T),
-
-    #[error("unsupported tensor dtype/f16_: {0}")]
-    UnsupportedElementType(i32),
-
-    /// sanity check failed
-    #[error("invariant broken: {0}")]
-    InvariantBroken(String),
-}
-
-#[derive(Debug, Clone)]
-pub struct TensorInfo {
-    pub name: Vec<u8>,
-    pub n_dims: usize,
-    pub dims: [usize; 2],
-    pub n_elements: usize,
-    pub element_type: ElementType,
-    /// start of tensor - start of file
-    pub start_offset: u64,
-}
-impl TensorInfo {
-    pub fn calc_size(&self) -> usize {
-        let mut size = ggml::type_size(self.element_type);
-        for &dim in &self.dims[0..self.n_dims] {
-            size *= dim;
-        }
-        size / ggml::blck_size(self.element_type)
-    }
-}
-
-/// Info in hyperparameter used for later loading tasks. Used in callback.
-/// see [`LoadHandler::load_hyper_parameters`]
-#[derive(Debug, Clone)]
-pub struct PartialHyperparameters {
-    pub n_vocab: usize,
-}
-
-pub enum TensorDataTreatment<'a> {
-    CopyInto(&'a mut [u8]),
-    Skip,
-}
-
-#[allow(unused_variables)]
-pub trait LoadHandler<T, R: BufRead> {
-    fn got_container_type(&mut self, container_type: ContainerType) -> ControlFlow<T> {
-        ControlFlow::Continue(())
-    }
-
-    fn got_vocab_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> ControlFlow<T> {
-        ControlFlow::Continue(())
-    }
-
-    fn load_hyper_parameters(&mut self, reader: &mut R) -> ControlFlow<T, PartialHyperparameters>;
-
-    /// callback to get tensor buffer to populate
-    ///
-    /// # Returns
-    ///
-    /// `None` to skip copying
-    /// `Some(buf)` to provide a buffer for copying weights into
-    fn tensor_buffer(&mut self, info: TensorInfo) -> ControlFlow<T, TensorDataTreatment>;
-}
-
-#[test]
-fn can_be_vtable() {
-    use std::mem::MaybeUninit;
-    let _a: MaybeUninit<Box<dyn LoadHandler<(), BufReader<File>>>> = MaybeUninit::uninit();
-}
-
-pub fn load_model_from_reader<T, R: BufRead + Seek>(
-    reader: &mut R,
-    handler: &mut impl LoadHandler<T, R>,
-) -> Result<(), LoadError<T>> {
-    // Verify magic
-    let container_type: ContainerType = match read_u32(reader)? {
-        ggml::FILE_MAGIC_GGMF => ContainerType::GGMF,
-        ggml::FILE_MAGIC_GGJT => ContainerType::GGJT,
-        ggml::FILE_MAGIC_UNVERSIONED => ContainerType::GGML,
-        magic => return Err(LoadError::InvalidMagic(magic)),
-    };
-    controlflow_to_result(handler.got_container_type(container_type))?;
-
-    // Load format version
-    match container_type {
-        ContainerType::GGMF | ContainerType::GGJT => {
-            let _version: u32 = match read_u32(reader)? {
-                ggml::FORMAT_VERSION => ggml::FORMAT_VERSION,
-                version => return Err(LoadError::InvalidFormatVersion(version)),
-            };
-        }
-        ContainerType::GGML => {}
-    }
-
-    // Load hyper params
-    let hparams = controlflow_to_result(handler.load_hyper_parameters(reader))?;
-    let n_vocab = hparams.n_vocab;
-
-    // Load vocabulary
-    for i in 0..n_vocab {
-        let len = read_u32(reader)?.try_into()?;
-        let token = read_bytes_with_len(reader, len)?;
-        let token_score = match container_type {
-            ContainerType::GGMF | ContainerType::GGJT => read_f32(reader)?,
-            ContainerType::GGML => {
-                // Legacy model, set empty score
-                0.
-            }
-        };
-        controlflow_to_result(handler.got_vocab_token(i, token, token_score))?;
-    }
-
-    // Load tensor data
-    match container_type {
-        ContainerType::GGMF | ContainerType::GGML => load_weights(reader, handler, false),
-        ContainerType::GGJT => load_weights(reader, handler, true),
-    }
-}
-
-/// # Params
-///
-/// `align`
-/// align to 4 bytes before reading tensor weights
-pub fn load_weights<T, R: BufRead + Seek>(
-    reader: &mut R,
-    handler: &mut impl LoadHandler<T, R>,
-    align: bool,
-) -> Result<(), LoadError<T>> {
-    while has_data_left(reader)? {
-        // load tensor header
-        let n_dims: usize = read_i32(reader)?.try_into()?;
-        let name_len = read_i32(reader)?;
-        let ftype = read_i32(reader)?;
-        let ftype =
-            ggml::Type::try_from(ftype).map_err(|_| LoadError::UnsupportedElementType(ftype))?;
-
-        let mut n_elements: usize = 1;
-        let mut dims = [1usize, 1];
-        let ne_len = dims.len();
-        if !(n_dims <= ne_len) {
-            return Err(LoadError::InvariantBroken(format!("{n_dims} <= {ne_len}")));
-        }
-        #[allow(clippy::needless_range_loop)]
-        for i in 0..n_dims {
-            let dim: usize = read_i32(reader)?.try_into()?;
-            dims[i] = dim;
-            n_elements *= dim;
-        }
-
-        // load tensor name
-        let name = read_bytes_with_len(reader, name_len.try_into()?)?;
-
-        // sanity check
-        match ftype {
-            ElementType::Q4_0 | ElementType::Q4_1 => {
-                if !(dims[0] % 64 == 0) {
-                    return Err(LoadError::InvariantBroken(format!("{dims:?}[0] % 64 == 0")));
-                }
-            }
-            _ => {}
-        }
-
-        // load tensor weights
-        let offset_curr = reader.stream_position()?;
-        let offset_aligned: u64 = if align {
-            (offset_curr + 31) & !31
-        } else {
-            offset_curr
-        };
-
-        let tensor_info = TensorInfo {
-            name,
-            dims,
-            n_dims,
-            n_elements,
-            element_type: ftype,
-            start_offset: offset_aligned,
-        };
-        let n_bytes = tensor_info.calc_size();
-
-        match controlflow_to_result(handler.tensor_buffer(tensor_info))? {
-            TensorDataTreatment::CopyInto(buf) => {
-                if align {
-                    reader.seek(SeekFrom::Start(offset_aligned))?;
-                }
-                reader.read_exact(buf)?;
-            }
-            TensorDataTreatment::Skip => {
-                // skip if no buffer is given
-                reader.seek(SeekFrom::Start(offset_aligned + n_bytes as u64))?;
-            }
-        }
-    }
-
-    Ok(())
-}
diff --git a/ggml-loader/src/util.rs b/ggml-loader/src/util.rs
deleted file mode 100644
index 9a759aac..00000000
--- a/ggml-loader/src/util.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-pub use std::io::{BufRead, Seek, SeekFrom};
-use std::ops::ControlFlow;
-
-use crate::LoadError;
-
-pub fn read_bytes<const N: usize>(reader: &mut impl BufRead) -> Result<[u8; N], std::io::Error> {
-    let mut bytes = [0u8; N];
-    reader.read_exact(&mut bytes)?;
-    Ok(bytes)
-}
-
-pub fn read_i32(reader: &mut impl BufRead) -> Result<i32, std::io::Error> {
-    Ok(i32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-pub fn read_u32(reader: &mut impl BufRead) -> Result<u32, std::io::Error> {
-    Ok(u32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-pub fn read_f32(reader: &mut impl BufRead) -> Result<f32, std::io::Error> {
-    Ok(f32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-pub fn read_bytes_with_len(
-    reader: &mut impl BufRead,
-    len: usize,
-) -> Result<Vec<u8>, std::io::Error> {
-    let mut bytes = vec![0u8; len];
-    reader.read_exact(&mut bytes)?;
-    Ok(bytes)
-}
-
-// NOTE: Implementation from #![feature(buf_read_has_data_left)]
-pub fn has_data_left(reader: &mut impl BufRead) -> Result<bool, std::io::Error> {
-    reader.fill_buf().map(|b| !b.is_empty())
-}
-
-pub fn controlflow_to_result<A, B>(x: ControlFlow<A, B>) -> Result<B, LoadError<A>> {
-    match x {
-        ControlFlow::Continue(x) => Ok(x),
-        ControlFlow::Break(y) => Err(LoadError::UserInterrupted(y)),
-    }
-}
-
-pub fn result_to_controlflow<A, B, C: From<A>>(x: Result<B, A>) -> ControlFlow<C, B> {
-    match x {
-        Ok(x) => ControlFlow::Continue(x),
-        Err(y) => ControlFlow::Break(y.into()),
-    }
-}
diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs
index 37188a64..6d8905f8 100644
--- a/ggml/src/lib.rs
+++ b/ggml/src/lib.rs
@@ -9,7 +9,7 @@
 //! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed.
 use std::{
-    ffi::c_void,
+    os::raw::{c_int, c_void},
     ptr::NonNull,
     sync::{Arc, Weak},
 };
@@ -272,7 +272,7 @@ impl Context {
     pub unsafe fn op_map_unary(
         &self,
         a: &Tensor,
-        fun: unsafe extern "C" fn(cnt: ::std::os::raw::c_int, dst: *mut f32, src: *const f32),
+        fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src: *const f32),
     ) -> Tensor {
         let tensor =
             unsafe { ggml_sys::ggml_map_unary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), Some(fun)) };
@@ -298,12 +298,7 @@ impl Context {
         &self,
         a: &Tensor,
         b: &Tensor,
-        fun: unsafe extern "C" fn(
-            cnt: ::std::os::raw::c_int,
-            dst: *mut f32,
-            src0: *const f32,
-            src1: *const f32,
-        ),
+        fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src0: *const f32, src1: *const f32),
     ) -> Tensor {
         let tensor = unsafe {
             ggml_sys::ggml_map_binary_f32(
@@ -325,14 +320,8 @@ impl Context {
     }
 
     /// Creates a 2D view over `a`.
-    pub fn op_view_2d(
-        &self,
-        a: &Tensor,
-        ne0: usize,
-        ne1: usize,
-        nb1: usize,
-        offset: usize,
-    ) -> Tensor {
+    pub fn op_view_2d(&self, a: &Tensor, ne: (usize, usize), nb1: usize, offset: usize) -> Tensor {
+        let (ne0, ne1) = ne;
         let tensor = unsafe {
             ggml_sys::ggml_view_2d(
                 self.ptr.as_ptr(),
@@ -347,17 +336,15 @@ impl Context {
     }
 
     /// Creates a 3d view over `a`.
-    #[allow(clippy::too_many_arguments)]
     pub fn op_view_3d(
         &self,
         a: &Tensor,
-        ne0: usize,
-        ne1: usize,
-        ne2: usize,
-        nb1: usize,
-        nb2: usize,
+        ne: (usize, usize, usize),
+        nb: (usize, usize),
         offset: usize,
     ) -> Tensor {
+        let (ne0, ne1, ne2) = ne;
+        let (nb1, nb2) = nb;
         let tensor = unsafe {
             ggml_sys::ggml_view_3d(
                 self.ptr.as_ptr(),
@@ -697,3 +684,53 @@ fn i32_to_usize(val: i32) -> usize {
 fn i64_to_usize(val: i64) -> usize {
     usize::try_from(val).unwrap()
 }
+
+/// Contains the result of a quantization operation.
+pub struct QuantizationResult {
+    /// The quantized output.
+    pub output: Vec<u8>,
+    /// The quantization history.
+    pub history: Vec<i64>,
+}
+
+/// Quantizes `src` into a new buffer using `q4_0` quantization.
+///
+/// You must ensure that `src.len() == n_elements`, and that `n_elements_0`
+/// is the first dimension of `src`.
+pub fn quantize_q4_0(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult {
+    quantize_impl(src, n_elements, n_elements_0, ggml_sys::ggml_quantize_q4_0)
+}
+
+/// Quantizes `src` into a new buffer using `q4_1` quantization.
+///
+/// You must ensure that `src.len() == n_elements`, and that `n_elements_0`
+/// is the first dimension of `src`.
+pub fn quantize_q4_1(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult {
+    quantize_impl(src, n_elements, n_elements_0, ggml_sys::ggml_quantize_q4_1)
+}
+
+fn quantize_impl(
+    src: &[f32],
+    n_elements: usize,
+    n_elements_0: usize,
+    quantizer: unsafe extern "C" fn(*const f32, *mut c_void, c_int, c_int, *mut i64) -> usize,
+) -> QuantizationResult {
+    assert_eq!(src.len(), n_elements);
+    assert_eq!(n_elements % n_elements_0, 0);
+
+    // A conservative multiplier of 4 is used here.
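+    // Quantized output is never larger than the f32 input (4 bytes per
+    // element), so this over-allocates, and the buffer is shrunk to the
+    // size reported by the quantizer afterwards.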
+    let mut output = vec![0u8; n_elements * 4];
+    let mut history = vec![0i64; 16];
+    let output_size = unsafe {
+        quantizer(
+            src.as_ptr(),
+            output.as_mut_ptr() as *mut c_void,
+            n_elements.try_into().unwrap(),
+            n_elements_0.try_into().unwrap(),
+            history.as_mut_ptr(),
+        )
+    };
+
+    output.resize(output_size, 0u8);
+    QuantizationResult { output, history }
+}
diff --git a/llama-cli/Cargo.toml b/llama-cli/Cargo.toml
index d4914b15..15ba6a9c 100644
--- a/llama-cli/Cargo.toml
+++ b/llama-cli/Cargo.toml
@@ -6,7 +6,7 @@ version = {workspace = true}
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-llama-rs = {path = "../llama-rs", features = ["convert"]}
+llama-rs = { path = "../llama-rs", features = ["convert", "quantize"] }
 
 rand = {workspace = true}
diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs
index fc064017..b14a3e73 100644
--- a/llama-cli/src/cli_args.rs
+++ b/llama-cli/src/cli_args.rs
@@ -40,6 +40,9 @@ pub enum Args {
     ///
     /// For reference, see [the PR](https://github.com/rustformers/llama-rs/pull/83).
     Convert(Box<Convert>),
+
+    /// Quantize a GGML model to 4-bit.
+    Quantize(Box<Quantize>),
 }
 
 #[derive(Parser, Debug)]
@@ -244,7 +247,7 @@ pub struct ModelLoad {
     /// Where to load the model path from
     #[arg(long, short = 'm')]
-    pub model_path: String,
+    pub model_path: PathBuf,
 
     /// Sets the size of the context (in tokens). Allows feeding longer prompts.
     /// Note that this affects memory.
@@ -376,7 +379,6 @@ pub struct Convert {
     #[arg(long, short = 't', value_enum, default_value_t = FileType::Q4_0)]
     pub file_type: FileType,
 }
-
 #[derive(Parser, Debug, ValueEnum, Clone, Copy)]
 pub enum FileType {
     /// Quantized 4-bit (type 0).
@@ -398,3 +400,34 @@ impl From<FileType> for llama_rs::FileType {
     }
 }
+
+#[derive(Parser, Debug)]
+pub struct Quantize {
+    /// The path to the model to quantize
+    #[arg()]
+    pub source: PathBuf,
+
+    /// The path to save the quantized model to
+    #[arg()]
+    pub destination: PathBuf,
+
+    /// The format to convert to
+    pub target: QuantizationTarget,
+}
+
+#[derive(Parser, Debug, ValueEnum, Clone, Copy)]
+#[clap(rename_all = "snake_case")]
+pub enum QuantizationTarget {
+    /// Quantized 4-bit (type 0).
+    Q4_0,
+    /// Quantized 4-bit (type 1).
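+    /// Stores a per-block minimum alongside the scale, so it is slightly
+    /// larger on disk than type 0 but usually more accurate.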
+    Q4_1,
+}
+impl From<QuantizationTarget> for llama_rs::ElementType {
+    fn from(t: QuantizationTarget) -> Self {
+        match t {
+            QuantizationTarget::Q4_0 => llama_rs::ElementType::Q4_0,
+            QuantizationTarget::Q4_1 => llama_rs::ElementType::Q4_1,
+        }
+    }
+}
diff --git a/llama-cli/src/main.rs b/llama-cli/src/main.rs
index de8323b0..cc142875 100644
--- a/llama-cli/src/main.rs
+++ b/llama-cli/src/main.rs
@@ -2,7 +2,7 @@ use std::{convert::Infallible, io::Write};
 
 use clap::Parser;
 use cli_args::Args;
-use color_eyre::eyre::Result;
+use color_eyre::eyre::{Context, Result};
 use llama_rs::{convert::convert_pth_to_ggml, InferenceError};
 use rustyline::error::ReadlineError;
@@ -23,6 +23,7 @@ fn main() -> Result<()> {
         Args::Repl(args) => interactive(&args, false)?,
         Args::ChatExperimental(args) => interactive(&args, true)?,
         Args::Convert(args) => convert_pth_to_ggml(&args.directory, args.file_type.into()),
+        Args::Quantize(args) => quantize(&args)?,
     }
 
     Ok(())
@@ -184,6 +185,44 @@ fn interactive(
     Ok(())
 }
 
+fn quantize(args: &cli_args::Quantize) -> Result<()> {
+    use llama_rs::quantize::{quantize, QuantizeProgress::*};
+    quantize(
+        &args.source,
+        &args.destination,
+        args.target.into(),
+        |progress| match progress {
+            HyperparametersLoaded(_) => log::info!("Loaded hyperparameters"),
+            TensorLoading {
+                name,
+                dims,
+                element_type,
+                n_elements,
+            } => log::info!(
+                "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)"
+            ),
+            TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"),
+            TensorQuantized {
+                name,
+                original_size,
+                reduced_size,
+                history,
+            } => log::info!(
+                "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})"
+            ),
+            TensorSkipped { name, size } => log::info!("Skipped tensor `{name}` ({size} bytes)"),
+            Finished {
+                original_size,
+                reduced_size,
+                history,
+            } => log::info!(
+                "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})"
+            ),
+        },
+    )
+    .wrap_err("failed to quantize model")
+}
+
 fn load_prompt_file_with_prompt(
     prompt_file: &cli_args::PromptFile,
     prompt: Option<&str>,
diff --git a/llama-rs/Cargo.toml b/llama-rs/Cargo.toml
index 7ed254a4..7b3e2b6e 100644
--- a/llama-rs/Cargo.toml
+++ b/llama-rs/Cargo.toml
@@ -8,7 +8,7 @@ rust-version = "1.65"
 
 [dependencies]
 ggml = { path = "../ggml" }
-ggml-loader = { path = "../ggml-loader" }
+ggml-format = { path = "../ggml-format" }
 
 rand = { workspace = true }
 
@@ -24,5 +24,9 @@ serde_json = { version = "1.0", optional = true }
 protobuf = { version = "= 2.14.0", optional = true }
 rust_tokenizers = { version = "3.1.2", optional = true }
 
+# Used for the `quantize` feature
+half = { version = "2.2.1", optional = true }
+
 [features]
 convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"]
+quantize = ["dep:half"]
diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 88d26d0c..802229ac 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -5,6 +5,8 @@ use thiserror::Error;
 
 #[cfg(feature = "convert")]
 pub mod convert;
+#[cfg(feature = "quantize")]
+pub mod quantize;
 
 mod inference_session;
 mod loader;
diff --git a/llama-rs/src/loader.rs b/llama-rs/src/loader.rs
index 8b92378e..2b72a517 100644
--- a/llama-rs/src/loader.rs
+++ b/llama-rs/src/loader.rs
@@ -12,8 +12,10 @@ use crate::{
     LoadError, LoadProgress, Model, TokenId, Vocabulary,
 };
 use crate::{ElementType, Hyperparameters};
-use ggml_loader::util::*;
-use ggml_loader::ContainerType;
+use ggml_format::{
+    util::{has_data_left, read_bytes_with_len, read_f32, read_i32, read_u32},
+    ContainerType,
+}; use memmap2::Mmap; pub(crate) fn load( @@ -34,26 +36,33 @@ pub(crate) fn load( let mut reader = BufReader::new(&file); // Verify magic - let model_type: ContainerType = match read_u32(&mut reader)? { - ggml::FILE_MAGIC_GGMF => ContainerType::GGMF, - ggml::FILE_MAGIC_GGJT => ContainerType::GGJT, - ggml::FILE_MAGIC_UNVERSIONED => ContainerType::GGML, + let magic = read_u32(&mut reader)?; + let model_type: ContainerType = match magic { + ggml::FILE_MAGIC_GGMF => ContainerType::Ggmf, + ggml::FILE_MAGIC_GGJT => ContainerType::Ggjt, + ggml::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml, _ => { return Err(LoadError::InvalidMagic { path: main_path.to_owned(), + magic, }) } }; // Load format version match model_type { - ContainerType::GGMF | ContainerType::GGJT => { + ContainerType::Ggmf | ContainerType::Ggjt => { let _version: u32 = match read_u32(&mut reader)? { ggml::FORMAT_VERSION => ggml::FORMAT_VERSION, - version => return Err(LoadError::InvalidFormatVersion { version }), + version => { + return Err(LoadError::InvalidFormatVersion { + container_type: model_type, + version, + }) + } }; } - ContainerType::GGML => {} + ContainerType::Ggml => {} } // ================= @@ -93,8 +102,8 @@ pub(crate) fn load( let token = read_bytes_with_len(&mut reader, len.try_into()?)?; let score = match model_type { - ContainerType::GGMF | ContainerType::GGJT => read_f32(&mut reader)?, - ContainerType::GGML => { + ContainerType::Ggmf | ContainerType::Ggjt => read_f32(&mut reader)?, + ContainerType::Ggml => { // Legacy model, set empty score 0. } @@ -168,7 +177,7 @@ pub(crate) fn load( let mut model = Model::new_loader1(context, hparams, vocabulary, n_ff, wtype, mmap); match model_type { - ContainerType::GGMF | ContainerType::GGML => { + ContainerType::Ggmf | ContainerType::Ggml => { let file_offset = reader.stream_position()?; drop(reader); load_weights_ggmf_or_unversioned( @@ -178,7 +187,7 @@ pub(crate) fn load( model.tensors_mut(), )? 
         }
-        ContainerType::GGJT => {
+        ContainerType::Ggjt => {
             load_weights_ggjt(
                 &mut reader,
                 mmap_ptr,
@@ -243,7 +252,14 @@ fn load_weights_ggmf_or_unversioned(
             let length = read_i32(&mut part_reader)?;
             let ftype = read_i32(&mut part_reader)?;
 
-            let (nelements, ne, tensor_name, tensor, split_type, bpe) = load_tensor_header_ggmf(
+            let TensorHeaderGgmf {
+                nelements,
+                ne,
+                tensor_name,
+                tensor,
+                split_type,
+                bpe,
+            } = load_tensor_header_ggmf(
                 n_dims,
                 &mut part_reader,
                 length,
@@ -341,7 +357,14 @@ fn load_weights_ggmf_or_unversioned(
     Ok(())
 }
 
-#[allow(clippy::type_complexity)]
+struct TensorHeaderGgmf<'a> {
+    nelements: usize,
+    ne: [i64; 2],
+    tensor_name: String,
+    tensor: &'a mut ggml::Tensor,
+    split_type: i32,
+    bpe: usize,
+}
 fn load_tensor_header_ggmf<'a>(
     n_dims: usize,
     reader: &mut impl BufRead,
@@ -350,7 +373,7 @@ fn load_tensor_header_ggmf<'a>(
     path: &Path,
     n_parts: usize,
     ftype: i32,
-) -> Result<(usize, [i64; 2], String, &'a mut ggml::Tensor, i32, usize), LoadError> {
+) -> Result<TensorHeaderGgmf<'a>, LoadError> {
     let mut nelements = 1;
     let mut ne = [1i64, 1i64];
     assert!(n_dims <= ne.len());
@@ -364,13 +387,12 @@ fn load_tensor_header_ggmf<'a>(
     else {
         return Err(LoadError::UnknownTensor { tensor_name, path: path.to_owned() });
     };
-    #[allow(clippy::if_same_then_else)]
     let split_type = if tensor_name.contains("tok_embeddings") {
         0
     } else if tensor_name.contains("layers") {
-        if tensor_name.contains("attention.wo.weight") {
-            0
-        } else if tensor_name.contains("feed_forward.w2.weight") {
+        if tensor_name.contains("attention.wo.weight")
+            || tensor_name.contains("feed_forward.w2.weight")
+        {
             0
         } else {
             1
@@ -417,14 +439,21 @@ fn load_tensor_header_ggmf<'a>(
     let bpe = match bpe {
         Some(x) => x,
         None => {
-            return Err(LoadError::InvalidFtype {
+            return Err(LoadError::UnsupportedElementType {
                 tensor_name,
                 ftype,
                 path: path.to_owned(),
             });
         }
     };
-    Ok((nelements, ne, tensor_name, tensor, split_type, bpe))
+    Ok(TensorHeaderGgmf {
+        nelements,
+        ne,
+        tensor_name,
+        tensor,
+        split_type,
+        bpe,
+    })
 }
 
 fn tensor_type_size(ftype: i32, ne: [i64; 2]) -> Option<usize> {
@@ -496,7 +525,7 @@ fn load_weights_ggjt(
             match tensor_type_size(ftype, ne) {
                 Some(_) => {}
                 None => {
-                    return Err(LoadError::InvalidFtype {
+                    return Err(LoadError::UnsupportedElementType {
                         tensor_name,
                         ftype,
                         path: path.to_owned(),
diff --git a/llama-rs/src/loader2.rs b/llama-rs/src/loader2.rs
index ead8bfb3..aec7377c 100644
--- a/llama-rs/src/loader2.rs
+++ b/llama-rs/src/loader2.rs
@@ -1,12 +1,12 @@
-use ggml_loader::util::*;
-use ggml_loader::*;
+use ggml_format::{
+    util::read_i32, ContainerType, LoadError as FormatLoadError, PartialHyperparameters, TensorInfo,
+};
 use memmap2::Mmap;
 
 use std::{
     collections::HashMap,
     fs::File,
-    io::{BufRead, BufReader, Read, Seek},
-    ops::ControlFlow,
+    io::{BufRead, BufReader, Read, Seek, SeekFrom},
     path::{Path, PathBuf},
 };
 
@@ -16,19 +16,29 @@ use crate::{
 };
 
 impl LoadError {
-    fn from_ggml_loader_error(value: ggml_loader::LoadError<LoadError>, path: PathBuf) -> Self {
+    pub(crate) fn from_format_error(value: FormatLoadError<LoadError>, path: PathBuf) -> Self {
         match value {
-            ggml_loader::LoadError::InvalidMagic(_magic) => LoadError::InvalidMagic { path },
-            ggml_loader::LoadError::InvalidFormatVersion(version) => {
-                LoadError::InvalidFormatVersion { version }
+            FormatLoadError::InvalidMagic(magic) => LoadError::InvalidMagic { path, magic },
+            FormatLoadError::InvalidFormatVersion(container_type, version) => {
+                LoadError::InvalidFormatVersion {
+                    container_type,
+                    version,
+                }
             }
-            ggml_loader::LoadError::Io(err) => LoadError::Io(err),
-            ggml_loader::LoadError::FailedCast(err) => LoadError::InvalidIntegerConversion(err),
-            ggml_loader::LoadError::UserInterrupted(err) => err,
-            ggml_loader::LoadError::UnsupportedElementType(ty) => {
-                LoadError::HyperparametersF16Invalid { ftype: ty }
+            FormatLoadError::Io(err) => LoadError::Io(err),
+            FormatLoadError::InvalidUtf8(err) => LoadError::InvalidUtf8(err),
+            FormatLoadError::InvalidIntegerConversion(err) => {
+                LoadError::InvalidIntegerConversion(err)
+            }
+            FormatLoadError::ImplementationError(err) => err,
+            FormatLoadError::UnsupportedElementType { tensor_name, ftype } => {
+                LoadError::UnsupportedElementType {
+                    path,
+                    tensor_name,
+                    ftype,
+                }
             }
-            ggml_loader::LoadError::InvariantBroken(invariant) => {
+            FormatLoadError::InvariantBroken(invariant) => {
                 LoadError::InvariantBroken { path, invariant }
             }
         }
     }
@@ -62,28 +72,25 @@ pub(crate) fn load(
         total_parts: 1,
     });
 
-    let mut loader = Loader::new(
-        path.clone(),
-        n_context_tokens,
-        prefer_mmap,
-        load_progress_callback,
-    );
-    let use_mmap = loader.mmap_active();
+    let mut loader = Loader::new(n_context_tokens, load_progress_callback);
 
-    ggml_loader::load_model_from_reader(&mut reader, &mut loader)
-        .map_err(|err| LoadError::from_ggml_loader_error(err, path.clone()))?;
+    ggml_format::load_model(&mut reader, &mut loader)
+        .map_err(|err| LoadError::from_format_error(err, path.clone()))?;
 
     let Loader {
         hyperparameters,
         vocabulary,
        tensors,
         mut load_progress_callback,
+        container_type,
         ..
     } = loader;
 
     let Hyperparameters { n_embd, n_mult, .. } = hyperparameters;
     let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;
 
+    let use_mmap = prefer_mmap && container_type.support_mmap();
+
     let ctx_size = tensors
         .values()
         .map(|ti| {
@@ -187,110 +194,75 @@ pub(crate) fn load(
     Ok(model)
 }
 
-struct Loader<F: FnMut(LoadProgress)> {
+pub(crate) struct Loader<F: FnMut(LoadProgress)> {
     // Input
-    path: PathBuf,
     n_ctx: usize,
-    prefer_mmap: bool,
     load_progress_callback: F,
 
     // Output
-    container_type: ContainerType,
-    hyperparameters: Hyperparameters,
-    vocabulary: Vocabulary,
-    tensors: HashMap<String, TensorInfo>,
+    pub(crate) container_type: ContainerType,
+    pub(crate) hyperparameters: Hyperparameters,
+    pub(crate) vocabulary: Vocabulary,
+    pub(crate) tensors: HashMap<String, TensorInfo>,
 }
 impl<F: FnMut(LoadProgress)> Loader<F> {
-    fn new(path: PathBuf, n_ctx: usize, prefer_mmap: bool, load_progress_callback: F) -> Self {
+    pub(crate) fn new(n_ctx: usize, load_progress_callback: F) -> Self {
         Self {
-            path,
             n_ctx,
-            prefer_mmap,
             load_progress_callback,
-            container_type: ContainerType::GGJT,
+            container_type: ContainerType::Ggjt,
             hyperparameters: Hyperparameters::default(),
             vocabulary: Vocabulary::default(),
             tensors: HashMap::default(),
         }
     }
 }
-
-impl<F: FnMut(LoadProgress)> ggml_loader::LoadHandler<LoadError, BufReader<&File>> for Loader<F> {
-    fn load_hyper_parameters(
-        &mut self,
-        reader: &mut BufReader<&File>,
-    ) -> ControlFlow<LoadError, PartialHyperparameters> {
-        let (hyperparameters, partial) = match load_hyperparameters(reader, self.n_ctx) {
-            Ok(t) => t,
-            Err(err) => {
-                return ControlFlow::Break(LoadError::from_ggml_loader_error(
-                    err,
-                    self.path.clone(),
-                ))
-            }
-        };
-        self.hyperparameters = hyperparameters;
-        (self.load_progress_callback)(LoadProgress::HyperparametersLoaded(&self.hyperparameters));
-
-        ControlFlow::Continue(partial)
+impl<F: FnMut(LoadProgress)> ggml_format::LoadHandler<LoadError> for Loader<F> {
+    fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> {
+        self.container_type = container_type;
+        Ok(())
     }
 
-    fn got_container_type(&mut self, t: ContainerType) -> ControlFlow<LoadError> {
-        self.container_type = t;
-        ControlFlow::Continue(())
-    }
-
-    fn got_vocab_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> ControlFlow<LoadError> {
+    fn vocabulary_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> Result<(), LoadError> {
         let id = match TokenId::try_from(i) {
             Ok(id) => id,
-            Err(err) => return ControlFlow::Break(LoadError::InvalidIntegerConversion(err)),
+            Err(err) => return Err(LoadError::InvalidIntegerConversion(err)),
         };
         self.vocabulary.push_token(id, token, score);
-        ControlFlow::Continue(())
+        Ok(())
     }
 
-    fn tensor_buffer(&mut self, info: TensorInfo) -> ControlFlow<LoadError, TensorDataTreatment> {
-        let tensor_name = match String::from_utf8(info.name.clone()) {
-            Ok(n) => n,
-            Err(err) => return ControlFlow::Break(LoadError::InvalidUtf8(err)),
+    fn read_hyperparameters(
+        &mut self,
+        reader: &mut dyn BufRead,
+    ) -> Result<PartialHyperparameters, LoadError> {
+        // NOTE: Field order matters! Data is laid out in the file exactly in this order.
+        let hyperparameters = Hyperparameters {
+            n_vocab: read_i32(reader)?.try_into()?,
+            n_embd: read_i32(reader)?.try_into()?,
+            n_mult: read_i32(reader)?.try_into()?,
+            n_head: read_i32(reader)?.try_into()?,
+            n_layer: read_i32(reader)?.try_into()?,
+            n_rot: read_i32(reader)?.try_into()?,
+            file_type: {
+                let ftype = read_i32(reader)?;
+                FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))?
+            },
+            n_ctx: self.n_ctx,
+        };
+        let partial = PartialHyperparameters {
+            n_vocab: hyperparameters.n_vocab,
         };
+        self.hyperparameters = hyperparameters;
+        (self.load_progress_callback)(LoadProgress::HyperparametersLoaded(&self.hyperparameters));
 
-        self.tensors.insert(tensor_name, info);
-        ControlFlow::Continue(TensorDataTreatment::Skip)
+        Ok(partial)
     }
-}
 
-impl Loader {
-    fn mmap_active(&mut self) -> bool {
-        self.prefer_mmap && self.container_type.support_mmap()
+    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), LoadError> {
+        self.tensors.insert(info.name.clone(), info);
+        Ok(())
     }
 }
-
-/// use this to load params for llama model inside [`LoadHandler::load_hyper_parameters`]
-fn load_hyperparameters<R: BufRead>(
-    reader: &mut R,
-    n_ctx: usize,
-) -> Result<(Hyperparameters, PartialHyperparameters), ggml_loader::LoadError<LoadError>> {
-    // NOTE: Field order matters! Data is laid out in the file exactly in this order.
-    let hparams = Hyperparameters {
-        n_vocab: read_i32(reader)?.try_into()?,
-        n_embd: read_i32(reader)?.try_into()?,
-        n_mult: read_i32(reader)?.try_into()?,
-        n_head: read_i32(reader)?.try_into()?,
-        n_layer: read_i32(reader)?.try_into()?,
-        n_rot: read_i32(reader)?.try_into()?,
-        file_type: {
-            let ftype = read_i32(reader)?;
-            FileType::try_from(ftype).map_err(|_| {
-                ggml_loader::LoadError::UserInterrupted(LoadError::UnsupportedFileType(ftype))
-            })?
-        },
-        n_ctx,
-    };
-    let partial = PartialHyperparameters {
-        n_vocab: hparams.n_vocab,
-    };
-    Ok((hparams, partial))
-}
diff --git a/llama-rs/src/loader_common.rs b/llama-rs/src/loader_common.rs
index fe44da46..a40c45de 100644
--- a/llama-rs/src/loader_common.rs
+++ b/llama-rs/src/loader_common.rs
@@ -3,6 +3,7 @@ use std::{
     path::{Path, PathBuf},
 };
 
+use ggml_format::ContainerType;
 use thiserror::Error;
 
 use crate::{util::FindAllModelFilesError, Hyperparameters};
@@ -152,10 +153,14 @@ pub enum LoadError {
     InvalidMagic {
         /// The path that failed.
         path: PathBuf,
+        /// The magic number that was encountered.
+        magic: u32,
     },
     #[error("invalid file format version {version}")]
     /// The version of the format is not supported by this version of `llama-rs`.
     InvalidFormatVersion {
+        /// The container format that was encountered.
+        container_type: ContainerType,
         /// The version that was encountered.
         version: u32,
     },
@@ -184,7 +189,7 @@ pub enum LoadError {
     },
     /// The tensor `tensor_name` did not have the expected format type.
     #[error("invalid ftype {ftype} for tensor `{tensor_name}` in {path:?}")]
-    InvalidFtype {
+    UnsupportedElementType {
         /// The name of the tensor.
         tensor_name: String,
         /// The format type that was encountered.
diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index d7cfa91f..730afc14 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -340,8 +340,7 @@ impl Model {
 
                 let v = ctx0.op_view_2d(
                     &session.memory_v,
-                    n,
-                    n_embd,
+                    (n, n_embd),
                     n_ctx * memv_elsize,
                     (il * n_ctx) * memv_elsize * n_embd + n_past * memv_elsize,
                 );
@@ -388,11 +387,8 @@ impl Model {
                 // split cached V into n_head heads
                 let v = ctx0.op_view_3d(
                     &session.memory_v,
-                    n_past + n,
-                    n_embd / n_head,
-                    n_head,
-                    n_ctx * memv_elsize,
-                    n_ctx * memv_elsize * n_embd / n_head,
+                    (n_past + n, n_embd / n_head, n_head),
+                    (n_ctx * memv_elsize, n_ctx * memv_elsize * n_embd / n_head),
                     il * n_ctx * memv_elsize * n_embd,
                 );
diff --git a/llama-rs/src/quantize.rs b/llama-rs/src/quantize.rs
new file mode 100644
index 00000000..dd7ec58b
--- /dev/null
+++ b/llama-rs/src/quantize.rs
@@ -0,0 +1,337 @@
+//! Implements quantization of weights.
+
+use crate::{loader2::Loader, Hyperparameters, LoadError, LoadProgress};
+use ggml_format::{util::write_i32, SaveError, SaveHandler, TensorData, TensorInfo};
+use half::f16;
+use std::{
+    collections::HashMap,
+    fs::File,
+    io::{BufReader, BufWriter, Write},
+    path::{Path, PathBuf},
+    sync::Arc,
+};
+use thiserror::Error;
+
+#[derive(Clone, Debug)]
+/// Progress of quantization.
+pub enum QuantizeProgress<'a> {
+    /// Hyperparameters have been loaded.
+    HyperparametersLoaded(&'a Hyperparameters),
+    /// A tensor is being loaded.
+    TensorLoading {
+        /// Name of the tensor.
+        name: &'a str,
+        /// The dimensions of the tensor.
+        dims: [usize; 2],
+        /// Type of the tensor.
+        element_type: ggml::Type,
+        /// Number of elements in the tensor.
+        n_elements: usize,
+    },
+    /// A tensor is being quantized.
+    TensorQuantizing {
+        /// Name of the tensor.
+        name: &'a str,
+    },
+    /// A tensor has been quantized.
+    TensorQuantized {
+        /// Name of the tensor.
+        name: &'a str,
+        /// The original size of the tensor.
+        original_size: usize,
+        /// The reduced size of the tensor.
+        reduced_size: usize,
+        /// The history of the quantization.
+        history: Vec<f32>,
+    },
+    /// A tensor has been skipped.
+    TensorSkipped {
+        /// Name of the tensor.
+        name: &'a str,
+        /// The original size (in bytes) of the tensor data.
+        size: usize,
+    },
+    /// A model has been quantized.
+    Finished {
+        /// The original size of the model.
+        original_size: f32,
+        /// The reduced size of the model.
+        reduced_size: f32,
+        /// The history of the quantization.
+        history: Vec<f32>,
+    },
+}
+
+#[derive(Error, Debug)]
+/// Errors encountered during the quantization process.
+pub enum QuantizeError {
+    #[error("could not load model")]
+    /// There was an error while attempting to load the model.
+    Load(#[from] LoadError),
+    #[error("non-specific I/O error")]
+    /// A non-specific I/O error.
+    Io(#[from] std::io::Error),
+    #[error("could not convert bytes to a UTF-8 string")]
+    /// One of the strings encountered was not valid UTF-8.
+    InvalidUtf8(#[from] std::string::FromUtf8Error),
+    #[error("invalid integer conversion")]
+    /// One of the integers encountered could not be converted to a more appropriate type.
+    InvalidIntegerConversion(#[from] std::num::TryFromIntError),
+    #[error("could not create file {path:?}")]
+    /// A file failed to be created.
+    CreateFileFailed {
+        /// The original error.
+        source: std::io::Error,
+        /// The path that failed.
+        path: PathBuf,
+    },
+    /// An invariant was broken.
+    ///
+    /// This error is not relevant unless `loader2` is being used.
+    #[error("invariant broken: {invariant} in {path:?}")]
+    InvariantBroken {
+        /// The path that failed.
+        path: PathBuf,
+        /// The invariant that was broken.
+        invariant: String,
+    },
+    /// Attempted to quantize to an invalid target.
+    #[error("invalid quantization target {element_type:?}")]
+    InvalidQuantizationTarget {
+        /// The quantization target.
+        element_type: ggml::Type,
+    },
+    /// The quantization process encountered an unsupported element type.
+    #[error("unsupported element type {element_type:?}")]
+    UnsupportedElementType {
+        /// The element type.
+        element_type: ggml::Type,
+    },
+}
+impl QuantizeError {
+    pub(crate) fn from_format_error(value: SaveError<QuantizeError>, path: PathBuf) -> Self {
+        match value {
+            SaveError::Io(io) => QuantizeError::Io(io),
+            SaveError::InvalidIntegerConversion(e) => QuantizeError::InvalidIntegerConversion(e),
+            SaveError::ImplementationError(e) => e,
+            SaveError::InvariantBroken(invariant) => {
+                QuantizeError::InvariantBroken { path, invariant }
+            }
+        }
+    }
+}
+
+/// Quantizes a model.
+pub fn quantize(
+    path_in: impl AsRef<Path>,
+    path_out: impl AsRef<Path>,
+    desired_type: ggml::Type,
+    progress_callback: impl Fn(QuantizeProgress),
+) -> Result<(), QuantizeError> {
+    // Sanity check
+    if !matches!(desired_type, ggml::Type::Q4_0 | ggml::Type::Q4_1) {
+        return Err(QuantizeError::InvalidQuantizationTarget {
+            element_type: desired_type,
+        });
+    }
+
+    // Load the model
+    let progress_callback = Arc::new(progress_callback);
+
+    let path_in = path_in.as_ref();
+    let mut file_in = File::open(path_in).map_err(|e| LoadError::OpenFileFailed {
+        source: e,
+        path: path_in.to_owned(),
+    })?;
+    let mut reader = BufReader::new(&file_in);
+    let mut loader = Loader::new(0, {
+        let progress_callback = progress_callback.clone();
+        move |p| {
+            if let LoadProgress::HyperparametersLoaded(h) = p {
+                progress_callback(QuantizeProgress::HyperparametersLoaded(h))
+            }
+        }
+    });
+    ggml_format::load_model(&mut reader, &mut loader)
+        .map_err(|err| LoadError::from_format_error(err, path_in.to_owned()))?;
+
+    // Save the quantized model, quantizing as we go
+    let Loader {
+        hyperparameters,
+        vocabulary,
+        tensors,
+        ..
+    } = loader;
+
+    let vocabulary = vocabulary
+        .id_to_token
+        .iter()
+        .cloned()
+        .zip(vocabulary.id_to_token_score)
+        .collect::<Vec<_>>();
+
+    let path_out = path_out.as_ref();
+    let mut writer = BufWriter::new(File::create(path_out)?);
+    let mut saver = QuantizeSaver::new(
+        desired_type,
+        &hyperparameters,
+        &tensors,
+        &mut file_in,
+        |p| progress_callback(p),
+    );
+    ggml_format::save_model(
+        &mut writer,
+        &mut saver,
+        &vocabulary,
+        &tensors.keys().cloned().collect::<Vec<_>>(),
+    )
+    .map_err(|err| QuantizeError::from_format_error(err, path_out.to_owned()))?;
+
+    // Final report
+    let sum_all: i64 = saver.history_all.iter().sum();
+    progress_callback(QuantizeProgress::Finished {
+        original_size: saver.total_size_original as f32 / 1024.0 / 1024.0,
+        reduced_size: saver.total_size_new as f32 / 1024.0 / 1024.0,
+        history: saver
+            .history_all
+            .iter()
+            .map(|hist| *hist as f32 / sum_all as f32)
+            .collect(),
+    });
+
+    Ok(())
+}
+
+struct QuantizeSaver<'a, F: Fn(QuantizeProgress)> {
+    // Input
+    quantization_type: ggml::Type,
+    hyperparameters: &'a Hyperparameters,
+    tensors: &'a HashMap<String, TensorInfo>,
+    source_file: &'a mut File,
+    progress_callback: F,
+
+    // Output
+    total_size_original: usize,
+    total_size_new: usize,
+    history_all: Vec<i64>,
+}
+impl<'a, F: Fn(QuantizeProgress)> QuantizeSaver<'a, F> {
+    fn new(
+        quantization_type: ggml::Type,
+        hyperparameters: &'a Hyperparameters,
+        tensors: &'a HashMap<String, TensorInfo>,
+        source_file: &'a mut File,
+        progress_callback: F,
+    ) -> Self {
+        Self {
+            quantization_type,
+            hyperparameters,
+            tensors,
+            source_file,
+            progress_callback,
+
+            total_size_original: 0,
+            total_size_new: 0,
+            history_all: vec![0; 16],
+        }
+    }
+}
+impl<F: Fn(QuantizeProgress)> SaveHandler<QuantizeError> for QuantizeSaver<'_, F> {
+    fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), QuantizeError> {
+        let h = self.hyperparameters;
+        write_i32(writer, h.n_vocab.try_into()?)?;
+        write_i32(writer, h.n_embd.try_into()?)?;
+        write_i32(writer, h.n_mult.try_into()?)?;
+        write_i32(writer, h.n_head.try_into()?)?;
+        write_i32(writer, h.n_layer.try_into()?)?;
+        write_i32(writer, h.n_rot.try_into()?)?;
+        write_i32(writer, h.file_type.into())?;
+        Ok(())
+    }
+
+    fn tensor_data(&mut self, tensor_name: &str) -> Result<TensorData, QuantizeError> {
+        let tensor = self.tensors.get(tensor_name).expect(
+            "tensor not found; should be impossible due to handler being populated from loader",
+        );
+
+        (self.progress_callback)(QuantizeProgress::TensorLoading {
+            name: tensor_name,
+            dims: tensor.dims,
+            n_elements: tensor.n_elements,
+            element_type: tensor.element_type,
+        });
+
+        // Quantize only 2D tensors
+        let quantize = tensor_name.contains("weight") && tensor.n_dims == 2;
+        let raw_data = tensor.read_data(&mut BufReader::new(&mut self.source_file))?;
+
+        if quantize && !matches!(tensor.element_type, ggml::Type::F32 | ggml::Type::F16) {
+            return Err(QuantizeError::UnsupportedElementType {
+                element_type: tensor.element_type,
+            });
+        }
+
+        self.total_size_original += raw_data.len();
+
+        let (element_type, data) = if quantize {
+            (self.progress_callback)(QuantizeProgress::TensorQuantizing { name: tensor_name });
+
+            let data_f32: Vec<f32> = match tensor.element_type {
+                ggml::Type::F32 => raw_data
+                    .chunks_exact(4)
+                    .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap()))
+                    .collect(),
+                ggml::Type::F16 => raw_data
+                    .chunks_exact(2)
+                    .map(|chunk| {
+                        f16::from_bits(u16::from_le_bytes(chunk.try_into().unwrap())).to_f32()
+                    })
+                    .collect(),
+                _ => unreachable!(),
+            };
+
+            let result = match self.quantization_type {
+                ggml::Type::Q4_0 => {
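+                    // `tensor.dims[0]` is the row length; the quantizer
+                    // processes the data row by row, in blocks within each row.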
ggml::quantize_q4_0(&data_f32, tensor.n_elements, tensor.dims[0]) + } + ggml::Type::Q4_1 => { + ggml::quantize_q4_1(&data_f32, tensor.n_elements, tensor.dims[0]) + } + _ => unreachable!(), + }; + let new_data = result.output; + + let mut history_new = vec![]; + for (i, val) in result.history.iter().enumerate() { + self.history_all[i] += val; + history_new.push(*val as f32 / tensor.n_elements as f32); + } + + (self.progress_callback)(QuantizeProgress::TensorQuantized { + name: tensor_name, + original_size: raw_data.len(), + reduced_size: new_data.len(), + history: history_new, + }); + + self.total_size_new += new_data.len(); + + (self.quantization_type, new_data) + } else { + (self.progress_callback)(QuantizeProgress::TensorSkipped { + name: tensor_name, + size: raw_data.len(), + }); + self.total_size_new += raw_data.len(); + (tensor.element_type, raw_data) + }; + + Ok(TensorData { + n_dims: tensor.n_dims, + dims: tensor.dims, + element_type, + data, + }) + } +}