diff --git a/.vscode/settings.json b/.vscode/settings.json
index ddda313c..ba494b2e 100644
--- a/.vscode/settings.json
+++ b/.vscode/settings.json
@@ -1,3 +1,3 @@
 {
-    "rust-analyzer.cargo.features": ["convert"]
+    "rust-analyzer.cargo.features": ["convert", "quantize"]
 }
diff --git a/Cargo.lock b/Cargo.lock
index eff66fa1..58b2c4ba 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -295,6 +295,12 @@ dependencies = [
  "cfg-if",
 ]
 
+[[package]]
+name = "crunchy"
+version = "0.2.2"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "7a81dae078cea95a014a339291cec439d2f232ebe854a9d672b796c6afafa9b7"
+
 [[package]]
 name = "csv"
 version = "1.2.1"
@@ -440,10 +446,11 @@ dependencies = [
 ]
 
 [[package]]
-name = "ggml-loader"
+name = "ggml-format"
 version = "0.1.0"
 dependencies = [
  "ggml",
+ "rand",
  "thiserror",
 ]
 
@@ -466,6 +473,15 @@ version = "0.3.1"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "d2fabcfbdc87f4758337ca535fb41a6d701b65693ce38287d856d1674551ec9b"
 
+[[package]]
+name = "half"
+version = "2.2.1"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "02b4af3693f1b705df946e9fe5631932443781d0aabb423b62fcd4d73f6d2fd0"
+dependencies = [
+ "crunchy",
+]
+
 [[package]]
 name = "hashbrown"
 version = "0.7.2"
@@ -614,7 +630,8 @@ version = "0.1.0"
 dependencies = [
  "bytemuck",
  "ggml",
- "ggml-loader",
+ "ggml-format",
+ "half",
  "memmap2",
  "partial_sort",
  "protobuf",
diff --git a/Cargo.toml b/Cargo.toml
index f579b1c6..20eca429 100644
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -2,7 +2,7 @@
 members = [
     "ggml-sys",
     "ggml",
-    "ggml-loader",
+    "ggml-format",
     "llama-rs",
     "llama-cli",
     "generate-ggml-bindings"
diff --git a/ggml-loader/Cargo.toml b/ggml-format/Cargo.toml
similarity index 79%
rename from ggml-loader/Cargo.toml
rename to ggml-format/Cargo.toml
index 2d088758..91daca22 100644
--- a/ggml-loader/Cargo.toml
+++ b/ggml-format/Cargo.toml
@@ -1,5 +1,5 @@
 [package]
-name = "ggml-loader"
+name = "ggml-format"
 version = "0.1.0"
 edition = "2021"
 
@@ -8,3 +8,6 @@ edition = "2021"
 [dependencies]
 ggml = { path = "../ggml" }
 thiserror = "1.0"
+
+[dev-dependencies]
+rand = "0.8"
diff --git a/ggml-format/src/lib.rs b/ggml-format/src/lib.rs
new file mode 100644
index 00000000..b26aa0f2
--- /dev/null
+++ b/ggml-format/src/lib.rs
@@ -0,0 +1,45 @@
+#![deny(missing_docs)]
+//! A reader and writer for the `ggml` model format.
+//!
+//! The reader supports the GGML, GGMF and GGJT container formats, but
+//! only single-part models.
+//!
+//! The writer supports the GGJT container format only.
+
+/// Utilities for reading and writing.
+pub mod util;
+
+mod loader;
+mod saver;
+#[cfg(test)]
+mod tests;
+
+pub use loader::{
+    data_size, load_model, LoadError, LoadHandler, PartialHyperparameters, TensorInfo,
+};
+pub use saver::{save_model, SaveError, SaveHandler, TensorData};
+
+/// The type of a tensor element.
+pub type ElementType = ggml::Type;
+
+#[derive(Debug, PartialEq, Clone, Copy)]
+/// The format of the file containing the model.
+pub enum ContainerType {
+    /// `GGML`: the legacy format, and the oldest ggml tensor file format.
+    Ggml,
+    /// `GGMF`: also a legacy format. Introduces versioning. Newer than GGML, older than GGJT.
+    Ggmf,
+    /// `GGJT`: mmap-able format.
+    Ggjt,
+}
+impl ContainerType {
+    /// Does this container type support mmap?
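+    ///
+    /// Only `GGJT` aligns tensor data to 32-byte boundaries, so it is the only
+    /// container format that can be safely mapped straight into memory.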
+    pub fn support_mmap(&self) -> bool {
+        match self {
+            ContainerType::Ggml => false,
+            ContainerType::Ggmf => false,
+            ContainerType::Ggjt => true,
+        }
+    }
+}
diff --git a/ggml-format/src/loader.rs b/ggml-format/src/loader.rs
new file mode 100644
index 00000000..ffc99c9b
--- /dev/null
+++ b/ggml-format/src/loader.rs
@@ -0,0 +1,240 @@
+use std::{
+    error::Error,
+    io::{BufRead, Seek, SeekFrom},
+};
+
+use crate::{
+    util::{has_data_left, read_bytes_with_len, read_f32, read_i32, read_u32},
+    ContainerType, ElementType,
+};
+
+#[derive(Debug, thiserror::Error)]
+/// Errors that can occur while loading a model.
+pub enum LoadError<E: Error> {
+    #[error("invalid file magic number: {0}")]
+    /// The file magic number is invalid.
+    InvalidMagic(u32),
+    #[error("invalid ggml format: format={0:?} version={1}")]
+    /// An unsupported format version was found.
+    InvalidFormatVersion(ContainerType, u32),
+    #[error("non-specific I/O error")]
+    /// A non-specific I/O error.
+    Io(#[from] std::io::Error),
+    #[error("could not convert bytes to a UTF-8 string")]
+    /// One of the strings encountered was not valid UTF-8.
+    InvalidUtf8(#[from] std::string::FromUtf8Error),
+    #[error("invalid integer conversion")]
+    /// One of the integers encountered could not be converted to a more appropriate type.
+    InvalidIntegerConversion(#[from] std::num::TryFromIntError),
+    #[error("implementation error")]
+    /// An error `E` was returned by the implementation of the loader.
+    ImplementationError(#[source] E),
+    #[error("unsupported tensor type {ftype} for tensor {tensor_name}")]
+    /// One of the tensors encountered had an unsupported data type.
+    UnsupportedElementType {
+        /// The name of the tensor.
+        tensor_name: String,
+        /// The format type that was encountered.
+        ftype: i32,
+    },
+    #[error("invariant broken: {0}")]
+    /// An invariant was broken.
+    InvariantBroken(String),
+}
+
+#[derive(Debug, Clone)]
+/// Information about a tensor that is read.
+pub struct TensorInfo {
+    /// The name of the tensor.
+    pub name: String,
+    /// The number of dimensions in the tensor.
+    pub n_dims: usize,
+    /// The dimensions of the tensor.
+    pub dims: [usize; 2],
+    /// The number of elements in the tensor.
+    pub n_elements: usize,
+    /// The type of the elements in the tensor.
+    pub element_type: ElementType,
+    /// The offset of the tensor's data from the start of the file.
+    pub start_offset: u64,
+}
+impl TensorInfo {
+    /// Get the dimensions of the tensor.
+    pub fn dims(&self) -> &[usize] {
+        &self.dims[0..self.n_dims]
+    }
+
+    /// Calculate the size of the tensor's values in bytes.
+    pub fn calc_size(&self) -> usize {
+        data_size(self.element_type, self.dims().iter().product())
+    }
+
+    /// Reads the tensor's data from the given reader in an owned fashion.
+    ///
+    /// The behaviour is undefined if the reader does not correspond to this info.
+    ///
+    /// Do not use this if loading with `mmap`.
+    pub fn read_data<R: BufRead + Seek>(&self, reader: &mut R) -> std::io::Result<Vec<u8>> {
+        let n_bytes = self.n_elements * ggml::type_size(self.element_type);
+        let mut data = vec![0; n_bytes];
+        reader.seek(SeekFrom::Start(self.start_offset))?;
+        reader.read_exact(&mut data)?;
+        Ok(data)
+    }
+}
+
+/// Returns the size occupied by a tensor's data in bytes given the element type and number of elements.
+pub fn data_size(element_type: ElementType, n_elements: usize) -> usize {
+    (ggml::type_size(element_type) * n_elements) / ggml::blck_size(element_type)
+}
+
+#[derive(Debug, Clone)]
+/// Information present within the hyperparameters that is required to continue loading the model.
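+///
+/// The hyperparameters themselves are model-specific, so the loader only
+/// extracts what it needs to keep going (the vocabulary length) and leaves the
+/// rest to the [`LoadHandler`] implementation.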
+pub struct PartialHyperparameters {
+    /// The number of vocabulary tokens.
+    pub n_vocab: usize,
+}
+
+/// A handler for loading a model.
+pub trait LoadHandler<E: Error> {
+    /// Called when the container type is read.
+    fn container_type(&mut self, container_type: ContainerType) -> Result<(), E>;
+    /// Called when a vocabulary token is read.
+    fn vocabulary_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> Result<(), E>;
+    /// Called when the hyperparameters need to be read.
+    /// You must read the hyperparameters for your model here.
+    fn read_hyperparameters(
+        &mut self,
+        reader: &mut dyn BufRead,
+    ) -> Result<PartialHyperparameters, E>;
+    /// Called when a new tensor is found.
+    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), E>;
+}
+
+/// Load a model from a `reader` with the `handler`, which will be called when certain events occur.
+pub fn load_model<E: Error, R: BufRead + Seek>(
+    reader: &mut R,
+    handler: &mut impl LoadHandler<E>,
+) -> Result<(), LoadError<E>> {
+    // Verify magic
+    let container_type: ContainerType = match read_u32(reader)? {
+        ggml::FILE_MAGIC_GGMF => ContainerType::Ggmf,
+        ggml::FILE_MAGIC_GGJT => ContainerType::Ggjt,
+        ggml::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml,
+        magic => return Err(LoadError::InvalidMagic(magic)),
+    };
+    handler
+        .container_type(container_type)
+        .map_err(LoadError::ImplementationError)?;
+
+    // Load format version
+    match container_type {
+        ContainerType::Ggmf | ContainerType::Ggjt => {
+            let _version: u32 = match read_u32(reader)? {
+                ggml::FORMAT_VERSION => ggml::FORMAT_VERSION,
+                version => return Err(LoadError::InvalidFormatVersion(container_type, version)),
+            };
+        }
+        ContainerType::Ggml => {}
+    }
+
+    // Load hyperparameters
+    let hparams = handler
+        .read_hyperparameters(reader)
+        .map_err(LoadError::ImplementationError)?;
+    let n_vocab = hparams.n_vocab;
+
+    // Load vocabulary
+    for i in 0..n_vocab {
+        let len = read_u32(reader)?.try_into()?;
+        let token = read_bytes_with_len(reader, len)?;
+        let token_score = match container_type {
+            ContainerType::Ggmf | ContainerType::Ggjt => read_f32(reader)?,
+            ContainerType::Ggml => {
+                // Legacy model, set empty score
+                0.
+            }
+        };
+        handler
+            .vocabulary_token(i, token, token_score)
+            .map_err(LoadError::ImplementationError)?;
+    }
+
+    // Load tensor data
+    match container_type {
+        ContainerType::Ggmf | ContainerType::Ggml => load_weights(reader, handler, false),
+        ContainerType::Ggjt => load_weights(reader, handler, true),
+    }
+}
+
+/// # Params
+///
+/// `align`
+/// align to 32 bytes before reading tensor weights
+fn load_weights<E: Error, R: BufRead + Seek>(
+    reader: &mut R,
+    handler: &mut impl LoadHandler<E>,
+    align: bool,
+) -> Result<(), LoadError<E>> {
+    while has_data_left(reader)? {
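+        // Tensors are stored back-to-back, so each iteration reads one
+        // tensor's header, name, and data until the reader is exhausted.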
+        // Load tensor header
+        let n_dims: usize = read_i32(reader)?.try_into()?;
+        let name_len = read_i32(reader)?;
+        let ftype = read_i32(reader)?;
+
+        let mut n_elements: usize = 1;
+        let mut dims = [1usize, 1];
+        let ne_len = dims.len();
+        if n_dims > ne_len {
+            return Err(LoadError::InvariantBroken(format!("{n_dims} <= {ne_len}")));
+        }
+
+        #[allow(clippy::needless_range_loop)]
+        for i in 0..n_dims {
+            let dim: usize = read_i32(reader)?.try_into()?;
+            dims[i] = dim;
+            n_elements *= dim;
+        }
+
+        // Load tensor name
+        let name = String::from_utf8(read_bytes_with_len(reader, name_len.try_into()?)?)?;
+        let ftype = ggml::Type::try_from(ftype).map_err(|_| LoadError::UnsupportedElementType {
+            tensor_name: name.clone(),
+            ftype,
+        })?;
+
+        // Sanity check
+        match ftype {
+            ElementType::Q4_0 | ElementType::Q4_1 => {
+                if dims[0] % 64 != 0 {
+                    return Err(LoadError::InvariantBroken(format!("{dims:?}[0] % 64 == 0")));
+                }
+            }
+            _ => {}
+        }
+
+        // Load tensor weights
+        let offset_curr = reader.stream_position()?;
+        let offset_aligned: u64 = if align {
+            (offset_curr + 31) & !31
+        } else {
+            offset_curr
+        };
+
+        let tensor_info = TensorInfo {
+            name,
+            dims,
+            n_dims,
+            n_elements,
+            element_type: ftype,
+            start_offset: offset_aligned,
+        };
+        let n_bytes = tensor_info.calc_size();
+        handler
+            .tensor_buffer(tensor_info)
+            .map_err(LoadError::ImplementationError)?;
+        reader.seek(SeekFrom::Start(offset_aligned + n_bytes as u64))?;
+    }
+
+    Ok(())
+}
diff --git a/ggml-format/src/saver.rs b/ggml-format/src/saver.rs
new file mode 100644
index 00000000..565032a3
--- /dev/null
+++ b/ggml-format/src/saver.rs
@@ -0,0 +1,119 @@
+use std::{
+    error::Error,
+    io::{Seek, Write},
+};
+
+use crate::{util, ElementType};
+
+#[derive(Debug, thiserror::Error)]
+/// Errors that can occur while writing a model.
+pub enum SaveError<E: Error> {
+    #[error("non-specific I/O error")]
+    /// A non-specific I/O error.
+    Io(#[from] std::io::Error),
+    #[error("invalid integer conversion")]
+    /// One of the integers encountered could not be converted to a more appropriate type.
+    InvalidIntegerConversion(#[from] std::num::TryFromIntError),
+    #[error("implementation error")]
+    /// An error `E` was returned by the implementation of the saver.
+    ImplementationError(#[source] E),
+    #[error("invariant broken: {0}")]
+    /// An invariant was broken.
+    InvariantBroken(String),
+}
+
+/// A handler for saving a model.
+pub trait SaveHandler<E: Error> {
+    /// Called when the hyperparameters are to be written.
+    /// You must write the hyperparameters to the given writer.
+    fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), E>;
+
+    /// Called when a tensor is to be written.
+    /// You must return data for the tensor to be saved.
+    fn tensor_data(&mut self, tensor_name: &str) -> Result<TensorData, E>;
+}
+
+/// Information about a tensor that is to be saved.
+#[derive(Clone, PartialEq, Debug)]
+pub struct TensorData {
+    /// The number of dimensions in the tensor.
+    pub n_dims: usize,
+    /// The dimensions of the tensor.
+    pub dims: [usize; 2],
+    /// The type of the elements in the tensor.
+    pub element_type: ElementType,
+    /// The data to save to disk.
+    // TODO: This can be done more efficiently by borrowing the data, but
+    // I wanted to avoid the lifetime parameter for now, especially as
+    // the naive solution would borrow `TensorData` for the lifetime of the
+    // handler, which is obviously not ideal if you're trying to transcode
+    // an existing file tensor-by-tensor.
+    pub data: Vec<u8>,
+}
+
+/// Saves a model to the given writer.
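+///
+/// Tensors are written in the order given in `tensor_names`, with the
+/// `handler` asked for each tensor's data by name.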
+///
+/// Only GGJT is supported.
+pub fn save_model<E: Error, W: Write + Seek>(
+    writer: &mut W,
+    handler: &mut dyn SaveHandler<E>,
+    vocabulary: &[(Vec<u8>, f32)],
+    tensor_names: &[String],
+) -> Result<(), SaveError<E>> {
+    // Write header and hyperparameters
+    util::write_u32(writer, ggml::FILE_MAGIC_GGJT)?;
+    util::write_u32(writer, ggml::FORMAT_VERSION)?;
+    handler
+        .write_hyperparameters(writer)
+        .map_err(SaveError::ImplementationError)?;
+
+    // Write vocabulary
+    for (token, score) in vocabulary {
+        util::write_u32(writer, token.len().try_into()?)?;
+        writer.write_all(token)?;
+        util::write_f32(writer, *score)?;
+    }
+
+    // Write tensors
+    for name in tensor_names {
+        let TensorData {
+            n_dims,
+            dims,
+            element_type,
+            data,
+        } = handler
+            .tensor_data(name)
+            .map_err(SaveError::ImplementationError)?;
+
+        match element_type {
+            ElementType::Q4_0 | ElementType::Q4_1 => {
+                if dims[0] % 64 != 0 {
+                    return Err(SaveError::InvariantBroken(format!("{dims:?}[0] % 64 == 0")));
+                }
+            }
+            _ => {}
+        }
+
+        // Write tensor header
+        util::write_i32(writer, n_dims.try_into()?)?;
+        util::write_i32(writer, name.len().try_into()?)?;
+        util::write_i32(writer, element_type.into())?;
+        for &dim in &dims[0..n_dims] {
+            util::write_i32(writer, dim.try_into()?)?;
+        }
+
+        // Write tensor name
+        writer.write_all(name.as_bytes())?;
+
+        // Align to nearest 32 bytes
+        let offset_curr = writer.stream_position()?;
+        let offset_aligned = (offset_curr + 31) & !31;
+        let padding = usize::try_from(offset_aligned - offset_curr)?;
+        writer.write_all(&vec![0; padding])?;
+
+        // Write tensor data
+        writer.write_all(&data)?;
+    }
+
+    Ok(())
+}
diff --git a/ggml-format/src/tests.rs b/ggml-format/src/tests.rs
new file mode 100644
index 00000000..91d925bb
--- /dev/null
+++ b/ggml-format/src/tests.rs
@@ -0,0 +1,176 @@
+use std::{
+    collections::BTreeMap,
+    error::Error,
+    io::{BufRead, Write},
+};
+
+use crate::*;
+use rand::{distributions::Uniform, prelude::*};
+
+#[derive(Debug)]
+struct DummyError;
+impl std::fmt::Display for DummyError {
+    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        std::fmt::Debug::fmt(&self, f)
+    }
+}
+impl Error for DummyError {}
+
+#[test]
+fn can_roundtrip_loader_and_saver() {
+    let vocabulary = vec![
+        ("blazingly".as_bytes().to_vec(), 0.1),
+        ("fast".as_bytes().to_vec(), 0.2),
+        ("memory".as_bytes().to_vec(), 0.3),
+        ("efficient".as_bytes().to_vec(), 0.4),
+    ];
+
+    let mut rng = rand::thread_rng();
+    let element_type = ggml::Type::F16;
+    let model = Model {
+        hyperparameters: Hyperparameters {
+            some_hyperparameter: random(),
+            some_other_hyperparameter: random(),
+            vocabulary_size: vocabulary.len().try_into().unwrap(),
+        },
+        vocabulary,
+        tensors: (0..10)
+            .map(|i| {
+                let n_dims = Uniform::from(1..3).sample(&mut rng);
+                let dims = (0..n_dims)
+                    .map(|_| Uniform::from(1..10).sample(&mut rng))
+                    .chain(std::iter::repeat(1).take(2 - n_dims))
+                    .collect::<Vec<_>>();
+
+                let n_elements = dims.iter().product::<usize>();
+                let data = (0..data_size(element_type, n_elements))
+                    .map(|_| random())
+                    .collect::<Vec<_>>();
+
+                (
+                    format!("tensor_{}", i),
+                    TensorData {
+                        n_dims,
+                        dims: dims.try_into().unwrap(),
+                        element_type,
+                        data,
+                    },
+                )
+            })
+            .collect(),
+    };
+
+    // Save the model.
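+    // An in-memory cursor stands in for a file, so the roundtrip test does
+    // not touch the filesystem.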
+    let mut buffer = Vec::new();
+    let mut cursor = std::io::Cursor::new(&mut buffer);
+    let mut save_handler = MockSaveHandler { model: &model };
+    save_model(
+        &mut cursor,
+        &mut save_handler,
+        &model.vocabulary,
+        &model.tensors.keys().cloned().collect::<Vec<_>>(),
+    )
+    .unwrap();
+
+    // Load the model and confirm that it is the same as the original.
+    let mut cursor = std::io::Cursor::new(&buffer);
+    let mut load_handler = MockLoadHandler {
+        data: &buffer,
+        loaded_model: Model::default(),
+    };
+    load_model(&mut cursor, &mut load_handler).unwrap();
+    assert_eq!(load_handler.loaded_model, model);
+}
+
+#[derive(Default, PartialEq, Debug)]
+struct Hyperparameters {
+    some_hyperparameter: u32,
+    some_other_hyperparameter: u32,
+    vocabulary_size: u32,
+}
+impl Hyperparameters {
+    fn read(reader: &mut dyn BufRead) -> Result<Self, std::io::Error> {
+        Ok(Self {
+            some_hyperparameter: util::read_u32(reader)?,
+            some_other_hyperparameter: util::read_u32(reader)?,
+            vocabulary_size: util::read_u32(reader)?,
+        })
+    }
+
+    fn write(&self, writer: &mut dyn Write) -> Result<(), std::io::Error> {
+        util::write_u32(writer, self.some_hyperparameter)?;
+        util::write_u32(writer, self.some_other_hyperparameter)?;
+        util::write_u32(writer, self.vocabulary_size)?;
+        Ok(())
+    }
+}
+
+#[derive(Default, PartialEq, Debug)]
+struct Model {
+    hyperparameters: Hyperparameters,
+    vocabulary: Vec<(Vec<u8>, f32)>,
+    tensors: BTreeMap<String, TensorData>,
+}
+
+struct MockSaveHandler<'a> {
+    model: &'a Model,
+}
+impl SaveHandler<DummyError> for MockSaveHandler<'_> {
+    fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), DummyError> {
+        self.model.hyperparameters.write(writer).unwrap();
+        Ok(())
+    }
+
+    fn tensor_data(&mut self, tensor_name: &str) -> Result<TensorData, DummyError> {
+        self.model
+            .tensors
+            .get(tensor_name)
+            .cloned()
+            .ok_or(DummyError)
+    }
+}
+
+struct MockLoadHandler<'a> {
+    data: &'a [u8],
+    loaded_model: Model,
+}
+impl LoadHandler<DummyError> for MockLoadHandler<'_> {
+    fn container_type(&mut self, container_type: ContainerType) -> Result<(), DummyError> {
+        assert_eq!(container_type, ContainerType::Ggjt);
+        Ok(())
+    }
+
+    fn vocabulary_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> Result<(), DummyError> {
+        assert_eq!(i, self.loaded_model.vocabulary.len());
+        self.loaded_model.vocabulary.push((token, score));
+        Ok(())
+    }
+
+    fn read_hyperparameters(
+        &mut self,
+        reader: &mut dyn BufRead,
+    ) -> Result<PartialHyperparameters, DummyError> {
+        self.loaded_model.hyperparameters = Hyperparameters::read(reader).unwrap();
+        Ok(PartialHyperparameters {
+            n_vocab: self
+                .loaded_model
+                .hyperparameters
+                .vocabulary_size
+                .try_into()
+                .unwrap(),
+        })
+    }
+
+    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), DummyError> {
+        let data = TensorData {
+            n_dims: info.n_dims,
+            dims: info.dims,
+            element_type: info.element_type,
+            data: info
+                .read_data(&mut std::io::Cursor::new(self.data))
+                .unwrap(),
+        };
+        self.loaded_model.tensors.insert(info.name, data);
+        Ok(())
+    }
+}
diff --git a/ggml-format/src/util.rs b/ggml-format/src/util.rs
new file mode 100644
index 00000000..ac215feb
--- /dev/null
+++ b/ggml-format/src/util.rs
@@ -0,0 +1,55 @@
+pub use std::fs::File;
+pub use std::io::{BufRead, BufReader, BufWriter, Read, Seek, SeekFrom, Write};
+
+/// Read a fixed-size array of bytes from a reader.
+pub fn read_bytes<const N: usize>(reader: &mut dyn BufRead) -> Result<[u8; N], std::io::Error> {
+    let mut bytes = [0u8; N];
+    reader.read_exact(&mut bytes)?;
+    Ok(bytes)
+}
+
+/// Read an `i32` from a reader.
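+///
+/// All integers and floats in the format are stored little-endian. For
+/// example, reading from an in-memory buffer:
+///
+/// ```
+/// # use ggml_format::util::read_i32;
+/// let mut reader = std::io::Cursor::new(vec![42u8, 0, 0, 0]);
+/// assert_eq!(read_i32(&mut reader).unwrap(), 42);
+/// ```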
+pub fn read_i32(reader: &mut dyn BufRead) -> Result<i32, std::io::Error> {
+    Ok(i32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+/// Read a `u32` from a reader.
+pub fn read_u32(reader: &mut dyn BufRead) -> Result<u32, std::io::Error> {
+    Ok(u32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+/// Read an `f32` from a reader.
+pub fn read_f32(reader: &mut dyn BufRead) -> Result<f32, std::io::Error> {
+    Ok(f32::from_le_bytes(read_bytes::<4>(reader)?))
+}
+
+/// Read a variable-length array of bytes from a reader.
+pub fn read_bytes_with_len(
+    reader: &mut dyn BufRead,
+    len: usize,
+) -> Result<Vec<u8>, std::io::Error> {
+    let mut bytes = vec![0u8; len];
+    reader.read_exact(&mut bytes)?;
+    Ok(bytes)
+}
+
+/// Write an `i32` to a writer.
+pub fn write_i32(writer: &mut dyn Write, value: i32) -> Result<(), std::io::Error> {
+    writer.write_all(&value.to_le_bytes())
+}
+
+/// Write a `u32` to a writer.
+pub fn write_u32(writer: &mut dyn Write, value: u32) -> Result<(), std::io::Error> {
+    writer.write_all(&value.to_le_bytes())
+}
+
+/// Write an `f32` to a writer.
+pub fn write_f32(writer: &mut dyn Write, value: f32) -> Result<(), std::io::Error> {
+    writer.write_all(&value.to_le_bytes())
+}
+
+// NOTE: Implementation from #![feature(buf_read_has_data_left)]
+/// Check if there is any data left in the reader.
+pub fn has_data_left(reader: &mut impl BufRead) -> Result<bool, std::io::Error> {
+    reader.fill_buf().map(|b| !b.is_empty())
+}
diff --git a/ggml-loader/src/lib.rs b/ggml-loader/src/lib.rs
deleted file mode 100644
index 47239f08..00000000
--- a/ggml-loader/src/lib.rs
+++ /dev/null
@@ -1,244 +0,0 @@
-//! standalone model loader
-//!
-//! Only the hyperparameter is llama-specific. Everything else can be reused for other LLM.
-#![allow(clippy::nonminimal_bool)]
-
-pub mod util;
-
-use std::ops::ControlFlow;
-use util::*;
-
-pub type ElementType = ggml::Type;
-
-/// the format of the file containing the model
-#[derive(Debug, PartialEq, Clone, Copy)]
-#[allow(clippy::upper_case_acronyms)]
-pub enum ContainerType {
-    /// legacy format, oldest ggml tensor file format
-    GGML,
-    /// also legacy format, newer than GGML, older than GGJT
-    GGMF,
-    /// mmap-able format
-    GGJT,
-}
-impl ContainerType {
-    pub fn support_mmap(&self) -> bool {
-        match self {
-            ContainerType::GGML => false,
-            ContainerType::GGMF => false,
-            ContainerType::GGJT => true,
-        }
-    }
-}
-
-#[derive(Debug, thiserror::Error)]
-pub enum LoadError<T> {
-    #[error("invalid file magic number: {0}")]
-    InvalidMagic(u32),
-
-    #[error("invalid ggml format: version={0}")]
-    InvalidFormatVersion(u32),
-
-    #[error("{0}")]
-    Io(#[from] std::io::Error),
-
-    #[error("{0}")]
-    FailedCast(#[from] std::num::TryFromIntError),
-
-    /// return `ControlFlow::Break` from any of the `cb_*` function to trigger this error
-    #[error("user requested interrupt: {0}")]
-    UserInterrupted(T),
-
-    #[error("unsupported tensor dtype/f16_: {0}")]
-    UnsupportedElementType(i32),
-
-    /// sanity check failed
-    #[error("invariant broken: {0}")]
-    InvariantBroken(String),
-}
-
-#[derive(Debug, Clone)]
-pub struct TensorInfo {
-    pub name: Vec<u8>,
-    pub n_dims: usize,
-    pub dims: [usize; 2],
-    pub n_elements: usize,
-    pub element_type: ElementType,
-    /// start of tensor - start of file
-    pub start_offset: u64,
-}
-impl TensorInfo {
-    pub fn calc_size(&self) -> usize {
-        let mut size = ggml::type_size(self.element_type);
-        for &dim in &self.dims[0..self.n_dims] {
-            size *= dim;
-        }
-        size / ggml::blck_size(self.element_type)
-    }
-}
-
-/// Info in hyperparameter used for later loading tasks. Used in callback.
-/// see [`LoadHandler::load_hyper_parameters`]
-#[derive(Debug, Clone)]
-pub struct PartialHyperparameters {
-    pub n_vocab: usize,
-}
-
-pub enum TensorDataTreatment<'a> {
-    CopyInto(&'a mut [u8]),
-    Skip,
-}
-
-#[allow(unused_variables)]
-pub trait LoadHandler<T, R: BufRead> {
-    fn got_container_type(&mut self, container_type: ContainerType) -> ControlFlow<T> {
-        ControlFlow::Continue(())
-    }
-
-    fn got_vocab_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> ControlFlow<T> {
-        ControlFlow::Continue(())
-    }
-
-    fn load_hyper_parameters(&mut self, reader: &mut R) -> ControlFlow<T, PartialHyperparameters>;
-
-    /// callback to get tensor buffer to populate
-    ///
-    /// # Returns
-    ///
-    /// `None` to skip copying
-    /// `Some(buf)` to provide a buffer for copying weights into
-    fn tensor_buffer(&mut self, info: TensorInfo) -> ControlFlow<T, TensorDataTreatment>;
-}
-
-#[test]
-fn can_be_vtable() {
-    use std::mem::MaybeUninit;
-    let _a: MaybeUninit<Box<dyn LoadHandler<(), BufReader<File>>>> = MaybeUninit::uninit();
-}
-
-pub fn load_model_from_reader<T, R: BufRead + Seek>(
-    reader: &mut R,
-    handler: &mut impl LoadHandler<T, R>,
-) -> Result<(), LoadError<T>> {
-    // Verify magic
-    let container_type: ContainerType = match read_u32(reader)? {
-        ggml::FILE_MAGIC_GGMF => ContainerType::GGMF,
-        ggml::FILE_MAGIC_GGJT => ContainerType::GGJT,
-        ggml::FILE_MAGIC_UNVERSIONED => ContainerType::GGML,
-        magic => return Err(LoadError::InvalidMagic(magic)),
-    };
-    controlflow_to_result(handler.got_container_type(container_type))?;
-
-    // Load format version
-    match container_type {
-        ContainerType::GGMF | ContainerType::GGJT => {
-            let _version: u32 = match read_u32(reader)? {
-                ggml::FORMAT_VERSION => ggml::FORMAT_VERSION,
-                version => return Err(LoadError::InvalidFormatVersion(version)),
-            };
-        }
-        ContainerType::GGML => {}
-    }
-
-    // Load hyper params
-    let hparams = controlflow_to_result(handler.load_hyper_parameters(reader))?;
-    let n_vocab = hparams.n_vocab;
-
-    // Load vocabulary
-    for i in 0..n_vocab {
-        let len = read_u32(reader)?.try_into()?;
-        let token = read_bytes_with_len(reader, len)?;
-        let token_score = match container_type {
-            ContainerType::GGMF | ContainerType::GGJT => read_f32(reader)?,
-            ContainerType::GGML => {
-                // Legacy model, set empty score
-                0.
-            }
-        };
-        controlflow_to_result(handler.got_vocab_token(i, token, token_score))?;
-    }
-
-    // Load tensor data
-    match container_type {
-        ContainerType::GGMF | ContainerType::GGML => load_weights(reader, handler, false),
-        ContainerType::GGJT => load_weights(reader, handler, true),
-    }
-}
-
-/// # Params
-///
-/// `align`
-/// align to 4 bytes before reading tensor weights
-pub fn load_weights<T, R: BufRead + Seek>(
-    reader: &mut R,
-    handler: &mut impl LoadHandler<T, R>,
-    align: bool,
-) -> Result<(), LoadError<T>> {
-    while has_data_left(reader)? {
-        // load tensor header
-        let n_dims: usize = read_i32(reader)?.try_into()?;
-        let name_len = read_i32(reader)?;
-        let ftype = read_i32(reader)?;
-        let ftype =
-            ggml::Type::try_from(ftype).map_err(|_| LoadError::UnsupportedElementType(ftype))?;
-
-        let mut n_elements: usize = 1;
-        let mut dims = [1usize, 1];
-        let ne_len = dims.len();
-        if !(n_dims <= ne_len) {
-            return Err(LoadError::InvariantBroken(format!("{n_dims} <= {ne_len}")));
-        }
-        #[allow(clippy::needless_range_loop)]
-        for i in 0..n_dims {
-            let dim: usize = read_i32(reader)?.try_into()?;
-            dims[i] = dim;
-            n_elements *= dim;
-        }
-
-        // load tensor name
-        let name = read_bytes_with_len(reader, name_len.try_into()?)?;
-
-        // sanity check
-        match ftype {
-            ElementType::Q4_0 | ElementType::Q4_1 => {
-                if !(dims[0] % 64 == 0) {
-                    return Err(LoadError::InvariantBroken(format!("{dims:?}[0] % 64 == 0")));
-                }
-            }
-            _ => {}
-        }
-
-        // load tensor weights
-        let offset_curr = reader.stream_position()?;
-        let offset_aligned: u64 = if align {
-            (offset_curr + 31) & !31
-        } else {
-            offset_curr
-        };
-
-        let tensor_info = TensorInfo {
-            name,
-            dims,
-            n_dims,
-            n_elements,
-            element_type: ftype,
-            start_offset: offset_aligned,
-        };
-        let n_bytes = tensor_info.calc_size();
-
-        match controlflow_to_result(handler.tensor_buffer(tensor_info))? {
-            TensorDataTreatment::CopyInto(buf) => {
-                if align {
-                    reader.seek(SeekFrom::Start(offset_aligned))?;
-                }
-                reader.read_exact(buf)?;
-            }
-            TensorDataTreatment::Skip => {
-                // skip if no buffer is given
-                reader.seek(SeekFrom::Start(offset_aligned + n_bytes as u64))?;
-            }
-        }
-    }
-
-    Ok(())
-}
diff --git a/ggml-loader/src/util.rs b/ggml-loader/src/util.rs
deleted file mode 100644
index 9a759aac..00000000
--- a/ggml-loader/src/util.rs
+++ /dev/null
@@ -1,50 +0,0 @@
-pub use std::io::{BufRead, Seek, SeekFrom};
-use std::ops::ControlFlow;
-
-use crate::LoadError;
-
-pub fn read_bytes<const N: usize>(reader: &mut impl BufRead) -> Result<[u8; N], std::io::Error> {
-    let mut bytes = [0u8; N];
-    reader.read_exact(&mut bytes)?;
-    Ok(bytes)
-}
-
-pub fn read_i32(reader: &mut impl BufRead) -> Result<i32, std::io::Error> {
-    Ok(i32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-pub fn read_u32(reader: &mut impl BufRead) -> Result<u32, std::io::Error> {
-    Ok(u32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-pub fn read_f32(reader: &mut impl BufRead) -> Result<f32, std::io::Error> {
-    Ok(f32::from_le_bytes(read_bytes::<4>(reader)?))
-}
-
-pub fn read_bytes_with_len(
-    reader: &mut impl BufRead,
-    len: usize,
-) -> Result<Vec<u8>, std::io::Error> {
-    let mut bytes = vec![0u8; len];
-    reader.read_exact(&mut bytes)?;
-    Ok(bytes)
-}
-
-// NOTE: Implementation from #![feature(buf_read_has_data_left)]
-pub fn has_data_left(reader: &mut impl BufRead) -> Result<bool, std::io::Error> {
-    reader.fill_buf().map(|b| !b.is_empty())
-}
-
-pub fn controlflow_to_result<A, B>(x: ControlFlow<A, B>) -> Result<B, LoadError<A>> {
-    match x {
-        ControlFlow::Continue(x) => Ok(x),
-        ControlFlow::Break(y) => Err(LoadError::UserInterrupted(y)),
-    }
-}
-
-pub fn result_to_controlflow<A, B, C: From<A>>(x: Result<B, A>) -> ControlFlow<C, B> {
-    match x {
-        Ok(x) => ControlFlow::Continue(x),
-        Err(y) => ControlFlow::Break(y.into()),
-    }
-}
diff --git a/ggml/src/lib.rs b/ggml/src/lib.rs
index 37188a64..6d8905f8 100644
--- a/ggml/src/lib.rs
+++ b/ggml/src/lib.rs
@@ -9,7 +9,7 @@
 //! All [Tensor]s are nodes in this computational graph, and values cannot be retrieved until computation is completed.
 use std::{
-    ffi::c_void,
+    os::raw::{c_int, c_void},
     ptr::NonNull,
     sync::{Arc, Weak},
 };
@@ -272,7 +272,7 @@ impl Context {
     pub unsafe fn op_map_unary(
         &self,
         a: &Tensor,
-        fun: unsafe extern "C" fn(cnt: ::std::os::raw::c_int, dst: *mut f32, src: *const f32),
+        fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src: *const f32),
     ) -> Tensor {
         let tensor =
             unsafe { ggml_sys::ggml_map_unary_f32(self.ptr.as_ptr(), a.ptr.as_ptr(), Some(fun)) };
@@ -298,12 +298,7 @@ impl Context {
         &self,
         a: &Tensor,
         b: &Tensor,
-        fun: unsafe extern "C" fn(
-            cnt: ::std::os::raw::c_int,
-            dst: *mut f32,
-            src0: *const f32,
-            src1: *const f32,
-        ),
+        fun: unsafe extern "C" fn(cnt: c_int, dst: *mut f32, src0: *const f32, src1: *const f32),
     ) -> Tensor {
         let tensor = unsafe {
             ggml_sys::ggml_map_binary_f32(
@@ -325,14 +320,8 @@ impl Context {
     }
 
     /// Creates a 2D view over `a`.
-    pub fn op_view_2d(
-        &self,
-        a: &Tensor,
-        ne0: usize,
-        ne1: usize,
-        nb1: usize,
-        offset: usize,
-    ) -> Tensor {
+    pub fn op_view_2d(&self, a: &Tensor, ne: (usize, usize), nb1: usize, offset: usize) -> Tensor {
+        let (ne0, ne1) = ne;
         let tensor = unsafe {
             ggml_sys::ggml_view_2d(
                 self.ptr.as_ptr(),
@@ -347,17 +336,15 @@ impl Context {
     }
 
     /// Creates a 3d view over `a`.
-    #[allow(clippy::too_many_arguments)]
     pub fn op_view_3d(
         &self,
         a: &Tensor,
-        ne0: usize,
-        ne1: usize,
-        ne2: usize,
-        nb1: usize,
-        nb2: usize,
+        ne: (usize, usize, usize),
+        nb: (usize, usize),
         offset: usize,
     ) -> Tensor {
+        let (ne0, ne1, ne2) = ne;
+        let (nb1, nb2) = nb;
         let tensor = unsafe {
             ggml_sys::ggml_view_3d(
                 self.ptr.as_ptr(),
@@ -697,3 +684,53 @@ fn i32_to_usize(val: i32) -> usize {
 fn i64_to_usize(val: i64) -> usize {
     usize::try_from(val).unwrap()
 }
+
+/// Contains the result of a quantization operation.
+pub struct QuantizationResult {
+    /// The quantized output.
+    pub output: Vec<u8>,
+    /// The quantization history.
+    pub history: Vec<i64>,
+}
+
+/// Quantizes `src` into a new buffer using `q4_0` quantization.
+///
+/// You must ensure that `src.len() == n_elements`, and that `n_elements_0`
+/// is the first dimension of `src`.
+pub fn quantize_q4_0(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult {
+    quantize_impl(src, n_elements, n_elements_0, ggml_sys::ggml_quantize_q4_0)
+}
+
+/// Quantizes `src` into a new buffer using `q4_1` quantization.
+///
+/// You must ensure that `src.len() == n_elements`, and that `n_elements_0`
+/// is the first dimension of `src`.
+pub fn quantize_q4_1(src: &[f32], n_elements: usize, n_elements_0: usize) -> QuantizationResult {
+    quantize_impl(src, n_elements, n_elements_0, ggml_sys::ggml_quantize_q4_1)
+}
+
+fn quantize_impl(
+    src: &[f32],
+    n_elements: usize,
+    n_elements_0: usize,
+    quantizer: unsafe extern "C" fn(*const f32, *mut c_void, c_int, c_int, *mut i64) -> usize,
+) -> QuantizationResult {
+    assert_eq!(src.len(), n_elements);
+    assert_eq!(n_elements % n_elements_0, 0);
+
+    // A conservative multiplier of 4 is used here.
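+    // Quantized output is never larger than the f32 input (4 bytes per
+    // element), so this over-allocates, and the buffer is shrunk to the
+    // size reported by the quantizer afterwards.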
+    let mut output = vec![0u8; n_elements * 4];
+    let mut history = vec![0i64; 16];
+    let output_size = unsafe {
+        quantizer(
+            src.as_ptr(),
+            output.as_mut_ptr() as *mut c_void,
+            n_elements.try_into().unwrap(),
+            n_elements_0.try_into().unwrap(),
+            history.as_mut_ptr(),
+        )
+    };
+
+    output.resize(output_size, 0u8);
+    QuantizationResult { output, history }
+}
diff --git a/llama-cli/Cargo.toml b/llama-cli/Cargo.toml
index d4914b15..15ba6a9c 100644
--- a/llama-cli/Cargo.toml
+++ b/llama-cli/Cargo.toml
@@ -6,7 +6,7 @@ version = {workspace = true}
 # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
 
 [dependencies]
-llama-rs = {path = "../llama-rs", features = ["convert"]}
+llama-rs = { path = "../llama-rs", features = ["convert", "quantize"] }
 
 rand = {workspace = true}
diff --git a/llama-cli/src/cli_args.rs b/llama-cli/src/cli_args.rs
index fc064017..b14a3e73 100644
--- a/llama-cli/src/cli_args.rs
+++ b/llama-cli/src/cli_args.rs
@@ -40,6 +40,9 @@ pub enum Args {
     ///
     /// For reference, see [the PR](https://github.com/rustformers/llama-rs/pull/83).
     Convert(Box<Convert>),
+
+    /// Quantize a GGML model to 4-bit.
+    Quantize(Box<Quantize>),
 }
 
 #[derive(Parser, Debug)]
@@ -244,7 +247,7 @@ pub struct ModelLoad {
     /// Where to load the model path from
     #[arg(long, short = 'm')]
-    pub model_path: String,
+    pub model_path: PathBuf,
 
     /// Sets the size of the context (in tokens). Allows feeding longer prompts.
     /// Note that this affects memory.
@@ -376,7 +379,6 @@ pub struct Convert {
     #[arg(long, short = 't', value_enum, default_value_t = FileType::Q4_0)]
     pub file_type: FileType,
 }
-
 #[derive(Parser, Debug, ValueEnum, Clone, Copy)]
 pub enum FileType {
     /// Quantized 4-bit (type 0).
@@ -398,3 +400,34 @@ impl From<FileType> for llama_rs::FileType {
     }
 }
+
+#[derive(Parser, Debug)]
+pub struct Quantize {
+    /// The path to the model to quantize
+    #[arg()]
+    pub source: PathBuf,
+
+    /// The path to save the quantized model to
+    #[arg()]
+    pub destination: PathBuf,
+
+    /// The format to convert to
+    pub target: QuantizationTarget,
+}
+
+#[derive(Parser, Debug, ValueEnum, Clone, Copy)]
+#[clap(rename_all = "snake_case")]
+pub enum QuantizationTarget {
+    /// Quantized 4-bit (type 0).
+    Q4_0,
+    /// Quantized 4-bit (type 1).
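+    /// Stores a per-block minimum alongside the scale, so it is slightly
+    /// larger on disk than type 0 but usually more accurate.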
+    Q4_1,
+}
+impl From<QuantizationTarget> for llama_rs::ElementType {
+    fn from(t: QuantizationTarget) -> Self {
+        match t {
+            QuantizationTarget::Q4_0 => llama_rs::ElementType::Q4_0,
+            QuantizationTarget::Q4_1 => llama_rs::ElementType::Q4_1,
+        }
+    }
+}
diff --git a/llama-cli/src/main.rs b/llama-cli/src/main.rs
index de8323b0..cc142875 100644
--- a/llama-cli/src/main.rs
+++ b/llama-cli/src/main.rs
@@ -2,7 +2,7 @@ use std::{convert::Infallible, io::Write};
 
 use clap::Parser;
 use cli_args::Args;
-use color_eyre::eyre::Result;
+use color_eyre::eyre::{Context, Result};
 use llama_rs::{convert::convert_pth_to_ggml, InferenceError};
 use rustyline::error::ReadlineError;
@@ -23,6 +23,7 @@ fn main() -> Result<()> {
         Args::Repl(args) => interactive(&args, false)?,
         Args::ChatExperimental(args) => interactive(&args, true)?,
         Args::Convert(args) => convert_pth_to_ggml(&args.directory, args.file_type.into()),
+        Args::Quantize(args) => quantize(&args)?,
     }
 
     Ok(())
@@ -184,6 +185,44 @@ fn interactive(
     Ok(())
 }
 
+fn quantize(args: &cli_args::Quantize) -> Result<()> {
+    use llama_rs::quantize::{quantize, QuantizeProgress::*};
+    quantize(
+        &args.source,
+        &args.destination,
+        args.target.into(),
+        |progress| match progress {
+            HyperparametersLoaded(_) => log::info!("Loaded hyperparameters"),
+            TensorLoading {
+                name,
+                dims,
+                element_type,
+                n_elements,
+            } => log::info!(
+                "Loading tensor `{name}` ({n_elements} ({dims:?}) {element_type} elements)"
+            ),
+            TensorQuantizing { name } => log::info!("Quantizing tensor `{name}`"),
+            TensorQuantized {
+                name,
+                original_size,
+                reduced_size,
+                history,
+            } => log::info!(
+                "Quantized tensor `{name}` from {original_size} to {reduced_size} bytes ({history:?})"
+            ),
+            TensorSkipped { name, size } => log::info!("Skipped tensor `{name}` ({size} bytes)"),
+            Finished {
+                original_size,
+                reduced_size,
+                history,
+            } => log::info!(
+                "Finished quantization from {original_size} to {reduced_size} bytes ({history:?})"
+            ),
+        },
+    )
+    .wrap_err("failed to quantize model")
+}
+
 fn load_prompt_file_with_prompt(
     prompt_file: &cli_args::PromptFile,
     prompt: Option<&str>,
diff --git a/llama-rs/Cargo.toml b/llama-rs/Cargo.toml
index 7ed254a4..7b3e2b6e 100644
--- a/llama-rs/Cargo.toml
+++ b/llama-rs/Cargo.toml
@@ -8,7 +8,7 @@ rust-version = "1.65"
 
 [dependencies]
 ggml = { path = "../ggml" }
-ggml-loader = { path = "../ggml-loader" }
+ggml-format = { path = "../ggml-format" }
 
 rand = { workspace = true }
 
@@ -24,5 +24,9 @@ serde_json = { version = "1.0", optional = true }
 protobuf = { version = "= 2.14.0", optional = true }
 rust_tokenizers = { version = "3.1.2", optional = true }
 
+# Used for the `quantize` feature
+half = { version = "2.2.1", optional = true }
+
 [features]
 convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"]
+quantize = ["dep:half"]
diff --git a/llama-rs/src/lib.rs b/llama-rs/src/lib.rs
index 88d26d0c..802229ac 100644
--- a/llama-rs/src/lib.rs
+++ b/llama-rs/src/lib.rs
@@ -5,6 +5,8 @@ use thiserror::Error;
 
 #[cfg(feature = "convert")]
 pub mod convert;
+#[cfg(feature = "quantize")]
+pub mod quantize;
 
 mod inference_session;
 mod loader;
diff --git a/llama-rs/src/loader.rs b/llama-rs/src/loader.rs
index 8b92378e..2b72a517 100644
--- a/llama-rs/src/loader.rs
+++ b/llama-rs/src/loader.rs
@@ -12,8 +12,10 @@ use crate::{
     LoadError, LoadProgress, Model, TokenId, Vocabulary,
 };
 use crate::{ElementType, Hyperparameters};
-use ggml_loader::util::*;
-use ggml_loader::ContainerType;
+use ggml_format::{
+    util::{has_data_left, read_bytes_with_len, read_f32, read_i32, read_u32},
+    ContainerType,
+}; use memmap2::Mmap; pub(crate) fn load( @@ -34,26 +36,33 @@ pub(crate) fn load( let mut reader = BufReader::new(&file); // Verify magic - let model_type: ContainerType = match read_u32(&mut reader)? { - ggml::FILE_MAGIC_GGMF => ContainerType::GGMF, - ggml::FILE_MAGIC_GGJT => ContainerType::GGJT, - ggml::FILE_MAGIC_UNVERSIONED => ContainerType::GGML, + let magic = read_u32(&mut reader)?; + let model_type: ContainerType = match magic { + ggml::FILE_MAGIC_GGMF => ContainerType::Ggmf, + ggml::FILE_MAGIC_GGJT => ContainerType::Ggjt, + ggml::FILE_MAGIC_UNVERSIONED => ContainerType::Ggml, _ => { return Err(LoadError::InvalidMagic { path: main_path.to_owned(), + magic, }) } }; // Load format version match model_type { - ContainerType::GGMF | ContainerType::GGJT => { + ContainerType::Ggmf | ContainerType::Ggjt => { let _version: u32 = match read_u32(&mut reader)? { ggml::FORMAT_VERSION => ggml::FORMAT_VERSION, - version => return Err(LoadError::InvalidFormatVersion { version }), + version => { + return Err(LoadError::InvalidFormatVersion { + container_type: model_type, + version, + }) + } }; } - ContainerType::GGML => {} + ContainerType::Ggml => {} } // ================= @@ -93,8 +102,8 @@ pub(crate) fn load( let token = read_bytes_with_len(&mut reader, len.try_into()?)?; let score = match model_type { - ContainerType::GGMF | ContainerType::GGJT => read_f32(&mut reader)?, - ContainerType::GGML => { + ContainerType::Ggmf | ContainerType::Ggjt => read_f32(&mut reader)?, + ContainerType::Ggml => { // Legacy model, set empty score 0. } @@ -168,7 +177,7 @@ pub(crate) fn load( let mut model = Model::new_loader1(context, hparams, vocabulary, n_ff, wtype, mmap); match model_type { - ContainerType::GGMF | ContainerType::GGML => { + ContainerType::Ggmf | ContainerType::Ggml => { let file_offset = reader.stream_position()?; drop(reader); load_weights_ggmf_or_unversioned( @@ -178,7 +187,7 @@ pub(crate) fn load( model.tensors_mut(), )? 
         }
-        ContainerType::GGJT => {
+        ContainerType::Ggjt => {
             load_weights_ggjt(
                 &mut reader,
                 mmap_ptr,
@@ -243,7 +252,14 @@ fn load_weights_ggmf_or_unversioned(
             let length = read_i32(&mut part_reader)?;
             let ftype = read_i32(&mut part_reader)?;
 
-            let (nelements, ne, tensor_name, tensor, split_type, bpe) = load_tensor_header_ggmf(
+            let TensorHeaderGgmf {
+                nelements,
+                ne,
+                tensor_name,
+                tensor,
+                split_type,
+                bpe,
+            } = load_tensor_header_ggmf(
                 n_dims,
                 &mut part_reader,
                 length,
@@ -341,7 +357,14 @@ fn load_weights_ggmf_or_unversioned(
     Ok(())
 }
 
-#[allow(clippy::type_complexity)]
+struct TensorHeaderGgmf<'a> {
+    nelements: usize,
+    ne: [i64; 2],
+    tensor_name: String,
+    tensor: &'a mut ggml::Tensor,
+    split_type: i32,
+    bpe: usize,
+}
 fn load_tensor_header_ggmf<'a>(
     n_dims: usize,
     reader: &mut impl BufRead,
@@ -350,7 +373,7 @@ fn load_tensor_header_ggmf<'a>(
     path: &Path,
     n_parts: usize,
     ftype: i32,
-) -> Result<(usize, [i64; 2], String, &'a mut ggml::Tensor, i32, usize), LoadError> {
+) -> Result<TensorHeaderGgmf<'a>, LoadError> {
     let mut nelements = 1;
     let mut ne = [1i64, 1i64];
     assert!(n_dims <= ne.len());
@@ -364,13 +387,12 @@ fn load_tensor_header_ggmf<'a>(
     else {
         return Err(LoadError::UnknownTensor { tensor_name, path: path.to_owned() });
     };
-    #[allow(clippy::if_same_then_else)]
     let split_type = if tensor_name.contains("tok_embeddings") {
         0
     } else if tensor_name.contains("layers") {
-        if tensor_name.contains("attention.wo.weight") {
-            0
-        } else if tensor_name.contains("feed_forward.w2.weight") {
+        if tensor_name.contains("attention.wo.weight")
+            || tensor_name.contains("feed_forward.w2.weight")
+        {
             0
         } else {
             1
@@ -417,14 +439,21 @@ fn load_tensor_header_ggmf<'a>(
     let bpe = match bpe {
         Some(x) => x,
         None => {
-            return Err(LoadError::InvalidFtype {
+            return Err(LoadError::UnsupportedElementType {
                 tensor_name,
                 ftype,
                 path: path.to_owned(),
             });
         }
     };
-    Ok((nelements, ne, tensor_name, tensor, split_type, bpe))
+    Ok(TensorHeaderGgmf {
+        nelements,
+        ne,
+        tensor_name,
+        tensor,
+        split_type,
+        bpe,
+    })
 }
 
 fn tensor_type_size(ftype: i32, ne: [i64; 2]) -> Option<usize> {
@@ -496,7 +525,7 @@ fn load_weights_ggjt(
             match tensor_type_size(ftype, ne) {
                 Some(_) => {}
                 None => {
-                    return Err(LoadError::InvalidFtype {
+                    return Err(LoadError::UnsupportedElementType {
                         tensor_name,
                         ftype,
                         path: path.to_owned(),
diff --git a/llama-rs/src/loader2.rs b/llama-rs/src/loader2.rs
index ead8bfb3..aec7377c 100644
--- a/llama-rs/src/loader2.rs
+++ b/llama-rs/src/loader2.rs
@@ -1,12 +1,12 @@
-use ggml_loader::util::*;
-use ggml_loader::*;
+use ggml_format::{
+    util::read_i32, ContainerType, LoadError as FormatLoadError, PartialHyperparameters, TensorInfo,
+};
 use memmap2::Mmap;
 
 use std::{
     collections::HashMap,
     fs::File,
-    io::{BufRead, BufReader, Read, Seek},
-    ops::ControlFlow,
+    io::{BufRead, BufReader, Read, Seek, SeekFrom},
     path::{Path, PathBuf},
 };
 
@@ -16,19 +16,29 @@ use crate::{
 };
 
 impl LoadError {
-    fn from_ggml_loader_error(value: ggml_loader::LoadError<LoadError>, path: PathBuf) -> Self {
+    pub(crate) fn from_format_error(value: FormatLoadError<LoadError>, path: PathBuf) -> Self {
         match value {
-            ggml_loader::LoadError::InvalidMagic(_magic) => LoadError::InvalidMagic { path },
-            ggml_loader::LoadError::InvalidFormatVersion(version) => {
-                LoadError::InvalidFormatVersion { version }
+            FormatLoadError::InvalidMagic(magic) => LoadError::InvalidMagic { path, magic },
+            FormatLoadError::InvalidFormatVersion(container_type, version) => {
+                LoadError::InvalidFormatVersion {
+                    container_type,
+                    version,
+                }
             }
-            ggml_loader::LoadError::Io(err) => LoadError::Io(err),
-            ggml_loader::LoadError::FailedCast(err) => LoadError::InvalidIntegerConversion(err),
-            ggml_loader::LoadError::UserInterrupted(err) => err,
-            ggml_loader::LoadError::UnsupportedElementType(ty) => {
-                LoadError::HyperparametersF16Invalid { ftype: ty }
+            FormatLoadError::Io(err) => LoadError::Io(err),
+            FormatLoadError::InvalidUtf8(err) => LoadError::InvalidUtf8(err),
+            FormatLoadError::InvalidIntegerConversion(err) => {
+                LoadError::InvalidIntegerConversion(err)
+            }
+            FormatLoadError::ImplementationError(err) => err,
+            FormatLoadError::UnsupportedElementType { tensor_name, ftype } => {
+                LoadError::UnsupportedElementType {
+                    path,
+                    tensor_name,
+                    ftype,
+                }
             }
-            ggml_loader::LoadError::InvariantBroken(invariant) => {
+            FormatLoadError::InvariantBroken(invariant) => {
                 LoadError::InvariantBroken { path, invariant }
             }
         }
     }
@@ -62,28 +72,25 @@ pub(crate) fn load(
         total_parts: 1,
     });
 
-    let mut loader = Loader::new(
-        path.clone(),
-        n_context_tokens,
-        prefer_mmap,
-        load_progress_callback,
-    );
-    let use_mmap = loader.mmap_active();
+    let mut loader = Loader::new(n_context_tokens, load_progress_callback);
 
-    ggml_loader::load_model_from_reader(&mut reader, &mut loader)
-        .map_err(|err| LoadError::from_ggml_loader_error(err, path.clone()))?;
+    ggml_format::load_model(&mut reader, &mut loader)
+        .map_err(|err| LoadError::from_format_error(err, path.clone()))?;
 
     let Loader {
         hyperparameters,
         vocabulary,
        tensors,
         mut load_progress_callback,
+        container_type,
         ..
     } = loader;
 
     let Hyperparameters { n_embd, n_mult, .. } = hyperparameters;
     let n_ff = ((2 * (4 * n_embd) / 3 + n_mult - 1) / n_mult) * n_mult;
 
+    let use_mmap = prefer_mmap && container_type.support_mmap();
+
     let ctx_size = tensors
         .values()
         .map(|ti| {
@@ -187,110 +194,75 @@ pub(crate) fn load(
     Ok(model)
 }
 
-struct Loader<F: FnMut(LoadProgress)> {
+pub(crate) struct Loader<F: FnMut(LoadProgress)> {
     // Input
-    path: PathBuf,
     n_ctx: usize,
-    prefer_mmap: bool,
     load_progress_callback: F,
 
     // Output
-    container_type: ContainerType,
-    hyperparameters: Hyperparameters,
-    vocabulary: Vocabulary,
-    tensors: HashMap<String, TensorInfo>,
+    pub(crate) container_type: ContainerType,
+    pub(crate) hyperparameters: Hyperparameters,
+    pub(crate) vocabulary: Vocabulary,
+    pub(crate) tensors: HashMap<String, TensorInfo>,
 }
 impl<F: FnMut(LoadProgress)> Loader<F> {
-    fn new(path: PathBuf, n_ctx: usize, prefer_mmap: bool, load_progress_callback: F) -> Self {
+    pub(crate) fn new(n_ctx: usize, load_progress_callback: F) -> Self {
         Self {
-            path,
             n_ctx,
-            prefer_mmap,
             load_progress_callback,
-            container_type: ContainerType::GGJT,
+            container_type: ContainerType::Ggjt,
             hyperparameters: Hyperparameters::default(),
             vocabulary: Vocabulary::default(),
             tensors: HashMap::default(),
         }
     }
 }
-
-impl<F: FnMut(LoadProgress)> ggml_loader::LoadHandler<LoadError, BufReader<&File>> for Loader<F> {
-    fn load_hyper_parameters(
-        &mut self,
-        reader: &mut BufReader<&File>,
-    ) -> ControlFlow<LoadError, PartialHyperparameters> {
-        let (hyperparameters, partial) = match load_hyperparameters(reader, self.n_ctx) {
-            Ok(t) => t,
-            Err(err) => {
-                return ControlFlow::Break(LoadError::from_ggml_loader_error(
-                    err,
-                    self.path.clone(),
-                ))
-            }
-        };
-        self.hyperparameters = hyperparameters;
-        (self.load_progress_callback)(LoadProgress::HyperparametersLoaded(&self.hyperparameters));
-
-        ControlFlow::Continue(partial)
+impl<F: FnMut(LoadProgress)> ggml_format::LoadHandler<LoadError> for Loader<F> {
+    fn container_type(&mut self, container_type: ContainerType) -> Result<(), LoadError> {
+        self.container_type = container_type;
+        Ok(())
     }
 
-    fn got_container_type(&mut self, t: ContainerType) -> ControlFlow<LoadError> {
-        self.container_type = t;
-        ControlFlow::Continue(())
-    }
-
-    fn got_vocab_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> ControlFlow<LoadError> {
+    fn vocabulary_token(&mut self, i: usize, token: Vec<u8>, score: f32) -> Result<(), LoadError> {
         let id = match TokenId::try_from(i) {
             Ok(id) => id,
-            Err(err) => return ControlFlow::Break(LoadError::InvalidIntegerConversion(err)),
+            Err(err) => return Err(LoadError::InvalidIntegerConversion(err)),
         };
         self.vocabulary.push_token(id, token, score);
-        ControlFlow::Continue(())
+        Ok(())
     }
 
-    fn tensor_buffer(&mut self, info: TensorInfo) -> ControlFlow<LoadError, TensorDataTreatment> {
-        let tensor_name = match String::from_utf8(info.name.clone()) {
-            Ok(n) => n,
-            Err(err) => return ControlFlow::Break(LoadError::InvalidUtf8(err)),
+    fn read_hyperparameters(
+        &mut self,
+        reader: &mut dyn BufRead,
+    ) -> Result<PartialHyperparameters, LoadError> {
+        // NOTE: Field order matters! Data is laid out in the file exactly in this order.
+        let hyperparameters = Hyperparameters {
+            n_vocab: read_i32(reader)?.try_into()?,
+            n_embd: read_i32(reader)?.try_into()?,
+            n_mult: read_i32(reader)?.try_into()?,
+            n_head: read_i32(reader)?.try_into()?,
+            n_layer: read_i32(reader)?.try_into()?,
+            n_rot: read_i32(reader)?.try_into()?,
+            file_type: {
+                let ftype = read_i32(reader)?;
+                FileType::try_from(ftype).map_err(|_| LoadError::UnsupportedFileType(ftype))?
+            },
+            n_ctx: self.n_ctx,
+        };
+        let partial = PartialHyperparameters {
+            n_vocab: hyperparameters.n_vocab,
         };
+        self.hyperparameters = hyperparameters;
+        (self.load_progress_callback)(LoadProgress::HyperparametersLoaded(&self.hyperparameters));
 
-        self.tensors.insert(tensor_name, info);
-        ControlFlow::Continue(TensorDataTreatment::Skip)
+        Ok(partial)
     }
-}
 
-impl Loader {
-    fn mmap_active(&mut self) -> bool {
-        self.prefer_mmap && self.container_type.support_mmap()
+    fn tensor_buffer(&mut self, info: TensorInfo) -> Result<(), LoadError> {
+        self.tensors.insert(info.name.clone(), info);
+        Ok(())
     }
 }
-
-/// use this to load params for llama model inside [`LoadHandler::load_hyper_parameters`]
-fn load_hyperparameters<R: BufRead>(
-    reader: &mut R,
-    n_ctx: usize,
-) -> Result<(Hyperparameters, PartialHyperparameters), ggml_loader::LoadError<LoadError>> {
-    // NOTE: Field order matters! Data is laid out in the file exactly in this order.
-    let hparams = Hyperparameters {
-        n_vocab: read_i32(reader)?.try_into()?,
-        n_embd: read_i32(reader)?.try_into()?,
-        n_mult: read_i32(reader)?.try_into()?,
-        n_head: read_i32(reader)?.try_into()?,
-        n_layer: read_i32(reader)?.try_into()?,
-        n_rot: read_i32(reader)?.try_into()?,
-        file_type: {
-            let ftype = read_i32(reader)?;
-            FileType::try_from(ftype).map_err(|_| {
-                ggml_loader::LoadError::UserInterrupted(LoadError::UnsupportedFileType(ftype))
-            })?
-        },
-        n_ctx,
-    };
-    let partial = PartialHyperparameters {
-        n_vocab: hparams.n_vocab,
-    };
-    Ok((hparams, partial))
-}
diff --git a/llama-rs/src/loader_common.rs b/llama-rs/src/loader_common.rs
index fe44da46..a40c45de 100644
--- a/llama-rs/src/loader_common.rs
+++ b/llama-rs/src/loader_common.rs
@@ -3,6 +3,7 @@ use std::{
     path::{Path, PathBuf},
 };
 
+use ggml_format::ContainerType;
 use thiserror::Error;
 
 use crate::{util::FindAllModelFilesError, Hyperparameters};
@@ -152,10 +153,14 @@ pub enum LoadError {
     InvalidMagic {
         /// The path that failed.
         path: PathBuf,
+        /// The magic number that was encountered.
+        magic: u32,
     },
     #[error("invalid file format version {version}")]
     /// The version of the format is not supported by this version of `llama-rs`.
     InvalidFormatVersion {
+        /// The container format that was encountered.
+        container_type: ContainerType,
         /// The version that was encountered.
         version: u32,
     },
@@ -184,7 +189,7 @@ pub enum LoadError {
     },
     /// The tensor `tensor_name` did not have the expected format type.
     #[error("invalid ftype {ftype} for tensor `{tensor_name}` in {path:?}")]
-    InvalidFtype {
+    UnsupportedElementType {
         /// The name of the tensor.
         tensor_name: String,
         /// The format type that was encountered.
diff --git a/llama-rs/src/model.rs b/llama-rs/src/model.rs
index d7cfa91f..730afc14 100644
--- a/llama-rs/src/model.rs
+++ b/llama-rs/src/model.rs
@@ -340,8 +340,7 @@ impl Model {
 
                 let v = ctx0.op_view_2d(
                     &session.memory_v,
-                    n,
-                    n_embd,
+                    (n, n_embd),
                     n_ctx * memv_elsize,
                     (il * n_ctx) * memv_elsize * n_embd + n_past * memv_elsize,
                 );
@@ -388,11 +387,8 @@ impl Model {
                 // split cached V into n_head heads
                 let v = ctx0.op_view_3d(
                     &session.memory_v,
-                    n_past + n,
-                    n_embd / n_head,
-                    n_head,
-                    n_ctx * memv_elsize,
-                    n_ctx * memv_elsize * n_embd / n_head,
+                    (n_past + n, n_embd / n_head, n_head),
+                    (n_ctx * memv_elsize, n_ctx * memv_elsize * n_embd / n_head),
                     il * n_ctx * memv_elsize * n_embd,
                 );
diff --git a/llama-rs/src/quantize.rs b/llama-rs/src/quantize.rs
new file mode 100644
index 00000000..dd7ec58b
--- /dev/null
+++ b/llama-rs/src/quantize.rs
@@ -0,0 +1,337 @@
+//! Implements quantization of weights.
+
+use crate::{loader2::Loader, Hyperparameters, LoadError, LoadProgress};
+use ggml_format::{util::write_i32, SaveError, SaveHandler, TensorData, TensorInfo};
+use half::f16;
+use std::{
+    collections::HashMap,
+    fs::File,
+    io::{BufReader, BufWriter, Write},
+    path::{Path, PathBuf},
+    sync::Arc,
+};
+use thiserror::Error;
+
+#[derive(Clone, Debug)]
+/// Progress of quantization.
+pub enum QuantizeProgress<'a> {
+    /// Hyperparameters have been loaded.
+    HyperparametersLoaded(&'a Hyperparameters),
+    /// A tensor is being loaded.
+    TensorLoading {
+        /// Name of the tensor.
+        name: &'a str,
+        /// The dimensions of the tensor.
+        dims: [usize; 2],
+        /// Type of the tensor.
+        element_type: ggml::Type,
+        /// Number of elements in the tensor.
+        n_elements: usize,
+    },
+    /// A tensor is being quantized.
+    TensorQuantizing {
+        /// Name of the tensor.
+        name: &'a str,
+    },
+    /// A tensor has been quantized.
+    TensorQuantized {
+        /// Name of the tensor.
+        name: &'a str,
+        /// The original size of the tensor.
+        original_size: usize,
+        /// The reduced size of the tensor.
+        reduced_size: usize,
+        /// The history of the quantization.
+        history: Vec<f32>,
+    },
+    /// A tensor has been skipped.
+    TensorSkipped {
+        /// Name of the tensor.
+        name: &'a str,
+        /// The original size (in bytes) of the tensor data.
+        size: usize,
+    },
+    /// A model has been quantized.
+    Finished {
+        /// The original size of the model.
+        original_size: f32,
+        /// The reduced size of the model.
+        reduced_size: f32,
+        /// The history of the quantization.
+        history: Vec<f32>,
+    },
+}
+
+#[derive(Error, Debug)]
+/// Errors encountered during the quantization process.
+pub enum QuantizeError {
+    #[error("could not load model")]
+    /// There was an error while attempting to load the model.
+    Load(#[from] LoadError),
+    #[error("non-specific I/O error")]
+    /// A non-specific I/O error.
+    Io(#[from] std::io::Error),
+    #[error("could not convert bytes to a UTF-8 string")]
+    /// One of the strings encountered was not valid UTF-8.
+    InvalidUtf8(#[from] std::string::FromUtf8Error),
+    #[error("invalid integer conversion")]
+    /// One of the integers encountered could not be converted to a more appropriate type.
+    InvalidIntegerConversion(#[from] std::num::TryFromIntError),
+    #[error("could not create file {path:?}")]
+    /// A file failed to be created.
+    CreateFileFailed {
+        /// The original error.
+        source: std::io::Error,
+        /// The path that failed.
+        path: PathBuf,
+    },
+    /// An invariant was broken.
+    ///
+    /// This error is not relevant unless `loader2` is being used.
+    #[error("invariant broken: {invariant} in {path:?}")]
+    InvariantBroken {
+        /// The path that failed.
+        path: PathBuf,
+        /// The invariant that was broken.
+        invariant: String,
+    },
+    /// Attempted to quantize to an invalid target.
+    #[error("invalid quantization target {element_type:?}")]
+    InvalidQuantizationTarget {
+        /// The quantization target.
+        element_type: ggml::Type,
+    },
+    /// The quantization process encountered an unsupported element type.
+    #[error("unsupported element type {element_type:?}")]
+    UnsupportedElementType {
+        /// The element type.
+        element_type: ggml::Type,
+    },
+}
+impl QuantizeError {
+    pub(crate) fn from_format_error(value: SaveError<QuantizeError>, path: PathBuf) -> Self {
+        match value {
+            SaveError::Io(io) => QuantizeError::Io(io),
+            SaveError::InvalidIntegerConversion(e) => QuantizeError::InvalidIntegerConversion(e),
+            SaveError::ImplementationError(e) => e,
+            SaveError::InvariantBroken(invariant) => {
+                QuantizeError::InvariantBroken { path, invariant }
+            }
+        }
+    }
+}
+
+/// Quantizes a model.
+pub fn quantize(
+    path_in: impl AsRef<Path>,
+    path_out: impl AsRef<Path>,
+    desired_type: ggml::Type,
+    progress_callback: impl Fn(QuantizeProgress),
+) -> Result<(), QuantizeError> {
+    // Sanity check
+    if !matches!(desired_type, ggml::Type::Q4_0 | ggml::Type::Q4_1) {
+        return Err(QuantizeError::InvalidQuantizationTarget {
+            element_type: desired_type,
+        });
+    }
+
+    // Load the model
+    let progress_callback = Arc::new(progress_callback);
+
+    let path_in = path_in.as_ref();
+    let mut file_in = File::open(path_in).map_err(|e| LoadError::OpenFileFailed {
+        source: e,
+        path: path_in.to_owned(),
+    })?;
+    let mut reader = BufReader::new(&file_in);
+    let mut loader = Loader::new(0, {
+        let progress_callback = progress_callback.clone();
+        move |p| {
+            if let LoadProgress::HyperparametersLoaded(h) = p {
+                progress_callback(QuantizeProgress::HyperparametersLoaded(h))
+            }
+        }
+    });
+    ggml_format::load_model(&mut reader, &mut loader)
+        .map_err(|err| LoadError::from_format_error(err, path_in.to_owned()))?;
+
+    // Save the quantized model, quantizing as we go
+    let Loader {
+        hyperparameters,
+        vocabulary,
+        tensors,
+        ..
+    } = loader;
+
+    let vocabulary = vocabulary
+        .id_to_token
+        .iter()
+        .cloned()
+        .zip(vocabulary.id_to_token_score)
+        .collect::<Vec<_>>();
+
+    let path_out = path_out.as_ref();
+    let mut writer = BufWriter::new(File::create(path_out)?);
+    let mut saver = QuantizeSaver::new(
+        desired_type,
+        &hyperparameters,
+        &tensors,
+        &mut file_in,
+        |p| progress_callback(p),
+    );
+    ggml_format::save_model(
+        &mut writer,
+        &mut saver,
+        &vocabulary,
+        &tensors.keys().cloned().collect::<Vec<_>>(),
+    )
+    .map_err(|err| QuantizeError::from_format_error(err, path_out.to_owned()))?;
+
+    // Final report
+    let sum_all: i64 = saver.history_all.iter().sum();
+    progress_callback(QuantizeProgress::Finished {
+        original_size: saver.total_size_original as f32 / 1024.0 / 1024.0,
+        reduced_size: saver.total_size_new as f32 / 1024.0 / 1024.0,
+        history: saver
+            .history_all
+            .iter()
+            .map(|hist| *hist as f32 / sum_all as f32)
+            .collect(),
+    });
+
+    Ok(())
+}
+
+struct QuantizeSaver<'a, F: Fn(QuantizeProgress)> {
+    // Input
+    quantization_type: ggml::Type,
+    hyperparameters: &'a Hyperparameters,
+    tensors: &'a HashMap<String, TensorInfo>,
+    source_file: &'a mut File,
+    progress_callback: F,
+
+    // Output
+    total_size_original: usize,
+    total_size_new: usize,
+    history_all: Vec<i64>,
+}
+impl<'a, F: Fn(QuantizeProgress)> QuantizeSaver<'a, F> {
+    fn new(
+        quantization_type: ggml::Type,
+        hyperparameters: &'a Hyperparameters,
+        tensors: &'a HashMap<String, TensorInfo>,
+        source_file: &'a mut File,
+        progress_callback: F,
+    ) -> Self {
+        Self {
+            quantization_type,
+            hyperparameters,
+            tensors,
+            source_file,
+            progress_callback,
+
+            total_size_original: 0,
+            total_size_new: 0,
+            history_all: vec![0; 16],
+        }
+    }
+}
+impl<F: Fn(QuantizeProgress)> SaveHandler<QuantizeError> for QuantizeSaver<'_, F> {
+    fn write_hyperparameters(&mut self, writer: &mut dyn Write) -> Result<(), QuantizeError> {
+        let h = self.hyperparameters;
+        write_i32(writer, h.n_vocab.try_into()?)?;
+        write_i32(writer, h.n_embd.try_into()?)?;
+        write_i32(writer, h.n_mult.try_into()?)?;
+        write_i32(writer, h.n_head.try_into()?)?;
+        write_i32(writer, h.n_layer.try_into()?)?;
+        write_i32(writer, h.n_rot.try_into()?)?;
+        write_i32(writer, h.file_type.into())?;
+        Ok(())
+    }
+
+    fn tensor_data(&mut self, tensor_name: &str) -> Result<TensorData, QuantizeError> {
+        let tensor = self.tensors.get(tensor_name).expect(
+            "tensor not found; should be impossible due to handler being populated from loader",
+        );
+
+        (self.progress_callback)(QuantizeProgress::TensorLoading {
+            name: tensor_name,
+            dims: tensor.dims,
+            n_elements: tensor.n_elements,
+            element_type: tensor.element_type,
+        });
+
+        // Quantize only 2D tensors
+        let quantize = tensor_name.contains("weight") && tensor.n_dims == 2;
+        let raw_data = tensor.read_data(&mut BufReader::new(&mut self.source_file))?;
+
+        if quantize && !matches!(tensor.element_type, ggml::Type::F32 | ggml::Type::F16) {
+            return Err(QuantizeError::UnsupportedElementType {
+                element_type: tensor.element_type,
+            });
+        }
+
+        self.total_size_original += raw_data.len();
+
+        let (element_type, data) = if quantize {
+            (self.progress_callback)(QuantizeProgress::TensorQuantizing { name: tensor_name });
+
+            let data_f32: Vec<f32> = match tensor.element_type {
+                ggml::Type::F32 => raw_data
+                    .chunks_exact(4)
+                    .map(|chunk| f32::from_le_bytes(chunk.try_into().unwrap()))
+                    .collect(),
+                ggml::Type::F16 => raw_data
+                    .chunks_exact(2)
+                    .map(|chunk| {
+                        f16::from_bits(u16::from_le_bytes(chunk.try_into().unwrap())).to_f32()
+                    })
+                    .collect(),
+                _ => unreachable!(),
+            };
+
+            let result = match self.quantization_type {
+                ggml::Type::Q4_0 => {
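+                    // `tensor.dims[0]` is the row length; the quantizer
+                    // processes the data row by row, in blocks within each row.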
ggml::quantize_q4_0(&data_f32, tensor.n_elements, tensor.dims[0]) + } + ggml::Type::Q4_1 => { + ggml::quantize_q4_1(&data_f32, tensor.n_elements, tensor.dims[0]) + } + _ => unreachable!(), + }; + let new_data = result.output; + + let mut history_new = vec![]; + for (i, val) in result.history.iter().enumerate() { + self.history_all[i] += val; + history_new.push(*val as f32 / tensor.n_elements as f32); + } + + (self.progress_callback)(QuantizeProgress::TensorQuantized { + name: tensor_name, + original_size: raw_data.len(), + reduced_size: new_data.len(), + history: history_new, + }); + + self.total_size_new += new_data.len(); + + (self.quantization_type, new_data) + } else { + (self.progress_callback)(QuantizeProgress::TensorSkipped { + name: tensor_name, + size: raw_data.len(), + }); + self.total_size_new += raw_data.len(); + (tensor.element_type, raw_data) + }; + + Ok(TensorData { + n_dims: tensor.n_dims, + dims: tensor.dims, + element_type, + data, + }) + } +}