This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Standalone loader #125

Merged: 46 commits, Apr 22, 2023

Commits
bdbea68
Add loader stub for GGJT
iacore Apr 6, 2023
b0a666f
Add loading code for ggjt
iacore Apr 6, 2023
9eefdc5
code cleanup that doesn't change anything
iacore Apr 6, 2023
c212c53
more code cleanup
iacore Apr 6, 2023
bfaec3a
minor change
iacore Apr 7, 2023
b6044ee
Add non-mmap loader for GGJT
iacore Apr 7, 2023
1872dda
Prefer traits in loader.rs
iacore Apr 7, 2023
ec1fca7
cargo fmt
iacore Apr 7, 2023
cc846ae
cargo clippy --fix
iacore Apr 7, 2023
bf847dd
Remove ggml::Tensor::set_data
iacore Apr 7, 2023
ea7094c
fix(llama): buffer tokens until valid UTF-8
philpax Apr 7, 2023
c848d5e
Add standalone loader
iacore Apr 8, 2023
8390593
Move loader to standalone crate llama-loader
iacore Apr 8, 2023
15fe19b
[llama-loader] Support non-copy loader
iacore Apr 8, 2023
2e9311d
Use functions from the new crate
iacore Apr 8, 2023
4dd0fc5
Merge branch 'main' into llama-loader
philpax Apr 13, 2023
c40e36e
Merge branch 'main' of github.com:rustformers/llama-rs into llama-loader
philpax Apr 13, 2023
34429e0
refactor(llama): pass mut tensors down
philpax Apr 13, 2023
38e7d58
feat/loader Make hparams configurable
iacore Apr 14, 2023
5dfc55d
feat/loader Add hook to support multi-part model loading
iacore Apr 14, 2023
48efd74
rename llama-loader to ggml-loader
iacore Apr 14, 2023
0fbbedd
Merge branch 'main' into llama-loader
philpax Apr 19, 2023
d65996d
fix
jon-chuang Apr 12, 2023
267d8ae
no_alloc
jon-chuang Apr 12, 2023
81a6979
chore: fix clippy
philpax Apr 19, 2023
80d189e
refactor(util): make find_all_model_files error
philpax Apr 19, 2023
85e1148
UnsupportedElementtype -> UnsupportedElementType
philpax Apr 19, 2023
3f29992
feat: experimental loader2 wire-up (incomplete)
philpax Apr 19, 2023
94951c4
fix dead doc link
philpax Apr 19, 2023
69f355b
feat: turn mmap on by default, add --no-mmap
philpax Apr 19, 2023
17bc0cc
Fix loading GGJT
iacore Apr 20, 2023
6641ae9
minor fix
iacore Apr 20, 2023
3910b6a
Add mmap
iacore Apr 20, 2023
e4834bd
cargo fmt
iacore Apr 20, 2023
c380cee
Make loader2 default
iacore Apr 20, 2023
5b9788b
fix: remove dbg!(start_pos)
philpax Apr 22, 2023
cbf0756
fix: respect --no-mmap
philpax Apr 22, 2023
8813b0f
Merge branch 'main' of github.com:rustformers/llama-rs into llama-loader
philpax Apr 22, 2023
430abfe
chore: remove old comments
philpax Apr 22, 2023
bf6a917
chore: remove unused error case
philpax Apr 22, 2023
9b908ae
fix: remove some panics
philpax Apr 22, 2023
d8c4ca6
feat: remove AlreadyAdded error
philpax Apr 22, 2023
cabc4c9
minor fix
iacore Apr 22, 2023
1930496
fix: Vocabulary::push_token is infallible
philpax Apr 22, 2023
bdb9856
fix: bail on multipart models with loader2
philpax Apr 22, 2023
b41fe14
refactor: make Vocabulary::push_token pub(crate)
philpax Apr 22, 2023
Files changed (diff shown from 2 of the 46 commits)

1 change: 0 additions & 1 deletion Cargo.lock


19 changes: 5 additions & 14 deletions ggml-loader/src/lib.rs
@@ -96,12 +96,6 @@ pub trait LoadHandler<T, R: BufRead + Seek> {
 
     fn load_hyper_parameters(&mut self, reader: &mut R) -> ControlFlow<T, PartialHyperparameters>;
 
-    /// multi-file loading is not supported
-    /// To handle that yourself, return [`ControlFlow::Break(_)`] here
-    fn load_multipart(&mut self, reader: &mut R) -> ControlFlow<T> {
-        ControlFlow::Continue(())
-    }
-
     /// callback to get tensor buffer to populate
     ///
     /// # Returns
@@ -128,7 +122,7 @@ pub fn load_model_from_reader<T, R: BufRead + Seek>(
         ggml::FILE_MAGIC_UNVERSIONED => ContainerType::GGML,
         magic => return Err(LoadError::InvalidMagic(magic)),
     };
-    retchk(handler.got_container_type(container_type))?;
+    controlflow_to_result(handler.got_container_type(container_type))?;
 
     // Load format version
     match container_type {
@@ -142,7 +136,7 @@ pub fn load_model_from_reader<T, R: BufRead + Seek>(
     }
 
     // Load hyper params
-    let hparams = retchk(handler.load_hyper_parameters(reader))?;
+    let hparams = controlflow_to_result(handler.load_hyper_parameters(reader))?;
     let n_vocab = hparams.n_vocab;
 
     // Load vocabulary
@@ -156,15 +150,12 @@ pub fn load_model_from_reader<T, R: BufRead + Seek>(
                 0.
             }
         };
-        retchk(handler.got_vocab_token(i, token, token_score))?;
+        controlflow_to_result(handler.got_vocab_token(i, token, token_score))?;
     }
 
     // Load tensor data
     match container_type {
-        ContainerType::GGMF | ContainerType::GGML => {
-            retchk(handler.load_multipart(reader))?;
-            load_weights(reader, handler, false)
-        }
+        ContainerType::GGMF | ContainerType::GGML => load_weights(reader, handler, false),
         ContainerType::GGJT => load_weights(reader, handler, true),
     }
 }
@@ -227,7 +218,7 @@ pub fn load_weights<T, R: BufRead + Seek>(
             start_offset: offset_aligned,
         };
 
-        match retchk(handler.tensor_buffer(tensor_info))? {
+        match controlflow_to_result(handler.tensor_buffer(tensor_info))? {
             TensorDataTreatment::CopyInto(buf) => {
                 if align {
                     reader.seek(SeekFrom::Start(offset_aligned))?;
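
The GGJT branch calls load_weights with align set to true, and the last hunk shows the loader seeking to offset_aligned before reading tensor data. A minimal, self-contained sketch of the offset rounding this implies; the 32-byte alignment used here is an assumed example for illustration, not a value taken from this diff:

use std::io::{Cursor, Read, Seek, SeekFrom};

// Round `offset` up to the next multiple of `alignment`. The real loader
// derives its alignment from the GGJT container; 32 is an assumed example.
fn align_offset(offset: u64, alignment: u64) -> u64 {
    (offset + alignment - 1) / alignment * alignment
}

fn main() -> std::io::Result<()> {
    assert_eq!(align_offset(100, 32), 128);
    assert_eq!(align_offset(128, 32), 128); // already aligned: unchanged

    // Mimic the aligned read path: seek to the aligned offset, then read.
    let mut reader = Cursor::new(vec![0u8; 256]);
    let offset_aligned = align_offset(100, 32);
    reader.seek(SeekFrom::Start(offset_aligned))?;
    let mut buf = [0u8; 16];
    reader.read_exact(&mut buf)?;
    println!("read {} bytes at offset {offset_aligned}", buf.len());
    Ok(())
}
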
4 changes: 2 additions & 2 deletions ggml-loader/src/util.rs
@@ -62,14 +62,14 @@ pub fn decode_element_type_res<T>(ftype: i32) -> Result<ElementType, LoadError<T>> {
     }
 }
 
-pub fn retchk<A, B>(x: ControlFlow<A, B>) -> Result<B, LoadError<A>> {
+pub fn controlflow_to_result<A, B>(x: ControlFlow<A, B>) -> Result<B, LoadError<A>> {
     match x {
         ControlFlow::Continue(x) => Ok(x),
         ControlFlow::Break(y) => Err(LoadError::UserInterrupted(y)),
     }
 }
 
-pub fn brkchk<A, B, C: Into<A>>(x: Result<B, C>) -> ControlFlow<A, B> {
+pub fn result_to_controlflow<A, B, C: Into<A>>(x: Result<B, C>) -> ControlFlow<A, B> {
     match x {
         Ok(x) => ControlFlow::Continue(x),
         Err(y) => ControlFlow::Break(y.into()),
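
These adapters are the seam between the handler-facing ControlFlow API and the loader's internal Result plumbing: a handler returns ControlFlow::Break to abort loading, and the loader reports that as LoadError::UserInterrupted. A self-contained sketch of the round trip, with LoadError reduced to a stand-in enum containing only the variant used here:

use std::ops::ControlFlow;

// Stand-in for the crate's LoadError<T>; only the variant these adapters
// produce is reproduced here.
#[derive(Debug)]
enum LoadError<T> {
    UserInterrupted(T),
}

fn controlflow_to_result<A, B>(x: ControlFlow<A, B>) -> Result<B, LoadError<A>> {
    match x {
        ControlFlow::Continue(x) => Ok(x),
        ControlFlow::Break(y) => Err(LoadError::UserInterrupted(y)),
    }
}

fn main() {
    // A hypothetical handler callback that asks to stop after two items.
    let mut seen = 0;
    let mut on_item = |_item: &str| -> ControlFlow<&'static str> {
        seen += 1;
        if seen > 2 {
            ControlFlow::Break("enough")
        } else {
            ControlFlow::Continue(())
        }
    };

    for item in ["a", "b", "c"] {
        if let Err(e) = controlflow_to_result(on_item(item)) {
            // Prints: loading stopped: UserInterrupted("enough")
            println!("loading stopped: {e:?}");
            return;
        }
    }
}
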
2 changes: 1 addition & 1 deletion llama-cli/src/cli_args.rs
@@ -288,7 +288,7 @@ impl ModelLoad {
                 } => {
                     let current_part = current_part + 1;
                     log::info!(
-                        "Loading model part {}/{} from '{}' (mmap: {})\n",
+                        "Loading model part {}/{} from '{}' (mmap preferred: {})\n",
                         current_part,
                         total_parts,
                         file.to_string_lossy(),
1 change: 0 additions & 1 deletion llama-rs/Cargo.toml
@@ -23,7 +23,6 @@ memmap2 = "0.5.10"
 serde_json = { version = "1.0", optional = true }
 protobuf = { version = "= 2.14.0", optional = true }
 rust_tokenizers = { version = "3.1.2", optional = true }
-log = "*"
 
 [features]
 convert = ["dep:serde_json", "dep:protobuf", "dep:rust_tokenizers"]
2 changes: 1 addition & 1 deletion llama-rs/src/lib.rs
@@ -22,7 +22,7 @@ pub use inference_session::{
 pub use loader_common::{LoadError, LoadProgress};
 pub use model::{Hyperparameters, Model};
 pub use util::TokenUtf8Buffer;
-pub use vocabulary::{AddTokenError, TokenBias, TokenId, Vocabulary};
+pub use vocabulary::{TokenBias, TokenId, Vocabulary};
 
 /// The end of text token.
 pub const EOT_TOKEN_ID: TokenId = 2; // Hardcoded (for now?)
2 changes: 1 addition & 1 deletion llama-rs/src/loader.rs
@@ -99,7 +99,7 @@ pub(crate) fn load(
                 }
             };
 
-            vocab.push_token(id, token, score)?;
+            vocab.push_token(id, token, score);
         }
 
         vocab
22 changes: 11 additions & 11 deletions llama-rs/src/loader2.rs
@@ -9,7 +9,10 @@ use std::{
     path::{Path, PathBuf},
 };
 
-use crate::{util::mulf, Hyperparameters, LoadError, LoadProgress, Model, TokenId, Vocabulary};
+use crate::{
+    util::{self, mulf},
+    Hyperparameters, LoadError, LoadProgress, Model, TokenId, Vocabulary,
+};
 
 impl LoadError {
     fn from_ggml_loader_error(value: ggml_loader::LoadError<LoadError>, path: PathBuf) -> Self {
@@ -39,6 +42,11 @@ pub(crate) fn load(
 ) -> Result<Model, LoadError> {
     let main_path = path.as_ref();
 
+    let paths = util::find_all_model_files(main_path)?;
+    if paths.len() != 1 {
+        return Err(LoadError::MultipartNotSupported { paths });
+    }
+
     let file = File::open(main_path).map_err(|e| LoadError::OpenFileFailed {
         source: e,
         path: main_path.to_owned(),
@@ -110,24 +118,16 @@ impl<F: FnMut(LoadProgress)> ggml_loader::LoadHandler<LoadError, BufReader<&File>>
             Ok(id) => id,
             Err(err) => return ControlFlow::Break(LoadError::InvalidIntegerConversion(err)),
         };
-        if let Err(err) = self.vocab.push_token(id, token, score) {
-            return ControlFlow::Break(LoadError::from(err));
-        }
+        self.vocab.push_token(id, token, score);
 
         ControlFlow::Continue(())
     }
 
-    fn load_multipart(&mut self, _reader: &mut BufReader<&File>) -> ControlFlow<LoadError> {
-        // todo
-        log::warn!("multipart model is not supported");
-        ControlFlow::Continue(())
-    }
-
     fn tensor_buffer(&mut self, info: TensorInfo) -> ControlFlow<LoadError, TensorDataTreatment> {
         let model = match &mut self.model {
             Some(model) => model,
             None => {
-                let model = brkchk(self.create_model(self.vocab.clone()))?;
+                let model = result_to_controlflow(self.create_model(self.vocab.clone()))?;
                 self.model.insert(model)
             }
         };
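
In the tensor_buffer hunk above, the model is created lazily when the first tensor arrives, using Option::insert, which stores the value and returns a mutable reference to it. A small self-contained illustration of that idiom:

fn main() {
    // No model yet, as in the handler before the first tensor arrives.
    let mut model: Option<Vec<u32>> = None;

    // Option::insert stores Some(value) and returns a mutable reference to
    // the stored value, so work can continue on it immediately.
    let m: &mut Vec<u32> = model.insert(Vec::new());
    m.push(42);

    assert_eq!(model, Some(vec![42]));
}
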
13 changes: 9 additions & 4 deletions llama-rs/src/loader_common.rs
@@ -2,7 +2,7 @@ use std::path::{Path, PathBuf};
 
 use thiserror::Error;
 
-use crate::{util::FindAllModelFilesError, vocabulary::AddTokenError, Hyperparameters};
+use crate::{util::FindAllModelFilesError, Hyperparameters};
 
 /// Each variant represents a step within the process of loading the model.
 /// These can be used to report progress to the user.
@@ -78,9 +78,6 @@ pub enum LoadError {
     #[error("invalid integer conversion")]
     /// One of the integers encountered could not be converted to a more appropriate type.
     InvalidIntegerConversion(#[from] std::num::TryFromIntError),
-    /// While loading, a token could not be added to the vocabulary.
-    #[error("failed to add token to vocabulary: {0}")]
-    VocabularyAddTokenFailed(#[from] AddTokenError),
     #[error("unsupported f16_: {0}")]
     /// One of the integers encountered could not be converted to a more appropriate type.
     UnsupportedElementType(i32),
@@ -149,6 +146,14 @@ pub enum LoadError {
         /// The path that failed.
         path: PathBuf,
     },
+    /// Multiple parts of the model were found.
+    ///
+    /// Multi-part models are not supported. Please convert the model to a single part.
+    #[error("multipart models are not supported")]
+    MultipartNotSupported {
+        /// The paths that were found.
+        paths: Vec<PathBuf>,
+    },
 }
 impl From<FindAllModelFilesError> for LoadError {
     fn from(value: FindAllModelFilesError) -> Self {
3 changes: 3 additions & 0 deletions llama-rs/src/model.rs
@@ -124,6 +124,9 @@ impl Model {
         n_context_tokens: usize,
         load_progress_callback: impl FnMut(LoadProgress),
     ) -> Result<Model, LoadError> {
+        // Loader2 is the default. It can support GGML, GGMF and GGJT, but does not support multipart models.
+        //
+        // Loader1 is the old loader. It can support multipart models, but will be deprecated.
         let use_loader_2: bool = match std::env::var("GGML_LOADER").as_deref() {
             Ok("2") => true,
             Ok("1") => false,
32 changes: 6 additions & 26 deletions llama-rs/src/vocabulary.rs
@@ -1,7 +1,5 @@
 use std::{collections::HashMap, str::FromStr};
 
-use thiserror::Error;
-
 use crate::InferenceError;
 
 /// The identifier of a token in a vocabulary.
@@ -26,41 +24,23 @@ pub struct Vocabulary {
     pub(crate) max_token_length: usize,
 }
 
-#[derive(Debug, Clone, Error)]
-/// Errors encountered when adding a token to a vocabulary.
-pub enum AddTokenError {
-    #[error("the id of token added should be {expected_id}; is {actual_id}")]
-    /// The token that was added does not have the expected ID.
-    WrongId {
-        /// The expected ID.
-        expected_id: TokenId,
-        /// The actual ID.
-        actual_id: TokenId,
-    },
-}
-
 impl Vocabulary {
     /// Add a token to the vocabulary.
    ///
    /// The token added must have `id` directly after the last token in the vocabulary.
-    pub fn push_token(
-        &mut self,
-        id: TokenId,
-        content: Token,
-        score: TokenScore,
-    ) -> Result<(), AddTokenError> {
+    pub fn push_token(&mut self, id: TokenId, content: Token, score: TokenScore) {
+        // These are loader invariants. If this is broken, then the loader is broken and this is a bug,
+        // not an issue with the model itself.
+        assert_eq!(self.id_to_token.len(), self.id_to_token_score.len());
         if self.id_to_token.len() != id as usize || self.id_to_token_score.len() != id as usize {
-            return Err(AddTokenError::WrongId {
-                expected_id: self.id_to_token.len() as TokenId,
-                actual_id: id,
-            });
+            let expected_id = self.id_to_token.len() as TokenId;
+            panic!("the id of token added should be {expected_id}; is {id}");
         }
 
         self.max_token_length = self.max_token_length.max(content.len());
         self.id_to_token.push(content.clone());
         self.id_to_token_score.push(score);
         self.token_to_id.insert(content, id);
-        Ok(())
     }
 
     pub(crate) fn token(&self, idx: usize) -> &[u8] {
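
push_token now enforces its sequential-id invariant with a panic instead of the removed AddTokenError, since a violation indicates a loader bug rather than a bad model. A self-contained sketch of the invariant; the TokenId, Token, and TokenScore aliases are assumed representations for illustration:

use std::collections::HashMap;

type TokenId = i32;    // assumed alias
type Token = Vec<u8>;  // assumed representation
type TokenScore = f32; // assumed representation

#[derive(Default)]
struct Vocabulary {
    id_to_token: Vec<Token>,
    id_to_token_score: Vec<TokenScore>,
    token_to_id: HashMap<Token, TokenId>,
    max_token_length: usize,
}

impl Vocabulary {
    // Tokens must arrive with ids 0, 1, 2, ... in order; the loader
    // guarantees this, so a violation is a loader bug and panics.
    fn push_token(&mut self, id: TokenId, content: Token, score: TokenScore) {
        assert_eq!(self.id_to_token.len(), self.id_to_token_score.len());
        if self.id_to_token.len() != id as usize {
            let expected_id = self.id_to_token.len() as TokenId;
            panic!("the id of token added should be {expected_id}; is {id}");
        }
        self.max_token_length = self.max_token_length.max(content.len());
        self.id_to_token.push(content.clone());
        self.id_to_token_score.push(score);
        self.token_to_id.insert(content, id);
    }
}

fn main() {
    let mut vocab = Vocabulary::default();
    vocab.push_token(0, b"<unk>".to_vec(), 0.0);
    vocab.push_token(1, b"hello".to_vec(), -1.5);
    // vocab.push_token(5, b"!".to_vec(), 0.0); // would panic: expected id 2
    println!("{} tokens", vocab.id_to_token.len());
}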