Skip to content
This repository has been archived by the owner on Jun 24, 2024. It is now read-only.

Commit

Permalink
feat: remove AlreadyAdded error
Browse files (browse the repository at this point in the history)
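Apparently some models simply contain duplicate tokens in their vocabulary (/shrug), so adding a token whose content already exists is no longer treated as an error.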
  • Loading branch information
philpax committed Apr 22, 2023
1 parent 9b908ae commit 6946823
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 31 deletions.
32 changes: 10 additions & 22 deletions llama-rs/src/loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,37 +84,25 @@ pub(crate) fn load(
// Load vocabulary
// ===============
let vocabulary = {
let mut id_to_token = vec![];
let mut id_to_token_score = vec![];
let mut token_to_id = HashMap::new();
let mut max_token_length = 0;
let mut vocab = Vocabulary::default();

for i in 0..hparams.n_vocab {
let len = read_i32(&mut reader)?;
let id = i as TokenId;
let token = read_bytes_with_len(&mut reader, len.try_into()?)?;
max_token_length = max_token_length.max(token.len());
id_to_token.push(token.clone());
token_to_id.insert(token, TokenId::try_from(i)?);

// Token score, currently unused
match model_type {
ContainerType::GGMF | ContainerType::GGJT => {
let score = read_f32(&mut reader)?;
id_to_token_score.push(score);
}

let score = match model_type {
ContainerType::GGMF | ContainerType::GGJT => read_f32(&mut reader)?,
ContainerType::GGML => {
// Legacy model, set empty score
id_to_token_score.push(0.);
0.
}
}
}
};

Vocabulary {
id_to_token,
id_to_token_score,
token_to_id,
max_token_length,
vocab.push_token(id, token, score)?;
}

vocab
};

// for the big tensors, we have the option to store the data in 16-bit
Expand Down
10 changes: 1 addition & 9 deletions llama-rs/src/vocabulary.rs
Original file line number Diff line number Diff line change
Expand Up @@ -37,12 +37,6 @@ pub enum AddTokenError {
/// The actual ID.
actual_id: TokenId,
},
#[error("a token with the same id already exists, id={id}")]
/// A token with the same ID was already added.
AlreadyAdded {
/// The ID of the token that was already added.
id: TokenId,
},
}

impl Vocabulary {
Expand All @@ -65,9 +59,7 @@ impl Vocabulary {
self.max_token_length = self.max_token_length.max(content.len());
self.id_to_token.push(content.clone());
self.id_to_token_score.push(score);
if self.token_to_id.insert(content, id).is_some() {
return Err(AddTokenError::AlreadyAdded { id });
}
self.token_to_id.insert(content.clone(), id);
Ok(())
}

Expand Down

0 comments on commit 6946823

Please sign in to comment.