rustformers · philpax · Apr 22, 2023 · Apr 6, 2023 · Apr 6, 2023 · Apr 6, 2023
diff --git a/llama-rs/src/loader.rs b/llama-rs/src/loader.rs
@@ -84,37 +84,25 @@ pub(crate) fn load(
     // Load vocabulary
     // ===============
     let vocabulary = {
-        let mut id_to_token = vec![];
-        let mut id_to_token_score = vec![];
-        let mut token_to_id = HashMap::new();
-        let mut max_token_length = 0;
+        let mut vocab = Vocabulary::default();
 
         for i in 0..hparams.n_vocab {
             let len = read_i32(&mut reader)?;
+            let id = TokenId::try_from(i)?; // checked: `as` would silently truncate
             let token = read_bytes_with_len(&mut reader, len.try_into()?)?;
-            max_token_length = max_token_length.max(token.len());
-            id_to_token.push(token.clone());
-            token_to_id.insert(token, TokenId::try_from(i)?);
-
-            // Token score, currently unused
-            match model_type {
-                ContainerType::GGMF | ContainerType::GGJT => {
-                    let score = read_f32(&mut reader)?;
-                    id_to_token_score.push(score);
-                }
+
+            let score = match model_type {
+                ContainerType::GGMF | ContainerType::GGJT => read_f32(&mut reader)?,
                 ContainerType::GGML => {
                     // Legacy model, set empty score
-                    id_to_token_score.push(0.);
+                    0.
                 }
-            }
-        }
+            };
 
-        Vocabulary {
-            id_to_token,
-            id_to_token_score,
-            token_to_id,
-            max_token_length,
+            vocab.push_token(id, token, score)?;
         }
+
+        vocab
     };
 
     // for the big tensors, we have the option to store the data in 16-bit

diff --git a/llama-rs/src/vocabulary.rs b/llama-rs/src/vocabulary.rs
@@ -37,12 +37,6 @@ pub enum AddTokenError {
         /// The actual ID.
         actual_id: TokenId,
     },
-    #[error("a token with the same id already exists, id={id}")]
-    /// A token with the same ID was already added.
-    AlreadyAdded {
-        /// The ID of the token that was already added.
-        id: TokenId,
-    },
 }
 
 impl Vocabulary {
@@ -65,9 +59,7 @@ impl Vocabulary {
         self.max_token_length = self.max_token_length.max(content.len());
         self.id_to_token.push(content.clone());
         self.id_to_token_score.push(score);
-        if self.token_to_id.insert(content, id).is_some() {
-            return Err(AddTokenError::AlreadyAdded { id });
-        }
+        self.token_to_id.insert(content, id);
         Ok(())
     }