mlfoundations · jmercat · Jul 24, 2024 · Jul 24, 2024 · Jul 31, 2024
diff --git a/eval/eval_openlm_ckpt.py b/eval/eval_openlm_ckpt.py
@@ -46,8 +46,9 @@ def evaluate(model, tokenizer, cfg):
 
     composer_model = SimpleComposerOpenLMCausalLM(model, tokenizer)
 
+    cfg_icl_tasks = [dict(i) for i in cfg.icl_tasks]
     evaluators, logger_keys = build_icl_evaluators(
-        cfg.icl_tasks, tokenizer, cfg.max_seq_len, cfg.device_eval_batch_size
+        cfg_icl_tasks, tokenizer, cfg.max_seq_len, cfg.device_eval_batch_size
     )
 
     in_memory_logger = InMemoryLogger()  # track metrics in the in_memory_logger
@@ -123,6 +124,7 @@ def main():
 
     state_dict = checkpoint["state_dict"]
     state_dict = {x.replace("module.", ""): y for x, y in state_dict.items()}
+    state_dict = {x.replace("_orig_mod.", ""): y for x, y in state_dict.items()}
     open_lm.model.load_state_dict(state_dict)
     open_lm.model.eval()
 

diff --git a/open_lm/model_configs/open_lm_1b_swiglutorch.json b/open_lm/model_configs/open_lm_1b_swiglutorch.json
@@ -0,0 +1,14 @@
+{
+    "hidden_dim": 2048,
+    "n_layers": 24,
+    "n_heads": 16,
+    "seq_len": 2048,
+    "vocab_size": 50432,
+    "post_embed_norm": false,
+    "weight_tying": false,
+    "model_norm": "gain_only_lp_layer_norm",
+    "norm_type": "gain_only_lp_layer_norm",
+    "ffn_type": "swiglu_torch",
+    "qk_norm": true,
+    "positional_embedding_type": "rotary"
+}
diff --git a/open_lm/utils/llm_foundry_wrapper.py b/open_lm/utils/llm_foundry_wrapper.py
@@ -3,7 +3,7 @@
 
 """Implements a Hugging Causal LM wrapped inside a :class:`.ComposerModel`."""
 
-from typing import Mapping, Union
+from typing import Union
 from llmfoundry.eval.metrics.nlp import (
     InContextLearningLMAccuracy,
     InContextLearningLMExpectedCalibrationError,

diff --git a/open_lm/utils/transformers/hf_model.py b/open_lm/utils/transformers/hf_model.py
@@ -98,13 +98,28 @@ def forward(
         ```"""
         assert position_ids is None, "Position IDs are not supported"
         # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+
+        # If all rows of input_ids are 0 past some column, trim those trailing columns before the model call
+        num_zeros = 0
+        if input_ids is not None:
+            # Find the largest column index that holds a non-zero element in any row of the batch
+            last_nonzero_indices = torch.max(torch.nonzero(input_ids, as_tuple=True)[1])
+            if last_nonzero_indices < input_ids.shape[1] - 1:
+                num_zeros = input_ids.shape[1] - last_nonzero_indices - 1
+                input_ids = input_ids[:, : last_nonzero_indices + 1]
+                if attention_mask is not None:
+                    attention_mask = attention_mask[:, : last_nonzero_indices + 1]
+                if labels is not None:
+                    labels = labels[:, : last_nonzero_indices + 1]
+
         logits, _, past_key_values = self.model(
             input_ids=input_ids,
             inputs_embeds=inputs_embeds,
             past_key_values=past_key_values,
             use_cache=use_cache,
             attention_mask=attention_mask,
         )
+
         loss = None
         if labels is not None:
             shift_logits = logits[..., :-1, :].contiguous()
@@ -115,6 +130,13 @@ def forward(
             loss = loss_fct(shift_logits, shift_labels)
 
         output = CausalLMOutputWithPast(logits=logits, past_key_values=past_key_values, loss=loss)
+
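+        # Re-pad logits to the original sequence length: each trimmed position gets a placeholder row whose only non-zero logit is at index 0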
+        if num_zeros:
+            fake_logits = torch.zeros(logits.shape[0], num_zeros, logits.shape[2], device=logits.device)
+            fake_logits[:, :, 0] = 1
+            output.logits = torch.cat([output.logits, fake_logits], dim=1)
+
         return output
 
     def prepare_inputs_for_generation(