diff --git a/README.md b/README.md
index 33d7216..e160ccc 100644
--- a/README.md
+++ b/README.md
@@ -205,8 +205,6 @@ We use the EleutherAI evaluation harness to evaluate our model accuracy. To eval
 python eval.py --checkpoint_path checkpoints/$MODEL_REPO/model.pth --compile --tasks hellaswag winogrande
 ```
 
-Note: Generative tasks are currently not supported for gpt-fast
-
 Installation Instructions for the evaluation harness: https://github.com/EleutherAI/lm-evaluation-harness/tree/master#install
 
 ### GPTQ
diff --git a/eval.py b/eval.py
index 712ee18..613da2f 100644
--- a/eval.py
+++ b/eval.py
@@ -6,11 +6,12 @@ import sys
 import time
 from pathlib import Path
-from typing import Optional
+from typing import List, Optional, Tuple
 
 import torch
 import torch._dynamo.config
 import torch._inductor.config
+from torch.nn.utils.rnn import pad_sequence
 
 torch._dynamo.config.automatic_dynamic_shapes = True
 torch._inductor.config.triton.unique_kernel_names = True
@@ -21,6 +22,7 @@ from tokenizer import get_tokenizer
 
 from model import Transformer
+from generate import generate
 
 try:
     import lm_eval
@@ -91,12 +93,14 @@ def __init__(
         model: Transformer,
         tokenizer,
         max_seq_length: Optional[int]=None,
+        batch_size: int = 1,
     ):
         super().__init__()
         self._model = model
         self._tokenizer = tokenizer
         self._device = torch.device('cuda')
         self._max_seq_length = 2048 if max_seq_length is None else max_seq_length
+        self._batch_size = batch_size
 
     @property
     def eot_token_id(self):
@@ -112,7 +116,7 @@ def max_gen_toks(self):
 
     @property
     def batch_size(self):
-        return 1
+        return self._batch_size
 
     @property
     def device(self):
@@ -127,6 +131,24 @@ def tok_encode(self, string: str, **kwargs):
         encoded = encoded.tolist()
         return encoded
 
+    def tok_batch_encode(
+        self, text: List[str], **kwargs
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        tokenized_text = [self.tok_encode(x) for x in text]
+
+        # pad left
+        x = pad_sequence(
+            [
+                torch.tensor(x[::-1]) for x in tokenized_text
+            ],  # first flip each sequence, then pad
+            batch_first=True,
+            padding_value=self._tokenizer.pad_id(),
+        ).flip(
+            dims=[1]
+        )  # flip back to the correct order
+
+        return x, torch.ones_like(x)  # return a 'mask' because the harness expects one
+
     def tok_decode(self, tokens):
         decoded = self._tokenizer.decode(tokens)
         return decoded
@@ -147,8 +169,49 @@ def _model_call(self, inps):
         logits = model_forward(self._model, x, input_pos)
         return logits
 
-    def _model_generate(self, context, max_length, eos_token_id):
-        raise Exception('unimplemented')
+    def _model_generate(self, context, max_length, stop, **generation_kwargs):
+        curr_batch_size = context.size(0)
+        assert curr_batch_size == 1, f"Currently generation only supports a batch size of 1; the provided prompt has batch size {curr_batch_size}."
+
+        # Default temperature to 0.0 if it is not set.
+        # If do_sample was not specified and temperature == 0.0, fall back to
+        # greedy decoding (do_sample=False); this matches HF semantics and
+        # avoids a sampling warning from HF.
+        generation_kwargs["temperature"] = generation_kwargs.get("temperature", 0.0)
+        do_sample = generation_kwargs.get("do_sample", None)
+
+        # The temperature has to be a strictly positive float -- if it is 0.0, use greedy decoding
+        if generation_kwargs.get("temperature") == 0.0 and do_sample is None:
+            generation_kwargs["do_sample"] = do_sample = False
+
+        # TODO: handle top_p and top_k
+
+        # Set up caches for the given batch size.
+        # Technically this is not necessary, but it is a good way to ensure that
+        # the caches won't error on a different batch size. In addition, caches
+        # are not needed for a regular model call, so we only set them up here.
+        # TODO: call setup_cache_padded_seq_input_pos_max_seq_length_for_prefill() instead
+        with context.device:
+            self._model.setup_caches(max_batch_size=curr_batch_size, max_seq_length=max_length)
+
+        # TODO: generate() currently assumes a 1D tensor with batch size 1; update it to accept a 2D tensor.
+        context = context.flatten(0)
+
+        toks, accept_counts = generate(
+            self._model,
+            context,
+            max_new_tokens=self.max_gen_toks,
+            interactive=False,
+            draft_model=None,
+            temperature=generation_kwargs["temperature"],
+            # top_k=None,  # do_sample is not supported currently
+            # stop_tokens=self._tokenizer.stop_tokens,
+        )
+
+        # TODO: generate() returns a 1D tensor with batch size 1; update it to return a 2D tensor.
+        toks = toks.unsqueeze(0)
+
+        return torch.tensor(toks, dtype=torch.int32)
 
 
 @torch.no_grad()
diff --git a/tokenizer.py b/tokenizer.py
index f60b3c1..519d34c 100644
--- a/tokenizer.py
+++ b/tokenizer.py
@@ -21,6 +21,9 @@ def bos_id(self):
     def eos_id(self):
         raise NotImplementedError("This method should be overridden by subclasses.")
 
+    def pad_id(self):
+        raise NotImplementedError("This method should be overridden by subclasses.")
+
 class SentencePieceWrapper(TokenizerInterface):
     def __init__(self, model_path):
         super().__init__(model_path)
@@ -38,6 +41,10 @@ def bos_id(self):
     def eos_id(self):
         return self.processor.eos_id()
 
+    def pad_id(self):
+        # TODO: handle models that do have a dedicated pad_id
+        return self.processor.eos_id()
+
 class TiktokenWrapper(TokenizerInterface):
     """
     Tokenizing and encoding/decoding text using the Tiktoken tokenizer.
@@ -94,6 +101,10 @@ def bos_id(self):
     def eos_id(self):
         return self._eos_id
 
+    def pad_id(self):
+        # TODO: handle models that do have a dedicated pad_id
+        return self._eos_id
+
 def get_tokenizer(tokenizer_model_path, model_name):
     """
     Factory function to get the appropriate tokenizer based on the model name.
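For reference, the left-padding scheme added in `tok_batch_encode` above (reverse each sequence, right-pad with `pad_sequence`, then flip back so the padding lands on the left) can be exercised in isolation. The sketch below is not part of the patch: the token IDs and the `PAD_ID` value are made up purely for illustration, whereas the wrapper itself falls back to the tokenizer's `eos_id()` as the padding value.

```python
# Minimal sketch of the left-padding trick used by tok_batch_encode.
import torch
from torch.nn.utils.rnn import pad_sequence

PAD_ID = 0  # hypothetical pad id; the patch uses tokenizer.pad_id(), which falls back to eos_id()

# two "tokenized" prompts of different lengths (made-up token ids)
seqs = [[5, 6, 7, 8], [9, 10]]

x = pad_sequence(
    [torch.tensor(s[::-1]) for s in seqs],  # reverse each sequence so padding is appended at the end
    batch_first=True,
    padding_value=PAD_ID,
).flip(dims=[1])  # flip back to restore the original token order; padding is now on the left

print(x)
# tensor([[ 5,  6,  7,  8],
#         [ 0,  0,  9, 10]])
```

Left padding keeps every prompt's final token in the last column of the batch, so a single decoding position is valid for all rows when generation starts.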