diff --git a/nemo_aligner/utils/trt_llm.py b/nemo_aligner/utils/trt_llm.py
index 1f879064d..9f1a9345f 100644
--- a/nemo_aligner/utils/trt_llm.py
+++ b/nemo_aligner/utils/trt_llm.py
@@ -44,8 +44,9 @@ def append_and_repad_list(list_of_items, item_to_append, pad_id):
 
 
 class GPTGenerateTRTLLM:
-    # If a tokenizer does not have a pad_id, we use a large negative number and replace
-    # with self.eos_id after generation.
+    # Use a reserved negative number since tokenizers vary: they may (1) have a pad_id,
+    # (2) not have a pad_id, or (3) have None as the pad_id.
+    # This pad_id is replaced with eos_id after generation.
     DEFAULT_PAD_ID = -42
 
     def __init__(
@@ -72,12 +73,6 @@ def __init__(
                 "You are trying to use NeMo-Aligner's TensorRT-LLM acceleration for LLM generation. Please build the dockerfile to enable this feature: https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile"
             )
 
-        # If this assert turns out to be a blocker with some tokenizers, potential workarounds could be to:
-        #  - add a config option to allow specifying which token we pass as `end_id` to TRT-LLM (should
-        #    be a token that the model is guaranteed to never generate)
-        assert (
-            tokenizer.pad_id != tokenizer.eos_id
-        ), f"We require tokenizers to have a different {tokenizer.pad_id=} than {tokenizer.eos_id=} when using TRT-LLM. This is to make sure all code goes into the same path and include the eos_id when the response lengths are computed"
         assert max_input_len > 0
         assert max_generation_length > 0
         assert (
@@ -104,7 +99,7 @@ def __init__(
         rng_generator.manual_seed(seed)
         self.rng_generator = rng_generator
 
-        self.pad_id = tokenizer.pad_id if tokenizer.pad_id is not None else GPTGenerateTRTLLM.DEFAULT_PAD_ID
+        self.pad_id = GPTGenerateTRTLLM.DEFAULT_PAD_ID
        self.eos_id = tokenizer.eos_id
 
         end_strings = list(end_strings)
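
The updated comment only describes the sentinel mechanism in passing. Below is a minimal sketch of the post-generation replacement it refers to; this is not the actual NeMo-Aligner code, and the helper name `replace_pad_with_eos` and the example token ids are assumptions for illustration only.

```python
import torch

DEFAULT_PAD_ID = -42  # reserved sentinel, assumed never produced by the model or any tokenizer


def replace_pad_with_eos(output_ids: torch.Tensor, eos_id: int) -> torch.Tensor:
    """Illustrative helper: swap the sentinel pad id for eos_id after generation."""
    return torch.where(
        output_ids == DEFAULT_PAD_ID,
        torch.full_like(output_ids, eos_id),
        output_ids,
    )


# Example: a generated batch padded with the sentinel, cleaned up with eos_id = 2.
output_ids = torch.tensor([[5, 7, 2, DEFAULT_PAD_ID, DEFAULT_PAD_ID]])
print(replace_pad_with_eos(output_ids, eos_id=2))  # tensor([[5, 7, 2, 2, 2]])
```

Padding with a sentinel that no tokenizer can emit keeps this replacement unambiguous regardless of whether the tokenizer defines a pad_id, which is what lets the constructor drop the special-casing on `tokenizer.pad_id`.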