From 79630dd561d19e95fb037978c0c652b5ccefc18a Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Mon, 11 Nov 2024 17:45:35 -0800
Subject: [PATCH 1/5] feat: TRTLLM API handle tokenizers without pad_id (e.g.,
 tiktoken)

Signed-off-by: Terry Kong <terryk@nvidia.com>
---
 nemo_aligner/utils/trt_llm.py | 22 +++++++++++++++-------
 1 file changed, 15 insertions(+), 7 deletions(-)

diff --git a/nemo_aligner/utils/trt_llm.py b/nemo_aligner/utils/trt_llm.py
index 1f879064d..70e85daea 100644
--- a/nemo_aligner/utils/trt_llm.py
+++ b/nemo_aligner/utils/trt_llm.py
@@ -72,12 +72,6 @@ def __init__(
                 "You are trying to use NeMo-Aligner's TensorRT-LLM acceleration for LLM generation. Please build the dockerfile to enable this feature: https://github.com/NVIDIA/NeMo-Aligner/blob/main/Dockerfile"
             )
 
-        # If this assert turns out to be a blocker with some tokenizers, potential workarounds could be to:
-        #   - add a config option to allow specifying which token we pass as `end_id` to TRT-LLM (should
-        #     be a token that the model is guaranteed to never generate)
-        assert (
-            tokenizer.pad_id != tokenizer.eos_id
-        ), f"We require tokenizers to have a different {tokenizer.pad_id=} than {tokenizer.eos_id=} when using TRT-LLM. This is to make sure all code goes into the same path and include the eos_id when the response lengths are computed"
         assert max_input_len > 0
         assert max_generation_length > 0
         assert (
@@ -104,7 +98,21 @@ def __init__(
         rng_generator.manual_seed(seed)
         self.rng_generator = rng_generator
 
-        self.pad_id = tokenizer.pad_id if tokenizer.pad_id is not None else GPTGenerateTRTLLM.DEFAULT_PAD_ID
+        if hasattr(tokenizer, 'pad_id'):
+            # If this assert turns out to be a blocker with some tokenizers, potential workarounds could be to:
+            #   - add a config option to allow specifying which token we pass as `end_id` to TRT-LLM (should
+            #     be a token that the model is guaranteed to never generate)
+            assert (
+                tokenizer.pad_id != tokenizer.eos_id
+            ), (
+                f"We require tokenizers to have a different {tokenizer.pad_id=} than {tokenizer.eos_id=} "
+                "when using TRT-LLM. This is to make sure all code goes into the same path and include the "
+                "eos_id when the response lengths are computed"
+            )
+            self.pad_id = getattr(tokenizer, 'pad_id', GPTGenerateTRTLLM.DEFAULT_PAD_ID)
+        else:
+            # Tiktoken tokenizers doesn't have pad_id
+            self.pad_id = GPTGenerateTRTLLM.DEFAULT_PAD_ID
         self.eos_id = tokenizer.eos_id
         end_strings = list(end_strings)
 

From 72ad72b9802d2591dd80f40f02fdd474e5ef2c51 Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Tue, 12 Nov 2024 03:45:14 +0000
Subject: [PATCH 2/5] ci: adds cherry-pick to enable GPT TRTLLM version with
 bias=False

Signed-off-by: Terry Kong <terryk@nvidia.com>
---
 Dockerfile | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Dockerfile b/Dockerfile
index 44a9f8651..0b99d8d69 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -130,10 +130,12 @@ git fetch -a
 # 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
 # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
 # (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
+# 80ff72cfedf77f68272f35773579d9d71e5ed10c: fix(export): GPT models w/ bias=False convert properly NeMo#11255
 for pr_and_commit in \
   "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
   "10652 60e677423667c029dd05875da72bf0719774f844" \
   "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
+  "11255 80ff72cfedf77f68272f35773579d9d71e5ed10c" \
 ; do
   pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
   head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")

From 3dd6eb57fc4d54dda6e0882469785161422a89b3 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 12 Nov 2024 03:58:19 +0000
Subject: [PATCH 3/5] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

Signed-off-by: NeMo-Aligner CI <nemo-aligner-ci@nvidia.com>
---
 nemo_aligner/utils/trt_llm.py | 8 +++-----
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/nemo_aligner/utils/trt_llm.py b/nemo_aligner/utils/trt_llm.py
index 70e85daea..30facee3d 100644
--- a/nemo_aligner/utils/trt_llm.py
+++ b/nemo_aligner/utils/trt_llm.py
@@ -98,18 +98,16 @@ def __init__(
         rng_generator.manual_seed(seed)
         self.rng_generator = rng_generator
 
-        if hasattr(tokenizer, 'pad_id'):
+        if hasattr(tokenizer, "pad_id"):
             # If this assert turns out to be a blocker with some tokenizers, potential workarounds could be to:
             #   - add a config option to allow specifying which token we pass as `end_id` to TRT-LLM (should
             #     be a token that the model is guaranteed to never generate)
-            assert (
-                tokenizer.pad_id != tokenizer.eos_id
-            ), (
+            assert tokenizer.pad_id != tokenizer.eos_id, (
                 f"We require tokenizers to have a different {tokenizer.pad_id=} than {tokenizer.eos_id=} "
                 "when using TRT-LLM. This is to make sure all code goes into the same path and include the "
                 "eos_id when the response lengths are computed"
             )
-            self.pad_id = getattr(tokenizer, 'pad_id', GPTGenerateTRTLLM.DEFAULT_PAD_ID)
+            self.pad_id = getattr(tokenizer, "pad_id", GPTGenerateTRTLLM.DEFAULT_PAD_ID)
         else:
             # Tiktoken tokenizers doesn't have pad_id
             self.pad_id = GPTGenerateTRTLLM.DEFAULT_PAD_ID

From 83f1c0f1e37eee75cad610d0f7306f10c8292fd3 Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Wed, 13 Nov 2024 23:26:12 +0000
Subject: [PATCH 4/5] go with pad_id=-42 all the time

Signed-off-by: Terry Kong <terryk@nvidia.com>
---
 nemo_aligner/utils/trt_llm.py | 19 ++++---------------
 1 file changed, 4 insertions(+), 15 deletions(-)

diff --git a/nemo_aligner/utils/trt_llm.py b/nemo_aligner/utils/trt_llm.py
index 30facee3d..9f1a9345f 100644
--- a/nemo_aligner/utils/trt_llm.py
+++ b/nemo_aligner/utils/trt_llm.py
@@ -44,8 +44,9 @@ def append_and_repad_list(list_of_items, item_to_append, pad_id):
 
 
 class GPTGenerateTRTLLM:
-    # If a tokenizer does not have a pad_id, we use a large negative number and replace
-    # with self.eos_id after generation.
+    # Use a reserved negative number since there is variation between tokenizers if
+    #  they (1) have a pad_id (2) don't have a pad_id or (3) have None as the pad_id.
+    #  This pad_id is replaced with eos_id after generation.
     DEFAULT_PAD_ID = -42
 
     def __init__(
@@ -98,19 +99,7 @@ def __init__(
         rng_generator.manual_seed(seed)
         self.rng_generator = rng_generator
 
-        if hasattr(tokenizer, "pad_id"):
-            # If this assert turns out to be a blocker with some tokenizers, potential workarounds could be to:
-            #   - add a config option to allow specifying which token we pass as `end_id` to TRT-LLM (should
-            #     be a token that the model is guaranteed to never generate)
-            assert tokenizer.pad_id != tokenizer.eos_id, (
-                f"We require tokenizers to have a different {tokenizer.pad_id=} than {tokenizer.eos_id=} "
-                "when using TRT-LLM. This is to make sure all code goes into the same path and include the "
-                "eos_id when the response lengths are computed"
-            )
-            self.pad_id = getattr(tokenizer, "pad_id", GPTGenerateTRTLLM.DEFAULT_PAD_ID)
-        else:
-            # Tiktoken tokenizers doesn't have pad_id
-            self.pad_id = GPTGenerateTRTLLM.DEFAULT_PAD_ID
+        self.pad_id = GPTGenerateTRTLLM.DEFAULT_PAD_ID
         self.eos_id = tokenizer.eos_id
         end_strings = list(end_strings)
 

From bd9fa0b36be41f99fe87afb468126eba4cd7c49e Mon Sep 17 00:00:00 2001
From: Terry Kong <terryk@nvidia.com>
Date: Fri, 15 Nov 2024 15:54:21 -0800
Subject: [PATCH 5/5] revert the dockerfile change

Signed-off-by: Terry Kong <terryk@nvidia.com>
---
 Dockerfile | 2 --
 1 file changed, 2 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 0b99d8d69..44a9f8651 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -130,12 +130,10 @@ git fetch -a
 # 60e677423667c029dd05875da72bf0719774f844: [feat] Update get_model_parallel_src_rank to support tp-pp-dp ordering NeMo#10652
 # 0deaf6716cb4f20766c995ce25d129795f1ae200: fix[export]: update API for disabling device reassignment in TRTLLM for Aligner NeMo#10863
 # (superceded by 10863) 148543d6e9c66ff1f8562e84484448202249811d: feat: Migrate GPTSession refit path in Nemo export to ModelRunner for Aligner NeMo#10654
-# 80ff72cfedf77f68272f35773579d9d71e5ed10c: fix(export): GPT models w/ bias=False convert properly NeMo#11255
 for pr_and_commit in \
   "10651 0c92fe17df4642ffc33d5d8c0c83fda729e3910c" \
   "10652 60e677423667c029dd05875da72bf0719774f844" \
   "10863 0deaf6716cb4f20766c995ce25d129795f1ae200" \
-  "11255 80ff72cfedf77f68272f35773579d9d71e5ed10c" \
 ; do
   pr=$(cut -f1 -d' ' <<<"$pr_and_commit")
   head_pr_commit=$(cut -f2 -d' ' <<<"$pr_and_commit")