From 92eca29c9eca469ebeb3e4d39a56c3bd921619d3 Mon Sep 17 00:00:00 2001
From: mathemakitten <helen.ngo14@gmail.com>
Date: Mon, 31 Oct 2022 11:19:48 -0400
Subject: [PATCH 1/3] Stop using model-defined truncation

---
 measurements/perplexity/perplexity.py | 8 ++++----
 metrics/perplexity/perplexity.py      | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/measurements/perplexity/perplexity.py b/measurements/perplexity/perplexity.py
index 0a9289641..0f1973d2e 100644
--- a/measurements/perplexity/perplexity.py
+++ b/measurements/perplexity/perplexity.py
@@ -100,7 +100,7 @@ def _info(self):
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )
 
-    def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):
+    def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None):
 
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
@@ -131,15 +131,15 @@ def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool =
             assert (
                 tokenizer.bos_token is not None
             ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
-            max_tokenized_len = model.config.max_length - 1
+            max_tokenized_len = max_length - 1
         else:
-            max_tokenized_len = model.config.max_length
+            max_tokenized_len = max_length
 
         encodings = tokenizer(
             data,
             add_special_tokens=False,
             padding=True,
-            truncation=True,
+            truncation=True if max_tokenized_len else False,
             max_length=max_tokenized_len,
             return_tensors="pt",
             return_attention_mask=True,
diff --git a/metrics/perplexity/perplexity.py b/metrics/perplexity/perplexity.py
index 939d7fbbd..79f44fb29 100644
--- a/metrics/perplexity/perplexity.py
+++ b/metrics/perplexity/perplexity.py
@@ -100,7 +100,7 @@ def _info(self):
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )
 
-    def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None):
+    def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None):
 
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
@@ -131,15 +131,15 @@ def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token:
             assert (
                 tokenizer.bos_token is not None
             ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False"
-            max_tokenized_len = model.config.max_length - 1
+            max_tokenized_len = max_length - 1
         else:
-            max_tokenized_len = model.config.max_length
+            max_tokenized_len = max_length
 
         encodings = tokenizer(
             predictions,
             add_special_tokens=False,
             padding=True,
-            truncation=True,
+            truncation=True if max_tokenized_len else False,
             max_length=max_tokenized_len,
             return_tensors="pt",
             return_attention_mask=True,

From 111609fcd402ddc439284c5f0e3388990be38c98 Mon Sep 17 00:00:00 2001
From: mathemakitten <helen.ngo14@gmail.com>
Date: Mon, 31 Oct 2022 11:21:09 -0400
Subject: [PATCH 2/3] Formatting

---
 measurements/perplexity/perplexity.py | 4 +++-
 metrics/perplexity/perplexity.py      | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/measurements/perplexity/perplexity.py b/measurements/perplexity/perplexity.py
index 0f1973d2e..81ec05c72 100644
--- a/measurements/perplexity/perplexity.py
+++ b/measurements/perplexity/perplexity.py
@@ -100,7 +100,9 @@ def _info(self):
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )
 
-    def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None):
+    def _compute(
+        self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
+    ):
 
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."
diff --git a/metrics/perplexity/perplexity.py b/metrics/perplexity/perplexity.py
index 79f44fb29..85036a2ce 100644
--- a/metrics/perplexity/perplexity.py
+++ b/metrics/perplexity/perplexity.py
@@ -100,7 +100,9 @@ def _info(self):
             reference_urls=["https://huggingface.co/docs/transformers/perplexity"],
         )
 
-    def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None):
+    def _compute(
+        self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None
+    ):
 
         if device is not None:
             assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu."

From 99ddb79ba8ce8829d49eca07fa488be621e3bab1 Mon Sep 17 00:00:00 2001
From: mathemakitten <helen.ngo14@gmail.com>
Date: Mon, 31 Oct 2022 12:50:47 -0400
Subject: [PATCH 3/3] Reserve room for the BOS token only when max_length is set

---
 measurements/perplexity/perplexity.py | 2 +-
 metrics/perplexity/perplexity.py      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/measurements/perplexity/perplexity.py b/measurements/perplexity/perplexity.py
index 81ec05c72..85f82142a 100644
--- a/measurements/perplexity/perplexity.py
+++ b/measurements/perplexity/perplexity.py
@@ -128,7 +128,7 @@ def _compute(
             # assign one of the special tokens to also be the pad token
             tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
 
-        if add_start_token:
+        if add_start_token and max_length:
             # leave room for <BOS> token to be added:
             assert (
                 tokenizer.bos_token is not None
diff --git a/metrics/perplexity/perplexity.py b/metrics/perplexity/perplexity.py
index 85036a2ce..ad307e8ad 100644
--- a/metrics/perplexity/perplexity.py
+++ b/metrics/perplexity/perplexity.py
@@ -128,7 +128,7 @@ def _compute(
             # assign one of the special tokens to also be the pad token
             tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]})
 
-        if add_start_token:
+        if add_start_token and max_length:
             # leave room for <BOS> token to be added:
             assert (
                 tokenizer.bos_token is not None