From 92eca29c9eca469ebeb3e4d39a56c3bd921619d3 Mon Sep 17 00:00:00 2001 From: mathemakitten Date: Mon, 31 Oct 2022 11:19:48 -0400 Subject: [PATCH 1/3] Stop using model-defined truncation --- measurements/perplexity/perplexity.py | 8 ++++---- metrics/perplexity/perplexity.py | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/measurements/perplexity/perplexity.py b/measurements/perplexity/perplexity.py index 0a9289641..0f1973d2e 100644 --- a/measurements/perplexity/perplexity.py +++ b/measurements/perplexity/perplexity.py @@ -100,7 +100,7 @@ def _info(self): reference_urls=["https://huggingface.co/docs/transformers/perplexity"], ) - def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None): + def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None): if device is not None: assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu." @@ -131,15 +131,15 @@ def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = assert ( tokenizer.bos_token is not None ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False" - max_tokenized_len = model.config.max_length - 1 + max_tokenized_len = max_length - 1 else: - max_tokenized_len = model.config.max_length + max_tokenized_len = max_length encodings = tokenizer( data, add_special_tokens=False, padding=True, - truncation=True, + truncation=True if max_tokenized_len else False, max_length=max_tokenized_len, return_tensors="pt", return_attention_mask=True, diff --git a/metrics/perplexity/perplexity.py b/metrics/perplexity/perplexity.py index 939d7fbbd..79f44fb29 100644 --- a/metrics/perplexity/perplexity.py +++ b/metrics/perplexity/perplexity.py @@ -100,7 +100,7 @@ def _info(self): reference_urls=["https://huggingface.co/docs/transformers/perplexity"], ) - def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None): + def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None): if device is not None: assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu." @@ -131,15 +131,15 @@ def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: assert ( tokenizer.bos_token is not None ), "Input model must already have a BOS token if using add_start_token=True. Please use a different model, or set add_start_token=False" - max_tokenized_len = model.config.max_length - 1 + max_tokenized_len = max_length - 1 else: - max_tokenized_len = model.config.max_length + max_tokenized_len = max_length encodings = tokenizer( predictions, add_special_tokens=False, padding=True, - truncation=True, + truncation=True if max_tokenized_len else False, max_length=max_tokenized_len, return_tensors="pt", return_attention_mask=True, From 111609fcd402ddc439284c5f0e3388990be38c98 Mon Sep 17 00:00:00 2001 From: mathemakitten Date: Mon, 31 Oct 2022 11:21:09 -0400 Subject: [PATCH 2/3] Formatting --- measurements/perplexity/perplexity.py | 4 +++- metrics/perplexity/perplexity.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/measurements/perplexity/perplexity.py b/measurements/perplexity/perplexity.py index 0f1973d2e..81ec05c72 100644 --- a/measurements/perplexity/perplexity.py +++ b/measurements/perplexity/perplexity.py @@ -100,7 +100,9 @@ def _info(self): reference_urls=["https://huggingface.co/docs/transformers/perplexity"], ) - def _compute(self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None): + def _compute( + self, data, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None + ): if device is not None: assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu." diff --git a/metrics/perplexity/perplexity.py b/metrics/perplexity/perplexity.py index 79f44fb29..85036a2ce 100644 --- a/metrics/perplexity/perplexity.py +++ b/metrics/perplexity/perplexity.py @@ -100,7 +100,9 @@ def _info(self): reference_urls=["https://huggingface.co/docs/transformers/perplexity"], ) - def _compute(self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None): + def _compute( + self, predictions, model_id, batch_size: int = 16, add_start_token: bool = True, device=None, max_length=None + ): if device is not None: assert device in ["gpu", "cpu", "cuda"], "device should be either gpu or cpu." From 99ddb79ba8ce8829d49eca07fa488be621e3bab1 Mon Sep 17 00:00:00 2001 From: mathemakitten Date: Mon, 31 Oct 2022 12:50:47 -0400 Subject: [PATCH 3/3] If start token and also max length defined --- measurements/perplexity/perplexity.py | 2 +- metrics/perplexity/perplexity.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/measurements/perplexity/perplexity.py b/measurements/perplexity/perplexity.py index 81ec05c72..85f82142a 100644 --- a/measurements/perplexity/perplexity.py +++ b/measurements/perplexity/perplexity.py @@ -128,7 +128,7 @@ def _compute( # assign one of the special tokens to also be the pad token tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]}) - if add_start_token: + if add_start_token and max_length: # leave room for token to be added: assert ( tokenizer.bos_token is not None diff --git a/metrics/perplexity/perplexity.py b/metrics/perplexity/perplexity.py index 85036a2ce..ad307e8ad 100644 --- a/metrics/perplexity/perplexity.py +++ b/metrics/perplexity/perplexity.py @@ -128,7 +128,7 @@ def _compute( # assign one of the special tokens to also be the pad token tokenizer.add_special_tokens({"pad_token": existing_special_tokens[0]}) - if add_start_token: + if add_start_token and max_length: # leave room for token to be added: assert ( tokenizer.bos_token is not None