From cd7720720fcc148c81ff201a820ea9260d1c8b66 Mon Sep 17 00:00:00 2001
From: Baber
Date: Wed, 2 Oct 2024 01:59:50 +0500
Subject: [PATCH] tokenizer: trust-remote-code

---
 lm_eval/models/api_models.py | 15 ++++++++++++---
 1 file changed, 12 insertions(+), 3 deletions(-)

diff --git a/lm_eval/models/api_models.py b/lm_eval/models/api_models.py
index 7098893947..9abc6de98f 100644
--- a/lm_eval/models/api_models.py
+++ b/lm_eval/models/api_models.py
@@ -73,9 +73,12 @@ def __init__(
         seed: int = 1234,
         max_length: Optional[int] = 2048,
         add_bos_token: bool = False,
-        custom_prefix_token_id=None,
+        custom_prefix_token_id: Optional[int] = None,
         # send the requests as tokens or strings
-        tokenized_requests=True,
+        tokenized_requests: bool = True,
+        trust_remote_code: bool = False,
+        revision: Optional[str] = "main",
+        use_fast_tokenizer: bool = True,
         **kwargs,
     ) -> None:
         super().__init__()
@@ -128,7 +131,10 @@ def __init__(
             import transformers
 
             self.tokenizer = transformers.AutoTokenizer.from_pretrained(
-                self.tokenizer if self.tokenizer else self.model
+                self.tokenizer if self.tokenizer else self.model,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                use_fast=use_fast_tokenizer,
             )
             # Not used as the API will handle padding but to mirror the behavior of the HFLM
             self.tokenizer = configure_pad_token(self.tokenizer)
@@ -153,6 +159,9 @@ def __init__(
             assert isinstance(tokenizer, str), "tokenizer must be a string"
             self.tokenizer = transformers.AutoTokenizer.from_pretrained(
                 tokenizer,
+                trust_remote_code=trust_remote_code,
+                revision=revision,
+                use_fast=use_fast_tokenizer,
             )
 
     @abc.abstractmethod