[TokenizerFast] can_save_slow_tokenizer as a property for when `vocab_file`'s folder was removed (huggingface#25626)

* pad token should be None by default

* fix tests

* nits

* check if isfile vocabfile

* add warning if sp model folder was deleted

* save SPM when missing folder for slow

* update `can_save_slow_tokenizer` to be a property

* first batch

* second batch

* missing one
ArthurZucker authored and blbadger committed Nov 8, 2023
1 parent da41092 commit 7875778
Showing 25 changed files with 108 additions and 24 deletions.
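
The change follows one pattern across the fast tokenizers: the `can_save_slow_tokenizer` flag, previously computed once in `__init__`, becomes a property that re-checks the filesystem, so a sentencepiece `vocab_file` whose folder was deleted after loading no longer reports itself as saveable. A minimal sketch of the before/after pattern (the class name here is illustrative, not from the diff):

import os
from typing import Optional


class ExampleTokenizerFast:
    """Illustrative stand-in for the modified *TokenizerFast classes."""

    def __init__(self, vocab_file: Optional[str] = None):
        self.vocab_file = vocab_file
        # Before this commit the flag was frozen at construction time:
        #     self.can_save_slow_tokenizer = False if not self.vocab_file else True
        # so deleting the sentencepiece model afterwards went unnoticed.

    @property
    def can_save_slow_tokenizer(self) -> bool:
        # After: the check runs on every access, reflecting the current filesystem state.
        return os.path.isfile(self.vocab_file) if self.vocab_file else False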
5 changes: 4 additions & 1 deletion src/transformers/models/albert/tokenization_albert_fast.py
@@ -165,7 +165,10 @@ def __init__(
         self.remove_space = remove_space
         self.keep_accents = keep_accents
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
5 changes: 4 additions & 1 deletion src/transformers/models/barthez/tokenization_barthez_fast.py
@@ -146,7 +146,10 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
6 changes: 5 additions & 1 deletion src/transformers/models/bertweet/tokenization_bertweet.py
@@ -398,8 +398,12 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
         )

-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)

         if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
             copyfile(self.merges_file, out_merge_file)
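
The `save_vocabulary` branch above (mirrored in mluke, phobert, and rembert below) covers the case where the original file is gone: instead of copying it, the tokenizer rewrites the in-memory sentencepiece model via `serialized_model_proto()`. A standalone sketch of that fallback, with illustrative file paths:

import sentencepiece as spm

# Illustrative paths; any trained SentencePiece model file works here.
sp_model = spm.SentencePieceProcessor(model_file="sentencepiece.bpe.model")

# serialized_model_proto() returns the whole model as bytes, so it can be written
# back out even after the original .model file (or its folder) was deleted.
with open("recovered_sentencepiece.model", "wb") as fi:
    fi.write(sp_model.serialized_model_proto())

# The rewritten file round-trips into a working processor.
restored = spm.SentencePieceProcessor(model_file="recovered_sentencepiece.model")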
(changed file, path not shown in this view)
@@ -150,7 +150,10 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
(changed file, path not shown in this view)
@@ -140,7 +140,10 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
5 changes: 4 additions & 1 deletion src/transformers/models/cpm/tokenization_cpm_fast.py
@@ -141,7 +141,6 @@ def __init__(
         self.remove_space = remove_space
         self.keep_accents = keep_accents
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True

         try:
             import jieba
@@ -153,6 +152,10 @@ def __init__(
         self.jieba = jieba
         self.translator = str.maketrans(" \n", "\u2582\u2583")

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
     # Copied from transformers.models.xlnet.tokenization_xlnet_fast.XLNetTokenizerFast.build_inputs_with_special_tokens
     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
(changed file, path not shown in this view)
@@ -148,7 +148,10 @@ def __init__(
         self.do_lower_case = do_lower_case
         self.split_by_punct = split_by_punct
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
         """
5 changes: 4 additions & 1 deletion src/transformers/models/fnet/tokenization_fnet_fast.py
@@ -132,7 +132,10 @@ def __init__(
         self.remove_space = remove_space
         self.keep_accents = keep_accents
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
(changed file, path not shown in this view)
@@ -257,7 +257,6 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True

         # additional properties
         self.cls_token_box = cls_token_box
@@ -266,6 +265,10 @@ def __init__(
         self.pad_token_label = pad_token_label
         self.only_label_first_subword = only_label_first_subword

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
     @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
     def __call__(
         self,
5 changes: 4 additions & 1 deletion src/transformers/models/llama/tokenization_llama_fast.py
@@ -128,7 +128,10 @@ def __init__(
         self.update_post_processor()
         self.use_default_system_prompt = use_default_system_prompt
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def update_post_processor(self):
         """
5 changes: 4 additions & 1 deletion src/transformers/models/mbart/tokenization_mbart_fast.py
@@ -129,7 +129,6 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True

         _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()

@@ -149,6 +148,10 @@ def __init__(
         self.tgt_lang = tgt_lang
         self.set_src_lang_special_tokens(self._src_lang)

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
     @property
     def src_lang(self) -> str:
         return self._src_lang
5 changes: 4 additions & 1 deletion src/transformers/models/mbart50/tokenization_mbart50_fast.py
@@ -147,7 +147,6 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True

         self.lang_code_to_id = {
             lang_code: self.convert_tokens_to_ids(lang_code) for lang_code in FAIRSEQ_LANGUAGE_CODES
@@ -158,6 +157,10 @@ def __init__(
         self.cur_lang_code_id = self.lang_code_to_id[self._src_lang]
         self.set_src_lang_special_tokens(self._src_lang)

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
     @property
     def src_lang(self) -> str:
         return self._src_lang
6 changes: 5 additions & 1 deletion src/transformers/models/mluke/tokenization_mluke.py
@@ -1494,8 +1494,12 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
         )

-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)

         entity_vocab_file = os.path.join(
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["entity_vocab_file"]
5 changes: 4 additions & 1 deletion src/transformers/models/nllb/tokenization_nllb_fast.py
@@ -175,7 +175,6 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True

         _additional_special_tokens = FAIRSEQ_LANGUAGE_CODES.copy()

@@ -195,6 +194,10 @@ def __init__(
         self.tgt_lang = tgt_lang
         self.set_src_lang_special_tokens(self._src_lang)

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
     @property
     def src_lang(self) -> str:
         return self._src_lang
5 changes: 4 additions & 1 deletion src/transformers/models/pegasus/tokenization_pegasus_fast.py
@@ -152,7 +152,10 @@ def __init__(
             **kwargs,
         )
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def _special_token_mask(self, seq):
         all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
6 changes: 5 additions & 1 deletion src/transformers/models/phobert/tokenization_phobert.py
@@ -324,8 +324,12 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
         )

-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)

         if os.path.abspath(self.merges_file) != os.path.abspath(out_merge_file):
             copyfile(self.merges_file, out_merge_file)
(changed file, path not shown in this view)
@@ -110,7 +110,10 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
         if not self.can_save_slow_tokenizer:
6 changes: 5 additions & 1 deletion src/transformers/models/rembert/tokenization_rembert.py
@@ -263,7 +263,11 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
         )

-        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
             copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)

         return (out_vocab_file,)
5 changes: 4 additions & 1 deletion src/transformers/models/rembert/tokenization_rembert_fast.py
@@ -139,7 +139,10 @@ def __init__(
         self.remove_space = remove_space
         self.keep_accents = keep_accents
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
5 changes: 4 additions & 1 deletion src/transformers/models/t5/tokenization_t5_fast.py
@@ -142,9 +142,12 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
         self._extra_ids = extra_ids

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
     @staticmethod
     def _eventually_correct_t5_max_length(pretrained_model_name_or_path, max_model_length, init_max_model_length):
         if pretrained_model_name_or_path in T5TokenizerFast.max_model_input_sizes:
5 changes: 4 additions & 1 deletion src/transformers/models/xglm/tokenization_xglm_fast.py
@@ -134,7 +134,10 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
(changed file, path not shown in this view)
@@ -189,6 +189,10 @@ def __init__(
         for k in self.fairseq_tokens_to_ids.keys():
             self.unique_no_split_tokens.append(k)

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False
+
     def __getstate__(self):
         state = self.__dict__.copy()
         state["sp_model"] = None
(changed file, path not shown in this view)
@@ -166,7 +166,10 @@ def __init__(
         )

         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
5 changes: 4 additions & 1 deletion src/transformers/models/xlnet/tokenization_xlnet_fast.py
@@ -169,7 +169,10 @@ def __init__(
         self.remove_space = remove_space
         self.keep_accents = keep_accents
         self.vocab_file = vocab_file
-        self.can_save_slow_tokenizer = False if not self.vocab_file else True
+
+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        return os.path.isfile(self.vocab_file) if self.vocab_file else False

     def build_inputs_with_special_tokens(
         self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
9 changes: 8 additions & 1 deletion src/transformers/tokenization_utils_fast.py
@@ -90,7 +90,6 @@ class PreTrainedTokenizerFast(PreTrainedTokenizerBase):

     vocab_files_names = VOCAB_FILES_NAMES
     slow_tokenizer_class: PreTrainedTokenizer = None
-    can_save_slow_tokenizer: bool = True

     def __init__(self, *args, **kwargs):
         tokenizer_object = kwargs.pop("tokenizer_object", None)
@@ -159,6 +158,14 @@ def __init__(self, *args, **kwargs):
     def is_fast(self) -> bool:
         return True

+    @property
+    def can_save_slow_tokenizer(self) -> bool:
+        """
+        `bool`: Whether or not the slow tokenizer can be saved. Usually for sentencepiece based slow tokenizer, this
+        can only be `True` if the original `"sentencepiece.model"` was not deleted.
+        """
+        return True
+
     @property
     def vocab_size(self) -> int:
         """
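
On the base class the property simply defaults to `True`; sentencepiece-backed subclasses override it as shown above, and it is the signal for whether a slow-format save is possible. A hedged usage sketch (the checkpoint and output paths are illustrative, and the guard mirrors rather than reproduces the library's internal check):

from transformers import AutoTokenizer

# Illustrative checkpoint; any sentencepiece-backed fast tokenizer behaves the same way.
tok = AutoTokenizer.from_pretrained("t5-small", use_fast=True)

if tok.can_save_slow_tokenizer:
    # The original sentencepiece file is still on disk, so a legacy (slow) save works.
    tok.save_pretrained("./t5-saved", legacy_format=True)
else:
    # The vocab file's folder was removed; stick to the fast tokenizer.json format.
    tok.save_pretrained("./t5-saved", legacy_format=False)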
