[Tokenizer] Support for loading added_tokens_decoder #8997
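Editor's note (not part of the diff): the changes below make added tokens survive a save/load cycle via the `added_tokens_decoder` entry in `tokenizer_config.json`. A hedged usage sketch of the intended behavior — the checkpoint path and token string are placeholders:

```python
# Hedged sketch of the behavior this PR targets; the paths and token are placeholders.
from paddlenlp.transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("path/to/llama-checkpoint")  # placeholder path
tokenizer.add_tokens(["<custom_tok>"])           # gets an id beyond the sentencepiece vocab
assert len(tokenizer) == tokenizer.vocab_size + 1

tokenizer.save_pretrained("ckpt_with_added_tokens")    # serializes added_tokens_decoder
reloaded = AutoTokenizer.from_pretrained("ckpt_with_added_tokens")
assert len(reloaded) == len(tokenizer)                  # added token is restored on load
```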

18 changes: 17 additions & 1 deletion paddlenlp/transformers/gemma/tokenizer.py
@@ -111,6 +111,18 @@
"""Returns vocab size"""
return self.sp_model.get_piece_size()

def __len__(self):
[Collaborator Author comment] Fixes the issue where tokens could not be added.

"""
Returns the vocabulary size. added_tokens_encoder has to be added in the sp_model
"""
added_size = 0

for id in self.added_tokens_decoder:
if id >= self.sp_model.get_piece_size():
added_size += 1

return self.vocab_size + added_size
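Not part of the diff — a standalone sketch of the counting rule above with made-up numbers: ids that already fall inside the sentencepiece range are not counted again, only genuinely new ids extend the length.

```python
# Standalone sketch with illustrative numbers: only added ids at or beyond the
# sentencepiece vocab size extend len(tokenizer).
sp_vocab_size = 256000  # stand-in for sp_model.get_piece_size()
added_tokens_decoder = {0: "<pad>", 256000: "<custom_a>", 256001: "<custom_b>"}

added_size = sum(1 for idx in added_tokens_decoder if idx >= sp_vocab_size)
# <pad> already lives inside the sentencepiece vocab, so it is not re-counted.
assert sp_vocab_size + added_size == 256002
```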

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer.get_vocab
    def get_vocab(self):
        """Returns vocab as a dict"""
@@ -127,11 +139,15 @@
    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_token_to_id
    def _convert_token_to_id(self, token):
        """Converts a token (str) into an id using the vocab."""
        return self.sp_model.piece_to_id(token)
        if token in self.added_tokens_encoder:
            return self.added_tokens_encoder[token]

        return self.sp_model.PieceToId(token)

    # Copied from transformers.models.llama.tokenization_llama.LlamaTokenizer._convert_id_to_token
    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        if index in self.added_tokens_decoder:
            return self.added_tokens_decoder[index]

        token = self.sp_model.IdToPiece(index)
        return token
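Not part of the diff — a hedged sketch of the lookup order introduced above, with a tiny stand-in for `sp_model`: added tokens resolve from the in-memory maps first, everything else falls through to sentencepiece.

```python
# Stand-in for the sentencepiece model; pieces and ids are illustrative.
class _FakeSP:
    _pieces = {"<unk>": 0, "hello": 12}

    def PieceToId(self, token):
        return self._pieces.get(token, 0)

    def IdToPiece(self, index):
        return next((p for p, i in self._pieces.items() if i == index), "<unk>")

sp_model = _FakeSP()
added_tokens_encoder = {"<custom>": 256000}
added_tokens_decoder = {256000: "<custom>"}

def convert_token_to_id(token):
    if token in added_tokens_encoder:          # added tokens hit the in-memory map first
        return added_tokens_encoder[token]
    return sp_model.PieceToId(token)

def convert_id_to_token(index):
    if index in added_tokens_decoder:
        return added_tokens_decoder[index]
    return sp_model.IdToPiece(index)

assert convert_token_to_id("<custom>") == 256000    # added token resolves from the map
assert convert_id_to_token(256000) == "<custom>"     # and round-trips back
assert convert_token_to_id("hello") == 12            # ordinary pieces still go to sp_model
```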

14 changes: 13 additions & 1 deletion paddlenlp/transformers/llama/tokenizer.py
@@ -80,6 +80,18 @@ def vocab_size(self):
"""Returns vocab size"""
return self.sp_model.get_piece_size()

def __len__(self):
[Collaborator Author comment] Fixes the issue where tokens could not be added.

"""
Returns the vocabulary size. added_tokens_encoder has to be added in the sp_model
"""
added_size = 0

for id in self.added_tokens_decoder:
if id >= self.sp_model.get_piece_size():
added_size += 1

return self.vocab_size + added_size

    @property
    def bos_token_id(self) -> Optional[int]:
        return self.sp_model.bos_id()
@@ -104,7 +116,7 @@ def _convert_token_to_id(self, token):

    def _convert_id_to_token(self, index):
        """Converts an index (integer) into a token (str) using the vocab."""
        token = self.sp_model.IdToPiece(index)
        token = self.sp_model.id_to_piece(index)
        return token

    def convert_tokens_to_string(self, tokens):
6 changes: 6 additions & 0 deletions paddlenlp/transformers/mamba/tokenizer.py
@@ -158,6 +158,12 @@ def vocab_size(self):
"""
return len(self.encoder)

def __len__(self):
[Collaborator Author comment] The mamba tokenizer's added_tokens_decoder contains two tokens with ids [0, 1] that duplicate entries already in the base vocab, so the previous calculation counted these two tokens twice.

"""
Size of the full vocabulary with the added tokens.
"""
return len(dict(self.encoder, **self.added_tokens_encoder))
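Not part of the diff — a standalone sketch of why the dict merge fixes the double counting described in the comment (token strings and ids are illustrative):

```python
# Illustrative encoder and added-token maps: ids 0 and 1 exist in both.
encoder = {"<|endoftext|>": 0, "<|padding|>": 1, "hello": 2}
added_tokens_encoder = {"<|endoftext|>": 0, "<|padding|>": 1, "<extra>": 50277}

# len(encoder) + len(added_tokens_encoder) == 6 counts the two overlapping
# tokens twice; merging on the token string keeps each entry exactly once.
assert len(dict(encoder, **added_tokens_encoder)) == 4
```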

    def bpe(self, token):
        if token in self.cache:
            return self.cache[token]
14 changes: 10 additions & 4 deletions paddlenlp/transformers/tokenizer_utils.py
@@ -940,8 +940,10 @@ def _pre_init(self, original_init, *args, **kwargs):
        init_dict.pop("self", None)
        super(PretrainedTokenizer, self).__init__(**init_dict)

        self.added_tokens_encoder: Dict[str, int] = {}
        self.added_tokens_decoder: Dict[int, str] = {}
        self.added_tokens_decoder: Dict[int, AddedToken] = {}
        self.added_tokens_decoder.update(kwargs.pop("added_tokens_decoder", {}))
        self.added_tokens_encoder: Dict[str, int] = {k.content: v for v, k in self.added_tokens_decoder.items()}

        self.unique_no_split_tokens: List[str] = []
        self.tokens_trie = Trie()
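Not part of the diff — a minimal sketch of the inversion introduced above, using a stand-in for `AddedToken`: the encoder map is rebuilt from the `id -> AddedToken` mapping passed in through `kwargs`.

```python
from dataclasses import dataclass

@dataclass
class _AddedToken:  # minimal stand-in for paddlenlp's AddedToken
    content: str

added_tokens_decoder = {256000: _AddedToken("<custom_a>"), 256001: _AddedToken("<custom_b>")}
# Same inversion as in _pre_init: token content -> id.
added_tokens_encoder = {tok.content: idx for idx, tok in added_tokens_decoder.items()}
assert added_tokens_encoder == {"<custom_a>": 256000, "<custom_b>": 256001}
```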

@@ -1211,7 +1213,9 @@ def convert_tokens_to_string(self, tokens):
    def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
        if isinstance(ids, int):
            if ids in self.added_tokens_decoder:
                return self.added_tokens_decoder[ids]
                token = self.added_tokens_decoder[ids]
                token = token.content if isinstance(token, AddedToken) else token
                return token
            else:
                return self._convert_id_to_token(ids)
        tokens = []
@@ -1220,7 +1224,9 @@ def convert_ids_to_tokens(self, ids, skip_special_tokens=False):
            if skip_special_tokens and index in self.all_special_ids:
                continue
            if index in self.added_tokens_decoder:
                tokens.append(self.added_tokens_decoder[index])
                token = self.added_tokens_decoder[index]
                token = token.content if isinstance(token, AddedToken) else token
                tokens.append(token)
            else:
                tokens.append(self._convert_id_to_token(index))
        return tokens
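Not part of the diff — a short sketch of the unwrapping above: decoder values may now be `AddedToken` objects rather than plain strings, and callers of `convert_ids_to_tokens` still receive strings (stand-in class, illustrative ids).

```python
class _AddedToken:  # stand-in for paddlenlp's AddedToken
    def __init__(self, content):
        self.content = content

added_tokens_decoder = {256000: _AddedToken("<custom_a>"), 256001: "<custom_b>"}

def to_token(index):
    token = added_tokens_decoder[index]
    return token.content if isinstance(token, _AddedToken) else token

assert [to_token(256000), to_token(256001)] == ["<custom_a>", "<custom_b>"]
```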
19 changes: 16 additions & 3 deletions paddlenlp/transformers/tokenizer_utils_base.py
@@ -1543,6 +1543,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *args, **kwargs):
        else:
            init_kwargs = init_configuration

        pass_added_tokens_file = False
        # Handle tokenizer serialization of added and special tokens
        added_tokens_decoder: Dict[int, AddedToken] = {}
        # if we have info on the slow added tokens
@@ -1558,6 +1559,8 @@
                    )
            init_kwargs["added_tokens_decoder"] = added_tokens_decoder

            pass_added_tokens_file = True

        # position args are stored in kwargs, maybe better not include
        init_args = init_kwargs.pop("init_args", ())
        init_kwargs.pop("init_class", None)
@@ -1585,7 +1588,6 @@ def convert_added_tokens(obj):
        if model_max_length is not None and isinstance(model_max_length, (int, float)):
            init_kwargs["model_max_length"] = min(init_kwargs.get("model_max_length", int(1e30)), model_max_length)

        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
        # Merge resolved_vocab_files arguments in init_kwargs if not including.
        # Maybe need more ways to load resources.
        for args_name, file_path in resolved_vocab_files.items():
@@ -1612,6 +1614,7 @@ def convert_added_tokens(obj):
        chat_template = init_kwargs.pop("chat_template", None)
        if chat_template is not None:
            tokenizer.init_chat_template(chat_template)

        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
        if special_tokens_map_file is not None:
            with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
@@ -1620,16 +1623,17 @@
                if key in kwargs and kwargs[key]:
                    # This value has already been redefined by the kwargs
                    # We keep this new value and ignore the one stored in the special_tokens_map_file

                    continue

                if isinstance(value, dict):
                    value = AddedToken(**value)
                elif isinstance(value, list):
                    value = [AddedToken(**token) if isinstance(token, dict) else token for token in value]
                setattr(tokenizer, key, value)

        # Add supplementary tokens.
        special_tokens = tokenizer.all_special_tokens
        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
        added_tokens_file = None if pass_added_tokens_file else added_tokens_file
        if added_tokens_file is not None:
            with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                added_tok_encoder = json.load(added_tokens_handle)
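Not part of the diff — a hedged sketch of the load precedence introduced above (config contents are illustrative): the legacy `added_tokens.json` is only consulted when `tokenizer_config.json` did not already carry `added_tokens_decoder`.

```python
import json

# Illustrative tokenizer_config.json payload carrying added_tokens_decoder.
init_kwargs = json.loads('{"added_tokens_decoder": {"256000": {"content": "<custom_a>"}}}')
pass_added_tokens_file = "added_tokens_decoder" in init_kwargs

added_tokens_file = "added_tokens.json"                        # legacy file, if present
added_tokens_file = None if pass_added_tokens_file else added_tokens_file
assert added_tokens_file is None   # the config wins; the legacy file is not re-applied
```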
@@ -1727,6 +1731,15 @@ def convert_added_tokens(obj: Union[AddedToken, Any], add_type_field=True):
        # add_type_field=True to allow dicts in the kwargs / differentiate from AddedToken serialization
        tokenizer_config = convert_added_tokens(tokenizer_config, add_type_field=True)

        # Process added tokens separately: allows previous versions to ignore it!
        added_tokens = {}
        for key, value in self.added_tokens_decoder.items():
            if isinstance(value, AddedToken):
                added_tokens[key] = value.__getstate__()
            else:
                added_tokens[key] = AddedToken(value).__getstate__()
        tokenizer_config["added_tokens_decoder"] = added_tokens

        # Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
        tokenizer_class = self.__class__.__name__
        tokenizer_config["tokenizer_class"] = tokenizer_class