fix compatibility for ppminilm, skep, unifiedtransformer, gpt2 (#2063)
* upgrade tokenizer

* upgrade tokenizer

* upgrade datacollator

* minor fix

* minor fix

* fix mbart tokenizer

* minor fix

* minor fix

* minor fix

* fix compatibility for autotokenizer

* fix offset mapping

* fix tokenizer doc

* fix doc

* fix tokenizer

* fix ppminilm tokenizer

* fix offset mapping

* fix tokenizer warning

* fix get_input_ids for compatibility

* add doc

* minor fix

Co-authored-by: Jiaqi Liu <709153940@qq.com>
smallv0221 and LiuChiachi authored May 7, 2022
1 parent f7b7d52 commit 8cb27ad
Showing 4 changed files with 20 additions and 42 deletions.
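
A minimal sketch of the compatibility goal named in the title and in the "fix compatibility for autotokenizer" commit: each of the touched tokenizer classes should be constructible through AutoTokenizer without raising. The checkpoint names below are assumptions for illustration and do not come from this diff.

    # Checkpoint names are assumed for illustration; any checkpoint registered
    # for PPMiniLM, SKEP, UnifiedTransformer, or GPT should behave the same way.
    from paddlenlp.transformers import AutoTokenizer

    for name in ["ppminilm-6l-768h", "skep_ernie_1.0_large_ch",
                 "unified_transformer-12L-cn", "gpt2-en"]:
        tokenizer = AutoTokenizer.from_pretrained(name)
        print(name, "->", type(tokenizer).__name__)
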
37 changes: 13 additions & 24 deletions paddlenlp/transformers/gpt/tokenizer.py
@@ -127,6 +127,7 @@ def __init__(
**kwargs # The token of newline.
):
self._model_file = model_file
self.eol_token = eol_token
if not os.path.isfile(model_file):
raise ValueError(
"Can't find a model file at path '{}'. To load the "
@@ -138,29 +139,11 @@ def __init__(
self.sp.Load(model_file)
self.translator = str.maketrans(" \n", "\u2582\u2583")

'''
def tokenize(self, text):
"""
Converts a string to a list of tokens.
Args:
text (str): The text to be tokenized.
Returns:
List[str]: A list of string representing converted tokens.
Example:
.. code-block::
from paddlenlp.transformers import GPTChineseTokenizer
tokenizer = GPTChineseTokenizer.from_pretrained('gpt-cpm-large-cn')
print(tokenizer.tokenize('欢迎使用百度飞桨!'))
# ['▁欢迎', '▁使用', '▁百度', '▁飞', '桨', '▁!']
"""
return self._tokenize(text)
'''
@property
def eol_token_id(self):
if self.eol_token is None:
return None
return self.convert_tokens_to_ids(self.eol_token)

def _tokenize(self, text):
""" Tokenize a string. """
@@ -386,7 +369,7 @@ def __init__(
unk_token = AddedToken(
unk_token, lstrip=False,
rstrip=False) if isinstance(unk_token, str) else unk_token

self.eol_token = eol_token
self._build_special_tokens_map_extended(
bos_token=pad_token, eos_token=eos_token, unk_token=unk_token)

@@ -425,6 +408,12 @@ def vocab_size(self):

return len(self.encoder)

@property
def eol_token_id(self):
if self.eol_token is None:
return None
return self.convert_tokens_to_ids(self.eol_token)

def bpe(self, token):
if token in self.cache:
return self.cache[token]
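
The gpt/tokenizer.py hunks store eol_token on the instance in both tokenizer classes and expose an eol_token_id property that resolves it through convert_tokens_to_ids (returning None when no eol token is set), while dropping a commented-out tokenize() block. A rough usage sketch, assuming the 'gpt2-en' checkpoint name:

    from paddlenlp.transformers import GPTTokenizer

    tokenizer = GPTTokenizer.from_pretrained("gpt2-en")  # checkpoint name assumed
    # eol_token is kept by __init__; eol_token_id resolves it on demand via
    # convert_tokens_to_ids and returns None if no eol token was configured.
    print(tokenizer.eol_token, tokenizer.eol_token_id)
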
3 changes: 2 additions & 1 deletion paddlenlp/transformers/ppminilm/tokenizer.py
@@ -94,7 +94,8 @@ def __init__(self,
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]"):
mask_token="[MASK]",
**kwargs):

if not os.path.isfile(vocab_file):
raise ValueError(
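
The PPMiniLMTokenizer change only widens the constructor with **kwargs, so extra keys (for example, ones persisted in a tokenizer_config.json or forwarded by AutoTokenizer) no longer raise a TypeError. A small sketch, assuming the 'ppminilm-6l-768h' checkpoint name; the extra keyword is hypothetical:

    from paddlenlp.transformers import PPMiniLMTokenizer

    # "some_unused_option" is a hypothetical extra key: with **kwargs in place
    # it is absorbed instead of making __init__ fail with a TypeError.
    tokenizer = PPMiniLMTokenizer.from_pretrained("ppminilm-6l-768h",
                                                  some_unused_option=True)
    print(type(tokenizer).__name__)
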
20 changes: 4 additions & 16 deletions paddlenlp/transformers/tokenizer_utils.py
@@ -983,14 +983,8 @@ def get_input_ids(text)
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
if is_split_into_words:
tokens = list(
itertools.chain(*(self.tokenize(
t, is_split_into_words=True, **kwargs)
for t in text)))
return self.convert_tokens_to_ids(tokens)
else:
return self.convert_tokens_to_ids(text)
#TODO aligns with HuggingFace here in breaking change
return self.convert_tokens_to_ids(text)
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], int):
@@ -1065,14 +1059,8 @@ def get_input_ids(text):
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], str):
if is_split_into_words:
tokens = list(
itertools.chain(*(self.tokenize(
t, is_split_into_words=True, **kwargs)
for t in text)))
return self.convert_tokens_to_ids(tokens)
else:
return self.convert_tokens_to_ids(text)
#TODO aligns with HuggingFace here in breaking change
return self.convert_tokens_to_ids(text)
elif isinstance(text,
(list, tuple)) and len(text) > 0 and isinstance(
text[0], int):
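
In tokenizer_utils.py the nested get_input_ids helper (duplicated in the single-example and batch encode paths) now maps a list of strings straight through convert_tokens_to_ids instead of re-tokenizing each element when is_split_into_words is set; the TODO marks this as an intentional, Hugging Face-aligned breaking change. A sketch of the resulting behaviour, assuming the 'bert-base-uncased' checkpoint name:

    from paddlenlp.transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")  # name assumed
    tokens = tokenizer.tokenize("unbelievable results")

    # A list of strings is now taken to be a list of tokens, so encode() and a
    # direct convert_tokens_to_ids() agree (encode additionally adds [CLS]/[SEP]).
    print(tokenizer.convert_tokens_to_ids(tokens))
    print(tokenizer.encode(tokens)["input_ids"])
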
2 changes: 1 addition & 1 deletion paddlenlp/transformers/unified_transformer/tokenizer.py
@@ -123,7 +123,7 @@ def __init__(self,
sentencepiece_model_file,
do_lower_case=False,
unk_token="[UNK]",
pad_token="[PAD]",
pad_token="[UNK]",
cls_token="[CLS]",
sep_token="[SEP]",
mask_token="[MASK]",
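
The unified_transformer tokenizer hunk is a one-line change to the constructor's default pad_token; reading the flattened one-addition/one-deletion hunk in the usual removed-then-added order, "[PAD]" is replaced by "[UNK]", which would make padding reuse the unknown-token id by default. A quick inspection sketch, assuming the 'unified_transformer-12L-cn' checkpoint name and that its tokenizer_config does not override the default:

    from paddlenlp.transformers import UnifiedTransformerTokenizer

    tokenizer = UnifiedTransformerTokenizer.from_pretrained(
        "unified_transformer-12L-cn")  # checkpoint name assumed
    # With pad_token defaulting to "[UNK]", pad_token_id and unk_token_id are
    # expected to resolve to the same vocabulary id.
    print(tokenizer.pad_token, tokenizer.pad_token_id)
    print(tokenizer.unk_token, tokenizer.unk_token_id)
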
