[Unittest] Add unittest for RoBERTa, ALBERT and ERNIE (#2972)
* add roberta tokenizer unittest

* add roberta tokenizer unittest

* add roberta modeling unittest

* add unittest for albert modeling

* add albert unittest

* add ernie unittest

* upgrade unittest for albert tokenizer

* upgrade unittest for ernie tokenizer

* fix unittest for ernie tokenizer
yingyibiao authored Aug 18, 2022
1 parent 451ccac commit 869dd96
Showing 20 changed files with 2,207 additions and 523 deletions.
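The unit-test files themselves are among the 20 changed files that are not expanded below; only the library-side changes are shown. As a rough, hypothetical illustration of the kind of tokenizer unittest the commit messages describe (not code from this commit), a round-trip check might look like the following, assuming paddlenlp is installed and the "albert-base-v1" files can be downloaded:

import unittest

from paddlenlp.transformers import AlbertEnglishTokenizer


class AlbertEnglishTokenizationTest(unittest.TestCase):

    def test_tokenize_round_trip(self):
        # Hypothetical sketch; the real tests live in the unexpanded test files.
        tokenizer = AlbertEnglishTokenizer.from_pretrained("albert-base-v1")
        tokens = tokenizer.tokenize("This is a test")
        ids = tokenizer.convert_tokens_to_ids(tokens)
        # Converting ids back should recover the same sentencepiece pieces.
        self.assertEqual(tokenizer.convert_ids_to_tokens(ids), tokens)
        # Every id must fall inside the vocabulary the tokenizer reports.
        self.assertTrue(all(0 <= i < tokenizer.vocab_size for i in ids))


if __name__ == "__main__":
    unittest.main()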
83 changes: 63 additions & 20 deletions paddlenlp/transformers/albert/tokenizer.py
@@ -572,6 +572,16 @@ class AlbertEnglishTokenizer(PretrainedTokenizer):
"pad_token": "<pad>",
},
}
max_model_input_sizes = {
"albert-base-v1": 512,
"albert-large-v1": 512,
"albert-xlarge-v1": 512,
"albert-xxlarge-v1": 512,
"albert-base-v2": 512,
"albert-large-v2": 512,
"albert-xlarge-v2": 512,
"albert-xxlarge-v2": 512,
}

def __init__(self,
sentencepiece_model_file,
@@ -585,27 +595,41 @@ def __init__(self,
pad_token="<pad>",
cls_token="[CLS]",
mask_token="[MASK]",
sp_model_kwargs=None,
**kwargs):

self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.keep_accents = keep_accents
self.sentencepiece_model_file = sentencepiece_model_file
- self.sp_model = spm.SentencePieceProcessor()

+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(sentencepiece_model_file)

@property
def vocab_size(self):
return len(self.sp_model)

def get_vocab(self):
vocab = {
self.convert_ids_to_tokens(i): i
for i in range(self.vocab_size)
}
vocab.update(self.added_tokens_encoder)
return vocab

def __getstate__(self):
state = self.__dict__.copy()
state["sp_model"] = None
return state

def __setstate__(self, d):
self.__dict__ = d
- self.sp_model = spm.SentencePieceProcessor()
+ if not hasattr(self, "sp_model_kwargs"):
+ self.sp_model_kwargs = {}

+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
self.sp_model.Load(self.sentencepiece_model_file)

def preprocess_text(self, inputs):
@@ -624,14 +648,10 @@ def preprocess_text(self, inputs):

return outputs

- def _tokenize(self, text, sample=False):
+ def _tokenize(self, text):
"""Tokenize a string."""
text = self.preprocess_text(text)

- if not sample:
- pieces = self.sp_model.EncodeAsPieces(text)
- else:
- pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+ pieces = self.sp_model.encode(text, out_type=str)
new_pieces = []
for piece in pieces:
if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
@@ -721,9 +741,16 @@ def create_token_type_ids_from_sequences(self,
def save_resources(self, save_directory):
for name, file_name in self.resource_files_names.items():
save_path = os.path.join(save_directory, file_name)
- if os.path.abspath(self.sentencepiece_model_file
- ) != os.path.abspath(save_path):
+ if os.path.abspath(
+ self.sentencepiece_model_file) != os.path.abspath(
+ save_path) and os.path.isfile(
+ self.sentencepiece_model_file):
copyfile(self.sentencepiece_model_file, save_path)
+ elif not os.path.isfile(self.sentencepiece_model_file):
+ with open(save_path, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto(
+ )
+ fi.write(content_spiece_model)


class AlbertChineseTokenizer(BertTokenizer):
@@ -776,22 +803,38 @@ class AlbertChineseTokenizer(BertTokenizer):
"pad_token": "[PAD]",
},
}
max_model_input_sizes = {
"albert-chinese-tiny": 512,
"albert-chinese-small": 512,
"albert-chinese-base": 512,
"albert-chinese-large": 512,
"albert-chinese-xlarge": 512,
"albert-chinese-xxlarge": 512,
}

def __init__(self,
vocab_file,
- do_lower_case=False,
+ do_lower_case=True,
+ do_basic_tokenize=True,
+ never_split=None,
unk_token="[UNK]",
sep_token="[SEP]",
pad_token="[PAD]",
cls_token="[CLS]",
mask_token="[MASK]",
+ tokenize_chinese_chars=True,
+ strip_accents=None,
**kwargs):
- super(AlbertChineseTokenizer, self).__init__(
- vocab_file,
- do_lower_case=do_lower_case,
- unk_token=unk_token,
- sep_token=sep_token,
- pad_token=pad_token,
- cls_token=cls_token,
- mask_token=mask_token,
- )
+ super(AlbertChineseTokenizer,
+ self).__init__(vocab_file,
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs)
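A minimal sketch (not part of the diff) of what the new get_vocab and the __getstate__/__setstate__ pair enable for AlbertEnglishTokenizer, assuming paddlenlp is installed and the "albert-base-v1" files can be downloaded:

import pickle

from paddlenlp.transformers import AlbertEnglishTokenizer

tokenizer = AlbertEnglishTokenizer.from_pretrained("albert-base-v1")

# get_vocab() now exposes the full piece-to-id mapping, plus any added tokens.
assert len(tokenizer.get_vocab()) >= tokenizer.vocab_size

# __getstate__ drops the non-picklable SentencePiece processor and __setstate__
# rebuilds it from sp_model_kwargs, so the tokenizer survives a pickle round trip.
restored = pickle.loads(pickle.dumps(tokenizer))
assert restored.tokenize("lower newer") == tokenizer.tokenize("lower newer")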
6 changes: 6 additions & 0 deletions paddlenlp/transformers/ernie/modeling.py
@@ -854,6 +854,12 @@ def __init__(self,
self.pooler = ErniePooler(hidden_size, weight_attr)
self.apply(self.init_weights)

def get_input_embeddings(self):
return self.embeddings.word_embeddings

def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value

def forward(self,
input_ids,
token_type_ids=None,
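The new get_input_embeddings/set_input_embeddings accessors follow the usual pattern exercised by model unittests. A hedged sketch (not from this commit), assuming paddlenlp and paddle are installed and "ernie-1.0" can be downloaded:

import paddle

from paddlenlp.transformers import ErnieModel

model = ErnieModel.from_pretrained("ernie-1.0")

old_embeddings = model.get_input_embeddings()
vocab_size, hidden_size = old_embeddings.weight.shape

# Swap in a fresh embedding layer and confirm the accessor reflects the change.
new_embeddings = paddle.nn.Embedding(vocab_size, hidden_size)
model.set_input_embeddings(new_embeddings)
assert model.get_input_embeddings() is new_embeddings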
44 changes: 44 additions & 0 deletions paddlenlp/transformers/ernie/tokenizer.py
@@ -278,6 +278,47 @@ class ErnieTokenizer(PretrainedTokenizer):
"do_lower_case": True
},
}
max_model_input_sizes = {
"ernie-1.0": 513,
"ernie-1.0-base-zh": 513,
"ernie-1.0-large-zh-cw": 512,
"ernie-tiny": 600,
"ernie-2.0-base-zh": 513,
"ernie-2.0-large-zh": 512,
"ernie-2.0-base-en": 512,
"ernie-2.0-base-en-finetuned-squad": 512,
"ernie-2.0-large-en": 512,
"ernie-gen-base-en": 1024,
"ernie-gen-large-en": 1024,
"ernie-gen-large-en-430g": 1024,
"rocketqa-zh-dureader-query-encoder": 513,
"rocketqa-zh-dureader-para-encoder": 513,
"rocketqa-v1-marco-query-encoder": 512,
"rocketqa-v1-marco-para-encoder": 512,
"rocketqa-zh-dureader-cross-encoder": 513,
"rocketqa-v1-marco-cross-encoder": 512,
"ernie-3.0-base-zh": 2048,
"ernie-3.0-xbase-zh": 2048,
"ernie-3.0-medium-zh": 2048,
"ernie-3.0-mini-zh": 2048,
"ernie-3.0-micro-zh": 2048,
"ernie-3.0-nano-zh": 2048,
"rocketqa-zh-base-query-encoder": 2048,
"rocketqa-zh-base-para-encoder": 2048,
"rocketqa-zh-medium-query-encoder": 2048,
"rocketqa-zh-medium-para-encoder": 2048,
"rocketqa-zh-mini-query-encoder": 2048,
"rocketqa-zh-mini-para-encoder": 2048,
"rocketqa-zh-micro-query-encoder": 2048,
"rocketqa-zh-micro-para-encoder": 2048,
"rocketqa-zh-nano-query-encoder": 2048,
"rocketqa-zh-nano-para-encoder": 2048,
"rocketqa-base-cross-encoder": 2048,
"rocketqa-medium-cross-encoder": 2048,
"rocketqa-mini-cross-encoder": 2048,
"rocketqa-micro-cross-encoder": 2048,
"rocketqa-nano-cross-encoder": 2048,
}

def __init__(self,
vocab_file,
@@ -311,6 +352,9 @@ def vocab_size(self):
"""
return len(self.vocab)

def get_vocab(self):
return dict(self.vocab._token_to_idx, **self.added_tokens_encoder)

def _tokenize(self, text):
r"""
End-to-end tokenization for ERNIE models.
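A small sketch (not part of this diff) of the consistency check that the new get_vocab method makes possible for ErnieTokenizer, assuming paddlenlp is installed and "ernie-1.0" can be downloaded:

from paddlenlp.transformers import ErnieTokenizer

tokenizer = ErnieTokenizer.from_pretrained("ernie-1.0")
vocab = tokenizer.get_vocab()

# The mapping covers at least the base vocabulary and agrees with token lookup.
assert len(vocab) >= tokenizer.vocab_size
assert vocab[tokenizer.unk_token] == tokenizer.convert_tokens_to_ids(tokenizer.unk_token)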
12 changes: 12 additions & 0 deletions paddlenlp/transformers/gpt/tokenizer.py
@@ -367,6 +367,7 @@ def __init__(
eos_token='<|endoftext|>',
unk_token='<|endoftext|>',
eol_token='\u010a',
add_prefix_space=False,
**kwargs # The token of newline.
):
pad_token = AddedToken(pad_token,
@@ -406,6 +407,8 @@ def __init__(
bpe_merges = [tuple(merge.split()) for merge in bpe_data]
self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
self.cache = {}
self.add_prefix_space = add_prefix_space

re = try_import("regex")
self.pat = re.compile(
r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+"""
@@ -538,3 +541,12 @@ def convert_tokens_to_string(self, tokens):
text = bytearray([self.byte_decoder[c]
for c in text]).decode('utf-8', errors=self.errors)
return text

def prepare_for_tokenization(self,
text,
is_split_into_words=False,
**kwargs):
add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
if is_split_into_words or add_prefix_space:
text = " " + text
return (text, kwargs)
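A brief sketch (not from the commit) of the new add_prefix_space hook in the GPT tokenizer, assuming paddlenlp is installed and the "gpt2-en" vocabulary can be downloaded:

from paddlenlp.transformers import GPTTokenizer

tokenizer = GPTTokenizer.from_pretrained("gpt2-en")

# prepare_for_tokenization prepends a space when asked, so a leading word is
# encoded the same way it would be in the middle of a sentence.
text, _ = tokenizer.prepare_for_tokenization("world", add_prefix_space=True)
assert text == " world"
print(tokenizer.tokenize("world"))   # word-initial BPE pieces
print(tokenizer.tokenize(" world"))  # mid-sentence BPE pieces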
6 changes: 6 additions & 0 deletions paddlenlp/transformers/roberta/modeling.py
@@ -344,6 +344,12 @@ def __init__(self,
self.pooler = RobertaPooler(hidden_size)
self.apply(self.init_weights)

def get_input_embeddings(self):
return self.embeddings.word_embeddings

def set_input_embeddings(self, value):
self.embeddings.word_embeddings = value

def forward(self,
input_ids,
token_type_ids=None,
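RobertaModel gains the same accessor pair as ErnieModel above; a one-line sketch of its use (an illustration only, assuming "roberta-wwm-ext" can be downloaded):

from paddlenlp.transformers import RobertaModel

model = RobertaModel.from_pretrained("roberta-wwm-ext")
# The word-embedding table, shape [vocab_size, hidden_size], is now reachable directly.
print(model.get_input_embeddings().weight.shape)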