From 24cd7787fb7df81f2a1c6cef66a6fe5a15f3620a Mon Sep 17 00:00:00 2001
From: Yaojie Lu
Date: Wed, 18 May 2022 11:53:39 +0800
Subject: [PATCH 1/3] [duuie] fix max_seq_len bug in duuie code

* add kwargs for t5-bert-tokenizer
* use add_special_tokens=False return_token_type_ids=None for span/name encode
---
 .../DuUIE/uie/evaluation/sel2record.py        | 4 ++--
 .../DuUIE/uie/seq2struct/data_collator.py     | 9 ++++-----
 .../DuUIE/uie/seq2struct/t5_bert_tokenizer.py | 7 ++++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/information_extraction/DuUIE/uie/evaluation/sel2record.py b/examples/information_extraction/DuUIE/uie/evaluation/sel2record.py
index 4812971aceec..5ec44d34de8d 100644
--- a/examples/information_extraction/DuUIE/uie/evaluation/sel2record.py
+++ b/examples/information_extraction/DuUIE/uie/evaluation/sel2record.py
@@ -764,9 +764,9 @@ def clean_wildcard(x):
 def fix_unk_from_text_with_tokenizer(span, text, tokenizer, unk='<unk>'):
     unk_id = tokenizer.vocab.to_indices(unk)
     tokenized_span = tokenizer.encode(
-        span, add_special_tokens=False)['input_ids']
+        span, add_special_tokens=False, return_token_type_ids=None)['input_ids']
     tokenized_text = tokenizer.encode(
-        text, add_special_tokens=False)['input_ids']
+        text, add_special_tokens=False, return_token_type_ids=None)['input_ids']
 
     matched = match_sublist(tokenized_text, tokenized_span)
     if len(matched) == 0:
diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py b/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
index 8b38ca347bb6..23e65d2cf9cb 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
@@ -114,11 +114,10 @@ def get_ordered_dict(schema_name_list, tokenizer):
         """
         schema_ordered_dict = OrderedDict()
         for name in schema_name_list:
-            # tokenizer.encode("人物")
-            encoded_name = tokenizer.encode(name)
-            # {'input_ids': [8, 122, 1]}
-            # -> [8, 122]
-            schema_ordered_dict[name] = encoded_name["input_ids"][:-1:]
+            # tokenizer.encode("人物") -> [8, 122]
+        encoded_name = tokenizer.encode(
+            name, add_special_tokens=False, return_token_type_ids=None)
+        schema_ordered_dict[name] = encoded_name["input_ids"]
         return schema_ordered_dict
 
     @staticmethod
diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
index 121124986d30..01ee952ecdeb 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
@@ -40,7 +40,8 @@ def __init__(self,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
             **kwargs, )
-
+        if space_token not in self._additional_special_tokens:
+            self._additional_special_tokens += [space_token]
         self._space_token = space_token
 
     def get_vocab(self):
@@ -51,7 +52,7 @@ def get_vocab(self):
         vocab.update(self.added_tokens_encoder)
         return vocab
 
-    def tokenize(self, text):
+    def tokenize(self, text, **kwargs):
         import re
         # Remove space between special tokens
         split_bracket = re.compile(
@@ -64,7 +65,7 @@ def tokenize(self, text):
                 new_text_list += [item[0].strip(), item[1]]
         text = "".join(new_text_list)
         text = text.replace(' ', self._space_token)
-        return super().tokenize(text)
+        return super().tokenize(text, **kwargs)
 
     def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
         """Do not add eos again if user already added it."""
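Note on the encode() changes in PATCH 1/3: with default arguments this
tokenizer appends special tokens, so encode("人物") returns
{'input_ids': [8, 122, 1]}, where the trailing 1 is the EOS id (per the
comment removed from data_collator.py). A span encoded with that trailing
id can never match a sublist of the encoded text, and slicing it off with
[:-1:] only worked when exactly one special token was appended;
add_special_tokens=False avoids both problems, and return_token_type_ids=None
presumably keeps token_type_ids, which this T5 variant does not use, out of
the returned dict. A minimal sketch of the matching failure, using the ids
from the removed comment; find_sublist is a hypothetical stand-in for the
repo's match_sublist helper:

    # Toy reproduction of the sublist matching done in sel2record.py.
    def find_sublist(haystack, needle):
        """Return every start index where needle occurs in haystack."""
        n = len(needle)
        return [i for i in range(len(haystack) - n + 1)
                if haystack[i:i + n] == needle]

    span_with_eos = [8, 122, 1]     # encode("人物") with special tokens
    span_plain = [8, 122]           # encode("人物") with add_special_tokens=False
    text_ids = [5, 8, 122, 7, 1]    # hypothetical encoded sentence

    print(find_sublist(text_ids, span_with_eos))  # [] -- EOS id spoils the match
    print(find_sublist(text_ids, span_plain))     # [1] -- span located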
From e916b10dc7a7e268134b73e5ffb95ba46ea47997 Mon Sep 17 00:00:00 2001
From: Yaojie Lu
Date: Wed, 18 May 2022 12:04:32 +0800
Subject: [PATCH 2/3] [duuie] fix max_seq_len bug in duuie code

* add kwargs for t5-bert-tokenizer
* use add_special_tokens=False return_token_type_ids=None for span/name encode
---
 .../DuUIE/uie/seq2struct/t5_bert_tokenizer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
index 01ee952ecdeb..5f13b10b7016 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
@@ -40,8 +40,10 @@ def __init__(self,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
             **kwargs, )
+
         if space_token not in self._additional_special_tokens:
             self._additional_special_tokens += [space_token]
+
         self._space_token = space_token
 
     def get_vocab(self):

From ec324715e7fdda9f1b98d0b04c70075146c536c5 Mon Sep 17 00:00:00 2001
From: Yaojie Lu
Date: Wed, 18 May 2022 12:16:39 +0800
Subject: [PATCH 3/3] [duuie] fix max_seq_len bug in duuie code

* add kwargs for t5-bert-tokenizer
* use add_special_tokens=False return_token_type_ids=None for span/name encode
---
 .../DuUIE/uie/seq2struct/data_collator.py     | 6 +++---
 .../DuUIE/uie/seq2struct/t5_bert_tokenizer.py | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py b/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
index 23e65d2cf9cb..57e0897a4637 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
@@ -115,9 +115,9 @@ def get_ordered_dict(schema_name_list, tokenizer):
         schema_ordered_dict = OrderedDict()
         for name in schema_name_list:
             # tokenizer.encode("人物") -> [8, 122]
-        encoded_name = tokenizer.encode(
-            name, add_special_tokens=False, return_token_type_ids=None)
-        schema_ordered_dict[name] = encoded_name["input_ids"]
+            encoded_name = tokenizer.encode(
+                name, add_special_tokens=False, return_token_type_ids=None)
+            schema_ordered_dict[name] = encoded_name["input_ids"]
         return schema_ordered_dict
 
     @staticmethod
diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
index 5f13b10b7016..042dc3044c01 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
@@ -40,7 +40,6 @@ def __init__(self,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
             **kwargs, )
-
         if space_token not in self._additional_special_tokens:
             self._additional_special_tokens += [space_token]
 
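Note on the t5_bert_tokenizer.py changes threaded through this series:
accepting and forwarding **kwargs in tokenize() lets keyword arguments that
PaddleNLP passes down from higher-level calls (such as the truncation
options behind the max_seq_len bug named in the commit title) reach the
parent tokenize() instead of raising or being dropped, and registering
space_token in _additional_special_tokens keeps the space placeholder from
being split into wordpieces. A toy sketch of the second point, assuming a
character-level stand-in for wordpiece; toy_wordpiece and the [unused1]
sentinel are invented for illustration and are not the PaddleNLP API:

    SPACE_TOKEN = "[unused1]"  # hypothetical sentinel standing in for ' '

    def toy_wordpiece(text, special_tokens=()):
        """Character-level stand-in for a wordpiece tokenizer."""
        out, i = [], 0
        while i < len(text):
            match = next(
                (t for t in special_tokens if text.startswith(t, i)), None)
            if match:       # registered special tokens stay atomic
                out.append(match)
                i += len(match)
            else:           # anything unregistered is split apart
                out.append(text[i])
                i += 1
        return out

    text = "北京" + SPACE_TOKEN + "上海"
    print(toy_wordpiece(text))                 # sentinel shredded into characters
    print(toy_wordpiece(text, [SPACE_TOKEN]))  # ['北', '京', '[unused1]', '上', '海']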