From 24cd7787fb7df81f2a1c6cef66a6fe5a15f3620a Mon Sep 17 00:00:00 2001
From: Yaojie Lu
Date: Wed, 18 May 2022 11:53:39 +0800
Subject: [PATCH 1/3] [duuie] fix max_seq_len bug in duuie code

* add kwargs for t5-bert-tokenizer
* use add_special_tokens=False return_token_type_ids=None for span/name encode
---
 .../DuUIE/uie/evaluation/sel2record.py        | 4 ++--
 .../DuUIE/uie/seq2struct/data_collator.py     | 9 ++++-----
 .../DuUIE/uie/seq2struct/t5_bert_tokenizer.py | 7 ++++---
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/examples/information_extraction/DuUIE/uie/evaluation/sel2record.py b/examples/information_extraction/DuUIE/uie/evaluation/sel2record.py
index 4812971aceec..5ec44d34de8d 100644
--- a/examples/information_extraction/DuUIE/uie/evaluation/sel2record.py
+++ b/examples/information_extraction/DuUIE/uie/evaluation/sel2record.py
@@ -764,9 +764,9 @@ def clean_wildcard(x):
 def fix_unk_from_text_with_tokenizer(span, text, tokenizer, unk='<unk>'):
     unk_id = tokenizer.vocab.to_indices(unk)
     tokenized_span = tokenizer.encode(
-        span, add_special_tokens=False)['input_ids']
+        span, add_special_tokens=False, return_token_type_ids=None)['input_ids']
     tokenized_text = tokenizer.encode(
-        text, add_special_tokens=False)['input_ids']
+        text, add_special_tokens=False, return_token_type_ids=None)['input_ids']
 
     matched = match_sublist(tokenized_text, tokenized_span)
     if len(matched) == 0:
diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py b/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
index 8b38ca347bb6..23e65d2cf9cb 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
@@ -114,11 +114,10 @@ def get_ordered_dict(schema_name_list, tokenizer):
         """
         schema_ordered_dict = OrderedDict()
         for name in schema_name_list:
-            # tokenizer.encode("人物")
-            encoded_name = tokenizer.encode(name)
-            # {'input_ids': [8, 122, 1]}
-            # -> [8, 122]
-            schema_ordered_dict[name] = encoded_name["input_ids"][:-1:]
+            # tokenizer.encode("人物") -> [8, 122]
+        encoded_name = tokenizer.encode(
+            name, add_special_tokens=False, return_token_type_ids=None)
+        schema_ordered_dict[name] = encoded_name["input_ids"]
         return schema_ordered_dict
 
     @staticmethod
diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
index 121124986d30..01ee952ecdeb 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
@@ -40,7 +40,8 @@ def __init__(self,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
             **kwargs, )
-
+        if space_token not in self._additional_special_tokens:
+            self._additional_special_tokens += [space_token]
         self._space_token = space_token
 
     def get_vocab(self):
@@ -51,7 +52,7 @@ def get_vocab(self):
         vocab.update(self.added_tokens_encoder)
         return vocab
 
-    def tokenize(self, text):
+    def tokenize(self, text, **kwargs):
         import re
         # Remove space between special tokens
         split_bracket = re.compile(
@@ -64,7 +65,7 @@ def tokenize(self, text):
                 new_text_list += [item[0].strip(), item[1]]
         text = "".join(new_text_list)
         text = text.replace(' ', self._space_token)
-        return super().tokenize(text)
+        return super().tokenize(text, **kwargs)
 
     def _add_eos_if_not_present(self, token_ids: List[int]) -> List[int]:
         """Do not add eos again if user already added it."""
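Note on the encode() changes in PATCH 1/3: with default arguments this
tokenizer appends special tokens, so encode("人物") returns
{'input_ids': [8, 122, 1]}, where the trailing 1 is the EOS id (per the
comment removed from data_collator.py). A span encoded with that trailing
id can never match a sublist of the encoded text, and slicing it off with
[:-1:] only worked when exactly one special token was appended;
add_special_tokens=False avoids both problems, and return_token_type_ids=None
presumably keeps token_type_ids, which this T5 variant does not use, out of
the returned dict. A minimal sketch of the matching failure, using the ids
from the removed comment; find_sublist is a hypothetical stand-in for the
repo's match_sublist helper:

    # Toy reproduction of the sublist matching done in sel2record.py.
    def find_sublist(haystack, needle):
        """Return every start index where needle occurs in haystack."""
        n = len(needle)
        return [i for i in range(len(haystack) - n + 1)
                if haystack[i:i + n] == needle]

    span_with_eos = [8, 122, 1]     # encode("人物") with special tokens
    span_plain = [8, 122]           # encode("人物") with add_special_tokens=False
    text_ids = [5, 8, 122, 7, 1]    # hypothetical encoded sentence

    print(find_sublist(text_ids, span_with_eos))  # [] -- EOS id spoils the match
    print(find_sublist(text_ids, span_plain))     # [1] -- span located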
From e916b10dc7a7e268134b73e5ffb95ba46ea47997 Mon Sep 17 00:00:00 2001
From: Yaojie Lu
Date: Wed, 18 May 2022 12:04:32 +0800
Subject: [PATCH 2/3] [duuie] fix max_seq_len bug in duuie code

* add kwargs for t5-bert-tokenizer
* use add_special_tokens=False return_token_type_ids=None for span/name encode
---
 .../DuUIE/uie/seq2struct/t5_bert_tokenizer.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
index 01ee952ecdeb..5f13b10b7016 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
@@ -40,8 +40,10 @@ def __init__(self,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
             **kwargs, )
+
         if space_token not in self._additional_special_tokens:
             self._additional_special_tokens += [space_token]
+
         self._space_token = space_token
 
     def get_vocab(self):

From ec324715e7fdda9f1b98d0b04c70075146c536c5 Mon Sep 17 00:00:00 2001
From: Yaojie Lu
Date: Wed, 18 May 2022 12:16:39 +0800
Subject: [PATCH 3/3] [duuie] fix max_seq_len bug in duuie code

* add kwargs for t5-bert-tokenizer
* use add_special_tokens=False return_token_type_ids=None for span/name encode
---
 .../DuUIE/uie/seq2struct/data_collator.py     | 6 +++---
 .../DuUIE/uie/seq2struct/t5_bert_tokenizer.py | 1 -
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py b/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
index 23e65d2cf9cb..57e0897a4637 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/data_collator.py
@@ -115,9 +115,9 @@ def get_ordered_dict(schema_name_list, tokenizer):
         schema_ordered_dict = OrderedDict()
         for name in schema_name_list:
             # tokenizer.encode("人物") -> [8, 122]
-        encoded_name = tokenizer.encode(
-            name, add_special_tokens=False, return_token_type_ids=None)
-        schema_ordered_dict[name] = encoded_name["input_ids"]
+            encoded_name = tokenizer.encode(
+                name, add_special_tokens=False, return_token_type_ids=None)
+            schema_ordered_dict[name] = encoded_name["input_ids"]
         return schema_ordered_dict
 
     @staticmethod
diff --git a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
index 5f13b10b7016..042dc3044c01 100644
--- a/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
+++ b/examples/information_extraction/DuUIE/uie/seq2struct/t5_bert_tokenizer.py
@@ -40,7 +40,6 @@ def __init__(self,
             tokenize_chinese_chars=tokenize_chinese_chars,
             strip_accents=strip_accents,
             **kwargs, )
-
         if space_token not in self._additional_special_tokens:
             self._additional_special_tokens += [space_token]
 
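Note on the t5_bert_tokenizer.py changes threaded through this series:
accepting and forwarding **kwargs in tokenize() lets keyword arguments that
PaddleNLP passes down from higher-level calls (such as the truncation
options behind the max_seq_len bug named in the commit title) reach the
parent tokenize() instead of raising or being dropped, and registering
space_token in _additional_special_tokens keeps the space placeholder from
being split into wordpieces. A toy sketch of the second point, assuming a
character-level stand-in for wordpiece; toy_wordpiece and the [unused1]
sentinel are invented for illustration and are not the PaddleNLP API:

    SPACE_TOKEN = "[unused1]"  # hypothetical sentinel standing in for ' '

    def toy_wordpiece(text, special_tokens=()):
        """Character-level stand-in for a wordpiece tokenizer."""
        out, i = [], 0
        while i < len(text):
            match = next(
                (t for t in special_tokens if text.startswith(t, i)), None)
            if match:       # registered special tokens stay atomic
                out.append(match)
                i += len(match)
            else:           # anything unregistered is split apart
                out.append(text[i])
                i += 1
        return out

    text = "北京" + SPACE_TOKEN + "上海"
    print(toy_wordpiece(text))                 # sentinel shredded into characters
    print(toy_wordpiece(text, [SPACE_TOKEN]))  # ['北', '京', '[unused1]', '上', '海']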