[STACL] Fix DataLoader's collate_fn to return numpy.array (PaddlePaddle#326)

* feat: add text_simultaneous_translation: STACL

* refactor: update requirement and readme, rename function/class name

* feat: add tokenizer

* refactor: refactor vocab, data_reader

* refactor: rename TransformerModel, fix src_slf_attn_bias

* docs: update README and requirements

* fix: avoid getting the whole source in advance

* docs: update docs

* refactor: refactor tokenizer

* docs: update pretrained model BLEU

* docs: update docs

* style: change code style (line too long)

* docs: update information_extraction link

* docs: add pretrained models download link

* docs: add STACL to example_readme; enrich parameter descriptions

* fix: fix the wrong shape of validation label

* docs: set default parameters to yaml

* refactor: use paddlenlp.transformers api

* fix: fix DataLoader's collate_fn to return numpy.array

* fix: improve tokenizer support for Windows

* docs: add code annotation

* docs: english annotation

Co-authored-by: Guo Sheng <whucsgs@163.com>
gongel and guoshengCS authored Apr 30, 2021
1 parent 9585ea2 commit fd880ac
Showing 4 changed files with 18 additions and 87 deletions.
model.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
+from paddlenlp.transformers import WordEmbedding, PositionalEmbedding
 
 
 class CrossEntropyCriterion(nn.Layer):
@@ -48,68 +49,6 @@ def forward(self, predict, label):
         return sum_cost, avg_cost, token_num
 
 
-def position_encoding_init(n_position, d_pos_vec, dtype="float32"):
-    """
-    Generate the initial values for the sinusoid position encoding table.
-    """
-    channels = d_pos_vec
-    position = np.arange(n_position)
-    num_timescales = channels // 2
-    log_timescale_increment = (np.log(float(1e4) / float(1)) /
-                               (num_timescales - 1))
-    inv_timescales = np.exp(
-        np.arange(num_timescales) * -log_timescale_increment)
-
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
-    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
-    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
-    position_enc = signal
-    return position_enc.astype(dtype)
-
-
-class WordEmbedding(nn.Layer):
-    """
-    Word Embedding + Scale
-    """
-
-    def __init__(self, vocab_size, emb_dim, bos_idx=0):
-        super(WordEmbedding, self).__init__()
-        self.emb_dim = emb_dim
-
-        self.word_embedding = nn.Embedding(
-            num_embeddings=vocab_size,
-            embedding_dim=emb_dim,
-            padding_idx=bos_idx,
-            weight_attr=paddle.ParamAttr(
-                initializer=nn.initializer.Normal(0., emb_dim**-0.5)))
-
-    def forward(self, word):
-        word_emb = self.emb_dim**0.5 * self.word_embedding(word)
-        return word_emb
-
-
-class PositionalEmbedding(nn.Layer):
-    """
-    Positional Embedding
-    """
-
-    def __init__(self, emb_dim, max_length, bos_idx=0):
-        super(PositionalEmbedding, self).__init__()
-        self.emb_dim = emb_dim
-
-        self.pos_encoder = nn.Embedding(
-            num_embeddings=max_length,
-            embedding_dim=self.emb_dim,
-            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Assign(
-                position_encoding_init(max_length, self.emb_dim))))
-
-    def forward(self, pos):
-        pos_emb = self.pos_encoder(pos)
-        pos_emb.stop_gradient = True
-        return pos_emb
-
-
 class DecoderLayer(nn.TransformerDecoderLayer):
     def __init__(self, *args, **kwargs):
         super(DecoderLayer, self).__init__(*args, **kwargs)
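The local `position_encoding_init`, `WordEmbedding`, and `PositionalEmbedding` are dropped in favor of the equivalents shipped in `paddlenlp.transformers`. A minimal sketch of how the imported classes compose, assuming they keep the constructor arguments and scaling behavior of the removed local ones (all sizes below are illustrative):

```python
import paddle
from paddlenlp.transformers import WordEmbedding, PositionalEmbedding

vocab_size, emb_dim, max_length = 10000, 512, 256   # illustrative sizes

word_emb = WordEmbedding(vocab_size, emb_dim)       # scales output by emb_dim**0.5
pos_emb = PositionalEmbedding(emb_dim, max_length)  # frozen sinusoid table

src_word = paddle.randint(0, vocab_size, shape=[2, 8])     # batch of token ids
src_pos = paddle.arange(0, 8).unsqueeze(0).expand([2, 8])  # position indices
enc_input = word_emb(src_word) + pos_emb(src_pos)          # [2, 8, emb_dim]
```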
predict.py
@@ -19,8 +19,9 @@
 from attrdict import AttrDict
 
 import paddle
+from paddlenlp.transformers import position_encoding_init
 import reader
-from model import SimultaneousTransformer, position_encoding_init
+from model import SimultaneousTransformer
 
 
 def parse_args():
@@ -90,7 +91,7 @@ def do_predict(args):
 
     with paddle.no_grad():
         for input_data in test_loader:
-            (src_word, _) = input_data
+            (src_word, ) = input_data
 
             finished_seq, finished_scores = transformer.greedy_search(
                 src_word, max_len=args.max_out_len, waitk=args.waitk)
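For context, `greedy_search` decodes under the STACL wait-k policy: translation starts after reading only the first k source tokens, then alternates one read per write. A simplified, self-contained illustration of that schedule (not the repository's implementation; `next_token_fn` and `eos_id` are hypothetical placeholders):

```python
def waitk_greedy(next_token_fn, src_ids, waitk, max_len, eos_id):
    """Simplified wait-k greedy decoding: the t-th target token is
    produced after reading at most waitk + t source tokens."""
    out = []
    for t in range(max_len):
        visible = src_ids[:min(len(src_ids), waitk + t)]  # wait-k read schedule
        tok = next_token_fn(visible, out)  # one decoder step (placeholder)
        if tok == eos_id:
            break
        out.append(tok)
    return out
```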
reader.py
@@ -14,7 +14,7 @@
 
 from functools import partial
 from paddle.io import DataLoader
-from paddlenlp.data import Vocab
+from paddlenlp.data import Vocab, Pad
 from paddlenlp.data.sampler import SamplerHelper
 from paddlenlp.datasets import load_dataset
 
@@ -169,20 +169,10 @@ def prepare_train_input(insts, pad_idx):
     """
     Put all padded data needed by training into a list.
     """
-    src_max_len = max([len(inst[0]) for inst in insts])
-    trg_max_len = max([len(inst[1]) for inst in insts]) - 1
-    src_word = [
-        inst[0] + [pad_idx] * (src_max_len - len(inst[0])) for inst in insts
-    ]
-    trg_word = [
-        inst[1][:-1] + [pad_idx] * (trg_max_len - len(inst[1][:-1]))
-        for inst in insts
-    ]
-    lbl_word = [
-        inst[1][1:] + [pad_idx] * (trg_max_len - len(inst[1][1:]))
-        for inst in insts
-    ]
-
+    word_pad = Pad(pad_idx)
+    src_word = word_pad([inst[0] for inst in insts])
+    trg_word = word_pad(inst[1][:-1] for inst in insts)
+    lbl_word = word_pad([inst[1][1:] for inst in insts])
     data_inputs = [src_word, trg_word, lbl_word]
 
     return data_inputs
@@ -192,14 +182,10 @@ def prepare_infer_input(insts, pad_idx):
     """
     Put all padded data needed by beam search decoder into a list.
     """
-    src_max_len = max([len(inst[0]) for inst in insts])
-    src_word = [
-        inst[0] + [pad_idx] * (src_max_len - len(inst[0])) for inst in insts
-    ]
-
-    trg_word = [inst[1] for inst in insts]
+    word_pad = Pad(pad_idx)
+    src_word = word_pad(inst[0] for inst in insts)
 
-    return [src_word, trg_word]
+    return [src_word, ]
 
 
 class SortType(object):
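This is the change the commit title refers to: `paddlenlp.data.Pad` pads every sample to the batch's maximum length and returns a `numpy.ndarray`, replacing the hand-rolled Python-list padding. A quick sketch of the behavior (the pad id and token ids below are made up):

```python
from paddlenlp.data import Pad

pad_idx = 0                         # hypothetical pad token id
word_pad = Pad(pad_idx)
batch = [[2, 17, 8, 3], [2, 9, 3]]  # two samples of unequal length
src_word = word_pad(batch)
print(type(src_word), src_word.shape)
# <class 'numpy.ndarray'> (2, 4) -- the short sample is right-padded with 0
```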
tokenizer.py
@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import _locale
 import jieba
 
 from subword_nmt import subword_nmt
 
+# By default, Windows opens files with the GBK encoding, and the
+# subword_nmt package does not support setting the encoding passed to
+# open(), so the default is forced to UTF-8 uniformly.
+_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
 
 
 class STACLTokenizer:
     def __init__(self, bpe_dict, is_chinese):
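The `_locale` override works because, on Windows builds of CPython, `locale.getdefaultlocale()` and the preferred encoding that `open()` falls back to are derived from the internal `_locale._getdefaultlocale` hook. A sketch of the effect, relying on that CPython implementation detail (the example output assumes a Chinese-locale Windows machine):

```python
import locale
import _locale

print(locale.getdefaultlocale())  # e.g. ('zh_CN', 'cp936') -> GBK default

# Monkey-patch the internal hook so files opened without an explicit
# encoding (as subword_nmt does) are decoded as UTF-8 instead of GBK.
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

print(locale.getdefaultlocale())  # ('en_US', 'utf8')
```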
