[STACL] Fix DataLoader's collate_fn to return numpy.array (PaddlePaddle#326)

* feat: add text_simultaneous_translation: STACL

* refactor: update requirement and readme, rename function/class name

* feat: add tokenizer

* refactor: refactor vocab, data_reader

* refactor: rename TransformerModel, fix src_slf_attn_bias

* docs: update README and requirements

* fix: avoid getting the whole source in advance

* docs: update docs

* refactor: refactor tokenizer

* docs: update pretrained model BLEU

* docs: update docs

* style: change code style (line too long)

* docs: update information_extraction link

* docs: add pretrained models download link

* docs: add STACL to example_readme; enrich parameter descriptions

* fix: fix the wrong shape of validation label

* docs: set default parameters to yaml

* refactor: use paddlenlp.transformers api

* fix: fix DataLoader's collate_fn to return numpy.array

* fix: improve tokenizer support for Windows

* docs: add code annotation

* docs: english annotation

Co-authored-by: Guo Sheng <whucsgs@163.com>
gongel and guoshengCS authored Apr 30, 2021
1 parent 9585ea2 commit fd880ac
Showing 4 changed files with 18 additions and 87 deletions.
model.py
@@ -19,6 +19,7 @@
 import paddle
 import paddle.nn as nn
 import paddle.nn.functional as F
+from paddlenlp.transformers import WordEmbedding, PositionalEmbedding
 
 
 class CrossEntropyCriterion(nn.Layer):
@@ -48,68 +49,6 @@ def forward(self, predict, label):
         return sum_cost, avg_cost, token_num
 
 
-def position_encoding_init(n_position, d_pos_vec, dtype="float32"):
-    """
-    Generate the initial values for the sinusoid position encoding table.
-    """
-    channels = d_pos_vec
-    position = np.arange(n_position)
-    num_timescales = channels // 2
-    log_timescale_increment = (np.log(float(1e4) / float(1)) /
-                               (num_timescales - 1))
-    inv_timescales = np.exp(
-        np.arange(num_timescales) * -log_timescale_increment)
-
-    scaled_time = np.expand_dims(position, 1) * np.expand_dims(inv_timescales,
-                                                               0)
-    signal = np.concatenate([np.sin(scaled_time), np.cos(scaled_time)], axis=1)
-    signal = np.pad(signal, [[0, 0], [0, np.mod(channels, 2)]], 'constant')
-    position_enc = signal
-    return position_enc.astype(dtype)
-
-
-class WordEmbedding(nn.Layer):
-    """
-    Word Embedding + Scale
-    """
-
-    def __init__(self, vocab_size, emb_dim, bos_idx=0):
-        super(WordEmbedding, self).__init__()
-        self.emb_dim = emb_dim
-
-        self.word_embedding = nn.Embedding(
-            num_embeddings=vocab_size,
-            embedding_dim=emb_dim,
-            padding_idx=bos_idx,
-            weight_attr=paddle.ParamAttr(
-                initializer=nn.initializer.Normal(0., emb_dim**-0.5)))
-
-    def forward(self, word):
-        word_emb = self.emb_dim**0.5 * self.word_embedding(word)
-        return word_emb
-
-
-class PositionalEmbedding(nn.Layer):
-    """
-    Positional Embedding
-    """
-
-    def __init__(self, emb_dim, max_length, bos_idx=0):
-        super(PositionalEmbedding, self).__init__()
-        self.emb_dim = emb_dim
-
-        self.pos_encoder = nn.Embedding(
-            num_embeddings=max_length,
-            embedding_dim=self.emb_dim,
-            weight_attr=paddle.ParamAttr(initializer=nn.initializer.Assign(
-                position_encoding_init(max_length, self.emb_dim))))
-
-    def forward(self, pos):
-        pos_emb = self.pos_encoder(pos)
-        pos_emb.stop_gradient = True
-        return pos_emb
-
-
 class DecoderLayer(nn.TransformerDecoderLayer):
     def __init__(self, *args, **kwargs):
         super(DecoderLayer, self).__init__(*args, **kwargs)
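The local `position_encoding_init`, `WordEmbedding`, and `PositionalEmbedding` are dropped in favor of the equivalents shipped in `paddlenlp.transformers`. A minimal sketch of how the imported classes compose, assuming they keep the constructor arguments and scaling behavior of the removed local ones (all sizes below are illustrative):

```python
import paddle
from paddlenlp.transformers import WordEmbedding, PositionalEmbedding

vocab_size, emb_dim, max_length = 10000, 512, 256   # illustrative sizes

word_emb = WordEmbedding(vocab_size, emb_dim)       # scales output by emb_dim**0.5
pos_emb = PositionalEmbedding(emb_dim, max_length)  # frozen sinusoid table

src_word = paddle.randint(0, vocab_size, shape=[2, 8])     # batch of token ids
src_pos = paddle.arange(0, 8).unsqueeze(0).expand([2, 8])  # position indices
enc_input = word_emb(src_word) + pos_emb(src_pos)          # [2, 8, emb_dim]
```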
predict.py
@@ -19,8 +19,9 @@
 from attrdict import AttrDict
 
 import paddle
+from paddlenlp.transformers import position_encoding_init
 import reader
-from model import SimultaneousTransformer, position_encoding_init
+from model import SimultaneousTransformer
 
 
 def parse_args():
@@ -90,7 +91,7 @@ def do_predict(args):
 
     with paddle.no_grad():
         for input_data in test_loader:
-            (src_word, _) = input_data
+            (src_word, ) = input_data
 
             finished_seq, finished_scores = transformer.greedy_search(
                 src_word, max_len=args.max_out_len, waitk=args.waitk)
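For context, `greedy_search` decodes under the STACL wait-k policy: translation starts after reading only the first k source tokens, then alternates one read per write. A simplified, self-contained illustration of that schedule (not the repository's implementation; `next_token_fn` and `eos_id` are hypothetical placeholders):

```python
def waitk_greedy(next_token_fn, src_ids, waitk, max_len, eos_id):
    """Simplified wait-k greedy decoding: the t-th target token is
    produced after reading at most waitk + t source tokens."""
    out = []
    for t in range(max_len):
        visible = src_ids[:min(len(src_ids), waitk + t)]  # wait-k read schedule
        tok = next_token_fn(visible, out)  # one decoder step (placeholder)
        if tok == eos_id:
            break
        out.append(tok)
    return out
```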
reader.py
@@ -14,7 +14,7 @@
 
 from functools import partial
 from paddle.io import DataLoader
-from paddlenlp.data import Vocab
+from paddlenlp.data import Vocab, Pad
 from paddlenlp.data.sampler import SamplerHelper
 from paddlenlp.datasets import load_dataset
 
@@ -169,20 +169,10 @@ def prepare_train_input(insts, pad_idx):
     """
     Put all padded data needed by training into a list.
     """
-    src_max_len = max([len(inst[0]) for inst in insts])
-    trg_max_len = max([len(inst[1]) for inst in insts]) - 1
-    src_word = [
-        inst[0] + [pad_idx] * (src_max_len - len(inst[0])) for inst in insts
-    ]
-    trg_word = [
-        inst[1][:-1] + [pad_idx] * (trg_max_len - len(inst[1][:-1]))
-        for inst in insts
-    ]
-    lbl_word = [
-        inst[1][1:] + [pad_idx] * (trg_max_len - len(inst[1][1:]))
-        for inst in insts
-    ]
-
+    word_pad = Pad(pad_idx)
+    src_word = word_pad([inst[0] for inst in insts])
+    trg_word = word_pad(inst[1][:-1] for inst in insts)
+    lbl_word = word_pad([inst[1][1:] for inst in insts])
     data_inputs = [src_word, trg_word, lbl_word]
 
     return data_inputs
@@ -192,14 +182,10 @@ def prepare_infer_input(insts, pad_idx):
     """
     Put all padded data needed by beam search decoder into a list.
     """
-    src_max_len = max([len(inst[0]) for inst in insts])
-    src_word = [
-        inst[0] + [pad_idx] * (src_max_len - len(inst[0])) for inst in insts
-    ]
-
-    trg_word = [inst[1] for inst in insts]
+    word_pad = Pad(pad_idx)
+    src_word = word_pad(inst[0] for inst in insts)
 
-    return [src_word, trg_word]
+    return [src_word, ]
 
 
 class SortType(object):
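This is the change the commit title refers to: `paddlenlp.data.Pad` pads every sample to the batch's maximum length and returns a `numpy.ndarray`, replacing the hand-rolled Python-list padding. A quick sketch of the behavior (the pad id and token ids below are made up):

```python
from paddlenlp.data import Pad

pad_idx = 0                         # hypothetical pad token id
word_pad = Pad(pad_idx)
batch = [[2, 17, 8, 3], [2, 9, 3]]  # two samples of unequal length
src_word = word_pad(batch)
print(type(src_word), src_word.shape)
# <class 'numpy.ndarray'> (2, 4) -- the short sample is right-padded with 0
```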
tokenizer.py
@@ -12,10 +12,15 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 
+import _locale
 import jieba
 
 from subword_nmt import subword_nmt
 
+# By default, Windows opens files with the GBK encoding, and the
+# subword_nmt package does not support setting the encoding passed to
+# open(), so the default is forced to UTF-8 uniformly.
+_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])
 
 
 class STACLTokenizer:
     def __init__(self, bpe_dict, is_chinese):
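The `_locale` override works because, on Windows builds of CPython, `locale.getdefaultlocale()` and the preferred encoding that `open()` falls back to are derived from the internal `_locale._getdefaultlocale` hook. A sketch of the effect, relying on that CPython implementation detail (the example output assumes a Chinese-locale Windows machine):

```python
import locale
import _locale

print(locale.getdefaultlocale())  # e.g. ('zh_CN', 'cp936') -> GBK default

# Monkey-patch the internal hook so files opened without an explicit
# encoding (as subword_nmt does) are decoded as UTF-8 instead of GBK.
_locale._getdefaultlocale = (lambda *args: ['en_US', 'utf8'])

print(locale.getdefaultlocale())  # ('en_US', 'utf8')
```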
