From 5603ba356a50985e73c65af835880b4e744febc5 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 13:54:20 -0700 Subject: [PATCH 01/12] partial progress on rewriting modules.py --- parlai/agents/seq2seq/modules.py | 605 ++++++++++++++----------------- parlai/agents/seq2seq/seq2seq.py | 10 +- 2 files changed, 281 insertions(+), 334 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index 366235f90ae..bf9d49a36d3 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -34,7 +34,6 @@ def __init__(self, opt, num_features, super().__init__() self.opt = opt - self.rank = opt['rank_candidates'] self.attn_type = opt['attention'] self.NULL_IDX = padding_idx @@ -43,7 +42,7 @@ def __init__(self, opt, num_features, self.longest_label = longest_label rnn_class = Seq2seq.RNN_OPTS[opt['rnn_class']] - self.decoder = Decoder( + self.decoder = RNNDecoder( num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, emb_size=opt['embeddingsize'], hidden_size=opt['hiddensize'], num_layers=opt['numlayers'], dropout=opt['dropout'], @@ -51,22 +50,22 @@ def __init__(self, opt, num_features, attn_type=opt['attention'], attn_length=opt['attention_length'], attn_time=opt.get('attention_time'), bidir_input=opt['bidirectional'], - numsoftmax=opt.get('numsoftmax', 1), - softmax_layer_bias=opt.get('softmax_layer_bias', False)) + numsoftmax=opt.get('numsoftmax', 1)) shared_lt = (self.decoder.lt if opt['lookuptable'] in ['enc_dec', 'all'] else None) shared_rnn = self.decoder.rnn if opt['decoder'] == 'shared' else None - self.encoder = Encoder( + self.encoder = RNNEncoder( num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, emb_size=opt['embeddingsize'], hidden_size=opt['hiddensize'], num_layers=opt['numlayers'], dropout=opt['dropout'], bidirectional=opt['bidirectional'], shared_lt=shared_lt, shared_rnn=shared_rnn) - if self.rank: + if opt['rank_candidates']: self.ranker = Ranker( self.decoder, + self.START, padding_idx=self.NULL_IDX, attn_type=opt['attention']) @@ -78,192 +77,104 @@ def __init__(self, opt, num_features, if not os.path.exists(self.beam_dump_path): os.makedirs(self.beam_dump_path) - def unbeamize_hidden(self, hidden, beam_size, batch_size): - """ - Creates a view of the hidden where batch axis is collapsed with beam axis, - we need to do this for batched beam search, i.e. we emulate bigger mini-batch - :param hidden: hidden state of the decoder - :param beam_size: beam size, i.e. 
num of hypothesis - :param batch_size: number of samples in the mini batch - :return: view of the hidden - """ - if isinstance(hidden, tuple): - num_layers = hidden[0].size(0) - hidden_size = hidden[0].size(-1) - return (hidden[0].view(num_layers, batch_size * beam_size, hidden_size), - hidden[1].view(num_layers, batch_size * beam_size, hidden_size)) - else: # GRU - num_layers = hidden.size(0) - hidden_size = hidden.size(-1) - return hidden.view(num_layers, batch_size * beam_size, hidden_size) - - def unbeamize_enc_out(self, enc_out, beam_size, batch_size): - hidden_size = enc_out.size(-1) - return enc_out.view(batch_size * beam_size, -1, hidden_size) - - def forward(self, xs, ys=None, cands=None, cand_indices=None, prev_enc=None, - rank_during_training=False, beam_size=1, topk=1): + def _encode(self, xs, x_lens=None, prev_enc=None): + if prev_enc is not None: + return prev_enc + else: + enc_out, hidden = self.encoder(xs, x_lens) + attn_mask = xs.ne(0).float() if self.attn_type != 'none' else None + return hidden, enc_out, attn_mask + + def _rank(self, cand_params, encoder_states): + if cand_params is not None: + return self.ranker.forward(cand_params, encoder_states) + return None + + def _starts(self, bsz): + return self.START.detach().expand(bsz, 1) + + def _decode_forced(self, ys, encoder_states): + bsz = ys.size(0) + seqlen = ys.size(1) + + hidden = encoder_states[0] + attn_params = (encoder_states[1], encoder_states[2]) + + # input to model is START + each target except the last + y_in = ys.narrow(1, 0, seqlen - 1) + xs = torch.cat([self._starts(bsz), y_in], 1) + + scores = [] + if self.attn_type == 'none': + # do the whole thing in one go + output, hidden = self.decoder(xs, hidden, attn_params) + score = self.output(output) + scores.append(score) + else: + # need to feed in one token at a time so we can do attention + # TODO: do we need to do this? actually shouldn't need to since we + # don't do input feeding + for i in range(seqlen): + xi = xs.select(1, i) + output, hidden = self.decoder(xi, hidden, attn_params) + score = self.output(output) + scores.append(score) + + return scores + + def _decode(self, encoder_states, maxlen): + hidden = encoder_states[0] + attn_params = (encoder_states[1], encoder_states[2]) + bsz = hidden.size(0) + + xs = self._starts(bsz) # input start token + + scores = [] + for _ in range(maxlen): + # generate at most longest_label tokens + output, hidden = self.decoder(xs, hidden, attn_params) + score = self.output(output) + scores.append(score) + xs = scores.max(1)[0] # next input is current predicted output + + return scores + + def forward(self, xs, x_lens=None, ys=None, cand_params=None, + prev_enc=None, rank_during_training=False, maxlen=None): """Get output predictions from the model. - Arguments: - xs -- input to the encoder - ys -- expected output from the decoder - cands -- set of candidates to rank, if applicable - cand_indices -- indices to match candidates with their appropriate xs - prev_enc -- if you know you'll pass in the same xs multiple times and - the model is in eval mode, you can pass in the encoder output from - the last forward pass to skip recalcuating the same encoder output - rank_during_training -- (default False) if set, ranks any available - cands during training as well + :param xs: (bsz x seqlen) LongTensor input to the encoder + :param x_lens: (list of ints) length of each input sequence + :param ys: expected output from the decoder. used for teacher forcing to calculate loss. 
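            (concretely, teacher forcing feeds the decoder START at the
            first step and ys[:, t-1] at step t, rather than the model's
            own predictions)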
+ :param cand_params: set of candidates to rank, and indices to match candidates with their appropriate xs + :param prev_enc: if you know you'll pass in the same xs multiple times, you can pass in the encoder output from the last forward pass to skip recalcuating the same encoder output + : """ - input_xs = xs - nbest_beam_preds, nbest_beam_scores = None, None - bsz = len(xs) if ys is not None: # keep track of longest label we've ever seen # we'll never produce longer ones than that during prediction self.longest_label = max(self.longest_label, ys.size(1)) - if prev_enc is not None: - enc_out, hidden, attn_mask = prev_enc - else: - enc_out, hidden = self.encoder(xs) - attn_mask = xs.ne(0).float() if self.attn_type != 'none' else None - encoder_states = (enc_out, hidden, attn_mask) - start = self.START.detach() - starts = start.expand(bsz, 1) + encoder_states = self.encode(xs, x_lens, prev_enc) - predictions = [] - scores = [] - cand_preds, cand_scores = None, None - if self.rank and cands is not None: - decode_params = (start, hidden, enc_out, attn_mask) - if self.training: - if rank_during_training: - cand_preds, cand_scores = self.ranker.forward(cands, cand_indices, decode_params=decode_params) - else: - cand_preds, cand_scores = self.ranker.forward(cands, cand_indices, decode_params=decode_params) + # rank candidates if they are available + cand_scores = self._rank(cand_params, encoder_states) if ys is not None: - y_in = ys.narrow(1, 0, ys.size(1) - 1) - xs = torch.cat([starts, y_in], 1) - if self.attn_type == 'none': - preds, score, hidden = self.decoder(xs, hidden, enc_out, attn_mask) - predictions.append(preds) - scores.append(score) - else: - for i in range(ys.size(1)): - xi = xs.select(1, i) - preds, score, hidden = self.decoder(xi, hidden, enc_out, attn_mask) - predictions.append(preds) - scores.append(score) + # use teacher forcing + scores = self._decode_forced(ys, encoder_states) else: - # here we do search: supported search types: greedy, beam search - if beam_size == 1: - done = [False for _ in range(bsz)] - total_done = 0 - xs = starts - - for _ in range(self.longest_label): - # generate at most longest_label tokens - preds, score, hidden = self.decoder(xs, hidden, enc_out, attn_mask, topk) - scores.append(score) - xs = preds - predictions.append(preds) - - # check if we've produced the end token - for b in range(bsz): - if not done[b]: - # only add more tokens for examples that aren't done - if preds.data[b][0] == self.END_IDX: - # if we produced END, we're done - done[b] = True - total_done += 1 - if total_done == bsz: - # no need to generate any more - break - - elif beam_size > 1: - enc_out, hidden = encoder_states[0], encoder_states[1] # take it from encoder - enc_out = enc_out.unsqueeze(1).repeat(1, beam_size, 1, 1) - # create batch size num of beams - data_device = enc_out.device - beams = [Beam(beam_size, 3, 0, 1, 2, min_n_best=beam_size / 2, cuda=data_device) for _ in range(bsz)] - # init the input with start token - xs = starts - # repeat tensors to support batched beam - xs = xs.repeat(1, beam_size) - attn_mask = input_xs.ne(0).float() - attn_mask = attn_mask.unsqueeze(1).repeat(1, beam_size, 1) - repeated_hidden = [] - - if isinstance(hidden, tuple): - for i in range(len(hidden)): - repeated_hidden.append(hidden[i].unsqueeze(2).repeat(1, 1, beam_size, 1)) - hidden = self.unbeamize_hidden(tuple(repeated_hidden), beam_size, bsz) - else: # GRU - repeated_hidden = hidden.unsqueeze(2).repeat(1, 1, beam_size, 1) - hidden = self.unbeamize_hidden(repeated_hidden, 
beam_size, bsz) - enc_out = self.unbeamize_enc_out(enc_out, beam_size, bsz) - xs = xs.view(bsz * beam_size, -1) - for step in range(self.longest_label): - if all((b.done() for b in beams)): - break - out = self.decoder(xs, hidden, enc_out) - scores = out[1] - scores = scores.view(bsz, beam_size, -1) # -1 is a vocab size - for i, b in enumerate(beams): - b.advance(F.log_softmax(scores[i, :], dim=-1)) - xs = torch.cat([b.get_output_from_current_step() for b in beams]).unsqueeze(-1) - permute_hidden_idx = torch.cat( - [beam_size * i + b.get_backtrack_from_current_step() for i, b in enumerate(beams)]) - new_hidden = out[2] - if isinstance(hidden, tuple): - for i in range(len(hidden)): - hidden[i].data.copy_(new_hidden[i].data.index_select(dim=1, index=permute_hidden_idx)) - else: # GRU - hidden.data.copy_(new_hidden.data.index_select(dim=1, index=permute_hidden_idx)) - - for b in beams: - b.check_finished() - beam_pred = [b.get_pretty_hypothesis(b.get_top_hyp()[0])[1:] for b in beams] - # these beam scores are rescored with length penalty! - beam_scores = torch.stack([b.get_top_hyp()[1] for b in beams]) - pad_length = max([t.size(0) for t in beam_pred]) - beam_pred = torch.stack([pad(t, length=pad_length, dim=0) for t in beam_pred], dim=0) - - # prepare n best list for each beam - n_best_beam_tails = [b.get_rescored_finished(n_best=len(b.finished)) for b in beams] - nbest_beam_scores = [] - nbest_beam_preds = [] - for i, beamtails in enumerate(n_best_beam_tails): - perbeam_preds = [] - perbeam_scores = [] - for tail in beamtails: - perbeam_preds.append(beams[i].get_pretty_hypothesis(beams[i].get_hyp_from_finished(tail))) - perbeam_scores.append(tail.score) - nbest_beam_scores.append(perbeam_scores) - nbest_beam_preds.append(perbeam_preds) - - if self.beam_log_freq > 0.0: - num_dump = round(bsz * self.beam_log_freq) - for i in range(num_dump): - dot_graph = beams[i].get_beam_dot(dictionary=self.dict) - dot_graph.write_png(os.path.join(self.beam_dump_path, "{}.png".format(self.beam_dump_filecnt))) - self.beam_dump_filecnt += 1 - - predictions = beam_pred - scores = beam_scores - - if isinstance(predictions, list): - predictions = torch.cat(predictions, 1) + scores = self._decode(encoder_states, maxlen or self.longest_label) + if isinstance(scores, list): scores = torch.cat(scores, 1) - return predictions, scores, cand_preds, cand_scores, encoder_states, nbest_beam_preds, nbest_beam_scores + return scores, cand_scores, encoder_states + +class RNNEncoder(nn.Module): + """RNN Encoder.""" -class Encoder(nn.Module): def __init__(self, num_features, padding_idx=0, rnn_class='lstm', emb_size=128, hidden_size=128, num_layers=2, dropout=0.1, bidirectional=False, shared_lt=None, shared_rnn=None, @@ -284,20 +195,25 @@ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', if shared_rnn is None: self.rnn = rnn_class(emb_size, hidden_size, num_layers, - dropout=dropout, batch_first=True, - bidirectional=bidirectional) + dropout=dropout if num_layers > 1 else 0, + batch_first=True, bidirectional=bidirectional) elif bidirectional: raise RuntimeError('Cannot share decoder with bidir encoder.') else: self.rnn = shared_rnn - def forward(self, xs): + def forward(self, xs, x_lens=None): + """Encode sequence. 
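
        When the batch is sorted by decreasing length, the embedded inputs
        are packed with pack_padded_sequence before the RNN; otherwise the
        padded batch is fed to the RNN as-is (see the try/except below).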
+ + :param xs: (bsz x seqlen) LongTensor of input token indices + """ bsz = len(xs) # embed input tokens xes = self.dropout(self.lt(xs)) try: - x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] + if x_lens is None: + x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] xes = pack_padded_sequence(xes, x_lens, batch_first=True) packed = True except ValueError: @@ -309,29 +225,27 @@ def forward(self, xs): encoder_output, _ = pad_packed_sequence(encoder_output, batch_first=True) if self.dirs > 1: - # take elementwise max between forward and backward hidden states - # NOTE: currently using max, but maybe should use Linear + # project to decoder dimension by taking sum of forward and back if isinstance(self.rnn, nn.LSTM): - hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).max(1)[0], - hidden[1].view(-1, self.dirs, bsz, self.hsz).max(1)[0]) + hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).sum(1), + hidden[1].view(-1, self.dirs, bsz, self.hsz).sum(1)) else: - hidden = hidden.view(-1, self.dirs, bsz, self.hsz).max(1)[0] + hidden = hidden.view(-1, self.dirs, bsz, self.hsz).sum(1) return encoder_output, hidden -class Decoder(nn.Module): +class RNNDecoder(nn.Module): + """Recurrent decoder module. + + Can be used as a standalone language model or paired with an encoder. + """ + def __init__(self, num_features, padding_idx=0, rnn_class='lstm', emb_size=128, hidden_size=128, num_layers=2, dropout=0.1, - bidir_input=False, share_output=True, - attn_type='none', attn_length=-1, attn_time='pre', - sparse=False, numsoftmax=1, softmax_layer_bias=False): + bidir_input=False, attn_type='none', attn_time='pre', + attn_length=-1, sparse=False): super().__init__() - - if padding_idx != 0: - raise RuntimeError('This module\'s output layer needs to be fixed ' - 'if you want a padding_idx other than zero.') - self.dropout = nn.Dropout(p=dropout) self.layers = num_layers self.hsz = hidden_size @@ -340,21 +254,8 @@ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', self.lt = nn.Embedding(num_features, emb_size, padding_idx=padding_idx, sparse=sparse) self.rnn = rnn_class(emb_size, hidden_size, num_layers, - dropout=dropout, batch_first=True) - - # rnn output to embedding - if hidden_size != emb_size and numsoftmax == 1: - # self.o2e = RandomProjection(hidden_size, emb_size) - # other option here is to learn these weights - self.o2e = nn.Linear(hidden_size, emb_size, bias=False) - else: - # no need for any transformation here - self.o2e = lambda x: x - # embedding to scores, use custom linear to possibly share weights - shared_weight = self.lt.weight if share_output else None - self.e2s = Linear(emb_size, num_features, bias=softmax_layer_bias, - shared_weight=shared_weight) - self.shared = shared_weight is not None + dropout=dropout if num_layers > 1 else 0, + batch_first=True) self.attn_type = attn_type self.attn_time = attn_time @@ -365,69 +266,145 @@ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', attn_length=attn_length, attn_time=attn_time) + def forward(self, xs, hidden=None, attn_params=None): + """Decode from input tokens. + + :param xs: (bsz x seqlen) LongTensor of input token indices + :param hidden: hidden state to feed into decoder. default (None) + initializes tensors using the RNN's defaults. + :param attn_params: (optional) tuple containing attention parameters, + default AttentionLayer needs encoder_output states + and attention mask (e.g. encoder_input.ne(0)) + + :returns: output state(s), hidden state. + output state of the encoder. 
for an RNN, this is + (bsz, seq_len, num_directions * hidden_size). + hidden state will be same dimensions as input + hidden state. for an RNN, this is a tensor of sizes + (bsz, num_layers * num_directions, hidden_size). + """ + # sequence indices => sequence embeddings + xes = self.dropout(self.lt(xs)) + + if self.attn_time == 'pre': + # modify input vectors with attention + xes = self.attention(xes, hidden, attn_params) + + # feed tokens into rnn + output, new_hidden = self.rnn(xes, hidden) + + if self.attn_time == 'post': + # modify output vectors with attention + output = self.attention(output, new_hidden, attn_params) + + return output, new_hidden + + +class OutputLayer(nn.Module): + """Takes in final states and returns distribution over candidates.""" + + def __init__(self, num_features, hidden_size, emb_size, numsoftmax=1, + shared_weight=None): + """Initialize output layer. + + :param num_features: number of candidates to rank + :param hidden_size: (last) dimension of the input vectors + :param emb_size: (last) dimension of the candidate vectors + :param num_softmax: (default 1) number of softmaxes to calculate. + see arxiv.org/abs/1711.03953 for more info. + increasing this slows down computation but can + add more expressivity to the embeddings. + :param shared_weight: (num_features x esz) vector of + """ + # embedding to scores + if shared_weight is None: + # just a regular linear layer + self.e2s = nn.Linear(emb_size, num_features, bias=True) + else: + # use shared weights and a bias layer instead + self.weight = shared_weight + self.bias = Parameter(torch.Tensor(num_features)) + self.reset_parameters() + self.e2s = lambda x: F.linear(x, self.weight, self.bias) + self.numsoftmax = numsoftmax if numsoftmax > 1: self.softmax = nn.Softmax(dim=1) self.prior = nn.Linear(hidden_size, numsoftmax, bias=False) self.latent = nn.Linear(hidden_size, numsoftmax * emb_size) self.activation = nn.Tanh() + else: + # rnn output to embedding + if hidden_size != emb_size: + # learn projection to correct dimensions + self.o2e = nn.Linear(hidden_size, emb_size, bias=True) + else: + # no need for any transformation here + self.o2e = lambda x: x - def forward(self, xs, hidden, encoder_output, attn_mask=None, topk=1): - xes = self.dropout(self.lt(xs)) - if self.attn_time == 'pre': - xes = self.attention(xes, hidden, encoder_output, attn_mask) - if xes.dim() == 2: - # if only one token inputted, sometimes needs unsquezing - xes.unsqueeze_(1) - output, new_hidden = self.rnn(xes, hidden) - if self.attn_time == 'post': - output = self.attention(output, new_hidden, encoder_output, attn_mask) + def reset_parameters(self): + """Reset bias param.""" + if hasattr(self, 'bias'): + stdv = 1. / math.sqrt(self.bias.size(1)) + self.bias.data.uniform_(-stdv, stdv) + + def forward(self, input): + """Compute scores from inputs. + + :param input: (bsz x seq_len x num_directions * hidden_size) tensor of + states, e.g. 
the output states of an RNN + :returns: (bsz x seqlen x num_cands) scores for each candidate + """ + # next compute scores over dictionary if self.numsoftmax > 1: - bsz = xs.size(0) - seqlen = xs.size(1) if xs.dim() > 1 else 1 - latent = self.latent(output) + bsz = input.size(0) + seqlen = input.size(1) if input.dim() > 1 else 1 + + # first compute different softmax scores based on input vec + # hsz => num_softmax * esz + latent = self.latent(input) active = self.dropout(self.activation(latent)) + # esz => num_features logit = self.e2s(active.view(-1, self.esz)) - prior_logit = self.prior(output).view(-1, self.numsoftmax) - prior = self.softmax(prior_logit) # softmax over numsoftmax's + # calculate priors: distribution over which softmax scores to use + # hsz => num_softmax + prior_logit = self.prior(input).view(-1, self.numsoftmax) + # softmax over numsoftmax's + prior = self.softmax(prior_logit) + # now combine priors with logits prob = self.softmax(logit).view(bsz * seqlen, self.numsoftmax, -1) probs = (prob * prior.unsqueeze(2)).sum(1).view(bsz, seqlen, -1) scores = probs.log() else: - e = self.dropout(self.o2e(output)) + # hsz => esz, good time for dropout + e = self.dropout(self.o2e(input)) + # esz => num_features scores = self.e2s(e) - # select top scoring index, excluding the padding symbol (at idx zero) - # we can do topk sampling from renoramlized softmax here, default topk=1 is greedy - if topk == 1: - _max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2) - elif topk > 1: - max_score, idx = torch.topk(F.softmax(scores.narrow(2, 1, scores.size(2) - 1), 2), topk, dim=2, sorted=False) - probs = F.softmax(scores.narrow(2, 1, scores.size(2) - 1).gather(2, idx), 2).squeeze(1) - dist = torch.distributions.categorical.Categorical(probs) - samples = dist.sample() - idx = idx.gather(-1, samples.unsqueeze(1).unsqueeze(-1)).squeeze(-1) - preds = idx.add_(1) - - return preds, scores, new_hidden + return scores class Ranker(object): - def __init__(self, decoder, padding_idx=0, attn_type='none'): + def __init__(self, decoder, start_token, padding_idx=0, attn_type='none'): super().__init__() self.decoder = decoder + self.START = start_token self.NULL_IDX = padding_idx self.attn_type = attn_type - def forward(self, cands, cand_inds, decode_params): - start, hidden, enc_out, attn_mask = decode_params + def _starts(self, bsz): + return self.START.detach().expand(bsz, 1) + + def forward(self, cand_params, encoder_states): + cands, cand_inds = cand_params + hidden, enc_out, attn_mask = encoder_states hid, cell = (hidden, None) if isinstance(hidden, torch.Tensor) else hidden if len(cand_inds) != hid.size(1): - cand_indices = start.detach().new(cand_inds) + cand_indices = self.START.detach().new(cand_inds) hid = hid.index_select(1, cand_indices) if cell is None: hidden = hid @@ -444,7 +421,7 @@ def forward(self, cands, cand_inds, decode_params): curr_cs = cands[i] n_cs = curr_cs.size(0) - starts = start.expand(n_cs).unsqueeze(1) + starts = self._starts(n_cs).unsqueeze(1) scores = 0 seqlens = 0 # select just the one hidden state @@ -491,80 +468,15 @@ def forward(self, cands, cand_inds, decode_params): max_len = max(len(c) for c in cand_scores) cand_scores = torch.cat([pad(c, max_len).unsqueeze(0) for c in cand_scores], 0) - preds = cand_scores.sort(1, True)[1] - return preds, cand_scores - - -class Linear(nn.Module): - """Custom Linear layer which allows for sharing weights (e.g. with an - nn.Embedding layer). 
- """ - def __init__(self, in_features, out_features, bias=True, - shared_weight=None): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.shared = shared_weight is not None + return cand_scores - # init weight - if not self.shared: - self.weight = Parameter(torch.Tensor(out_features, in_features)) - else: - if (shared_weight.size(0) != out_features or - shared_weight.size(1) != in_features): - raise RuntimeError('wrong dimensions for shared weights') - self.weight = shared_weight - - # init bias - if bias: - self.bias = Parameter(torch.Tensor(out_features)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1. / math.sqrt(self.weight.size(1)) - if not self.shared: - # weight is shared so don't overwrite it - self.weight.data.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.data.uniform_(-stdv, stdv) - def forward(self, input): - weight = self.weight - if self.shared: - # detach weight to prevent gradients from changing weight - # (but need to detach every time so weights are up to date) - weight = weight.detach() - return F.linear(input, weight, self.bias) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + str(self.in_features) + ' -> ' \ - + str(self.out_features) + ')' - - -class RandomProjection(nn.Module): - """Randomly project input to different dimensionality.""" - def __init__(self, in_features, out_features): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.weight = Parameter(torch.Tensor(out_features, in_features), - requires_grad=False) # fix weights - self.reset_parameters() - - def reset_parameters(self): - # experimentally: std=1 appears to affect scale too much, so using 0.1 - self.weight.data.normal_(std=0.1) - # other init option: set randomly to 1 or -1 - # self.weight.data.bernoulli_(self.weight.fill_(0.5)).mul_(2).sub_(1) - - def forward(self, input): - return F.linear(input, self.weight) +class AttentionLayer(nn.Module): + """Computes attention between hidden and encoder states. + See arxiv.org/abs/1508.04025 for more info on each attention type. + """ -class AttentionLayer(nn.Module): def __init__(self, attn_type, hidden_size, emb_size, bidirectional=False, attn_length=-1, attn_time='pre'): super().__init__() @@ -581,6 +493,8 @@ def __init__(self, attn_type, hidden_size, emb_size, bidirectional=False, input_dim = hsz else: raise RuntimeError('unsupported attention time') + + # linear layer for combining applied attention weights with input self.attn_combine = nn.Linear(hszXdirs + input_dim, input_dim, bias=False) @@ -599,8 +513,23 @@ def __init__(self, attn_type, hidden_size, emb_size, bidirectional=False, # equivalent to dot if attn is identity self.attn = nn.Linear(hsz, hszXdirs, bias=False) - def forward(self, xes, hidden, enc_out, attn_mask=None): + def forward(self, xes, hidden, attn_params): + """Compute attention over attn_params given input and hidden states. + + :param xes: input state. will be combined with applied + attention. + :param hidden: hidden state from model. will be used to select + states to attend to in from the attn_params. + :param attn_params: tuple of encoder output states and a mask showing + which input indices are nonzero. + + :returns: output, attn_weights + output is a new state of same size as input state `xes`. + attn_weights are the weights given to each state in the + encoder outputs. 
+ """ if self.attention == 'none': + # do nothing, no attention return xes if type(hidden) == tuple: @@ -608,40 +537,60 @@ def forward(self, xes, hidden, enc_out, attn_mask=None): hidden = hidden[0] last_hidden = hidden[-1] # select hidden state from last RNN layer + enc_out, attn_mask = attn_params + bsz, seqlen, hszXnumdir = enc_out.size() + numlayersXnumdir = last_hidden.size(1) + if self.attention == 'local': - if enc_out.size(1) > self.max_length: - offset = enc_out.size(1) - self.max_length - enc_out = enc_out.narrow(1, offset, self.max_length) + # local attention weights aren't based on encoder states h_merged = torch.cat((xes.squeeze(1), last_hidden), 1) attn_weights = F.softmax(self.attn(h_merged), dim=1) - if attn_weights.size(1) > enc_out.size(1): - attn_weights = attn_weights.narrow(1, 0, enc_out.size(1)) + + # adjust state sizes to the fixed window size + if seqlen > self.max_length: + offset = seqlen - self.max_length + enc_out = enc_out.narrow(1, offset, self.max_length) + seqlen = self.max_length + if attn_weights.size(1) > seqlen: + attn_weights = attn_weights.narrow(1, 0, seqlen) else: hid = last_hidden.unsqueeze(1) if self.attention == 'concat': - hid = hid.expand(last_hidden.size(0), - enc_out.size(1), - last_hidden.size(1)) + # concat hidden state and encoder outputs + hid = hid.expand(bsz, seqlen, numlayersXnumdir) h_merged = torch.cat((enc_out, hid), 2) + # then do linear combination of them with activation active = F.tanh(self.attn(h_merged)) attn_w_premask = self.attn_v(active).squeeze(2) elif self.attention == 'dot': - if hid.size(2) != enc_out.size(2): + # dot product between hidden and encoder outputs + if numlayersXnumdir != hszXnumdir: # enc_out has two directions, so double hid hid = torch.cat([hid, hid], 2) - attn_w_premask = ( - torch.bmm(hid, enc_out.transpose(1, 2)).squeeze(1)) + enc_t = enc_out.transpose(1, 2) + attn_w_premask = torch.bmm(hid, enc_t).squeeze(1) elif self.attention == 'general': + # before doing dot product, transform hidden state with linear + # same as dot if linear is identity hid = self.attn(hid) - attn_w_premask = ( - torch.bmm(hid, enc_out.transpose(1, 2)).squeeze(1)) - # calculate activation scores + enc_t = enc_out.transpose(1, 2) + attn_w_premask = torch.bmm(hid, enc_t).squeeze(1) + + # calculate activation scores, apply mask if needed if attn_mask is not None: # remove activation from NULL symbols + import pdb; pdb.set_trace() # is this the best operation? attn_w_premask -= (1 - attn_mask) * 1e20 attn_weights = F.softmax(attn_w_premask, dim=1) + # apply the attention weights to the encoder states attn_applied = torch.bmm(attn_weights.unsqueeze(1), enc_out) + # concatenate the input and encoder states merged = torch.cat((xes.squeeze(1), attn_applied.squeeze(1)), 1) + # combine them with a linear layer and tanh activation output = F.tanh(self.attn_combine(merged).unsqueeze(1)) - return output + print('make sure last_hidden == xes') + import pdb; pdb.set_trace() + # TODO: remove tanh? 
+ + return output, attn_weights diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index 24ae228315c..612cdd11dc6 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -43,8 +43,8 @@ class Seq2seqAgent(TorchAgent): # legacy agent code is located in parlai/legacy_agents VERSION = 1 - @staticmethod - def add_cmdline_args(argparser): + @classmethod + def add_cmdline_args(cls, argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('--init-model', type=str, default=None, @@ -66,7 +66,8 @@ def add_cmdline_args(argparser): help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', - choices=['none', 'concat', 'general', 'dot', 'local'], + choices=['none', 'concat', 'general', 'dot', + 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' '(see arxiv.org/abs/1508.04025)') @@ -108,9 +109,6 @@ def add_cmdline_args(argparser): help='Top k sampling from renormalized softmax in ' 'test/valid time, default 1 means simple ' 'greedy max output') - agent.add_argument('--softmax-layer-bias', type='bool', default=False, - help='Put True if you want to include the bias in ' - 'decoder.e2s layer') agent.add_argument('--seq2seq_version', default=Seq2seqAgent.VERSION) TorchAgent.add_cmdline_args(argparser) Seq2seqAgent.dictionary_class().add_cmdline_args(argparser) From a5b8823421817bce4abc950f3dd4a66c208b786a Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 15:10:37 -0700 Subject: [PATCH 02/12] partial --- parlai/agents/seq2seq/modules.py | 191 +++++++++++++++++++------------ parlai/agents/seq2seq/seq2seq.py | 36 +++--- 2 files changed, 138 insertions(+), 89 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index bf9d49a36d3..3dc904f0657 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -10,12 +10,17 @@ from torch.nn.parameter import Parameter from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence import torch.nn.functional as F -from parlai.core.torch_agent import Beam -from parlai.core.dict import DictionaryAgent -import os def pad(tensor, length, dim=0): + """Pad tensor to a specific length. 
+ + :param tensor: vector to pad + :param length: new length + :param dim: (default 0) dimension to pad + + :returns: padded tensor if the tensor is shorter than length + """ if tensor.size(dim) < length: return torch.cat( [tensor, tensor.new(*tensor.size()[:dim], @@ -26,64 +31,73 @@ def pad(tensor, length, dim=0): return tensor +def opt_to_kwargs(opt): + """Get kwargs for seq2seq from opt.""" + kwargs = {} + for k in ['numlayers', 'dropout', 'bidirectional', 'rnn_class', + 'lookuptable', 'decoder', 'numsoftmax', 'rank_candidates', + 'attention', 'attention_length', 'attention_time']: + if k in opt: + kwargs[k] = opt[k] + return kwargs + + class Seq2seq(nn.Module): + """Sequence to sequence parent module.""" + RNN_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM} - def __init__(self, opt, num_features, - padding_idx=0, start_idx=1, end_idx=2, longest_label=1): - super().__init__() - self.opt = opt + def __init__( + self, num_features, embeddingsize, hiddensize, numlayers=2, dropout=0, + bidirectional=False, rnn_class='lstm', lookuptable='unique', + decoder='same', numsoftmax=1, rank_candidates=False, + attention='none', attention_length=48, attention_time='post', + padding_idx=0, start_idx=1, longest_label=1, + ): + """Initialize seq2seq model. - self.attn_type = opt['attention'] + See cmdline args in Seq2seqAgent for description of arguments. + """ + super().__init__() + self.attn_type = attention self.NULL_IDX = padding_idx - self.END_IDX = end_idx self.register_buffer('START', torch.LongTensor([start_idx])) self.longest_label = longest_label - rnn_class = Seq2seq.RNN_OPTS[opt['rnn_class']] + rnn_class = Seq2seq.RNN_OPTS[rnn_class] self.decoder = RNNDecoder( num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - emb_size=opt['embeddingsize'], hidden_size=opt['hiddensize'], - num_layers=opt['numlayers'], dropout=opt['dropout'], - share_output=opt['lookuptable'] in ['dec_out', 'all'], - attn_type=opt['attention'], attn_length=opt['attention_length'], - attn_time=opt.get('attention_time'), - bidir_input=opt['bidirectional'], - numsoftmax=opt.get('numsoftmax', 1)) + embeddingsize=embeddingsize, hiddensize=hiddensize, + numlayers=numlayers, dropout=dropout, + share_output=lookuptable in ['dec_out', 'all'], + attn_type=attention, attn_length=attention_length, + attn_time=attention_time, + bidir_input=bidirectional, + numsoftmax=numsoftmax) shared_lt = (self.decoder.lt - if opt['lookuptable'] in ['enc_dec', 'all'] else None) - shared_rnn = self.decoder.rnn if opt['decoder'] == 'shared' else None + if lookuptable in ['enc_dec', 'all'] else None) + shared_rnn = self.decoder.rnn if decoder == 'shared' else None self.encoder = RNNEncoder( num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - emb_size=opt['embeddingsize'], hidden_size=opt['hiddensize'], - num_layers=opt['numlayers'], dropout=opt['dropout'], - bidirectional=opt['bidirectional'], + embeddingsize=embeddingsize, hiddensize=hiddensize, + num_layers=numlayers, dropout=dropout, + bidirectional=bidirectional, shared_lt=shared_lt, shared_rnn=shared_rnn) - if opt['rank_candidates']: + if rank_candidates: self.ranker = Ranker( self.decoder, self.START, padding_idx=self.NULL_IDX, - attn_type=opt['attention']) + attn_type=attention) - self.beam_log_freq = opt.get('beam_log_freq', 0.0) - if self.beam_log_freq > 0.0: - self.dict = DictionaryAgent(opt) - self.beam_dump_filecnt = 0 - self.beam_dump_path = opt['model_file'] + '.beam_dump' - if not os.path.exists(self.beam_dump_path): - os.makedirs(self.beam_dump_path) - - def 
_encode(self, xs, x_lens=None, prev_enc=None): + def _encode(self, xs, prev_enc=None): if prev_enc is not None: return prev_enc else: - enc_out, hidden = self.encoder(xs, x_lens) - attn_mask = xs.ne(0).float() if self.attn_type != 'none' else None - return hidden, enc_out, attn_mask + return self.encoder(xs) def _rank(self, cand_params, encoder_states): if cand_params is not None: @@ -139,23 +153,37 @@ def _decode(self, encoder_states, maxlen): return scores - def forward(self, xs, x_lens=None, ys=None, cand_params=None, - prev_enc=None, rank_during_training=False, maxlen=None): + def forward(self, xs, ys=None, cand_params=None, prev_enc=None, + maxlen=None): """Get output predictions from the model. - :param xs: (bsz x seqlen) LongTensor input to the encoder - :param x_lens: (list of ints) length of each input sequence - :param ys: expected output from the decoder. used for teacher forcing to calculate loss. - :param cand_params: set of candidates to rank, and indices to match candidates with their appropriate xs - :param prev_enc: if you know you'll pass in the same xs multiple times, you can pass in the encoder output from the last forward pass to skip recalcuating the same encoder output - : + :param xs: (bsz x seqlen) LongTensor input to the encoder + :param ys: expected output from the decoder. used for teacher + forcing to calculate loss. + :param cand_params: set of candidates to rank, and indices to match + candidates with their appropriate xs. + :param prev_enc: if you know you'll pass in the same xs multiple + times, you can pass in the encoder output from the + last forward pass to skip recalcuating the same + encoder output. + :param maxlen: max number of tokens to decode. if not set, will + use the length of the longest label this model + has seen. ignored when ys is not None. + + :returns: scores, candidate scores, and encoder states + scores contains the model's predicted token scores. + (bsz x seqlen x num_features) + candidate scores are the score the model assigned to each candidate + (bsz x num_cands) + encoder states are the (hidden, output, attn_mask) states from the + encoder. feed this back in to skip encoding on the next call. 
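
        A rough usage sketch (the vocabulary, sizes and `opt` here are
        placeholders, not part of this module):

            model = Seq2seq(len(dictionary), 128, 256, **opt_to_kwargs(opt))
            scores, cand_scores, encoder_states = model(xs, ys)
            preds = scores.max(-1)[1]  # greedy token choice at each position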
""" if ys is not None: # keep track of longest label we've ever seen # we'll never produce longer ones than that during prediction self.longest_label = max(self.longest_label, ys.size(1)) - encoder_states = self.encode(xs, x_lens, prev_enc) + encoder_states = self.encode(xs, prev_enc) # rank candidates if they are available cand_scores = self._rank(cand_params, encoder_states) @@ -176,25 +204,26 @@ class RNNEncoder(nn.Module): """RNN Encoder.""" def __init__(self, num_features, padding_idx=0, rnn_class='lstm', - emb_size=128, hidden_size=128, num_layers=2, dropout=0.1, + embeddingsize=128, hiddensize=128, num_layers=2, dropout=0.1, bidirectional=False, shared_lt=None, shared_rnn=None, sparse=False): + """Initialize recurrent encoder.""" super().__init__() self.dropout = nn.Dropout(p=dropout) self.layers = num_layers self.dirs = 2 if bidirectional else 1 - self.hsz = hidden_size + self.hsz = hiddensize if shared_lt is None: - self.lt = nn.Embedding(num_features, emb_size, + self.lt = nn.Embedding(num_features, embeddingsize, padding_idx=padding_idx, sparse=sparse) else: self.lt = shared_lt if shared_rnn is None: - self.rnn = rnn_class(emb_size, hidden_size, num_layers, + self.rnn = rnn_class(embeddingsize, hiddensize, num_layers, dropout=dropout if num_layers > 1 else 0, batch_first=True, bidirectional=bidirectional) elif bidirectional: @@ -202,18 +231,24 @@ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', else: self.rnn = shared_rnn - def forward(self, xs, x_lens=None): + def forward(self, xs): """Encode sequence. :param xs: (bsz x seqlen) LongTensor of input token indices + + :returns: encoder outputs, hidden state, attention mask + encoder outputs are the output state at each step of the encoding. + the hidden state is the final hidden state of the encoder. + the attention mask is a mask of which input values are nonzero. 
""" bsz = len(xs) # embed input tokens xes = self.dropout(self.lt(xs)) + attn_mask = xs.ne(0).float() try: - if x_lens is None: - x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] + x_lens = torch.sum(attn_mask.int(), dim=1) + import pdb; pdb.set_trace() xes = pack_padded_sequence(xes, x_lens, batch_first=True) packed = True except ValueError: @@ -232,7 +267,7 @@ def forward(self, xs, x_lens=None): else: hidden = hidden.view(-1, self.dirs, bsz, self.hsz).sum(1) - return encoder_output, hidden + return encoder_output, hidden, attn_mask class RNNDecoder(nn.Module): @@ -242,26 +277,26 @@ class RNNDecoder(nn.Module): """ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', - emb_size=128, hidden_size=128, num_layers=2, dropout=0.1, + embeddingsize=128, hiddensize=128, num_layers=2, dropout=0.1, bidir_input=False, attn_type='none', attn_time='pre', attn_length=-1, sparse=False): super().__init__() self.dropout = nn.Dropout(p=dropout) self.layers = num_layers - self.hsz = hidden_size - self.esz = emb_size + self.hsz = hiddensize + self.esz = embeddingsize - self.lt = nn.Embedding(num_features, emb_size, padding_idx=padding_idx, - sparse=sparse) - self.rnn = rnn_class(emb_size, hidden_size, num_layers, + self.lt = nn.Embedding(num_features, embeddingsize, + padding_idx=padding_idx, sparse=sparse) + self.rnn = rnn_class(embeddingsize, hiddensize, num_layers, dropout=dropout if num_layers > 1 else 0, batch_first=True) self.attn_type = attn_type self.attn_time = attn_time self.attention = AttentionLayer(attn_type=attn_type, - hidden_size=hidden_size, - emb_size=emb_size, + hiddensize=hiddensize, + embeddingsize=embeddingsize, bidirectional=bidir_input, attn_length=attn_length, attn_time=attn_time) @@ -278,10 +313,10 @@ def forward(self, xs, hidden=None, attn_params=None): :returns: output state(s), hidden state. output state of the encoder. for an RNN, this is - (bsz, seq_len, num_directions * hidden_size). + (bsz, seq_len, num_directions * hiddensize). hidden state will be same dimensions as input hidden state. for an RNN, this is a tensor of sizes - (bsz, num_layers * num_directions, hidden_size). + (bsz, num_layers * num_directions, hiddensize). """ # sequence indices => sequence embeddings xes = self.dropout(self.lt(xs)) @@ -303,13 +338,13 @@ def forward(self, xs, hidden=None, attn_params=None): class OutputLayer(nn.Module): """Takes in final states and returns distribution over candidates.""" - def __init__(self, num_features, hidden_size, emb_size, numsoftmax=1, + def __init__(self, num_features, hiddensize, embeddingsize, numsoftmax=1, shared_weight=None): """Initialize output layer. :param num_features: number of candidates to rank - :param hidden_size: (last) dimension of the input vectors - :param emb_size: (last) dimension of the candidate vectors + :param hiddensize: (last) dimension of the input vectors + :param embeddingsize: (last) dimension of the candidate vectors :param num_softmax: (default 1) number of softmaxes to calculate. see arxiv.org/abs/1711.03953 for more info. 
increasing this slows down computation but can @@ -319,7 +354,7 @@ def __init__(self, num_features, hidden_size, emb_size, numsoftmax=1, # embedding to scores if shared_weight is None: # just a regular linear layer - self.e2s = nn.Linear(emb_size, num_features, bias=True) + self.e2s = nn.Linear(embeddingsize, num_features, bias=True) else: # use shared weights and a bias layer instead self.weight = shared_weight @@ -330,14 +365,14 @@ def __init__(self, num_features, hidden_size, emb_size, numsoftmax=1, self.numsoftmax = numsoftmax if numsoftmax > 1: self.softmax = nn.Softmax(dim=1) - self.prior = nn.Linear(hidden_size, numsoftmax, bias=False) - self.latent = nn.Linear(hidden_size, numsoftmax * emb_size) + self.prior = nn.Linear(hiddensize, numsoftmax, bias=False) + self.latent = nn.Linear(hiddensize, numsoftmax * embeddingsize) self.activation = nn.Tanh() else: # rnn output to embedding - if hidden_size != emb_size: + if hiddensize != embeddingsize: # learn projection to correct dimensions - self.o2e = nn.Linear(hidden_size, emb_size, bias=True) + self.o2e = nn.Linear(hiddensize, embeddingsize, bias=True) else: # no need for any transformation here self.o2e = lambda x: x @@ -351,7 +386,7 @@ def reset_parameters(self): def forward(self, input): """Compute scores from inputs. - :param input: (bsz x seq_len x num_directions * hidden_size) tensor of + :param input: (bsz x seq_len x num_directions * hiddensize) tensor of states, e.g. the output states of an RNN :returns: (bsz x seqlen x num_cands) scores for each candidate @@ -388,7 +423,10 @@ def forward(self, input): class Ranker(object): + """Basic ranker which uses average log-prob of sequences for ranking.""" + def __init__(self, decoder, start_token, padding_idx=0, attn_type='none'): + """Intialize ranker.""" super().__init__() self.decoder = decoder self.START = start_token @@ -477,17 +515,18 @@ class AttentionLayer(nn.Module): See arxiv.org/abs/1508.04025 for more info on each attention type. """ - def __init__(self, attn_type, hidden_size, emb_size, bidirectional=False, - attn_length=-1, attn_time='pre'): + def __init__(self, attn_type, hiddensize, embeddingsize, + bidirectional=False, attn_length=-1, attn_time='pre'): + """Initialize attention layer.""" super().__init__() self.attention = attn_type if self.attention != 'none': - hsz = hidden_size + hsz = hiddensize hszXdirs = hsz * (2 if bidirectional else 1) if attn_time == 'pre': # attention happens on the input embeddings - input_dim = emb_size + input_dim = embeddingsize elif attn_time == 'post': # attention happens on the output of the rnn input_dim = hsz diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index 612cdd11dc6..e9bd254feee 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -116,7 +116,7 @@ def add_cmdline_args(cls, argparser): @staticmethod def model_version(): - """Current version of this model, counting up from 0. + """Return current version of this model, counting up from 0. Models are not backwards-compatible with older versions. """ @@ -420,10 +420,16 @@ def receive_metrics(self, metrics_dict): class mydefaultdict(defaultdict): - """Custom defaultdict which overrides defaults requested by the get - function with the default factory. + """Get function also uses default_factory for this defaultdict. + + This makes dict.get() behave like dict[] if a default is not provided. """ + def get(self, key, default=None): + """Return value at key or default if key is not in dict. 
+ + If a default is not provided, return the default factory value. + """ # override default from "get" (like "__getitem__" already is) return super().get(key, default or self.default_factory()) @@ -437,27 +443,33 @@ class PerplexityEvaluatorAgent(Seq2seqAgent): """ def __init__(self, opt, shared=None): + """Initialize evaluator.""" super().__init__(opt, shared) self.prev_enc = None self.last_xs = None def next_word_probability(self, partial_out): - """Return probability distribution over next words given an input and - partial true output. This is used to calculate the per-word perplexity. + """Return probability distribution over next words. + + This probability is based on both nn input and partial true output. + This is used to calculate the per-word perplexity. Arguments: observation -- input observation dict partial_out -- list of previous "true" words - Returns a dict, where each key is a word and each value is a probability - score for that word. Unset keys assume a probability of zero. + Returns a dict, where each key is a word and each value is a + probability score for that word. + Unset keys will use a probability of 1e-7. e.g. {'text': 'Run test program.'}, ['hello'] => {'world': 1.0} """ obs = self.observation xs = obs['text_vec'].unsqueeze(0) - ys = self._vectorize_text(' '.join(partial_out), False, True, self.truncate).unsqueeze(0) + ys = self._vectorize_text( + ' '.join(partial_out), False, True, self.truncate + ).unsqueeze(0) if self.prev_enc is not None and self.last_xs is not None and ( xs.shape[1] != self.last_xs.shape[1] or (xs == self.last_xs).sum().item() != xs.shape[1]): @@ -466,14 +478,12 @@ def next_word_probability(self, partial_out): self.last_xs = xs self.model.eval() - # no need to predict farther ahead - # if you pass in any ys, this will be ignored - self.model.longest_label = 1 out = self.model( xs, ys=(ys if len(partial_out) > 0 else None), - prev_enc=self.prev_enc) - scores, self.prev_enc = out[1], out[4] + prev_enc=self.prev_enc, + maxlen=1) + scores, self.prev_enc = out[0], out[2] # scores is bsz x seqlen x num_words, so select probs of current index probs = F.softmax(scores.select(1, -1), dim=1).squeeze() dist = mydefaultdict(lambda: 1e-7) # default probability for any token From a928f6c8e82b92c0f53a2c16f3c9bc8527aff494 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 15:47:42 -0700 Subject: [PATCH 03/12] move ranker --- parlai/agents/seq2seq/modules.py | 224 +++++++++++++++---------------- 1 file changed, 106 insertions(+), 118 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index 3dc904f0657..a45088323d2 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -5,6 +5,7 @@ # of patent rights can be found in the PATENTS file in the same directory. 
import math + import torch import torch.nn as nn from torch.nn.parameter import Parameter @@ -35,7 +36,7 @@ def opt_to_kwargs(opt): """Get kwargs for seq2seq from opt.""" kwargs = {} for k in ['numlayers', 'dropout', 'bidirectional', 'rnn_class', - 'lookuptable', 'decoder', 'numsoftmax', 'rank_candidates', + 'lookuptable', 'decoder', 'numsoftmax', 'attention', 'attention_length', 'attention_time']: if k in opt: kwargs[k] = opt[k] @@ -50,7 +51,7 @@ class Seq2seq(nn.Module): def __init__( self, num_features, embeddingsize, hiddensize, numlayers=2, dropout=0, bidirectional=False, rnn_class='lstm', lookuptable='unique', - decoder='same', numsoftmax=1, rank_candidates=False, + decoder='same', numsoftmax=1, attention='none', attention_length=48, attention_time='post', padding_idx=0, start_idx=1, longest_label=1, ): @@ -67,31 +68,29 @@ def __init__( rnn_class = Seq2seq.RNN_OPTS[rnn_class] self.decoder = RNNDecoder( - num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - embeddingsize=embeddingsize, hiddensize=hiddensize, + num_features, embeddingsize, hiddensize, + padding_idx=self.NULL_IDX, rnn_class=rnn_class, numlayers=numlayers, dropout=dropout, - share_output=lookuptable in ['dec_out', 'all'], attn_type=attention, attn_length=attention_length, attn_time=attention_time, bidir_input=bidirectional, numsoftmax=numsoftmax) shared_lt = (self.decoder.lt - if lookuptable in ['enc_dec', 'all'] else None) + if lookuptable in ('enc_dec', 'all') else None) shared_rnn = self.decoder.rnn if decoder == 'shared' else None self.encoder = RNNEncoder( - num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - embeddingsize=embeddingsize, hiddensize=hiddensize, + num_features, embeddingsize, hiddensize, + padding_idx=self.NULL_IDX, rnn_class=rnn_class, num_layers=numlayers, dropout=dropout, bidirectional=bidirectional, shared_lt=shared_lt, shared_rnn=shared_rnn) - if rank_candidates: - self.ranker = Ranker( - self.decoder, - self.START, - padding_idx=self.NULL_IDX, - attn_type=attention) + shared_weight = (self.decoder.lt + if lookuptable in ('dec_out', 'all') else None) + self.output = OutputLayer( + num_features, embeddingsize, hiddensize, numsoftmax=numsoftmax, + shared_weight=shared_weight) def _encode(self, xs, prev_enc=None): if prev_enc is not None: @@ -99,11 +98,6 @@ def _encode(self, xs, prev_enc=None): else: return self.encoder(xs) - def _rank(self, cand_params, encoder_states): - if cand_params is not None: - return self.ranker.forward(cand_params, encoder_states) - return None - def _starts(self, bsz): return self.START.detach().expand(bsz, 1) @@ -149,10 +143,90 @@ def _decode(self, encoder_states, maxlen): output, hidden = self.decoder(xs, hidden, attn_params) score = self.output(output) scores.append(score) - xs = scores.max(1)[0] # next input is current predicted output + xs = score.max(1)[0] # next input is current predicted output return scores + def _align_inds(self, encoder_states, cand_inds): + hidden, enc_out, attn_mask = encoder_states + + # LSTM or GRU/RNN hidden state? 
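        # note: nn.LSTM returns its hidden state as an (h, c) tuple, while
        # nn.GRU and nn.RNN return a single tensor, so the tuple case is the
        # one that needs unpacking into (hid, cell)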
+ if isinstance(hidden, torch.Tensor): + hid, cell = hidden + else: + hid, cell = hidden, None + + if len(cand_inds) != hid.size(1): + # if the number of candidates is mismatched from the number of + # hidden states, we throw out the hidden states we won't rank with + cand_indices = hidden.new(cand_inds) + hid = hid.index_select(1, cand_indices) + if cell is None: + hidden = hid + else: + cell = cell.index_select(1, cand_indices) + hidden = (hid, cell) + + if self.attn_type != 'none': + enc_out = enc_out.index_select(0, cand_indices) + attn_mask = attn_mask.index_select(0, cand_indices) + + return hidden, enc_out, attn_mask + + def _extract_cur(self, encoder_states, index, num_cands): + hidden, enc_out, attn_mask = encoder_states + if isinstance(hidden, torch.Tensor): + nl = hidden.size(0) + hsz = hidden.size(-1) + cur_hid = (hidden.select(1, index).unsqueeze(1) + .expand(nl, num_cands, hsz)) + else: + nl = hidden[0].size(0) + hsz = hidden[0].size(-1) + cur_hid = (hidden[0].select(1, index).unsqueeze(1) + .expand(nl, num_cands, hsz).contiguous(), + hidden[1].select(1, index).unsqueeze(1) + .expand(nl, num_cands, hsz).contiguous()) + + if self.attn_type != 'none': + cur_enc = (enc_out[index].unsqueeze(0) + .expand(num_cands, enc_out.size(1), hsz)) + cur_mask = (attn_mask[index].unsqueeze(0) + .expand(num_cands, attn_mask.size(-1))) + return cur_hid, cur_enc, cur_mask + + def _rank(self, cand_params, encoder_states): + """Rank each cand by the average log-probability of the sequence.""" + if cand_params is None: + return None + + cands, cand_inds = cand_params + encoder_states = self._align_inds(encoder_states, cand_inds) + + cand_scores = [] + for batch_idx in range(len(cands)): + # we do one set of candidates at a time + curr_cs = cands[batch_idx] + num_cands = curr_cs.size(0) + + # select just the one hidden state + cur_enc_states = self._extract_cur( + encoder_states, num_cands, batch_idx) + + score = self.decode_forced(curr_cs, cur_enc_states) + true_score = F.log_softmax(score, dim=2).gather( + 2, curr_cs.unsqueeze(2)) + nonzero = curr_cs.ne(0).float() + scores = (true_score.squeeze(2) * nonzero).sum(1) + seqlens = nonzero.sum(1) + scores /= seqlens + cand_scores.append(scores) + + max_len = max(len(c) for c in cand_scores) + cand_scores = torch.cat( + [pad(c, max_len).unsqueeze(0) for c in cand_scores], 0) + return cand_scores + def forward(self, xs, ys=None, cand_params=None, prev_enc=None, maxlen=None): """Get output predictions from the model. @@ -194,17 +268,15 @@ def forward(self, xs, ys=None, cand_params=None, prev_enc=None, else: scores = self._decode(encoder_states, maxlen or self.longest_label) - if isinstance(scores, list): - scores = torch.cat(scores, 1) - + scores = torch.cat(scores, 1) return scores, cand_scores, encoder_states class RNNEncoder(nn.Module): """RNN Encoder.""" - def __init__(self, num_features, padding_idx=0, rnn_class='lstm', - embeddingsize=128, hiddensize=128, num_layers=2, dropout=0.1, + def __init__(self, num_features, embeddingsize, hiddensize, + padding_idx=0, rnn_class='lstm', num_layers=2, dropout=0.1, bidirectional=False, shared_lt=None, shared_rnn=None, sparse=False): """Initialize recurrent encoder.""" @@ -276,10 +348,11 @@ class RNNDecoder(nn.Module): Can be used as a standalone language model or paired with an encoder. 
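
    A rough standalone sketch (vocab_size and the sizes are placeholders,
    and this assumes attn_type='none' so no encoder states are needed):

        decoder = RNNDecoder(vocab_size, 128, 256, rnn_class=nn.LSTM,
                             attn_type='none')
        output_layer = OutputLayer(vocab_size, 128, 256)
        out, hidden = decoder(token_ids)  # out: (bsz x seqlen x hiddensize)
        scores = output_layer(out)        # (bsz x seqlen x vocab_size)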
""" - def __init__(self, num_features, padding_idx=0, rnn_class='lstm', - embeddingsize=128, hiddensize=128, num_layers=2, dropout=0.1, + def __init__(self, num_features, embeddingsize, hiddensize, + padding_idx=0, rnn_class='lstm', num_layers=2, dropout=0.1, bidir_input=False, attn_type='none', attn_time='pre', attn_length=-1, sparse=False): + """Initialize recurrent decoder.""" super().__init__() self.dropout = nn.Dropout(p=dropout) self.layers = num_layers @@ -338,18 +411,20 @@ def forward(self, xs, hidden=None, attn_params=None): class OutputLayer(nn.Module): """Takes in final states and returns distribution over candidates.""" - def __init__(self, num_features, hiddensize, embeddingsize, numsoftmax=1, + def __init__(self, num_features, embeddingsize, hiddensize, numsoftmax=1, shared_weight=None): """Initialize output layer. :param num_features: number of candidates to rank - :param hiddensize: (last) dimension of the input vectors - :param embeddingsize: (last) dimension of the candidate vectors + :param hiddensize: (last) dimension of the input vectors + :param embeddingsize: (last) dimension of the candidate vectors :param num_softmax: (default 1) number of softmaxes to calculate. see arxiv.org/abs/1711.03953 for more info. increasing this slows down computation but can add more expressivity to the embeddings. - :param shared_weight: (num_features x esz) vector of + :param shared_weight: (num_features x esz) vector of weights to use as + the final linear layer's weight matrix. default + None starts with a new linear layer. """ # embedding to scores if shared_weight is None: @@ -422,93 +497,6 @@ def forward(self, input): return scores -class Ranker(object): - """Basic ranker which uses average log-prob of sequences for ranking.""" - - def __init__(self, decoder, start_token, padding_idx=0, attn_type='none'): - """Intialize ranker.""" - super().__init__() - self.decoder = decoder - self.START = start_token - self.NULL_IDX = padding_idx - self.attn_type = attn_type - - def _starts(self, bsz): - return self.START.detach().expand(bsz, 1) - - def forward(self, cand_params, encoder_states): - cands, cand_inds = cand_params - hidden, enc_out, attn_mask = encoder_states - - hid, cell = (hidden, None) if isinstance(hidden, torch.Tensor) else hidden - if len(cand_inds) != hid.size(1): - cand_indices = self.START.detach().new(cand_inds) - hid = hid.index_select(1, cand_indices) - if cell is None: - hidden = hid - else: - cell = cell.index_select(1, cand_indices) - hidden = (hid, cell) - enc_out = enc_out.index_select(0, cand_indices) - if attn_mask is not None: - attn_mask = attn_mask.index_select(0, cand_indices) - - cand_scores = [] - - for i in range(len(cands)): - curr_cs = cands[i] - - n_cs = curr_cs.size(0) - starts = self._starts(n_cs).unsqueeze(1) - scores = 0 - seqlens = 0 - # select just the one hidden state - if isinstance(hidden, torch.Tensor): - nl = hidden.size(0) - hsz = hidden.size(-1) - cur_hid = hidden.select(1, i).unsqueeze(1).expand(nl, n_cs, hsz) - else: - nl = hidden[0].size(0) - hsz = hidden[0].size(-1) - cur_hid = (hidden[0].select(1, i).unsqueeze(1).expand(nl, n_cs, hsz).contiguous(), - hidden[1].select(1, i).unsqueeze(1).expand(nl, n_cs, hsz).contiguous()) - - cur_enc, cur_mask = None, None - if attn_mask is not None: - cur_mask = attn_mask[i].unsqueeze(0).expand(n_cs, attn_mask.size(-1)) - cur_enc = enc_out[i].unsqueeze(0).expand(n_cs, enc_out.size(1), hsz) - # this is pretty much copied from the training forward above - if curr_cs.size(1) > 1: - c_in = 
curr_cs.narrow(1, 0, curr_cs.size(1) - 1) - xs = torch.cat([starts, c_in], 1) - else: - xs, c_in = starts, curr_cs - if self.attn_type == 'none': - preds, score, cur_hid = self.decoder(xs, cur_hid, cur_enc, cur_mask) - true_score = F.log_softmax(score, dim=2).gather( - 2, curr_cs.unsqueeze(2)) - nonzero = curr_cs.ne(0).float() - scores = (true_score.squeeze(2) * nonzero).sum(1) - seqlens = nonzero.sum(1) - else: - for i in range(curr_cs.size(1)): - xi = xs.select(1, i) - ci = curr_cs.select(1, i) - preds, score, cur_hid = self.decoder(xi, cur_hid, cur_enc, cur_mask) - true_score = F.log_softmax(score, dim=2).gather( - 2, ci.unsqueeze(1).unsqueeze(2)) - nonzero = ci.ne(0).float() - scores += true_score.squeeze(2).squeeze(1) * nonzero - seqlens += nonzero - - scores /= seqlens # **len_penalty? - cand_scores.append(scores) - - max_len = max(len(c) for c in cand_scores) - cand_scores = torch.cat([pad(c, max_len).unsqueeze(0) for c in cand_scores], 0) - return cand_scores - - class AttentionLayer(nn.Module): """Computes attention between hidden and encoder states. From 80eff35eca48108bbe995892a4f0d6d9d0940326 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 16:26:16 -0700 Subject: [PATCH 04/12] fix bugs --- parlai/agents/seq2seq/modules.py | 77 +++++++++++++++++--------------- parlai/agents/seq2seq/seq2seq.py | 40 +++++++++-------- parlai/core/torch_agent.py | 2 +- 3 files changed, 62 insertions(+), 57 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index a45088323d2..2cb72cbd3fb 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -13,6 +13,17 @@ import torch.nn.functional as F +def opt_to_kwargs(opt): + """Get kwargs for seq2seq from opt.""" + kwargs = {} + for k in ['numlayers', 'dropout', 'bidirectional', 'rnn_class', + 'lookuptable', 'decoder', 'numsoftmax', + 'attention', 'attention_length', 'attention_time']: + if k in opt: + kwargs[k] = opt[k] + return kwargs + + def pad(tensor, length, dim=0): """Pad tensor to a specific length. 
@@ -32,17 +43,6 @@ def pad(tensor, length, dim=0): return tensor -def opt_to_kwargs(opt): - """Get kwargs for seq2seq from opt.""" - kwargs = {} - for k in ['numlayers', 'dropout', 'bidirectional', 'rnn_class', - 'lookuptable', 'decoder', 'numsoftmax', - 'attention', 'attention_length', 'attention_time']: - if k in opt: - kwargs[k] = opt[k] - return kwargs - - class Seq2seq(nn.Module): """Sequence to sequence parent module.""" @@ -73,8 +73,7 @@ def __init__( numlayers=numlayers, dropout=dropout, attn_type=attention, attn_length=attention_length, attn_time=attention_time, - bidir_input=bidirectional, - numsoftmax=numsoftmax) + bidir_input=bidirectional) shared_lt = (self.decoder.lt if lookuptable in ('enc_dec', 'all') else None) @@ -82,15 +81,15 @@ def __init__( self.encoder = RNNEncoder( num_features, embeddingsize, hiddensize, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - num_layers=numlayers, dropout=dropout, + numlayers=numlayers, dropout=dropout, bidirectional=bidirectional, shared_lt=shared_lt, shared_rnn=shared_rnn) shared_weight = (self.decoder.lt if lookuptable in ('dec_out', 'all') else None) self.output = OutputLayer( - num_features, embeddingsize, hiddensize, numsoftmax=numsoftmax, - shared_weight=shared_weight) + num_features, embeddingsize, hiddensize, dropout=dropout, + numsoftmax=numsoftmax, shared_weight=shared_weight) def _encode(self, xs, prev_enc=None): if prev_enc is not None: @@ -133,7 +132,7 @@ def _decode_forced(self, ys, encoder_states): def _decode(self, encoder_states, maxlen): hidden = encoder_states[0] attn_params = (encoder_states[1], encoder_states[2]) - bsz = hidden.size(0) + bsz = encoder_states[1].size(0) xs = self._starts(bsz) # input start token @@ -143,7 +142,7 @@ def _decode(self, encoder_states, maxlen): output, hidden = self.decoder(xs, hidden, attn_params) score = self.output(output) scores.append(score) - xs = score.max(1)[0] # next input is current predicted output + xs = score.max(1)[1] # next input is current predicted output return scores @@ -152,9 +151,9 @@ def _align_inds(self, encoder_states, cand_inds): # LSTM or GRU/RNN hidden state? 
if isinstance(hidden, torch.Tensor): - hid, cell = hidden - else: hid, cell = hidden, None + else: + hid, cell = hidden if len(cand_inds) != hid.size(1): # if the number of candidates is mismatched from the number of @@ -201,6 +200,8 @@ def _rank(self, cand_params, encoder_states): return None cands, cand_inds = cand_params + if cands is None: + return None encoder_states = self._align_inds(encoder_states, cand_inds) cand_scores = [] @@ -257,7 +258,7 @@ def forward(self, xs, ys=None, cand_params=None, prev_enc=None, # we'll never produce longer ones than that during prediction self.longest_label = max(self.longest_label, ys.size(1)) - encoder_states = self.encode(xs, prev_enc) + encoder_states = self._encode(xs, prev_enc) # rank candidates if they are available cand_scores = self._rank(cand_params, encoder_states) @@ -276,14 +277,14 @@ class RNNEncoder(nn.Module): """RNN Encoder.""" def __init__(self, num_features, embeddingsize, hiddensize, - padding_idx=0, rnn_class='lstm', num_layers=2, dropout=0.1, + padding_idx=0, rnn_class='lstm', numlayers=2, dropout=0.1, bidirectional=False, shared_lt=None, shared_rnn=None, sparse=False): """Initialize recurrent encoder.""" super().__init__() self.dropout = nn.Dropout(p=dropout) - self.layers = num_layers + self.layers = numlayers self.dirs = 2 if bidirectional else 1 self.hsz = hiddensize @@ -295,8 +296,8 @@ def __init__(self, num_features, embeddingsize, hiddensize, self.lt = shared_lt if shared_rnn is None: - self.rnn = rnn_class(embeddingsize, hiddensize, num_layers, - dropout=dropout if num_layers > 1 else 0, + self.rnn = rnn_class(embeddingsize, hiddensize, numlayers, + dropout=dropout if numlayers > 1 else 0, batch_first=True, bidirectional=bidirectional) elif bidirectional: raise RuntimeError('Cannot share decoder with bidir encoder.') @@ -320,7 +321,6 @@ def forward(self, xs): attn_mask = xs.ne(0).float() try: x_lens = torch.sum(attn_mask.int(), dim=1) - import pdb; pdb.set_trace() xes = pack_padded_sequence(xes, x_lens, batch_first=True) packed = True except ValueError: @@ -339,7 +339,7 @@ def forward(self, xs): else: hidden = hidden.view(-1, self.dirs, bsz, self.hsz).sum(1) - return encoder_output, hidden, attn_mask + return hidden, encoder_output, attn_mask class RNNDecoder(nn.Module): @@ -349,20 +349,20 @@ class RNNDecoder(nn.Module): """ def __init__(self, num_features, embeddingsize, hiddensize, - padding_idx=0, rnn_class='lstm', num_layers=2, dropout=0.1, + padding_idx=0, rnn_class='lstm', numlayers=2, dropout=0.1, bidir_input=False, attn_type='none', attn_time='pre', attn_length=-1, sparse=False): """Initialize recurrent decoder.""" super().__init__() self.dropout = nn.Dropout(p=dropout) - self.layers = num_layers + self.layers = numlayers self.hsz = hiddensize self.esz = embeddingsize self.lt = nn.Embedding(num_features, embeddingsize, padding_idx=padding_idx, sparse=sparse) - self.rnn = rnn_class(embeddingsize, hiddensize, num_layers, - dropout=dropout if num_layers > 1 else 0, + self.rnn = rnn_class(embeddingsize, hiddensize, numlayers, + dropout=dropout if numlayers > 1 else 0, batch_first=True) self.attn_type = attn_type @@ -389,7 +389,7 @@ def forward(self, xs, hidden=None, attn_params=None): (bsz, seq_len, num_directions * hiddensize). hidden state will be same dimensions as input hidden state. for an RNN, this is a tensor of sizes - (bsz, num_layers * num_directions, hiddensize). + (bsz, numlayers * num_directions, hiddensize). 
""" # sequence indices => sequence embeddings xes = self.dropout(self.lt(xs)) @@ -411,14 +411,14 @@ def forward(self, xs, hidden=None, attn_params=None): class OutputLayer(nn.Module): """Takes in final states and returns distribution over candidates.""" - def __init__(self, num_features, embeddingsize, hiddensize, numsoftmax=1, - shared_weight=None): + def __init__(self, num_features, embeddingsize, hiddensize, dropout=0, + numsoftmax=1, shared_weight=None): """Initialize output layer. :param num_features: number of candidates to rank :param hiddensize: (last) dimension of the input vectors :param embeddingsize: (last) dimension of the candidate vectors - :param num_softmax: (default 1) number of softmaxes to calculate. + :param numsoftmax: (default 1) number of softmaxes to calculate. see arxiv.org/abs/1711.03953 for more info. increasing this slows down computation but can add more expressivity to the embeddings. @@ -426,6 +426,9 @@ def __init__(self, num_features, embeddingsize, hiddensize, numsoftmax=1, the final linear layer's weight matrix. default None starts with a new linear layer. """ + super().__init__() + self.dropout = nn.Dropout(p=dropout) + # embedding to scores if shared_weight is None: # just a regular linear layer @@ -472,14 +475,14 @@ def forward(self, input): seqlen = input.size(1) if input.dim() > 1 else 1 # first compute different softmax scores based on input vec - # hsz => num_softmax * esz + # hsz => numsoftmax * esz latent = self.latent(input) active = self.dropout(self.activation(latent)) # esz => num_features logit = self.e2s(active.view(-1, self.esz)) # calculate priors: distribution over which softmax scores to use - # hsz => num_softmax + # hsz => numsoftmax prior_logit = self.prior(input).view(-1, self.numsoftmax) # softmax over numsoftmax's prior = self.softmax(prior_logit) diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index e9bd254feee..f537f09427e 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -7,7 +7,7 @@ from parlai.core.torch_agent import TorchAgent, Output from parlai.core.utils import padded_tensor, round_sigfigs from parlai.core.thread_utils import SharedTable -from .modules import Seq2seq +from .modules import Seq2seq, opt_to_kwargs import torch import torch.nn as nn @@ -187,15 +187,15 @@ def __init__(self, opt, shared=None): self.reset() def _init_model(self, states=None): + """Initialize model, override to change model setup.""" opt = self.opt - if not hasattr(self, 'model_class'): - # this allows child classes to override this but inherit init - self.model_class = Seq2seq - self.model = self.model_class( - opt, len(self.dict), padding_idx=self.NULL_IDX, - start_idx=self.START_IDX, end_idx=self.END_IDX, - longest_label=states.get('longest_label', 1)) + kwargs = opt_to_kwargs(opt) + self.model = Seq2seq( + len(self.dict), opt['embeddingsize'], opt['hiddensize'], + padding_idx=self.NULL_IDX, start_idx=self.START_IDX, + longest_label=states.get('longest_label', 1), + **kwargs) if (opt.get('dict_tokenizer') == 'bpe' and opt['embedding_type'] != 'random'): @@ -223,6 +223,8 @@ def _init_model(self, states=None): if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False + return self.model + def _v2t(self, vec): """Convert token indices to string of tokens.""" new_vec = [] @@ -310,12 +312,12 @@ def train_step(self, batch): self.truncate or 180) self.model.train() self.zero_grad() - preds = None try: out = self.model(batch.text_vec, 
batch.label_vec) # generated response - preds, scores = out[0], out[1] + scores = out[0] + preds = scores.max(2)[1] score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, batch.label_vec.view(-1)) @@ -359,16 +361,15 @@ def _pick_cands(self, cand_preds, cand_inds, cands): def eval_step(self, batch): """Evaluate a single batch of examples.""" self.model.eval() - cands, cand_inds = self._build_cands(batch) - out = self.model(batch.text_vec, ys=None, - cands=cands, cand_indices=cand_inds, - beam_size=self.beam_size, topk=self.topk) - preds, cand_preds = out[0], out[2] + cand_params = self._build_cands(batch) + out = self.model(batch.text_vec, ys=None, cand_params=cand_params) + scores, cand_scores = out[0], out[1] if batch.label_vec is not None: # calculate loss on targets out = self.model(batch.text_vec, batch.label_vec) - f_preds, scores = out[0], out[1] + scores = out[0] + f_preds = scores.max(2)[1] # forced preds score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, batch.label_vec.view(-1)) # save loss to metrics @@ -380,12 +381,13 @@ def eval_step(self, batch): self.metrics['num_tokens'] += target_tokens cand_choices = None - if cand_preds is not None: + if cand_scores is not None: + cand_preds = cand_scores.sort(1, True)[1] # now select the text of the cands based on their scores - cand_choices = self._pick_cands(cand_preds, cand_inds, + cand_choices = self._pick_cands(cand_preds, cand_params[1], batch.candidates) - text = [self._v2t(p) for p in preds] + text = [self._v2t(p) for p in scores.max(2)[1].cpu()] return Output(text, cand_choices) def save(self, path=None): diff --git a/parlai/core/torch_agent.py b/parlai/core/torch_agent.py index 11ade93e53d..17b5281a634 100644 --- a/parlai/core/torch_agent.py +++ b/parlai/core/torch_agent.py @@ -775,7 +775,7 @@ def _init_cuda_buffer(self, model, criterion, batchsize, maxlen): try: print('preinitializing pytorch cuda buffer') dummy = torch.ones(batchsize, maxlen).long().cuda() - sc = model(dummy, dummy)[1] + sc = model(dummy, dummy)[0] loss = criterion(sc.view(-1, sc.size(-1)), dummy.view(-1)) loss.backward() self.buffer_initialized = True From 589f0dfd3604e605a721d94a4abd05c9cc0b7ff6 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 16:32:39 -0700 Subject: [PATCH 05/12] more bug fixes --- parlai/agents/seq2seq/modules.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index 2cb72cbd3fb..e6e8bd65cc0 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -92,15 +92,18 @@ def __init__( numsoftmax=numsoftmax, shared_weight=shared_weight) def _encode(self, xs, prev_enc=None): + """Encode the input or return cached encoder state.""" if prev_enc is not None: return prev_enc else: return self.encoder(xs) def _starts(self, bsz): + """Return bsz start tokens.""" return self.START.detach().expand(bsz, 1) def _decode_forced(self, ys, encoder_states): + """Decode with teacher forcing.""" bsz = ys.size(0) seqlen = ys.size(1) @@ -127,9 +130,11 @@ def _decode_forced(self, ys, encoder_states): score = self.output(output) scores.append(score) + scores = torch.cat(scores, 1) return scores def _decode(self, encoder_states, maxlen): + """Decode maxlen tokens.""" hidden = encoder_states[0] attn_params = (encoder_states[1], encoder_states[2]) bsz = encoder_states[1].size(0) @@ -144,9 +149,11 @@ def _decode(self, encoder_states, maxlen): scores.append(score) xs 
= score.max(1)[1] # next input is current predicted output + scores = torch.cat(scores, 1) return scores def _align_inds(self, encoder_states, cand_inds): + """Select the encoder states relevant to valid candidates.""" hidden, enc_out, attn_mask = encoder_states # LSTM or GRU/RNN hidden state? @@ -173,6 +180,7 @@ def _align_inds(self, encoder_states, cand_inds): return hidden, enc_out, attn_mask def _extract_cur(self, encoder_states, index, num_cands): + """Extract encoder states at current index and expand them.""" hidden, enc_out, attn_mask = encoder_states if isinstance(hidden, torch.Tensor): nl = hidden.size(0) @@ -187,6 +195,7 @@ def _extract_cur(self, encoder_states, index, num_cands): hidden[1].select(1, index).unsqueeze(1) .expand(nl, num_cands, hsz).contiguous()) + cur_enc, cur_mask = None, None if self.attn_type != 'none': cur_enc = (enc_out[index].unsqueeze(0) .expand(num_cands, enc_out.size(1), hsz)) @@ -212,9 +221,9 @@ def _rank(self, cand_params, encoder_states): # select just the one hidden state cur_enc_states = self._extract_cur( - encoder_states, num_cands, batch_idx) + encoder_states, batch_idx, num_cands) - score = self.decode_forced(curr_cs, cur_enc_states) + score = self._decode_forced(curr_cs, cur_enc_states) true_score = F.log_softmax(score, dim=2).gather( 2, curr_cs.unsqueeze(2)) nonzero = curr_cs.ne(0).float() @@ -269,7 +278,6 @@ def forward(self, xs, ys=None, cand_params=None, prev_enc=None, else: scores = self._decode(encoder_states, maxlen or self.longest_label) - scores = torch.cat(scores, 1) return scores, cand_scores, encoder_states From 1e76d0d0bac1b9a574e30287976e2e8108e85522 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 16:45:55 -0700 Subject: [PATCH 06/12] bug fixes for convai2 --- parlai/agents/seq2seq/modules.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index e6e8bd65cc0..c82a428ae3b 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -125,7 +125,7 @@ def _decode_forced(self, ys, encoder_states): # TODO: do we need to do this? actually shouldn't need to since we # don't do input feeding for i in range(seqlen): - xi = xs.select(1, i) + xi = xs.select(1, i).unsqueeze(1) output, hidden = self.decoder(xi, hidden, attn_params) score = self.output(output) scores.append(score) @@ -404,14 +404,14 @@ def forward(self, xs, hidden=None, attn_params=None): if self.attn_time == 'pre': # modify input vectors with attention - xes = self.attention(xes, hidden, attn_params) + xes, _attw = self.attention(xes, hidden, attn_params) # feed tokens into rnn output, new_hidden = self.rnn(xes, hidden) if self.attn_time == 'post': # modify output vectors with attention - output = self.attention(output, new_hidden, attn_params) + output, _attw = self.attention(output, new_hidden, attn_params) return output, new_hidden @@ -617,7 +617,7 @@ def forward(self, xes, hidden, attn_params): # calculate activation scores, apply mask if needed if attn_mask is not None: # remove activation from NULL symbols - import pdb; pdb.set_trace() # is this the best operation? + # TODO: is this the best operation? 
attn_w_premask -= (1 - attn_mask) * 1e20 attn_weights = F.softmax(attn_w_premask, dim=1) @@ -627,8 +627,5 @@ def forward(self, xes, hidden, attn_params): merged = torch.cat((xes.squeeze(1), attn_applied.squeeze(1)), 1) # combine them with a linear layer and tanh activation output = F.tanh(self.attn_combine(merged).unsqueeze(1)) - print('make sure last_hidden == xes') - import pdb; pdb.set_trace() - # TODO: remove tanh? return output, attn_weights From 9df6d0e227a561f5df17dce29c5342d1f598ab23 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 07:38:01 -0700 Subject: [PATCH 07/12] bug fix --- parlai/agents/seq2seq/modules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index c82a428ae3b..4e6d3f6ad38 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -147,7 +147,7 @@ def _decode(self, encoder_states, maxlen): output, hidden = self.decoder(xs, hidden, attn_params) score = self.output(output) scores.append(score) - xs = score.max(1)[1] # next input is current predicted output + xs = score.max(2)[1] # next input is current predicted output scores = torch.cat(scores, 1) return scores From 1dc9594ef0d0b6636fbb02666d6695013ebc9465 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 07:47:01 -0700 Subject: [PATCH 08/12] version --- parlai/agents/seq2seq/seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index f537f09427e..dec2c150d74 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -38,7 +38,7 @@ class Seq2seqAgent(TorchAgent): `(Luong et al. 2015) `_ """ - # version 1 split from version 0 on Aug 24, 2018 + # version 1 split from version 0 on Aug 29, 2018 # to use version 0, use --model legacy:seq2seq:0 # legacy agent code is located in parlai/legacy_agents VERSION = 1 From 2e1efddfb5091980a992917618ce09b29f1792b7 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 07:50:47 -0700 Subject: [PATCH 09/12] lint --- parlai/agents/seq2seq/seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index dec2c150d74..76de2902af5 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -48,7 +48,7 @@ def add_cmdline_args(cls, argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('--init-model', type=str, default=None, - help='load dict/features/weights/opts from this file') + help='load dict/model/opts from this path') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, From 4d8f1035778e7d5d2e42541873e203fd0900d580 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 09:27:17 -0700 Subject: [PATCH 10/12] stephen suggestions, bug fix, move cuda buffer back to s2s --- parlai/agents/seq2seq/modules.py | 36 +++++++++++++------------- parlai/agents/seq2seq/seq2seq.py | 44 ++++++++++++++++++++++---------- parlai/core/torch_agent.py | 19 -------------- 3 files changed, 49 insertions(+), 50 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index 4e6d3f6ad38..7d51c73a2a2 100644 --- a/parlai/agents/seq2seq/modules.py +++ 
b/parlai/agents/seq2seq/modules.py @@ -24,7 +24,7 @@ def opt_to_kwargs(opt): return kwargs -def pad(tensor, length, dim=0): +def pad(tensor, length, dim=0, pad=0): """Pad tensor to a specific length. :param tensor: vector to pad @@ -37,7 +37,7 @@ def pad(tensor, length, dim=0): return torch.cat( [tensor, tensor.new(*tensor.size()[:dim], length - tensor.size(dim), - *tensor.size()[dim + 1:]).zero_()], + *tensor.size()[dim + 1:]).fill_(pad)], dim=dim) else: return tensor @@ -107,8 +107,8 @@ def _decode_forced(self, ys, encoder_states): bsz = ys.size(0) seqlen = ys.size(1) - hidden = encoder_states[0] - attn_params = (encoder_states[1], encoder_states[2]) + hidden = encoder_states[1] + attn_params = (encoder_states[0], encoder_states[2]) # input to model is START + each target except the last y_in = ys.narrow(1, 0, seqlen - 1) @@ -135,9 +135,9 @@ def _decode_forced(self, ys, encoder_states): def _decode(self, encoder_states, maxlen): """Decode maxlen tokens.""" - hidden = encoder_states[0] - attn_params = (encoder_states[1], encoder_states[2]) - bsz = encoder_states[1].size(0) + hidden = encoder_states[1] + attn_params = (encoder_states[0], encoder_states[2]) + bsz = encoder_states[0].size(0) xs = self._starts(bsz) # input start token @@ -154,7 +154,7 @@ def _decode(self, encoder_states, maxlen): def _align_inds(self, encoder_states, cand_inds): """Select the encoder states relevant to valid candidates.""" - hidden, enc_out, attn_mask = encoder_states + enc_out, hidden, attn_mask = encoder_states # LSTM or GRU/RNN hidden state? if isinstance(hidden, torch.Tensor): @@ -177,11 +177,11 @@ def _align_inds(self, encoder_states, cand_inds): enc_out = enc_out.index_select(0, cand_indices) attn_mask = attn_mask.index_select(0, cand_indices) - return hidden, enc_out, attn_mask + return enc_out, hidden, attn_mask def _extract_cur(self, encoder_states, index, num_cands): """Extract encoder states at current index and expand them.""" - hidden, enc_out, attn_mask = encoder_states + enc_out, hidden, attn_mask = encoder_states if isinstance(hidden, torch.Tensor): nl = hidden.size(0) hsz = hidden.size(-1) @@ -201,7 +201,7 @@ def _extract_cur(self, encoder_states, index, num_cands): .expand(num_cands, enc_out.size(1), hsz)) cur_mask = (attn_mask[index].unsqueeze(0) .expand(num_cands, attn_mask.size(-1))) - return cur_hid, cur_enc, cur_mask + return cur_enc, cur_hid, cur_mask def _rank(self, cand_params, encoder_states): """Rank each cand by the average log-probability of the sequence.""" @@ -234,7 +234,8 @@ def _rank(self, cand_params, encoder_states): max_len = max(len(c) for c in cand_scores) cand_scores = torch.cat( - [pad(c, max_len).unsqueeze(0) for c in cand_scores], 0) + [pad(c, max_len, pad=self.NULL_IDX).unsqueeze(0) + for c in cand_scores], 0) return cand_scores def forward(self, xs, ys=None, cand_params=None, prev_enc=None, @@ -259,7 +260,7 @@ def forward(self, xs, ys=None, cand_params=None, prev_enc=None, (bsz x seqlen x num_features) candidate scores are the score the model assigned to each candidate (bsz x num_cands) - encoder states are the (hidden, output, attn_mask) states from the + encoder states are the (output, hidden, attn_mask) states from the encoder. feed this back in to skip encoding on the next call. 
""" if ys is not None: @@ -326,7 +327,7 @@ def forward(self, xs): # embed input tokens xes = self.dropout(self.lt(xs)) - attn_mask = xs.ne(0).float() + attn_mask = xs.ne(0) try: x_lens = torch.sum(attn_mask.int(), dim=1) xes = pack_padded_sequence(xes, x_lens, batch_first=True) @@ -347,7 +348,7 @@ def forward(self, xs): else: hidden = hidden.view(-1, self.dirs, bsz, self.hsz).sum(1) - return hidden, encoder_output, attn_mask + return encoder_output, hidden, attn_mask class RNNDecoder(nn.Module): @@ -568,7 +569,7 @@ def forward(self, xes, hidden, attn_params): """ if self.attention == 'none': # do nothing, no attention - return xes + return xes, None if type(hidden) == tuple: # for lstms use the "hidden" state not the cell state @@ -617,8 +618,7 @@ def forward(self, xes, hidden, attn_params): # calculate activation scores, apply mask if needed if attn_mask is not None: # remove activation from NULL symbols - # TODO: is this the best operation? - attn_w_premask -= (1 - attn_mask) * 1e20 + attn_w_premask.masked_fill_((1 - attn_mask), -1e20) attn_weights = F.softmax(attn_w_premask, dim=1) # apply the attention weights to the encoder states diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index 76de2902af5..5e39e775030 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -38,11 +38,6 @@ class Seq2seqAgent(TorchAgent): `(Luong et al. 2015) `_ """ - # version 1 split from version 0 on Aug 29, 2018 - # to use version 0, use --model legacy:seq2seq:0 - # legacy agent code is located in parlai/legacy_agents - VERSION = 1 - @classmethod def add_cmdline_args(cls, argparser): """Add command-line arguments specifically for this agent.""" @@ -109,7 +104,6 @@ def add_cmdline_args(cls, argparser): help='Top k sampling from renormalized softmax in ' 'test/valid time, default 1 means simple ' 'greedy max output') - agent.add_argument('--seq2seq_version', default=Seq2seqAgent.VERSION) TorchAgent.add_cmdline_args(argparser) Seq2seqAgent.dictionary_class().add_cmdline_args(argparser) return agent @@ -119,6 +113,9 @@ def model_version(): """Return current version of this model, counting up from 0. Models are not backwards-compatible with older versions. + Version 1 split from version 0 on Aug 29, 2018. + To use version 0, use --model legacy:seq2seq:0 + (legacy agent code is located in parlai/legacy_agents). 
""" return 1 @@ -304,6 +301,26 @@ def batchify(self, *args, **kwargs): kwargs['sort'] = True # need sorted for pack_padded return super().batchify(*args, **kwargs) + def _init_cuda_buffer(self, model, criterion, batchsize, maxlen): + """Pre-initialize CUDA buffer by doing fake forward pass.""" + if self.use_cuda and not hasattr(self, 'buffer_initialized'): + try: + print('preinitializing pytorch cuda buffer') + dummy = torch.ones(batchsize, maxlen).long().cuda() + out = model(dummy, dummy) + sc = out[0] # scores + loss = criterion(sc.view(-1, sc.size(-1)), dummy.view(-1)) + loss.backward() + self.buffer_initialized = True + except RuntimeError as e: + if 'out of memory' in str(e): + m = ('CUDA OOM: Lower batch size (-bs) from {} or lower ' + ' max sequence length (-tr) from {}' + ''.format(batchsize, maxlen)) + raise RuntimeError(m) + else: + raise e + def train_step(self, batch): """Train on a single batch of examples.""" batchsize = batch.text_vec.size(0) @@ -317,7 +334,7 @@ def train_step(self, batch): # generated response scores = out[0] - preds = scores.max(2)[1] + _, preds = scores.max(2) score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, batch.label_vec.view(-1)) @@ -364,13 +381,14 @@ def eval_step(self, batch): cand_params = self._build_cands(batch) out = self.model(batch.text_vec, ys=None, cand_params=cand_params) scores, cand_scores = out[0], out[1] + _, preds = scores.max(2) if batch.label_vec is not None: - # calculate loss on targets + # calculate loss on targets with teacher forcing out = self.model(batch.text_vec, batch.label_vec) - scores = out[0] - f_preds = scores.max(2)[1] # forced preds - score_view = scores.view(-1, scores.size(-1)) + f_scores = out[0] # forced scores + _, f_preds = f_scores.max(2) # forced preds + score_view = f_scores.view(-1, f_scores.size(-1)) loss = self.criterion(score_view, batch.label_vec.view(-1)) # save loss to metrics notnull = batch.label_vec.ne(self.NULL_IDX) @@ -387,7 +405,7 @@ def eval_step(self, batch): cand_choices = self._pick_cands(cand_preds, cand_params[1], batch.candidates) - text = [self._v2t(p) for p in scores.max(2)[1].cpu()] + text = [self._v2t(p) for p in preds.cpu()] return Output(text, cand_choices) def save(self, path=None): @@ -407,7 +425,7 @@ def save(self, path=None): # save opt file with open(path + ".opt", 'wb') as handle: # save version string - self.opt['model_version'] = Seq2seqAgent.VERSION + self.opt['model_version'] = self.model_version() pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def load(self, path): diff --git a/parlai/core/torch_agent.py b/parlai/core/torch_agent.py index 17b5281a634..d811ae6d2ef 100644 --- a/parlai/core/torch_agent.py +++ b/parlai/core/torch_agent.py @@ -769,25 +769,6 @@ def act(self): """Call batch_act with the singleton batch.""" return self.batch_act([self.observation])[0] - def _init_cuda_buffer(self, model, criterion, batchsize, maxlen): - """Pre-initialize CUDA buffer by doing fake forward pass.""" - if self.use_cuda and not hasattr(self, 'buffer_initialized'): - try: - print('preinitializing pytorch cuda buffer') - dummy = torch.ones(batchsize, maxlen).long().cuda() - sc = model(dummy, dummy)[0] - loss = criterion(sc.view(-1, sc.size(-1)), dummy.view(-1)) - loss.backward() - self.buffer_initialized = True - except RuntimeError as e: - if 'out of memory' in str(e): - m = ('CUDA OOM: Lower batch size (-bs) from {} or lower ' - ' max sequence length (-tr) from {}' - ''.format(batchsize, maxlen)) - raise RuntimeError(m) - else: - raise e 
- def batch_act(self, observations): """Process a batch of observations (batchsize list of message dicts). From 98f0ebaeb8f8d0ef145b931b4bae4736541835f2 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 10:54:17 -0700 Subject: [PATCH 11/12] flake fixes --- parlai/agents/seq2seq/seq2seq.py | 8 +- parlai/core/torch_agent.py | 122 +++++++++++++++++++------------ 2 files changed, 81 insertions(+), 49 deletions(-) diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index 5e39e775030..1e387caaa23 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -133,8 +133,8 @@ def __init__(self, opt, shared=None): if init_model is not None: # if we are loading a model, should load its dict too - if (os.path.isfile(init_model + '.dict') - or opt['dict_file'] is None): + if (os.path.isfile(init_model + '.dict') or + opt['dict_file'] is None): opt['dict_file'] = init_model + '.dict' super().__init__(opt, shared) opt = self.opt @@ -194,8 +194,8 @@ def _init_model(self, states=None): longest_label=states.get('longest_label', 1), **kwargs) - if (opt.get('dict_tokenizer') == 'bpe' - and opt['embedding_type'] != 'random'): + if (opt.get('dict_tokenizer') == 'bpe' and + opt['embedding_type'] != 'random'): print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # `not states`: only set up embeddings if not loading model diff --git a/parlai/core/torch_agent.py b/parlai/core/torch_agent.py index d811ae6d2ef..c60bc0b2bb1 100644 --- a/parlai/core/torch_agent.py +++ b/parlai/core/torch_agent.py @@ -844,10 +844,12 @@ def __init__(self, beam_size, min_length=3, padding_token=0, bos_token=1, # backtracking id to hypothesis at previous time step self.bookkeep = [] # output tokens at each time step - self.outputs = [torch.Tensor(self.beam_size).long().fill_(padding_token).to(self.device)] + self.outputs = [torch.Tensor(self.beam_size).long() + .fill_(padding_token).to(self.device)] # keeps tuples (score, time_step, hyp_id) self.finished = [] - self.HypothesisTail = namedtuple('HypothesisTail', ['timestep', 'hypid', 'score', 'tokenid']) + self.HypothesisTail = namedtuple( + 'HypothesisTail', ['timestep', 'hypid', 'score', 'tokenid']) self.eos_top = False self.eos_top_ts = None self.n_best_counter = 0 @@ -866,9 +868,10 @@ def advance(self, softmax_probs): # hypos are the same initially beam_scores = softmax_probs[0] else: - # we need to sum up hypo scores and current softmax scores before topk + # we need to sum up hypo scores and curr softmax scores before topk # [beam_size, voc_size] - beam_scores = softmax_probs + self.scores.unsqueeze(1).expand_as(softmax_probs) + beam_scores = (softmax_probs + + self.scores.unsqueeze(1).expand_as(softmax_probs)) for i in range(self.outputs[-1].size(0)): # if previous output hypo token had eos # we penalize those word probs to never be chosen @@ -878,12 +881,15 @@ def advance(self, softmax_probs): flatten_beam_scores = beam_scores.view(-1) # [beam_size * voc_size] with torch.no_grad(): - best_scores, best_idxs = torch.topk(flatten_beam_scores, self.beam_size, dim=-1) + best_scores, best_idxs = torch.topk( + flatten_beam_scores, self.beam_size, dim=-1) self.scores = best_scores self.all_scores.append(self.scores) - hyp_ids = best_idxs / voc_size # get the backtracking hypothesis id as a multiple of full voc_sizes - tok_ids = best_idxs % voc_size # get the actual word id from residual of the same division + # get the backtracking hypothesis id as a 
multiple of full voc_sizes + hyp_ids = best_idxs / voc_size + # get the actual word id from residual of the same division + tok_ids = best_idxs % voc_size self.outputs.append(tok_ids) self.bookkeep.append(hyp_ids) @@ -892,7 +898,9 @@ def advance(self, softmax_probs): for hypid in range(self.beam_size): if self.outputs[-1][hypid] == self.eos: # this is finished hypo, adding to finished - eostail = self.HypothesisTail(timestep=len(self.outputs) - 1, hypid=hypid, score=self.scores[hypid], + eostail = self.HypothesisTail(timestep=len(self.outputs) - 1, + hypid=hypid, + score=self.scores[hypid], tokenid=self.eos) self.finished.append(eostail) self.n_best_counter += 1 @@ -906,27 +914,30 @@ def done(self): return self.eos_top and self.n_best_counter >= self.min_n_best def get_top_hyp(self): - """ - Helper function to get single best hypothesis + """Get single best hypothesis. + :return: hypothesis sequence and the final score """ top_hypothesis_tail = self.get_rescored_finished(n_best=1)[0] - return self.get_hyp_from_finished(top_hypothesis_tail), top_hypothesis_tail.score + return (self.get_hyp_from_finished(top_hypothesis_tail), + top_hypothesis_tail.score) def get_hyp_from_finished(self, hypothesis_tail): - """ - Extract hypothesis ending with EOS at timestep with hyp_id + """Extract hypothesis ending with EOS at timestep with hyp_id. + :param timestep: timestep with range up to len(self.outputs)-1 :param hyp_id: id with range up to beam_size-1 :return: hypothesis sequence """ - assert self.outputs[hypothesis_tail.timestep][hypothesis_tail.hypid] == self.eos + assert (self.outputs[hypothesis_tail.timestep][hypothesis_tail.hypid] + == self.eos) assert hypothesis_tail.tokenid == self.eos hyp_idx = [] endback = hypothesis_tail.hypid for i in range(hypothesis_tail.timestep, -1, -1): - hyp_idx.append(self.HypothesisTail(timestep=i, hypid=endback, score=self.all_scores[i][endback], - tokenid=self.outputs[i][endback])) + hyp_idx.append(self.HypothesisTail( + timestep=i, hypid=endback, score=self.all_scores[i][endback], + tokenid=self.outputs[i][endback])) endback = self.bookkeep[i - 1][endback] return hyp_idx @@ -949,12 +960,15 @@ def get_rescored_finished(self, n_best=None): rescored_finished = [] for finished_item in self.finished: current_length = finished_item.timestep + 1 - length_penalty = math.pow((1 + current_length) / 6, 0.65) # this is from Google NMT paper - rescored_finished.append(self.HypothesisTail(timestep=finished_item.timestep, hypid=finished_item.hypid, - score=finished_item.score / length_penalty, - tokenid=finished_item.tokenid)) + # these weights are from Google NMT paper + length_penalty = math.pow((1 + current_length) / 6, 0.65) + rescored_finished.append(self.HypothesisTail( + timestep=finished_item.timestep, hypid=finished_item.hypid, + score=finished_item.score / length_penalty, + tokenid=finished_item.tokenid)) - srted = sorted(rescored_finished, key=attrgetter('score'), reverse=True) + srted = sorted(rescored_finished, key=attrgetter('score'), + reverse=True) if n_best is not None: srted = srted[:n_best] @@ -962,26 +976,30 @@ def get_rescored_finished(self, n_best=None): return srted def check_finished(self): - """ - this function checks if self.finished is empty and adds hyptail - in that case (this will be suboptimal hypothesis since - the model did not get any EOS) - :return: None + """Checks if self.finished is empty and add hyptail in that case. 
+ + This will be suboptimal hypothesis since the model did not get any EOS + + :returns: None """ if len(self.finished) == 0: - # we change output because we want outputs to have this eos to pass assert in L102, it is ok since empty self.finished means junk prediction anyway + # we change output because we want outputs to have eos + # to pass assert in L102, it is ok since empty self.finished + # means junk prediction anyway self.outputs[-1][0] = self.eos - hyptail = self.HypothesisTail(timestep=len(self.outputs) - 1, hypid=0, score=self.all_scores[-1][0], + hyptail = self.HypothesisTail(timestep=len(self.outputs) - 1, + hypid=0, + score=self.all_scores[-1][0], tokenid=self.outputs[-1][0]) self.finished.append(hyptail) def get_beam_dot(self, dictionary=None, n_best=None): - """ - Creates pydot graph representation of the beam + """Creates pydot graph representation of the beam. + :param outputs: self.outputs from the beam :param dictionary: tok 2 word dict to save words in the tree nodes - :return: pydot graph + :returns: pydot graph """ try: import pydot @@ -997,18 +1015,22 @@ def get_beam_dot(self, dictionary=None, n_best=None): # get top nbest hyp top_hyp_idx_n_best = [] - n_best_colors = ['aquamarine', 'chocolate1', 'deepskyblue', 'green2', 'tan'] + n_best_colors = ['aquamarine', 'chocolate1', 'deepskyblue', + 'green2', 'tan'] sorted_finished = self.get_rescored_finished(n_best=n_best) for hyptail in sorted_finished: + # do not include EOS since it has rescored score not from original + # self.all_scores, we color EOS with black top_hyp_idx_n_best.append(self.get_hyp_from_finished( - hyptail)) # do not include EOS since it has rescored score not from original self.all_scores, we color EOS with black + hyptail)) # create nodes for tstep, lis in enumerate(outputs): for hypid, token in enumerate(lis): if tstep == 0: hypid = 0 # collapse all __NULL__ nodes - node_tail = self.HypothesisTail(timestep=tstep, hypid=hypid, score=all_scores[tstep][hypid], + node_tail = self.HypothesisTail(timestep=tstep, hypid=hypid, + score=all_scores[tstep][hypid], tokenid=token) color = 'white' rank = None @@ -1018,20 +1040,30 @@ def get_beam_dot(self, dictionary=None, n_best=None): color = n_best_colors[i] rank = i break - label = "<{}".format( - dictionary.vec2txt([token]) if dictionary is not None else token) + " : " + "{:.{prec}f}>".format( - all_scores[tstep][hypid], prec=3) - graph.add_node(pydot.Node(node_tail.__repr__(), label=label, fillcolor=color, style='filled', - xlabel='{}'.format(rank) if rank is not None else '')) + label = ( + "<{}".format(dictionary.vec2txt([token]) + if dictionary is not None else token) + + " : " + + "{:.{prec}f}>".format(all_scores[tstep][hypid], prec=3)) + + graph.add_node(pydot.Node( + node_tail.__repr__(), label=label, fillcolor=color, + style='filled', + xlabel='{}'.format(rank) if rank is not None else '')) + # create edges for revtstep, lis in reversed(list(enumerate(bookkeep))): for i, prev_id in enumerate(lis): - from_node = graph.get_node('"{}"'.format( - self.HypothesisTail(timestep=revtstep, hypid=prev_id, score=all_scores[revtstep][prev_id], - tokenid=outputs[revtstep][prev_id]).__repr__()))[0] - to_node = graph.get_node('"{}"'.format( - self.HypothesisTail(timestep=revtstep + 1, hypid=i, score=all_scores[revtstep + 1][i], - tokenid=outputs[revtstep + 1][i]).__repr__()))[0] + from_node = graph.get_node( + '"{}"'.format(self.HypothesisTail( + timestep=revtstep, hypid=prev_id, + score=all_scores[revtstep][prev_id], + 
tokenid=outputs[revtstep][prev_id]).__repr__()))[0] + to_node = graph.get_node( + '"{}"'.format(self.HypothesisTail( + timestep=revtstep + 1, hypid=i, + score=all_scores[revtstep + 1][i], + tokenid=outputs[revtstep + 1][i]).__repr__()))[0] newedge = pydot.Edge(from_node.get_name(), to_node.get_name()) graph.add_edge(newedge) From 4f14eca51eaa3d9733b8e761d3d3557c79444154 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 11:12:44 -0700 Subject: [PATCH 12/12] another flake fix --- parlai/core/torch_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parlai/core/torch_agent.py b/parlai/core/torch_agent.py index c60bc0b2bb1..54ba33e4611 100644 --- a/parlai/core/torch_agent.py +++ b/parlai/core/torch_agent.py @@ -929,8 +929,8 @@ def get_hyp_from_finished(self, hypothesis_tail): :param hyp_id: id with range up to beam_size-1 :return: hypothesis sequence """ - assert (self.outputs[hypothesis_tail.timestep][hypothesis_tail.hypid] - == self.eos) + assert (self.outputs[hypothesis_tail.timestep] + [hypothesis_tail.hypid] == self.eos) assert hypothesis_tail.tokenid == self.eos hyp_idx = [] endback = hypothesis_tail.hypid
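
A minimal standalone sketch of the beam bookkeeping touched by the last two
patches: finished hypotheses are rescored by dividing their score by the
Google NMT length penalty, ((1 + length) / 6) ** 0.65, and the token sequence
is recovered by walking the backpointers in `bookkeep` from the finishing
hypothesis id back to timestep 0. The toy `outputs`/`bookkeep` values below
are hypothetical and only illustrate the indexing, not real beam state.

    import math
    from collections import namedtuple

    HypothesisTail = namedtuple(
        'HypothesisTail', ['timestep', 'hypid', 'score', 'tokenid'])

    def length_penalty(length):
        # weights from the Google NMT paper (Wu et al. 2016), alpha = 0.65
        return math.pow((1 + length) / 6, 0.65)

    def backtrack(outputs, bookkeep, tail):
        # walk back from a finished tail to timestep 0, collecting tokens
        tokens = []
        hypid = tail.hypid
        for t in range(tail.timestep, -1, -1):
            tokens.append(outputs[t][hypid])
            if t > 0:
                hypid = bookkeep[t - 1][hypid]
        return list(reversed(tokens))

    # toy beam of size 2 over 3 steps; token id 2 plays the role of EOS
    outputs = [[0, 0], [5, 7], [2, 6]]   # token chosen per hypothesis per step
    bookkeep = [[0, 0], [1, 0]]          # parent hypothesis id per step
    tail = HypothesisTail(timestep=2, hypid=0, score=-1.2, tokenid=2)

    rescored = tail.score / length_penalty(tail.timestep + 1)
    print(backtrack(outputs, bookkeep, tail), rescored)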