From 5603ba356a50985e73c65af835880b4e744febc5 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 13:54:20 -0700 Subject: [PATCH 01/12] partial progress on rewriting modules.py --- parlai/agents/seq2seq/modules.py | 605 ++++++++++++++----------------- parlai/agents/seq2seq/seq2seq.py | 10 +- 2 files changed, 281 insertions(+), 334 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index 366235f90ae..bf9d49a36d3 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -34,7 +34,6 @@ def __init__(self, opt, num_features, super().__init__() self.opt = opt - self.rank = opt['rank_candidates'] self.attn_type = opt['attention'] self.NULL_IDX = padding_idx @@ -43,7 +42,7 @@ def __init__(self, opt, num_features, self.longest_label = longest_label rnn_class = Seq2seq.RNN_OPTS[opt['rnn_class']] - self.decoder = Decoder( + self.decoder = RNNDecoder( num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, emb_size=opt['embeddingsize'], hidden_size=opt['hiddensize'], num_layers=opt['numlayers'], dropout=opt['dropout'], @@ -51,22 +50,22 @@ def __init__(self, opt, num_features, attn_type=opt['attention'], attn_length=opt['attention_length'], attn_time=opt.get('attention_time'), bidir_input=opt['bidirectional'], - numsoftmax=opt.get('numsoftmax', 1), - softmax_layer_bias=opt.get('softmax_layer_bias', False)) + numsoftmax=opt.get('numsoftmax', 1)) shared_lt = (self.decoder.lt if opt['lookuptable'] in ['enc_dec', 'all'] else None) shared_rnn = self.decoder.rnn if opt['decoder'] == 'shared' else None - self.encoder = Encoder( + self.encoder = RNNEncoder( num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, emb_size=opt['embeddingsize'], hidden_size=opt['hiddensize'], num_layers=opt['numlayers'], dropout=opt['dropout'], bidirectional=opt['bidirectional'], shared_lt=shared_lt, shared_rnn=shared_rnn) - if self.rank: + if opt['rank_candidates']: self.ranker = Ranker( self.decoder, + self.START, padding_idx=self.NULL_IDX, attn_type=opt['attention']) @@ -78,192 +77,104 @@ def __init__(self, opt, num_features, if not os.path.exists(self.beam_dump_path): os.makedirs(self.beam_dump_path) - def unbeamize_hidden(self, hidden, beam_size, batch_size): - """ - Creates a view of the hidden where batch axis is collapsed with beam axis, - we need to do this for batched beam search, i.e. we emulate bigger mini-batch - :param hidden: hidden state of the decoder - :param beam_size: beam size, i.e. 
num of hypothesis - :param batch_size: number of samples in the mini batch - :return: view of the hidden - """ - if isinstance(hidden, tuple): - num_layers = hidden[0].size(0) - hidden_size = hidden[0].size(-1) - return (hidden[0].view(num_layers, batch_size * beam_size, hidden_size), - hidden[1].view(num_layers, batch_size * beam_size, hidden_size)) - else: # GRU - num_layers = hidden.size(0) - hidden_size = hidden.size(-1) - return hidden.view(num_layers, batch_size * beam_size, hidden_size) - - def unbeamize_enc_out(self, enc_out, beam_size, batch_size): - hidden_size = enc_out.size(-1) - return enc_out.view(batch_size * beam_size, -1, hidden_size) - - def forward(self, xs, ys=None, cands=None, cand_indices=None, prev_enc=None, - rank_during_training=False, beam_size=1, topk=1): + def _encode(self, xs, x_lens=None, prev_enc=None): + if prev_enc is not None: + return prev_enc + else: + enc_out, hidden = self.encoder(xs, x_lens) + attn_mask = xs.ne(0).float() if self.attn_type != 'none' else None + return hidden, enc_out, attn_mask + + def _rank(self, cand_params, encoder_states): + if cand_params is not None: + return self.ranker.forward(cand_params, encoder_states) + return None + + def _starts(self, bsz): + return self.START.detach().expand(bsz, 1) + + def _decode_forced(self, ys, encoder_states): + bsz = ys.size(0) + seqlen = ys.size(1) + + hidden = encoder_states[0] + attn_params = (encoder_states[1], encoder_states[2]) + + # input to model is START + each target except the last + y_in = ys.narrow(1, 0, seqlen - 1) + xs = torch.cat([self._starts(bsz), y_in], 1) + + scores = [] + if self.attn_type == 'none': + # do the whole thing in one go + output, hidden = self.decoder(xs, hidden, attn_params) + score = self.output(output) + scores.append(score) + else: + # need to feed in one token at a time so we can do attention + # TODO: do we need to do this? actually shouldn't need to since we + # don't do input feeding + for i in range(seqlen): + xi = xs.select(1, i) + output, hidden = self.decoder(xi, hidden, attn_params) + score = self.output(output) + scores.append(score) + + return scores + + def _decode(self, encoder_states, maxlen): + hidden = encoder_states[0] + attn_params = (encoder_states[1], encoder_states[2]) + bsz = hidden.size(0) + + xs = self._starts(bsz) # input start token + + scores = [] + for _ in range(maxlen): + # generate at most longest_label tokens + output, hidden = self.decoder(xs, hidden, attn_params) + score = self.output(output) + scores.append(score) + xs = scores.max(1)[0] # next input is current predicted output + + return scores + + def forward(self, xs, x_lens=None, ys=None, cand_params=None, + prev_enc=None, rank_during_training=False, maxlen=None): """Get output predictions from the model. - Arguments: - xs -- input to the encoder - ys -- expected output from the decoder - cands -- set of candidates to rank, if applicable - cand_indices -- indices to match candidates with their appropriate xs - prev_enc -- if you know you'll pass in the same xs multiple times and - the model is in eval mode, you can pass in the encoder output from - the last forward pass to skip recalcuating the same encoder output - rank_during_training -- (default False) if set, ranks any available - cands during training as well + :param xs: (bsz x seqlen) LongTensor input to the encoder + :param x_lens: (list of ints) length of each input sequence + :param ys: expected output from the decoder. used for teacher forcing to calculate loss. 
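            (concretely, teacher forcing feeds the decoder START at the
            first step and ys[:, t-1] at step t, rather than the model's
            own predictions)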
+ :param cand_params: set of candidates to rank, and indices to match candidates with their appropriate xs + :param prev_enc: if you know you'll pass in the same xs multiple times, you can pass in the encoder output from the last forward pass to skip recalcuating the same encoder output + : """ - input_xs = xs - nbest_beam_preds, nbest_beam_scores = None, None - bsz = len(xs) if ys is not None: # keep track of longest label we've ever seen # we'll never produce longer ones than that during prediction self.longest_label = max(self.longest_label, ys.size(1)) - if prev_enc is not None: - enc_out, hidden, attn_mask = prev_enc - else: - enc_out, hidden = self.encoder(xs) - attn_mask = xs.ne(0).float() if self.attn_type != 'none' else None - encoder_states = (enc_out, hidden, attn_mask) - start = self.START.detach() - starts = start.expand(bsz, 1) + encoder_states = self.encode(xs, x_lens, prev_enc) - predictions = [] - scores = [] - cand_preds, cand_scores = None, None - if self.rank and cands is not None: - decode_params = (start, hidden, enc_out, attn_mask) - if self.training: - if rank_during_training: - cand_preds, cand_scores = self.ranker.forward(cands, cand_indices, decode_params=decode_params) - else: - cand_preds, cand_scores = self.ranker.forward(cands, cand_indices, decode_params=decode_params) + # rank candidates if they are available + cand_scores = self._rank(cand_params, encoder_states) if ys is not None: - y_in = ys.narrow(1, 0, ys.size(1) - 1) - xs = torch.cat([starts, y_in], 1) - if self.attn_type == 'none': - preds, score, hidden = self.decoder(xs, hidden, enc_out, attn_mask) - predictions.append(preds) - scores.append(score) - else: - for i in range(ys.size(1)): - xi = xs.select(1, i) - preds, score, hidden = self.decoder(xi, hidden, enc_out, attn_mask) - predictions.append(preds) - scores.append(score) + # use teacher forcing + scores = self._decode_forced(ys, encoder_states) else: - # here we do search: supported search types: greedy, beam search - if beam_size == 1: - done = [False for _ in range(bsz)] - total_done = 0 - xs = starts - - for _ in range(self.longest_label): - # generate at most longest_label tokens - preds, score, hidden = self.decoder(xs, hidden, enc_out, attn_mask, topk) - scores.append(score) - xs = preds - predictions.append(preds) - - # check if we've produced the end token - for b in range(bsz): - if not done[b]: - # only add more tokens for examples that aren't done - if preds.data[b][0] == self.END_IDX: - # if we produced END, we're done - done[b] = True - total_done += 1 - if total_done == bsz: - # no need to generate any more - break - - elif beam_size > 1: - enc_out, hidden = encoder_states[0], encoder_states[1] # take it from encoder - enc_out = enc_out.unsqueeze(1).repeat(1, beam_size, 1, 1) - # create batch size num of beams - data_device = enc_out.device - beams = [Beam(beam_size, 3, 0, 1, 2, min_n_best=beam_size / 2, cuda=data_device) for _ in range(bsz)] - # init the input with start token - xs = starts - # repeat tensors to support batched beam - xs = xs.repeat(1, beam_size) - attn_mask = input_xs.ne(0).float() - attn_mask = attn_mask.unsqueeze(1).repeat(1, beam_size, 1) - repeated_hidden = [] - - if isinstance(hidden, tuple): - for i in range(len(hidden)): - repeated_hidden.append(hidden[i].unsqueeze(2).repeat(1, 1, beam_size, 1)) - hidden = self.unbeamize_hidden(tuple(repeated_hidden), beam_size, bsz) - else: # GRU - repeated_hidden = hidden.unsqueeze(2).repeat(1, 1, beam_size, 1) - hidden = self.unbeamize_hidden(repeated_hidden, 
beam_size, bsz) - enc_out = self.unbeamize_enc_out(enc_out, beam_size, bsz) - xs = xs.view(bsz * beam_size, -1) - for step in range(self.longest_label): - if all((b.done() for b in beams)): - break - out = self.decoder(xs, hidden, enc_out) - scores = out[1] - scores = scores.view(bsz, beam_size, -1) # -1 is a vocab size - for i, b in enumerate(beams): - b.advance(F.log_softmax(scores[i, :], dim=-1)) - xs = torch.cat([b.get_output_from_current_step() for b in beams]).unsqueeze(-1) - permute_hidden_idx = torch.cat( - [beam_size * i + b.get_backtrack_from_current_step() for i, b in enumerate(beams)]) - new_hidden = out[2] - if isinstance(hidden, tuple): - for i in range(len(hidden)): - hidden[i].data.copy_(new_hidden[i].data.index_select(dim=1, index=permute_hidden_idx)) - else: # GRU - hidden.data.copy_(new_hidden.data.index_select(dim=1, index=permute_hidden_idx)) - - for b in beams: - b.check_finished() - beam_pred = [b.get_pretty_hypothesis(b.get_top_hyp()[0])[1:] for b in beams] - # these beam scores are rescored with length penalty! - beam_scores = torch.stack([b.get_top_hyp()[1] for b in beams]) - pad_length = max([t.size(0) for t in beam_pred]) - beam_pred = torch.stack([pad(t, length=pad_length, dim=0) for t in beam_pred], dim=0) - - # prepare n best list for each beam - n_best_beam_tails = [b.get_rescored_finished(n_best=len(b.finished)) for b in beams] - nbest_beam_scores = [] - nbest_beam_preds = [] - for i, beamtails in enumerate(n_best_beam_tails): - perbeam_preds = [] - perbeam_scores = [] - for tail in beamtails: - perbeam_preds.append(beams[i].get_pretty_hypothesis(beams[i].get_hyp_from_finished(tail))) - perbeam_scores.append(tail.score) - nbest_beam_scores.append(perbeam_scores) - nbest_beam_preds.append(perbeam_preds) - - if self.beam_log_freq > 0.0: - num_dump = round(bsz * self.beam_log_freq) - for i in range(num_dump): - dot_graph = beams[i].get_beam_dot(dictionary=self.dict) - dot_graph.write_png(os.path.join(self.beam_dump_path, "{}.png".format(self.beam_dump_filecnt))) - self.beam_dump_filecnt += 1 - - predictions = beam_pred - scores = beam_scores - - if isinstance(predictions, list): - predictions = torch.cat(predictions, 1) + scores = self._decode(encoder_states, maxlen or self.longest_label) + if isinstance(scores, list): scores = torch.cat(scores, 1) - return predictions, scores, cand_preds, cand_scores, encoder_states, nbest_beam_preds, nbest_beam_scores + return scores, cand_scores, encoder_states + +class RNNEncoder(nn.Module): + """RNN Encoder.""" -class Encoder(nn.Module): def __init__(self, num_features, padding_idx=0, rnn_class='lstm', emb_size=128, hidden_size=128, num_layers=2, dropout=0.1, bidirectional=False, shared_lt=None, shared_rnn=None, @@ -284,20 +195,25 @@ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', if shared_rnn is None: self.rnn = rnn_class(emb_size, hidden_size, num_layers, - dropout=dropout, batch_first=True, - bidirectional=bidirectional) + dropout=dropout if num_layers > 1 else 0, + batch_first=True, bidirectional=bidirectional) elif bidirectional: raise RuntimeError('Cannot share decoder with bidir encoder.') else: self.rnn = shared_rnn - def forward(self, xs): + def forward(self, xs, x_lens=None): + """Encode sequence. 
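
        When the batch is sorted by decreasing length, the embedded inputs
        are packed with pack_padded_sequence before the RNN; otherwise the
        padded batch is fed to the RNN as-is (see the try/except below).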
+ + :param xs: (bsz x seqlen) LongTensor of input token indices + """ bsz = len(xs) # embed input tokens xes = self.dropout(self.lt(xs)) try: - x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] + if x_lens is None: + x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] xes = pack_padded_sequence(xes, x_lens, batch_first=True) packed = True except ValueError: @@ -309,29 +225,27 @@ def forward(self, xs): encoder_output, _ = pad_packed_sequence(encoder_output, batch_first=True) if self.dirs > 1: - # take elementwise max between forward and backward hidden states - # NOTE: currently using max, but maybe should use Linear + # project to decoder dimension by taking sum of forward and back if isinstance(self.rnn, nn.LSTM): - hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).max(1)[0], - hidden[1].view(-1, self.dirs, bsz, self.hsz).max(1)[0]) + hidden = (hidden[0].view(-1, self.dirs, bsz, self.hsz).sum(1), + hidden[1].view(-1, self.dirs, bsz, self.hsz).sum(1)) else: - hidden = hidden.view(-1, self.dirs, bsz, self.hsz).max(1)[0] + hidden = hidden.view(-1, self.dirs, bsz, self.hsz).sum(1) return encoder_output, hidden -class Decoder(nn.Module): +class RNNDecoder(nn.Module): + """Recurrent decoder module. + + Can be used as a standalone language model or paired with an encoder. + """ + def __init__(self, num_features, padding_idx=0, rnn_class='lstm', emb_size=128, hidden_size=128, num_layers=2, dropout=0.1, - bidir_input=False, share_output=True, - attn_type='none', attn_length=-1, attn_time='pre', - sparse=False, numsoftmax=1, softmax_layer_bias=False): + bidir_input=False, attn_type='none', attn_time='pre', + attn_length=-1, sparse=False): super().__init__() - - if padding_idx != 0: - raise RuntimeError('This module\'s output layer needs to be fixed ' - 'if you want a padding_idx other than zero.') - self.dropout = nn.Dropout(p=dropout) self.layers = num_layers self.hsz = hidden_size @@ -340,21 +254,8 @@ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', self.lt = nn.Embedding(num_features, emb_size, padding_idx=padding_idx, sparse=sparse) self.rnn = rnn_class(emb_size, hidden_size, num_layers, - dropout=dropout, batch_first=True) - - # rnn output to embedding - if hidden_size != emb_size and numsoftmax == 1: - # self.o2e = RandomProjection(hidden_size, emb_size) - # other option here is to learn these weights - self.o2e = nn.Linear(hidden_size, emb_size, bias=False) - else: - # no need for any transformation here - self.o2e = lambda x: x - # embedding to scores, use custom linear to possibly share weights - shared_weight = self.lt.weight if share_output else None - self.e2s = Linear(emb_size, num_features, bias=softmax_layer_bias, - shared_weight=shared_weight) - self.shared = shared_weight is not None + dropout=dropout if num_layers > 1 else 0, + batch_first=True) self.attn_type = attn_type self.attn_time = attn_time @@ -365,69 +266,145 @@ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', attn_length=attn_length, attn_time=attn_time) + def forward(self, xs, hidden=None, attn_params=None): + """Decode from input tokens. + + :param xs: (bsz x seqlen) LongTensor of input token indices + :param hidden: hidden state to feed into decoder. default (None) + initializes tensors using the RNN's defaults. + :param attn_params: (optional) tuple containing attention parameters, + default AttentionLayer needs encoder_output states + and attention mask (e.g. encoder_input.ne(0)) + + :returns: output state(s), hidden state. + output state of the encoder. 
for an RNN, this is + (bsz, seq_len, num_directions * hidden_size). + hidden state will be same dimensions as input + hidden state. for an RNN, this is a tensor of sizes + (bsz, num_layers * num_directions, hidden_size). + """ + # sequence indices => sequence embeddings + xes = self.dropout(self.lt(xs)) + + if self.attn_time == 'pre': + # modify input vectors with attention + xes = self.attention(xes, hidden, attn_params) + + # feed tokens into rnn + output, new_hidden = self.rnn(xes, hidden) + + if self.attn_time == 'post': + # modify output vectors with attention + output = self.attention(output, new_hidden, attn_params) + + return output, new_hidden + + +class OutputLayer(nn.Module): + """Takes in final states and returns distribution over candidates.""" + + def __init__(self, num_features, hidden_size, emb_size, numsoftmax=1, + shared_weight=None): + """Initialize output layer. + + :param num_features: number of candidates to rank + :param hidden_size: (last) dimension of the input vectors + :param emb_size: (last) dimension of the candidate vectors + :param num_softmax: (default 1) number of softmaxes to calculate. + see arxiv.org/abs/1711.03953 for more info. + increasing this slows down computation but can + add more expressivity to the embeddings. + :param shared_weight: (num_features x esz) vector of + """ + # embedding to scores + if shared_weight is None: + # just a regular linear layer + self.e2s = nn.Linear(emb_size, num_features, bias=True) + else: + # use shared weights and a bias layer instead + self.weight = shared_weight + self.bias = Parameter(torch.Tensor(num_features)) + self.reset_parameters() + self.e2s = lambda x: F.linear(x, self.weight, self.bias) + self.numsoftmax = numsoftmax if numsoftmax > 1: self.softmax = nn.Softmax(dim=1) self.prior = nn.Linear(hidden_size, numsoftmax, bias=False) self.latent = nn.Linear(hidden_size, numsoftmax * emb_size) self.activation = nn.Tanh() + else: + # rnn output to embedding + if hidden_size != emb_size: + # learn projection to correct dimensions + self.o2e = nn.Linear(hidden_size, emb_size, bias=True) + else: + # no need for any transformation here + self.o2e = lambda x: x - def forward(self, xs, hidden, encoder_output, attn_mask=None, topk=1): - xes = self.dropout(self.lt(xs)) - if self.attn_time == 'pre': - xes = self.attention(xes, hidden, encoder_output, attn_mask) - if xes.dim() == 2: - # if only one token inputted, sometimes needs unsquezing - xes.unsqueeze_(1) - output, new_hidden = self.rnn(xes, hidden) - if self.attn_time == 'post': - output = self.attention(output, new_hidden, encoder_output, attn_mask) + def reset_parameters(self): + """Reset bias param.""" + if hasattr(self, 'bias'): + stdv = 1. / math.sqrt(self.bias.size(1)) + self.bias.data.uniform_(-stdv, stdv) + + def forward(self, input): + """Compute scores from inputs. + + :param input: (bsz x seq_len x num_directions * hidden_size) tensor of + states, e.g. 
the output states of an RNN + :returns: (bsz x seqlen x num_cands) scores for each candidate + """ + # next compute scores over dictionary if self.numsoftmax > 1: - bsz = xs.size(0) - seqlen = xs.size(1) if xs.dim() > 1 else 1 - latent = self.latent(output) + bsz = input.size(0) + seqlen = input.size(1) if input.dim() > 1 else 1 + + # first compute different softmax scores based on input vec + # hsz => num_softmax * esz + latent = self.latent(input) active = self.dropout(self.activation(latent)) + # esz => num_features logit = self.e2s(active.view(-1, self.esz)) - prior_logit = self.prior(output).view(-1, self.numsoftmax) - prior = self.softmax(prior_logit) # softmax over numsoftmax's + # calculate priors: distribution over which softmax scores to use + # hsz => num_softmax + prior_logit = self.prior(input).view(-1, self.numsoftmax) + # softmax over numsoftmax's + prior = self.softmax(prior_logit) + # now combine priors with logits prob = self.softmax(logit).view(bsz * seqlen, self.numsoftmax, -1) probs = (prob * prior.unsqueeze(2)).sum(1).view(bsz, seqlen, -1) scores = probs.log() else: - e = self.dropout(self.o2e(output)) + # hsz => esz, good time for dropout + e = self.dropout(self.o2e(input)) + # esz => num_features scores = self.e2s(e) - # select top scoring index, excluding the padding symbol (at idx zero) - # we can do topk sampling from renoramlized softmax here, default topk=1 is greedy - if topk == 1: - _max_score, idx = scores.narrow(2, 1, scores.size(2) - 1).max(2) - elif topk > 1: - max_score, idx = torch.topk(F.softmax(scores.narrow(2, 1, scores.size(2) - 1), 2), topk, dim=2, sorted=False) - probs = F.softmax(scores.narrow(2, 1, scores.size(2) - 1).gather(2, idx), 2).squeeze(1) - dist = torch.distributions.categorical.Categorical(probs) - samples = dist.sample() - idx = idx.gather(-1, samples.unsqueeze(1).unsqueeze(-1)).squeeze(-1) - preds = idx.add_(1) - - return preds, scores, new_hidden + return scores class Ranker(object): - def __init__(self, decoder, padding_idx=0, attn_type='none'): + def __init__(self, decoder, start_token, padding_idx=0, attn_type='none'): super().__init__() self.decoder = decoder + self.START = start_token self.NULL_IDX = padding_idx self.attn_type = attn_type - def forward(self, cands, cand_inds, decode_params): - start, hidden, enc_out, attn_mask = decode_params + def _starts(self, bsz): + return self.START.detach().expand(bsz, 1) + + def forward(self, cand_params, encoder_states): + cands, cand_inds = cand_params + hidden, enc_out, attn_mask = encoder_states hid, cell = (hidden, None) if isinstance(hidden, torch.Tensor) else hidden if len(cand_inds) != hid.size(1): - cand_indices = start.detach().new(cand_inds) + cand_indices = self.START.detach().new(cand_inds) hid = hid.index_select(1, cand_indices) if cell is None: hidden = hid @@ -444,7 +421,7 @@ def forward(self, cands, cand_inds, decode_params): curr_cs = cands[i] n_cs = curr_cs.size(0) - starts = start.expand(n_cs).unsqueeze(1) + starts = self._starts(n_cs).unsqueeze(1) scores = 0 seqlens = 0 # select just the one hidden state @@ -491,80 +468,15 @@ def forward(self, cands, cand_inds, decode_params): max_len = max(len(c) for c in cand_scores) cand_scores = torch.cat([pad(c, max_len).unsqueeze(0) for c in cand_scores], 0) - preds = cand_scores.sort(1, True)[1] - return preds, cand_scores - - -class Linear(nn.Module): - """Custom Linear layer which allows for sharing weights (e.g. with an - nn.Embedding layer). 
- """ - def __init__(self, in_features, out_features, bias=True, - shared_weight=None): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.shared = shared_weight is not None + return cand_scores - # init weight - if not self.shared: - self.weight = Parameter(torch.Tensor(out_features, in_features)) - else: - if (shared_weight.size(0) != out_features or - shared_weight.size(1) != in_features): - raise RuntimeError('wrong dimensions for shared weights') - self.weight = shared_weight - - # init bias - if bias: - self.bias = Parameter(torch.Tensor(out_features)) - else: - self.register_parameter('bias', None) - self.reset_parameters() - - def reset_parameters(self): - stdv = 1. / math.sqrt(self.weight.size(1)) - if not self.shared: - # weight is shared so don't overwrite it - self.weight.data.uniform_(-stdv, stdv) - if self.bias is not None: - self.bias.data.uniform_(-stdv, stdv) - def forward(self, input): - weight = self.weight - if self.shared: - # detach weight to prevent gradients from changing weight - # (but need to detach every time so weights are up to date) - weight = weight.detach() - return F.linear(input, weight, self.bias) - - def __repr__(self): - return self.__class__.__name__ + ' (' \ - + str(self.in_features) + ' -> ' \ - + str(self.out_features) + ')' - - -class RandomProjection(nn.Module): - """Randomly project input to different dimensionality.""" - def __init__(self, in_features, out_features): - super().__init__() - self.in_features = in_features - self.out_features = out_features - self.weight = Parameter(torch.Tensor(out_features, in_features), - requires_grad=False) # fix weights - self.reset_parameters() - - def reset_parameters(self): - # experimentally: std=1 appears to affect scale too much, so using 0.1 - self.weight.data.normal_(std=0.1) - # other init option: set randomly to 1 or -1 - # self.weight.data.bernoulli_(self.weight.fill_(0.5)).mul_(2).sub_(1) - - def forward(self, input): - return F.linear(input, self.weight) +class AttentionLayer(nn.Module): + """Computes attention between hidden and encoder states. + See arxiv.org/abs/1508.04025 for more info on each attention type. + """ -class AttentionLayer(nn.Module): def __init__(self, attn_type, hidden_size, emb_size, bidirectional=False, attn_length=-1, attn_time='pre'): super().__init__() @@ -581,6 +493,8 @@ def __init__(self, attn_type, hidden_size, emb_size, bidirectional=False, input_dim = hsz else: raise RuntimeError('unsupported attention time') + + # linear layer for combining applied attention weights with input self.attn_combine = nn.Linear(hszXdirs + input_dim, input_dim, bias=False) @@ -599,8 +513,23 @@ def __init__(self, attn_type, hidden_size, emb_size, bidirectional=False, # equivalent to dot if attn is identity self.attn = nn.Linear(hsz, hszXdirs, bias=False) - def forward(self, xes, hidden, enc_out, attn_mask=None): + def forward(self, xes, hidden, attn_params): + """Compute attention over attn_params given input and hidden states. + + :param xes: input state. will be combined with applied + attention. + :param hidden: hidden state from model. will be used to select + states to attend to in from the attn_params. + :param attn_params: tuple of encoder output states and a mask showing + which input indices are nonzero. + + :returns: output, attn_weights + output is a new state of same size as input state `xes`. + attn_weights are the weights given to each state in the + encoder outputs. 
+ """ if self.attention == 'none': + # do nothing, no attention return xes if type(hidden) == tuple: @@ -608,40 +537,60 @@ def forward(self, xes, hidden, enc_out, attn_mask=None): hidden = hidden[0] last_hidden = hidden[-1] # select hidden state from last RNN layer + enc_out, attn_mask = attn_params + bsz, seqlen, hszXnumdir = enc_out.size() + numlayersXnumdir = last_hidden.size(1) + if self.attention == 'local': - if enc_out.size(1) > self.max_length: - offset = enc_out.size(1) - self.max_length - enc_out = enc_out.narrow(1, offset, self.max_length) + # local attention weights aren't based on encoder states h_merged = torch.cat((xes.squeeze(1), last_hidden), 1) attn_weights = F.softmax(self.attn(h_merged), dim=1) - if attn_weights.size(1) > enc_out.size(1): - attn_weights = attn_weights.narrow(1, 0, enc_out.size(1)) + + # adjust state sizes to the fixed window size + if seqlen > self.max_length: + offset = seqlen - self.max_length + enc_out = enc_out.narrow(1, offset, self.max_length) + seqlen = self.max_length + if attn_weights.size(1) > seqlen: + attn_weights = attn_weights.narrow(1, 0, seqlen) else: hid = last_hidden.unsqueeze(1) if self.attention == 'concat': - hid = hid.expand(last_hidden.size(0), - enc_out.size(1), - last_hidden.size(1)) + # concat hidden state and encoder outputs + hid = hid.expand(bsz, seqlen, numlayersXnumdir) h_merged = torch.cat((enc_out, hid), 2) + # then do linear combination of them with activation active = F.tanh(self.attn(h_merged)) attn_w_premask = self.attn_v(active).squeeze(2) elif self.attention == 'dot': - if hid.size(2) != enc_out.size(2): + # dot product between hidden and encoder outputs + if numlayersXnumdir != hszXnumdir: # enc_out has two directions, so double hid hid = torch.cat([hid, hid], 2) - attn_w_premask = ( - torch.bmm(hid, enc_out.transpose(1, 2)).squeeze(1)) + enc_t = enc_out.transpose(1, 2) + attn_w_premask = torch.bmm(hid, enc_t).squeeze(1) elif self.attention == 'general': + # before doing dot product, transform hidden state with linear + # same as dot if linear is identity hid = self.attn(hid) - attn_w_premask = ( - torch.bmm(hid, enc_out.transpose(1, 2)).squeeze(1)) - # calculate activation scores + enc_t = enc_out.transpose(1, 2) + attn_w_premask = torch.bmm(hid, enc_t).squeeze(1) + + # calculate activation scores, apply mask if needed if attn_mask is not None: # remove activation from NULL symbols + import pdb; pdb.set_trace() # is this the best operation? attn_w_premask -= (1 - attn_mask) * 1e20 attn_weights = F.softmax(attn_w_premask, dim=1) + # apply the attention weights to the encoder states attn_applied = torch.bmm(attn_weights.unsqueeze(1), enc_out) + # concatenate the input and encoder states merged = torch.cat((xes.squeeze(1), attn_applied.squeeze(1)), 1) + # combine them with a linear layer and tanh activation output = F.tanh(self.attn_combine(merged).unsqueeze(1)) - return output + print('make sure last_hidden == xes') + import pdb; pdb.set_trace() + # TODO: remove tanh? 
+ + return output, attn_weights diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index 24ae228315c..612cdd11dc6 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -43,8 +43,8 @@ class Seq2seqAgent(TorchAgent): # legacy agent code is located in parlai/legacy_agents VERSION = 1 - @staticmethod - def add_cmdline_args(argparser): + @classmethod + def add_cmdline_args(cls, argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('--init-model', type=str, default=None, @@ -66,7 +66,8 @@ def add_cmdline_args(argparser): help='whether to encode the context with a ' 'bidirectional rnn') agent.add_argument('-att', '--attention', default='none', - choices=['none', 'concat', 'general', 'dot', 'local'], + choices=['none', 'concat', 'general', 'dot', + 'local'], help='Choices: none, concat, general, local. ' 'If set local, also set attention-length. ' '(see arxiv.org/abs/1508.04025)') @@ -108,9 +109,6 @@ def add_cmdline_args(argparser): help='Top k sampling from renormalized softmax in ' 'test/valid time, default 1 means simple ' 'greedy max output') - agent.add_argument('--softmax-layer-bias', type='bool', default=False, - help='Put True if you want to include the bias in ' - 'decoder.e2s layer') agent.add_argument('--seq2seq_version', default=Seq2seqAgent.VERSION) TorchAgent.add_cmdline_args(argparser) Seq2seqAgent.dictionary_class().add_cmdline_args(argparser) From a5b8823421817bce4abc950f3dd4a66c208b786a Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 15:10:37 -0700 Subject: [PATCH 02/12] partial --- parlai/agents/seq2seq/modules.py | 191 +++++++++++++++++++------------ parlai/agents/seq2seq/seq2seq.py | 36 +++--- 2 files changed, 138 insertions(+), 89 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index bf9d49a36d3..3dc904f0657 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -10,12 +10,17 @@ from torch.nn.parameter import Parameter from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence import torch.nn.functional as F -from parlai.core.torch_agent import Beam -from parlai.core.dict import DictionaryAgent -import os def pad(tensor, length, dim=0): + """Pad tensor to a specific length. 
+ + :param tensor: vector to pad + :param length: new length + :param dim: (default 0) dimension to pad + + :returns: padded tensor if the tensor is shorter than length + """ if tensor.size(dim) < length: return torch.cat( [tensor, tensor.new(*tensor.size()[:dim], @@ -26,64 +31,73 @@ def pad(tensor, length, dim=0): return tensor +def opt_to_kwargs(opt): + """Get kwargs for seq2seq from opt.""" + kwargs = {} + for k in ['numlayers', 'dropout', 'bidirectional', 'rnn_class', + 'lookuptable', 'decoder', 'numsoftmax', 'rank_candidates', + 'attention', 'attention_length', 'attention_time']: + if k in opt: + kwargs[k] = opt[k] + return kwargs + + class Seq2seq(nn.Module): + """Sequence to sequence parent module.""" + RNN_OPTS = {'rnn': nn.RNN, 'gru': nn.GRU, 'lstm': nn.LSTM} - def __init__(self, opt, num_features, - padding_idx=0, start_idx=1, end_idx=2, longest_label=1): - super().__init__() - self.opt = opt + def __init__( + self, num_features, embeddingsize, hiddensize, numlayers=2, dropout=0, + bidirectional=False, rnn_class='lstm', lookuptable='unique', + decoder='same', numsoftmax=1, rank_candidates=False, + attention='none', attention_length=48, attention_time='post', + padding_idx=0, start_idx=1, longest_label=1, + ): + """Initialize seq2seq model. - self.attn_type = opt['attention'] + See cmdline args in Seq2seqAgent for description of arguments. + """ + super().__init__() + self.attn_type = attention self.NULL_IDX = padding_idx - self.END_IDX = end_idx self.register_buffer('START', torch.LongTensor([start_idx])) self.longest_label = longest_label - rnn_class = Seq2seq.RNN_OPTS[opt['rnn_class']] + rnn_class = Seq2seq.RNN_OPTS[rnn_class] self.decoder = RNNDecoder( num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - emb_size=opt['embeddingsize'], hidden_size=opt['hiddensize'], - num_layers=opt['numlayers'], dropout=opt['dropout'], - share_output=opt['lookuptable'] in ['dec_out', 'all'], - attn_type=opt['attention'], attn_length=opt['attention_length'], - attn_time=opt.get('attention_time'), - bidir_input=opt['bidirectional'], - numsoftmax=opt.get('numsoftmax', 1)) + embeddingsize=embeddingsize, hiddensize=hiddensize, + numlayers=numlayers, dropout=dropout, + share_output=lookuptable in ['dec_out', 'all'], + attn_type=attention, attn_length=attention_length, + attn_time=attention_time, + bidir_input=bidirectional, + numsoftmax=numsoftmax) shared_lt = (self.decoder.lt - if opt['lookuptable'] in ['enc_dec', 'all'] else None) - shared_rnn = self.decoder.rnn if opt['decoder'] == 'shared' else None + if lookuptable in ['enc_dec', 'all'] else None) + shared_rnn = self.decoder.rnn if decoder == 'shared' else None self.encoder = RNNEncoder( num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - emb_size=opt['embeddingsize'], hidden_size=opt['hiddensize'], - num_layers=opt['numlayers'], dropout=opt['dropout'], - bidirectional=opt['bidirectional'], + embeddingsize=embeddingsize, hiddensize=hiddensize, + num_layers=numlayers, dropout=dropout, + bidirectional=bidirectional, shared_lt=shared_lt, shared_rnn=shared_rnn) - if opt['rank_candidates']: + if rank_candidates: self.ranker = Ranker( self.decoder, self.START, padding_idx=self.NULL_IDX, - attn_type=opt['attention']) + attn_type=attention) - self.beam_log_freq = opt.get('beam_log_freq', 0.0) - if self.beam_log_freq > 0.0: - self.dict = DictionaryAgent(opt) - self.beam_dump_filecnt = 0 - self.beam_dump_path = opt['model_file'] + '.beam_dump' - if not os.path.exists(self.beam_dump_path): - os.makedirs(self.beam_dump_path) - - def 
_encode(self, xs, x_lens=None, prev_enc=None): + def _encode(self, xs, prev_enc=None): if prev_enc is not None: return prev_enc else: - enc_out, hidden = self.encoder(xs, x_lens) - attn_mask = xs.ne(0).float() if self.attn_type != 'none' else None - return hidden, enc_out, attn_mask + return self.encoder(xs) def _rank(self, cand_params, encoder_states): if cand_params is not None: @@ -139,23 +153,37 @@ def _decode(self, encoder_states, maxlen): return scores - def forward(self, xs, x_lens=None, ys=None, cand_params=None, - prev_enc=None, rank_during_training=False, maxlen=None): + def forward(self, xs, ys=None, cand_params=None, prev_enc=None, + maxlen=None): """Get output predictions from the model. - :param xs: (bsz x seqlen) LongTensor input to the encoder - :param x_lens: (list of ints) length of each input sequence - :param ys: expected output from the decoder. used for teacher forcing to calculate loss. - :param cand_params: set of candidates to rank, and indices to match candidates with their appropriate xs - :param prev_enc: if you know you'll pass in the same xs multiple times, you can pass in the encoder output from the last forward pass to skip recalcuating the same encoder output - : + :param xs: (bsz x seqlen) LongTensor input to the encoder + :param ys: expected output from the decoder. used for teacher + forcing to calculate loss. + :param cand_params: set of candidates to rank, and indices to match + candidates with their appropriate xs. + :param prev_enc: if you know you'll pass in the same xs multiple + times, you can pass in the encoder output from the + last forward pass to skip recalcuating the same + encoder output. + :param maxlen: max number of tokens to decode. if not set, will + use the length of the longest label this model + has seen. ignored when ys is not None. + + :returns: scores, candidate scores, and encoder states + scores contains the model's predicted token scores. + (bsz x seqlen x num_features) + candidate scores are the score the model assigned to each candidate + (bsz x num_cands) + encoder states are the (hidden, output, attn_mask) states from the + encoder. feed this back in to skip encoding on the next call. 
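
        A rough usage sketch (the vocabulary, sizes and `opt` here are
        placeholders, not part of this module):

            model = Seq2seq(len(dictionary), 128, 256, **opt_to_kwargs(opt))
            scores, cand_scores, encoder_states = model(xs, ys)
            preds = scores.max(-1)[1]  # greedy token choice at each position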
""" if ys is not None: # keep track of longest label we've ever seen # we'll never produce longer ones than that during prediction self.longest_label = max(self.longest_label, ys.size(1)) - encoder_states = self.encode(xs, x_lens, prev_enc) + encoder_states = self.encode(xs, prev_enc) # rank candidates if they are available cand_scores = self._rank(cand_params, encoder_states) @@ -176,25 +204,26 @@ class RNNEncoder(nn.Module): """RNN Encoder.""" def __init__(self, num_features, padding_idx=0, rnn_class='lstm', - emb_size=128, hidden_size=128, num_layers=2, dropout=0.1, + embeddingsize=128, hiddensize=128, num_layers=2, dropout=0.1, bidirectional=False, shared_lt=None, shared_rnn=None, sparse=False): + """Initialize recurrent encoder.""" super().__init__() self.dropout = nn.Dropout(p=dropout) self.layers = num_layers self.dirs = 2 if bidirectional else 1 - self.hsz = hidden_size + self.hsz = hiddensize if shared_lt is None: - self.lt = nn.Embedding(num_features, emb_size, + self.lt = nn.Embedding(num_features, embeddingsize, padding_idx=padding_idx, sparse=sparse) else: self.lt = shared_lt if shared_rnn is None: - self.rnn = rnn_class(emb_size, hidden_size, num_layers, + self.rnn = rnn_class(embeddingsize, hiddensize, num_layers, dropout=dropout if num_layers > 1 else 0, batch_first=True, bidirectional=bidirectional) elif bidirectional: @@ -202,18 +231,24 @@ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', else: self.rnn = shared_rnn - def forward(self, xs, x_lens=None): + def forward(self, xs): """Encode sequence. :param xs: (bsz x seqlen) LongTensor of input token indices + + :returns: encoder outputs, hidden state, attention mask + encoder outputs are the output state at each step of the encoding. + the hidden state is the final hidden state of the encoder. + the attention mask is a mask of which input values are nonzero. 
""" bsz = len(xs) # embed input tokens xes = self.dropout(self.lt(xs)) + attn_mask = xs.ne(0).float() try: - if x_lens is None: - x_lens = [x for x in torch.sum((xs > 0).int(), dim=1).data] + x_lens = torch.sum(attn_mask.int(), dim=1) + import pdb; pdb.set_trace() xes = pack_padded_sequence(xes, x_lens, batch_first=True) packed = True except ValueError: @@ -232,7 +267,7 @@ def forward(self, xs, x_lens=None): else: hidden = hidden.view(-1, self.dirs, bsz, self.hsz).sum(1) - return encoder_output, hidden + return encoder_output, hidden, attn_mask class RNNDecoder(nn.Module): @@ -242,26 +277,26 @@ class RNNDecoder(nn.Module): """ def __init__(self, num_features, padding_idx=0, rnn_class='lstm', - emb_size=128, hidden_size=128, num_layers=2, dropout=0.1, + embeddingsize=128, hiddensize=128, num_layers=2, dropout=0.1, bidir_input=False, attn_type='none', attn_time='pre', attn_length=-1, sparse=False): super().__init__() self.dropout = nn.Dropout(p=dropout) self.layers = num_layers - self.hsz = hidden_size - self.esz = emb_size + self.hsz = hiddensize + self.esz = embeddingsize - self.lt = nn.Embedding(num_features, emb_size, padding_idx=padding_idx, - sparse=sparse) - self.rnn = rnn_class(emb_size, hidden_size, num_layers, + self.lt = nn.Embedding(num_features, embeddingsize, + padding_idx=padding_idx, sparse=sparse) + self.rnn = rnn_class(embeddingsize, hiddensize, num_layers, dropout=dropout if num_layers > 1 else 0, batch_first=True) self.attn_type = attn_type self.attn_time = attn_time self.attention = AttentionLayer(attn_type=attn_type, - hidden_size=hidden_size, - emb_size=emb_size, + hiddensize=hiddensize, + embeddingsize=embeddingsize, bidirectional=bidir_input, attn_length=attn_length, attn_time=attn_time) @@ -278,10 +313,10 @@ def forward(self, xs, hidden=None, attn_params=None): :returns: output state(s), hidden state. output state of the encoder. for an RNN, this is - (bsz, seq_len, num_directions * hidden_size). + (bsz, seq_len, num_directions * hiddensize). hidden state will be same dimensions as input hidden state. for an RNN, this is a tensor of sizes - (bsz, num_layers * num_directions, hidden_size). + (bsz, num_layers * num_directions, hiddensize). """ # sequence indices => sequence embeddings xes = self.dropout(self.lt(xs)) @@ -303,13 +338,13 @@ def forward(self, xs, hidden=None, attn_params=None): class OutputLayer(nn.Module): """Takes in final states and returns distribution over candidates.""" - def __init__(self, num_features, hidden_size, emb_size, numsoftmax=1, + def __init__(self, num_features, hiddensize, embeddingsize, numsoftmax=1, shared_weight=None): """Initialize output layer. :param num_features: number of candidates to rank - :param hidden_size: (last) dimension of the input vectors - :param emb_size: (last) dimension of the candidate vectors + :param hiddensize: (last) dimension of the input vectors + :param embeddingsize: (last) dimension of the candidate vectors :param num_softmax: (default 1) number of softmaxes to calculate. see arxiv.org/abs/1711.03953 for more info. 
increasing this slows down computation but can @@ -319,7 +354,7 @@ def __init__(self, num_features, hidden_size, emb_size, numsoftmax=1, # embedding to scores if shared_weight is None: # just a regular linear layer - self.e2s = nn.Linear(emb_size, num_features, bias=True) + self.e2s = nn.Linear(embeddingsize, num_features, bias=True) else: # use shared weights and a bias layer instead self.weight = shared_weight @@ -330,14 +365,14 @@ def __init__(self, num_features, hidden_size, emb_size, numsoftmax=1, self.numsoftmax = numsoftmax if numsoftmax > 1: self.softmax = nn.Softmax(dim=1) - self.prior = nn.Linear(hidden_size, numsoftmax, bias=False) - self.latent = nn.Linear(hidden_size, numsoftmax * emb_size) + self.prior = nn.Linear(hiddensize, numsoftmax, bias=False) + self.latent = nn.Linear(hiddensize, numsoftmax * embeddingsize) self.activation = nn.Tanh() else: # rnn output to embedding - if hidden_size != emb_size: + if hiddensize != embeddingsize: # learn projection to correct dimensions - self.o2e = nn.Linear(hidden_size, emb_size, bias=True) + self.o2e = nn.Linear(hiddensize, embeddingsize, bias=True) else: # no need for any transformation here self.o2e = lambda x: x @@ -351,7 +386,7 @@ def reset_parameters(self): def forward(self, input): """Compute scores from inputs. - :param input: (bsz x seq_len x num_directions * hidden_size) tensor of + :param input: (bsz x seq_len x num_directions * hiddensize) tensor of states, e.g. the output states of an RNN :returns: (bsz x seqlen x num_cands) scores for each candidate @@ -388,7 +423,10 @@ def forward(self, input): class Ranker(object): + """Basic ranker which uses average log-prob of sequences for ranking.""" + def __init__(self, decoder, start_token, padding_idx=0, attn_type='none'): + """Intialize ranker.""" super().__init__() self.decoder = decoder self.START = start_token @@ -477,17 +515,18 @@ class AttentionLayer(nn.Module): See arxiv.org/abs/1508.04025 for more info on each attention type. """ - def __init__(self, attn_type, hidden_size, emb_size, bidirectional=False, - attn_length=-1, attn_time='pre'): + def __init__(self, attn_type, hiddensize, embeddingsize, + bidirectional=False, attn_length=-1, attn_time='pre'): + """Initialize attention layer.""" super().__init__() self.attention = attn_type if self.attention != 'none': - hsz = hidden_size + hsz = hiddensize hszXdirs = hsz * (2 if bidirectional else 1) if attn_time == 'pre': # attention happens on the input embeddings - input_dim = emb_size + input_dim = embeddingsize elif attn_time == 'post': # attention happens on the output of the rnn input_dim = hsz diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index 612cdd11dc6..e9bd254feee 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -116,7 +116,7 @@ def add_cmdline_args(cls, argparser): @staticmethod def model_version(): - """Current version of this model, counting up from 0. + """Return current version of this model, counting up from 0. Models are not backwards-compatible with older versions. """ @@ -420,10 +420,16 @@ def receive_metrics(self, metrics_dict): class mydefaultdict(defaultdict): - """Custom defaultdict which overrides defaults requested by the get - function with the default factory. + """Get function also uses default_factory for this defaultdict. + + This makes dict.get() behave like dict[] if a default is not provided. """ + def get(self, key, default=None): + """Return value at key or default if key is not in dict. 
+ + If a default is not provided, return the default factory value. + """ # override default from "get" (like "__getitem__" already is) return super().get(key, default or self.default_factory()) @@ -437,27 +443,33 @@ class PerplexityEvaluatorAgent(Seq2seqAgent): """ def __init__(self, opt, shared=None): + """Initialize evaluator.""" super().__init__(opt, shared) self.prev_enc = None self.last_xs = None def next_word_probability(self, partial_out): - """Return probability distribution over next words given an input and - partial true output. This is used to calculate the per-word perplexity. + """Return probability distribution over next words. + + This probability is based on both nn input and partial true output. + This is used to calculate the per-word perplexity. Arguments: observation -- input observation dict partial_out -- list of previous "true" words - Returns a dict, where each key is a word and each value is a probability - score for that word. Unset keys assume a probability of zero. + Returns a dict, where each key is a word and each value is a + probability score for that word. + Unset keys will use a probability of 1e-7. e.g. {'text': 'Run test program.'}, ['hello'] => {'world': 1.0} """ obs = self.observation xs = obs['text_vec'].unsqueeze(0) - ys = self._vectorize_text(' '.join(partial_out), False, True, self.truncate).unsqueeze(0) + ys = self._vectorize_text( + ' '.join(partial_out), False, True, self.truncate + ).unsqueeze(0) if self.prev_enc is not None and self.last_xs is not None and ( xs.shape[1] != self.last_xs.shape[1] or (xs == self.last_xs).sum().item() != xs.shape[1]): @@ -466,14 +478,12 @@ def next_word_probability(self, partial_out): self.last_xs = xs self.model.eval() - # no need to predict farther ahead - # if you pass in any ys, this will be ignored - self.model.longest_label = 1 out = self.model( xs, ys=(ys if len(partial_out) > 0 else None), - prev_enc=self.prev_enc) - scores, self.prev_enc = out[1], out[4] + prev_enc=self.prev_enc, + maxlen=1) + scores, self.prev_enc = out[0], out[2] # scores is bsz x seqlen x num_words, so select probs of current index probs = F.softmax(scores.select(1, -1), dim=1).squeeze() dist = mydefaultdict(lambda: 1e-7) # default probability for any token From a928f6c8e82b92c0f53a2c16f3c9bc8527aff494 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 15:47:42 -0700 Subject: [PATCH 03/12] move ranker --- parlai/agents/seq2seq/modules.py | 224 +++++++++++++++---------------- 1 file changed, 106 insertions(+), 118 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index 3dc904f0657..a45088323d2 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -5,6 +5,7 @@ # of patent rights can be found in the PATENTS file in the same directory. 
import math + import torch import torch.nn as nn from torch.nn.parameter import Parameter @@ -35,7 +36,7 @@ def opt_to_kwargs(opt): """Get kwargs for seq2seq from opt.""" kwargs = {} for k in ['numlayers', 'dropout', 'bidirectional', 'rnn_class', - 'lookuptable', 'decoder', 'numsoftmax', 'rank_candidates', + 'lookuptable', 'decoder', 'numsoftmax', 'attention', 'attention_length', 'attention_time']: if k in opt: kwargs[k] = opt[k] @@ -50,7 +51,7 @@ class Seq2seq(nn.Module): def __init__( self, num_features, embeddingsize, hiddensize, numlayers=2, dropout=0, bidirectional=False, rnn_class='lstm', lookuptable='unique', - decoder='same', numsoftmax=1, rank_candidates=False, + decoder='same', numsoftmax=1, attention='none', attention_length=48, attention_time='post', padding_idx=0, start_idx=1, longest_label=1, ): @@ -67,31 +68,29 @@ def __init__( rnn_class = Seq2seq.RNN_OPTS[rnn_class] self.decoder = RNNDecoder( - num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - embeddingsize=embeddingsize, hiddensize=hiddensize, + num_features, embeddingsize, hiddensize, + padding_idx=self.NULL_IDX, rnn_class=rnn_class, numlayers=numlayers, dropout=dropout, - share_output=lookuptable in ['dec_out', 'all'], attn_type=attention, attn_length=attention_length, attn_time=attention_time, bidir_input=bidirectional, numsoftmax=numsoftmax) shared_lt = (self.decoder.lt - if lookuptable in ['enc_dec', 'all'] else None) + if lookuptable in ('enc_dec', 'all') else None) shared_rnn = self.decoder.rnn if decoder == 'shared' else None self.encoder = RNNEncoder( - num_features, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - embeddingsize=embeddingsize, hiddensize=hiddensize, + num_features, embeddingsize, hiddensize, + padding_idx=self.NULL_IDX, rnn_class=rnn_class, num_layers=numlayers, dropout=dropout, bidirectional=bidirectional, shared_lt=shared_lt, shared_rnn=shared_rnn) - if rank_candidates: - self.ranker = Ranker( - self.decoder, - self.START, - padding_idx=self.NULL_IDX, - attn_type=attention) + shared_weight = (self.decoder.lt + if lookuptable in ('dec_out', 'all') else None) + self.output = OutputLayer( + num_features, embeddingsize, hiddensize, numsoftmax=numsoftmax, + shared_weight=shared_weight) def _encode(self, xs, prev_enc=None): if prev_enc is not None: @@ -99,11 +98,6 @@ def _encode(self, xs, prev_enc=None): else: return self.encoder(xs) - def _rank(self, cand_params, encoder_states): - if cand_params is not None: - return self.ranker.forward(cand_params, encoder_states) - return None - def _starts(self, bsz): return self.START.detach().expand(bsz, 1) @@ -149,10 +143,90 @@ def _decode(self, encoder_states, maxlen): output, hidden = self.decoder(xs, hidden, attn_params) score = self.output(output) scores.append(score) - xs = scores.max(1)[0] # next input is current predicted output + xs = score.max(1)[0] # next input is current predicted output return scores + def _align_inds(self, encoder_states, cand_inds): + hidden, enc_out, attn_mask = encoder_states + + # LSTM or GRU/RNN hidden state? 
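        # note: nn.LSTM returns its hidden state as an (h, c) tuple, while
        # nn.GRU and nn.RNN return a single tensor, so the tuple case is the
        # one that needs unpacking into (hid, cell)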
+ if isinstance(hidden, torch.Tensor): + hid, cell = hidden + else: + hid, cell = hidden, None + + if len(cand_inds) != hid.size(1): + # if the number of candidates is mismatched from the number of + # hidden states, we throw out the hidden states we won't rank with + cand_indices = hidden.new(cand_inds) + hid = hid.index_select(1, cand_indices) + if cell is None: + hidden = hid + else: + cell = cell.index_select(1, cand_indices) + hidden = (hid, cell) + + if self.attn_type != 'none': + enc_out = enc_out.index_select(0, cand_indices) + attn_mask = attn_mask.index_select(0, cand_indices) + + return hidden, enc_out, attn_mask + + def _extract_cur(self, encoder_states, index, num_cands): + hidden, enc_out, attn_mask = encoder_states + if isinstance(hidden, torch.Tensor): + nl = hidden.size(0) + hsz = hidden.size(-1) + cur_hid = (hidden.select(1, index).unsqueeze(1) + .expand(nl, num_cands, hsz)) + else: + nl = hidden[0].size(0) + hsz = hidden[0].size(-1) + cur_hid = (hidden[0].select(1, index).unsqueeze(1) + .expand(nl, num_cands, hsz).contiguous(), + hidden[1].select(1, index).unsqueeze(1) + .expand(nl, num_cands, hsz).contiguous()) + + if self.attn_type != 'none': + cur_enc = (enc_out[index].unsqueeze(0) + .expand(num_cands, enc_out.size(1), hsz)) + cur_mask = (attn_mask[index].unsqueeze(0) + .expand(num_cands, attn_mask.size(-1))) + return cur_hid, cur_enc, cur_mask + + def _rank(self, cand_params, encoder_states): + """Rank each cand by the average log-probability of the sequence.""" + if cand_params is None: + return None + + cands, cand_inds = cand_params + encoder_states = self._align_inds(encoder_states, cand_inds) + + cand_scores = [] + for batch_idx in range(len(cands)): + # we do one set of candidates at a time + curr_cs = cands[batch_idx] + num_cands = curr_cs.size(0) + + # select just the one hidden state + cur_enc_states = self._extract_cur( + encoder_states, num_cands, batch_idx) + + score = self.decode_forced(curr_cs, cur_enc_states) + true_score = F.log_softmax(score, dim=2).gather( + 2, curr_cs.unsqueeze(2)) + nonzero = curr_cs.ne(0).float() + scores = (true_score.squeeze(2) * nonzero).sum(1) + seqlens = nonzero.sum(1) + scores /= seqlens + cand_scores.append(scores) + + max_len = max(len(c) for c in cand_scores) + cand_scores = torch.cat( + [pad(c, max_len).unsqueeze(0) for c in cand_scores], 0) + return cand_scores + def forward(self, xs, ys=None, cand_params=None, prev_enc=None, maxlen=None): """Get output predictions from the model. @@ -194,17 +268,15 @@ def forward(self, xs, ys=None, cand_params=None, prev_enc=None, else: scores = self._decode(encoder_states, maxlen or self.longest_label) - if isinstance(scores, list): - scores = torch.cat(scores, 1) - + scores = torch.cat(scores, 1) return scores, cand_scores, encoder_states class RNNEncoder(nn.Module): """RNN Encoder.""" - def __init__(self, num_features, padding_idx=0, rnn_class='lstm', - embeddingsize=128, hiddensize=128, num_layers=2, dropout=0.1, + def __init__(self, num_features, embeddingsize, hiddensize, + padding_idx=0, rnn_class='lstm', num_layers=2, dropout=0.1, bidirectional=False, shared_lt=None, shared_rnn=None, sparse=False): """Initialize recurrent encoder.""" @@ -276,10 +348,11 @@ class RNNDecoder(nn.Module): Can be used as a standalone language model or paired with an encoder. 
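
    A rough standalone sketch (vocab_size and the sizes are placeholders,
    and this assumes attn_type='none' so no encoder states are needed):

        decoder = RNNDecoder(vocab_size, 128, 256, rnn_class=nn.LSTM,
                             attn_type='none')
        output_layer = OutputLayer(vocab_size, 128, 256)
        out, hidden = decoder(token_ids)  # out: (bsz x seqlen x hiddensize)
        scores = output_layer(out)        # (bsz x seqlen x vocab_size)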
""" - def __init__(self, num_features, padding_idx=0, rnn_class='lstm', - embeddingsize=128, hiddensize=128, num_layers=2, dropout=0.1, + def __init__(self, num_features, embeddingsize, hiddensize, + padding_idx=0, rnn_class='lstm', num_layers=2, dropout=0.1, bidir_input=False, attn_type='none', attn_time='pre', attn_length=-1, sparse=False): + """Initialize recurrent decoder.""" super().__init__() self.dropout = nn.Dropout(p=dropout) self.layers = num_layers @@ -338,18 +411,20 @@ def forward(self, xs, hidden=None, attn_params=None): class OutputLayer(nn.Module): """Takes in final states and returns distribution over candidates.""" - def __init__(self, num_features, hiddensize, embeddingsize, numsoftmax=1, + def __init__(self, num_features, embeddingsize, hiddensize, numsoftmax=1, shared_weight=None): """Initialize output layer. :param num_features: number of candidates to rank - :param hiddensize: (last) dimension of the input vectors - :param embeddingsize: (last) dimension of the candidate vectors + :param hiddensize: (last) dimension of the input vectors + :param embeddingsize: (last) dimension of the candidate vectors :param num_softmax: (default 1) number of softmaxes to calculate. see arxiv.org/abs/1711.03953 for more info. increasing this slows down computation but can add more expressivity to the embeddings. - :param shared_weight: (num_features x esz) vector of + :param shared_weight: (num_features x esz) vector of weights to use as + the final linear layer's weight matrix. default + None starts with a new linear layer. """ # embedding to scores if shared_weight is None: @@ -422,93 +497,6 @@ def forward(self, input): return scores -class Ranker(object): - """Basic ranker which uses average log-prob of sequences for ranking.""" - - def __init__(self, decoder, start_token, padding_idx=0, attn_type='none'): - """Intialize ranker.""" - super().__init__() - self.decoder = decoder - self.START = start_token - self.NULL_IDX = padding_idx - self.attn_type = attn_type - - def _starts(self, bsz): - return self.START.detach().expand(bsz, 1) - - def forward(self, cand_params, encoder_states): - cands, cand_inds = cand_params - hidden, enc_out, attn_mask = encoder_states - - hid, cell = (hidden, None) if isinstance(hidden, torch.Tensor) else hidden - if len(cand_inds) != hid.size(1): - cand_indices = self.START.detach().new(cand_inds) - hid = hid.index_select(1, cand_indices) - if cell is None: - hidden = hid - else: - cell = cell.index_select(1, cand_indices) - hidden = (hid, cell) - enc_out = enc_out.index_select(0, cand_indices) - if attn_mask is not None: - attn_mask = attn_mask.index_select(0, cand_indices) - - cand_scores = [] - - for i in range(len(cands)): - curr_cs = cands[i] - - n_cs = curr_cs.size(0) - starts = self._starts(n_cs).unsqueeze(1) - scores = 0 - seqlens = 0 - # select just the one hidden state - if isinstance(hidden, torch.Tensor): - nl = hidden.size(0) - hsz = hidden.size(-1) - cur_hid = hidden.select(1, i).unsqueeze(1).expand(nl, n_cs, hsz) - else: - nl = hidden[0].size(0) - hsz = hidden[0].size(-1) - cur_hid = (hidden[0].select(1, i).unsqueeze(1).expand(nl, n_cs, hsz).contiguous(), - hidden[1].select(1, i).unsqueeze(1).expand(nl, n_cs, hsz).contiguous()) - - cur_enc, cur_mask = None, None - if attn_mask is not None: - cur_mask = attn_mask[i].unsqueeze(0).expand(n_cs, attn_mask.size(-1)) - cur_enc = enc_out[i].unsqueeze(0).expand(n_cs, enc_out.size(1), hsz) - # this is pretty much copied from the training forward above - if curr_cs.size(1) > 1: - c_in = 
curr_cs.narrow(1, 0, curr_cs.size(1) - 1) - xs = torch.cat([starts, c_in], 1) - else: - xs, c_in = starts, curr_cs - if self.attn_type == 'none': - preds, score, cur_hid = self.decoder(xs, cur_hid, cur_enc, cur_mask) - true_score = F.log_softmax(score, dim=2).gather( - 2, curr_cs.unsqueeze(2)) - nonzero = curr_cs.ne(0).float() - scores = (true_score.squeeze(2) * nonzero).sum(1) - seqlens = nonzero.sum(1) - else: - for i in range(curr_cs.size(1)): - xi = xs.select(1, i) - ci = curr_cs.select(1, i) - preds, score, cur_hid = self.decoder(xi, cur_hid, cur_enc, cur_mask) - true_score = F.log_softmax(score, dim=2).gather( - 2, ci.unsqueeze(1).unsqueeze(2)) - nonzero = ci.ne(0).float() - scores += true_score.squeeze(2).squeeze(1) * nonzero - seqlens += nonzero - - scores /= seqlens # **len_penalty? - cand_scores.append(scores) - - max_len = max(len(c) for c in cand_scores) - cand_scores = torch.cat([pad(c, max_len).unsqueeze(0) for c in cand_scores], 0) - return cand_scores - - class AttentionLayer(nn.Module): """Computes attention between hidden and encoder states. From 80eff35eca48108bbe995892a4f0d6d9d0940326 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 16:26:16 -0700 Subject: [PATCH 04/12] fix bugs --- parlai/agents/seq2seq/modules.py | 77 +++++++++++++++++--------------- parlai/agents/seq2seq/seq2seq.py | 40 +++++++++-------- parlai/core/torch_agent.py | 2 +- 3 files changed, 62 insertions(+), 57 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index a45088323d2..2cb72cbd3fb 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -13,6 +13,17 @@ import torch.nn.functional as F +def opt_to_kwargs(opt): + """Get kwargs for seq2seq from opt.""" + kwargs = {} + for k in ['numlayers', 'dropout', 'bidirectional', 'rnn_class', + 'lookuptable', 'decoder', 'numsoftmax', + 'attention', 'attention_length', 'attention_time']: + if k in opt: + kwargs[k] = opt[k] + return kwargs + + def pad(tensor, length, dim=0): """Pad tensor to a specific length. 
@@ -32,17 +43,6 @@ def pad(tensor, length, dim=0): return tensor -def opt_to_kwargs(opt): - """Get kwargs for seq2seq from opt.""" - kwargs = {} - for k in ['numlayers', 'dropout', 'bidirectional', 'rnn_class', - 'lookuptable', 'decoder', 'numsoftmax', - 'attention', 'attention_length', 'attention_time']: - if k in opt: - kwargs[k] = opt[k] - return kwargs - - class Seq2seq(nn.Module): """Sequence to sequence parent module.""" @@ -73,8 +73,7 @@ def __init__( numlayers=numlayers, dropout=dropout, attn_type=attention, attn_length=attention_length, attn_time=attention_time, - bidir_input=bidirectional, - numsoftmax=numsoftmax) + bidir_input=bidirectional) shared_lt = (self.decoder.lt if lookuptable in ('enc_dec', 'all') else None) @@ -82,15 +81,15 @@ def __init__( self.encoder = RNNEncoder( num_features, embeddingsize, hiddensize, padding_idx=self.NULL_IDX, rnn_class=rnn_class, - num_layers=numlayers, dropout=dropout, + numlayers=numlayers, dropout=dropout, bidirectional=bidirectional, shared_lt=shared_lt, shared_rnn=shared_rnn) shared_weight = (self.decoder.lt if lookuptable in ('dec_out', 'all') else None) self.output = OutputLayer( - num_features, embeddingsize, hiddensize, numsoftmax=numsoftmax, - shared_weight=shared_weight) + num_features, embeddingsize, hiddensize, dropout=dropout, + numsoftmax=numsoftmax, shared_weight=shared_weight) def _encode(self, xs, prev_enc=None): if prev_enc is not None: @@ -133,7 +132,7 @@ def _decode_forced(self, ys, encoder_states): def _decode(self, encoder_states, maxlen): hidden = encoder_states[0] attn_params = (encoder_states[1], encoder_states[2]) - bsz = hidden.size(0) + bsz = encoder_states[1].size(0) xs = self._starts(bsz) # input start token @@ -143,7 +142,7 @@ def _decode(self, encoder_states, maxlen): output, hidden = self.decoder(xs, hidden, attn_params) score = self.output(output) scores.append(score) - xs = score.max(1)[0] # next input is current predicted output + xs = score.max(1)[1] # next input is current predicted output return scores @@ -152,9 +151,9 @@ def _align_inds(self, encoder_states, cand_inds): # LSTM or GRU/RNN hidden state? 
if isinstance(hidden, torch.Tensor): - hid, cell = hidden - else: hid, cell = hidden, None + else: + hid, cell = hidden if len(cand_inds) != hid.size(1): # if the number of candidates is mismatched from the number of @@ -201,6 +200,8 @@ def _rank(self, cand_params, encoder_states): return None cands, cand_inds = cand_params + if cands is None: + return None encoder_states = self._align_inds(encoder_states, cand_inds) cand_scores = [] @@ -257,7 +258,7 @@ def forward(self, xs, ys=None, cand_params=None, prev_enc=None, # we'll never produce longer ones than that during prediction self.longest_label = max(self.longest_label, ys.size(1)) - encoder_states = self.encode(xs, prev_enc) + encoder_states = self._encode(xs, prev_enc) # rank candidates if they are available cand_scores = self._rank(cand_params, encoder_states) @@ -276,14 +277,14 @@ class RNNEncoder(nn.Module): """RNN Encoder.""" def __init__(self, num_features, embeddingsize, hiddensize, - padding_idx=0, rnn_class='lstm', num_layers=2, dropout=0.1, + padding_idx=0, rnn_class='lstm', numlayers=2, dropout=0.1, bidirectional=False, shared_lt=None, shared_rnn=None, sparse=False): """Initialize recurrent encoder.""" super().__init__() self.dropout = nn.Dropout(p=dropout) - self.layers = num_layers + self.layers = numlayers self.dirs = 2 if bidirectional else 1 self.hsz = hiddensize @@ -295,8 +296,8 @@ def __init__(self, num_features, embeddingsize, hiddensize, self.lt = shared_lt if shared_rnn is None: - self.rnn = rnn_class(embeddingsize, hiddensize, num_layers, - dropout=dropout if num_layers > 1 else 0, + self.rnn = rnn_class(embeddingsize, hiddensize, numlayers, + dropout=dropout if numlayers > 1 else 0, batch_first=True, bidirectional=bidirectional) elif bidirectional: raise RuntimeError('Cannot share decoder with bidir encoder.') @@ -320,7 +321,6 @@ def forward(self, xs): attn_mask = xs.ne(0).float() try: x_lens = torch.sum(attn_mask.int(), dim=1) - import pdb; pdb.set_trace() xes = pack_padded_sequence(xes, x_lens, batch_first=True) packed = True except ValueError: @@ -339,7 +339,7 @@ def forward(self, xs): else: hidden = hidden.view(-1, self.dirs, bsz, self.hsz).sum(1) - return encoder_output, hidden, attn_mask + return hidden, encoder_output, attn_mask class RNNDecoder(nn.Module): @@ -349,20 +349,20 @@ class RNNDecoder(nn.Module): """ def __init__(self, num_features, embeddingsize, hiddensize, - padding_idx=0, rnn_class='lstm', num_layers=2, dropout=0.1, + padding_idx=0, rnn_class='lstm', numlayers=2, dropout=0.1, bidir_input=False, attn_type='none', attn_time='pre', attn_length=-1, sparse=False): """Initialize recurrent decoder.""" super().__init__() self.dropout = nn.Dropout(p=dropout) - self.layers = num_layers + self.layers = numlayers self.hsz = hiddensize self.esz = embeddingsize self.lt = nn.Embedding(num_features, embeddingsize, padding_idx=padding_idx, sparse=sparse) - self.rnn = rnn_class(embeddingsize, hiddensize, num_layers, - dropout=dropout if num_layers > 1 else 0, + self.rnn = rnn_class(embeddingsize, hiddensize, numlayers, + dropout=dropout if numlayers > 1 else 0, batch_first=True) self.attn_type = attn_type @@ -389,7 +389,7 @@ def forward(self, xs, hidden=None, attn_params=None): (bsz, seq_len, num_directions * hiddensize). hidden state will be same dimensions as input hidden state. for an RNN, this is a tensor of sizes - (bsz, num_layers * num_directions, hiddensize). + (bsz, numlayers * num_directions, hiddensize). 
""" # sequence indices => sequence embeddings xes = self.dropout(self.lt(xs)) @@ -411,14 +411,14 @@ def forward(self, xs, hidden=None, attn_params=None): class OutputLayer(nn.Module): """Takes in final states and returns distribution over candidates.""" - def __init__(self, num_features, embeddingsize, hiddensize, numsoftmax=1, - shared_weight=None): + def __init__(self, num_features, embeddingsize, hiddensize, dropout=0, + numsoftmax=1, shared_weight=None): """Initialize output layer. :param num_features: number of candidates to rank :param hiddensize: (last) dimension of the input vectors :param embeddingsize: (last) dimension of the candidate vectors - :param num_softmax: (default 1) number of softmaxes to calculate. + :param numsoftmax: (default 1) number of softmaxes to calculate. see arxiv.org/abs/1711.03953 for more info. increasing this slows down computation but can add more expressivity to the embeddings. @@ -426,6 +426,9 @@ def __init__(self, num_features, embeddingsize, hiddensize, numsoftmax=1, the final linear layer's weight matrix. default None starts with a new linear layer. """ + super().__init__() + self.dropout = nn.Dropout(p=dropout) + # embedding to scores if shared_weight is None: # just a regular linear layer @@ -472,14 +475,14 @@ def forward(self, input): seqlen = input.size(1) if input.dim() > 1 else 1 # first compute different softmax scores based on input vec - # hsz => num_softmax * esz + # hsz => numsoftmax * esz latent = self.latent(input) active = self.dropout(self.activation(latent)) # esz => num_features logit = self.e2s(active.view(-1, self.esz)) # calculate priors: distribution over which softmax scores to use - # hsz => num_softmax + # hsz => numsoftmax prior_logit = self.prior(input).view(-1, self.numsoftmax) # softmax over numsoftmax's prior = self.softmax(prior_logit) diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index e9bd254feee..f537f09427e 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -7,7 +7,7 @@ from parlai.core.torch_agent import TorchAgent, Output from parlai.core.utils import padded_tensor, round_sigfigs from parlai.core.thread_utils import SharedTable -from .modules import Seq2seq +from .modules import Seq2seq, opt_to_kwargs import torch import torch.nn as nn @@ -187,15 +187,15 @@ def __init__(self, opt, shared=None): self.reset() def _init_model(self, states=None): + """Initialize model, override to change model setup.""" opt = self.opt - if not hasattr(self, 'model_class'): - # this allows child classes to override this but inherit init - self.model_class = Seq2seq - self.model = self.model_class( - opt, len(self.dict), padding_idx=self.NULL_IDX, - start_idx=self.START_IDX, end_idx=self.END_IDX, - longest_label=states.get('longest_label', 1)) + kwargs = opt_to_kwargs(opt) + self.model = Seq2seq( + len(self.dict), opt['embeddingsize'], opt['hiddensize'], + padding_idx=self.NULL_IDX, start_idx=self.START_IDX, + longest_label=states.get('longest_label', 1), + **kwargs) if (opt.get('dict_tokenizer') == 'bpe' and opt['embedding_type'] != 'random'): @@ -223,6 +223,8 @@ def _init_model(self, states=None): if opt['lookuptable'] in ['dec_out', 'all']: self.model.decoder.e2s.weight.requires_grad = False + return self.model + def _v2t(self, vec): """Convert token indices to string of tokens.""" new_vec = [] @@ -310,12 +312,12 @@ def train_step(self, batch): self.truncate or 180) self.model.train() self.zero_grad() - preds = None try: out = self.model(batch.text_vec, 
batch.label_vec) # generated response - preds, scores = out[0], out[1] + scores = out[0] + preds = scores.max(2)[1] score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, batch.label_vec.view(-1)) @@ -359,16 +361,15 @@ def _pick_cands(self, cand_preds, cand_inds, cands): def eval_step(self, batch): """Evaluate a single batch of examples.""" self.model.eval() - cands, cand_inds = self._build_cands(batch) - out = self.model(batch.text_vec, ys=None, - cands=cands, cand_indices=cand_inds, - beam_size=self.beam_size, topk=self.topk) - preds, cand_preds = out[0], out[2] + cand_params = self._build_cands(batch) + out = self.model(batch.text_vec, ys=None, cand_params=cand_params) + scores, cand_scores = out[0], out[1] if batch.label_vec is not None: # calculate loss on targets out = self.model(batch.text_vec, batch.label_vec) - f_preds, scores = out[0], out[1] + scores = out[0] + f_preds = scores.max(2)[1] # forced preds score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, batch.label_vec.view(-1)) # save loss to metrics @@ -380,12 +381,13 @@ def eval_step(self, batch): self.metrics['num_tokens'] += target_tokens cand_choices = None - if cand_preds is not None: + if cand_scores is not None: + cand_preds = cand_scores.sort(1, True)[1] # now select the text of the cands based on their scores - cand_choices = self._pick_cands(cand_preds, cand_inds, + cand_choices = self._pick_cands(cand_preds, cand_params[1], batch.candidates) - text = [self._v2t(p) for p in preds] + text = [self._v2t(p) for p in scores.max(2)[1].cpu()] return Output(text, cand_choices) def save(self, path=None): diff --git a/parlai/core/torch_agent.py b/parlai/core/torch_agent.py index 11ade93e53d..17b5281a634 100644 --- a/parlai/core/torch_agent.py +++ b/parlai/core/torch_agent.py @@ -775,7 +775,7 @@ def _init_cuda_buffer(self, model, criterion, batchsize, maxlen): try: print('preinitializing pytorch cuda buffer') dummy = torch.ones(batchsize, maxlen).long().cuda() - sc = model(dummy, dummy)[1] + sc = model(dummy, dummy)[0] loss = criterion(sc.view(-1, sc.size(-1)), dummy.view(-1)) loss.backward() self.buffer_initialized = True From 589f0dfd3604e605a721d94a4abd05c9cc0b7ff6 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 16:32:39 -0700 Subject: [PATCH 05/12] more bug fixes --- parlai/agents/seq2seq/modules.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index 2cb72cbd3fb..e6e8bd65cc0 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -92,15 +92,18 @@ def __init__( numsoftmax=numsoftmax, shared_weight=shared_weight) def _encode(self, xs, prev_enc=None): + """Encode the input or return cached encoder state.""" if prev_enc is not None: return prev_enc else: return self.encoder(xs) def _starts(self, bsz): + """Return bsz start tokens.""" return self.START.detach().expand(bsz, 1) def _decode_forced(self, ys, encoder_states): + """Decode with teacher forcing.""" bsz = ys.size(0) seqlen = ys.size(1) @@ -127,9 +130,11 @@ def _decode_forced(self, ys, encoder_states): score = self.output(output) scores.append(score) + scores = torch.cat(scores, 1) return scores def _decode(self, encoder_states, maxlen): + """Decode maxlen tokens.""" hidden = encoder_states[0] attn_params = (encoder_states[1], encoder_states[2]) bsz = encoder_states[1].size(0) @@ -144,9 +149,11 @@ def _decode(self, encoder_states, maxlen): scores.append(score) xs 
= score.max(1)[1] # next input is current predicted output + scores = torch.cat(scores, 1) return scores def _align_inds(self, encoder_states, cand_inds): + """Select the encoder states relevant to valid candidates.""" hidden, enc_out, attn_mask = encoder_states # LSTM or GRU/RNN hidden state? @@ -173,6 +180,7 @@ def _align_inds(self, encoder_states, cand_inds): return hidden, enc_out, attn_mask def _extract_cur(self, encoder_states, index, num_cands): + """Extract encoder states at current index and expand them.""" hidden, enc_out, attn_mask = encoder_states if isinstance(hidden, torch.Tensor): nl = hidden.size(0) @@ -187,6 +195,7 @@ def _extract_cur(self, encoder_states, index, num_cands): hidden[1].select(1, index).unsqueeze(1) .expand(nl, num_cands, hsz).contiguous()) + cur_enc, cur_mask = None, None if self.attn_type != 'none': cur_enc = (enc_out[index].unsqueeze(0) .expand(num_cands, enc_out.size(1), hsz)) @@ -212,9 +221,9 @@ def _rank(self, cand_params, encoder_states): # select just the one hidden state cur_enc_states = self._extract_cur( - encoder_states, num_cands, batch_idx) + encoder_states, batch_idx, num_cands) - score = self.decode_forced(curr_cs, cur_enc_states) + score = self._decode_forced(curr_cs, cur_enc_states) true_score = F.log_softmax(score, dim=2).gather( 2, curr_cs.unsqueeze(2)) nonzero = curr_cs.ne(0).float() @@ -269,7 +278,6 @@ def forward(self, xs, ys=None, cand_params=None, prev_enc=None, else: scores = self._decode(encoder_states, maxlen or self.longest_label) - scores = torch.cat(scores, 1) return scores, cand_scores, encoder_states From 1e76d0d0bac1b9a574e30287976e2e8108e85522 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Tue, 28 Aug 2018 16:45:55 -0700 Subject: [PATCH 06/12] bug fixes for convai2 --- parlai/agents/seq2seq/modules.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index e6e8bd65cc0..c82a428ae3b 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -125,7 +125,7 @@ def _decode_forced(self, ys, encoder_states): # TODO: do we need to do this? actually shouldn't need to since we # don't do input feeding for i in range(seqlen): - xi = xs.select(1, i) + xi = xs.select(1, i).unsqueeze(1) output, hidden = self.decoder(xi, hidden, attn_params) score = self.output(output) scores.append(score) @@ -404,14 +404,14 @@ def forward(self, xs, hidden=None, attn_params=None): if self.attn_time == 'pre': # modify input vectors with attention - xes = self.attention(xes, hidden, attn_params) + xes, _attw = self.attention(xes, hidden, attn_params) # feed tokens into rnn output, new_hidden = self.rnn(xes, hidden) if self.attn_time == 'post': # modify output vectors with attention - output = self.attention(output, new_hidden, attn_params) + output, _attw = self.attention(output, new_hidden, attn_params) return output, new_hidden @@ -617,7 +617,7 @@ def forward(self, xes, hidden, attn_params): # calculate activation scores, apply mask if needed if attn_mask is not None: # remove activation from NULL symbols - import pdb; pdb.set_trace() # is this the best operation? + # TODO: is this the best operation? 
attn_w_premask -= (1 - attn_mask) * 1e20 attn_weights = F.softmax(attn_w_premask, dim=1) @@ -627,8 +627,5 @@ def forward(self, xes, hidden, attn_params): merged = torch.cat((xes.squeeze(1), attn_applied.squeeze(1)), 1) # combine them with a linear layer and tanh activation output = F.tanh(self.attn_combine(merged).unsqueeze(1)) - print('make sure last_hidden == xes') - import pdb; pdb.set_trace() - # TODO: remove tanh? return output, attn_weights From 9df6d0e227a561f5df17dce29c5342d1f598ab23 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 07:38:01 -0700 Subject: [PATCH 07/12] bug fix --- parlai/agents/seq2seq/modules.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index c82a428ae3b..4e6d3f6ad38 100644 --- a/parlai/agents/seq2seq/modules.py +++ b/parlai/agents/seq2seq/modules.py @@ -147,7 +147,7 @@ def _decode(self, encoder_states, maxlen): output, hidden = self.decoder(xs, hidden, attn_params) score = self.output(output) scores.append(score) - xs = score.max(1)[1] # next input is current predicted output + xs = score.max(2)[1] # next input is current predicted output scores = torch.cat(scores, 1) return scores From 1dc9594ef0d0b6636fbb02666d6695013ebc9465 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 07:47:01 -0700 Subject: [PATCH 08/12] version --- parlai/agents/seq2seq/seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index f537f09427e..dec2c150d74 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -38,7 +38,7 @@ class Seq2seqAgent(TorchAgent): `(Luong et al. 2015) `_ """ - # version 1 split from version 0 on Aug 24, 2018 + # version 1 split from version 0 on Aug 29, 2018 # to use version 0, use --model legacy:seq2seq:0 # legacy agent code is located in parlai/legacy_agents VERSION = 1 From 2e1efddfb5091980a992917618ce09b29f1792b7 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 07:50:47 -0700 Subject: [PATCH 09/12] lint --- parlai/agents/seq2seq/seq2seq.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index dec2c150d74..76de2902af5 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -48,7 +48,7 @@ def add_cmdline_args(cls, argparser): """Add command-line arguments specifically for this agent.""" agent = argparser.add_argument_group('Seq2Seq Arguments') agent.add_argument('--init-model', type=str, default=None, - help='load dict/features/weights/opts from this file') + help='load dict/model/opts from this path') agent.add_argument('-hs', '--hiddensize', type=int, default=128, help='size of the hidden layers') agent.add_argument('-esz', '--embeddingsize', type=int, default=128, From 4d8f1035778e7d5d2e42541873e203fd0900d580 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 09:27:17 -0700 Subject: [PATCH 10/12] stephen suggestions, bug fix, move cuda buffer back to s2s --- parlai/agents/seq2seq/modules.py | 36 +++++++++++++------------- parlai/agents/seq2seq/seq2seq.py | 44 ++++++++++++++++++++++---------- parlai/core/torch_agent.py | 19 -------------- 3 files changed, 49 insertions(+), 50 deletions(-) diff --git a/parlai/agents/seq2seq/modules.py b/parlai/agents/seq2seq/modules.py index 4e6d3f6ad38..7d51c73a2a2 100644 --- a/parlai/agents/seq2seq/modules.py +++ 
b/parlai/agents/seq2seq/modules.py @@ -24,7 +24,7 @@ def opt_to_kwargs(opt): return kwargs -def pad(tensor, length, dim=0): +def pad(tensor, length, dim=0, pad=0): """Pad tensor to a specific length. :param tensor: vector to pad @@ -37,7 +37,7 @@ def pad(tensor, length, dim=0): return torch.cat( [tensor, tensor.new(*tensor.size()[:dim], length - tensor.size(dim), - *tensor.size()[dim + 1:]).zero_()], + *tensor.size()[dim + 1:]).fill_(pad)], dim=dim) else: return tensor @@ -107,8 +107,8 @@ def _decode_forced(self, ys, encoder_states): bsz = ys.size(0) seqlen = ys.size(1) - hidden = encoder_states[0] - attn_params = (encoder_states[1], encoder_states[2]) + hidden = encoder_states[1] + attn_params = (encoder_states[0], encoder_states[2]) # input to model is START + each target except the last y_in = ys.narrow(1, 0, seqlen - 1) @@ -135,9 +135,9 @@ def _decode_forced(self, ys, encoder_states): def _decode(self, encoder_states, maxlen): """Decode maxlen tokens.""" - hidden = encoder_states[0] - attn_params = (encoder_states[1], encoder_states[2]) - bsz = encoder_states[1].size(0) + hidden = encoder_states[1] + attn_params = (encoder_states[0], encoder_states[2]) + bsz = encoder_states[0].size(0) xs = self._starts(bsz) # input start token @@ -154,7 +154,7 @@ def _decode(self, encoder_states, maxlen): def _align_inds(self, encoder_states, cand_inds): """Select the encoder states relevant to valid candidates.""" - hidden, enc_out, attn_mask = encoder_states + enc_out, hidden, attn_mask = encoder_states # LSTM or GRU/RNN hidden state? if isinstance(hidden, torch.Tensor): @@ -177,11 +177,11 @@ def _align_inds(self, encoder_states, cand_inds): enc_out = enc_out.index_select(0, cand_indices) attn_mask = attn_mask.index_select(0, cand_indices) - return hidden, enc_out, attn_mask + return enc_out, hidden, attn_mask def _extract_cur(self, encoder_states, index, num_cands): """Extract encoder states at current index and expand them.""" - hidden, enc_out, attn_mask = encoder_states + enc_out, hidden, attn_mask = encoder_states if isinstance(hidden, torch.Tensor): nl = hidden.size(0) hsz = hidden.size(-1) @@ -201,7 +201,7 @@ def _extract_cur(self, encoder_states, index, num_cands): .expand(num_cands, enc_out.size(1), hsz)) cur_mask = (attn_mask[index].unsqueeze(0) .expand(num_cands, attn_mask.size(-1))) - return cur_hid, cur_enc, cur_mask + return cur_enc, cur_hid, cur_mask def _rank(self, cand_params, encoder_states): """Rank each cand by the average log-probability of the sequence.""" @@ -234,7 +234,8 @@ def _rank(self, cand_params, encoder_states): max_len = max(len(c) for c in cand_scores) cand_scores = torch.cat( - [pad(c, max_len).unsqueeze(0) for c in cand_scores], 0) + [pad(c, max_len, pad=self.NULL_IDX).unsqueeze(0) + for c in cand_scores], 0) return cand_scores def forward(self, xs, ys=None, cand_params=None, prev_enc=None, @@ -259,7 +260,7 @@ def forward(self, xs, ys=None, cand_params=None, prev_enc=None, (bsz x seqlen x num_features) candidate scores are the score the model assigned to each candidate (bsz x num_cands) - encoder states are the (hidden, output, attn_mask) states from the + encoder states are the (output, hidden, attn_mask) states from the encoder. feed this back in to skip encoding on the next call. 
""" if ys is not None: @@ -326,7 +327,7 @@ def forward(self, xs): # embed input tokens xes = self.dropout(self.lt(xs)) - attn_mask = xs.ne(0).float() + attn_mask = xs.ne(0) try: x_lens = torch.sum(attn_mask.int(), dim=1) xes = pack_padded_sequence(xes, x_lens, batch_first=True) @@ -347,7 +348,7 @@ def forward(self, xs): else: hidden = hidden.view(-1, self.dirs, bsz, self.hsz).sum(1) - return hidden, encoder_output, attn_mask + return encoder_output, hidden, attn_mask class RNNDecoder(nn.Module): @@ -568,7 +569,7 @@ def forward(self, xes, hidden, attn_params): """ if self.attention == 'none': # do nothing, no attention - return xes + return xes, None if type(hidden) == tuple: # for lstms use the "hidden" state not the cell state @@ -617,8 +618,7 @@ def forward(self, xes, hidden, attn_params): # calculate activation scores, apply mask if needed if attn_mask is not None: # remove activation from NULL symbols - # TODO: is this the best operation? - attn_w_premask -= (1 - attn_mask) * 1e20 + attn_w_premask.masked_fill_((1 - attn_mask), -1e20) attn_weights = F.softmax(attn_w_premask, dim=1) # apply the attention weights to the encoder states diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index 76de2902af5..5e39e775030 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -38,11 +38,6 @@ class Seq2seqAgent(TorchAgent): `(Luong et al. 2015) `_ """ - # version 1 split from version 0 on Aug 29, 2018 - # to use version 0, use --model legacy:seq2seq:0 - # legacy agent code is located in parlai/legacy_agents - VERSION = 1 - @classmethod def add_cmdline_args(cls, argparser): """Add command-line arguments specifically for this agent.""" @@ -109,7 +104,6 @@ def add_cmdline_args(cls, argparser): help='Top k sampling from renormalized softmax in ' 'test/valid time, default 1 means simple ' 'greedy max output') - agent.add_argument('--seq2seq_version', default=Seq2seqAgent.VERSION) TorchAgent.add_cmdline_args(argparser) Seq2seqAgent.dictionary_class().add_cmdline_args(argparser) return agent @@ -119,6 +113,9 @@ def model_version(): """Return current version of this model, counting up from 0. Models are not backwards-compatible with older versions. + Version 1 split from version 0 on Aug 29, 2018. + To use version 0, use --model legacy:seq2seq:0 + (legacy agent code is located in parlai/legacy_agents). 
""" return 1 @@ -304,6 +301,26 @@ def batchify(self, *args, **kwargs): kwargs['sort'] = True # need sorted for pack_padded return super().batchify(*args, **kwargs) + def _init_cuda_buffer(self, model, criterion, batchsize, maxlen): + """Pre-initialize CUDA buffer by doing fake forward pass.""" + if self.use_cuda and not hasattr(self, 'buffer_initialized'): + try: + print('preinitializing pytorch cuda buffer') + dummy = torch.ones(batchsize, maxlen).long().cuda() + out = model(dummy, dummy) + sc = out[0] # scores + loss = criterion(sc.view(-1, sc.size(-1)), dummy.view(-1)) + loss.backward() + self.buffer_initialized = True + except RuntimeError as e: + if 'out of memory' in str(e): + m = ('CUDA OOM: Lower batch size (-bs) from {} or lower ' + ' max sequence length (-tr) from {}' + ''.format(batchsize, maxlen)) + raise RuntimeError(m) + else: + raise e + def train_step(self, batch): """Train on a single batch of examples.""" batchsize = batch.text_vec.size(0) @@ -317,7 +334,7 @@ def train_step(self, batch): # generated response scores = out[0] - preds = scores.max(2)[1] + _, preds = scores.max(2) score_view = scores.view(-1, scores.size(-1)) loss = self.criterion(score_view, batch.label_vec.view(-1)) @@ -364,13 +381,14 @@ def eval_step(self, batch): cand_params = self._build_cands(batch) out = self.model(batch.text_vec, ys=None, cand_params=cand_params) scores, cand_scores = out[0], out[1] + _, preds = scores.max(2) if batch.label_vec is not None: - # calculate loss on targets + # calculate loss on targets with teacher forcing out = self.model(batch.text_vec, batch.label_vec) - scores = out[0] - f_preds = scores.max(2)[1] # forced preds - score_view = scores.view(-1, scores.size(-1)) + f_scores = out[0] # forced scores + _, f_preds = f_scores.max(2) # forced preds + score_view = f_scores.view(-1, f_scores.size(-1)) loss = self.criterion(score_view, batch.label_vec.view(-1)) # save loss to metrics notnull = batch.label_vec.ne(self.NULL_IDX) @@ -387,7 +405,7 @@ def eval_step(self, batch): cand_choices = self._pick_cands(cand_preds, cand_params[1], batch.candidates) - text = [self._v2t(p) for p in scores.max(2)[1].cpu()] + text = [self._v2t(p) for p in preds.cpu()] return Output(text, cand_choices) def save(self, path=None): @@ -407,7 +425,7 @@ def save(self, path=None): # save opt file with open(path + ".opt", 'wb') as handle: # save version string - self.opt['model_version'] = Seq2seqAgent.VERSION + self.opt['model_version'] = self.model_version() pickle.dump(self.opt, handle, protocol=pickle.HIGHEST_PROTOCOL) def load(self, path): diff --git a/parlai/core/torch_agent.py b/parlai/core/torch_agent.py index 17b5281a634..d811ae6d2ef 100644 --- a/parlai/core/torch_agent.py +++ b/parlai/core/torch_agent.py @@ -769,25 +769,6 @@ def act(self): """Call batch_act with the singleton batch.""" return self.batch_act([self.observation])[0] - def _init_cuda_buffer(self, model, criterion, batchsize, maxlen): - """Pre-initialize CUDA buffer by doing fake forward pass.""" - if self.use_cuda and not hasattr(self, 'buffer_initialized'): - try: - print('preinitializing pytorch cuda buffer') - dummy = torch.ones(batchsize, maxlen).long().cuda() - sc = model(dummy, dummy)[0] - loss = criterion(sc.view(-1, sc.size(-1)), dummy.view(-1)) - loss.backward() - self.buffer_initialized = True - except RuntimeError as e: - if 'out of memory' in str(e): - m = ('CUDA OOM: Lower batch size (-bs) from {} or lower ' - ' max sequence length (-tr) from {}' - ''.format(batchsize, maxlen)) - raise RuntimeError(m) - else: - raise e 
- def batch_act(self, observations): """Process a batch of observations (batchsize list of message dicts). From 98f0ebaeb8f8d0ef145b931b4bae4736541835f2 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 10:54:17 -0700 Subject: [PATCH 11/12] flake fixes --- parlai/agents/seq2seq/seq2seq.py | 8 +- parlai/core/torch_agent.py | 122 +++++++++++++++++++------------ 2 files changed, 81 insertions(+), 49 deletions(-) diff --git a/parlai/agents/seq2seq/seq2seq.py b/parlai/agents/seq2seq/seq2seq.py index 5e39e775030..1e387caaa23 100644 --- a/parlai/agents/seq2seq/seq2seq.py +++ b/parlai/agents/seq2seq/seq2seq.py @@ -133,8 +133,8 @@ def __init__(self, opt, shared=None): if init_model is not None: # if we are loading a model, should load its dict too - if (os.path.isfile(init_model + '.dict') - or opt['dict_file'] is None): + if (os.path.isfile(init_model + '.dict') or + opt['dict_file'] is None): opt['dict_file'] = init_model + '.dict' super().__init__(opt, shared) opt = self.opt @@ -194,8 +194,8 @@ def _init_model(self, states=None): longest_label=states.get('longest_label', 1), **kwargs) - if (opt.get('dict_tokenizer') == 'bpe' - and opt['embedding_type'] != 'random'): + if (opt.get('dict_tokenizer') == 'bpe' and + opt['embedding_type'] != 'random'): print('skipping preinitialization of embeddings for bpe') elif not states and opt['embedding_type'] != 'random': # `not states`: only set up embeddings if not loading model diff --git a/parlai/core/torch_agent.py b/parlai/core/torch_agent.py index d811ae6d2ef..c60bc0b2bb1 100644 --- a/parlai/core/torch_agent.py +++ b/parlai/core/torch_agent.py @@ -844,10 +844,12 @@ def __init__(self, beam_size, min_length=3, padding_token=0, bos_token=1, # backtracking id to hypothesis at previous time step self.bookkeep = [] # output tokens at each time step - self.outputs = [torch.Tensor(self.beam_size).long().fill_(padding_token).to(self.device)] + self.outputs = [torch.Tensor(self.beam_size).long() + .fill_(padding_token).to(self.device)] # keeps tuples (score, time_step, hyp_id) self.finished = [] - self.HypothesisTail = namedtuple('HypothesisTail', ['timestep', 'hypid', 'score', 'tokenid']) + self.HypothesisTail = namedtuple( + 'HypothesisTail', ['timestep', 'hypid', 'score', 'tokenid']) self.eos_top = False self.eos_top_ts = None self.n_best_counter = 0 @@ -866,9 +868,10 @@ def advance(self, softmax_probs): # hypos are the same initially beam_scores = softmax_probs[0] else: - # we need to sum up hypo scores and current softmax scores before topk + # we need to sum up hypo scores and curr softmax scores before topk # [beam_size, voc_size] - beam_scores = softmax_probs + self.scores.unsqueeze(1).expand_as(softmax_probs) + beam_scores = (softmax_probs + + self.scores.unsqueeze(1).expand_as(softmax_probs)) for i in range(self.outputs[-1].size(0)): # if previous output hypo token had eos # we penalize those word probs to never be chosen @@ -878,12 +881,15 @@ def advance(self, softmax_probs): flatten_beam_scores = beam_scores.view(-1) # [beam_size * voc_size] with torch.no_grad(): - best_scores, best_idxs = torch.topk(flatten_beam_scores, self.beam_size, dim=-1) + best_scores, best_idxs = torch.topk( + flatten_beam_scores, self.beam_size, dim=-1) self.scores = best_scores self.all_scores.append(self.scores) - hyp_ids = best_idxs / voc_size # get the backtracking hypothesis id as a multiple of full voc_sizes - tok_ids = best_idxs % voc_size # get the actual word id from residual of the same division + # get the backtracking hypothesis id as a 
multiple of full voc_sizes + hyp_ids = best_idxs / voc_size + # get the actual word id from residual of the same division + tok_ids = best_idxs % voc_size self.outputs.append(tok_ids) self.bookkeep.append(hyp_ids) @@ -892,7 +898,9 @@ def advance(self, softmax_probs): for hypid in range(self.beam_size): if self.outputs[-1][hypid] == self.eos: # this is finished hypo, adding to finished - eostail = self.HypothesisTail(timestep=len(self.outputs) - 1, hypid=hypid, score=self.scores[hypid], + eostail = self.HypothesisTail(timestep=len(self.outputs) - 1, + hypid=hypid, + score=self.scores[hypid], tokenid=self.eos) self.finished.append(eostail) self.n_best_counter += 1 @@ -906,27 +914,30 @@ def done(self): return self.eos_top and self.n_best_counter >= self.min_n_best def get_top_hyp(self): - """ - Helper function to get single best hypothesis + """Get single best hypothesis. + :return: hypothesis sequence and the final score """ top_hypothesis_tail = self.get_rescored_finished(n_best=1)[0] - return self.get_hyp_from_finished(top_hypothesis_tail), top_hypothesis_tail.score + return (self.get_hyp_from_finished(top_hypothesis_tail), + top_hypothesis_tail.score) def get_hyp_from_finished(self, hypothesis_tail): - """ - Extract hypothesis ending with EOS at timestep with hyp_id + """Extract hypothesis ending with EOS at timestep with hyp_id. + :param timestep: timestep with range up to len(self.outputs)-1 :param hyp_id: id with range up to beam_size-1 :return: hypothesis sequence """ - assert self.outputs[hypothesis_tail.timestep][hypothesis_tail.hypid] == self.eos + assert (self.outputs[hypothesis_tail.timestep][hypothesis_tail.hypid] + == self.eos) assert hypothesis_tail.tokenid == self.eos hyp_idx = [] endback = hypothesis_tail.hypid for i in range(hypothesis_tail.timestep, -1, -1): - hyp_idx.append(self.HypothesisTail(timestep=i, hypid=endback, score=self.all_scores[i][endback], - tokenid=self.outputs[i][endback])) + hyp_idx.append(self.HypothesisTail( + timestep=i, hypid=endback, score=self.all_scores[i][endback], + tokenid=self.outputs[i][endback])) endback = self.bookkeep[i - 1][endback] return hyp_idx @@ -949,12 +960,15 @@ def get_rescored_finished(self, n_best=None): rescored_finished = [] for finished_item in self.finished: current_length = finished_item.timestep + 1 - length_penalty = math.pow((1 + current_length) / 6, 0.65) # this is from Google NMT paper - rescored_finished.append(self.HypothesisTail(timestep=finished_item.timestep, hypid=finished_item.hypid, - score=finished_item.score / length_penalty, - tokenid=finished_item.tokenid)) + # these weights are from Google NMT paper + length_penalty = math.pow((1 + current_length) / 6, 0.65) + rescored_finished.append(self.HypothesisTail( + timestep=finished_item.timestep, hypid=finished_item.hypid, + score=finished_item.score / length_penalty, + tokenid=finished_item.tokenid)) - srted = sorted(rescored_finished, key=attrgetter('score'), reverse=True) + srted = sorted(rescored_finished, key=attrgetter('score'), + reverse=True) if n_best is not None: srted = srted[:n_best] @@ -962,26 +976,30 @@ def get_rescored_finished(self, n_best=None): return srted def check_finished(self): - """ - this function checks if self.finished is empty and adds hyptail - in that case (this will be suboptimal hypothesis since - the model did not get any EOS) - :return: None + """Checks if self.finished is empty and add hyptail in that case. 
+ + This will be suboptimal hypothesis since the model did not get any EOS + + :returns: None """ if len(self.finished) == 0: - # we change output because we want outputs to have this eos to pass assert in L102, it is ok since empty self.finished means junk prediction anyway + # we change output because we want outputs to have eos + # to pass assert in L102, it is ok since empty self.finished + # means junk prediction anyway self.outputs[-1][0] = self.eos - hyptail = self.HypothesisTail(timestep=len(self.outputs) - 1, hypid=0, score=self.all_scores[-1][0], + hyptail = self.HypothesisTail(timestep=len(self.outputs) - 1, + hypid=0, + score=self.all_scores[-1][0], tokenid=self.outputs[-1][0]) self.finished.append(hyptail) def get_beam_dot(self, dictionary=None, n_best=None): - """ - Creates pydot graph representation of the beam + """Creates pydot graph representation of the beam. + :param outputs: self.outputs from the beam :param dictionary: tok 2 word dict to save words in the tree nodes - :return: pydot graph + :returns: pydot graph """ try: import pydot @@ -997,18 +1015,22 @@ def get_beam_dot(self, dictionary=None, n_best=None): # get top nbest hyp top_hyp_idx_n_best = [] - n_best_colors = ['aquamarine', 'chocolate1', 'deepskyblue', 'green2', 'tan'] + n_best_colors = ['aquamarine', 'chocolate1', 'deepskyblue', + 'green2', 'tan'] sorted_finished = self.get_rescored_finished(n_best=n_best) for hyptail in sorted_finished: + # do not include EOS since it has rescored score not from original + # self.all_scores, we color EOS with black top_hyp_idx_n_best.append(self.get_hyp_from_finished( - hyptail)) # do not include EOS since it has rescored score not from original self.all_scores, we color EOS with black + hyptail)) # create nodes for tstep, lis in enumerate(outputs): for hypid, token in enumerate(lis): if tstep == 0: hypid = 0 # collapse all __NULL__ nodes - node_tail = self.HypothesisTail(timestep=tstep, hypid=hypid, score=all_scores[tstep][hypid], + node_tail = self.HypothesisTail(timestep=tstep, hypid=hypid, + score=all_scores[tstep][hypid], tokenid=token) color = 'white' rank = None @@ -1018,20 +1040,30 @@ def get_beam_dot(self, dictionary=None, n_best=None): color = n_best_colors[i] rank = i break - label = "<{}".format( - dictionary.vec2txt([token]) if dictionary is not None else token) + " : " + "{:.{prec}f}>".format( - all_scores[tstep][hypid], prec=3) - graph.add_node(pydot.Node(node_tail.__repr__(), label=label, fillcolor=color, style='filled', - xlabel='{}'.format(rank) if rank is not None else '')) + label = ( + "<{}".format(dictionary.vec2txt([token]) + if dictionary is not None else token) + + " : " + + "{:.{prec}f}>".format(all_scores[tstep][hypid], prec=3)) + + graph.add_node(pydot.Node( + node_tail.__repr__(), label=label, fillcolor=color, + style='filled', + xlabel='{}'.format(rank) if rank is not None else '')) + # create edges for revtstep, lis in reversed(list(enumerate(bookkeep))): for i, prev_id in enumerate(lis): - from_node = graph.get_node('"{}"'.format( - self.HypothesisTail(timestep=revtstep, hypid=prev_id, score=all_scores[revtstep][prev_id], - tokenid=outputs[revtstep][prev_id]).__repr__()))[0] - to_node = graph.get_node('"{}"'.format( - self.HypothesisTail(timestep=revtstep + 1, hypid=i, score=all_scores[revtstep + 1][i], - tokenid=outputs[revtstep + 1][i]).__repr__()))[0] + from_node = graph.get_node( + '"{}"'.format(self.HypothesisTail( + timestep=revtstep, hypid=prev_id, + score=all_scores[revtstep][prev_id], + 
tokenid=outputs[revtstep][prev_id]).__repr__()))[0] + to_node = graph.get_node( + '"{}"'.format(self.HypothesisTail( + timestep=revtstep + 1, hypid=i, + score=all_scores[revtstep + 1][i], + tokenid=outputs[revtstep + 1][i]).__repr__()))[0] newedge = pydot.Edge(from_node.get_name(), to_node.get_name()) graph.add_edge(newedge) From 4f14eca51eaa3d9733b8e761d3d3557c79444154 Mon Sep 17 00:00:00 2001 From: Alexander Miller Date: Wed, 29 Aug 2018 11:12:44 -0700 Subject: [PATCH 12/12] another flake fix --- parlai/core/torch_agent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/parlai/core/torch_agent.py b/parlai/core/torch_agent.py index c60bc0b2bb1..54ba33e4611 100644 --- a/parlai/core/torch_agent.py +++ b/parlai/core/torch_agent.py @@ -929,8 +929,8 @@ def get_hyp_from_finished(self, hypothesis_tail): :param hyp_id: id with range up to beam_size-1 :return: hypothesis sequence """ - assert (self.outputs[hypothesis_tail.timestep][hypothesis_tail.hypid] - == self.eos) + assert (self.outputs[hypothesis_tail.timestep] + [hypothesis_tail.hypid] == self.eos) assert hypothesis_tail.tokenid == self.eos hyp_idx = [] endback = hypothesis_tail.hypid
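
A minimal standalone sketch of the beam bookkeeping touched by the last two
patches: finished hypotheses are rescored by dividing their score by the
Google NMT length penalty, ((1 + length) / 6) ** 0.65, and the token sequence
is recovered by walking the backpointers in `bookkeep` from the finishing
hypothesis id back to timestep 0. The toy `outputs`/`bookkeep` values below
are hypothetical and only illustrate the indexing, not real beam state.

    import math
    from collections import namedtuple

    HypothesisTail = namedtuple(
        'HypothesisTail', ['timestep', 'hypid', 'score', 'tokenid'])

    def length_penalty(length):
        # weights from the Google NMT paper (Wu et al. 2016), alpha = 0.65
        return math.pow((1 + length) / 6, 0.65)

    def backtrack(outputs, bookkeep, tail):
        # walk back from a finished tail to timestep 0, collecting tokens
        tokens = []
        hypid = tail.hypid
        for t in range(tail.timestep, -1, -1):
            tokens.append(outputs[t][hypid])
            if t > 0:
                hypid = bookkeep[t - 1][hypid]
        return list(reversed(tokens))

    # toy beam of size 2 over 3 steps; token id 2 plays the role of EOS
    outputs = [[0, 0], [5, 7], [2, 6]]   # token chosen per hypothesis per step
    bookkeep = [[0, 0], [1, 0]]          # parent hypothesis id per step
    tail = HypothesisTail(timestep=2, hypid=0, score=-1.2, tokenid=2)

    rescored = tail.score / length_penalty(tail.timestep + 1)
    print(backtrack(outputs, bookkeep, tail), rescored)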