bots.py
"""Module with classes for questioner and answerer bots. A generic ChatBotAgent is defined,
which is extended separate by Questioner and Answerer bot classes.
Refer ParlAI docs on general semantics of a ParlAI Agent:
* http://parl.ai/static/docs/basic_tutorial.html#agents
* http://parl.ai/static/docs/agents.html#parlai.core.agents.Agent
"""
from __future__ import absolute_import, division
import math
import torch
from torch import nn
from torch.autograd import Variable
from torch.autograd import backward as autograd_backward
from parlai.core.agents import Agent


def xavier_init(module):
    """Xavier initializer for module parameters."""
    for parameter in module.parameters():
        if len(parameter.data.shape) == 1:
            # 1D vector means bias
            parameter.data.fill_(0)
        else:
            # for nn.Linear weights, size(0) is actually the fan-out and size(1)
            # the fan-in, but Xavier init is symmetric in (fan_in + fan_out)
            fan_in = parameter.data.size(0)
            fan_out = parameter.data.size(1)
            parameter.data.normal_(0, math.sqrt(2 / (fan_in + fan_out)))
    return module
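

# A quick sanity check of the formula above (illustrative only): for an
# nn.Linear(100, 50) layer the weight has shape (50, 100), so
# fan_in + fan_out = 150 and the weights are drawn with
# std = sqrt(2 / 150) ~= 0.115, while the bias is filled with zeros:
#
#     layer = xavier_init(nn.Linear(100, 50))
#     print(layer.weight.data.std())   # close to 0.115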


class ChatBotAgent(Agent, nn.Module):
    """Parent class for both questioner and answerer bots. Extends a ParlAI Agent and a PyTorch
    module. This class is a generic implementation of how a bot acts and observes in a generic
    ParlAI dialog world. It comprises a state tensor, an actions list and an observation dict.

    This parent class provides a ``listen_net`` embedding module to embed the text tokens. The
    ``speak_net`` module picks a token to be given as the action, based on the state of the
    agent. Both questioner and answerer agents observe and act in this generic way.

    Attributes
    ----------
    opt : dict
        Command-line opts passed into the constructor from the world.
    observation : dict
        Observations dict exchanged during dialogs and on starting a fresh episode. Has keys as
        described in the ParlAI docs ('text', 'image', 'episode_done', 'reward').
    actions : list
        List of action tensors by the agent, acted in the current dialog episode.
    h_state, c_state : torch.autograd.Variable
        State of the agent.
    eval_flag : boolean
        Flag indicating whether the agent is in training or evaluation mode.
    listen_net : nn.Embedding
    speak_net : nn.Linear
    softmax : nn.Softmax
    """

    def __init__(self, opt, shared=None):
        super(ChatBotAgent, self).__init__(opt, shared)
        nn.Module.__init__(self)
        self.id = 'ChatBotAgent'
        self.observation = None

        # standard initializations
        self.h_state = torch.Tensor()
        self.c_state = torch.Tensor()
        self.eval_flag = False
        self.actions = []

        # modules (common)
        self.listen_net = nn.Embedding(self.opt['in_vocab_size'], self.opt['embed_size'])
        self.speak_net = nn.Linear(self.opt['hidden_size'], self.opt['out_vocab_size'])
        self.softmax = nn.Softmax()

        # xavier init of listen_net and speak_net
        for module in {self.listen_net, self.speak_net}:
            module = xavier_init(module)

    def observe(self, observation):
        """Given an input token, interact for the next round."""
        self.observation = observation
        if not observation.get('episode_done'):
            # embed and pass through LSTM
            token_embeds = self.listen_net(observation['text'])

            # concat with image representation (valid for abot)
            if 'image' in observation:
                token_embeds = torch.cat((token_embeds, observation['image']), 1)
            # remove all dimensions with size one
            token_embeds = token_embeds.squeeze(1)

            # update agent state using these tokens
            self.h_state, self.c_state = self.rnn(token_embeds, (self.h_state, self.c_state))
        else:
            if observation.get('reward') is not None:
                # REINFORCE: scale the log-prob gradient of every sampled
                # action in the episode by the episode reward
                for action in self.actions:
                    action.reinforce(observation['reward'])
                autograd_backward(self.actions, [None for _ in self.actions], retain_graph=True)

                # clamp all gradients between (-5, 5)
                for parameter in self.parameters():
                    parameter.grad.data.clamp_(min=-5, max=5)

    def act(self):
        """Speak a token."""
        # compute softmax and choose a token
        out_distr = self.softmax(self.speak_net(self.h_state))
        if self.eval_flag:
            _, actions = out_distr.max(1)
            actions = actions.unsqueeze(1)
        else:
            actions = out_distr.multinomial()
            self.actions.append(actions)
        return {'text': actions.squeeze(1), 'id': self.id}

    def reset(self, batch_size=None, retain_actions=False):
        """Reset state and actions. ``opt.batch_size`` is not always used, because the batch
        size changes when the complete data is passed in (for validation)."""
        if batch_size is None:
            batch_size = self.opt['batch_size']
        self.h_state = Variable(torch.zeros(batch_size, self.opt['hidden_size']))
        self.c_state = Variable(torch.zeros(batch_size, self.opt['hidden_size']))
        if self.opt.get('use_gpu'):
            self.h_state, self.c_state = self.h_state.cuda(), self.c_state.cuda()
        if not retain_actions:
            self.actions = []

    def train(self):
        """Switch to training mode."""
        self.eval_flag = False

    def eval(self):
        """Switch to evaluation mode."""
        self.eval_flag = True

    def forward(self):
        """Dummy forward pass."""
        pass
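

# A minimal sketch of how a single agent is driven through an episode
# (illustrative only; 'bot' and the tensors are hypothetical, and the
# Variable / .reinforce() calls assume the pre-0.4 PyTorch APIs used in
# this module):
#
#     bot.reset(batch_size=32)
#     bot.observe({'text': token_variable, 'episode_done': False})
#     reply = bot.act()    # {'text': sampled token ids, 'id': bot.id}
#     ...
#     # at episode end, a reward triggers the REINFORCE backward pass
#     bot.observe({'episode_done': True, 'reward': batch_reward})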


class Questioner(ChatBotAgent):
    """Questioner bot - extending a ParlAI Agent as well as a PyTorch module. The questioner is
    modeled as a combination of a speaker network, a listener LSTM, and a prediction network.

    At the start of a new episode of dialog, a task is observed by the questioner bot, which is
    embedded via the listener LSTM. At each round, the questioner observes the answer and acts
    by modeling the probability of output utterances based on the previous state. After
    observing the reply from the answerer, the listener LSTM updates the state by processing
    both tokens (question/answer) of the dialog exchange. In the final round, the prediction
    LSTM is unrolled twice to produce the questioner's prediction based on the final state and
    the assigned task.

    Attributes
    ----------
    rnn : nn.LSTMCell
        Listener LSTM module. The embedding module before the listener is provided by the base
        class.
    predict_rnn, predict_net : nn.LSTMCell, nn.Linear
        Collectively form the prediction network module.
    task_offset : int
        Offset in the one-hot encoding; task vectors come after a width equal to the question
        and answer vocabularies combined.
    listen_offset : int
        Offset due to listening to the response of the answer bot. Answer token one-hot vectors
        require a width equal to the answer vocabulary - question vectors come after that.
    """

    def __init__(self, opt, shared=None):
        opt['in_vocab_size'] = opt['q_out_vocab'] + opt['a_out_vocab'] + opt['task_vocab']
        opt['out_vocab_size'] = opt['q_out_vocab']
        super(Questioner, self).__init__(opt, shared)
        self.id = 'QBot'

        # always condition on task
        self.rnn = nn.LSTMCell(self.opt['embed_size'], self.opt['hidden_size'])
        # additional prediction network, start token included
        num_preds = sum([len(ii) for ii in self.opt['props'].values()])
        # network for predicting
        self.predict_rnn = nn.LSTMCell(self.opt['embed_size'], self.opt['hidden_size'])
        self.predict_net = nn.Linear(self.opt['hidden_size'], num_preds)

        # xavier init of rnn, predict_rnn, predict_net
        for module in {self.rnn, self.predict_rnn, self.predict_net}:
            module = xavier_init(module)

        # setting offset
        self.task_offset = opt['q_out_vocab'] + opt['a_out_vocab']
        self.listen_offset = opt['a_out_vocab']
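
    # A worked example of the offsets above (hypothetical sizes): with
    # q_out_vocab=3, a_out_vocab=4 and task_vocab=3, the questioner's input
    # vocabulary indexes answer tokens at 0-3, question tokens at 4-6
    # (listen_offset = 4), and task tokens at 7-9 (task_offset = 3 + 4 = 7).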

    def predict(self, tasks, num_tokens):
        """Return a prediction (guess) of attribute values, conditioned on the task."""
        guess_tokens = []
        guess_distr = []

        for _ in range(num_tokens):
            # explicit task dependence
            task_embeds = self.listen_net(tasks)
            task_embeds = task_embeds.squeeze()

            # unroll twice, compute softmax and choose a token
            self.h_state, self.c_state = self.predict_rnn(task_embeds,
                                                          (self.h_state, self.c_state))
            out_distr = self.softmax(self.predict_net(self.h_state))

            # if evaluating
            if self.eval_flag:
                _, actions = out_distr.max(1)
            else:
                actions = out_distr.multinomial()
                # record actions
                self.actions.append(actions)

            # record the guess and distribution
            guess_tokens.append(actions)
            guess_distr.append(out_distr)

        # return prediction
        return guess_tokens, guess_distr
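

# After the final dialog round, the world queries the questioner for its
# guess (a minimal sketch; 'tasks' is a Variable of task-token indices and
# num_tokens=2 corresponds to guessing two attributes per task -- both
# values are hypothetical here):
#
#     guess_tokens, guess_distr = qbot.predict(tasks, num_tokens=2)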


class Answerer(ChatBotAgent):
    """Answerer bot - extending a ParlAI Agent as well as a PyTorch module. The answerer is
    modeled as a combination of a speaker network, a listener LSTM, and an image encoder.

    While observing, it embeds the received question tokens and concatenates them with the
    image embedding, using them to update its state via the listener LSTM. The answerer bot
    acts by choosing a token based on the softmax probabilities obtained after passing the
    state through the speak net. The image encoder embeds each one-hot attribute vector via a
    linear layer and concatenates all three encodings to obtain a unified image instance
    representation.

    Attributes
    ----------
    rnn : nn.LSTMCell
        Listener LSTM module. The embedding module before the listener is provided by the base
        class.
    img_net : nn.Embedding
        Image instance encoder module.
    listen_offset : int
        Offset due to listening to the response of the question bot. Question token one-hot
        vectors require a width equal to the question vocabulary - answer vectors come after
        that.
    """

    def __init__(self, opt, shared=None):
        opt['in_vocab_size'] = opt['q_out_vocab'] + opt['a_out_vocab']
        opt['out_vocab_size'] = opt['a_out_vocab']
        super(Answerer, self).__init__(opt, shared)
        self.id = 'ABot'

        # number of attribute values
        num_attrs = sum([len(ii) for ii in self.opt['props'].values()])
        # number of unique attributes
        num_unique_attrs = len(self.opt['props'])

        # rnn input size
        rnn_input_size = num_unique_attrs * self.opt['img_feat_size'] + self.opt['embed_size']

        self.img_net = nn.Embedding(num_attrs, self.opt['img_feat_size'])
        self.rnn = nn.LSTMCell(rnn_input_size, self.opt['hidden_size'])

        # xavier init of img_net and rnn
        for module in {self.img_net, self.rnn}:
            module = xavier_init(module)

        # set offset
        self.listen_offset = opt['q_out_vocab']
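
    # A worked example (same hypothetical sizes as in the Questioner note):
    # with q_out_vocab=3 and a_out_vocab=4, the answerer's input vocabulary
    # indexes question tokens at 0-2 and answer tokens at 3-6
    # (listen_offset = 3). With three attributes and img_feat_size=20, the
    # listener LSTM input size is 3 * 20 + embed_size.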

    def embed_image(self, image):
        """Embed the image attributes (color, shape and style) into vectors of length
        ``img_feat_size`` each, and concatenate them to form a feature vector representing
        the image."""
        embeds = self.img_net(image)
        features = torch.cat(embeds.transpose(0, 1), 1)
        return features
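

# ---------------------------------------------------------------------------
# A minimal, self-contained sketch of wiring the two bots together for one
# dialog round. This is illustrative only: every option value below is
# hypothetical (chosen to satisfy the shapes the classes expect), and the
# Variable / .multinomial() calls assume the same pre-0.4 PyTorch APIs used
# throughout this module.
if __name__ == '__main__':
    demo_opt = {
        'q_out_vocab': 3, 'a_out_vocab': 4, 'task_vocab': 3,  # hypothetical
        'embed_size': 20, 'hidden_size': 100, 'img_feat_size': 20,
        'batch_size': 2, 'use_gpu': False,
        # three attributes with four values each (hypothetical)
        'props': {'colors': ['red', 'green', 'blue', 'purple'],
                  'shapes': ['square', 'triangle', 'circle', 'star'],
                  'styles': ['dotted', 'solid', 'filled', 'dashed']},
    }
    # each constructor mutates opt (in/out vocab sizes), so pass copies
    qbot = Questioner(demo_opt.copy())
    abot = Answerer(demo_opt.copy())
    qbot.reset(batch_size=2)
    abot.reset(batch_size=2)

    # the questioner first listens to its task token (ids offset past the
    # question and answer vocabularies, i.e. 7-9 here), then asks
    task = Variable(torch.LongTensor([7, 9]))
    qbot.observe({'text': task, 'episode_done': False})
    question = qbot.act()['text']

    # the answerer hears the question together with the image embedding
    image = Variable(torch.LongTensor([[0, 4, 8], [1, 5, 9]]))  # attribute ids
    abot.observe({'text': question, 'image': abot.embed_image(image),
                  'episode_done': False})
    answer = abot.act()['text']
    print('question: %s, answer: %s' % (question.data, answer.data))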