# sru.py

import tensorflow as tf
from tensorflow.python.util import nest


class SimpleSRUCell(tf.contrib.rnn.RNNCell):
    """Simple distribution-based recurrent unit (SRU) cell.

    The cell keeps moving averages, at several decay rates ``alpha``, of
    statistics (mean map embeddings of features) computed from its inputs.
    With activation ``f``, one step computes:

        r_t   = f(W^r mu_{t-1} + b^r)              (only if recur_dims > 0)
        phi_t = f(W^phi r_t + W^x x_t + b^phi)
        mu_t  = alpha * mu_{t-1} + (1 - alpha) * phi_t
        o_t   = f(W^o mu_t + b^o)                  (no f if linear_out)

    Args:
      num_stats: size of the statistics vector phi_t.
      mavg_alphas: 1-D tensor (or array-like accepted by
        ``tf.convert_to_tensor``) of moving-average rates alpha.
      output_dims: size of the output o_t.
      recur_dims: size of the recurrent features r_t; when not positive the
        recurrent features are skipped and phi_t depends on x_t only.
      summarize: stored on the instance; not used by the code in this class.
      learn_alphas: if true, the alphas become trainable through a
        ``logit_alphas`` variable initialised to logit(mavg_alphas). Alphas
        equal to exactly 0 or 1 give an infinite initial logit.
      linear_out: if true, no activation is applied to the output.
      include_input: if true, the output layer also receives x_t.
      activation: nonlinearity ``f`` applied to r_t, phi_t and (unless
        ``linear_out``) o_t.

    Attributes:
      W, b: lists filled on the first call with the 'Matrix' / 'Bias'
        variables of the sub-layers, ordered 'recur_feats' (only when
        recur_dims > 0), 'stats', 'output'.
    """

    def __init__(self, num_stats, mavg_alphas, output_dims, recur_dims,
                 summarize=True, learn_alphas=False, linear_out=False,
                 include_input=False, activation=tf.nn.relu):
        # NOTE(review): the base-class initializer is not invoked here, as in
        # the original code -- confirm whether the targeted TensorFlow version
        # needs it.
        self._num_stats = num_stats
        self._output_dims = output_dims
        self._recur_dims = recur_dims
        # Accept plain lists / arrays as well as tensors; an existing Tensor
        # is passed through unchanged.
        mavg_alphas = tf.convert_to_tensor(mavg_alphas)
        if learn_alphas:
            # logit(alpha) = -log(1/alpha - 1), so the sigmoid below starts
            # out equal to the requested alphas.
            init_logit_alphas = -tf.log(1.0/mavg_alphas-1)
            logit_alphas = tf.get_variable(
                'logit_alphas', initializer=init_logit_alphas
            )
            self._mavg_alphas = tf.reshape(tf.sigmoid(logit_alphas), [1, -1, 1])
        else:
            self._mavg_alphas = tf.reshape(mavg_alphas, [1, -1, 1])
        self._nalphas = int(mavg_alphas.get_shape()[0])
        self._summarize = summarize
        self._linear_out = linear_out
        self._activation = activation
        self._include_input = include_input

        # add for tflearn
        self.W = []
        self.b = []

    @property
    def state_size(self):
        """Flattened state width: number of alphas times ``num_stats``."""
        return int(self._nalphas * self._num_stats)

    @property
    def output_size(self):
        """Width of the cell output (``output_dims``)."""
        return self._output_dims

    def __call__(self, inputs, state, scope=None):
        """Run one step of the cell.

        Args:
          inputs: x_t, a 2-D tensor (``_linear`` rejects anything else).
          state: mu_{t-1}, reshaped internally to
            ``[-1, nalphas, num_stats]``.
          scope: optional variable scope; defaults to the class name.

        Returns:
          A tuple ``(output, out_state)`` holding o_t and mu_t, where mu_t
          is flattened to ``[-1, state_size]``.
        """
        with tf.variable_scope(scope or type(self).__name__):
            # Make statistics on input.
            if self._recur_dims > 0:
                # r_t = f(W^r mu_{t-1} + b^r)
                recur_output = self._activation(_linear(
                    state, self._recur_dims, True, scope='recur_feats'
                ), name='recur_feats_act')
                # phi_t = f(W^phi r_t + W^x x_t + b^phi)
                stats = self._activation(_linear(
                    [inputs, recur_output], self._num_stats, True, scope='stats'
                ), name='stats_act')
            else:
                # phi_t = f(W^x x_t + b^phi)
                stats = self._activation(_linear(
                    inputs, self._num_stats, True, scope='stats'
                ), name='stats_act')
            # Compute moving averages of statistics for the state.
            with tf.variable_scope('out_state'):
                state_tensor = tf.reshape(
                    state, [-1, self._nalphas, self._num_stats], 'state_tensor'
                )
                stats_tensor = tf.reshape(
                    stats, [-1, 1, self._num_stats], 'stats_tensor'
                )
                # mu_t = alpha*mu_{t-1} + (1-alpha)*phi_t; the [1, nalphas, 1]
                # alphas broadcast against the state and the statistics.
                out_state = tf.reshape(self._mavg_alphas*state_tensor +
                                       (1-self._mavg_alphas)*stats_tensor,
                                       [-1, self.state_size], 'out_state')
            # Compute the output: o_t = W^o mu_t + b^o (x_t is appended to
            # the layer input when include_input is set).
            if self._include_input:
                output_vars = [out_state, inputs]
            else:
                output_vars = out_state
            output = _linear(
                output_vars, self._output_dims, True, scope='output'
            )
            if not self._linear_out:
                output = self._activation(output, name='output_act')

            # Retrieve RNN Variables (once, on the first call).
            if not self.W:
                self._collect_variables()
                print("W = ", self.W)
                print("b = ", self.b)

            return (output, out_state)

    def _collect_variables(self):
        """Append each sub-layer's 'Matrix' / 'Bias' to ``self.W`` / ``self.b``.

        Must be called inside the cell's variable scope, after the sub-layers
        have been built. The 'recur_feats' layer is only looked up when it
        exists (recur_dims > 0); requesting it otherwise raises ValueError
        because the variables were never created.
        """
        layer_scopes = ['stats', 'output']
        if self._recur_dims > 0:
            layer_scopes.insert(0, 'recur_feats')
        for layer_scope in layer_scopes:
            with tf.variable_scope(layer_scope, reuse=True):
                self.W.append(tf.get_variable('Matrix'))
                self.b.append(tf.get_variable('Bias'))


# TensorFlow no longer exposes this helper publicly, so a private copy lives here.
def _linear(args, output_size, bias, bias_start=0.0, scope=None):
    """Apply one learned linear map to the concatenation of ``args``.

    Equivalent to sum_i(args[i] * W[i]), with all W[i] stored stacked in a
    single "Matrix" variable, plus an optional "Bias" variable.

    Args:
      args: a 2D Tensor or a list of 2D, batch x n, Tensors.
      output_size: int, number of output columns.
      bias: boolean, whether a bias term is added.
      bias_start: initial value of the bias; 0 by default.
      scope: variable scope for the created variables; defaults to "Linear".

    Returns:
      A 2D Tensor of shape [batch x output_size].

    Raises:
      ValueError: if ``args`` is missing or empty, or if any argument is not
        2D or has an unknown or zero second dimension.
    """
    if args is None:
        raise ValueError("`args` must be specified")
    if nest.is_sequence(args):
        if not args:
            raise ValueError("`args` must be specified")
    else:
        args = [args]

    # Validate every input and add up the widths along dimension 1.
    shapes = [tensor.get_shape().as_list() for tensor in args]
    input_width = 0
    for dims in shapes:
        if len(dims) != 2:
            raise ValueError(
                "Linear is expecting 2D arguments: %s" % str(shapes))
        if not dims[1]:
            raise ValueError(
                "Linear expects shape[1] of arguments: %s" % str(shapes))
        input_width += dims[1]

    # The variables take the dtype of the first input.
    dtype = args[0].dtype

    with tf.variable_scope(scope or "Linear"):
        weights = tf.get_variable(
            "Matrix", [input_width, output_size], dtype=dtype)
        stacked = args[0] if len(args) == 1 else tf.concat(args, 1)
        product = tf.matmul(stacked, weights)
        if not bias:
            return product
        offset = tf.get_variable(
            "Bias", [output_size],
            dtype=dtype,
            initializer=tf.constant_initializer(bias_start, dtype=dtype)
        )
    # The addition stays outside the variable scope, as in the original.
    return product + offset