This repository has been archived by the owner on May 3, 2022. It is now read-only.

Commit 1721fcb: Merge branch 'develop'
DavidMChan committed Feb 20, 2019
2 parents: 361a3cd + 98282e9
Showing 41 changed files with 3,338 additions and 1,975 deletions.
34 changes: 22 additions & 12 deletions examples/rl.py
@@ -1,18 +1,19 @@
import argparse
import itertools

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
import numpy as np
import gym

from rinokeras.rl.policies import StandardPolicy, LSTMPolicy
from rinokeras.rl.trainers import PolicyGradient, PPO
from rinokeras.rl.env_runners import PGEnvironmentRunner, BatchRollout
from rinokeras.rl.env_runners import BatchRollout, PGEnvironmentRunner
from rinokeras.rl.policies import LSTMPolicy, StandardPolicy
from rinokeras.rl.trainers import PPO, PolicyGradient
from rinokeras.train import TrainGraph

parser = argparse.ArgumentParser('Rinokeras RL Example Script')
parser.add_argument('--env', type=str, default='CartPole-v0', help='Which gym environment to run on')
parser.add_argument('--env', type=str, default='CartPole-v0',
help='Which gym environment to run on')
parser.add_argument('--policy', type=str, choices=['standard', 'lstm'], default='standard',
help='Which type of policy to run')
parser.add_argument('--alg', type=str, choices=['vpg', 'ppo'], default='vpg',
@@ -39,7 +40,8 @@

# Placeholders
obs_ph = Input((None,) + env.observation_space.shape)
act_ph = Input((None,) + (() if discrete else env.action_space.shape), dtype=tf.int32 if discrete else tf.float32)
act_ph = Input((None,) + (() if discrete else env.action_space.shape),
dtype=tf.int32 if discrete else tf.float32)
val_ph = Input((None,))
seqlen_ph = Input((), dtype=tf.int32)

@@ -48,8 +50,10 @@
action_shape, 'discrete' if discrete else 'continuous', embedding_model, model_dim,
initial_logstd=args.logstd, n_layers_logits=1, n_layers_value=1, take_greedy_actions=False)

experiment = algorithms[args.alg](policy, distribution_strategy=tf.contrib.distribute.OneDeviceStrategy('/cpu:0'))
graph = TrainGraph.from_experiment(experiment, (obs_ph, act_ph, val_ph, seqlen_ph))
experiment = algorithms[args.alg](
policy, distribution_strategy=tf.contrib.distribute.OneDeviceStrategy('/cpu:0'))
graph = TrainGraph.from_experiment(
experiment, (obs_ph, act_ph, val_ph, seqlen_ph))

runner = PGEnvironmentRunner(env, policy, gamma)
sess = tf.InteractiveSession()
@@ -58,17 +62,22 @@
all_rewards = []

# Do Training

for t in itertools.count():
rollouts = []

for _ in range(n_rollouts_per_batch):
rollouts.append(next(runner)) # type: ignore

batch_rollout = BatchRollout(rollouts, variable_length=True, keep_as_separate_rollouts=True)
batch_rollout = BatchRollout(
rollouts, variable_length=True, keep_as_separate_rollouts=True)

if args.alg == 'ppo':
experiment.update_old_model()

for _ in range(n_updates_per_batch):
loss = graph.run('update', (batch_rollout.obs, batch_rollout.act, batch_rollout.val, batch_rollout.seqlens))
loss = graph.run('update', (batch_rollout.obs, batch_rollout.act,
batch_rollout.val, batch_rollout.seqlens))

curr_std = graph.run(policy.action_distribution.logstd)

@@ -87,4 +96,5 @@
if t > 500:
break

np.save('-'.join([args.env, args.policy, args.alg, 'logstd=' + str(args.logstd)]) + '.npy', np.array(all_rewards))
np.save('-'.join([args.env, args.policy, args.alg,
'logstd=' + str(args.logstd)]) + '.npy', np.array(all_rewards))
68 changes: 50 additions & 18 deletions rinokeras/common/attention.py
@@ -6,6 +6,7 @@
import tensorflow.keras.backend as K # pylint: disable=E0611

from .layers import WeightNormDense as Dense
from .layers import LayerNorm


class LuongAttention(Layer):
@@ -62,6 +63,7 @@ class AttentionQKV(Model):
def __init__(self,
key_depth: int,
value_depth: int = None,
kernel_initializer: Optional[tf.keras.initializers.Initializer] = 'glorot_uniform',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None) -> None:
@@ -75,15 +77,18 @@ def __init__(self,
self.kernel_regularizer = kernel_regularizer
self.bias_regularizer = bias_regularizer
self.activity_regularizer = activity_regularizer

self.query_layer = Dense(self.key_depth, use_bias=False,
kernel_initializer=kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
bias_regularizer=self.bias_regularizer,
activity_regularizer=self.activity_regularizer)
self.query_norm = LayerNorm()
self.projection_layer = Dense(self.key_depth + self.value_depth, use_bias=False,
kernel_initializer=kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
bias_regularizer=self.bias_regularizer,
activity_regularizer=self.activity_regularizer)
self.projection_norm = LayerNorm()

def call(self, inputs):
"""
@@ -92,8 +97,8 @@ def call(self, inputs):
memory_antecedent -> tensor w/ shape [batch_size, n_keyval, channels]
"""
query_antecedent, memory_antecedent = inputs
queries = self.query_layer(query_antecedent)
projection = self.projection_layer(memory_antecedent)
queries = self.query_norm(self.query_layer(query_antecedent))
projection = self.projection_norm(self.projection_layer(memory_antecedent))
keys, values = tf.split(projection, tf.stack((self.key_depth, self.value_depth)), axis=-1)

return [queries, keys, values]
@@ -106,9 +111,13 @@ class TrilinearSimilarity(Layer):
Based on https://arxiv.org/pdf/1611.01603.pdf.
"""

def __init__(self, dropout: Optional[float] = None, regularizer=None) -> None:
def __init__(self,
dropout: Optional[float] = None,
kernel_initializer: Optional[tf.keras.initializers.Initializer] = 'glorot_uniform',
regularizer=None) -> None:
super().__init__()
self.dropout = Dropout(0 if dropout is None else dropout)
self.kernel_initializer = kernel_initializer
self.regularizer = regularizer

def build(self, input_shapes):
Expand All @@ -126,16 +135,16 @@ def build(self, input_shapes):

self.query_weights = self.add_weight('query_weights',
shape=(query_channels, 1),
initializer=tf.keras.initializers.glorot_uniform(),
initializer=self.kernel_initializer,
regularizer=self.regularizer)
self.context_weights = self.add_weight('context_weights',
shape=(context_channels, 1),
initializer=tf.keras.initializers.glorot_uniform(),
initializer=self.kernel_initializer,
regularizer=self.regularizer)
self.dot_weights = self.add_weight('dot_weights',
shape=(context_channels,
context_channels),
initializer=tf.keras.initializers.glorot_uniform(),
initializer=self.kernel_initializer,
regularizer=self.regularizer)
super().build(input_shapes)

@@ -267,6 +276,7 @@ def call(self, queries, keys, values, mask=None):
else:
weights = self.attention_function(masked_similarity)
weights = self.dropout(weights)
tf.add_to_collection('ATTENTION_WEIGHTS', weights)
output = tf.matmul(weights, values)
return output, weights

@@ -292,7 +302,7 @@ def build(self, input_shape):
for shape in input_shape:
assert shape[-1] % self.n_heads == 0, 'Shape of feature input must be divisible by n_heads'

def call(self, inputs, mask=None):
def call(self, inputs, mask=None, return_attention_weights=False):
"""Fast multi-head attention.
:param queries: Tensor with shape [batch_size, n_queries, depth_k]
@@ -306,10 +316,13 @@ def call(self, inputs, mask=None):
queries_split = self._split_heads(queries)
keys_split = self._split_heads(keys)
values_split = self._split_heads(values)
attention_output_split, _ = self.attention_map(
attention_output_split, attention_weights = self.attention_map(
queries_split, keys_split, values_split, mask=mask)
output = self._combine_heads(attention_output_split)
return output
if return_attention_weights:
return output, attention_weights
else:
return output

def _split_heads(self, tensor):
tensor.shape.assert_has_rank(3)
@@ -342,6 +355,8 @@ def __init__(self,
similarity_metric: str,
n_heads: int,
dropout: Optional[float] = None,
key_size: Optional[int] = None,
kernel_initializer: Optional[tf.keras.initializers.Initializer] = 'glorot_uniform',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None) -> None:
@@ -351,12 +366,16 @@ def __init__(self,
"Haven't got around to implementing other attention types yet!")

self.similarity_metric = similarity_metric
self.key_size = key_size
self.n_heads = n_heads
assert key_size is None or key_size % n_heads == 0, \
'Key size must be divisible by n_heads if provided'

self.similarity_metric = ScaledDotProductSimilarity()
self.attention_layer = MultiHeadAttentionMap(
self.similarity_metric, n_heads, dropout)

self.kernel_initializer = kernel_initializer
self.kernel_regularizer = kernel_regularizer
self.bias_regularizer = bias_regularizer
self.activity_regularizer = activity_regularizer
@@ -365,21 +384,23 @@ def __init__(self,

def build(self, input_shapes):
query_antecedent_shape, memory_antecedent_shape = input_shapes
qa_channels = query_antecedent_shape[-1]
qa_channels = query_antecedent_shape[-1] if self.key_size is None else self.key_size
ma_channels = memory_antecedent_shape[-1]
assert qa_channels % self.n_heads == 0 and ma_channels % self.n_heads == 0, \
'Feature size must be divisible by n_heads'
# assert qa_channels == ma_channels, 'Cannot combine tensors with different shapes'
self.compute_qkv = AttentionQKV(qa_channels, ma_channels,
kernel_initializer=self.kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
bias_regularizer=self.bias_regularizer,
activity_regularizer=self.activity_regularizer)
self.output_layer = Dense(qa_channels, use_bias=False,
kernel_initializer=self.kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
bias_regularizer=self.bias_regularizer,
activity_regularizer=self.activity_regularizer)

def call(self, inputs, mask=None):
def call(self, inputs, mask=None, return_attention_weights=False):
"""Fast multi-head self attention.
:param inputs: tuple of (query_antecedent, memory_antecedent)
@@ -389,10 +410,14 @@ def call(self, inputs, mask=None):
assert isinstance(inputs, tuple) or isinstance(inputs, list) and len(inputs) == 2, \
'Must pass query and memory'
q, k, v = self.compute_qkv(inputs)
attention_output = self.attention_layer((q, k, v), mask=mask)
attention_output, attention_weights = self.attention_layer(
(q, k, v), mask=mask, return_attention_weights=True)
output = self.output_layer(attention_output)
output = self.dropout(output)
return output
if return_attention_weights:
return output, attention_weights
else:
return output


class SelfAttention(Model):
@@ -410,13 +435,17 @@ def __init__(self,
super().__init__()
self.multi_attention = MultiHeadAttention(similarity_metric, n_heads, dropout, **kwargs)

def call(self, inputs, mask=None):
return self.multi_attention((inputs, inputs), mask=mask)
def call(self, inputs, mask=None, return_attention_weights=False):
return self.multi_attention((inputs, inputs), mask=mask, return_attention_weights=return_attention_weights)


class ContextQueryAttention(Model):

def __init__(self, attention_type: str = "trilinear", dropout: Optional[float] = None, regularizer=None) -> None:
def __init__(self,
attention_type: str = "trilinear",
dropout: Optional[float] = None,
kernel_initializer: Optional[tf.keras.initializers.Initializer] = 'glorot_uniform',
regularizer=None) -> None:
super().__init__()
if attention_type != "trilinear":
raise NotImplementedError(
@@ -425,7 +454,10 @@ def __init__(self, attention_type: str = "trilinear", dropout: Optional[float] =
self.attention_type = attention_type
self.dropout = Dropout(0 if dropout is None else dropout)
self.apply_mask = ApplyAttentionMask()
self.trilinear_similarity = TrilinearSimilarity(dropout, regularizer=regularizer)
self.trilinear_similarity = TrilinearSimilarity(
dropout,
kernel_initializer=kernel_initializer,
regularizer=regularizer)

def call(self, query, context=None, mask=None):
"""
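Note on the attention changes above: AttentionQKV now layer-normalizes its query and key/value projections, the layers expose kernel_initializer and key_size options, and an optional return_attention_weights flag is threaded through MultiHeadAttentionMap, MultiHeadAttention, and SelfAttention (the raw weights are also appended to an 'ATTENTION_WEIGHTS' graph collection). A minimal usage sketch of the new flag follows; the 'scaled_dot' metric string, tensor shapes, and hyperparameters are illustrative assumptions, not taken from this diff.

import tensorflow as tf

from rinokeras.common.attention import SelfAttention

# Illustrative input only: [batch, sequence, channels].
inputs = tf.random_normal((8, 20, 64))

# 'scaled_dot' is an assumed metric name; unimplemented metrics raise NotImplementedError.
attention = SelfAttention('scaled_dot', n_heads=4, dropout=0.1)

# New in this commit: the call can also return the per-head attention weights.
output, weights = attention(inputs, return_attention_weights=True)

# The attention map also pushes its weights into a graph collection,
# which can be fetched later, e.g. for visualization.
collected = tf.get_collection('ATTENTION_WEIGHTS')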
53 changes: 31 additions & 22 deletions rinokeras/common/distributions.py
@@ -15,52 +15,60 @@ def call(self, logits, greedy=False):
return NotImplemented

@abstractmethod
def logp_actions(self, logits, actions):
def logp_actions(self, actions):
return NotImplemented

def prob_actions(self, logits, actions):
return tf.exp(self.logp(logits, actions))
def neglogp(self, actions):
return -self.logp_actions(actions)

def prob_actions(self, actions):
return tf.exp(self.logp(self._logits, actions))

@abstractmethod
def entropy(self, logits):
def entropy(self):
return NotImplemented


class CategoricalPd(Pd):

def call(self, logits, greedy=False):
self._logits = logits
if greedy:
action = tf.argmax(logits, -1)
else:
if logits.shape.ndims == 2:
action = tf.squeeze(tf.multinomial(logits, 1), -1)
else:
u = tf.random_uniform(tf.shape(logits), dtype=logits.dtype)
action = tf.argmax(logits - tf.log(-tf.log(u)), axis=-1)
# if logits.shape.ndims == 2:
# action = tf.squeeze(tf.multinomial(logits, 1), -1)
# else:

fixed_shapes = logits.shape.as_list()[:-1]
variable_shapes = tf.shape(logits)[:-1]
action_shape = [fs if fs is not None else variable_shapes[i] for i, fs in enumerate(fixed_shapes)]
# fixed_shapes = logits.shape.as_list()[:-1]
# variable_shapes = tf.shape(logits)[:-1]
# action_shape = [fs if fs is not None else variable_shapes[i] for i, fs in enumerate(fixed_shapes)]

logits = tf.reshape(logits, (-1, logits.shape[-1]))
action = tf.squeeze(tf.multinomial(logits, 1), -1)
action = tf.reshape(action, action_shape)
# logits = tf.reshape(logits, (-1, logits.shape[-1]))
# action = tf.squeeze(tf.multinomial(logits, 1), -1)
# action = tf.reshape(action, action_shape)

return action

def logp_actions(self, logits, actions):
probs = tf.nn.softmax(logits - tf.reduce_max(logits, -1, keepdims=True))
def logp_actions(self, actions):
probs = tf.nn.softmax(self._logits - tf.reduce_max(self._logits, -1, keepdims=True))
indices = tf.one_hot(actions, depth=probs.shape[-1])
prob_act = tf.reduce_max(probs * indices, -1)
logp_act = tf.log(prob_act + 1e-8)
return logp_act
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(
# labels=actions, logits=logits - tf.reduce_max(logits, -1, keepdims=True))

def entropy(self, logits):
def entropy(self):
# Have to calculate these manually b/c logp_action provides probabilities
# for a specific action.
probs = tf.nn.softmax(logits - tf.reduce_max(logits, -1, keepdims=True))
logprobs = tf.log(probs)
return - tf.reduce_mean(probs * logprobs, axis=-1)
a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)


class DiagGaussianPd(Pd):
@@ -80,15 +88,16 @@ def build(self, input_shape):
# super().build(input_shape)

def call(self, logits, greedy=False):
self._logits = logits
return logits if greedy else self._add_noise(logits)

def logp_actions(self, logits, actions):
sqdiff = tf.squared_difference(actions, logits)
def logp_actions(self, actions):
sqdiff = tf.squared_difference(actions, self._logits)
reduction_axes = np.arange(-1, -self._ndim_action - 1, -1)
divconst = np.log(2.0 * np.pi) * tf.cast(tf.reduce_prod(tf.shape(actions)[1:]), tf.float32) + tf.reduce_sum(self.logstd)
return -0.5 * (tf.reduce_sum(sqdiff / self.std, reduction_axes) + divconst)

def entropy(self, logits):
def entropy(self):
return tf.reduce_sum(self._add_noise.logstd + 0.5 * np.log(2.0 * np.pi * np.e))

@property
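Note on the distribution changes above: the policy distributions become stateful, with call() caching the logits on the instance, so logp_actions, the new neglogp helper, and entropy no longer take logits as an argument, and CategoricalPd.entropy now uses a numerically stable log-sum-exp form. A rough usage sketch under those assumptions; the placeholder shapes and the argument-free CategoricalPd() constructor are illustrative, not taken from this diff.

import tensorflow as tf

from rinokeras.common.distributions import CategoricalPd

# Illustrative placeholders: a batch of 4-way action logits and sampled action indices.
logits = tf.placeholder(tf.float32, (None, 4))
actions = tf.placeholder(tf.int32, (None,))

pd = CategoricalPd()

# Calling the distribution samples an action and caches the logits internally...
sampled = pd(logits)

# ...so log-probability, negative log-probability, and entropy are now
# computed from the cached logits rather than from a logits argument.
logp = pd.logp_actions(actions)
neglogp = pd.neglogp(actions)
entropy = pd.entropy()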