This repository has been archived by the owner on May 3, 2022. It is now read-only.

Commit 1721fcb: Merge branch 'develop'
DavidMChan committed Feb 20, 2019
2 parents: 361a3cd + 98282e9
Showing 41 changed files with 3,338 additions and 1,975 deletions.
34 changes: 22 additions & 12 deletions examples/rl.py
@@ -1,18 +1,19 @@
import argparse
import itertools

import gym
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Dense, Input
import numpy as np
import gym

from rinokeras.rl.policies import StandardPolicy, LSTMPolicy
from rinokeras.rl.trainers import PolicyGradient, PPO
from rinokeras.rl.env_runners import PGEnvironmentRunner, BatchRollout
from rinokeras.rl.env_runners import BatchRollout, PGEnvironmentRunner
from rinokeras.rl.policies import LSTMPolicy, StandardPolicy
from rinokeras.rl.trainers import PPO, PolicyGradient
from rinokeras.train import TrainGraph

parser = argparse.ArgumentParser('Rinokeras RL Example Script')
parser.add_argument('--env', type=str, default='CartPole-v0', help='Which gym environment to run on')
parser.add_argument('--env', type=str, default='CartPole-v0',
help='Which gym environment to run on')
parser.add_argument('--policy', type=str, choices=['standard', 'lstm'], default='standard',
help='Which type of policy to run')
parser.add_argument('--alg', type=str, choices=['vpg', 'ppo'], default='vpg',
@@ -39,7 +40,8 @@

# Placeholders
obs_ph = Input((None,) + env.observation_space.shape)
act_ph = Input((None,) + (() if discrete else env.action_space.shape), dtype=tf.int32 if discrete else tf.float32)
act_ph = Input((None,) + (() if discrete else env.action_space.shape),
dtype=tf.int32 if discrete else tf.float32)
val_ph = Input((None,))
seqlen_ph = Input((), dtype=tf.int32)

@@ -48,8 +50,10 @@
action_shape, 'discrete' if discrete else 'continuous', embedding_model, model_dim,
initial_logstd=args.logstd, n_layers_logits=1, n_layers_value=1, take_greedy_actions=False)

experiment = algorithms[args.alg](policy, distribution_strategy=tf.contrib.distribute.OneDeviceStrategy('/cpu:0'))
graph = TrainGraph.from_experiment(experiment, (obs_ph, act_ph, val_ph, seqlen_ph))
experiment = algorithms[args.alg](
policy, distribution_strategy=tf.contrib.distribute.OneDeviceStrategy('/cpu:0'))
graph = TrainGraph.from_experiment(
experiment, (obs_ph, act_ph, val_ph, seqlen_ph))

runner = PGEnvironmentRunner(env, policy, gamma)
sess = tf.InteractiveSession()
@@ -58,17 +62,22 @@
all_rewards = []

# Do Training

for t in itertools.count():
rollouts = []

for _ in range(n_rollouts_per_batch):
rollouts.append(next(runner)) # type: ignore

batch_rollout = BatchRollout(rollouts, variable_length=True, keep_as_separate_rollouts=True)
batch_rollout = BatchRollout(
rollouts, variable_length=True, keep_as_separate_rollouts=True)

if args.alg == 'ppo':
experiment.update_old_model()

for _ in range(n_updates_per_batch):
loss = graph.run('update', (batch_rollout.obs, batch_rollout.act, batch_rollout.val, batch_rollout.seqlens))
loss = graph.run('update', (batch_rollout.obs, batch_rollout.act,
batch_rollout.val, batch_rollout.seqlens))

curr_std = graph.run(policy.action_distribution.logstd)

@@ -87,4 +96,5 @@
if t > 500:
break

np.save('-'.join([args.env, args.policy, args.alg, 'logstd=' + str(args.logstd)]) + '.npy', np.array(all_rewards))
np.save('-'.join([args.env, args.policy, args.alg,
'logstd=' + str(args.logstd)]) + '.npy', np.array(all_rewards))
68 changes: 50 additions & 18 deletions rinokeras/common/attention.py
@@ -6,6 +6,7 @@
import tensorflow.keras.backend as K # pylint: disable=E0611

from .layers import WeightNormDense as Dense
from .layers import LayerNorm


class LuongAttention(Layer):
@@ -62,6 +63,7 @@ class AttentionQKV(Model):
def __init__(self,
key_depth: int,
value_depth: int = None,
kernel_initializer: Optional[tf.keras.initializers.Initializer] = 'glorot_uniform',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None) -> None:
@@ -75,15 +77,18 @@ def __init__(self,
self.kernel_regularizer = kernel_regularizer
self.bias_regularizer = bias_regularizer
self.activity_regularizer = activity_regularizer

self.query_layer = Dense(self.key_depth, use_bias=False,
kernel_initializer=kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
bias_regularizer=self.bias_regularizer,
activity_regularizer=self.activity_regularizer)
self.query_norm = LayerNorm()
self.projection_layer = Dense(self.key_depth + self.value_depth, use_bias=False,
kernel_initializer=kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
bias_regularizer=self.bias_regularizer,
activity_regularizer=self.activity_regularizer)
self.projection_norm = LayerNorm()

def call(self, inputs):
"""
@@ -92,8 +97,8 @@ def call(self, inputs):
memory_antecedent -> tensor w/ shape [batch_size, n_keyval, channels]
"""
query_antecedent, memory_antecedent = inputs
queries = self.query_layer(query_antecedent)
projection = self.projection_layer(memory_antecedent)
queries = self.query_norm(self.query_layer(query_antecedent))
projection = self.projection_norm(self.projection_layer(memory_antecedent))
keys, values = tf.split(projection, tf.stack((self.key_depth, self.value_depth)), axis=-1)

return [queries, keys, values]
@@ -106,9 +111,13 @@ class TrilinearSimilarity(Layer):
Based on https://arxiv.org/pdf/1611.01603.pdf.
"""

def __init__(self, dropout: Optional[float] = None, regularizer=None) -> None:
def __init__(self,
dropout: Optional[float] = None,
kernel_initializer: Optional[tf.keras.initializers.Initializer] = 'glorot_uniform',
regularizer=None) -> None:
super().__init__()
self.dropout = Dropout(0 if dropout is None else dropout)
self.kernel_initializer = kernel_initializer
self.regularizer = regularizer

def build(self, input_shapes):
Expand All @@ -126,16 +135,16 @@ def build(self, input_shapes):

self.query_weights = self.add_weight('query_weights',
shape=(query_channels, 1),
initializer=tf.keras.initializers.glorot_uniform(),
initializer=self.kernel_initializer,
regularizer=self.regularizer)
self.context_weights = self.add_weight('context_weights',
shape=(context_channels, 1),
initializer=tf.keras.initializers.glorot_uniform(),
initializer=self.kernel_initializer,
regularizer=self.regularizer)
self.dot_weights = self.add_weight('dot_weights',
shape=(context_channels,
context_channels),
initializer=tf.keras.initializers.glorot_uniform(),
initializer=self.kernel_initializer,
regularizer=self.regularizer)
super().build(input_shapes)

@@ -267,6 +276,7 @@ def call(self, queries, keys, values, mask=None):
else:
weights = self.attention_function(masked_similarity)
weights = self.dropout(weights)
tf.add_to_collection('ATTENTION_WEIGHTS', weights)
output = tf.matmul(weights, values)
return output, weights

@@ -292,7 +302,7 @@ def build(self, input_shape):
for shape in input_shape:
assert shape[-1] % self.n_heads == 0, 'Shape of feature input must be divisible by n_heads'

def call(self, inputs, mask=None):
def call(self, inputs, mask=None, return_attention_weights=False):
"""Fast multi-head attention.
:param queries: Tensor with shape [batch_size, n_queries, depth_k]
@@ -306,10 +316,13 @@ def call(self, inputs, mask=None):
queries_split = self._split_heads(queries)
keys_split = self._split_heads(keys)
values_split = self._split_heads(values)
attention_output_split, _ = self.attention_map(
attention_output_split, attention_weights = self.attention_map(
queries_split, keys_split, values_split, mask=mask)
output = self._combine_heads(attention_output_split)
return output
if return_attention_weights:
return output, attention_weights
else:
return output

def _split_heads(self, tensor):
tensor.shape.assert_has_rank(3)
@@ -342,6 +355,8 @@ def __init__(self,
similarity_metric: str,
n_heads: int,
dropout: Optional[float] = None,
key_size: Optional[int] = None,
kernel_initializer: Optional[tf.keras.initializers.Initializer] = 'glorot_uniform',
kernel_regularizer=None,
bias_regularizer=None,
activity_regularizer=None) -> None:
@@ -351,12 +366,16 @@ def __init__(self,
"Haven't got around to implementing other attention types yet!")

self.similarity_metric = similarity_metric
self.key_size = key_size
self.n_heads = n_heads
assert key_size is None or key_size % n_heads == 0, \
'Key size must be divisible by n_heads if provided'

self.similarity_metric = ScaledDotProductSimilarity()
self.attention_layer = MultiHeadAttentionMap(
self.similarity_metric, n_heads, dropout)

self.kernel_initializer = kernel_initializer
self.kernel_regularizer = kernel_regularizer
self.bias_regularizer = bias_regularizer
self.activity_regularizer = activity_regularizer
@@ -365,21 +384,23 @@ def __init__(self,

def build(self, input_shapes):
query_antecedent_shape, memory_antecedent_shape = input_shapes
qa_channels = query_antecedent_shape[-1]
qa_channels = query_antecedent_shape[-1] if self.key_size is None else self.key_size
ma_channels = memory_antecedent_shape[-1]
assert qa_channels % self.n_heads == 0 and ma_channels % self.n_heads == 0, \
'Feature size must be divisible by n_heads'
# assert qa_channels == ma_channels, 'Cannot combine tensors with different shapes'
self.compute_qkv = AttentionQKV(qa_channels, ma_channels,
kernel_initializer=self.kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
bias_regularizer=self.bias_regularizer,
activity_regularizer=self.activity_regularizer)
self.output_layer = Dense(qa_channels, use_bias=False,
kernel_initializer=self.kernel_initializer,
kernel_regularizer=self.kernel_regularizer,
bias_regularizer=self.bias_regularizer,
activity_regularizer=self.activity_regularizer)

def call(self, inputs, mask=None):
def call(self, inputs, mask=None, return_attention_weights=False):
"""Fast multi-head self attention.
:param inputs: tuple of (query_antecedent, memory_antecedent)
@@ -389,10 +410,14 @@ def call(self, inputs, mask=None):
assert isinstance(inputs, tuple) or isinstance(inputs, list) and len(inputs) == 2, \
'Must pass query and memory'
q, k, v = self.compute_qkv(inputs)
attention_output = self.attention_layer((q, k, v), mask=mask)
attention_output, attention_weights = self.attention_layer(
(q, k, v), mask=mask, return_attention_weights=True)
output = self.output_layer(attention_output)
output = self.dropout(output)
return output
if return_attention_weights:
return output, attention_weights
else:
return output


class SelfAttention(Model):
@@ -410,13 +435,17 @@ def __init__(self,
super().__init__()
self.multi_attention = MultiHeadAttention(similarity_metric, n_heads, dropout, **kwargs)

def call(self, inputs, mask=None):
return self.multi_attention((inputs, inputs), mask=mask)
def call(self, inputs, mask=None, return_attention_weights=False):
return self.multi_attention((inputs, inputs), mask=mask, return_attention_weights=return_attention_weights)


class ContextQueryAttention(Model):

def __init__(self, attention_type: str = "trilinear", dropout: Optional[float] = None, regularizer=None) -> None:
def __init__(self,
attention_type: str = "trilinear",
dropout: Optional[float] = None,
kernel_initializer: Optional[tf.keras.initializers.Initializer] = 'glorot_uniform',
regularizer=None) -> None:
super().__init__()
if attention_type != "trilinear":
raise NotImplementedError(
@@ -425,7 +454,10 @@ def __init__(self, attention_type: str = "trilinear", dropout: Optional[float] =
self.attention_type = attention_type
self.dropout = Dropout(0 if dropout is None else dropout)
self.apply_mask = ApplyAttentionMask()
self.trilinear_similarity = TrilinearSimilarity(dropout, regularizer=regularizer)
self.trilinear_similarity = TrilinearSimilarity(
dropout,
kernel_initializer=kernel_initializer,
regularizer=regularizer)

def call(self, query, context=None, mask=None):
"""
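Note on the attention changes above: AttentionQKV now layer-normalizes its query and key/value projections, the layers expose kernel_initializer and key_size options, and an optional return_attention_weights flag is threaded through MultiHeadAttentionMap, MultiHeadAttention, and SelfAttention (the raw weights are also appended to an 'ATTENTION_WEIGHTS' graph collection). A minimal usage sketch of the new flag follows; the 'scaled_dot' metric string, tensor shapes, and hyperparameters are illustrative assumptions, not taken from this diff.

import tensorflow as tf

from rinokeras.common.attention import SelfAttention

# Illustrative input only: [batch, sequence, channels].
inputs = tf.random_normal((8, 20, 64))

# 'scaled_dot' is an assumed metric name; unimplemented metrics raise NotImplementedError.
attention = SelfAttention('scaled_dot', n_heads=4, dropout=0.1)

# New in this commit: the call can also return the per-head attention weights.
output, weights = attention(inputs, return_attention_weights=True)

# The attention map also pushes its weights into a graph collection,
# which can be fetched later, e.g. for visualization.
collected = tf.get_collection('ATTENTION_WEIGHTS')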
53 changes: 31 additions & 22 deletions rinokeras/common/distributions.py
@@ -15,52 +15,60 @@ def call(self, logits, greedy=False):
return NotImplemented

@abstractmethod
def logp_actions(self, logits, actions):
def logp_actions(self, actions):
return NotImplemented

def prob_actions(self, logits, actions):
return tf.exp(self.logp(logits, actions))
def neglogp(self, actions):
return -self.logp_actions(actions)

def prob_actions(self, actions):
return tf.exp(self.logp(self._logits, actions))

@abstractmethod
def entropy(self, logits):
def entropy(self):
return NotImplemented


class CategoricalPd(Pd):

def call(self, logits, greedy=False):
self._logits = logits
if greedy:
action = tf.argmax(logits, -1)
else:
if logits.shape.ndims == 2:
action = tf.squeeze(tf.multinomial(logits, 1), -1)
else:
u = tf.random_uniform(tf.shape(logits), dtype=logits.dtype)
action = tf.argmax(logits - tf.log(-tf.log(u)), axis=-1)
# if logits.shape.ndims == 2:
# action = tf.squeeze(tf.multinomial(logits, 1), -1)
# else:

fixed_shapes = logits.shape.as_list()[:-1]
variable_shapes = tf.shape(logits)[:-1]
action_shape = [fs if fs is not None else variable_shapes[i] for i, fs in enumerate(fixed_shapes)]
# fixed_shapes = logits.shape.as_list()[:-1]
# variable_shapes = tf.shape(logits)[:-1]
# action_shape = [fs if fs is not None else variable_shapes[i] for i, fs in enumerate(fixed_shapes)]

logits = tf.reshape(logits, (-1, logits.shape[-1]))
action = tf.squeeze(tf.multinomial(logits, 1), -1)
action = tf.reshape(action, action_shape)
# logits = tf.reshape(logits, (-1, logits.shape[-1]))
# action = tf.squeeze(tf.multinomial(logits, 1), -1)
# action = tf.reshape(action, action_shape)

return action

def logp_actions(self, logits, actions):
probs = tf.nn.softmax(logits - tf.reduce_max(logits, -1, keepdims=True))
def logp_actions(self, actions):
probs = tf.nn.softmax(self._logits - tf.reduce_max(self._logits, -1, keepdims=True))
indices = tf.one_hot(actions, depth=probs.shape[-1])
prob_act = tf.reduce_max(probs * indices, -1)
logp_act = tf.log(prob_act + 1e-8)
return logp_act
# return -tf.nn.sparse_softmax_cross_entropy_with_logits(
# labels=actions, logits=logits - tf.reduce_max(logits, -1, keepdims=True))

def entropy(self, logits):
def entropy(self):
# Have to calculate these manually b/c logp_action provides probabilities
# for a specific action.
probs = tf.nn.softmax(logits - tf.reduce_max(logits, -1, keepdims=True))
logprobs = tf.log(probs)
return - tf.reduce_mean(probs * logprobs, axis=-1)
a0 = self._logits - tf.reduce_max(self._logits, axis=-1, keepdims=True)
ea0 = tf.exp(a0)
z0 = tf.reduce_sum(ea0, axis=-1, keepdims=True)
p0 = ea0 / z0
return tf.reduce_sum(p0 * (tf.log(z0) - a0), axis=-1)


class DiagGaussianPd(Pd):
@@ -80,15 +88,16 @@ def build(self, input_shape):
# super().build(input_shape)

def call(self, logits, greedy=False):
self._logits = logits
return logits if greedy else self._add_noise(logits)

def logp_actions(self, logits, actions):
sqdiff = tf.squared_difference(actions, logits)
def logp_actions(self, actions):
sqdiff = tf.squared_difference(actions, self._logits)
reduction_axes = np.arange(-1, -self._ndim_action - 1, -1)
divconst = np.log(2.0 * np.pi) * tf.cast(tf.reduce_prod(tf.shape(actions)[1:]), tf.float32) + tf.reduce_sum(self.logstd)
return -0.5 * (tf.reduce_sum(sqdiff / self.std, reduction_axes) + divconst)

def entropy(self, logits):
def entropy(self):
return tf.reduce_sum(self._add_noise.logstd + 0.5 * np.log(2.0 * np.pi * np.e))

@property
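Note on the distribution changes above: the policy distributions become stateful, with call() caching the logits on the instance, so logp_actions, the new neglogp helper, and entropy no longer take logits as an argument, and CategoricalPd.entropy now uses a numerically stable log-sum-exp form. A rough usage sketch under those assumptions; the placeholder shapes and the argument-free CategoricalPd() constructor are illustrative, not taken from this diff.

import tensorflow as tf

from rinokeras.common.distributions import CategoricalPd

# Illustrative placeholders: a batch of 4-way action logits and sampled action indices.
logits = tf.placeholder(tf.float32, (None, 4))
actions = tf.placeholder(tf.int32, (None,))

pd = CategoricalPd()

# Calling the distribution samples an action and caches the logits internally...
sampled = pd(logits)

# ...so log-probability, negative log-probability, and entropy are now
# computed from the cached logits rather than from a logits argument.
logp = pd.logp_actions(actions)
neglogp = pd.neglogp(actions)
entropy = pd.entropy()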