Merge pull request #577 from muupan/atlas
Add an example script for RoboschoolAtlasForwardWalk-v1
ummavi authored Nov 19, 2019
2 parents c9c32be + 8fe46ce commit 23f3032
Showing 6 changed files with 297 additions and 0 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -11,6 +11,7 @@ ChainerRL is a deep reinforcement learning library that implements various state
![Breakout](assets/breakout.gif)
![Humanoid](assets/humanoid.gif)
![Grasping](assets/grasping.gif)
![Atlas](examples/atlas/assets/atlas.gif)

## Installation

33 changes: 33 additions & 0 deletions examples/atlas/README.md
@@ -0,0 +1,33 @@
# Atlas Learning to Walk

This directory contains an example script that trains a simulated model of the Boston Dynamics Atlas robot to walk, using the `RoboschoolAtlasForwardWalk-v1` environment from OpenAI Roboschool.

![Atlas](assets/atlas.gif)

## Files

- `train_soft_actor_critic_atlas.py`: Training and evaluation of a Soft Actor-Critic agent

## Requirements

- roboschool (https://github.com/openai/roboschool)
- You can install it from PyPI: `pip install roboschool` (a quick import check is shown below)
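
If the installation succeeded, the environment should be creatable from Python. The line below is only a quick sanity check; it assumes roboschool registers its environments with Gym on import, which is how the library normally behaves.
```
python -c "import roboschool, gym; print(gym.make('RoboschoolAtlasForwardWalk-v1').observation_space)"
```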

## How to run

Train the agent for 10M steps, using 4 simulators in parallel:
```
python examples/atlas/train_soft_actor_critic_atlas.py --num-envs 4
```

Watch how the learned agent performs. `<path to agent>` must be the path to a directory where the agent was saved (e.g. `10000000_finish` or `best`, created inside the output directory specified by `--outdir`). Pass `--monitor` to also write MP4 video files.
```
python examples/atlas/train_soft_actor_critic_atlas.py --demo --render --load <path to agent> [--monitor]
```

## Results

Below is the learning curve of the example script, run with 4 CPUs and a single GPU and averaged over three trials with different random seeds. Each trial took around 4 days to finish 10M steps. The agent is evaluated for 20 episodes after every 100K timesteps.

![LearningCurve](assets/learningcurve.png)
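
To reproduce a plot like the one above from your own run, you can load the `scores.txt` file that ChainerRL's training loop writes to the output directory. This is a minimal sketch, assuming a tab-separated file with `steps` and `mean` columns (column names may vary between ChainerRL versions) and that pandas and matplotlib are installed.
```
import pandas as pd
import matplotlib.pyplot as plt

# scores.txt is written to the directory passed via --outdir during training.
scores = pd.read_csv('results/scores.txt', sep='\t')

plt.plot(scores['steps'], scores['mean'])
plt.xlabel('timesteps')
plt.ylabel('mean evaluation return over 20 episodes')
plt.savefig('learningcurve.png')
```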

Binary file added examples/atlas/assets/atlas.gif
Binary file added examples/atlas/assets/learningcurve.png
250 changes: 250 additions & 0 deletions examples/atlas/train_soft_actor_critic_atlas.py
@@ -0,0 +1,250 @@
"""A training script of Soft Actor-Critic on RoboschoolAtlasForwardWalk-v1."""
from __future__ import print_function
from __future__ import unicode_literals
from __future__ import division
from __future__ import absolute_import
from future import standard_library
standard_library.install_aliases() # NOQA

import argparse
import functools
import logging
import os
import sys

import chainer
from chainer import functions as F
from chainer import links as L
from chainer import optimizers
import gym
import gym.wrappers
import numpy as np

import chainerrl
from chainerrl import experiments
from chainerrl import misc
from chainerrl import replay_buffer


def concat_obs_and_action(obs, action):
"""Concat observation and action to feed the critic."""
return F.concat((obs, action), axis=-1)


def make_env(args, seed, test):
if args.env.startswith('Roboschool'):
import roboschool # NOQA
env = gym.make(args.env)
# Unwrap the TimeLimit wrapper
assert isinstance(env, gym.wrappers.TimeLimit)
env = env.env
# Use different random seeds for train and test envs
env_seed = 2 ** 32 - 1 - seed if test else seed
env.seed(int(env_seed))
# Cast observations to float32 because our model uses float32
env = chainerrl.wrappers.CastObservationToFloat32(env)
# Normalize action space to [-1, 1]^n
env = chainerrl.wrappers.NormalizeActionSpace(env)
if args.monitor:
env = chainerrl.wrappers.Monitor(
env, args.outdir, force=True, video_callable=lambda _: True)
if args.render:
env = chainerrl.wrappers.Render(env, mode='human')
return env


def main():

parser = argparse.ArgumentParser()
parser.add_argument('--outdir', type=str, default='results',
help='Directory path to save output files.'
' If it does not exist, it will be created.')
parser.add_argument('--env', type=str,
default='RoboschoolAtlasForwardWalk-v1',
help='OpenAI Gym env to perform algorithm on.')
parser.add_argument('--num-envs', type=int, default=4,
help='Number of envs run in parallel.')
parser.add_argument('--seed', type=int, default=0,
help='Random seed [0, 2 ** 32)')
parser.add_argument('--gpu', type=int, default=0,
help='GPU to use, set to -1 if no GPU.')
parser.add_argument('--load', type=str, default='',
help='Directory to load agent from.')
parser.add_argument('--steps', type=int, default=10 ** 7,
help='Total number of timesteps to train the agent.')
parser.add_argument('--eval-n-runs', type=int, default=20,
help='Number of episodes run for each evaluation.')
parser.add_argument('--eval-interval', type=int, default=100000,
help='Interval in timesteps between evaluations.')
parser.add_argument('--replay-start-size', type=int, default=10000,
help='Minimum replay buffer size before ' +
'performing gradient updates.')
parser.add_argument('--update-interval', type=int, default=1,
help='Interval in timesteps between model updates.')
parser.add_argument('--batch-size', type=int, default=256,
help='Minibatch size')
parser.add_argument('--render', action='store_true',
help='Render env states in a GUI window.')
parser.add_argument('--demo', action='store_true',
help='Just run evaluation, not training.')
parser.add_argument('--monitor', action='store_true',
help='Wrap env with Monitor to write videos.')
parser.add_argument('--log-interval', type=int, default=1000,
help='Interval in timesteps between outputting log'
' messages during training')
parser.add_argument('--logger-level', type=int, default=logging.INFO,
help='Level of the root logger.')
parser.add_argument('--n-hidden-channels', type=int, default=1024,
help='Number of hidden channels of NN models.')
parser.add_argument('--discount', type=float, default=0.98,
help='Discount factor.')
parser.add_argument('--n-step-return', type=int, default=3,
help='N-step return.')
parser.add_argument('--lr', type=float, default=3e-4,
help='Learning rate.')
parser.add_argument('--adam-eps', type=float, default=1e-1,
help='Adam eps.')
args = parser.parse_args()

logging.basicConfig(level=args.logger_level)

args.outdir = experiments.prepare_output_dir(
args, args.outdir, argv=sys.argv)
print('Output files are saved in {}'.format(args.outdir))

# Set a random seed used in ChainerRL
misc.set_random_seed(args.seed, gpus=(args.gpu,))

# Set different random seeds for different subprocesses.
# If seed=0 and processes=4, subprocess seeds are [0, 1, 2, 3].
# If seed=1 and processes=4, subprocess seeds are [4, 5, 6, 7].
process_seeds = np.arange(args.num_envs) + args.seed * args.num_envs
assert process_seeds.max() < 2 ** 32

def make_batch_env(test):
return chainerrl.envs.MultiprocessVectorEnv(
[functools.partial(make_env, args, process_seeds[idx], test)
for idx in range(args.num_envs)])

sample_env = make_env(args, process_seeds[0], test=False)
timestep_limit = sample_env.spec.tags.get(
'wrapper_config.TimeLimit.max_episode_steps')
obs_space = sample_env.observation_space
action_space = sample_env.action_space
print('Observation space:', obs_space)
print('Action space:', action_space)
del sample_env

action_size = action_space.low.size

winit = chainer.initializers.GlorotUniform()
winit_policy_output = chainer.initializers.GlorotUniform()

def squashed_diagonal_gaussian_head(x):
assert x.shape[-1] == action_size * 2
mean, log_scale = F.split_axis(x, 2, axis=1)
log_scale = F.clip(log_scale, -20., 2.)
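# Clipping the log-scale to [-20, 2] keeps the variance in a numerically
# safe range; this is a common convention in SAC implementations rather
# than something required by the algorithm.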
var = F.exp(log_scale * 2)
return chainerrl.distribution.SquashedGaussianDistribution(
mean, var=var)

policy = chainer.Sequential(
L.Linear(None, args.n_hidden_channels, initialW=winit),
F.relu,
L.Linear(None, args.n_hidden_channels, initialW=winit),
F.relu,
L.Linear(None, action_size * 2, initialW=winit_policy_output),
squashed_diagonal_gaussian_head,
)
policy_optimizer = optimizers.Adam(
args.lr, eps=args.adam_eps).setup(policy)

def make_q_func_with_optimizer():
q_func = chainer.Sequential(
concat_obs_and_action,
L.Linear(None, args.n_hidden_channels, initialW=winit),
F.relu,
L.Linear(None, args.n_hidden_channels, initialW=winit),
F.relu,
L.Linear(None, 1, initialW=winit),
)
q_func_optimizer = optimizers.Adam(
args.lr, eps=args.adam_eps).setup(q_func)
return q_func, q_func_optimizer

q_func1, q_func1_optimizer = make_q_func_with_optimizer()
q_func2, q_func2_optimizer = make_q_func_with_optimizer()

# Draw the computational graph and save it in the output directory.
fake_obs = chainer.Variable(
policy.xp.zeros_like(obs_space.low, dtype=np.float32)[None],
name='observation')
fake_action = chainer.Variable(
policy.xp.zeros_like(action_space.low, dtype=np.float32)[None],
name='action')
chainerrl.misc.draw_computational_graph(
[policy(fake_obs)], os.path.join(args.outdir, 'policy'))
chainerrl.misc.draw_computational_graph(
[q_func1(fake_obs, fake_action)], os.path.join(args.outdir, 'q_func1'))
chainerrl.misc.draw_computational_graph(
[q_func2(fake_obs, fake_action)], os.path.join(args.outdir, 'q_func2'))

rbuf = replay_buffer.ReplayBuffer(10 ** 6, num_steps=args.n_step_return)
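# num_steps=args.n_step_return makes the buffer return N-step transitions,
# so the critics are trained with N-step TD targets.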

def burnin_action_func():
"""Select random actions until model is updated one or more times."""
return np.random.uniform(
action_space.low, action_space.high).astype(np.float32)

# Hyperparameters in http://arxiv.org/abs/1802.09477
agent = chainerrl.agents.SoftActorCritic(
policy,
q_func1,
q_func2,
policy_optimizer,
q_func1_optimizer,
q_func2_optimizer,
rbuf,
gamma=args.discount,
update_interval=args.update_interval,
replay_start_size=args.replay_start_size,
gpu=args.gpu,
minibatch_size=args.batch_size,
burnin_action_func=burnin_action_func,
entropy_target=-action_size,
temperature_optimizer=chainer.optimizers.Adam(
args.lr, eps=args.adam_eps),
)

if len(args.load) > 0:
agent.load(args.load)

if args.demo:
eval_env = make_env(args, seed=0, test=True)
eval_stats = experiments.eval_performance(
env=eval_env,
agent=agent,
n_steps=None,
n_episodes=args.eval_n_runs,
max_episode_len=timestep_limit,
)
print('n_runs: {} mean: {} median: {} stdev {}'.format(
args.eval_n_runs, eval_stats['mean'], eval_stats['median'],
eval_stats['stdev']))
else:
experiments.train_agent_batch_with_evaluation(
agent=agent,
env=make_batch_env(test=False),
eval_env=make_batch_env(test=True),
outdir=args.outdir,
steps=args.steps,
eval_n_steps=None,
eval_n_episodes=args.eval_n_runs,
eval_interval=args.eval_interval,
log_interval=args.log_interval,
max_episode_len=timestep_limit,
)


if __name__ == '__main__':
main()
13 changes: 13 additions & 0 deletions examples_tests/atlas/test_soft_actor_critic.sh
@@ -0,0 +1,13 @@
#!/bin/bash

set -Ceu

outdir=$(mktemp -d)

gpu="$1"

# atlas/soft_actor_critic
# Use Pendulum-v0 to test without installing roboschool
python examples/atlas/train_soft_actor_critic_atlas.py --gpu $gpu --steps 100 --outdir $outdir/atlas/soft_actor_critic --env Pendulum-v0
model=$(find $outdir/atlas/soft_actor_critic -name "*_finish")
python examples/atlas/train_soft_actor_critic_atlas.py --demo --load $model --eval-n-runs 1 --outdir $outdir/temp --gpu $gpu --env Pendulum-v0
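
For reference, the test script takes the GPU id as its only argument, so a CPU-only run from the repository root would look like the following (a usage sketch; `-1` disables the GPU, matching the `--gpu` convention of the training script):
```
bash examples_tests/atlas/test_soft_actor_critic.sh -1
```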
