#!/usr/bin/python
# -*- coding: utf-8 -*-
# author: yao62995@gmail.com
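# Trains and evaluates a DDPG agent on an OpenAI gym environment, alternating
# between a test phase (actions without exploration noise) and a training
# phase (noisy actions whose transitions are fed back to the agent).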
import os
import argparse
import gym
import numpy as np
from DDPG_deep_deterministic_policy_gradient import DDPG
from common import logger
import filter_env


class Experiment(object):

    def __init__(self, env, agent, t_max):
        self.env = env
        self.agent = agent
        self.t_max = t_max
        self.state = None

    def reset(self):
        self.agent.explore_noise.reset()
        return self.env.reset()
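
    # run_episode() plays a single episode: in test mode actions are taken
    # without exploration noise and the episode is capped at 1000 steps; in
    # training mode noisy actions are used, the episode is capped at t_max
    # steps, and each transition is fed back to the agent. Returns the episode
    # return and the final step counter.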
    def run_episode(self, test=True, monitor=False):
        # env.monitor.configure(lambda _: test and monitor)
        self.state = self.reset()
        R = 0  # return
        t = 1
        term = False
        while not term:
            # env.render()
            action = self.agent.get_action(self.state, with_noise=not test)
            # action = env.action_space.sample()
            state_n, reward, term, info = self.env.step(action)
            if test:
                term = (t >= 1000) or term
            else:
                term = (t >= self.t_max) or term
            if not test:
                self.agent.feedback(self.state, action, reward, term, state_n)
            self.state = state_n
            t += 1
            R += reward
        return R, t
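

# run() builds the filtered gym environment and the DDPG agent, then loops
# forever, alternating an evaluation phase of roughly args.test timesteps
# (no exploration noise) with a training phase of roughly args.train
# timesteps, logging the average return of each phase.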
def run(args):
    # experiment = "InvertedPendulum-v1"
    env = filter_env.makeFilteredEnv(gym.make(args.game))
    print "reward_threshold:", env.spec.reward_threshold, ", timestep_limit:", env.spec.timestep_limit
    save_dir = './result/%s/monitor/' % args.game
    if not os.path.isdir(save_dir):
        os.makedirs(save_dir)
    # env.monitor.start(save_dir, video_callable=lambda _: False, force=True)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    action_range = (env.action_space.low, env.action_space.high)
    print "action space range:", action_range
    train_dir = "./result/%s/tf/" % args.game
    agent = DDPG(state_dim, action_dim, train_dir=train_dir,
                 gpu_id=args.gpu, dim=args.dim)
    t_train, t_test = 0, 0
    experiment = Experiment(env, agent, args.tmax)
    while True:
        # test
        T = t_test
        R = []
        # env.monitor.start(save_dir, video_callable=lambda _: False, resume=True)
        while t_test - T < args.test:
            r, t = experiment.run_episode(test=True, monitor=(len(R) == 0))
            R.append(r)
            t_test += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average test return\t{} after {} timesteps of training (target: {})'.format(
                avr, t_train, env.spec.reward_threshold))
        # env.monitor.close()
        # train
        T = t_train
        R = []
        while t_train - T < args.train:
            r, t = experiment.run_episode(test=False)
            R.append(r)
            t_train += t
        if len(R) > 0:
            avr = sum(R) / len(R)
            logger.info('Average train return\t{} after {} timesteps of training'.format(avr, t_train))
        # env.monitor.close()


def parser_argument():
    str2bool = lambda v: v.lower() in ("yes", "true", "t", "1")  # currently unused
    parse = argparse.ArgumentParser()
    parse.add_argument("--game", type=str, required=True, help="game name")
    parse.add_argument("--gpu", type=int, default=0, help="gpu number")
    parse.add_argument("--dim", type=int, default=256, help="layer dim")
    parse.add_argument("--train", type=int, default=1000, help="train time step")
    parse.add_argument("--test", type=int, default=1000, help="test time step")
    parse.add_argument("--tmax", type=int, default=1000, help="time step max")
    args = parse.parse_args()
    return args


if __name__ == "__main__":
    run(parser_argument())
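

# Example invocation (a sketch; assumes DDPG_deep_deterministic_policy_gradient,
# common, and filter_env are importable from this directory and that the chosen
# gym environment id exists in the installed gym version):
#
#   python run.py --game InvertedPendulum-v1 --gpu 0 --dim 256 \
#       --train 1000 --test 1000 --tmax 1000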