#######################################################################
# Copyright (C)                                                       #
# 2016 - 2019 Pinard Liu(liujianping-ok@163.com)                      #
# https://www.cnblogs.com/pinard                                      #
# Permission given to modify the code as long as you keep this        #
# declaration at the top                                              #
#######################################################################
## https://www.cnblogs.com/pinard/p/10137696.html ##
## Reinforcement Learning (13): Policy Gradient ##
import gym
import tensorflow as tf
import numpy as np
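# Compatibility sketch (an assumption about the runtime, not part of the
# original script): the code below uses the TensorFlow 1.x graph API
# (placeholders, sessions). If it is run under TensorFlow 2.x, one common
# workaround is to replace the `import tensorflow as tf` line above with the
# v1 compatibility shim, roughly:
#
#   import tensorflow.compat.v1 as tf
#   tf.disable_v2_behavior()
#
# Under a genuine TF 1.x install, the plain import works as written.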
# Hyper Parameters
GAMMA = 0.95  # discount factor
LEARNING_RATE = 0.01
class Policy_Gradient():
    def __init__(self, env):
        # init some parameters
        self.time_step = 0
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.n
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []
        self.create_softmax_network()
        # Init session
        self.session = tf.InteractiveSession()
        self.session.run(tf.global_variables_initializer())

    def create_softmax_network(self):
        # network weights
        W1 = self.weight_variable([self.state_dim, 20])
        b1 = self.bias_variable([20])
        W2 = self.weight_variable([20, self.action_dim])
        b2 = self.bias_variable([self.action_dim])
        # input layer
        self.state_input = tf.placeholder("float", [None, self.state_dim])
        self.tf_acts = tf.placeholder(tf.int32, [None, ], name="actions_num")
        self.tf_vt = tf.placeholder(tf.float32, [None, ], name="actions_value")
        # hidden layer
        h_layer = tf.nn.relu(tf.matmul(self.state_input, W1) + b1)
        # softmax layer
        self.softmax_input = tf.matmul(h_layer, W2) + b2
        # softmax output
        self.all_act_prob = tf.nn.softmax(self.softmax_input, name='act_prob')
        self.neg_log_prob = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=self.softmax_input,
                                                                           labels=self.tf_acts)
        self.loss = tf.reduce_mean(self.neg_log_prob * self.tf_vt)  # reward guided loss
        self.train_op = tf.train.AdamOptimizer(LEARNING_RATE).minimize(self.loss)
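    # How the loss above implements the policy gradient (a brief sketch of the
    # math, no extra functionality): sparse_softmax_cross_entropy_with_logits
    # returns -log(pi(a_t | s_t)) for the action actually taken, so
    #
    #   loss = mean_t[ -log(pi(a_t | s_t)) * G_t ]
    #
    # where G_t (fed in as tf_vt) is the normalized discounted return.
    # Minimizing this loss is gradient *ascent* on E[ log(pi(a_t | s_t)) * G_t ],
    # i.e. the REINFORCE estimator of the policy gradient:
    #
    #   grad_theta J(theta) ~= (1/T) * sum_t grad_theta log(pi_theta(a_t | s_t)) * G_t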
    def weight_variable(self, shape):
        initial = tf.truncated_normal(shape)
        return tf.Variable(initial)

    def bias_variable(self, shape):
        initial = tf.constant(0.01, shape=shape)
        return tf.Variable(initial)
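    # Note on initialization: tf.truncated_normal defaults to stddev=1.0, which
    # is a fairly large scale for this small two-layer network; many
    # implementations use a smaller scale instead (a common alternative, not
    # what this script does), e.g.:
    #
    #   initial = tf.truncated_normal(shape, stddev=0.1)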
    def choose_action(self, observation):
        prob_weights = self.session.run(self.all_act_prob, feed_dict={self.state_input: observation[np.newaxis, :]})
        action = np.random.choice(range(prob_weights.shape[1]), p=prob_weights.ravel())  # select action w.r.t the actions prob
        return action
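    # Action selection here is stochastic: the action is sampled from the
    # softmax distribution itself (e.g. with prob_weights == [[0.7, 0.3]],
    # action 0 is drawn 70% of the time), so no separate epsilon-greedy
    # exploration term is needed.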
    def store_transition(self, s, a, r):
        self.ep_obs.append(s)
        self.ep_as.append(a)
        self.ep_rs.append(r)
    def learn(self):
        # compute the discounted return G_t for every step of the finished
        # episode, working backwards from the last reward
        discounted_ep_rs = np.zeros_like(self.ep_rs)
        running_add = 0
        for t in reversed(range(0, len(self.ep_rs))):
            running_add = running_add * GAMMA + self.ep_rs[t]
            discounted_ep_rs[t] = running_add
        # normalize the returns (zero mean, unit variance) to reduce the
        # variance of the gradient estimate
        discounted_ep_rs -= np.mean(discounted_ep_rs)
        discounted_ep_rs /= np.std(discounted_ep_rs)
        # train on episode
        self.session.run(self.train_op, feed_dict={
            self.state_input: np.vstack(self.ep_obs),
            self.tf_acts: np.array(self.ep_as),
            self.tf_vt: discounted_ep_rs,
        })
        self.ep_obs, self.ep_as, self.ep_rs = [], [], []  # empty episode data
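# A small worked example of the return computation above (a sketch, not part
# of the training loop): with GAMMA = 0.95 and episode rewards [1.0, 1.0, 1.0],
# the backward pass gives
#   G_2 = 1.0
#   G_1 = 1.0 + 0.95 * 1.0  = 1.95
#   G_0 = 1.0 + 0.95 * 1.95 = 2.8525
# and these values are then mean-centred and divided by their standard
# deviation before being fed in as tf_vt.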
# Hyper Parameters
ENV_NAME = 'CartPole-v0'
EPISODE = 3000  # Episode limitation
STEP = 3000  # Step limitation in an episode
TEST = 10  # Number of test episodes run every 100 training episodes
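# For reference: CartPole-v0 caps an episode at 200 steps and is usually
# treated as solved at an average reward of 195 over 100 consecutive trials,
# so the STEP limit above is never the binding constraint on this environment.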
def main():
    # initialize OpenAI Gym env and the policy gradient agent
    env = gym.make(ENV_NAME)
    agent = Policy_Gradient(env)
    for episode in range(EPISODE):
        # initialize task
        state = env.reset()
        # Train
        for step in range(STEP):
            action = agent.choose_action(state)  # sample action from the policy during training
            next_state, reward, done, _ = env.step(action)
            agent.store_transition(state, action, reward)
            state = next_state
            if done:
                # print("stick for ", step, " steps")
                agent.learn()
                break
        # Test every 100 episodes
        if episode % 100 == 0:
            total_reward = 0
            for i in range(TEST):
                state = env.reset()
                for j in range(STEP):
                    env.render()
                    action = agent.choose_action(state)  # direct action for test
                    state, reward, done, _ = env.step(action)
                    total_reward += reward
                    if done:
                        break
            ave_reward = total_reward / TEST
            print('episode: ', episode, 'Evaluation Average Reward:', ave_reward)

if __name__ == '__main__':
    main()
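# Gym compatibility sketch (an assumption about newer environments, not part
# of the original script): with gym >= 0.26 the reset/step API changed, so the
# calls above would roughly become
#
#   state, _ = env.reset()
#   next_state, reward, terminated, truncated, _ = env.step(action)
#   done = terminated or truncated
#
# Under the classic Gym releases this script targets, the bare reset() and
# 4-tuple step() used above work as written.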