import random
import gym
import make_env_
import numpy as np
import csv
from collections import deque
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import os # for creating directories
# TODO: make epsilon 0.01, episode 13900 {:05d}, close column names,
# TODO: n_steps 1450, updating_target_freq 725
#^ Set parameters
env = make_env_.make_env('swarm',benchmark=True)
num_of_agents = env.n
state_size = (2+2+2*(num_of_agents-1)*2) # [agent's velocity (2-d vector) + agent's position (2-d vector) +
# other agents' relative positions ((n-1) 2-d vectors) +
# other agents' relative velocities ((n-1) 2-d vectors)]
# in the 3-agent case this is 2+2+2*2+2*2 = 12
action_size = 4 # discrete action space [up,down,left,right]
batch_size = 32 # used for batch gradient descent update
testing = True # controls rendering and exploitation vs. exploration
render = True
n_episodes = 100000 if not testing else 50 # number of episodes to run
n_steps = 100 if not testing else 100 # number of steps per episode (currently the same in both modes)
load_episode = 13700
updating_target_freq = 50 # update period C: copy target weights W' <- W every C steps
output_dir = 'model_output/swarm/DQN_fixed_target_2v1'
# ────────────────────────────────────────────────────────────────────────────────
# if testing:
# import pyautogui
# ────────────────────────────────────────────────────────────────────────────────
#^ Define agent
class DQNAgent:
def __init__(self, state_size, action_size):
self.state_size = state_size # defined above
self.action_size = action_size # defined above
self.memory = deque(maxlen=2000) # replay buffer: a double-ended queue that drops its oldest experience once full
self.gamma = 0.99 # discount rate
self.epsilon = 1.0 if not testing else 0.01 # exploration rate: how much to act randomly; more initially than later due to epsilon decay
self.epsilon_decay = (1-0.0005) # exponential decay rate for exploration prob
self.epsilon_min = 0.01 # minimum amount of random exploration permitted
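# Rough decay horizon (assuming replay() runs once per agent per step, as it does below):
# epsilon hits epsilon_min when 0.9995^n = 0.01, i.e. n = ln(0.01)/ln(0.9995) ≈ 9,200
# replay() calls, roughly 92 episodes of 100 steps.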
self.learning_rate = 0.0001 if not testing else 0 # NN learning rate (0 in testing mode, so weights stay frozen)
self.evaluation_model = self._build_model()
self.target_model = self._build_model()
def _build_model(self):
# neural net for approximating Q-value function: Q*(s,a) ~ Q(s,a;W)
model = Sequential() #fully connected NN
model.add(Dense(self.state_size*2, input_dim=self.state_size, activation='relu')) # 1st hidden layer
model.add(Dense(self.state_size*2, activation='relu')) # 2nd hidden layer
model.add(Dense(self.action_size, activation='linear')) # 4 actions, so 4 output neurons
model.compile(loss='mse', optimizer=Adam(lr=self.learning_rate))
return model
def update_target_weights(self):
self.target_model.set_weights(self.evaluation_model.get_weights())
def remember(self, state, action, reward, next_state, done):
self.memory.append((state, action, reward, next_state, done)) # list of previous experiences, enabling re-training later
def act(self, state):
if np.random.rand() <= self.epsilon: # take a random action with probability epsilon
onehot_action = np.zeros(self.action_size+1) # 5-dim one-hot action vector; index 0 is never set
onehot_action[random.randint(1,4)] = 1 # randint is inclusive, so this picks one of indices 1-4
return onehot_action
act_values = self.evaluation_model.predict(state) # predict Q-values for the current state
# print(act_values)
act_index = np.argmax(act_values[0]) # pick the action with highest value
onehot_action = np.zeros(self.action_size+1)
onehot_action[act_index+1] = 1 # shift by 1 to skip the unused index 0
return onehot_action
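# ────────────────────────────────────────────────────────────────────────────────
# Note on the action encoding (an assumption, not verified against make_env_):
# act() returns a 5-dim one-hot vector whose index 0 is never set, so the
# environment presumably treats index 0 as a no-op and indices 1-4 as
# [up, down, left, right]. For example, np.eye(5)[2] -> [0., 0., 1., 0., 0.]
# would select "down" under that mapping.
# ────────────────────────────────────────────────────────────────────────────────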
def replay(self, batch_size): # method that trains NN with experiences sampled from memory
minibatch = random.sample(self.memory, batch_size) # sample a minibatch from memory
for state, action, reward, next_state, done in minibatch: # extract data for each minibatch sample
target = reward # if done then target = reward
state = np.reshape(state, [1, state_size]) #! reshape the state for DQN model
next_state = np.reshape(next_state, [1, state_size]) #! reshape the state for DQN model
if not done: # if not done, then predict future discounted reward
target = (reward + self.gamma * # (target) = reward + (discount rate gamma) *
np.amax(self.target_model.predict(next_state))) # (maximum target Q based on future action a')
target_f = self.evaluation_model.predict(state) # approximately map current state to future discounted reward
target_f[0][np.argmax(action)-1] = target # map one-hot index (1-4) back to Q-output index (0-3)
history = self.evaluation_model.fit(state, target_f, epochs=1, verbose=0)
# single epoch of training with x=state
if self.epsilon > self.epsilon_min:
self.epsilon *= self.epsilon_decay
return history
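# ────────────────────────────────────────────────────────────────────────────────
# The regression target built in replay() above is the standard fixed-target
# DQN backup (cf. Mnih et al., 2015):
#   y = r                                   if s' is terminal
#   y = r + gamma * max_a' Q(s', a'; W')    otherwise
# where W' are the target_model weights, copied from evaluation_model every
# updating_target_freq steps. Freezing W' between copies keeps the regression
# targets stable while the evaluation network is trained.
# ────────────────────────────────────────────────────────────────────────────────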
def load(self, name):
self.evaluation_model.load_weights(name)
self.update_target_weights()
def save(self, name):
self.evaluation_model.save_weights(name)
#^ Interact with environment
agents = [ DQNAgent(state_size, action_size) for _ in range(num_of_agents) ] # initialise agents
#! create model output folders
for i,agent in enumerate(agents):
if not os.path.exists(output_dir + "/weights/agent{}".format(i)):
os.makedirs(output_dir + "/weights/agent{}".format(i))
#! load weights if exist
for i,agent in enumerate(agents):
file_name = (output_dir + "/weights/agent{}/".format(i) +"weights_" + '{:04d}'.format(load_episode) + ".hdf5")
if os.path.isfile(file_name):
print("Loading of model weights to use for agent {}".format(i))
agent.load(file_name)
#! statistics
# ────────────────────────────────────────────────────────────────────────────────
collision_ = ['collision_{}'.format(i) for i in range(num_of_agents)]
loss_ = ['loss_{}'.format(i) for i in range(num_of_agents)]
reward_ = ['reward_{}'.format(i) for i in range(num_of_agents)]
statistics = ['episode','epsilon']+collision_+reward_+loss_
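# e.g. with 3 agents the CSV columns are:
# episode,epsilon,collision_0,collision_1,collision_2,reward_0,reward_1,reward_2,loss_0,loss_1,loss_2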
#! CSV column names (the header writing below is "closed", i.e. commented out)
# if not testing:
# with open(output_dir + '/statistics.csv', 'a') as csvFile:
# writer = csv.writer(csvFile)
# writer.writerow(statistics)
# csvFile.close()
# ────────────────────────────────────────────────────────────────────────────────
for episode in range(1,n_episodes+1): # iterate over new episodes of the game
if(episode % 500 == 0): # my idea for better exploration
n_steps+=50
updating_target_freq+=25
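# Note: n_steps and updating_target_freq grow in lockstep (+50 and +25 every
# 500 episodes), so n_steps/updating_target_freq stays at 2 and the target
# network is still synced twice per episode as episodes get longer.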
# ────────────────────────────────────────────────────────────────────────────────
#^ for statistics
statistics_row=[]
collisions = [0]*num_of_agents
rewards_ = [0]*num_of_agents
losses = [0]*num_of_agents
# ────────────────────────────────────────────────────────────────────────────────
states = env.reset() # reset states at start of each new episode of the game
for step in range(1,n_steps+1): # for every step
# ────────────────────────────────────────────────────────────────────────────────
#! sync target network weights with the evaluation network
if(step % updating_target_freq == 0):
for agent in agents:
agent.update_target_weights()
# ────────────────────────────────────────────────────────────────────────────────
if (render):
env.render()
# if (step % 4 == 0 ):
# # Take screenshot
# pic = pyautogui.screenshot()
# # Save the image
# pic.save(output_dir+'/screenshots/Screenshot_{}.png'.format(step))
# ─────────────────────────────────────────────────────────────────
# if(episode > 100 and episode < 110): env.render();
# if(episode > 500 and episode < 510): env.render();
# if(episode > 950 and episode < 1000): env.render();
# ─────────────────────────────────────────────────────────────────
all_actions=[]
for state,agent in zip(states,agents):
state = np.reshape(state, [1, state_size]) #! reshape the state for DQN model
action_i = agent.act(state)
all_actions.append(action_i)
next_states, rewards, dones, infos = env.step(all_actions) # take a step (update all agents)
# ─────────────────────────────────────────────────────────────────
#* collision,reward statistics
for i in range(num_of_agents):
collisions[i] += (infos['collision'][i])
rewards_[i] += (rewards[i])
# ────────────────────────────────────────────────────────────────────────────────
# next_states are stored raw; replay() reshapes sampled states itself
for i,agent in enumerate(agents):
agent.remember(states[i], all_actions[i], rewards[i], next_states[i], dones[i])
# remember this timestep's state, action, reward, next state and done flag
states = next_states # update the states
for i,agent in enumerate(agents):
if len(agent.memory) > batch_size:
history = agent.replay(batch_size) # train the agent by replaying the experiences of the episode
losses[i] += history.history['loss'][0]
print("\n episode: {}/{}, collisions: {}, \
rewards: {:.2f}|{:.2f}|{:.2f},\
losses: {:.2f}|{:.2f}|{:.2f}".format(episode,
n_episodes,
collisions[0],
rewards_[0],
rewards_[1],
rewards_[2],
losses[0],
losses[1],
losses[2]))
# ────────────────────────────────────────────────────────────────────────────────
#* episode,epsilon,collisions,rewards,losses statistics written
statistics_row.append(episode)
statistics_row.append(agents[0].epsilon)
statistics_row += (collisions)
statistics_row += (rewards_)
statistics_row += (losses)
if not testing:
with open(output_dir + '/statistics.csv', 'a') as csvFile:
writer = csv.writer(csvFile)
writer.writerow(statistics_row)
# ────────────────────────────────────────────────────────────────────────────────
#! save weights
if not testing:
if episode % 50 == 0:
for i,agent in enumerate(agents):
file_name = (output_dir + "/weights/agent{}/".format(i) +"weights_" + '{:04d}'.format(episode) + ".hdf5")
agent.save(file_name)