# learning.py
import sys
sys.path.insert(0, '/users/aarnavjindal/desktop/rl')
from FlapPyBird import flappy
import numpy as np
import random
import csv
from nn import LossHistory, neural_net
import os.path
import timeit

NUM_INPUT = 11
GAMMA = 1   # Discount factor (1 = future rewards are not discounted).
LR = 0.7    # Q-learning update rate: how far targets move toward new estimates.
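
# A sketch of the target built in process_minibatch() below: a soft
# Q-learning update applied to the network's predictions,
#
#   Q(s, a) <- (1 - LR) * Q(s, a) + LR * (r + GAMMA * max_a' Q(s', a'))
#
# with terminal transitions (reward == -1000) targeting the raw reward.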


def train_net(model, params):
    filename = params_to_filename(params)

    train_frames = 300000  # Number of frames to play.
    batchSize = params['batchSize']
    buffer = params['buffer']

    # Just stuff used below.
    t = 0
    replay = []  # Stores tuples of (S, A, R, S').
    loss_log = []

    # Create a new game instance.
    game_state = flappy.Game()
    game_state.init_elements()

    # Get the initial state by doing nothing.
    state, _ = game_state.frame_step(0)

    # Run the frames.
    while t < train_frames:
        t += 1

        # Choose an action: here always the greedy one (no explicit exploration).
        qval = model.predict(np.array([state]))[0]
        action = np.argmax(qval)
        if t % 500 == 0:
            print(qval)

        # Take action, observe new state and get our treat.
        new_state, reward = game_state.frame_step(action)
        if t % 1000 == 0:
            print(t, action, state, reward)

        # Experience replay storage.
        replay.append((state, action, reward, new_state))

        # If we're done observing, start training.
        if t > batchSize:
            # If we've stored enough in our buffer, pop the oldest.
            if len(replay) > buffer:
                replay.pop(0)

            # Randomly sample our experience replay memory.
            minibatch = random.sample(replay, batchSize)

            # Get training values.
            X_train, y_train = process_minibatch(minibatch, model)

            # Train the model on this batch.
            history = LossHistory()
            model.fit(
                X_train, y_train, batch_size=batchSize,
                epochs=1, verbose=0, callbacks=[history]
            )
            loss_log.append(history.losses)

        # Update the starting state with S'.
        state = new_state

        # A reward of -1000 marks a crash, so restart the game.
        if reward == -1000:
            game_state.init_elements()
            state, _ = game_state.frame_step(0)

        # Save the model every 25,000 frames.
        if t % 25000 == 0:
            model.save_weights('results/saved-models/' + filename + '-' +
                               str(t) + '.h5',
                               overwrite=True)
            print("Saving model %s - %d" % (filename, t))

        # Log the loss history every 50,000 frames.
        if t % 50000 == 0:
            log_results(filename, loss_log)


def log_results(filename, loss_log):
    # Save the losses to a file so we can graph them later.
    with open('results/sonar-frames/loss_data-' + filename + '.csv', 'w') as lf:
        wr = csv.writer(lf)
        for loss_item in loss_log:
            wr.writerow(loss_item)


def process_minibatch(minibatch, model):
    mb_len = len(minibatch)

    old_states = np.zeros(shape=(mb_len, NUM_INPUT))
    actions = np.zeros(shape=(mb_len,))
    rewards = np.zeros(shape=(mb_len,))
    new_states = np.zeros(shape=(mb_len, NUM_INPUT))

    # Unpack the (S, A, R, S') tuples into batched arrays.
    for i, m in enumerate(minibatch):
        old_state_m, action_m, reward_m, new_state_m = m
        old_states[i, :] = old_state_m[:]
        actions[i] = action_m
        rewards[i] = reward_m
        new_states[i, :] = new_state_m[:]

    # Predict Q-values for the old and new states, one batched call each.
    old_qvals = model.predict(old_states, batch_size=mb_len)
    new_qvals = model.predict(new_states, batch_size=mb_len)
    maxQs = np.max(new_qvals, axis=1)

    # Targets start from the current predictions; copy so old_qvals is
    # not mutated in place.
    y = old_qvals.copy()
    non_term_inds = np.where(rewards != -1000)[0]
    term_inds = np.where(rewards == -1000)[0]

    # Blend the old estimate with the bootstrapped target for the action
    # actually taken (see the update rule near the top of the file).
    acted = actions[non_term_inds].astype(int)
    y[non_term_inds, acted] = ((1 - LR) * y[non_term_inds, acted]
                               + LR * (rewards[non_term_inds]
                                       + GAMMA * maxQs[non_term_inds]))
    # Terminal transitions get the raw (crash) reward as their target.
    y[term_inds, actions[term_inds].astype(int)] = rewards[term_inds]

    X_train = old_states
    y_train = y
    return X_train, y_train
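
# Shape check (illustrative): with batchSize = 512 and NUM_INPUT = 11,
# X_train is (512, 11) and y_train is (512, num_actions), where
# num_actions is the network's output width (presumably 2 for Flappy
# Bird: do nothing / flap).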


def params_to_filename(params):
    return str(params['nn'][0]) + '-' + str(params['nn'][1]) + '-' + \
        str(params['batchSize']) + '-' + str(params['buffer']) + '-' + \
        'ver' + str(params['ver'])
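
# For the parameters used under __main__ below, this yields
# '256-256-512-50000-ver19'.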


if __name__ == "__main__":
    nn_param = [256, 256]
    params = {
        "batchSize": 512,
        "buffer": 50000,
        "nn": nn_param,
        "ver": 19
    }
    model = neural_net(NUM_INPUT, nn_param, '')
    train_net(model, params)
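
# Note: model.save_weights() and log_results() write into
# 'results/saved-models/' and 'results/sonar-frames/'; both directories
# are assumed to exist before training starts.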