"""
SARSA(lambda) with eligibility traces (backward view) to solve the
Easy21 assignment from David Silver's RL course.
"""
import numpy as np

from easy21 import Easy21
# state space: dealer's showing card (1-10) and the player's sum (1-21)
DEALER_STATE = 10
PLAYER_STATE = 21
ACTIONS = 2  # 0 = stick, 1 = hit
# discount factor
GAMMA = 1
# action-value table Q(s, a)
q_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=np.float32)
# visit counts N(s) and N(s, a)
possible_states = np.zeros((DEALER_STATE, PLAYER_STATE), dtype=int)
possible_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=int)


def get_count(state):
    """Return N(s): the number of times this state has been visited."""
    dealer, player = state
    # states are 1-indexed, numpy arrays are 0-indexed
    return possible_states[dealer - 1, player - 1]
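
# For example, get_count((10, 21)) reads possible_states[9, 20].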


def ep_greedy(state):
    """
    Epsilon-greedy action selection.

    params:
        state - current game state
    return:
        action - the greedy action with probability 1 - eps,
                 otherwise a uniformly random action
    """
    n_0 = 100
    # number of times this state has been visited
    n_s = get_count(state)
    # eps decays from 1 towards 0 as the state is visited more often
    eps = n_0 / (n_0 + n_s)
    # with probability 1 - eps choose the greedy action: draw p uniformly
    # from [0, 1); if p is bigger than eps we exploit, otherwise we explore
    # https://jamesmccaffrey.wordpress.com/2017/11/30/the-epsilon-greedy-algorithm/
    p = np.random.uniform(0, 1)
    if p > eps:
        # exploit: the action with the highest estimated value
        dealer, player = state
        return np.argmax(q_states_actions[dealer - 1, player - 1, :])
    # explore: pick stick (0) or hit (1) uniformly at random
    return np.random.randint(ACTIONS)
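
# With n_0 = 100 the exploration rate decays with experience: an unvisited
# state gives eps = 100/100 = 1.0 (always explore), 100 visits give
# eps = 100/200 = 0.5, and 900 visits give eps = 100/1000 = 0.1.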


def main():
    # initialise the environment
    env = Easy21()
    # episodes to run per lambda setting
    runs = 1000
    global q_states_actions
    global possible_states
    global possible_states_actions
    for lambda_x in [i * 0.1 for i in range(10)]:  # lambda = 0.0, 0.1, ..., 0.9
        # start each lambda from scratch, so the reported error reflects this
        # lambda alone rather than learning carried over from the previous run
        q_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=np.float32)
        possible_states = np.zeros((DEALER_STATE, PLAYER_STATE), dtype=int)
        possible_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=int)
        # run our episodes
        for _ in range(runs):
            # start the game
            env.start()
            # initialise the first state and action
            start_state = env.state()
            action = ep_greedy(start_state)
            # re-initialise eligibility traces at the start of each episode
            e_states_actions = np.zeros((DEALER_STATE, PLAYER_STATE, ACTIONS), dtype=np.float32)
            while not env.is_finished():
                current_state = env.state()
                # take action A, observe S' and reward R
                next_state, reward = env.step(current_state, action)
                next_dealer, next_player = next_state
                current_dealer, current_player = current_state
                current_q = q_states_actions[current_dealer - 1, current_player - 1, action]
                # if S' is still inside the state space, the episode continues
                if next_dealer <= 10 and next_player <= 21:
                    # choose A' from S' using epsilon-greedy
                    next_action = ep_greedy(next_state)
                    # TD error: delta = R + gamma * Q(S', A') - Q(S, A)
                    next_q = q_states_actions[next_dealer - 1, next_player - 1, next_action]
                    td_error = reward + GAMMA * next_q - current_q
                else:
                    # terminal step: Q(S', A') is zero by definition
                    td_error = reward - current_q
                # bump the accumulating trace and the visit counts
                e_states_actions[current_dealer - 1, current_player - 1, action] += 1
                possible_states[current_dealer - 1, current_player - 1] += 1
                possible_states_actions[current_dealer - 1, current_player - 1, action] += 1
                # per-pair step size alpha = 1 / N(s, a); never-visited pairs
                # have zero eligibility, so their (clamped) step size is unused
                alpha = 1.0 / np.maximum(possible_states_actions, 1)
                # backward-view update: move every pair towards the TD target
                # in proportion to its eligibility, then decay all traces
                q_states_actions += alpha * td_error * e_states_actions
                e_states_actions *= GAMMA * lambda_x
                if next_dealer <= 10 and next_player <= 21:
                    action = next_action
        # read the reference ("true") action-values from disk, restore the
        # original array shape, and report the mean squared error for this lambda
        q_true = np.loadtxt('q_true.txt').reshape((DEALER_STATE, PLAYER_STATE, ACTIONS))
        q_diff = q_states_actions - q_true
        print("lambda = %.1f, MSE = %f" % (lambda_x, np.mean(np.square(q_diff))))


if __name__ == '__main__':
    main()
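
# Usage sketch (assumes easy21.py provides an Easy21 class with start(),
# state(), step(state, action) -> (next_state, reward) and is_finished(),
# and that q_true.txt holds 10 * 21 * 2 = 420 reference values, e.g. from a
# long Monte Carlo run of the same assignment):
#
#     $ python sarsa.py
#     lambda = 0.0, MSE = ...
#     lambda = 0.1, MSE = ...
#     ...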