QLearningAgentAndy.py
import math
import random
from collections import defaultdict


class QLearningAgent:
    def __init__(self, actions, discount, featureExtractor, explorationProb=0.2):
        """
        Creates a Q-learning agent with a linear function approximator.
        ---------------------
        Parameters:
        - actions (list): discrete actions available to the agent.
        - discount (float): discount factor gamma in [0, 1].
        - featureExtractor (callable): maps (state, action) to a list of
          (featureName, featureValue) pairs.
        - explorationProb (float): epsilon for the epsilon-greedy policy.
        Attributes:
        - weights (defaultdict(float)): feature weights, initially zero.
        - numIters (int): number of actions taken so far.
        """
        self.actions = actions
        self.discount = discount
        self.featureExtractor = featureExtractor
        self.explorationProb = explorationProb
        self.weights = defaultdict(float)
        self.numIters = 0

    def getQ(self, state, action):
        """
        Returns the estimated Q-value Q(state, action) as the dot product of
        the weight vector and the feature vector phi(state, action).
        """
        score = 0
        for f, v in self.featureExtractor(state, action):
            score += self.weights[f] * v
        return score

    def get_action(self, state):
        """
        Asks the Q-learning agent for an action suggestion given a state/observation.
        ---------------------
        Parameters:
        - state (np.ndarray): complete or partial observation of the world.
        Returns:
        - action: action suggestion given the agent's current estimate of Q.
        """
        self.numIters += 1
        # Epsilon-greedy: explore with probability explorationProb,
        # otherwise act greedily with respect to the current Q estimates.
        if random.random() < self.explorationProb:
            return random.choice(self.actions)
        maxQ = -float('inf')
        bestAction = None
        for action in self.actions:
            q = self.getQ(state, action)  # compute Q once per action
            if q > maxQ:
                maxQ = q
                bestAction = action
        return bestAction

    def getStepSize(self):
        # 1/sqrt(t) step-size decay; the max() guards against division by
        # zero if feedback arrives before the first action has been taken.
        return 1.0 / math.sqrt(max(1, self.numIters))

    def give_feedback(self, state, action, reward, newState):
        """
        Gives the Q-learning agent feedback on the previous action based on the
        environment reward, and updates the feature weights accordingly.
        ---------------------
        Parameters:
        - state (np.ndarray): observation in which the action was taken.
        - action: the action that was taken.
        - reward (float): reward granted by the environment for that action.
        - newState (np.ndarray): resulting observation; empty if terminal.
        """
        # Value of the successor state: max over actions, or 0 if terminal.
        vOpt = 0
        if len(newState) != 0:
            vOpt = max(self.getQ(newState, nextAction) for nextAction in self.actions)
        # Standard Q-learning update with linear function approximation:
        # w <- w - eta * (Q(s, a) - (r + gamma * vOpt)) * phi(s, a)
        target = reward + self.discount * vOpt
        inner = self.getStepSize() * (self.getQ(state, action) - target)
        for f, v in self.featureExtractor(state, action):
            self.weights[f] -= inner * v
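

# A minimal sketch of a feature extractor compatible with QLearningAgent.
# This identity extractor is an assumption for illustration only; it is not
# part of the original agent, which expects the caller to supply a
# featureExtractor (e.g. from the function_approximators module).
def identityFeatureExtractor(state, action):
    # One indicator feature per (state, action) pair, which reduces the
    # linear approximator to a plain tabular Q-table.
    return [((tuple(state), action), 1.0)]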


if __name__ == "__main__":
    print("QLearningAgent smoke test")