-
Notifications
You must be signed in to change notification settings - Fork 0
/
mdp.py
171 lines (140 loc) · 6.82 KB
/
mdp.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""Markov Decision Processes (Chapter 17)
First we define an MDP, and the special case of a GridMDP, in which
states are laid out in a 2-dimensional grid. We also represent a policy
as a dictionary of {state:action} pairs, and a Utility function as a
dictionary of {state:number} pairs. We then define the value_iteration
and policy_iteration algorithms."""
from utils import *
class MDP:
"""A Markov Decision Process, defined by an initial state, transition model,
and reward function. We also keep track of a gamma value, for use by
algorithms. The transition model is represented somewhat differently from
the text. Instead of P(s' | s, a) being a probability number for each
state/state/action triplet, we instead have T(s, a) return a list of (p, s')
pairs. We also keep track of the possible states, terminal states, and
actions for each state. [page 646]"""
def __init__(self, init, actlist, terminals, gamma=.9):
update(self, init=init, actlist=actlist, terminals=terminals,
gamma=gamma, states=set(), reward={})
def R(self, state):
"Return a numeric reward for this state."
return self.reward[state]
def T(self, state, action):
"""Transition model. From a state and an action, return a list
of (probability, result-state) pairs."""
abstract
def actions(self, state):
"""Set of actions that can be performed in this state. By default, a
fixed list of actions, except for terminal states. Override this
method if you need to specialize by state."""
if state in self.terminals:
return [None]
else:
return self.actlist
class GridMDP(MDP):
"""A two-dimensional grid MDP, as in [Figure 17.1]. All you have to do is
specify the grid as a list of lists of rewards; use None for an obstacle
(unreachable state). Also, you should specify the terminal states.
An action is an (x, y) unit vector; e.g. (1, 0) means move east."""
def __init__(self, grid, terminals, init=(0, 0), gamma=.9):
grid.reverse() ## because we want row 0 on bottom, not on top
MDP.__init__(self, init, actlist=orientations,
terminals=terminals, gamma=gamma)
update(self, grid=grid, rows=len(grid), cols=len(grid[0]))
for x in range(self.cols):
for y in range(self.rows):
self.reward[x, y] = grid[y][x]
if grid[y][x] is not None:
self.states.add((x, y))
def T(self, state, action):
if action is None:
return [(0.0, state)]
else:
return [(0.8, self.go(state, action)),
(0.1, self.go(state, turn_right(action))),
(0.1, self.go(state, turn_left(action)))]
def go(self, state, direction):
"Return the state that results from going in this direction."
state1 = vector_add(state, direction)
return if_(state1 in self.states, state1, state)
def to_grid(self, mapping):
"""Convert a mapping from (x, y) to v into a [[..., v, ...]] grid."""
return list(reversed([[mapping.get((x,y), None)
for x in range(self.cols)]
for y in range(self.rows)]))
def to_arrows(self, policy):
chars = {(1, 0):'>', (0, 1):'^', (-1, 0):'<', (0, -1):'v', None: '.'}
return self.to_grid(dict([(s, chars[a]) for (s, a) in policy.items()]))
#______________________________________________________________________________
Fig[17,1] = GridMDP([[-0.04, -0.04, -0.04, +1],
[-0.04, None, -0.04, -1],
[-0.04, -0.04, -0.04, -0.04]],
terminals=[(3, 2), (3, 1)])
#______________________________________________________________________________
def value_iteration(mdp, epsilon=0.001):
"Solving an MDP by value iteration. [Fig. 17.4]"
U1 = dict([(s, 0) for s in mdp.states])
R, T, gamma = mdp.R, mdp.T, mdp.gamma
while True:
U = U1.copy()
delta = 0
for s in mdp.states:
U1[s] = R(s) + gamma * max([sum([p * U[s1] for (p, s1) in T(s, a)])
for a in mdp.actions(s)])
delta = max(delta, abs(U1[s] - U[s]))
if delta < epsilon * (1 - gamma) / gamma:
return U
def best_policy(mdp, U):
"""Given an MDP and a utility function U, determine the best policy,
as a mapping from state to action. (Equation 17.4)"""
pi = {}
for s in mdp.states:
pi[s] = argmax(mdp.actions(s), lambda a:expected_utility(a, s, U, mdp))
return pi
def expected_utility(a, s, U, mdp):
"The expected utility of doing a in state s, according to the MDP and U."
return sum([p * U[s1] for (p, s1) in mdp.T(s, a)])
#______________________________________________________________________________
def policy_iteration(mdp):
"Solve an MDP by policy iteration [Fig. 17.7]"
U = dict([(s, 0) for s in mdp.states])
pi = dict([(s, random.choice(mdp.actions(s))) for s in mdp.states])
while True:
U = policy_evaluation(pi, U, mdp)
unchanged = True
for s in mdp.states:
a = argmax(mdp.actions(s), lambda a: expected_utility(a,s,U,mdp))
if a != pi[s]:
pi[s] = a
unchanged = False
if unchanged:
return pi
def policy_evaluation(pi, U, mdp, k=20):
"""Return an updated utility mapping U from each state in the MDP to its
utility, using an approximation (modified policy iteration)."""
R, T, gamma = mdp.R, mdp.T, mdp.gamma
for i in range(k):
for s in mdp.states:
U[s] = R(s) + gamma * sum([p * U[s1] for (p, s1) in T(s, pi[s])])
return U
__doc__ += """
>>> pi = best_policy(Fig[17,1], value_iteration(Fig[17,1], .01))
>>> Fig[17,1].to_arrows(pi)
[['>', '>', '>', '.'], ['^', None, '^', '.'], ['^', '>', '^', '<']]
>>> print_table(Fig[17,1].to_arrows(pi))
> > > .
^ None ^ .
^ > ^ <
>>> print_table(Fig[17,1].to_arrows(policy_iteration(Fig[17,1])))
> > > .
^ None ^ .
^ > ^ <
"""
__doc__ += random_tests("""
>>> pi
{(3, 2): None, (3, 1): None, (3, 0): (-1, 0), (2, 1): (0, 1), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (0, 1), (1, 2): (1, 0), (2, 0): (0, 1), (0, 1): (0, 1), (2, 2): (1, 0)}
>>> value_iteration(Fig[17,1], .01)
{(3, 2): 1.0, (3, 1): -1.0, (3, 0): 0.12958868267972745, (0, 1): 0.39810203830605462, (0, 2): 0.50928545646220924, (1, 0): 0.25348746162470537, (0, 0): 0.29543540628363629, (1, 2): 0.64958064617168676, (2, 0): 0.34461306281476806, (2, 1): 0.48643676237737926, (2, 2): 0.79536093684710951}
>>> policy_iteration(Fig[17,1])
{(3, 2): None, (3, 1): None, (3, 0): (0, -1), (2, 1): (-1, 0), (0, 2): (1, 0), (1, 0): (1, 0), (0, 0): (1, 0), (1, 2): (1, 0), (2, 0): (1, 0), (0, 1): (1, 0), (2, 2): (1, 0)}
""")