\chapter{Reinforcement Learning}
Reinforcement learning is an area of machine learning concerned with how software agents ought to take actions in an environment so as to maximize some notion of cumulative reward.
\href{https://en.wikipedia.org/wiki/Reinforcement_learning}{wikipedia}
\section{Bellman Equations}
\newcommand{\qfunc}[2]{Q_\pi(#1, #2)}
\newcommand{\vfunc}[1]{v_\pi (#1)}
\newcommand{\policy}[2]{\pi (#1 | #2)}
\todo{RL backup diagrams}
\subsection{State Value Function}
\begin{align}
\vfunc{s} = \sum_{a \in A} \policy{a}{s} \qfunc{s}{a}
\end{align}
\subsection{Action Value Function}
\begin{align}
\qfunc{s}{a} = r_s^a + \gamma \sum_{s' \in S} P_{ss'}^a \vfunc{s'}
\end{align}
\subsection{State Value Function recursive}
\begin{align}
\vfunc{s} = \sum_{a \in A} \policy{a}{s} \left( r_s^a + \gamma \sum_{s' \in S} P_{ss'}^a \vfunc{s'} \right)
\end{align}
\subsection{Action Value Function recursive}
\begin{align}
\qfunc{s}{a} = r_s^a + \gamma \sum_{s' \in S} P_{ss'}^a \sum_{a' \in A} \policy{a'}{s'} \qfunc{s'}{a'}
\end{align}
\subsection{Optimal State Value Function}
\renewcommand{\qfunc}[2]{Q_{*}(#1, #2)}
\renewcommand{\vfunc}[1]{v_{*} (#1)}
\begin{align}
\vfunc{s} = \max_a \qfunc{s}{a}
\end{align}
\subsection{Optimal Action Value Function}
\begin{align}
\qfunc{s}{a} = r_s^a + \gamma \sum_{s' \in S} P_{ss'}^a \vfunc{s'}
\end{align}
\subsection{Optimal State Value Function recursive}
\begin{align}
\vfunc{s} = \max\limits_a \left( r_s^a + \gamma \sum_{s' \in S} P_{ss'}^a \vfunc{s'} \right)
\end{align}
\subsection{Optimal Action Value Function recursive}
\begin{align}
\qfunc{s}{a} = r_s^a + \gamma \sum_{s' \in S} P_{ss'}^a \max\limits_{a'} \qfunc{s'}{a'}
\end{align}
\section{Advantage Function}
\todo{Advantage Function Definition}
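A common definition: the advantage of an action is its action value minus the state value under the same policy, i.e.\ how much better the action is than acting according to the policy on average,
\begin{align}
A_\pi(s, a) = Q_\pi(s, a) - v_\pi(s)
\end{align}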
\section{Policy, Policy Gradient}
\subsection{Policy: Distribution over actions given states}
\renewcommand{\policy}[2]{\pi_\theta (#1 | #2)}
\begin{align}
\policy{a}{s} = P(a | s)
\end{align}
\subsection{Policy Gradient}
\begin{align}
\nabla_\theta \policy{a}{s} = \policy{a}{s} \nabla_\theta \log \policy{a}{s}
\end{align}
Note: this identity holds for any parameterized probability distribution (the policy is a distribution over actions given states). The gradient term on the right-hand side is called the score function. The derivation uses the ``log-trick''.
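The identity follows directly from the derivative of the logarithm:
\begin{align}
\nabla_\theta \log \policy{a}{s} = \frac{\nabla_\theta \policy{a}{s}}{\policy{a}{s}}
\quad \Rightarrow \quad
\nabla_\theta \policy{a}{s} = \policy{a}{s} \nabla_\theta \log \policy{a}{s}
\end{align}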
\subsection{Proximal Policy Optimization PPO}
\newcommand{\oldpolicy}[2]{\pi_{\theta_{\mathrm{old}}} ( {#1} | {#2})}
\begin{enumerate}
\item \url{https://arxiv.org/abs/1707.06347}.
\end{enumerate}
Clipped surrogate objective:
\begin{align}
L^{\mathrm{CLIP}}(\theta)
= \EE{
\min \left( r_t(\theta) \hat{A_t}, \mathrm{clip}(r_t(\theta), 1-\epsilon, 1+\epsilon) \hat{A_t} \right)
}
\end{align}
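Here $r_t(\theta)$ is the probability ratio between the current and the old policy, as defined in the paper (this is why the policy values are divided):
\begin{align}
r_t(\theta) = \frac{\policy{a_t}{s_t}}{\oldpolicy{a_t}{s_t}}
\end{align}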
The full objective used in the paper contains additional terms, among them a value-function error term and an entropy bonus.
The advantage estimate $\hat{A_t}$ is in this case
\begin{align}
\hat{A_t} &= -V(s_t) + r_t + \gamma r_{t+1} + \dots + \gamma^{T-t}V(s_T)
\end{align}
If the advantage is positive, the estimated value of the current state $s_t$ is smaller than the rewards actually collected over the following time steps: the state-value estimate was too low.
If the advantage is negative, the estimate was too high.
In generalized form, using the temporal-difference residual $\delta_t$:
\begin{align}
\hat{A_t} &= \delta_t + (\gamma \lambda)\delta_{t+1} + \dots + (\gamma\lambda)^{T-t-1}\delta_{T-1} \\
\delta_t &= r_t + \gamma V(s_{t+1}) - V(s_t)
\end{align}
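For $\lambda = 1$ the $\delta$ terms telescope and the sum reduces to the estimate above:
\begin{align}
\delta_t + \gamma \delta_{t+1} + \dots + \gamma^{T-t-1} \delta_{T-1}
&= -V(s_t) + r_t + \gamma r_{t+1} + \dots + \gamma^{T-t-1} r_{T-1} + \gamma^{T-t} V(s_T)
\end{align}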
\begin{algorithm}[H]
\For{iteration=1,2,\dots}{
\For{actor=1,2,\dots,N}{
Run policy $\pi_{\theta_{\mathrm{old}}}$ in environment for $T$ timesteps\\
Compute advantage estimates $\hat{A_1}, \dots, \hat{A_T}$
}
Optimize surrogate $L$ wrt $\theta$, with $K$ epochs and minibatch size $M \leq NT$ \\
$\theta_{\mathrm{old}} \leftarrow \theta$
}
\caption{PPO, Actor-Critic Style}
\end{algorithm}
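A minimal numpy sketch of the clipped surrogate objective (the function and array names are made up for illustration, not taken from the paper; the expectation is replaced by the empirical mean over sampled timesteps):
\begin{verbatim}
import numpy as np

def clipped_surrogate(log_prob_new, log_prob_old, advantages, eps=0.2):
    # r_t(theta), computed from log-probabilities for numerical stability
    ratio = np.exp(log_prob_new - log_prob_old)
    clipped = np.clip(ratio, 1.0 - eps, 1.0 + eps)
    # elementwise minimum of the unclipped and the clipped term,
    # averaged over the batch; this objective is maximized w.r.t. theta
    return np.mean(np.minimum(ratio * advantages,
                              clipped * advantages))
\end{verbatim}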
\iftoggle{questions}
{
\paragraph{Questions}
\begin{enumerate}
\item How does the clipping affect learning? The plots show higher variance towards the end of training.
\item Where does the objective come from (why divide the policy values)?
\item Why conjugate gradients?
\item How does the clipping affect the gradients?
\end{enumerate}
}{
}