Add support for the parameter-shift hessian to the Torch interface #1129

Merged 17 commits on Mar 26, 2021 (changes shown from 11 commits)
33 changes: 33 additions & 0 deletions .github/CHANGELOG.md
@@ -2,6 +2,39 @@

<h3>New features since last release</h3>

* Computing second derivatives and Hessians of QNodes is now supported when
using the PyTorch interface.
Contributor:
So for my understanding, we'd also like to add this for the other interfaces, but Torch is the first one we're doing (maybe because it's easier)? So this is a continuation of #961?

Member Author:
We've got #1131 (autograd) and #1110 (tf) also open :) We just decided to split the work into three PRs to help with code review, but feel free to review the others as well if interested!

[(#1129)](https://github.com/PennyLaneAI/pennylane/pull/1129/files)

Hessians are computed using the parameter-shift rule, and can be
evaluated on both hardware and simulator devices.

```python
import torch
import pennylane as qml
from torch.autograd.functional import jacobian, hessian

dev = qml.device('default.qubit', wires=1)

@qml.qnode(dev, interface='torch', diff_method="parameter-shift")
def circuit(p):
    qml.RY(p[0], wires=0)
    qml.RX(p[1], wires=0)
    return qml.expval(qml.PauliZ(0))

x = torch.tensor([1.0, 2.0], requires_grad=True)
```

```python
>>> circuit(x)
tensor([0.3876, 0.6124], dtype=torch.float64, grad_fn=<SqueezeBackward0>)
>>> jacobian(circuit, x)
tensor([[ 0.1751, -0.2456],
        [-0.1751,  0.2456]], grad_fn=<ViewBackward>)
>>> hessian(circuit, x)
tensor([[[ 0.1124,  0.3826],
         [ 0.3826,  0.1124]],

        [[-0.1124, -0.3826],
         [-0.3826, -0.1124]]])
```

Contributor:
This is really cool! So when it's the Hessian of a vector output, are we just lining up the individual Hessian matrices? E.g. like here. I guess this is pretty standard.

Member Author:
Yep! With the disclaimer that I'm not entirely sure how PyTorch is ordering the dimensions here.

Member Author:
The output above agrees with TF and Autograd, but we should double-check this.
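For context on the thread above: `torch.autograd.functional.hessian` only accepts scalar-valued functions, so the stacked per-output Hessian shown here is what you get by nesting `jacobian` calls, which is also how the new tests in this PR exercise the feature. A minimal sketch, reusing `dev` and the imports from the example above (`vector_circuit` and `jac_fn` are illustrative names, not part of the PR):

```python
@qml.qnode(dev, interface='torch', diff_method="parameter-shift")
def vector_circuit(p):
    qml.RY(p[0], wires=0)
    qml.RX(p[1], wires=0)
    return qml.probs(wires=0)

x = torch.tensor([1.0, 2.0], requires_grad=True)

# keep the first derivative on the graph so it can be differentiated again
jac_fn = lambda p: jacobian(vector_circuit, p, create_graph=True)

# shape (2, 2, 2): one 2x2 parameter Hessian per output element
hess = jacobian(jac_fn, x)
```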

* Adds a new optimizer `qml.ShotAdaptiveOptimizer`, a gradient-descent optimizer where
the shot rate is adaptively calculated using the variances of the parameter-shift gradient.
[(#1139)](https://github.com/PennyLaneAI/pennylane/pull/1139)
96 changes: 73 additions & 23 deletions pennylane/interfaces/torch.py
@@ -76,6 +76,74 @@ def forward(ctx, input_kwargs, *input_):
if hasattr(res, "numpy"):
res = res.numpy()

ctx.saved_grad_matrices = {}

def _evaluate_grad_matrix(grad_matrix_fn):
"""Convenience function for generating gradient matrices
for the given parameter values.

This function serves two purposes:

* Avoids duplicating logic surrounding parameter unwrapping/wrapping

* Takes advantage of closure, to cache computed gradient matrices via
the ctx.saved_grad_matrices attribute, to avoid gradient matrices being
computed multiple redundant times.

This is particularly useful when differentiating vector-valued QNodes.
Because PyTorch requests the vector-GradMatrix product,
and *not* the full GradMatrix, differentiating vector-valued
functions will result in multiple backward passes.
"""
if grad_matrix_fn in ctx.saved_grad_matrices:
return ctx.saved_grad_matrices[grad_matrix_fn]

tape.set_parameters(ctx.all_params_unwrapped, trainable_only=False)
grad_matrix = getattr(tape, grad_matrix_fn)(
device, params=ctx.args, **tape.jacobian_options
)
tape.set_parameters(ctx.all_params, trainable_only=False)

grad_matrix = torch.as_tensor(torch.from_numpy(grad_matrix), dtype=tape.dtype)
ctx.saved_grad_matrices[grad_matrix_fn] = grad_matrix

return grad_matrix

class _Jacobian(torch.autograd.Function):
@staticmethod
def forward(ctx_, parent_ctx, *input_):
"""Implements the forward pass QNode Jacobian evaluation"""
ctx_.dy = parent_ctx.dy
ctx_.save_for_backward(*input_)
jacobian = _evaluate_grad_matrix("jacobian")
return jacobian

@staticmethod
def backward(ctx_, ddy): # pragma: no cover
"""Implements the backward pass QNode vector-Hessian product"""
hessian = _evaluate_grad_matrix("hessian")

if torch.squeeze(ddy).ndim > 1:
vhp = ctx_.dy.view(1, -1) @ ddy @ hessian @ ctx_.dy.view(-1, 1)
else:
vhp = ddy @ hessian

vhp = torch.unbind(vhp.view(-1))

grad_input = []

# match the type and device of the input tensors
for i, j in zip(vhp, ctx_.saved_tensors):
res = torch.as_tensor(i, dtype=tape.dtype)
if j.is_cuda: # pragma: no cover
cuda_device = j.get_device()
res = torch.as_tensor(res, device=cuda_device)
grad_input.append(res)

return (None,) + tuple(grad_input)

ctx.jacobian = _Jacobian

# if any input tensor uses the GPU, the output should as well
for i in input_:
if isinstance(i, torch.Tensor):
Expand All @@ -94,30 +162,12 @@ def forward(ctx, input_kwargs, *input_):
return torch.as_tensor(torch.from_numpy(res), dtype=tape.dtype)

@staticmethod
def backward(ctx, grad_output): # pragma: no cover
def backward(ctx, dy): # pragma: no cover
"""Implements the backwards pass QNode vector-Jacobian product"""
tape = ctx.kwargs["tape"]
device = ctx.kwargs["device"]

tape.set_parameters(ctx.all_params_unwrapped, trainable_only=False)
jacobian = tape.jacobian(device, params=ctx.args, **tape.jacobian_options)
tape.set_parameters(ctx.all_params, trainable_only=False)

jacobian = torch.as_tensor(jacobian, dtype=grad_output.dtype).to(grad_output)

vjp = grad_output.view(1, -1) @ jacobian
grad_input_list = torch.unbind(vjp.flatten())
grad_input = []

# match the type and device of the input tensors
for i, j in zip(grad_input_list, ctx.saved_tensors):
res = torch.as_tensor(i, dtype=tape.dtype)
if j.is_cuda: # pragma: no cover
cuda_device = j.get_device()
res = torch.as_tensor(res, device=cuda_device)
grad_input.append(res)

return (None,) + tuple(grad_input)
ctx.dy = dy
vjp = dy.view(1, -1) @ ctx.jacobian.apply(ctx, *ctx.saved_tensors)
vjp = torch.unbind(vjp.view(-1))
return (None,) + tuple(vjp)


class TorchInterface(AnnotatedQueue):
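A side note on the caching above: the `_evaluate_grad_matrix` docstring points out that PyTorch requests vector-Jacobian products rather than full Jacobians, so differentiating a vector-valued QNode triggers one backward pass per output element. A minimal, self-contained sketch of that behaviour (a toy `Square` function with an instrumented backward pass; the names are illustrative and not part of the PR):

```python
import torch
from torch.autograd.functional import jacobian

calls = 0

class Square(torch.autograd.Function):
    """Toy element-wise square with an instrumented backward pass."""

    @staticmethod
    def forward(ctx, x):
        ctx.save_for_backward(x)
        return x ** 2

    @staticmethod
    def backward(ctx, dy):
        # count how many vector-Jacobian products PyTorch requests
        global calls
        calls += 1
        (x,) = ctx.saved_tensors
        return dy * 2 * x

x = torch.tensor([1.0, 2.0, 3.0], requires_grad=True)
jacobian(Square.apply, x)
print(calls)  # 3 -- one backward pass per output element
```

Each of those passes would re-run the parameter-shift evaluation if the gradient matrix were not cached, which is what `ctx.saved_grad_matrices` avoids.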
11 changes: 10 additions & 1 deletion pennylane/tape/jacobian_tape.py
@@ -91,6 +91,7 @@ class JacobianTape(QuantumTape):
def __init__(self, name=None, do_queue=True):
super().__init__(name=name, do_queue=do_queue)
self.jacobian_options = {}
self.hessian_options = {}

def _grad_method(self, idx, use_graph=True, default_method="F"):
"""Determine the correct partial derivative computation method for each gate parameter.
@@ -730,11 +731,19 @@ def hessian(self, device, params=None, **options):

if hessian is None:
# create the Hessian matrix
hessian = np.zeros((len(params), len(params)), dtype=float)
if self.output_dim is not None:
hessian = np.zeros(
(len(params), len(params), np.prod(self.output_dim)), dtype=float
)
else:
hessian = np.zeros((len(params), len(params)), dtype=float)

if i == j:
hessian[i, i] = g
else:
hessian[i, j] = hessian[j, i] = g

if self.output_dim == 1:
hessian = np.squeeze(hessian, axis=-1)

return hessian
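As a quick shape sanity check on the logic above (a sketch with hypothetical sizes, not part of the PR): the tape stores the Hessian as `(n_params, n_params, output_dim)` for vector-valued output and squeezes the trailing axis away when the output dimension is 1.

```python
import numpy as np

n_params, output_dim = 2, 3  # hypothetical sizes

# vector-valued tape: one length-output_dim entry per parameter pair
hessian = np.zeros((n_params, n_params, np.prod(output_dim)), dtype=float)
assert hessian.shape == (2, 2, 3)

# scalar-valued tape (output_dim == 1): trailing axis is squeezed away
scalar_hessian = np.squeeze(np.zeros((n_params, n_params, 1)), axis=-1)
assert scalar_hessian.shape == (2, 2)
```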
144 changes: 144 additions & 0 deletions tests/interfaces/test_qnode_torch.py
@@ -21,6 +21,7 @@
import pennylane as qml
from pennylane import qnode, QNode
from pennylane.tape import JacobianTape
from torch.autograd.functional import hessian, jacobian


@pytest.mark.parametrize(
@@ -527,6 +528,149 @@ def circuit():
assert isinstance(res[0], torch.Tensor)
assert isinstance(res[1], torch.Tensor)

def test_hessian(self, dev_name, diff_method, mocker, tol):
"""Test hessian calculation of a scalar valued QNode"""
if diff_method not in {"parameter-shift", "backprop"}:
pytest.skip("Test only supports parameter-shift or backprop")

dev = qml.device(dev_name, wires=1)

@qnode(dev, diff_method=diff_method, interface="torch")
def circuit(x):
qml.RY(x[0], wires=0)
qml.RX(x[1], wires=0)
return qml.expval(qml.PauliZ(0))

x = torch.tensor([1.0, 2.0], requires_grad=True)
res = circuit(x)

res.backward()
g = x.grad

spy = mocker.spy(JacobianTape, "hessian")
hess = hessian(circuit, x)
spy.assert_called_once()

a, b = x.detach().numpy()

expected_res = np.cos(a) * np.cos(b)
assert np.allclose(res.detach(), expected_res, atol=tol, rtol=0)

expected_g = [-np.sin(a) * np.cos(b), -np.cos(a) * np.sin(b)]
assert np.allclose(g.detach(), expected_g, atol=tol, rtol=0)

expected_hess = [
[-np.cos(a) * np.cos(b), np.sin(a) * np.sin(b)],
[np.sin(a) * np.sin(b), -np.cos(a) * np.cos(b)]
]
assert np.allclose(hess.detach(), expected_hess, atol=tol, rtol=0)

def test_hessian_vector_valued(self, dev_name, diff_method, mocker, tol):
"""Test hessian calculation of a vector valued QNode"""
if diff_method not in {"parameter-shift", "backprop"}:
pytest.skip("Test only supports parameter-shift or backprop")

dev = qml.device(dev_name, wires=1)

@qnode(dev, diff_method=diff_method, interface="torch")
def circuit(x):
qml.RY(x[0], wires=0)
qml.RX(x[1], wires=0)
return qml.probs(wires=0)

x = torch.tensor([1.0, 2.0], requires_grad=True)
res = circuit(x)
jac_fn = lambda x: jacobian(circuit, x, create_graph=True)

g = jac_fn(x)

spy = mocker.spy(JacobianTape, "hessian")
hess = jacobian(jac_fn, x)
spy.assert_called_once()

a, b = x.detach().numpy()

expected_res = [
0.5 + 0.5 * np.cos(a) * np.cos(b),
0.5 - 0.5 * np.cos(a) * np.cos(b)
]
assert np.allclose(res.detach(), expected_res, atol=tol, rtol=0)

expected_g = [
[-0.5 * np.sin(a) * np.cos(b), -0.5 * np.cos(a) * np.sin(b)],
[0.5 * np.sin(a) * np.cos(b), 0.5 * np.cos(a) * np.sin(b)]
]
assert np.allclose(g.detach(), expected_g, atol=tol, rtol=0)

expected_hess = [
[
[-0.5 * np.cos(a) * np.cos(b), 0.5 * np.sin(a) * np.sin(b)],
[0.5 * np.sin(a) * np.sin(b), -0.5 * np.cos(a) * np.cos(b)]
],
[
[0.5 * np.cos(a) * np.cos(b), -0.5 * np.sin(a) * np.sin(b)],
[-0.5 * np.sin(a) * np.sin(b), 0.5 * np.cos(a) * np.cos(b)]
]
]
assert np.allclose(hess.detach(), expected_hess, atol=tol, rtol=0)

def test_hessian_ragged(self, dev_name, diff_method, mocker, tol):
"""Test hessian calculation of a ragged QNode"""
if diff_method not in {"parameter-shift", "backprop"}:
pytest.skip("Test only supports parameter-shift or backprop")

dev = qml.device(dev_name, wires=2)

@qnode(dev, diff_method=diff_method, interface="torch")
def circuit(x):
qml.RY(x[0], wires=0)
qml.RX(x[1], wires=0)
qml.RY(x[0], wires=1)
qml.RX(x[1], wires=1)
return qml.expval(qml.PauliZ(0)), qml.probs(wires=1)

x = torch.tensor([1.0, 2.0], requires_grad=True)
res = circuit(x)
jac_fn = lambda x: jacobian(circuit, x, create_graph=True)

g = jac_fn(x)

spy = mocker.spy(JacobianTape, "hessian")
hess = jacobian(jac_fn, x)
spy.assert_called_once()

a, b = x.detach().numpy()

expected_res = [
np.cos(a) * np.cos(b),
0.5 + 0.5 * np.cos(a) * np.cos(b),
0.5 - 0.5 * np.cos(a) * np.cos(b)
]
assert np.allclose(res.detach(), expected_res, atol=tol, rtol=0)

expected_g = [
[-np.sin(a) * np.cos(b), -np.cos(a) * np.sin(b)],
[-0.5 * np.sin(a) * np.cos(b), -0.5 * np.cos(a) * np.sin(b)],
[0.5 * np.sin(a) * np.cos(b), 0.5 * np.cos(a) * np.sin(b)]
]
assert np.allclose(g.detach(), expected_g, atol=tol, rtol=0)

expected_hess = [
[
[-np.cos(a) * np.cos(b), np.sin(a) * np.sin(b)],
[np.sin(a) * np.sin(b), -np.cos(a) * np.cos(b)]
],
[
[-0.5 * np.cos(a) * np.cos(b), 0.5 * np.sin(a) * np.sin(b)],
[0.5 * np.sin(a) * np.sin(b), -0.5 * np.cos(a) * np.cos(b)]
],
[
[0.5 * np.cos(a) * np.cos(b), -0.5 * np.sin(a) * np.sin(b)],
[-0.5 * np.sin(a) * np.sin(b), 0.5 * np.cos(a) * np.cos(b)]
]
]
assert np.allclose(hess.detach(), expected_hess, atol=tol, rtol=0)


def qtransform(qnode, a, framework=torch):
"""Transforms every RY(y) gate in a circuit to RX(-a*cos(y))"""