Add caching to the autograd batch interface (#1508)
* Added differentiable VJP transform

* linting

* more tests

* linting

* add tests

* add comment

* fix

* more

* typos

* Apply suggestions from code review

Co-authored-by: Nathan Killoran <co9olguy@users.noreply.github.com>

* fixes

* add tests

* more tests

* renamed

* typo

* Add caching to the autograd backend

* more

* more

* more

* more

* caching

* fix

* fix

* fix tests

* final

* update changelog

* update

* more

* revert formatting

* more

* add tests

* linting

* merge master

* Apply suggestions from code review

Co-authored-by: Maria Schuld <mariaschuld@gmail.com>

* fix

* Apply suggestions from code review

Co-authored-by: Nathan Killoran <co9olguy@users.noreply.github.com>

* linting

* linting

* linting

* remove pass

* changelog

* Apply suggestions from code review

Co-authored-by: Tom Bromley <49409390+trbromley@users.noreply.github.com>

* Update pennylane/interfaces/batch/__init__.py

* Add hashing tests

* Apply suggestions from code review

Co-authored-by: Tom Bromley <49409390+trbromley@users.noreply.github.com>

Co-authored-by: Nathan Killoran <co9olguy@users.noreply.github.com>
Co-authored-by: Maria Schuld <mariaschuld@gmail.com>
Co-authored-by: Tom Bromley <49409390+trbromley@users.noreply.github.com>
4 people committed Aug 20, 2021
1 parent 29d1daa commit 117599e
Showing 10 changed files with 693 additions and 24 deletions.
11 changes: 9 additions & 2 deletions .github/CHANGELOG.md
@@ -57,12 +57,15 @@
```

* Support for differentiable execution of batches of circuits has been
added, via the beta `pennylane.batch` module.
added, via the beta `pennylane.interfaces.batch` module.
[(#1501)](https://github.com/PennyLaneAI/pennylane/pull/1501)
[(#1508)](https://github.com/PennyLaneAI/pennylane/pull/1508)

For example:

```python
from pennylane.interfaces.batch import execute

def cost_fn(x):
with qml.tape.JacobianTape() as tape1:
qml.RX(x[0], wires=[0])
@@ -76,7 +79,11 @@
qml.CNOT(wires=[0, 1])
qml.probs(wires=1)

result = execute([tape1, tape2], dev, gradient_fn=param_shift)
result = execute(
[tape1, tape2], dev,
gradient_fn=qml.gradients.param_shift,
interface="autograd"
)
return result[0] + result[1][0, 0]

res = qml.grad(cost_fn)(params)
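For reference, a self-contained version of the changelog snippet above. The middle of the snippet is collapsed in the diff, so the device, parameter values, and the gates filling that gap are assumptions chosen for illustration rather than taken from the PR:

```python
import pennylane as qml
from pennylane import numpy as np
from pennylane.interfaces.batch import execute

dev = qml.device("default.qubit", wires=2)  # assumed device
params = np.array([0.1, 0.2], requires_grad=True)  # assumed parameters

def cost_fn(x):
    with qml.tape.JacobianTape() as tape1:
        qml.RX(x[0], wires=[0])
        qml.RY(x[1], wires=[1])
        qml.CNOT(wires=[0, 1])
        qml.expval(qml.PauliZ(0))

    with qml.tape.JacobianTape() as tape2:
        qml.RX(x[0], wires=[0])
        qml.CNOT(wires=[0, 1])
        qml.probs(wires=1)

    # Both tapes are executed in a single differentiable batch.
    result = execute(
        [tape1, tape2], dev,
        gradient_fn=qml.gradients.param_shift,
        interface="autograd",
    )
    return result[0] + result[1][0, 0]

res = qml.grad(cost_fn)(params)
```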
160 changes: 152 additions & 8 deletions pennylane/interfaces/batch/__init__.py
@@ -15,13 +15,137 @@
This subpackage defines functions for interfacing devices' batch execution
capabilities with different machine learning libraries.
"""
# pylint: disable=import-outside-toplevel,too-many-arguments
# pylint: disable=import-outside-toplevel,too-many-arguments,too-many-branches
from functools import wraps

from cachetools import LRUCache
import numpy as np

import pennylane as qml

from .autograd import execute as execute_autograd


def execute(tapes, device, gradient_fn, interface="autograd", mode="best", gradient_kwargs=None):
def cache_execute(fn, cache, pass_kwargs=False, return_tuple=True):
"""Decorator that adds caching to a function that executes
multiple tapes on a device.
This decorator makes use of :attr:`.QuantumTape.hash` to identify
unique tapes.
- If a tape does not match a hash in the cache, then the tape
has not been previously executed. It is executed, and the result
added to the cache.
- If a tape matches a hash in the cache, then the tape has been previously
executed. The corresponding cached result is
extracted, and the tape is not passed to the execution function.
- Finally, there might be the case where one or more tapes in the current
set of tapes to be executed are identical and thus share a hash. If this is the case,
duplicates are removed, to avoid redundant evaluations.
Args:
fn (callable): The execution function to add caching to.
This function should have the signature ``fn(tapes, **kwargs)``,
and it should return ``list[tensor_like]``, with the
same length as the input ``tapes``.
cache (None or dict or Cache or bool): The cache to use. If ``None``,
caching will not occur.
pass_kwargs (bool): If ``True``, keyword arguments passed to the
wrapped function will be passed directly to ``fn``. If ``False``,
they will be ignored.
return_tuple (bool): If ``True``, the output of ``fn`` is returned
as a tuple ``(fn_output, [])``, to match the output of execution functions
that also return gradients.
Returns:
function: a wrapped version of the execution function ``fn`` with caching
support
"""

@wraps(fn)
def wrapper(tapes, **kwargs):

if not pass_kwargs:
kwargs = {}

if cache is None or (isinstance(cache, bool) and not cache):
# No caching. Simply execute the execution function
# and return the results.
res = fn(tapes, **kwargs)
return (res, []) if return_tuple else res

execution_tapes = {}
cached_results = {}
hashes = {}
repeated = {}

for i, tape in enumerate(tapes):
h = tape.hash

if h in hashes.values():
# Tape already exists within ``tapes``. Determine the
# index of the first occurrence of the tape, store this,
# and continue to the next iteration.
idx = list(hashes.keys())[list(hashes.values()).index(h)]
repeated[i] = idx
continue

hashes[i] = h

if hashes[i] in cache:
# Tape exists within the cache, store the cached result
cached_results[i] = cache[hashes[i]]
else:
# Tape does not exist within the cache, store the tape
# for execution via the execution function.
execution_tapes[i] = tape

# if there are no execution tapes, simply return!
if not execution_tapes:
if not repeated:
res = list(cached_results.values())
return (res, []) if return_tuple else res

else:
# execute all unique tapes that do not exist in the cache
res = fn(execution_tapes.values(), **kwargs)

final_res = []

for i, tape in enumerate(tapes):
if i in cached_results:
# insert cached results into the results vector
final_res.append(cached_results[i])

elif i in repeated:
# insert repeated results into the results vector
final_res.append(final_res[repeated[i]])

else:
# insert evaluated results into the results vector
r = res.pop(0)
final_res.append(r)
cache[hashes[i]] = r

return (final_res, []) if return_tuple else final_res

wrapper.fn = fn
return wrapper


def execute(
tapes,
device,
gradient_fn,
interface="autograd",
mode="best",
gradient_kwargs=None,
cache=True,
cachesize=10000,
max_diff=2,
):
"""Execute a batch of tapes on a device in an autodifferentiable-compatible manner.
Args:
Expand All @@ -42,6 +166,13 @@ def execute(tapes, device, gradient_fn, interface="autograd", mode="best", gradi
pass.
gradient_kwargs (dict): dictionary of keyword arguments to pass when
determining the gradients of tapes
cache (bool): Whether to cache evaluations. This can result in
a significant reduction in quantum evaluations during gradient computations.
cachesize (int): the size of the cache
max_diff (int): If ``gradient_fn`` is a gradient transform, this option specifies
the maximum number of derivatives to support. Increasing this value allows
for higher order derivatives to be extracted, at the cost of additional
(classical) computational overhead during the backwards pass.
Returns:
list[list[float]]: A nested list of tape results. Each element in
@@ -101,11 +232,15 @@ def cost_fn(params, x):
[ 0.01983384, -0.97517033, 0. ],
[ 0. , 0. , -0.95533649]])
"""
# Default execution function; simply call device.batch_execute
# and return no Jacobians.
execute_fn = lambda tapes, **kwargs: (device.batch_execute(tapes), [])
gradient_kwargs = gradient_kwargs or {}

if isinstance(cache, bool) and cache:
# cache=True: create a LRUCache object
cache = LRUCache(maxsize=cachesize, getsizeof=len)

# the default execution function is device.batch_execute
execute_fn = cache_execute(device.batch_execute, cache)

if gradient_fn == "device":
# gradient function is a device method

@@ -116,8 +251,13 @@ def cost_fn(params, x):
gradient_fn = None

elif mode == "backward":
# disable caching on the forward pass
execute_fn = cache_execute(device.batch_execute, cache=None)

# replace the backward gradient computation
gradient_fn = device.gradients
gradient_fn = cache_execute(
device.gradients, cache, pass_kwargs=True, return_tuple=False
)

elif mode == "forward":
# In "forward" mode, gradients are automatically handled
@@ -126,6 +266,10 @@ def cost_fn(params, x):
raise ValueError("Gradient transforms cannot be used with mode='forward'")

if interface == "autograd":
return execute_autograd(tapes, device, execute_fn, gradient_fn, gradient_kwargs)
res = execute_autograd(
tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1, max_diff=max_diff
)
else:
raise ValueError(f"Unknown interface {interface}")

raise ValueError(f"Unknown interface {interface}")
return res
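A minimal sketch (not part of the diff) of how the new `cache_execute` wrapper behaves, assuming a `default.qubit` device and the `QuantumTape.hash` / `Device.batch_execute` behaviour described above: duplicate tapes within a batch are executed once, and results for previously seen tapes are served from the cache.

```python
import pennylane as qml
from cachetools import LRUCache
from pennylane.interfaces.batch import cache_execute

dev = qml.device("default.qubit", wires=1)  # assumed device

with qml.tape.JacobianTape() as tape:
    qml.RX(0.4, wires=0)
    qml.expval(qml.PauliZ(0))

# Same cache construction that execute() uses for cache=True.
cache = LRUCache(maxsize=10000, getsizeof=len)
cached_fn = cache_execute(dev.batch_execute, cache)

# The two tapes share a hash: only one is sent to the device, the result
# is reused for the duplicate, and it is stored in the cache.
results, _ = cached_fn([tape, tape])

# The hash is now in the cache, so this call performs no device execution.
results_again, _ = cached_fn([tape])
```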
62 changes: 49 additions & 13 deletions pennylane/interfaces/batch/autograd.py
@@ -25,7 +25,7 @@
from pennylane import numpy as np


def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1):
def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1, max_diff=2):
"""Execute a batch of tapes with Autograd parameters on a device.
Args:
@@ -42,6 +42,10 @@ def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1):
gradient_fn (callable): the gradient function to use to compute quantum gradients
_n (int): a positive integer used to track nesting of derivatives, for example
if the nth-order derivative is requested.
max_diff (int): If ``gradient_fn`` is a gradient transform, this option specifies
the maximum order of derivatives to support. Increasing this value allows
for higher order derivatives to be extracted, at the cost of additional
(classical) computational overhead during the backwards pass.
Returns:
list[list[float]]: A nested list of tape results. Each element in
@@ -64,6 +68,7 @@ def execute(tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=1):
gradient_fn=gradient_fn,
gradient_kwargs=gradient_kwargs,
_n=_n,
max_diff=max_diff,
)[0]


@@ -76,6 +81,7 @@ def _execute(
gradient_fn=None,
gradient_kwargs=None,
_n=1,
max_diff=2,
): # pylint: disable=dangerous-default-value,unused-argument
"""Autodifferentiable wrapper around ``Device.batch_execute``.
@@ -119,6 +125,7 @@ def vjp(
gradient_fn=None,
gradient_kwargs=None,
_n=1,
max_diff=2,
): # pylint: disable=dangerous-default-value,unused-argument
"""Returns the vector-Jacobian product operator for a batch of quantum tapes.
@@ -139,6 +146,10 @@ def vjp(
determining the gradients of tapes
_n (int): a positive integer used to track nesting of derivatives, for example
if the nth-order derivative is requested.
max_diff (int): If ``gradient_fn`` is a gradient transform, this option specifies
the maximum number of derivatives to support. Increasing this value allows
for higher order derivatives to be extracted, at the cost of additional
(classical) computational overhead during the backwards pass.
Returns:
function: this function accepts the backpropagation
@@ -169,18 +180,43 @@ def grad_fn(dy):
if "pennylane.gradients" in module_name:

# Generate and execute the required gradient tapes
vjp_tapes, processing_fn = qml.gradients.batch_vjp(
tapes, dy, gradient_fn, reduction="append", gradient_kwargs=gradient_kwargs
)

# This is where the magic happens. Note that we call ``execute``.
# This recursion, coupled with the fact that the gradient transforms
# are differentiable, allows for arbitrary order differentiation.
vjps = processing_fn(
execute(vjp_tapes, device, execute_fn, gradient_fn, gradient_kwargs, _n=_n + 1)
)

elif inspect.ismethod(gradient_fn) and gradient_fn.__self__ is device:
if _n == max_diff:
with qml.tape.Unwrap(*tapes):
vjp_tapes, processing_fn = qml.gradients.batch_vjp(
tapes,
dy,
gradient_fn,
reduction="append",
gradient_kwargs=gradient_kwargs,
)

vjps = processing_fn(execute_fn(vjp_tapes)[0])

else:
vjp_tapes, processing_fn = qml.gradients.batch_vjp(
tapes, dy, gradient_fn, reduction="append", gradient_kwargs=gradient_kwargs
)

# This is where the magic happens. Note that we call ``execute``.
# This recursion, coupled with the fact that the gradient transforms
# are differentiable, allows for arbitrary order differentiation.
vjps = processing_fn(
execute(
vjp_tapes,
device,
execute_fn,
gradient_fn,
gradient_kwargs,
_n=_n + 1,
max_diff=max_diff,
)
)

elif (
hasattr(gradient_fn, "fn")
and inspect.ismethod(gradient_fn.fn)
and gradient_fn.fn.__self__ is device
):
# Gradient function is a device method.
# Note that unlike the previous branch:
#
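A hedged sketch (not taken from the PR) of what the `max_diff` recursion enables: because the parameter-shift transform is itself differentiable, the autograd interface can be nested to obtain second derivatives. The circuit and parameter values are assumptions for illustration.

```python
import pennylane as qml
from pennylane import numpy as np
from pennylane.interfaces.batch import execute

dev = qml.device("default.qubit", wires=1)  # assumed device

def cost_fn(x):
    with qml.tape.JacobianTape() as tape:
        qml.RY(x[0], wires=0)
        qml.RX(x[1], wires=0)
        qml.expval(qml.PauliZ(0))

    # max_diff=2 (the default) keeps the backward pass differentiable,
    # so the gradient below can itself be differentiated.
    res = execute([tape], dev, gradient_fn=qml.gradients.param_shift, max_diff=2)
    return res[0][0]

x = np.array([0.5, 0.1], requires_grad=True)
grad = qml.grad(cost_fn)(x)                   # first order: parameter-shift VJP
hessian = qml.jacobian(qml.grad(cost_fn))(x)  # second order: recursive call with _n + 1
```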
20 changes: 20 additions & 0 deletions pennylane/measure.py
@@ -202,6 +202,26 @@ def queue(self, context=qml.QueuingContext):

return self

@property
def hash(self):
"""int: returns an integer hash uniquely representing the measurement process"""
if self.obs is None:
fingerprint = (
str(self.name),
tuple(self.wires.tolist()),
str(self.data),
self.return_type,
)
else:
fingerprint = (
str(self.obs.name),
tuple(self.wires.tolist()),
str(self.obs.data),
self.return_type,
)

return hash(fingerprint)


def expval(op):
r"""Expectation value of the supplied observable.
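A brief illustration (not part of the diff) of the new `MeasurementProcess.hash` property: structurally identical measurements share a fingerprint, which is what lets `cache_execute` recognise repeated tapes.

```python
import pennylane as qml

# Structurally identical measurement processes share a hash ...
m1 = qml.expval(qml.PauliZ(0))
m2 = qml.expval(qml.PauliZ(0))
assert m1.hash == m2.hash

# ... while a different wire (or observable) changes the fingerprint.
m3 = qml.expval(qml.PauliZ(1))
assert m1.hash != m3.hash
```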
15 changes: 15 additions & 0 deletions pennylane/operation.py
@@ -233,6 +233,16 @@ def classproperty(func):
# =============================================================================


def _process_data(op):
if op.name in ("RX", "RY", "RZ", "PhaseShift", "Rot"):
return str([d % (2 * np.pi) for d in op.data])

if op.name in ("CRX", "CRY", "CRZ", "CRot"):
return str([d % (4 * np.pi) for d in op.data])

return str(op.data)


class Operator(abc.ABC):
r"""Base class for quantum operators supported by a device.
@@ -282,6 +292,11 @@ def __deepcopy__(self, memo):
setattr(copied_op, attribute, copy.deepcopy(value, memo))
return copied_op

@property
def hash(self):
"""int: returns an integer hash uniquely representing the operator"""
return hash((str(self.name), tuple(self.wires.tolist()), _process_data(self)))

@classmethod
def _matrix(cls, *params):
"""Matrix representation of the operator
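Similarly, a short sketch (not part of the diff) of `Operator.hash` and `_process_data`: identical gates hash identically, different parameters or wires do not, and rotation angles are reduced modulo 2*pi (4*pi for controlled rotations) so that physically equivalent parametrisations are intended to collide.

```python
import numpy as np
import pennylane as qml

# Identical gates produce identical hashes ...
assert qml.RX(0.3, wires=0).hash == qml.RX(0.3, wires=0).hash

# ... while a different parameter (or wire) gives a different fingerprint.
assert qml.RX(0.3, wires=0).hash != qml.RX(0.4, wires=0).hash

# Rotation angles are compared modulo 2*pi, so equivalent angles are
# intended to share a hash; floating-point round-off in the modulo can
# occasionally break this for non-representable sums.
print(qml.RX(0.3, wires=0).hash == qml.RX(0.3 + 2 * np.pi, wires=0).hash)
```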