Test accumulating gradient collector #2111

Merged · 1 commit merged on Nov 1, 2022
8 changes: 8 additions & 0 deletions api/src/main/java/ai/djl/training/GradientCollector.java
@@ -21,6 +21,14 @@
* performed within the try-with-resources are recorded and the variables marked. When {@link
* #backward(NDArray) backward function} is called, gradients are collected w.r.t previously marked
* variables.
*
* <p>The typical behavior is to open a gradient collector for each batch and close it at the end
* of the batch. In this way, the gradient is reset between batches. If the gradient collector is
* instead left open across multiple calls to {@link #backward(NDArray) backward}, the collected
* gradients are accumulated (added together).
*
* <p>Due to limitations in most engines, the gradient collectors are global. This means that only
* one can be used at a time. If multiple are opened, an error will be thrown.
*/
public interface GradientCollector extends AutoCloseable {

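To make the per-batch pattern from the new Javadoc concrete, here is a minimal sketch (not part of this diff); it assumes the PyTorch engine and uses only API calls that already appear in this PR's tests:

```java
import ai.djl.engine.Engine;
import ai.djl.ndarray.NDArray;
import ai.djl.ndarray.NDManager;
import ai.djl.training.GradientCollector;

public class GradientCollectorUsageSketch {
    public static void main(String[] args) {
        try (NDManager manager = NDManager.newBaseManager()) {
            NDArray a = manager.create(0.0f);
            a.setRequiresGradient(true);

            // One collector per "batch": opening a new collector resets the gradient,
            // so every iteration observes d(2a)/da = 2 rather than an accumulated sum.
            for (int batch = 0; batch < 3; batch++) {
                try (GradientCollector gc = Engine.getInstance().newGradientCollector()) {
                    NDArray loss = a.mul(2);
                    gc.backward(loss);
                    System.out.println(a.getGradient().getFloat()); // prints 2.0 each time
                }
            }
        }
    }
}
```

Keeping a single collector open across the loop instead would accumulate the gradient (2, 4, 6), which is exactly what `testAccumulateGradients` below verifies.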
@@ -18,15 +18,26 @@
import ai.djl.pytorch.jni.JniUtils;
import ai.djl.training.GradientCollector;

import java.util.concurrent.atomic.AtomicBoolean;

/** {@code PtGradientCollector} is the PyTorch implementation of {@link GradientCollector}. */
public final class PtGradientCollector implements GradientCollector {

private boolean gradModel;
private static AtomicBoolean isCollecting = new AtomicBoolean();

/** Constructs a new {@code PtGradientCollector} instance. */
public PtGradientCollector() {
gradModel = JniUtils.isGradMode();
JniUtils.setGradMode(true);

boolean wasCollecting = isCollecting.getAndSet(true);
if (wasCollecting) {
throw new IllegalStateException(
"A PtGradientCollector is already collecting. Only one can be collecting at a"
+ " time");
}

zeroGradients();
}

@@ -73,6 +84,7 @@ public void close() {
if (!gradModel) {
JniUtils.setGradMode(false);
}
isCollecting.set(false);
// TODO: do some clean up if necessary
}
}
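Worth spelling out from the diff above: `close()` clears the `isCollecting` flag, so the one-at-a-time restriction only applies to overlapping collectors; opening collectors sequentially remains legal. A small sketch of both cases (assumes the PyTorch engine):

```java
// Sequential collectors: allowed, because close() resets the isCollecting flag.
try (GradientCollector first = Engine.getInstance().newGradientCollector()) {
    // ... backward passes for one batch ...
}
try (GradientCollector second = Engine.getInstance().newGradientCollector()) {
    // ... backward passes for the next batch ...
}

// Overlapping collectors: constructing the second one throws IllegalStateException,
// which testMultipleGradientCollectors below asserts.
```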
@@ -24,6 +24,7 @@
public class TrainAirfoilWithTabNetTest {
@Test
public void testTrainAirfoilWithTabNet() throws TranslateException, IOException {
TestRequirements.nightly();
TestRequirements.engine("MXNet", "PyTorch");
String[] args = new String[] {"-g", "1", "-e", "20", "-b", "32"};
TrainingResult result = TrainAirfoilWithTabNet.runExample(args);
@@ -121,6 +121,46 @@ public void testClearGradients() {
}
}

/** Tests that the gradients do accumulate within the same gradient collector. */
@Test
public void testAccumulateGradients() {
// TODO: MXNet support for accumulating gradients does not currently work
TestRequirements.notEngine("MXNet");
try (NDManager manager = NDManager.newBaseManager()) {
NDArray a = manager.create(0.0f);
a.setRequiresGradient(true);

try (GradientCollector gc = Engine.getInstance().newGradientCollector()) {
for (int i = 1; i <= 3; i++) {
NDArray b = a.mul(2);
gc.backward(b);
Assert.assertEquals(a.getGradient().getFloat(), 2.0f * i);
}
}
}
}

/**
* Ensures that a gradient collector cannot be opened while another one is still active, because
* gradient collectors are global.
*/
@Test
@SuppressWarnings({"try", "PMD.UseTryWithResources"})
public void testMultipleGradientCollectors() {
Assert.assertThrows(
() -> {
GradientCollector gc2 = null;
try (GradientCollector gc = Engine.getInstance().newGradientCollector()) {
gc2 = Engine.getInstance().newGradientCollector();
gc2.close();
} finally {
if (gc2 != null) {
gc2.close();
}
}
});
}

@Test
public void testFreezeParameters() {
try (Model model = Model.newInstance("model")) {