Merge remote-tracking branch 'origin/master' into min/regnet

min-xu-ai committed Mar 30, 2021
2 parents acc95a3 + 5e6a7a5 commit 301d60a
Showing 6 changed files with 75 additions and 45 deletions.
70 changes: 33 additions & 37 deletions .circleci/config.yml
@@ -8,7 +8,8 @@
# Pro tip: download circle ci cli to validate the config locally during development.

version: 2.1
-
+orbs:
+  codecov: codecov/codecov@1.0.2
# -------------------------------------------------------------------------------------
# Environments to run the jobs in
# -------------------------------------------------------------------------------------
@@ -101,9 +102,9 @@ install_dep_171: &install_dep_171
python -m torch.utils.collect_env
wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py
-install_dep_180: &install_dep_180
+install_dep_181: &install_dep_181
- run:
-      name: Install Dependencies with torch 1.8.0
+      name: Install Dependencies with torch 1.8.1
command: |
# make sure that apt-get retries if needed
sudo sh -c "echo 'APT::Acquire::Retries "3";' > /etc/apt/apt.conf.d/80-retries"
@@ -112,7 +113,7 @@ install_dep_180: &install_dep_180
# check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.8 && exit 0; fi
# start installing
-        pip install --progress-bar off torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+        pip install --progress-bar off torch==1.8.1+cu101 torchvision==0.9.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
pip install --progress-bar off -r requirements-test.txt
pip install --progress-bar off -r requirements-benchmarks.txt
python -c 'import torch; print("Torch version:", torch.__version__)'
@@ -159,17 +160,10 @@ check_test_list: &check_test_list
bash ./tests/ci_test_list_check.sh
-# TODO (Min): figure out how to do coverage nightly or on-demand. Doing it
-# on every commit seems like an overkill since we can easily figure out which
-# code is not covered without looking at coverage results from each commit.
-# Also, it is a long pole for testing time, which slows down development a lot.
-run_coverage: &run_coverage
-  - run:
-      name: Run Unit Tests With Coverage
-      command: |
-        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./
-        #Uploading test coverage for Python code
-        bash <(curl -s https://codecov.io/bash) -f coverage.xml -cF Python
+upload_coverage: &upload_coverage
+  - codecov/upload:
+      file: 'coverage.xml'
+      token: $CODECOV_TOKEN

run_mpi_unittests: &run_mpi_unittests
- run:
@@ -233,7 +227,7 @@ run_unittests: &run_unittests
name: Run all unit tests.
# We run all and not stopping on failure on CPU since docker time is cheaper.
command: |
-        pytest --junitxml=test-results/junit.xml --verbose --timeout 60
+        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./
commands:

@@ -249,7 +243,7 @@ commands:
name: Run Unit Tests
command: |
if [ ! -f <<parameters.test_list_file>> ]; then exit 1; fi
-        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 `cat <<parameters.test_list_file>>`
+        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./ `cat <<parameters.test_list_file>>`
# -------------------------------------------------------------------------------------
# Jobs to run
@@ -338,18 +332,14 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
-            - cache-key-cpu-py39-180-4-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-cpu-py39-181-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

-      # py3.9 doesn't work well with torch < 1.8. See this PR:
-      # https://github.com/pytorch/pytorch/pull/50998
-      #
-      # Therefore, we test py39 with torch 1.8.0.
-      - <<: *install_dep_180
+      - <<: *install_dep_181

- save_cache:
paths:
- ~/venv
-          key: cache-key-cpu-py39-180-4-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-cpu-py39-181-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

- <<: *install_repo

@@ -403,6 +393,8 @@ jobs:

- store_test_results:
path: test-results

+      - <<: *upload_coverage

gpu_tests_171:
parameters:
@@ -443,8 +435,10 @@ jobs:

- store_test_results:
path: test-results

+      - <<: *upload_coverage

-  gpu_tests_180:
+  gpu_tests_181:
parameters:
test_list_file:
type: string
@@ -467,14 +461,14 @@
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
-            - cache-key-gpu-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-gpu-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

-      - <<: *install_dep_180
+      - <<: *install_dep_181

- save_cache:
paths:
- ~/venv
-          key: cache-key-gpu-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-gpu-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

- <<: *install_repo

@@ -483,6 +477,8 @@

- store_test_results:
path: test-results

+      - <<: *upload_coverage

benchmarks_1:
<<: *gpu
@@ -505,19 +501,19 @@
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
-            - cache-key-benchmarks-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-benchmarks-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

# Cache the MNIST directory that contains benchmark data
- restore_cache:
keys:
- cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}

-      - <<: *install_dep_180
+      - <<: *install_dep_181

- save_cache:
paths:
- ~/venv
-          key: cache-key-benchmarks-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-benchmarks-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

- <<: *install_repo

@@ -557,20 +553,20 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
-            - cache-key-benchmarks-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-benchmarks-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}


# Cache the MNIST directory that contains benchmark data
- restore_cache:
keys:
- cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}

-      - <<: *install_dep_180
+      - <<: *install_dep_181

- save_cache:
paths:
- ~/venv
-          key: cache-key-benchmarks-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-benchmarks-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

- <<: *install_repo

@@ -593,19 +589,19 @@ workflows:
test_list_file: tests/ci_test_list_1.txt
- gpu_tests_171:
test_list_file: tests/ci_test_list_1.txt
-      - gpu_tests_180:
+      - gpu_tests_181:
test_list_file: tests/ci_test_list_1.txt
- gpu_tests_160:
test_list_file: tests/ci_test_list_2.txt
- gpu_tests_171:
test_list_file: tests/ci_test_list_2.txt
-      - gpu_tests_180:
+      - gpu_tests_181:
test_list_file: tests/ci_test_list_2.txt
- gpu_tests_160:
test_list_file: tests/ci_test_list_3.txt
- gpu_tests_171:
test_list_file: tests/ci_test_list_3.txt
-      - gpu_tests_180:
+      - gpu_tests_181:
test_list_file: tests/ci_test_list_3.txt
- benchmarks_1
- benchmarks_2
17 changes: 17 additions & 0 deletions .codecov.yml
@@ -1,6 +1,23 @@
codecov:
  require_ci_to_pass: yes
+coverage:
+  status:
+    project:
+      default:
+        target: 94%
+        threshold: 0.1%
+status:
+  project: yes
+  patch: yes
+  changes: no
+parsers:
+  gcov:
+    branch_detection:
+      conditional: yes
+      loop: yes
+      method: no
+      macro: no
comment:
  layout: "reach,diff,flags,tree"
  behavior: default
  require_changes: no
2 changes: 1 addition & 1 deletion README.md
@@ -164,7 +164,7 @@ At a high level, we want ML researchers to:

# Testing

-We use circleci to test on PyTorch versions 1.6.0, 1.7.1, and 1.8.0. Please create an [issue](https://github.com/facebookresearch/fairscale/issues) if you are having trouble with installation.
+We use circleci to test on PyTorch versions 1.6.0, 1.7.1, and 1.8.1. Please create an [issue](https://github.com/facebookresearch/fairscale/issues) if you are having trouble with installation.

## Contributors

24 changes: 19 additions & 5 deletions fairscale/experimental/nn/multiprocess_pipe.py
@@ -11,6 +11,7 @@
from torch import Tensor
import torch.distributed.rpc as rpc
import torch.nn as nn
+from torch.utils.checkpoint import checkpoint_sequential

Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
@@ -69,6 +70,12 @@ def _rcat(tensors: List) -> Tensor:
return torch.cat([t.local_value() for t in tensors])


+def _rcheckpoint(rmodule: rpc.RRef, input_rref: rpc.RRef) -> TensorOrTensors:
+    module = rmodule.local_value()
+    input = module[0](input_rref)  # calls _ToHere.forward
+    return checkpoint_sequential(module[1:], 1, input)


def _parameter_rrefs(module: rpc.RRef) -> List[rpc.RRef]:
return [rpc.RRef(p) for p in module.local_value().parameters()]

@@ -159,8 +166,8 @@ def __init__(

        if type(chunks) is not int or chunks <= 0:
            raise ValueError("number of chunks must be positive integer")
-        if checkpoint not in ["never"]:
-            raise ValueError("checkpoint is not yet implemented")
+        if checkpoint not in ["always", "except_last", "never"]:
+            raise ValueError("checkpoint is not one of 'always', 'except_last', or 'never'")
        if deferred_batch_norm:
            raise ValueError("deferred_batch_norm is not yet implemented")
        if len(balance) != len(devices):
@@ -181,6 +188,9 @@ def __init__(
            workers.append(worker)
            rmodule.append(rlayer)

+        # The micro-batch index where the checkpointing stops.
+        self.checkpoint_stop = {"always": chunks, "except_last": chunks - 1, "never": 0}[checkpoint]

        self.chunks = chunks
        self.checkpoint = checkpoint
        self.module = module
@@ -189,10 +199,14 @@

    def forward(self, x: Tensor) -> rpc.RRef:  # type: ignore
        outputs = []
-        for chunk in x.chunk(self.chunks):
+        for i, chunk in enumerate(x.chunk(self.chunks)):
            output = rpc.RRef(chunk)
-            for rlayer in self.rmodule:
-                output = rlayer.remote().forward(output)
+            if i < self.checkpoint_stop:
+                for rlayer in self.rmodule:
+                    output = rpc.remote(rlayer.owner(), _rcheckpoint, args=(rlayer, output))
+            else:
+                for rlayer in self.rmodule:
+                    output = rlayer.remote().forward(output)
            outputs.append(output)
        return rpc.remote(outputs[0].owner(), _rcat, args=(outputs,))
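
For intuition, the `checkpoint_stop` mapping introduced above decides how many leading micro-batches take the checkpointed path: "always" checkpoints every micro-batch, "except_last" all but the final one, and "never" none. A small standalone sketch of that selection logic (plain Python; the variable names are illustrative, not part of this diff):

```python
chunks = 4  # number of micro-batches, e.g. MultiProcessPipe(..., chunks=4)

for mode in ["always", "except_last", "never"]:
    # Same mapping as in MultiProcessPipe.__init__ above.
    checkpoint_stop = {"always": chunks, "except_last": chunks - 1, "never": 0}[mode]
    # Micro-batch i is checkpointed exactly when i < checkpoint_stop.
    flags = [i < checkpoint_stop for i in range(chunks)]
    print(f"{mode:<12} -> {flags}")

# always       -> [True, True, True, True]
# except_last  -> [True, True, True, False]
# never        -> [False, False, False, False]
```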

1 change: 1 addition & 0 deletions stubs/torch/utils/checkpoint.pyi
@@ -7,3 +7,4 @@ from torch.nn.modules.module import Module
def detach_variable(inputs: Tuple[Tensor,...]) -> Tuple[Tensor,...]: ...
def checkpoint(function: Module, *args, **kwargs): ...
def check_backward_validity(inputs: Iterable[Any]): ...
+def checkpoint_sequential(function: Module, segments: int, *args, **kwargs): ...
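
For reference, here is how `checkpoint_sequential` (the function this stub declares) is typically called on a plain `nn.Sequential`. This is a minimal sketch independent of the RPC pipeline; the toy model and shapes are made up for illustration:

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

# Toy model: checkpoint_sequential splits it into segments, stores only each
# segment's input during forward, and recomputes activations during backward.
model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4))

# The input must require grad; otherwise there is nothing to recompute against.
x = torch.randn(2, 4, requires_grad=True)

out = checkpoint_sequential(model, 2, x)  # split into 2 segments
out.sum().backward()  # activations inside each segment are recomputed here
```

This is the same memory-for-compute trade that `_rcheckpoint` makes above, which calls it with one segment per remote layer.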
6 changes: 4 additions & 2 deletions tests/experimental/nn/test_multiprocess_pipe.py
@@ -130,13 +130,15 @@ def forward_chunks(devices):

@rpc_test(world_size=2)
@pytest.mark.parametrize("devices", DEVICES)
-def forward_multi(devices):
+@pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"])
+def forward_multi(devices, checkpoint):
device = devices[0].split("/")[1]
torch.random.manual_seed(3)
torch.cuda.manual_seed_all(3)
x = torch.randn(8, 4).to(device)
x.requires_grad = True # TODO(msb) remove this limitation
model = [("linear1", nn.Linear, (4, 4), {}), ("relu", nn.ReLU, (), {})]
-    pipe = MultiProcessPipe(model, balance=[1, 1], chunks=4, devices=devices[:2])
+    pipe = MultiProcessPipe(model, balance=[1, 1], chunks=4, devices=devices[:2], checkpoint=checkpoint)
if BOUNCE_TENSORS:
y = pipe(x).remote().cpu().to_here()
else:
