Merge remote-tracking branch 'origin/master' into min/regnet

min-xu-ai committed Mar 30, 2021
2 parents acc95a3 + 5e6a7a5 commit 301d60a
Showing 6 changed files with 75 additions and 45 deletions.
70 changes: 33 additions & 37 deletions .circleci/config.yml
@@ -8,7 +8,8 @@
# Pro tip: download circle ci cli to validate the config locally during development.

version: 2.1
-
+orbs:
+  codecov: codecov/codecov@1.0.2
# -------------------------------------------------------------------------------------
# Environments to run the jobs in
# -------------------------------------------------------------------------------------
@@ -101,9 +102,9 @@ install_dep_171: &install_dep_171
python -m torch.utils.collect_env
wget -O /home/circleci/venv/check_version.py https://raw.githubusercontent.com/min-xu-ai/check_verion/main/check_version.py
-install_dep_180: &install_dep_180
+install_dep_181: &install_dep_181
- run:
-      name: Install Dependencies with torch 1.8.0
+      name: Install Dependencies with torch 1.8.1
command: |
# make sure that apt-get retries if needed
sudo sh -c "echo 'APT::Acquire::Retries "3";' > /etc/apt/apt.conf.d/80-retries"
@@ -112,7 +113,7 @@ install_dep_180: &install_dep_180
# check if we have restored venv cache (/home/circleci/venv) correctly, if so, just skip
if [ -f /home/circleci/venv/check_version.py ]; then python /home/circleci/venv/check_version.py torch eq 1.8 && exit 0; fi
# start installing
-        pip install --progress-bar off torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+        pip install --progress-bar off torch==1.8.1+cu101 torchvision==0.9.1+cu101 -f https://download.pytorch.org/whl/torch_stable.html
pip install --progress-bar off -r requirements-test.txt
pip install --progress-bar off -r requirements-benchmarks.txt
python -c 'import torch; print("Torch version:", torch.__version__)'
@@ -159,17 +160,10 @@ check_test_list: &check_test_list
bash ./tests/ci_test_list_check.sh
-# TODO (Min): figure out how to do coverage nightly or on-demand. Doing it
-# on every commit seems like an overkill since we can easily figure out which
-# code is not covered without looking at coverage results from each commit.
-# Also, it is a long pole for testing time, which slows down development a lot.
-run_coverage: &run_coverage
-  - run:
-      name: Run Unit Tests With Coverage
-      command: |
-        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./
-        #Uploading test coverage for Python code
-        bash <(curl -s https://codecov.io/bash) -f coverage.xml -cF Python
+upload_coverage: &upload_coverage
+  - codecov/upload:
+      file: 'coverage.xml'
+      token: $CODECOV_TOKEN

run_mpi_unittests: &run_mpi_unittests
- run:
@@ -233,7 +227,7 @@ run_unittests: &run_unittests
name: Run all unit tests.
# We run all and not stopping on failure on CPU since docker time is cheaper.
command: |
-        pytest --junitxml=test-results/junit.xml --verbose --timeout 60
+        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./
commands:

@@ -249,7 +243,7 @@ commands:
name: Run Unit Tests
command: |
if [ ! -f <<parameters.test_list_file>> ]; then exit 1; fi
-        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 `cat <<parameters.test_list_file>>`
+        pytest --junitxml=test-results/junit.xml --verbose --timeout 60 --cov-report=xml --cov=./ `cat <<parameters.test_list_file>>`
# -------------------------------------------------------------------------------------
# Jobs to run
@@ -338,18 +332,14 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
-            - cache-key-cpu-py39-180-4-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-cpu-py39-181-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

-      # py3.9 doesn't work well with torch < 1.8. See this PR:
-      # https://github.com/pytorch/pytorch/pull/50998
-      #
-      # Therefore, we test py39 with torch 1.8.0.
-      - <<: *install_dep_180
+      - <<: *install_dep_181

- save_cache:
paths:
- ~/venv
-          key: cache-key-cpu-py39-180-4-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-cpu-py39-181-0-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

- <<: *install_repo

@@ -403,6 +393,8 @@ jobs:

- store_test_results:
path: test-results

+      - <<: *upload_coverage

gpu_tests_171:
parameters:
@@ -443,8 +435,10 @@ jobs:

- store_test_results:
path: test-results

+      - <<: *upload_coverage

-  gpu_tests_180:
+  gpu_tests_181:
parameters:
test_list_file:
type: string
@@ -467,14 +461,14 @@
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
-            - cache-key-gpu-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-gpu-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

-      - <<: *install_dep_180
+      - <<: *install_dep_181

- save_cache:
paths:
- ~/venv
-          key: cache-key-gpu-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-gpu-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

- <<: *install_repo

@@ -483,6 +477,8 @@

- store_test_results:
path: test-results

+      - <<: *upload_coverage

benchmarks_1:
<<: *gpu
@@ -505,19 +501,19 @@
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
-            - cache-key-benchmarks-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-benchmarks-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

# Cache the MNIST directory that contains benchmark data
- restore_cache:
keys:
- cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}

-      - <<: *install_dep_180
+      - <<: *install_dep_181

- save_cache:
paths:
- ~/venv
-          key: cache-key-benchmarks-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-benchmarks-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

- <<: *install_repo

@@ -557,20 +553,20 @@ jobs:
# Cache the venv directory that contains dependencies
- restore_cache:
keys:
-            - cache-key-benchmarks-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+            - cache-key-benchmarks-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}


# Cache the MNIST directory that contains benchmark data
- restore_cache:
keys:
- cache-key-benchmark-MNIST-{{ checksum "benchmarks/datasets/mnist.py"}}

-      - <<: *install_dep_180
+      - <<: *install_dep_181

- save_cache:
paths:
- ~/venv
-          key: cache-key-benchmarks-180-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}
+          key: cache-key-benchmarks-181-101-{{ checksum "setup.py"}}-{{ checksum "requirements-test.txt"}}

- <<: *install_repo

@@ -593,19 +589,19 @@ workflows:
test_list_file: tests/ci_test_list_1.txt
- gpu_tests_171:
test_list_file: tests/ci_test_list_1.txt
-      - gpu_tests_180:
+      - gpu_tests_181:
test_list_file: tests/ci_test_list_1.txt
- gpu_tests_160:
test_list_file: tests/ci_test_list_2.txt
- gpu_tests_171:
test_list_file: tests/ci_test_list_2.txt
-      - gpu_tests_180:
+      - gpu_tests_181:
test_list_file: tests/ci_test_list_2.txt
- gpu_tests_160:
test_list_file: tests/ci_test_list_3.txt
- gpu_tests_171:
test_list_file: tests/ci_test_list_3.txt
-      - gpu_tests_180:
+      - gpu_tests_181:
test_list_file: tests/ci_test_list_3.txt
- benchmarks_1
- benchmarks_2
17 changes: 17 additions & 0 deletions .codecov.yml
@@ -1,6 +1,23 @@
codecov:
  require_ci_to_pass: yes
+coverage:
+  status:
+    project:
+      default:
+        target: 94%
+        threshold: 0.1%
+status:
+  project: yes
+  patch: yes
+  changes: no
+parsers:
+  gcov:
+    branch_detection:
+      conditional: yes
+      loop: yes
+      method: no
+      macro: no
comment:
  layout: "reach,diff,flags,tree"
  behavior: default
  require_changes: no
2 changes: 1 addition & 1 deletion README.md
@@ -164,7 +164,7 @@ At a high level, we want ML researchers to:

# Testing

-We use circleci to test on PyTorch versions 1.6.0, 1.7.1, and 1.8.0. Please create an [issue](https://github.com/facebookresearch/fairscale/issues) if you are having trouble with installation.
+We use circleci to test on PyTorch versions 1.6.0, 1.7.1, and 1.8.1. Please create an [issue](https://github.com/facebookresearch/fairscale/issues) if you are having trouble with installation.

## Contributors

24 changes: 19 additions & 5 deletions fairscale/experimental/nn/multiprocess_pipe.py
@@ -11,6 +11,7 @@
from torch import Tensor
import torch.distributed.rpc as rpc
import torch.nn as nn
+from torch.utils.checkpoint import checkpoint_sequential

Tensors = Tuple[Tensor, ...]
TensorOrTensors = Union[Tensor, Tensors]
@@ -69,6 +70,12 @@ def _rcat(tensors: List) -> Tensor:
return torch.cat([t.local_value() for t in tensors])


+def _rcheckpoint(rmodule: rpc.RRef, input_rref: rpc.RRef) -> TensorOrTensors:
+    module = rmodule.local_value()
+    input = module[0](input_rref)  # calls _ToHere.forward
+    return checkpoint_sequential(module[1:], 1, input)


def _parameter_rrefs(module: rpc.RRef) -> List[rpc.RRef]:
return [rpc.RRef(p) for p in module.local_value().parameters()]

@@ -159,8 +166,8 @@ def __init__(

        if type(chunks) is not int or chunks <= 0:
            raise ValueError("number of chunks must be positive integer")
-        if checkpoint not in ["never"]:
-            raise ValueError("checkpoint is not yet implemented")
+        if checkpoint not in ["always", "except_last", "never"]:
+            raise ValueError("checkpoint is not one of 'always', 'except_last', or 'never'")
        if deferred_batch_norm:
            raise ValueError("deferred_batch_norm is not yet implemented")
        if len(balance) != len(devices):
@@ -181,6 +188,9 @@ def __init__(
            workers.append(worker)
            rmodule.append(rlayer)

+        # The micro-batch index where the checkpointing stops.
+        self.checkpoint_stop = {"always": chunks, "except_last": chunks - 1, "never": 0}[checkpoint]

        self.chunks = chunks
        self.checkpoint = checkpoint
        self.module = module
@@ -189,10 +199,14 @@

    def forward(self, x: Tensor) -> rpc.RRef:  # type: ignore
        outputs = []
-        for chunk in x.chunk(self.chunks):
+        for i, chunk in enumerate(x.chunk(self.chunks)):
            output = rpc.RRef(chunk)
-            for rlayer in self.rmodule:
-                output = rlayer.remote().forward(output)
+            if i < self.checkpoint_stop:
+                for rlayer in self.rmodule:
+                    output = rpc.remote(rlayer.owner(), _rcheckpoint, args=(rlayer, output))
+            else:
+                for rlayer in self.rmodule:
+                    output = rlayer.remote().forward(output)
            outputs.append(output)
        return rpc.remote(outputs[0].owner(), _rcat, args=(outputs,))
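
For intuition, the `checkpoint_stop` mapping introduced above decides how many leading micro-batches take the checkpointed path: "always" checkpoints every micro-batch, "except_last" all but the final one, and "never" none. A small standalone sketch of that selection logic (plain Python; the variable names are illustrative, not part of this diff):

```python
chunks = 4  # number of micro-batches, e.g. MultiProcessPipe(..., chunks=4)

for mode in ["always", "except_last", "never"]:
    # Same mapping as in MultiProcessPipe.__init__ above.
    checkpoint_stop = {"always": chunks, "except_last": chunks - 1, "never": 0}[mode]
    # Micro-batch i is checkpointed exactly when i < checkpoint_stop.
    flags = [i < checkpoint_stop for i in range(chunks)]
    print(f"{mode:<12} -> {flags}")

# always       -> [True, True, True, True]
# except_last  -> [True, True, True, False]
# never        -> [False, False, False, False]
```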

1 change: 1 addition & 0 deletions stubs/torch/utils/checkpoint.pyi
@@ -7,3 +7,4 @@ from torch.nn.modules.module import Module
def detach_variable(inputs: Tuple[Tensor,...]) -> Tuple[Tensor,...]: ...
def checkpoint(function: Module, *args, **kwargs): ...
def check_backward_validity(inputs: Iterable[Any]): ...
+def checkpoint_sequential(function: Module, segments: int, *args, **kwargs): ...
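
For reference, here is how `checkpoint_sequential` (the function this stub declares) is typically called on a plain `nn.Sequential`. This is a minimal sketch independent of the RPC pipeline; the toy model and shapes are made up for illustration:

```python
import torch
import torch.nn as nn
from torch.utils.checkpoint import checkpoint_sequential

# Toy model: checkpoint_sequential splits it into segments, stores only each
# segment's input during forward, and recomputes activations during backward.
model = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 4))

# The input must require grad; otherwise there is nothing to recompute against.
x = torch.randn(2, 4, requires_grad=True)

out = checkpoint_sequential(model, 2, x)  # split into 2 segments
out.sum().backward()  # activations inside each segment are recomputed here
```

This is the same memory-for-compute trade that `_rcheckpoint` makes above, which calls it with one segment per remote layer.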
6 changes: 4 additions & 2 deletions tests/experimental/nn/test_multiprocess_pipe.py
@@ -130,13 +130,15 @@ def forward_chunks(devices):

@rpc_test(world_size=2)
@pytest.mark.parametrize("devices", DEVICES)
-def forward_multi(devices):
+@pytest.mark.parametrize("checkpoint", ["never", "always", "except_last"])
+def forward_multi(devices, checkpoint):
device = devices[0].split("/")[1]
torch.random.manual_seed(3)
torch.cuda.manual_seed_all(3)
x = torch.randn(8, 4).to(device)
x.requires_grad = True # TODO(msb) remove this limitation
model = [("linear1", nn.Linear, (4, 4), {}), ("relu", nn.ReLU, (), {})]
-    pipe = MultiProcessPipe(model, balance=[1, 1], chunks=4, devices=devices[:2])
+    pipe = MultiProcessPipe(model, balance=[1, 1], chunks=4, devices=devices[:2], checkpoint=checkpoint)
if BOUNCE_TENSORS:
y = pipe(x).remote().cpu().to_here()
else:
