Runner profiler update #1348

Merged: 29 commits from runner_profiler_update into master on Nov 23, 2021

Commits
51f0f47
profiler is added to runner
asteyo Nov 6, 2021
cb5b4ea
profiler minimal example import fix
asteyo Nov 8, 2021
de4c5f4
runner docs updated
asteyo Nov 8, 2021
aad700a
logdir added to profiler
asteyo Nov 8, 2021
0fc2564
tensorboard_path fix
asteyo Nov 8, 2021
95e5ade
runner profiler callback updated
asteyo Nov 11, 2021
a2b3f00
Merge branch 'master' into runner_profiler_update
asteyo Nov 11, 2021
51a899d
fix codestyle
asteyo Nov 11, 2021
b54db88
Merge branch 'runner_profiler_update' of https://github.com/asteyo/ca…
asteyo Nov 11, 2021
c015c1c
Merge branch 'master' into runner_profiler_update
asteyo Nov 11, 2021
5b6ed80
key_loader fix
asteyo Nov 11, 2021
5cefd03
Merge branch 'runner_profiler_update' of https://github.com/asteyo/ca…
asteyo Nov 11, 2021
d4d3b81
profiler_kwargs fix
asteyo Nov 11, 2021
64715ef
profiler updated
asteyo Nov 11, 2021
7db8df5
profiler on_experiment_start method refactoring
asteyo Nov 11, 2021
5f16fed
logs path fixed
asteyo Nov 11, 2021
bd03a44
Revert "logs path fixed"
asteyo Nov 11, 2021
45fc5b6
profiler test is added
asteyo Nov 21, 2021
1d69e8c
flake8 update
asteyo Nov 21, 2021
65e831b
Merge branch 'master' into runner_profiler_update
asteyo Nov 21, 2021
a8c4ea7
profiler availability check is added
asteyo Nov 22, 2021
64b390e
profiler reqs is added
asteyo Nov 22, 2021
506ee23
Merge branch 'runner_profiler_update' of https://github.com/asteyo/ca…
asteyo Nov 22, 2021
f6c2d67
setup docs updated
asteyo Nov 22, 2021
023d398
codestyle fix
asteyo Nov 22, 2021
dc6371a
dl_cpu.yml refactoring
asteyo Nov 22, 2021
028c407
Revert "dl_cpu.yml refactoring"
asteyo Nov 22, 2021
9e0fe35
profiler added to dl_cpu.yml
asteyo Nov 22, 2021
fca5fef
Merge branch 'master' into runner_profiler_update
asteyo Nov 22, 2021
3 changes: 2 additions & 1 deletion .github/workflows/dl_cpu.yml
@@ -69,6 +69,7 @@ jobs:
python -c "req = open('./requirements/requirements-ml.txt').read().replace('>', '=') ; open('./requirements/requirements-ml.txt', 'w').write(req)"
python -c "req = open('./requirements/requirements-neptune.txt').read().replace('>', '=') ; open('./requirements/requirements-neptune.txt', 'w').write(req)"
python -c "req = open('./requirements/requirements-albu.txt').read().replace('>', '=') ; open('./requirements/requirements-albu.txt', 'w').write(req)"
python -c "req = open('./requirements/requirements-profiler.txt').read().replace('>', '=') ; open('./requirements/requirements-profiler.txt', 'w').write(req)"

# https://github.com/actions/cache/blob/master/examples.md
# Note: This uses an internal pip API and may not always work
@@ -97,7 +98,7 @@ jobs:
      - name: install dependencies
        run: |
          # python -m pip install --upgrade --user pip
-          pip install -r ./requirements/requirements.txt -r ./requirements/requirements-cv.txt -r ./requirements/requirements-dev.txt -r ./requirements/requirements-hydra.txt -r ./requirements/requirements-ml.txt -r ./requirements/requirements-optuna.txt -r ./requirements/requirements-mlflow.txt -r ./requirements/requirements-nifti.txt -r ./requirements/requirements-neptune.txt -r ./requirements/requirements-albu.txt
+          pip install -r ./requirements/requirements.txt -r ./requirements/requirements-cv.txt -r ./requirements/requirements-dev.txt -r ./requirements/requirements-hydra.txt -r ./requirements/requirements-ml.txt -r ./requirements/requirements-optuna.txt -r ./requirements/requirements-mlflow.txt -r ./requirements/requirements-nifti.txt -r ./requirements/requirements-neptune.txt -r ./requirements/requirements-albu.txt -r ./requirements/requirements-profiler.txt
          python --version
          pip --version
          pip list
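Note on the CI change: the new one-liner mirrors the existing ones, pinning the profiler requirements for the CPU test matrix by rewriting version constraints, so ">=" becomes "==" (and a bare ">" becomes "="). A minimal sketch of the same transform, with an illustrative constraint that is not taken from the PR:

# What the CI one-liner does to a constraint string:
# ">=" -> "==" (and a bare ">" -> "="), pinning the version for CI.
line = "torch_tb_profiler>=0.2"   # illustrative constraint, not from the PR
print(line.replace(">", "="))     # -> "torch_tb_profiler==0.2"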
16 changes: 4 additions & 12 deletions catalyst/callbacks/profiler.py
@@ -51,7 +51,6 @@ class ProfilerCallback(Callback):
from catalyst import dl
from catalyst.data import ToTensor
from catalyst.contrib.datasets import MNIST
-from catalyst.contrib.layers import Flatten

loaders = {
    "train": DataLoader(
@@ -64,7 +63,7 @@ class ProfilerCallback(Callback):
    ),
}

-model = nn.Sequential(Flatten(), nn.Linear(784, 512), nn.ReLU(), nn.Linear(512, 10))
+model = nn.Sequential(nn.Flatten(), nn.Linear(784, 512), nn.ReLU(), nn.Linear(512, 10))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
runner = dl.SupervisedRunner()
@@ -122,16 +121,6 @@ def __init__(
        self.profiler = None
        self.stats = None

-    def on_experiment_start(self, runner: IRunner) -> None:
-        """
-        On batch end action
-
-        Args:
-            runner: current runner
-        """
-        if self.loader_key is None:
-            self.loader_key = runner.loader_key  # use first loader for profile
-
    def _should_use_profiler(self, loader_key: str, epoch: int):
        if self.loader_key == loader_key and self.epoch == epoch:
            if self.num_batches is not None:
@@ -186,6 +175,9 @@ def on_loader_start(self, runner: IRunner) -> None:
        Args:
            runner: current runner
        """
+        if self.loader_key is None:
+            self.loader_key = runner.loader_key  # use first loader for profile
+
        self._enter_profiler(runner)

    def on_loader_end(self, runner: IRunner) -> None:
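The substantive change in this file: the "profile the first loader" fallback moves from on_experiment_start into on_loader_start, so the callback no longer depends on the experiment-start event firing before loaders run. A minimal sketch of the resulting flow (simplified; names mirror the diff, the actual profiler enter/exit logic is omitted):

# Simplified sketch of ProfilerCallback's loader selection after this PR.
class ProfilerCallbackSketch:
    def __init__(self, loader_key=None, epoch=1):
        self.loader_key = loader_key  # None -> profile the first loader seen
        self.epoch = epoch

    def on_loader_start(self, runner):
        if self.loader_key is None:
            self.loader_key = runner.loader_key  # use first loader for profile
        # the real callback would call self._enter_profiler(runner) here

    def _should_use_profiler(self, loader_key, epoch):
        # profile only the configured loader/epoch pair
        return self.loader_key == loader_key and self.epoch == epoch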
21 changes: 21 additions & 0 deletions catalyst/runners/runner.py
@@ -12,6 +12,7 @@
from catalyst.callbacks.criterion import CriterionCallback, ICriterionCallback
from catalyst.callbacks.misc import CheckRunCallback, TimerCallback, TqdmCallback
from catalyst.callbacks.optimizer import IOptimizerCallback, OptimizerCallback
+from catalyst.callbacks.profiler import ProfilerCallback
from catalyst.callbacks.scheduler import ISchedulerCallback, SchedulerCallback
from catalyst.core._misc import callback_isinstance, sort_callbacks_by_order
from catalyst.core.callback import Callback
@@ -192,6 +193,7 @@ def __init__(self, *args, **kwargs):
        self._timeit = False
        self._check = False
        self._overfit = False
+        self._profile = False
        self._load_best_on_end = False

    @property
@@ -304,6 +306,21 @@ def get_callbacks(self, stage: str) -> "OrderedDict[str, Callback]":
callbacks["_check"] = CheckRunCallback()
if self._overfit and not is_callback_exists(BatchOverfitCallback):
callbacks["_overfit"] = BatchOverfitCallback()
if self._profile and not is_callback_exists(ProfilerCallback):
callbacks["_profile"] = ProfilerCallback(
tensorboard_path=os.path.join(self._logdir, "tb_profile"),
profiler_kwargs={
"activities": [
torch.profiler.ProfilerActivity.CPU,
torch.profiler.ProfilerActivity.CUDA,
],
"on_trace_ready": torch.profiler.tensorboard_trace_handler(
os.path.join(self._logdir, "tb_profile")
),
"with_stack": True,
"with_flops": True,
},
)

if self._logdir is not None and not is_callback_exists(ICheckpointCallback):
callbacks["_checkpoint"] = CheckpointCallback(
@@ -348,6 +365,7 @@ def train(
        timeit: bool = False,
        check: bool = False,
        overfit: bool = False,
+        profile: bool = False,
        load_best_on_end: bool = False,
        # engine extra params,
        fp16: bool = False,
@@ -390,6 +408,8 @@
            overfit: if True, then takes only one batch per loader
                for model overfitting, for advance usage please check
                ``BatchOverfitCallback``
+            profile: if True, then uses ProfilerCallback, for advance usage please check
+                ``ProfilerCallback``
            load_best_on_end: if True, Runner will load
                best checkpoint state (model, optimizer, etc)
                according to validation metrics. Requires specified ``logdir``.
@@ -516,6 +536,7 @@ def on_loader_end(self, runner):
        self._timeit = timeit
        self._check = check
        self._overfit = overfit
+        self._profile = profile
        self._load_best_on_end = load_best_on_end
        # run
        self.run()
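For users, the new flag is a one-liner: traces land under <logdir>/tb_profile (written by torch.profiler.tensorboard_trace_handler) and can be opened in TensorBoard with the torch-tb-profiler plugin. A minimal usage sketch, assuming model, loaders, criterion, and optimizer are defined as in the ProfilerCallback docstring example above:

# Minimal usage of the new `profile` flag (setup elided).
runner = dl.SupervisedRunner()
runner.train(
    model=model,
    loaders=loaders,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=1,
    logdir="./logs",
    profile=True,  # adds ProfilerCallback; traces go to ./logs/tb_profile
)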
1 change: 1 addition & 0 deletions docs/faq/settings.rst
@@ -24,6 +24,7 @@ Nevertheless, there are much more availabe:
pip install catalyst[onnx-gpu] # + onnx, onnxruntime-gpu
pip install catalyst[onnx] # + onnx, onnxruntime
pip install catalyst[optuna] # + optuna
+pip install catalyst[profiler] # + profiler
pip install catalyst[wandb] # + wandb
pip install catalyst[all] # + catalyst[cv], catalyst[ml], catalyst[optuna]

1 change: 1 addition & 0 deletions requirements/requirements-profiler.txt
@@ -0,0 +1 @@
torch_tb_profiler
1 change: 1 addition & 0 deletions setup.py
@@ -54,6 +54,7 @@ def load_version():
"onnx-gpu": load_requirements("requirements/requirements-onnx-gpu.txt"),
"onnx": load_requirements("requirements/requirements-onnx.txt"),
"optuna": load_requirements("requirements/requirements-optuna.txt"),
"profiler": load_requirements("requirements/requirements-profiler.txt"),
"wandb": load_requirements("requirements/requirements-wandb.txt"),
# "xla": load_requirements("requirements/requirements-xla.txt"),
}
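The extras entry makes "pip install catalyst[profiler]" resolve requirements/requirements-profiler.txt. A sketch of the load_requirements helper this relies on; its real implementation lives earlier in setup.py and may differ, this is only the shape implied by its usage here:

# Assumed shape of setup.py's load_requirements helper (not shown in this diff).
def load_requirements(filename):
    with open(filename) as f:
        return [line.strip() for line in f if line.strip() and not line.startswith("#")]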
54 changes: 54 additions & 0 deletions tests/pipelines/test_runner.py
@@ -0,0 +1,54 @@
# flake8: noqa

import os

from pytest import mark

import torch
from torch import nn
from torch.utils.data import DataLoader

from catalyst import dl
from catalyst.contrib.data import ImageToTensor
from catalyst.contrib.datasets import MNIST


def _is_profile_available():
    try:
        from torch import profiler  # noqa: F401

        return True
    except ImportError:
        return False


def train_experiment():
    loaders = {
        "train": DataLoader(
            MNIST(os.getcwd(), train=False, download=True, transform=ImageToTensor()),
            batch_size=32,
        ),
        "valid": DataLoader(
            MNIST(os.getcwd(), train=False, download=True, transform=ImageToTensor()),
            batch_size=32,
        ),
    }
    model = nn.Sequential(nn.Flatten(), nn.Linear(784, 512), nn.ReLU(), nn.Linear(512, 10))
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
    runner = dl.SupervisedRunner()
    runner.train(
        model=model,
        loaders=loaders,
        criterion=criterion,
        optimizer=optimizer,
        num_epochs=5,
        logdir="./logs",
        profile=True,
    )


@mark.skipif(not _is_profile_available(), reason="Torch profiler is not available")
def test_profiler():
    train_experiment()
    assert os.path.isdir("./logs/tb_profile") and len(os.listdir("./logs/tb_profile")) > 0
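The assertion only checks that the trace directory is non-empty. tensorboard_trace_handler writes one *.pt.trace.json file per profiled run, so a stricter check, hypothetical and not part of the PR, could look for those artifacts directly:

# Hypothetical stricter check, not part of the PR: look for the
# *.pt.trace.json files that tensorboard_trace_handler emits.
import glob

traces = glob.glob("./logs/tb_profile/**/*.pt.trace.json", recursive=True)
assert len(traces) > 0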