Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Support PL 1.8 #910

Merged
merged 13 commits into from
Nov 1, 2022
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pl_bolts/callbacks/printing.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ class PrintTableMetricsCallback(Callback):
def __init__(self) -> None:
self.metrics: List = []

def on_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
metrics_dict = copy.copy(trainer.callback_metrics)
self.metrics.append(metrics_dict)
rank_zero_info(dicts_to_table(self.metrics))
Expand Down
31 changes: 1 addition & 30 deletions pl_bolts/callbacks/sparseml.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,42 +51,13 @@ def on_fit_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
raise MisconfigurationException("SparseML only supports training with one optimizer.")
optimizer = optimizer[0]
optimizer = self.manager.modify(
pl_module, optimizer, steps_per_epoch=self._num_training_steps_per_epoch(trainer), epoch=0
pl_module, optimizer, steps_per_epoch=trainer.estimated_stepping_batches, epoch=0
)
trainer.optimizers = [optimizer]

def on_fit_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
self.manager.finalize(pl_module)

def _num_training_steps_per_epoch(self, trainer: Trainer) -> int:
"""Total training steps inferred from the datamodule and devices."""
if isinstance(trainer.limit_train_batches, int) and trainer.limit_train_batches != 0:
dataset_size = trainer.limit_train_batches
elif isinstance(trainer.limit_train_batches, float):
# limit_train_batches is a percentage of batches
dataset_size = len(trainer.datamodule.train_dataloader())
dataset_size = int(dataset_size * trainer.limit_train_batches)
else:
dataset_size = len(trainer.datamodule.train_dataloader())

if hasattr(trainer, "num_devices"):
# New behavior in Lightning
num_devices = max(1, trainer.num_devices)
else:
# Old behavior deprecated in v1.6
num_devices = max(1, trainer.num_gpus, trainer.num_processes)
if trainer.tpu_cores:
num_devices = max(num_devices, trainer.tpu_cores)

effective_batch_size = trainer.accumulate_grad_batches * num_devices
max_estimated_steps = dataset_size // effective_batch_size

# To avoid breaking changes, max_steps is set to -1 if it is not defined
max_steps = -1 if not trainer.max_steps else trainer.max_steps
if max_steps != -1 and max_steps < max_estimated_steps:
return max_steps
return max_estimated_steps

@staticmethod
def export_to_sparse_onnx(
model: LightningModule, output_dir: str, sample_batch: Optional[torch.Tensor] = None, **export_kwargs: Any
Expand Down
8 changes: 4 additions & 4 deletions pl_bolts/callbacks/ssl_online.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ def setup(self, trainer: Trainer, pl_module: LightningModule, stage: Optional[st
if self.dataset is None:
self.dataset = trainer.datamodule.name

def on_pretrain_routine_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
def on_fit_start(self, trainer: Trainer, pl_module: LightningModule) -> None:
# must move to device after setup, as during setup, pl_module is still on cpu
self.online_evaluator = SSLEvaluator(
n_input=self.z_dim,
Expand Down Expand Up @@ -167,11 +167,11 @@ def on_validation_batch_end(
pl_module.log("online_val_acc", val_acc, on_step=False, on_epoch=True, sync_dist=True)
pl_module.log("online_val_loss", mlp_loss, on_step=False, on_epoch=True, sync_dist=True)

def on_save_checkpoint(self, trainer: Trainer, pl_module: LightningModule, checkpoint: Dict[str, Any]) -> dict:
def state_dict(self) -> dict:
return {"state_dict": self.online_evaluator.state_dict(), "optimizer_state": self.optimizer.state_dict()}

def on_load_checkpoint(self, trainer: Trainer, pl_module: LightningModule, callback_state: Dict[str, Any]) -> None:
self._recovered_callback_state = callback_state
def load_state_dict(self, state_dict: Dict[str, Any]) -> None:
self._recovered_callback_state = state_dict


@under_review()
Expand Down
2 changes: 1 addition & 1 deletion pl_bolts/callbacks/variational.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def __init__(
self.normalize = normalize
self.steps = steps

def on_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
if (trainer.current_epoch + 1) % self.interpolate_epoch_interval == 0:
images = self.interpolate_latent_space(pl_module, latent_dim=pl_module.hparams.latent_dim)
images = torch.cat(images, dim=0)
Expand Down
2 changes: 1 addition & 1 deletion pl_bolts/callbacks/vision/image_generation.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def __init__(
self.scale_each = scale_each
self.pad_value = pad_value

def on_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
def on_train_epoch_end(self, trainer: Trainer, pl_module: LightningModule) -> None:
dim = (self.num_samples, pl_module.hparams.latent_dim)
z = torch.normal(mean=0.0, std=1.0, size=dim, device=pl_module.device)

Expand Down
2 changes: 1 addition & 1 deletion pl_bolts/models/rl/double_dqn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -57,7 +57,7 @@ def training_step(self, batch: Tuple[Tensor, Tensor], _) -> OrderedDict:
# calculates training loss
loss = double_dqn_loss(batch, self.net, self.target_net, self.gamma)

if self._use_dp_or_ddp2(self.trainer):
if self._use_dp(self.trainer):
loss = loss.unsqueeze(0)

# Soft update of target network
Expand Down
8 changes: 4 additions & 4 deletions pl_bolts/models/rl/dqn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import torch
from pytorch_lightning import LightningModule, Trainer, seed_everything
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.plugins import DataParallelPlugin, DDP2Plugin
from pytorch_lightning.strategies import DataParallelStrategy
from torch import Tensor, optim
from torch.optim.optimizer import Optimizer
from torch.utils.data import DataLoader
Expand Down Expand Up @@ -269,7 +269,7 @@ def training_step(self, batch: Tuple[Tensor, Tensor], _) -> OrderedDict:
# calculates training loss
loss = dqn_loss(batch, self.net, self.target_net, self.gamma)

if self._use_dp_or_ddp2(self.trainer):
if self._use_dp(self.trainer):
loss = loss.unsqueeze(0)

# Soft update of target network
Expand Down Expand Up @@ -406,8 +406,8 @@ def add_model_specific_args(
return arg_parser

@staticmethod
def _use_dp_or_ddp2(trainer: Trainer) -> bool:
return isinstance(trainer.training_type_plugin, (DataParallelPlugin, DDP2Plugin))
def _use_dp(trainer: Trainer) -> bool:
return isinstance(trainer.strategy, DataParallelStrategy)


@under_review()
Expand Down
2 changes: 1 addition & 1 deletion pl_bolts/models/rl/per_dqn_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ def training_step(self, batch, _) -> OrderedDict:
# calculates training loss
loss, batch_weights = per_dqn_loss(samples, weights, self.net, self.target_net, self.gamma)

if self._use_dp_or_ddp2(self.trainer):
if self._use_dp(self.trainer):
loss = loss.unsqueeze(0)

# update priorities in buffer
Expand Down
2 changes: 1 addition & 1 deletion pl_bolts/models/self_supervised/moco/callbacks.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def __init__(self, initial_lr=0.03, use_cosine_scheduler=False, schedule=(120, 1
self.schedule = schedule
self.max_epochs = max_epochs

def on_epoch_start(self, trainer, pl_module):
def on_train_epoch_start(self, trainer, pl_module):
epoch = trainer.current_epoch
lr = self.lr

Expand Down
12 changes: 6 additions & 6 deletions pl_bolts/models/self_supervised/moco/moco2_module.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

import torch
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.plugins import DDP2Plugin, DDPPlugin
from pytorch_lightning.strategies import DDPStrategy
from torch import nn
from torch.nn import functional as F

Expand Down Expand Up @@ -147,7 +147,7 @@ def _momentum_update_key_encoder(self):
@torch.no_grad()
def _dequeue_and_enqueue(self, keys, queue_ptr, queue):
# gather keys before updating queue
if self._use_ddp_or_ddp2(self.trainer):
if self._use_ddp(self.trainer):
keys = concat_all_gather(keys)

batch_size = keys.shape[0]
Expand Down Expand Up @@ -226,14 +226,14 @@ def forward(self, img_q, img_k, queue):
with torch.no_grad(): # no gradient to keys

# shuffle for making use of BN
if self._use_ddp_or_ddp2(self.trainer):
if self._use_ddp(self.trainer):
img_k, idx_unshuffle = self._batch_shuffle_ddp(img_k)

k = self.encoder_k(img_k) # keys: NxC
k = nn.functional.normalize(k, dim=1)

# undo shuffle
if self._use_ddp_or_ddp2(self.trainer):
if self._use_ddp(self.trainer):
k = self._batch_unshuffle_ddp(k, idx_unshuffle)

# compute logits
Expand Down Expand Up @@ -337,8 +337,8 @@ def add_model_specific_args(parent_parser):
return parser

@staticmethod
def _use_ddp_or_ddp2(trainer: Trainer) -> bool:
return isinstance(trainer.training_type_plugin, (DDPPlugin, DDP2Plugin))
def _use_ddp(trainer: Trainer) -> bool:
return isinstance(trainer.strategy, DDPStrategy)


# utils
Expand Down
58 changes: 17 additions & 41 deletions pl_bolts/utils/__init__.py
Original file line number Diff line number Diff line change
@@ -1,49 +1,25 @@
import importlib
import operator
from typing import Callable

import torch
from packaging.version import Version
from pkg_resources import DistributionNotFound
from pytorch_lightning.utilities import _module_available
from lightning_utilities.core.imports import compare_version, module_available

from pl_bolts.callbacks.verification.batch_gradient import BatchGradientVerification # type: ignore


# Ported from https://github.com/PyTorchLightning/pytorch-lightning/blob/master/pytorch_lightning/utilities/imports.py
def _compare_version(package: str, op: Callable, version: str) -> bool:
"""Compare package version with some requirements.

>>> _compare_version("torch", operator.ge, "0.1")
True
"""
try:
pkg = importlib.import_module(package)
except (ModuleNotFoundError, DistributionNotFound):
return False
try:
pkg_version = Version(pkg.__version__)
except TypeError:
# this is mocked by sphinx, so it shall return True to generate all summaries
return True
return op(pkg_version, Version(version))


_NATIVE_AMP_AVAILABLE: bool = _module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast")

_TORCHVISION_AVAILABLE: bool = _module_available("torchvision")
_GYM_AVAILABLE: bool = _module_available("gym")
_SKLEARN_AVAILABLE: bool = _module_available("sklearn")
_PIL_AVAILABLE: bool = _module_available("PIL")
_OPENCV_AVAILABLE: bool = _module_available("cv2")
_WANDB_AVAILABLE: bool = _module_available("wandb")
_MATPLOTLIB_AVAILABLE: bool = _module_available("matplotlib")
_TORCHVISION_LESS_THAN_0_9_1: bool = _compare_version("torchvision", operator.lt, "0.9.1")
_TORCHVISION_LESS_THAN_0_13: bool = _compare_version("torchvision", operator.le, "0.13.0")
_PL_GREATER_EQUAL_1_4 = _compare_version("pytorch_lightning", operator.ge, "1.4.0")
_PL_GREATER_EQUAL_1_4_5 = _compare_version("pytorch_lightning", operator.ge, "1.4.5")
_TORCH_ORT_AVAILABLE = _module_available("torch_ort")
_TORCH_MAX_VERSION_SPARSEML = _compare_version("torch", operator.lt, "1.11.0")
_SPARSEML_AVAILABLE = _module_available("sparseml") and _PL_GREATER_EQUAL_1_4_5 and _TORCH_MAX_VERSION_SPARSEML
_NATIVE_AMP_AVAILABLE: bool = module_available("torch.cuda.amp") and hasattr(torch.cuda.amp, "autocast")

_TORCHVISION_AVAILABLE: bool = module_available("torchvision")
_GYM_AVAILABLE: bool = module_available("gym")
_SKLEARN_AVAILABLE: bool = module_available("sklearn")
_PIL_AVAILABLE: bool = module_available("PIL")
_OPENCV_AVAILABLE: bool = module_available("cv2")
_WANDB_AVAILABLE: bool = module_available("wandb")
_MATPLOTLIB_AVAILABLE: bool = module_available("matplotlib")
_TORCHVISION_LESS_THAN_0_9_1: bool = compare_version("torchvision", operator.lt, "0.9.1")
_TORCHVISION_LESS_THAN_0_13: bool = compare_version("torchvision", operator.le, "0.13.0")
_PL_GREATER_EQUAL_1_4 = compare_version("pytorch_lightning", operator.ge, "1.4.0")
_PL_GREATER_EQUAL_1_4_5 = compare_version("pytorch_lightning", operator.ge, "1.4.5")
_TORCH_ORT_AVAILABLE = module_available("torch_ort")
_TORCH_MAX_VERSION_SPARSEML = compare_version("torch", operator.lt, "1.11.0")
_SPARSEML_AVAILABLE = module_available("sparseml") and _PL_GREATER_EQUAL_1_4_5 and _TORCH_MAX_VERSION_SPARSEML

__all__ = ["BatchGradientVerification"]
2 changes: 0 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,4 +1,2 @@
torch>=1.9.*
torchmetrics>=0.4.1
pytorch-lightning>=1.6.0
rohitgr7 marked this conversation as resolved.
Show resolved Hide resolved
packaging
9 changes: 0 additions & 9 deletions tests/callbacks/test_data_monitor.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@
import pytest
import torch
from pytorch_lightning import Trainer
from pytorch_lightning.loggers import LoggerCollection, TensorBoardLogger
from torch import nn

from pl_bolts.callbacks import ModuleDataMonitor, TrainingDataMonitor
Expand Down Expand Up @@ -63,14 +62,6 @@ def test_base_no_logger_warning():
monitor.on_train_start(trainer, pl_module=None)


def test_base_unsupported_logger_warning(tmpdir):
"""Test a warning is displayed when an unsupported logger is used."""
monitor = TrainingDataMonitor()
trainer = Trainer(logger=LoggerCollection([TensorBoardLogger(tmpdir)]), callbacks=[monitor])
with pytest.warns(UserWarning, match="does not support logging with LoggerCollection"):
monitor.on_train_start(trainer, pl_module=None)


@mock.patch("pl_bolts.callbacks.data_monitor.TrainingDataMonitor.log_histogram")
def test_training_data_monitor(log_histogram, tmpdir, datadir):
"""Test that the TrainingDataMonitor logs histograms of data points going into training_step."""
Expand Down
2 changes: 1 addition & 1 deletion tests/models/test_mnist_templates.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,4 @@ def test_mnist(tmpdir, datadir, catch_warnings):
)
trainer.fit(model, datamodule=datamodule)
loss = trainer.callback_metrics["train_loss"]
assert loss <= 2.2, "mnist failed"
assert loss <= 2.3, "mnist failed"