diff --git a/configs/trainer/ddp.yaml b/configs/trainer/ddp.yaml
index 8a936a8a1..4db8ecf1f 100644
--- a/configs/trainer/ddp.yaml
+++ b/configs/trainer/ddp.yaml
@@ -1,11 +1,7 @@
 defaults:
   - default.yaml

-# use "ddp_spawn" instead of "ddp",
-# it's slower but normal "ddp" currently doesn't work ideally with hydra
-# https://github.com/facebookresearch/hydra/issues/2070
-# https://pytorch-lightning.readthedocs.io/en/latest/accelerators/gpu_intermediate.html#distributed-data-parallel-spawn
-strategy: ddp_spawn
+strategy: ddp

 accelerator: gpu
 devices: 2

diff --git a/src/models/mnist_module.py b/src/models/mnist_module.py
index e1bb76d93..d27cc9f22 100644
--- a/src/models/mnist_module.py
+++ b/src/models/mnist_module.py
@@ -97,7 +97,7 @@ def on_validation_epoch_end(self):
         self.val_acc_best(acc)  # update best so far val acc
         # log `val_acc_best` as a value through `.compute()` method, instead of as a metric object
         # otherwise metric would be reset by lightning after each epoch
-        self.log("val/acc_best", self.val_acc_best.compute(), prog_bar=True)
+        self.log("val/acc_best", self.val_acc_best.compute(), sync_dist=True, prog_bar=True)

     def test_step(self, batch: Any, batch_idx: int):
         loss, preds, targets = self.model_step(batch)