Runtime estimator #1991

Merged

35 commits
797c50b
checkdown
mvpatel2000 Feb 8, 2023
4a1b452
Merge branch 'dev' into mvpatel2000/bottom-up-runtime-estimator
mvpatel2000 Feb 22, 2023
10ad629
checkdown
mvpatel2000 Feb 22, 2023
cb35e15
add runtime estimator
mvpatel2000 Feb 22, 2023
959dbeb
exprot
mvpatel2000 Feb 22, 2023
f8aeaa1
add prints
mvpatel2000 Feb 22, 2023
a884447
tweak logs
mvpatel2000 Feb 22, 2023
3a958a1
fit start
mvpatel2000 Feb 22, 2023
eb456e4
fix start time
mvpatel2000 Feb 22, 2023
db58030
update
mvpatel2000 Feb 22, 2023
6b80706
update guards
mvpatel2000 Feb 22, 2023
947130a
add eval adjustment
mvpatel2000 Feb 22, 2023
2a04051
update comments
mvpatel2000 Feb 22, 2023
93aa9a7
revert speed monitor changes
mvpatel2000 Feb 22, 2023
40ffdc2
Merge branch 'dev' into mvpatel2000/bottom-up-runtime-estimator
mvpatel2000 Feb 22, 2023
e4f6d1a
add logs
mvpatel2000 Feb 22, 2023
69b5515
Merge branch 'mvpatel2000/bottom-up-runtime-estimator' of github.com:…
mvpatel2000 Feb 22, 2023
4c6eaee
add more logs
mvpatel2000 Feb 22, 2023
3bd132d
move timestamp advance
mvpatel2000 Feb 22, 2023
b9321ab
revert
mvpatel2000 Feb 23, 2023
b86e6f9
Merge branch 'dev' into mvpatel2000/bottom-up-runtime-estimator
mvpatel2000 Feb 23, 2023
eaca75c
simplify ghost batchnorm
mvpatel2000 Feb 23, 2023
ce6abad
scale down image sizes
mvpatel2000 Feb 23, 2023
e7e839d
Merge branch 'dev' into mvpatel2000/bottom-up-runtime-estimator
mvpatel2000 Feb 23, 2023
a0c6d4b
tweak tests
mvpatel2000 Feb 24, 2023
a9e1187
add norms
mvpatel2000 Feb 24, 2023
dfdd866
add norms
mvpatel2000 Feb 24, 2023
4f682a9
reset
mvpatel2000 Feb 24, 2023
2c2645e
make none
mvpatel2000 Feb 24, 2023
e4d92cb
fix change
mvpatel2000 Feb 24, 2023
b85d2f1
add warning
mvpatel2000 Feb 24, 2023
d0e07df
update filter
mvpatel2000 Feb 24, 2023
2369238
Merge branch 'dev' into mvpatel2000/bottom-up-runtime-estimator
mvpatel2000 Feb 24, 2023
5715e53
respond to comments
mvpatel2000 Feb 24, 2023
1b952a9
update ignore warnings
mvpatel2000 Feb 24, 2023
2 changes: 2 additions & 0 deletions composer/callbacks/__init__.py
@@ -14,6 +14,7 @@
from composer.callbacks.memory_monitor import MemoryMonitor
from composer.callbacks.mlperf import MLPerfCallback
from composer.callbacks.optimizer_monitor import OptimizerMonitor
+from composer.callbacks.runtime_estimator import RuntimeEstimator
from composer.callbacks.speed_monitor import SpeedMonitor
from composer.callbacks.threshold_stopper import ThresholdStopper

@@ -28,4 +29,5 @@
'ExportForInferenceCallback',
'ThresholdStopper',
'ImageVisualizer',
+    'RuntimeEstimator',
]
159 changes: 159 additions & 0 deletions composer/callbacks/runtime_estimator.py
@@ -0,0 +1,159 @@
# Copyright 2022 MosaicML Composer authors
# SPDX-License-Identifier: Apache-2.0

"""Estimate total time of training."""
from __future__ import annotations

import time
import warnings
from typing import Any, Dict, List, Optional

from composer.core import Callback, State, TimeUnit
from composer.loggers import Logger

__all__ = ['RuntimeEstimator']


class RuntimeEstimator(Callback):
"""Estimates total training time.

    The total training time is estimated by taking the time elapsed over the duration completed
    so far and extrapolating out to the full duration of the training run.

    This callback provides a best-effort estimate. The estimate may be inaccurate if throughput
    changes during training or if other significant changes are made to the model or dataloader.

Example:
.. doctest::

>>> from composer import Trainer
>>> from composer.callbacks import RuntimeEstimator
>>> # constructing trainer object with this callback
>>> trainer = Trainer(
... model=model,
... train_dataloader=train_dataloader,
... eval_dataloader=eval_dataloader,
... optimizers=optimizer,
... max_duration='1ep',
... callbacks=[RuntimeEstimator()],
... )

The runtime estimate is logged by the :class:`.Logger` to the following key as described below.

+-----------------------------------+---------------------------------------------------------+
| Key | Logged data |
+===================================+=========================================================+
| `wall_clock/remaining_estimate` | Estimated time to completion |
+-----------------------------------+---------------------------------------------------------+

Args:
        skip_batches (int, optional): Number of batches to skip before starting the clock used to
            estimate remaining time. Typically, the first few batches are slower due to dataloader
            warmup, cache warming, and other startup costs. Defaults to 1.
"""

def __init__(self, skip_batches: int = 1) -> None:
self._enabled = True
self.batches_left_to_skip = skip_batches
self.start_time = None
self.start_dur = None

# Keep track of time spent evaluating
self.total_eval_wct = 0.0
self.eval_wct_per_label: Dict[str, List[float]] = {}
# How often eval is called as fraction of total training time
self.eval_frequency_per_label: Dict[str, float] = {}
self.last_elapsed_fraction: float = 0.0

def state_dict(self) -> Dict[str, Any]:
return {
'total_eval_wct': self.total_eval_wct,
'eval_wct_per_label': self.eval_wct_per_label,
'eval_frequency_per_label': self.eval_frequency_per_label,
'last_elapsed_fraction': self.last_elapsed_fraction,
}

def load_state_dict(self, state: Dict[str, Any]) -> None:
self.total_eval_wct = state['total_eval_wct']
self.eval_wct_per_label = state['eval_wct_per_label']
self.eval_frequency_per_label = state['eval_frequency_per_label']
self.last_elapsed_fraction = state['last_elapsed_fraction']

def get_elapsed_duration(self, state: State) -> Optional[float]:
"""Get the elapsed duration.

        Unlike `state.get_elapsed_duration`, this method computes fractional progress within an
        epoch (once at least one epoch has completed) by using the observed number of batches per
        epoch.
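
        For example, at batch 840 of a 10-epoch run whose first 2 epochs took 700 batches
        total (350 per epoch), the elapsed fraction is 840 / (10 * 350) = 0.24.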
"""
if state.max_duration is None:
return None
if state.max_duration.unit == TimeUnit('ep'):
if state.timestamp.epoch.value >= 1:
batches_per_epoch = (state.timestamp.batch -
state.timestamp.batch_in_epoch).value / state.timestamp.epoch.value
return state.timestamp.get('ba').value / (state.max_duration.value * batches_per_epoch)
elif state.dataloader_len is not None:
return state.timestamp.get('ba').value / (state.max_duration.value * state.dataloader_len.value)
elapsed_dur = state.get_elapsed_duration()
if elapsed_dur is not None:
return elapsed_dur.value
return None

def batch_start(self, state: State, logger: Logger) -> None:
if self._enabled and self.start_time is None and self.batches_left_to_skip == 0:
self.start_time = time.time()
self.start_dur = self.get_elapsed_duration(state)
if self.start_dur is None:
warnings.warn('`max_duration` is not set. Cannot estimate remaining time.')
self._enabled = False

def batch_end(self, state: State, logger: Logger) -> None:
if not self._enabled:
return
if self.batches_left_to_skip > 0:
self.batches_left_to_skip -= 1
return

elapsed_dur = self.get_elapsed_duration(state)
assert elapsed_dur is not None, 'max_duration checked as non-None on batch_start'

assert self.start_dur is not None
assert self.start_time is not None
if elapsed_dur > self.start_dur:
elapsed_time = time.time() - self.start_time
elapsed_time -= self.total_eval_wct # Subtract time spent evaluating
rate = elapsed_time / (elapsed_dur - self.start_dur)
remaining_time = rate * (1 - elapsed_dur)
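            # Example: with start_dur = 0.05, elapsed_dur = 0.25, and 600s of measured train
            # time, rate = 600 / 0.20 = 3000s per unit of progress, so 3000 * 0.75 = 2250s remain.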

            # Add remaining time from each evaluator using known frequencies. We explicitly compute
            # frequency instead of using time interpolation to avoid a sawtooth pattern in the estimates
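            # Example: an evaluator with eval_rate = 0.1 is expected to run 10 times in total;
            # if 3 calls averaging 30s have finished, the loop below adds (10 - 3) * 30 = 210s.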
for dataloader_label, eval_wcts in self.eval_wct_per_label.items():
# Discard first eval_wct if possible as it is often slower due to dataset downloading
eval_wct_avg = None
num_evals_finished = len(eval_wcts)
if num_evals_finished > 1:
eval_wct_avg = sum(eval_wcts[1:]) / (num_evals_finished - 1)
else:
eval_wct_avg = sum(eval_wcts) / num_evals_finished
eval_rate = self.eval_frequency_per_label[dataloader_label]
num_total_evals = 1 / eval_rate
remaining_calls = num_total_evals - num_evals_finished
remaining_time += eval_wct_avg * remaining_calls

logger.log_metrics({'wall_clock/remaining_estimate': remaining_time})

def eval_end(self, state: State, logger: Logger) -> None:
# If eval is called before training starts, ignore it
if not self._enabled or self.start_time is None:
return
self.total_eval_wct += state.eval_timestamp.total_wct.total_seconds()
# state.dataloader_label should always be non-None unless user explicitly sets evaluator
# label to None, ignoring type hints
assert state.dataloader_label is not None, 'evaluator label must not be None'
if state.dataloader_label not in self.eval_wct_per_label:
self.eval_wct_per_label[state.dataloader_label] = []
self.eval_wct_per_label[state.dataloader_label].append(state.eval_timestamp.total_wct.total_seconds())
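        # Infer how often this evaluator runs as a fraction of training, e.g. 2 completed evals
        # at an elapsed fraction of 0.4 imply one eval per 0.2 of training.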
elapsed_fraction = self.get_elapsed_duration(state)
assert elapsed_fraction is not None, 'max_duration checked as non-None on batch_start'
num_evals_finished = len(self.eval_wct_per_label[state.dataloader_label])
self.eval_frequency_per_label[state.dataloader_label] = elapsed_fraction / num_evals_finished
4 changes: 2 additions & 2 deletions composer/trainer/trainer.py
@@ -2022,8 +2022,6 @@ def _train_loop(self) -> None:
# This happens if the "break" did not trigger above, or if it
# did (e.g. duration specified in samples/batches/tokens), but it is still
# the end of the dataloader (i.e. next(dataloader) would raise StopIteration)
-                self.state.timestamp = self.state.timestamp.to_next_epoch()

if self.state.train_metrics is not None:
self._compute_and_log_metrics(
dataloader_label='train',
@@ -2034,6 +2032,8 @@
for scheduler in self.state.schedulers:
scheduler.step()

+                self.state.timestamp = self.state.timestamp.to_next_epoch()

self.engine.run_event(Event.EPOCH_END)

# Pause the timing during evaluation
8 changes: 7 additions & 1 deletion pyproject.toml
@@ -65,7 +65,7 @@ reportUnusedCoroutine = "error"
addopts = "--codeblocks --strict-markers -m 'not gpu and not vision and not doctest and not daily and not remote'"

markers = [
-    # !!!!!!!!!!!IMPORTANT!!!!!!!!!: when updating the markers, also make sure to update .ci/Jenkinsfile and meta.yaml
+    # !!!!!!!!!!!IMPORTANT!!!!!!!!!: when updating the markers, also make sure to update meta.yaml
# Tests that require a world_size of two should be annotated with `@pytest.mark.world_size(2)`.
# If not specified, the test will be assumed to have a world-size of one, which is
# equivalent to `@pytest.mark.world_size(1)`
@@ -118,6 +118,12 @@ filterwarnings = [
'ignore:Torchmetrics v0.9 introduced a new argument class property:UserWarning',
'ignore:torch.distributed._all_gather_base is a private function and will be deprecated:UserWarning',
'ignore:torch.distributed._reduce_scatter_base is a private function and will be deprecated:UserWarning',
+    # Ignore tensorboard deprecation warnings
+    'ignore:Call to deprecated create function Descriptor().*:DeprecationWarning:tensorboard',
+    'ignore:Call to deprecated create function EnumDescriptor().*:DeprecationWarning:tensorboard',
+    'ignore:Call to deprecated create function EnumValueDescriptor().*:DeprecationWarning:tensorboard',
+    'ignore:Call to deprecated create function FieldDescriptor().*:DeprecationWarning:tensorboard',
+    'ignore:Call to deprecated create function FileDescriptor().*:DeprecationWarning:tensorboard',
]

# Coverage
9 changes: 3 additions & 6 deletions tests/algorithms/algorithm_settings.py
@@ -114,13 +114,10 @@
FusedLayerNorm: simple_bert_settings,
GatedLinearUnits: simple_bert_settings,
GhostBatchNorm: {
-        'model': (composer_resnet, {
-            'model_name': 'resnet18',
-            'num_classes': 2
-        }),
-        'dataset': (RandomImageDataset, {
-            'shape': (3, 224, 224)
+        'model': (SimpleConvModel, {
+            'norm': 'group',
         }),
+        'dataset': RandomImageDataset,
'kwargs': {
'ghost_batch_size': 2,
}
1 change: 1 addition & 0 deletions tests/algorithms/test_torch_export.py
@@ -139,6 +139,7 @@ def test_surgery_torchfx_eval(
@pytest.mark.parametrize('alg_cls', torchscript_algs_with_marks)
@pytest.mark.filterwarnings(
r'ignore:Converting a tensor to a Python .* might cause the trace to be incorrect:torch.jit._trace.TracerWarning')
+@pytest.mark.filterwarnings('ignore:__floordiv__ is deprecated')
def test_surgery_onnx(
input: Any,
alg_cls: Type[Algorithm],
Expand Down
18 changes: 16 additions & 2 deletions tests/common/models.py
@@ -4,7 +4,7 @@
"""Contains commonly used models that are shared across the test suite."""
import copy
from functools import partial
-from typing import Any, Dict, Tuple, Union
+from typing import Any, Dict, Optional, Tuple, Union

import pytest
import torch
@@ -146,14 +146,27 @@ class SimpleConvModel(ComposerClassifier):
num_classes (int): number of classes (default: 2)
"""

-    def __init__(self, num_channels: int = 3, num_classes: int = 2) -> None:
+    def __init__(self, num_channels: int = 3, num_classes: int = 2, norm: Optional[str] = None) -> None:

self.num_classes = num_classes
self.num_channels = num_channels

conv_args = {'kernel_size': (3, 3), 'padding': 1, 'stride': 2}
conv1 = torch.nn.Conv2d(in_channels=num_channels, out_channels=8, **conv_args)
conv2 = torch.nn.Conv2d(in_channels=8, out_channels=4, **conv_args)
+        norm_layer = None
+        if norm is None:
+            norm_layer = torch.nn.Identity()
+        elif norm == 'batch':
+            norm_layer = torch.nn.BatchNorm2d(4)
+        elif norm == 'instance':
+            norm_layer = torch.nn.InstanceNorm2d(4)
+        elif norm == 'layer':
+            norm_layer = torch.nn.LayerNorm(4)
+        elif norm == 'group':
+            norm_layer = torch.nn.GroupNorm(2, 4)
+        else:
+            raise ValueError(f'Unknown norm: {norm}')
pool = torch.nn.AdaptiveAvgPool2d(1)
flatten = torch.nn.Flatten()
fc1 = torch.nn.Linear(4, 16)
@@ -162,6 +175,7 @@ def __init__(self, num_channels: int = 3, num_classes: int = 2) -> None:
net = torch.nn.Sequential(
conv1,
conv2,
+            norm_layer,
pool,
flatten,
fc1,