From 37a232396ca76610f3b5bee4eb67fc45da412278 Mon Sep 17 00:00:00 2001 From: zhouzaida Date: Fri, 19 Aug 2022 17:36:01 +0800 Subject: [PATCH 1/3] Remove runner, parallel, engine and device --- MANIFEST.in | 1 - mmcv/__init__.py | 3 - mmcv/cnn/alexnet.py | 2 +- mmcv/cnn/resnet.py | 2 +- mmcv/cnn/vgg.py | 2 +- mmcv/device/__init__.py | 6 - mmcv/device/_functions.py | 30 - mmcv/device/ipu/__init__.py | 14 - mmcv/device/ipu/dataloader.py | 157 -- mmcv/device/ipu/hierarchical_data_manager.py | 243 --- mmcv/device/ipu/hook_wrapper.py | 105 - mmcv/device/ipu/model_wrapper.py | 721 ------ mmcv/device/ipu/runner.py | 142 -- mmcv/device/ipu/utils.py | 244 --- mmcv/device/mlu/__init__.py | 5 - mmcv/device/mlu/_functions.py | 24 - mmcv/device/mlu/data_parallel.py | 41 - mmcv/device/mlu/distributed.py | 20 - mmcv/device/mlu/scatter_gather.py | 59 - mmcv/device/mps/__init__.py | 4 - mmcv/device/mps/data_parallel.py | 34 - mmcv/device/scatter_gather.py | 64 - mmcv/device/utils.py | 18 - mmcv/engine/__init__.py | 8 - mmcv/engine/test.py | 214 -- mmcv/model_zoo/deprecated.json | 6 - mmcv/model_zoo/mmcls.json | 59 - mmcv/model_zoo/open_mmlab.json | 50 - mmcv/model_zoo/torchvision_0.12.json | 57 - mmcv/ops/points_sampler.py | 7 +- mmcv/parallel/__init__.py | 13 - mmcv/parallel/_functions.py | 82 - mmcv/parallel/collate.py | 84 - mmcv/parallel/data_container.py | 91 - mmcv/parallel/data_parallel.py | 99 - mmcv/parallel/distributed.py | 167 -- mmcv/parallel/distributed_deprecated.py | 74 - mmcv/parallel/registry.py | 8 - mmcv/parallel/scatter_gather.py | 70 - mmcv/parallel/utils.py | 32 - mmcv/runner/__init__.py | 72 - mmcv/runner/base_runner.py | 566 ----- mmcv/runner/builder.py | 25 - mmcv/runner/checkpoint.py | 811 ------- mmcv/runner/default_constructor.py | 47 - mmcv/runner/dist_utils.py | 211 -- mmcv/runner/epoch_based_runner.py | 197 -- mmcv/runner/fp16_utils.py | 435 ---- mmcv/runner/hooks/__init__.py | 48 - mmcv/runner/hooks/checkpoint.py | 169 -- mmcv/runner/hooks/closure.py | 13 - mmcv/runner/hooks/ema.py | 91 - mmcv/runner/hooks/evaluation.py | 515 ----- mmcv/runner/hooks/hook.py | 92 - mmcv/runner/hooks/iter_timer.py | 18 - mmcv/runner/hooks/logger/__init__.py | 18 - mmcv/runner/hooks/logger/base.py | 172 -- mmcv/runner/hooks/logger/clearml.py | 63 - mmcv/runner/hooks/logger/dvclive.py | 69 - mmcv/runner/hooks/logger/mlflow.py | 81 - mmcv/runner/hooks/logger/neptune.py | 89 - mmcv/runner/hooks/logger/pavi.py | 150 -- mmcv/runner/hooks/logger/segmind.py | 48 - mmcv/runner/hooks/logger/tensorboard.py | 69 - mmcv/runner/hooks/logger/text.py | 256 --- mmcv/runner/hooks/logger/wandb.py | 107 - mmcv/runner/hooks/lr_updater.py | 754 ------- mmcv/runner/hooks/memory.py | 28 - mmcv/runner/hooks/momentum_updater.py | 594 ----- mmcv/runner/hooks/optimizer.py | 563 ----- mmcv/runner/hooks/profiler.py | 190 -- mmcv/runner/hooks/sampler_seed.py | 20 - mmcv/runner/hooks/sync_buffer.py | 22 - mmcv/runner/iter_based_runner.py | 285 --- mmcv/runner/log_buffer.py | 41 - mmcv/runner/optimizer/__init__.py | 9 - mmcv/runner/optimizer/builder.py | 45 - mmcv/runner/optimizer/default_constructor.py | 258 --- mmcv/runner/priority.py | 61 - mmcv/runner/utils.py | 99 - mmcv/utils/__init__.py | 3 +- mmcv/utils/hub.py | 131 -- tests/data/model_zoo/deprecated.json | 4 - .../data/model_zoo/mmcv_home/open_mmlab.json | 5 - tests/data/model_zoo/mmcv_home/test.pth | Bin 341 -> 0 bytes tests/data/model_zoo/mmcv_home/val.pth | Bin 341 -> 0 bytes tests/data/model_zoo/open_mmlab.json | 4 - tests/data/model_zoo/torchvision_0.12.json | 57 
- tests/test_device/test_device_utils.py | 15 - tests/test_device/test_functions.py | 90 - .../test_ipu/test_hierarchicaldatamanager.py | 106 - .../test_ipu/test_ipu_dataloder.py | 69 - tests/test_device/test_ipu/test_ipu_hooks.py | 130 -- tests/test_device/test_ipu/test_ipu_model.py | 301 --- tests/test_device/test_ipu/test_ipu_runner.py | 126 -- tests/test_device/test_ipu/test_ipu_utils.py | 194 -- .../test_device/test_mlu/test_mlu_parallel.py | 37 - .../test_device/test_mps/test_mps_parallel.py | 34 - tests/test_load_model_zoo.py | 157 -- tests/test_parallel.py | 188 -- tests/test_runner/test_checkpoint.py | 452 ---- tests/test_runner/test_dist_utils.py | 53 - tests/test_runner/test_eval_hook.py | 483 ----- tests/test_runner/test_fp16.py | 317 --- tests/test_runner/test_hooks.py | 1923 ----------------- tests/test_runner/test_optimizer.py | 640 ------ tests/test_runner/test_runner.py | 289 --- tests/test_runner/test_utils.py | 39 - tests/test_utils/test_hub.py | 36 - 109 files changed, 9 insertions(+), 16337 deletions(-) delete mode 100644 mmcv/device/__init__.py delete mode 100644 mmcv/device/_functions.py delete mode 100755 mmcv/device/ipu/__init__.py delete mode 100755 mmcv/device/ipu/dataloader.py delete mode 100755 mmcv/device/ipu/hierarchical_data_manager.py delete mode 100755 mmcv/device/ipu/hook_wrapper.py delete mode 100755 mmcv/device/ipu/model_wrapper.py delete mode 100755 mmcv/device/ipu/runner.py delete mode 100755 mmcv/device/ipu/utils.py delete mode 100644 mmcv/device/mlu/__init__.py delete mode 100644 mmcv/device/mlu/_functions.py delete mode 100644 mmcv/device/mlu/data_parallel.py delete mode 100644 mmcv/device/mlu/distributed.py delete mode 100644 mmcv/device/mlu/scatter_gather.py delete mode 100644 mmcv/device/mps/__init__.py delete mode 100644 mmcv/device/mps/data_parallel.py delete mode 100644 mmcv/device/scatter_gather.py delete mode 100644 mmcv/device/utils.py delete mode 100644 mmcv/engine/__init__.py delete mode 100644 mmcv/engine/test.py delete mode 100644 mmcv/model_zoo/deprecated.json delete mode 100644 mmcv/model_zoo/mmcls.json delete mode 100644 mmcv/model_zoo/open_mmlab.json delete mode 100644 mmcv/model_zoo/torchvision_0.12.json delete mode 100644 mmcv/parallel/__init__.py delete mode 100644 mmcv/parallel/_functions.py delete mode 100644 mmcv/parallel/collate.py delete mode 100644 mmcv/parallel/data_container.py delete mode 100644 mmcv/parallel/data_parallel.py delete mode 100644 mmcv/parallel/distributed.py delete mode 100644 mmcv/parallel/distributed_deprecated.py delete mode 100644 mmcv/parallel/registry.py delete mode 100644 mmcv/parallel/scatter_gather.py delete mode 100644 mmcv/parallel/utils.py delete mode 100644 mmcv/runner/__init__.py delete mode 100644 mmcv/runner/base_runner.py delete mode 100644 mmcv/runner/builder.py delete mode 100644 mmcv/runner/checkpoint.py delete mode 100644 mmcv/runner/default_constructor.py delete mode 100644 mmcv/runner/dist_utils.py delete mode 100644 mmcv/runner/epoch_based_runner.py delete mode 100644 mmcv/runner/fp16_utils.py delete mode 100644 mmcv/runner/hooks/__init__.py delete mode 100644 mmcv/runner/hooks/checkpoint.py delete mode 100644 mmcv/runner/hooks/closure.py delete mode 100644 mmcv/runner/hooks/ema.py delete mode 100644 mmcv/runner/hooks/evaluation.py delete mode 100644 mmcv/runner/hooks/hook.py delete mode 100644 mmcv/runner/hooks/iter_timer.py delete mode 100644 mmcv/runner/hooks/logger/__init__.py delete mode 100644 mmcv/runner/hooks/logger/base.py delete mode 100644 
mmcv/runner/hooks/logger/clearml.py delete mode 100644 mmcv/runner/hooks/logger/dvclive.py delete mode 100644 mmcv/runner/hooks/logger/mlflow.py delete mode 100644 mmcv/runner/hooks/logger/neptune.py delete mode 100644 mmcv/runner/hooks/logger/pavi.py delete mode 100644 mmcv/runner/hooks/logger/segmind.py delete mode 100644 mmcv/runner/hooks/logger/tensorboard.py delete mode 100644 mmcv/runner/hooks/logger/text.py delete mode 100644 mmcv/runner/hooks/logger/wandb.py delete mode 100644 mmcv/runner/hooks/lr_updater.py delete mode 100644 mmcv/runner/hooks/memory.py delete mode 100644 mmcv/runner/hooks/momentum_updater.py delete mode 100644 mmcv/runner/hooks/optimizer.py delete mode 100644 mmcv/runner/hooks/profiler.py delete mode 100644 mmcv/runner/hooks/sampler_seed.py delete mode 100644 mmcv/runner/hooks/sync_buffer.py delete mode 100644 mmcv/runner/iter_based_runner.py delete mode 100644 mmcv/runner/log_buffer.py delete mode 100644 mmcv/runner/optimizer/__init__.py delete mode 100644 mmcv/runner/optimizer/builder.py delete mode 100644 mmcv/runner/optimizer/default_constructor.py delete mode 100644 mmcv/runner/priority.py delete mode 100644 mmcv/runner/utils.py delete mode 100644 mmcv/utils/hub.py delete mode 100644 tests/data/model_zoo/deprecated.json delete mode 100644 tests/data/model_zoo/mmcv_home/open_mmlab.json delete mode 100644 tests/data/model_zoo/mmcv_home/test.pth delete mode 100644 tests/data/model_zoo/mmcv_home/val.pth delete mode 100644 tests/data/model_zoo/open_mmlab.json delete mode 100644 tests/data/model_zoo/torchvision_0.12.json delete mode 100644 tests/test_device/test_device_utils.py delete mode 100644 tests/test_device/test_functions.py delete mode 100755 tests/test_device/test_ipu/test_hierarchicaldatamanager.py delete mode 100755 tests/test_device/test_ipu/test_ipu_dataloder.py delete mode 100755 tests/test_device/test_ipu/test_ipu_hooks.py delete mode 100755 tests/test_device/test_ipu/test_ipu_model.py delete mode 100755 tests/test_device/test_ipu/test_ipu_runner.py delete mode 100755 tests/test_device/test_ipu/test_ipu_utils.py delete mode 100644 tests/test_device/test_mlu/test_mlu_parallel.py delete mode 100644 tests/test_device/test_mps/test_mps_parallel.py delete mode 100644 tests/test_load_model_zoo.py delete mode 100644 tests/test_parallel.py delete mode 100644 tests/test_runner/test_checkpoint.py delete mode 100644 tests/test_runner/test_dist_utils.py delete mode 100644 tests/test_runner/test_eval_hook.py delete mode 100644 tests/test_runner/test_fp16.py delete mode 100644 tests/test_runner/test_hooks.py delete mode 100644 tests/test_runner/test_optimizer.py delete mode 100644 tests/test_runner/test_runner.py delete mode 100644 tests/test_runner/test_utils.py delete mode 100644 tests/test_utils/test_hub.py diff --git a/MANIFEST.in b/MANIFEST.in index 5de8494b5d..622635caa1 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -1,5 +1,4 @@ include requirements/runtime.txt -include mmcv/model_zoo/open_mmlab.json mmcv/model_zoo/deprecated.json mmcv/model_zoo/mmcls.json mmcv/model_zoo/torchvision_0.12.json include mmcv/ops/csrc/common/cuda/*.cuh mmcv/ops/csrc/common/cuda/*.hpp mmcv/ops/csrc/common/*.hpp include mmcv/ops/csrc/pytorch/*.cpp mmcv/ops/csrc/pytorch/cuda/*.cu mmcv/ops/csrc/pytorch/cuda/*.cpp mmcv/ops/csrc/pytorch/cpu/*.cpp include mmcv/ops/csrc/parrots/*.h mmcv/ops/csrc/parrots/*.cpp diff --git a/mmcv/__init__.py b/mmcv/__init__.py index 57ac414727..36bfa336d5 100644 --- a/mmcv/__init__.py +++ b/mmcv/__init__.py @@ -10,7 +10,4 @@ # The following modules are 
not imported to this level, so mmcv may be used # without PyTorch. -# - runner -# - parallel # - op -# - device diff --git a/mmcv/cnn/alexnet.py b/mmcv/cnn/alexnet.py index 4d45d96d86..dd6f9d4d02 100644 --- a/mmcv/cnn/alexnet.py +++ b/mmcv/cnn/alexnet.py @@ -2,6 +2,7 @@ import logging from typing import Optional +from mmengine.runner import load_checkpoint import torch import torch.nn as nn @@ -45,7 +46,6 @@ def __init__(self, num_classes: int = -1): def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() - from ..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: # use default initializer diff --git a/mmcv/cnn/resnet.py b/mmcv/cnn/resnet.py index 17024e446e..f469b2efd6 100644 --- a/mmcv/cnn/resnet.py +++ b/mmcv/cnn/resnet.py @@ -6,6 +6,7 @@ import torch.utils.checkpoint as cp from mmengine.model.utils import constant_init, kaiming_init from torch import Tensor +from mmengine.runner import load_checkpoint def conv3x3(in_planes: int, @@ -270,7 +271,6 @@ def __init__(self, def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() - from ..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): diff --git a/mmcv/cnn/vgg.py b/mmcv/cnn/vgg.py index 09548b63d5..29618117ba 100644 --- a/mmcv/cnn/vgg.py +++ b/mmcv/cnn/vgg.py @@ -5,6 +5,7 @@ import torch.nn as nn from mmengine.model.utils import constant_init, kaiming_init, normal_init from torch import Tensor +from mmengine.runner import load_checkpoint def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module: @@ -126,7 +127,6 @@ def __init__(self, def init_weights(self, pretrained: Optional[str] = None) -> None: if isinstance(pretrained, str): logger = logging.getLogger() - from ..runner import load_checkpoint load_checkpoint(self, pretrained, strict=False, logger=logger) elif pretrained is None: for m in self.modules(): diff --git a/mmcv/device/__init__.py b/mmcv/device/__init__.py deleted file mode 100644 index ba217b0771..0000000000 --- a/mmcv/device/__init__.py +++ /dev/null @@ -1,6 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from . import ipu, mlu, mps -from .scatter_gather import scatter, scatter_kwargs -from .utils import get_device - -__all__ = ['mlu', 'ipu', 'mps', 'get_device', 'scatter', 'scatter_kwargs'] diff --git a/mmcv/device/_functions.py b/mmcv/device/_functions.py deleted file mode 100644 index 462a7e4ddc..0000000000 --- a/mmcv/device/_functions.py +++ /dev/null @@ -1,30 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from typing import List, Union
-
-import torch
-
-from mmcv.utils import deprecated_api_warning
-from .utils import get_device
-
-
-def scatter(input: Union[List, torch.Tensor], devices: List) -> List:
-    """scatter copies tensor to devices directly."""
-    current_device = get_device()
-    if isinstance(input, list):
-        outputs = [scatter(_input, devices) for _input in input]
-        return outputs
-    elif isinstance(input, torch.Tensor):
-        output = input.contiguous()
-        return output.to(current_device) if devices != [-1] else output
-    else:
-        raise Exception(f'Unknown type {type(input)}.')
-
-
-class Scatter:
-
-    @staticmethod
-    @deprecated_api_warning({'target_mlus': 'target_devices'},
-                            cls_name='Scatter')
-    def forward(target_devices, input):
-        outputs = scatter(input, target_devices)
-        return tuple(outputs) if isinstance(outputs, list) else (outputs, )
diff --git a/mmcv/device/ipu/__init__.py b/mmcv/device/ipu/__init__.py
deleted file mode 100755
index d550865ad2..0000000000
--- a/mmcv/device/ipu/__init__.py
+++ /dev/null
@@ -1,14 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from mmcv.utils import IS_IPU_AVAILABLE
-
-if IS_IPU_AVAILABLE:
-    from .dataloader import IPUDataLoader
-    from .hook_wrapper import IPUFp16OptimizerHook
-    from .model_wrapper import ipu_model_wrapper
-    from .runner import IPUBaseRunner, IPUEpochBasedRunner, IPUIterBasedRunner
-    from .utils import cfg2options
-    __all__ = [
-        'cfg2options', 'ipu_model_wrapper', 'IPUFp16OptimizerHook',
-        'IPUDataLoader', 'IPUBaseRunner', 'IPUEpochBasedRunner',
-        'IPUIterBasedRunner'
-    ]
diff --git a/mmcv/device/ipu/dataloader.py b/mmcv/device/ipu/dataloader.py
deleted file mode 100755
index 1485df2f31..0000000000
--- a/mmcv/device/ipu/dataloader.py
+++ /dev/null
@@ -1,157 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from collections.abc import Mapping, Sequence
-from functools import partial
-
-import poptorch
-from torch.utils.data.dataloader import default_collate
-
-from mmcv.parallel import DataContainer
-
-
-def collate(batch, samples_per_gpu=1):
-    """Put each data field into a tensor/DataContainer with outer dimension
-    batch size.
-
-    TODO support for
-    :type:`~mmcv.parallel.DataContainer`. Currently, it will be ignored.
-    There are 3 cases.
-
-    1. cpu_only = True, e.g., meta data.
-    2. cpu_only = False, stack = True, e.g., image tensors.
-    3. cpu_only = False, stack = False, e.g., gt bboxes.
-    """
-
-    if not isinstance(batch, Sequence):
-        raise TypeError(
-            f'`batch` should be a sequence, but got {type(batch)}.')
-
-    if isinstance(batch[0], DataContainer):
-        # TODO `DataContainer` will be supported in the future.
-        raise TypeError('DataContainer is not supported in ipu data loader.')
-    elif isinstance(batch[0], Sequence):
-        transposed = zip(*batch)
-        collated_batch = []
-        for samples in transposed:
-            if not isinstance(samples[0], DataContainer):
-                # At present, we will skip the processing of datacontainer,
-                # which will reduce the performance of IPU DataLoader
-                collated_batch.append(collate(samples, samples_per_gpu))
-        return collated_batch
-    elif isinstance(batch[0], Mapping):
-        collated_batch = {}
-        for key in batch[0]:
-            if not isinstance(batch[0][key], DataContainer):
-                # At present, we will skip the processing of datacontainer,
-                # which will reduce the performance of IPU DataLoader
-                collated_batch[key] = collate([d[key] for d in batch])
-        return collated_batch
-    else:
-        return default_collate(batch)
-
-
-class IPUDataLoader(poptorch.DataLoader):
-    """Thin wrapper of `torch.utils.data.DataLoader`.
-
-    Compared with the pytorch DataLoader, this DataLoader changes the way of
-    calculation of batch size and adds the AsynchronousDataAccessor to
-    load and release data faster in cpu mode.
-
-    If this data loader is used in a distributed execution environment, it will
-    ensure that each process uses a different subset of the dataset, providing
-    you first call ``options.randomSeed(N)`` with an integer N which is the
-    same across all hosts.
-
-    Args:
-        dataset (torch.utils.data.Dataset): The dataset to get the data from.
-        options (poptorch.Options): Options that will be used to compile
-            and run the model.
-        batch_size (int, optional): This is the batch size in the conventional
-            sense of being the size that runs through an operation in the model
-            at any given time.
-        shuffle (bool, optional): set to ``True`` to have the data reshuffled
-            at every epoch (default: ``False``).
-        num_workers (int, optional): how many subprocesses to use for data
-            loading. ``0`` means that the data will be loaded in the main
-            process. (default: ``0``)
-        drop_last (bool, optional): If True and the number of elements in the
-            dataset is not a multiple of the combined batch size then the
-            incomplete batch at the end will be dropped.
-        persistent_workers (bool, optional): Re-use workers between
-            iterations if True.
-        auto_distributed_partitioning (bool, optional): If True, partitions the
-            dataset for distributed execution automatically. Otherwise, it is
-            assumed that partitioning has been handled manually.
-        mode (poptorch.DataLoaderMode, optional): If `DataLoaderMode.Async`,
-            uses an :py:class:`~poptorch.AsynchronousDataAccessor` to access
-            the dataset. If `DataLoaderMode.Sync`, accesses the dataset
-            synchronously.
-        async_options (Dict[str, Any], optional): Options to pass to
-            :py:class:`~poptorch.AsynchronousDataAccessor`.
-        rebatched_worker_size (int, optional): When using AsyncRebatched: batch
-            size of the tensors loaded by the workers.
-            Defaults to the combined batch size.
-            If specified, the ``rebatched_worker_size`` must be less than
-            or equal to the combined batch size.
-        kwargs (Dict[str, Any], optional): Other options to pass to PyTorch's
-            ``DataLoader`` constructor.
-    """
-
-    def __init__(self,
-                 dataset,
-                 options,
-                 batch_size=1,
-                 shuffle=False,
-                 num_workers=0,
-                 drop_last=True,
-                 persistent_workers=True,
-                 auto_distributed_partitioning=True,
-                 mode='sync',
-                 async_options=None,
-                 rebatched_worker_size=None,
-                 **kwargs):
-        """Lazy init:
-
-        In many frameworks, the dataloader will be constructed before the
-        initialization of the ipu options, so the lazy init method is used
-        here, and the real initialization will not be done until the dataloader
-        needs to be used and the options are input.
-        """
-        # lazy init: sometimes, we cannot get IPU options when building
-        # the data loader
-        self.kwargs = {
-            'dataset': dataset,
-            'batch_size': batch_size,
-            'shuffle': shuffle,
-            'num_workers': num_workers,
-            'drop_last': drop_last,
-            'persistent_workers': persistent_workers,
-            'auto_distributed_partitioning': auto_distributed_partitioning,
-            'mode': mode,
-            'collate_fn': partial(collate, samples_per_gpu=batch_size),
-            'async_options': async_options,
-            'rebatched_worker_size': rebatched_worker_size,
-            **kwargs
-        }
-        self.dataset = dataset
-        self.initialized = False
-        if options:
-            self.init(options=options)
-
-    def init(self, options, **kwargs):
-        if not self.initialized:
-            kwargs = {**self.kwargs, **kwargs, 'options': options}
-            if kwargs['mode'] == 'sync':
-                kwargs['mode'] = poptorch.DataLoaderMode.Sync
-            elif kwargs['mode'] == 'async':
-                kwargs['mode'] = poptorch.DataLoaderMode.AsyncRebatched
-                if kwargs['async_options'] is None:
-                    kwargs['async_options'] = {
-                        'load_indefinitely': True,
-                        'buffer_size': 8
-                    }
-                if kwargs['rebatched_worker_size'] is None:
-                    kwargs['rebatched_worker_size'] = 128
-            super().__init__(**kwargs)
-            self.initialized = True
-
-        return self
diff --git a/mmcv/device/ipu/hierarchical_data_manager.py b/mmcv/device/ipu/hierarchical_data_manager.py
deleted file mode 100755
index a6f3b3cd2a..0000000000
--- a/mmcv/device/ipu/hierarchical_data_manager.py
+++ /dev/null
@@ -1,243 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import warnings
-
-import numpy as np
-import torch
-
-from mmcv.parallel import DataContainer
-
-# A customized None type for HierarchicalDataManager
-HierarchicalDataNone = object()
-
-
-class HierarchicalDataManager:
-    """A class managing all the tensors in the hierarchical data.
-
-    At present, the input data structure accepted by IPU is limited,
-    while the input data structure of mmcv varies.
-    Here, an intermediate class is needed to get and update tensors
-    from the original data.
-
-    HierarchicalDataManager will record hierarchical input/output data in
-    self._hierarchical_data. For example, we have an input data:
-    {'img': tensorA, 'label': tensorB, 'img_metas': [tensorC, tensorD]}
-    To enable IPU to use the input, HierarchicalDataManager will collect
-    the torch tensors from self._hierarchical_data into a tuple like:
-    (tensorA, tensorB, tensorC, tensorD).
-    Meanwhile, the return of IPU is a tuple of tensors, HierarchicalDataManager
-    also has a function named update_all_tensors to update tensors in
-    self._hierarchical_data which is the output for upper calls.
-
-    Args:
-        logger (:obj:`logging.Logger`): Logger used during running.
-            Defaults to None.
-    """
-
-    def __init__(self, logger=None):
-        self.atomic_types = (int, str, float, np.ndarray, type(None))
-        self.warning = warnings.warn if logger is None else logger.warning
-        # enable or disable input data's shape and value check
-        self.quick_mode = False
-        self._hierarchical_data = None
-
-    def quick(self):
-        self.quick_mode = True
-
-    def compare_atomic_type(self, a, b):
-        """Compare data, supported datatypes are numpy array and python basic
-        types."""
-        if isinstance(a, np.ndarray):
-            return np.all(a == b)
-        else:
-            return a == b
-
-    def record_hierarchical_data(self, data):
-        """Record hierarchical data."""
-        if self._hierarchical_data is not None:
-            if isinstance(data, torch.Tensor):
-                assert isinstance(self._hierarchical_data, torch.Tensor), \
-                    'original hierarchical data is not torch.tensor'
-                self._hierarchical_data = data
-            else:
-                self.update_hierarchical_data(data)
-        else:
-            self._hierarchical_data = data
-
-    @property
-    def hierarchical_data(self):
-        return self._hierarchical_data
-
-    def update_hierarchical_data(self,
-                                 dataA,
-                                 dataB=HierarchicalDataNone,
-                                 strict=True,
-                                 address='data'):
-        """Update dataB with dataA in-place.
-
-        Args:
-            dataA (list or dict or tuple): New hierarchical data.
-            dataB (list or dict or tuple): hierarchical data to update.
-                If not specified, self.hierarchical_data will be updated.
-            strict (bool, optional): If true, an error will be reported
-                when the following conditions occur:
-                1. Non-torch.Tensor data changed.
-                2. Torch.Tensor data shape changed.
-            address (str): Record the address of current data to be updated.
-                Default: 'data'.
-        """
-        if dataB is HierarchicalDataNone:
-            dataB = self.hierarchical_data
-
-        # Update with data of the same structure
-        # but different values (tensors and basic python data types)
-        if isinstance(dataA, (tuple, list)):
-            for idx, node in enumerate(dataA):
-                new_address = ''
-                if not self.quick_mode:
-                    new_address = address + f'[{str(idx)}]'
-                    assert isinstance(node, type(dataB[idx])),\
-                        f'data structure changed: {new_address}'
-                if isinstance(node, torch.Tensor):
-                    dataB[idx] = node
-                else:
-                    self.update_hierarchical_data(
-                        node, dataB[idx], strict, address=new_address)
-        elif isinstance(dataA, dict):
-            for k, v in dataA.items():
-                new_address = ''
-                if not self.quick_mode:
-                    new_address = address + f'[{str(k)}]'
-                    assert isinstance(v, type(dataB[k])),\
-                        f'data structure changed: {new_address}'
-                if isinstance(v, torch.Tensor):
-                    dataB[k] = v
-                else:
-                    self.update_hierarchical_data(
-                        v, dataB[k], strict, address=new_address)
-        elif isinstance(dataA, self.atomic_types):
-            if not self.quick_mode:
-                is_equal = self.compare_atomic_type(dataA, dataB)
-                if not is_equal:
-                    if strict:
-                        raise ValueError(
-                            'all data except torch.Tensor should be same, '
-                            f'but data({address}) is changed.')
-                    else:
-                        self.warning(
-                            f'find a non-torch.Tensor data({type(dataA)}) '
-                            f'changed, and the address is {address}')
-        elif isinstance(dataA, DataContainer):
-            if not self.quick_mode:
-                assert isinstance(dataB, DataContainer)
-            new_address = address + '.data'
-            self.update_hierarchical_data(
-                dataA.data, dataB.data, False, address=new_address)
-        else:
-            raise NotImplementedError(
-                f'not supported datatype:{type(dataA)}, address is {address}')
-
-    def collect_all_tensors(self, hierarchical_data=None):
-        """Collect torch.Tensor data from self.hierarchical_data to a list and
-        return."""
-        # get a list of tensor from self._hierarchical_data
-        if hierarchical_data is None:
-            hierarchical_data =
self._hierarchical_data - tensors = [] - if isinstance(hierarchical_data, torch.Tensor): - tensors = [hierarchical_data] - else: - self._collect_tensors(hierarchical_data, tensors) - return tensors - - def _collect_tensors(self, data, tensors): - if isinstance(data, (tuple, list)): - for node in data: - if isinstance(node, torch.Tensor): - tensors.append(node) - else: - self._collect_tensors(node, tensors) - elif isinstance(data, dict): - for v in data.values(): - if isinstance(v, torch.Tensor): - tensors.append(v) - else: - self._collect_tensors(v, tensors) - elif isinstance(data, self.atomic_types): - pass - elif isinstance(data, DataContainer): - self._collect_tensors(data.data, tensors) - else: - raise NotImplementedError(f'not supported datatype:{type(data)}') - - def update_all_tensors(self, tensors): - """Put tensors from tuple back to self.hierarchical_data.""" - if isinstance(self._hierarchical_data, torch.Tensor): - print(tensors, len(tensors)) - assert len(tensors) == 1 - assert isinstance(tensors[0], torch.Tensor) - self._hierarchical_data = tensors[0] - else: - # convert to list if tensors is tuple - tensors = list(tensors) - self._set_tensors(self._hierarchical_data, tensors) - return self.hierarchical_data - - def _set_tensors(self, data, tensors): - if isinstance(data, tuple): - data = list(data) - for idx in range(len(data)): - if isinstance(data[idx], torch.Tensor): - data[idx] = tensors.pop(0) - else: - self._set_tensors(data[idx], tensors) - data = tuple(data) - elif isinstance(data, list): - for idx in range(len(data)): - if isinstance(data[idx], torch.Tensor): - data[idx] = tensors.pop(0) - else: - self._set_tensors(data[idx], tensors) - elif isinstance(data, dict): - for k, v in data.items(): - if isinstance(v, torch.Tensor): - data[k] = tensors.pop(0) - else: - self._set_tensors(v, tensors) - elif isinstance(data, self.atomic_types): - pass - elif isinstance(data, DataContainer): - self._set_tensors(data.data, tensors) - else: - raise NotImplementedError(f'not supported datatype:{type(data)}') - - def clean_all_tensors(self): - """Delete tensors from self.hierarchical_data.""" - self._clean_tensors(self._hierarchical_data) - - def _clean_tensors(self, data): - if isinstance(data, tuple): - data = list(data) - for idx in range(len(data)): - if isinstance(data[idx], torch.Tensor): - data[idx] = None - else: - self._clean_tensors(data[idx]) - data = tuple(data) - elif isinstance(data, list): - for idx in range(len(data)): - if isinstance(data[idx], torch.Tensor): - data[idx] = None - else: - self._clean_tensors(data[idx]) - elif isinstance(data, dict): - for k, v in data.items(): - if isinstance(v, torch.Tensor): - data[k] = None - else: - self._clean_tensors(v) - elif isinstance(data, self.atomic_types): - pass - elif isinstance(data, DataContainer): - self._clean_tensors(data.data) - else: - raise NotImplementedError(f'not supported datatype:{type(data)}') diff --git a/mmcv/device/ipu/hook_wrapper.py b/mmcv/device/ipu/hook_wrapper.py deleted file mode 100755 index 141afb86d0..0000000000 --- a/mmcv/device/ipu/hook_wrapper.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.runner import HOOKS, LrUpdaterHook, OptimizerHook -from mmcv.utils import TORCH_VERSION, digit_version - - -def wrap_lr_updater_hook(lr_hook_class): - """A wrapper function to wrap any subclass of LrUpdaterHook. - - IPU needs extra operations to upload optimizer settings. This wrapper will - override function(_set_lr) of a subclass of LrUpdaterHook. 
- """ - assert issubclass(lr_hook_class, LrUpdaterHook) - - class ipu_lr_hook_class(lr_hook_class): - - def _set_lr(self, runner, *args, **kwargs): - super()._set_lr(runner, *args, **kwargs) - # convert torch optimizer to poptorch optimizer - runner.model.setOptimizer(runner.optimizer) - - return ipu_lr_hook_class - - -def wrap_optimizer_hook(optimizer_hook_class): - """A wrapper function to wrap OptimizerHook. - - This is an non-intrusive implementation of wrapping optimizer hook (or you - need to change every config file to use IPU optimizer hook) IPU's clip-norm - implementation is different from pytorch, so there should be an error - raised when using clip-norm. - """ - - class ipu_optimizer_hook_class(OptimizerHook): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - if self.grad_clip is not None: - raise NotImplementedError('IPU does not support gradient clip') - - return ipu_optimizer_hook_class - - -if (TORCH_VERSION != 'parrots' - and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): - - @HOOKS.register_module() - class IPUFp16OptimizerHook(OptimizerHook): - """FP16 optimizer hook (using PyTorch's implementation). - - If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, - to take care of the optimization procedure. - - Args: - loss_scale (float | str | dict): Scale factor configuration. - If loss_scale is a float, static loss scaling will be used with - the specified scale. If loss_scale is a string, it must be - 'dynamic', then dynamic loss scaling will be used. - It can also be a dict containing arguments of GradScalar. - Defaults to 512. For Pytorch >= 1.6, mmcv uses official - implementation of GradScaler. If you use a dict version of - loss_scale to create GradScaler, please refer to: - https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler - for the parameters. - - Examples: - >>> loss_scale = dict( - ... init_scale=65536.0, - ... growth_factor=2.0, - ... backoff_factor=0.5, - ... growth_interval=2000 - ... ) - >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale) - """ - - def __init__(self, - grad_clip=None, - coalesce=True, - bucket_size_mb=-1, - loss_scale=512., - distributed=True): - assert grad_clip is None,\ - 'IPU mode does not support `grad_clip` currently' - assert coalesce,\ - 'implemented all reduce in distributed training currently' - assert bucket_size_mb == -1,\ - '`bucket_size_mb` should not be set in IPU mode' - self.distributed = distributed - self._scale_update_param = None - if loss_scale == 'dynamic': - raise NotImplementedError( - 'IPU mode does not support dynamic loss scale currently') - elif isinstance(loss_scale, float): - self.loss_scale = loss_scale - elif isinstance(loss_scale, dict): - raise NotImplementedError( - 'IPU mode supports single scale currently') - else: - raise ValueError( - f'loss_scale should be float, but got {loss_scale} ') - - def after_train_iter(self, runner): - pass - -else: - raise RuntimeError('The IPU mode only supports torch 1.6 and above') diff --git a/mmcv/device/ipu/model_wrapper.py b/mmcv/device/ipu/model_wrapper.py deleted file mode 100755 index c345537e29..0000000000 --- a/mmcv/device/ipu/model_wrapper.py +++ /dev/null @@ -1,721 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import copy -import inspect -from collections import OrderedDict -from typing import Optional, Union - -import poptorch -import torch -import torch.nn as nn -from poptorch import PoplarExecutor, __version__, identity_loss -from poptorch._args_parser import ArgsParser - -from mmcv.runner import auto_fp16 -from .hierarchical_data_manager import HierarchicalDataManager -from .utils import compare_ndarray, model_sharding, recomputation_checkpoint - - -class DictArgsParser(ArgsParser): - """A helper class for handling model input. - - Args: - inputs (list): Inputs of model. - """ - - def __init__(self, inputs): - # Combine args and kwargs: - self._has_variadic_arguments = True - self._varnames = list(inputs.keys()) - self._defaults = [inspect.Parameter.empty for _ in self._varnames] - self._warned_not_contiguous_input = False - - -class WrappedNet(nn.Module): - """A net wrapper for model conversion. - - This wrapper will make some changes and add some extra functions to - training/inference model. - - Args: - model (:obj:`nn.Module`): The model to run. - inputs_manager (:obj:`HierarchicalDataManager`): A parser - converting inputs from tuple to dictionary. - outputs_manager (:obj:`HierarchicalDataManager`): A parser - converting outputs from dictionary to tuple. - inter_outputs_in_cpu (dict): Specify the features to be - recorded. - modules_to_record (mmcv.Config, list): Index or name of modules which - will be recorded for output. It is necessary to specify output for - static graph of model training or inference. - """ - - def __init__(self, - model, - inputs_manager, - outputs_manager, - inter_outputs_in_cpu, - modules_to_record=None): - super().__init__() - self.model = model - self.inputs_manager = inputs_manager - self.outputs_manager = outputs_manager - self.training = model.training - # Register a hook function to capture the intermediate features - # generated by the network to align the outputs between ipu and cpu - # Used to confirm whether the implementation of CPU is consistent - # with the implementation of IPU - self.inter_outputs_in_cpu = inter_outputs_in_cpu - if modules_to_record is None: - modules_to_record = [] - - for idx, (name, module) in enumerate(model.named_modules()): - if name in modules_to_record or idx in modules_to_record: - features_hook = self.get_input_output_hook( - name, idx, self.inter_outputs_in_cpu) - module.register_forward_hook(hook=features_hook) - - def get_input_output_hook(self, name, idx, save_dict): - - def input_output_hook(module, fea_in, fea_out): - if isinstance(fea_in, tuple): - fea_in = list(fea_in) - if isinstance(fea_out, tuple): - fea_out = list(fea_out) - save_dict[name] = { - 'fea_in': fea_in, - 'fea_out': fea_out, - 'idx': idx - } - return None - - return input_output_hook - - def forward(self, inputs_tuple): - """This function is used to be compiled to ipu, the inputs and outputs - need to be tuples, so here we need to restore the input back to a - dictionary and convert the output to a tuple.""" - self.inputs_manager.update_all_tensors(inputs_tuple) - kwargs = {**(self.inputs_manager.hierarchical_data)} - if self.training: - outputs = self.forward_train(kwargs) - # tell poptorch which loss will be used finally - identity_loss(outputs['loss'], reduction='none') - else: - outputs = self.forward_eval(kwargs) - - if isinstance(outputs, torch.Tensor): - # currently not support single tensor output, - # need to wrap it with a dictionary, - # use a keyword to identify this case - outputs = {'output of WrappedNet: single tensor': outputs} - 
- # if there are some features need to be record, add extra outputs - for name in self.inter_outputs_in_cpu: - outputs[name] = self.inter_outputs_in_cpu[name] - - # record all the places of return tensors in the converting stage - # while in the real run stage, all the tensor are changed in-place - # that means the output can be obtained directly outside this function - self.outputs_manager.record_hierarchical_data(outputs) - plain_outputs = self.outputs_manager.collect_all_tensors() - return plain_outputs - - def forward_train(self, kwargs): - optimizer = kwargs.pop('optimizer') - outputs = self.train_step(kwargs, optimizer) - return outputs - - def train_step(self, data, optimizer=None, **kwargs): - """The iteration step during training. - - This method defines an iteration step during training, except for the - back propagation and optimizer updating, which are done in an optimizer - hook. Note that in some complicated cases or models, the whole process - including back propagation and optimizer updating are also defined in - this method, such as GAN. - - Args: - data (dict): The output of dataloader. - optimizer (:obj:`torch.optim.Optimizer`, optional): The - optimizer of runner is passed to ``train_step()``. This - argument is unused and reserved. - - Returns: - dict: Dict of outputs. The following fields are contained. - - loss (torch.Tensor): A tensor for back propagation, which \ - can be a weighted sum of multiple losses. - - log_vars (dict): Dict contains all the variables to be sent \ - to the logger. - - num_samples (int): Indicates the batch size (when the model \ - is DDP, it means the batch size on each GPU), which is \ - used for averaging the logs. - """ - losses = self.model(**data) - loss, log_vars = self._parse_losses(losses) - - outputs = dict( - loss=loss, log_vars=log_vars, num_samples=len(data['img'].data)) - - return outputs - - def _parse_losses(self, losses): - log_vars = OrderedDict() - for loss_name, loss_value in losses.items(): - if isinstance(loss_value, torch.Tensor): - log_vars[loss_name] = loss_value.mean() - elif isinstance(loss_value, list): - log_vars[loss_name] = sum(loss.mean() for loss in loss_value) - elif isinstance(loss_value, dict): - for name, value in loss_value.items(): - log_vars[name] = value - else: - raise TypeError( - f'{loss_name} is not a tensor or list of tensors') - - loss = sum(value for key, value in log_vars.items() if 'loss' in key) - log_vars['loss'] = loss - - return loss, log_vars - - def forward_eval(self, kwargs): - img = kwargs.pop('img') - img_metas = kwargs.pop('img_metas', None) - return_loss = kwargs.pop('return_loss') - assert not return_loss - # TODO Temporarily hard-code to close post_process, - # otherwise, in the third trace(_check_trace), - # post_process will convert output tensor to numpy array automatically, - # resulting in _check_trace failure - outputs = self.model( - img, - img_metas=img_metas, - return_loss=return_loss, - post_process=False) - return outputs - - -class MMPoplarExecutor(PoplarExecutor): - """An executor for inputs/outputs parsing, model compilation, data - alignment and IPU upload/download. - - Args: - model (:obj:`nn.Module`): The model to be compiled. - logger (:obj:`logging.Logger`): Logger used during running. - Defaults to None. - training (bool): Model in training mode or eval mode. - modules_to_record (mmcv.Config, list): Index or name of modules which - will be recorded for output. It is necessary to specify output for - static graph of model training or inference. 
- args (argument list): Arguments passed to the `__init__` - method of PoplarExecutor. - kwargs (keyword arguments): Keyword arguments passed to the `__init__` - method of PoplarExecutor. - """ - - def __init__(self, - model, - logger=None, - training=True, - modules_to_record=None, - *args, - **kwargs): - # self.model == self._user_model: input pytorch model - # self._model: wrapped model which is used to compile - # and update weights, these two models use same weights - # wrapped model only accept and output tuple, so - # HierarchicalDataManager will convert dictionary - # to tuple and convert them back - self.inputs_manager = HierarchicalDataManager(logger=logger) - self.outputs_manager = HierarchicalDataManager(logger=logger) - self.logger = logger - # the features calculated by CPU - self.inter_outputs_in_cpu = {} - # the features calculated by IPU - self.inter_outputs_in_ipu = {} - if modules_to_record is None: - # It is possible that the IPU implementation of some operators - # is inconsistent with the expected (CPU), here you can use - # this method to confirm whether there is a problem - self.compare_with_cpu = False - else: - self.compare_with_cpu = True - # move model.fp16_enabled to self.fp16_enabled, - # modify the position where the input is automatically casted to half - if getattr(model, 'fp16_enabled', False): - model.fp16_enabled = False - self.fp16_enabled = True - # make torch.jit.trace convert self._model - model = WrappedNet( - model, - self.inputs_manager, - self.outputs_manager, - self.inter_outputs_in_cpu, - modules_to_record=modules_to_record) - super().__init__(model, training=training, *args, **kwargs) - # overwrite self._args_parser in train_step or val_step - self._args_parser = None - if training: - assert self.training - else: - assert not self.training - - @property - def training(self): - # If trying to get the attribute(training) of self, - # since the class has no training attribute, - # it will automatically look for the training attribute of self.model. - # However, the real attribute we want to check is self._training, - # self.model.training and self._training are often inconsistent. 
-        # It is not clear whether it is a Poptorch bug or a special design,
-        # temporarily use this function to fix the problem
-        return self._training  # comes from self.model._training
-
-    @auto_fp16(supported_types=(PoplarExecutor, ))
-    def run_model(self, data_dict):
-        # this function is used to parse input_dict
-        # and convert to output_dict
-        if self.isCompiled():
-            self.inputs_manager.record_hierarchical_data(data_dict)
-            inputs_tuple = tuple(self.inputs_manager.collect_all_tensors())
-        else:
-            # get tensors out of data and put them in a tuple
-            self.inputs_manager.record_hierarchical_data(data_dict)
-            inputs_tuple = tuple(self.inputs_manager.collect_all_tensors())
-            # turn logger in data manager off after compilation
-            self.inputs_manager.quick()
-            self.outputs_manager.quick()
-
-        # parse args in the first iter
-        if self._args_parser is None:
-            self._args_parser = DictArgsParser({'args': inputs_tuple})
-
-        # run or convert model
-        # the plain_outputs will be used in converting stage
-        plain_outputs = self(inputs_tuple)
-
-        self.inputs_manager.clean_all_tensors()
-
-        # put list of tensors back to the output dict
-        # according to the same order
-        self.outputs_manager.update_all_tensors(plain_outputs)
-        # get the real output dictionary from self.outputs_manager
-        output_dict = self.outputs_manager.hierarchical_data
-
-        # split output_dict into inter_outputs_in_ipu
-        # and output of the torch model
-        torch_model_output = {}
-        for name in output_dict:
-            if name in self.inter_outputs_in_cpu:
-                self.inter_outputs_in_ipu[name] = output_dict[name]
-            else:
-                torch_model_output[name] = output_dict[name]
-
-        if 'output of WrappedNet: single tensor' in output_dict:
-            assert len(torch_model_output) == 1
-            assert isinstance(
-                torch_model_output['output of WrappedNet: single tensor'],
-                torch.Tensor)
-            torch_model_output = \
-                torch_model_output['output of WrappedNet: single tensor']
-
-        return torch_model_output
-
-    def train_step(self, data, optimizer=None, **kwargs):
-        # arguments from mmcls/models/classifiers/base.py:
-        # BaseClassifier.train_step
-        assert self.training
-        assert len(kwargs) == 0  # TODO, support later if necessary
-
-        # TODO support datacontainer as input
-        # currently, auto_fp16 and HierarchicalDataManager take too much
-        # time on traversing datacontainer
-        data['img_metas'] = None
-        num_samples = len(data['img'].data)
-
-        # TODO we will ignore optimizer because it will not be used in model,
-        # support later if necessary
-        data['optimizer'] = None
-        output_dict = self.run_model(data)
-
-        # outputs contain loss, log_vars, num_samples,
-        # only loss(torch.tensor) has been updated
-        # remove all unchanged vars, keep only torch.tensor
-        neat_output_dict = {'loss': output_dict['loss']}
-
-        # re-parse outputs, get back log_vars and num_samples
-        loss, log_vars = self.model._parse_losses(neat_output_dict)
-        final_output_dict = dict(
-            loss=loss, log_vars=log_vars, num_samples=num_samples)
-        return final_output_dict
-
-    def eval_call(self, img, img_metas=None, return_loss=True, **kwargs):
-        # arguments from mmdet/models/detectors/base.py:BaseDetector.forward
-        # tmp usage for eval mode
-        assert not self.training
-        assert len(kwargs) == 0  # TODO, support later if necessary
-        assert not return_loss
-        data = {'img': img, 'img_metas': img_metas, 'return_loss': return_loss}
-
-        output_dict = self.run_model(data)
-
-        return output_dict
-
-    def detachFromDevice(self):
-        if self.isCompiled() and self._is_attached:
-            super().detachFromDevice()
-
-    def attachToDevice(self):
-        if self.isCompiled()
and not self._is_attached: - super().attachToDevice() - - -class TrainEvalModel: - """A class maintaining training MMPoplarExecutor and inference - MMPoplarExecutor. - - Args: - train_model (:obj:`nn.Module`): The training model to be compiled. - ``train_model`` can be None if only executing validation. - eval_model (:obj:`nn.Module`): The inference model to be compiled. - options (mmcv.Config, dict): Options that will be used to compile - and run the model. - optimizer (:obj:`torch.optim.Optimizer`, optional): torch - optimizer, necessary if in training mode - logger (:obj:`logging.Logger`): Logger used during running. - Defaults to None. - modules_to_record (mmcv.Config, list): Index or name of modules which - will be recorded for output. It is necessary to specify output for - static graph of model training or inference. - """ - - def __init__(self, - train_model, - eval_model, - options, - optimizer, - modules_to_record=None, - logger=None): - if train_model is None: - self._train_executor = None - self.training = False - else: - self._train_executor = get_training_model( - train_model, - options=options['training'], - optimizer=optimizer, - logger=logger, - modules_to_record=modules_to_record) - self.training = True - self._eval_executor = get_inference_model( - eval_model, options=options['inference'], logger=logger) - - @property - def executor(self): - if self.training: - return self._train_executor - else: - return self._eval_executor - - def train(self, mode: bool = True): - """Sets the module in training mode. - - This has any effect only on certain modules. See documentations of - particular modules for details of their behaviors in - training/evaluation mode, if they are affected, - e.g. :class:`Dropout`, :class:`BatchNorm`, etc. - - Args: - mode (bool): whether to set training mode (``True``) or evaluation - mode (``False``). Default: ``True``. - - Returns: - Module: self - """ - if not isinstance(mode, bool): - raise ValueError('training mode is expected to be boolean, ' - f'but got {type(mode)}') - if self._train_executor is None and mode: - raise RuntimeError( - 'The train_executor is not initialized.' - 'If you want to initialize train_executor,' - 'you need to input optimizer when converting pytorch model') - - if mode == self.training: - self.model.train(mode) - return self - else: - if self.isCompiled(): - # copy weights from IPU to cpu before off-load current session - self.copyWeightsToHost() - # detach the current session before change the mode, - # if is training mode and weights are updated, - # poptorch will copy weights from IPU to host - self.detachFromDevice() - - self.training = mode # session will changed with mode changing - self.model.train(mode) - - # after changing mode, attach the current new session, - # and this function will copy weights of model to device - self.attachToDevice() - return self - - def eval(self): - """Sets the module in evaluation mode. - - This has any effect only on certain modules. - See documentations of particular modules - for details of their behaviors in training/evaluation mode, - if they are affected, e.g. :class:`Dropout`, :class:`BatchNorm`, etc. - - This is equivalent with :meth:`self.train(False) - `. - - See :ref:`locally-disable-grad-doc` for a comparison between - `.eval()` and several similar mechanisms that may be confused with it. 
- - Returns: - Module: self - """ - return self.train(False) - - def compare_data_between_ipu_and_cpu(self, inter_outputs_in_cpu, - inter_outputs_in_ipu): - for key, val in inter_outputs_in_cpu.items(): - is_tensor = isinstance(val['fea_in'], torch.Tensor) - fea_in_cpu = val['fea_in'] - fea_in_cpu_list = [fea_in_cpu] if is_tensor else fea_in_cpu - fea_in_ipu = inter_outputs_in_ipu[key]['fea_in'] - fea_in_ipu_list = [fea_in_ipu] if is_tensor else fea_in_ipu - - is_tensor = isinstance(val['fea_out'], torch.Tensor) - fea_out_cpu = val['fea_out'] - fea_out_cpu_list = [fea_out_cpu] if is_tensor else fea_out_cpu - fea_out_ipu = inter_outputs_in_ipu[key]['fea_out'] - fea_out_ipu_list = [fea_out_ipu] if is_tensor else fea_out_ipu - - print('comparing layer:', key) - for idx, (featA, featB) in \ - enumerate(zip(fea_in_cpu_list, fea_in_ipu_list)): - print('fea_in, tensor ', idx) - compare_ndarray(featA.detach().numpy(), featB.detach().numpy()) - for idx, (featA, featB) in \ - enumerate(zip(fea_out_cpu_list, fea_out_ipu_list)): - print('fea_out, tensor', idx) - compare_ndarray(featA.detach().numpy(), featB.detach().numpy()) - - # TODO Unified training and eval interface, - # merge train_step(train) and __call__(eval) together - def train_step(self, data, optimizer=None, **kwargs): - assert self.training, 'not supported train_step on eval mode' - inter_outputs_in_cpu = {} - if (self._train_executor.isCompiled() - and self._train_executor.compare_with_cpu): - self.copyWeightsToHost() - # run in CPU mode - self._train_executor.model.train_step(data, optimizer, **kwargs) - inter_outputs_in_cpu = { - **(self._train_executor.inter_outputs_in_cpu) - } - # run in IPU mode - result = self._train_executor.train_step(data, optimizer, **kwargs) - if (self._train_executor.isCompiled() - and self._train_executor.compare_with_cpu - and len(inter_outputs_in_cpu) > 0): - self.compare_data_between_ipu_and_cpu( - inter_outputs_in_cpu, - self._train_executor.inter_outputs_in_ipu) - return result - - # TODO Unified training and eval interface, - # merge train_step(train) and __call__(eval) together - def __call__(self, *args, **kwargs): - if self.training: - raise NotImplementedError('use train_step rather than __call__') - else: - return self._eval_executor.eval_call(*args, **kwargs) - - def __getattr__(self, attr): - return getattr(self.executor, attr) - - -def get_training_model(model: nn.Module, - options: Optional[poptorch.Options] = None, - optimizer: Optional[torch.optim.Optimizer] = None, - logger=None, - modules_to_record=None) -> poptorch.PoplarExecutor: - """Create a PopTorch training model from a PyTorch model, running on IPU - hardware in training mode. - - Note: - PopTorch makes a shallow copy of the model. Changes to the - parameters in the returned training model affect the original model - and vice versa. However, primitive variable types are not synced: for - example calling ``model.train()`` on the original model, which - changes the ``training`` bool of the model instance, will not alter the - model returned by this function. You may need to call ``model.train()`` - on your model before you call this function for correct behavior. - - Args: - model (:obj:`nn.Module`): The model to run. - options (poptorch.Options): Options that will be used to compile - and run the model. - optimizer (:obj:`torch.optim.Optimizer`, optional): The optimizers - to apply during training. - logger (:obj:`logging.Logger`): Logger used during running. - Defaults to None. 
- modules_to_record (mmcv.Config, list): Index or name of modules which - will be recorded for output. It is necessary to specify output for - static graph of model training or inference. - - Returns: - The :class:`poptorch.PoplarExecutor` wrapper to use in place - of ``model``. - """ - # Create a copy of the original model in case it needs to be wrapped - maybe_wrapped_model = copy.copy(model) - - return MMPoplarExecutor( - model=maybe_wrapped_model, - logger=logger, - options=options, - training=True, - optimizer=optimizer, - user_model=model, - modules_to_record=modules_to_record, - poptorch_version=__version__) - - -def get_inference_model(model: Union[nn.Module, poptorch.PoplarExecutor], - options: Optional[poptorch.Options] = None, - logger=None) -> poptorch.PoplarExecutor: - """Create a PopTorch inference model from a PyTorch model, running on IPU - hardware in inference mode. - - Note: - PopTorch makes a shallow copy of the model. Changes to the - parameters in the returned inference model affect the original model - and vice versa. However, primitive variable types are not synced: for - example calling ``model.eval()`` on the original model will not alter - the model returned by this function. You may need to call - ``model.eval()`` on your model before you call this function for - correct behavior. - - Args: - model (:obj:`nn.Module`): The model to run. - options (poptorch.Options): Options that will be used to compile - and run the model. - logger (:obj:`logging.Logger`): Logger used during running. - Defaults to None. - - Returns: - The :class:`poptorch.PoplarExecutor` wrapper to use in place of - ``model``. - """ - - return MMPoplarExecutor( - model=copy.copy(model), - logger=logger, - options=options, - training=False, - poptorch_version=__version__) - - -def ipu_model_wrapper(model, - options, - optimizer=None, - logger=None, - modules_to_record=None, - ipu_model_cfg=None, - fp16_cfg=None): - """Convert torch model to IPU model. - - Args: - model (nn.Module): The target model to be converted. - options (dict[str, poptorch.Options]): IPU options, generated - by :func:`cfg2options`. - optimizer (:obj:`torch.optim.Optimizer`, optional): torch - optimizer, necessary if in training mode - logger (:obj:`logging.Logger`): Logger used during training. - modules_to_record (mmcv.Config, list): Index or name of modules which - will be recorded for output. It is necessary to specify output for - static graph of model training or inference. - ipu_model_cfg (dict): A dictionary contains train_split_edges and - train_ckpt_nodes, See details in :func:`model_sharding` and - :func:`recomputation_checkpoint` functions. - fp16_cfg (dict): Config for IPU fp16 training. Currently supports - configs: `loss_scale`, `velocity_accum_type` and `accum_type`. - See details in - https://docs.graphcore.ai/projects/poptorch-user-guide/en/latest/index.html - - Returns: - TrainEvalModel: IPU wrapped model. 
- """ - if ipu_model_cfg is None: - ipu_model_cfg = {} - training = model.training if optimizer is not None else False - # set mixed-precision - if fp16_cfg is not None: - from mmcv.runner import wrap_fp16_model - loss_scale = fp16_cfg['loss_scale'] - wrap_fp16_model(model) - model.half() - # TODO tmp ussage to set loss scaling for torch original optimizer - if optimizer is not None: - optimizer.loss_scaling = loss_scale - if fp16_cfg.get('velocity_accum_type', False): - if fp16_cfg['velocity_accum_type'] == 'half': - optimizer.velocity_accum_type = torch.half - else: - optimizer.velocity_accum_type = torch.float32 - if fp16_cfg.get('accum_type', False): - if fp16_cfg['accum_type'] == 'half': - optimizer.accum_type = torch.half - else: - optimizer.accum_type = torch.float32 - # TODO support feature alignment for fp16 - if modules_to_record is not None: - raise NotImplementedError( - 'Feature alignment for fp16 is not implemented') - - # set model partition - if optimizer is None: - train_model = None - else: - # split model into multi-IPUs if specified - train_model = model_sharding( - copy.copy(model).train(), - ipu_model_cfg.get('train_split_edges', [])) - - recomputation_checkpoint(train_model, - ipu_model_cfg.get('train_ckpt_nodes', [])) - - # TODO support feature alignment for gradient accumulation mode - gradient_accumulation = \ - getattr(options['training'].Training, 'gradient_accumulation', 1) - if gradient_accumulation > 1: - assert modules_to_record is None, \ - 'Feature alignment for grad-accumulation mode not implemented' - - # TODO support feature alignment for multi-replica mode - replication_factor = \ - getattr(options['training'], 'replication_factor', 1) - if replication_factor > 1: - assert modules_to_record is None, \ - 'Feature alignment for multi-replica mode not implemented' - - # TODO supports different model partitions between train and eval mode - assert len(ipu_model_cfg.get('eval_split_edges', [])) == 0,\ - 'Currently, BeginBlock can only be used once on the same model' - eval_model = copy.copy(model).eval() - - # wrap model for compilation - model = TrainEvalModel( - train_model, - eval_model, - options=options, - optimizer=optimizer, - logger=logger, - modules_to_record=modules_to_record) - model.train(training) - return model diff --git a/mmcv/device/ipu/runner.py b/mmcv/device/ipu/runner.py deleted file mode 100755 index e2d4922677..0000000000 --- a/mmcv/device/ipu/runner.py +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -from mmcv.runner import (HOOKS, RUNNERS, BaseRunner, EpochBasedRunner, - IterBasedRunner) -from mmcv.utils import IS_IPU_AVAILABLE - -if IS_IPU_AVAILABLE: - from .dataloader import IPUDataLoader - from .hook_wrapper import (IPUFp16OptimizerHook, wrap_lr_updater_hook, - wrap_optimizer_hook) - from .model_wrapper import ipu_model_wrapper - from .utils import build_from_cfg_with_wrapper, cfg2options - - -class IPUBaseRunner(BaseRunner): - """A base runner for IPU. - - This runner has some extra processes for IPU which are shown below: - - 1. Parse options for IPU - 2. wrap pytorch model for IPU - 3. Raise errors while encountering illegal usage - 4. Input IPU options and initialize dataloader if finding an instance - of IPUDataLoader - - Args: - model (:obj:`nn.Module`): The model to run. - options_cfg (mmcv.Config, dict): Options that will be used to compile - and run the model. - modules_to_record (mmcv.Config, list): Index or name of modules which - will be recorded for output. 
Output of these modules must be specified for the
-            static graph of model training or inference.
-        ipu_model_cfg (mmcv.Config, dict): Config for model partitioning and
-            recomputation checkpointing.
-        fp16_cfg (mmcv.Config): Config for fp16 training.
-        batch_processor (callable): A callable method that processes a data
-            batch. It should be None for the IPU runner.
-        kwargs (Dict[str, Any], optional): Keyword arguments will be passed to
-            ``base_runner.BaseRunner``.
-    """
-
-    def __init__(self,
-                 model,
-                 options_cfg=None,
-                 modules_to_record=None,
-                 ipu_model_cfg=None,
-                 fp16_cfg=None,
-                 batch_processor=None,
-                 **kwargs):
-        assert hasattr(model, 'train_step') and batch_processor is None,\
-            'only models with train_step are supported'
-
-        if options_cfg is None:
-            options_cfg = {}
-        # call BaseRunner.__init__() here
-        super().__init__(model, **kwargs)
-
-        # process options of ipu
-        if IS_IPU_AVAILABLE:
-            self.options = cfg2options(options_cfg)
-            self.model = ipu_model_wrapper(
-                self.model,
-                self.options,
-                self.optimizer,
-                self.logger,
-                modules_to_record=modules_to_record,
-                ipu_model_cfg=ipu_model_cfg,
-                fp16_cfg=fp16_cfg)
-        else:
-            raise NotImplementedError('cpu mode on IPURunner is not supported')
-
-    def register_lr_hook(self, lr_config):
-        if lr_config is None:
-            return
-        assert isinstance(lr_config, dict)
-        assert 'policy' in lr_config
-        policy_type = lr_config.pop('policy')
-        # If the type of policy is all in lower case,
-        # e.g., 'cyclic', then its first letter will be capitalized,
-        # e.g., to be 'Cyclic'.
-        # This is for the convenient usage of the LR updater.
-        # Since this is not applicable for
-        # `CosineAnnealingLrUpdater`, the string will not be changed
-        # if it contains capital letters.
-        if policy_type == policy_type.lower():
-            policy_type = policy_type.title()
-        hook_type = policy_type + 'LrUpdaterHook'
-        lr_config['type'] = hook_type
-        hook = build_from_cfg_with_wrapper(lr_config, HOOKS,
-                                           wrap_lr_updater_hook)
-        self.register_hook(hook, priority='VERY_HIGH')
-
-    def register_optimizer_hook(self, optimizer_config):
-        if optimizer_config is None:
-            return
-        assert isinstance(optimizer_config, (dict, IPUFp16OptimizerHook))
-        if isinstance(optimizer_config, dict):
-            optimizer_config.setdefault('type', 'OptimizerHook')
-            hook = build_from_cfg_with_wrapper(optimizer_config, HOOKS,
-                                               wrap_optimizer_hook)
-        else:
-            hook = optimizer_config
-        self.register_hook(hook, priority='ABOVE_NORMAL')
-
-    def run(self, data_loaders, workflow, *args, **kwargs):
-        for i, flow in enumerate(workflow):
-            mode, _ = flow
-            # initialize the IPU dataloader if not initialized
-            assert isinstance(data_loaders[i], IPUDataLoader),\
-                'IPU runner can only work with `IPUDataLoader`'
-            data_loaders[i].init(options=self.get_options(mode))
-
-        super().run(data_loaders, workflow, *args, **kwargs)
-
-    def get_options(self, mode):
-        if mode == 'train':
-            return self.options['training']
-        elif mode == 'val':
-            return self.options['inference']
-        else:
-            raise ValueError(f'mode should be train or val but got {mode}')
-
-
-@RUNNERS.register_module()
-class IPUEpochBasedRunner(IPUBaseRunner, EpochBasedRunner):
-    """Epoch-based Runner for IPU.
-
-    The inheritance order (MRO) is: IPUEpochBasedRunner -> IPUBaseRunner ->
-    EpochBasedRunner -> BaseRunner. This runner trains models epoch by epoch.
-    """
-    pass
-
-
-@RUNNERS.register_module()
-class IPUIterBasedRunner(IPUBaseRunner, IterBasedRunner):
-    """Iteration-based Runner for IPU.
-
-    The inheritance order (MRO) is: IPUIterBasedRunner -> IPUBaseRunner ->
-    IterBasedRunner -> BaseRunner. This runner trains models iteration by
-    iteration.
-    """
-    pass
diff --git a/mmcv/device/ipu/utils.py b/mmcv/device/ipu/utils.py
deleted file mode 100755
index 79709db1ee..0000000000
--- a/mmcv/device/ipu/utils.py
+++ /dev/null
@@ -1,244 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import inspect
-
-import numpy as np
-import popart
-import poptorch
-import torch
-import torch.nn as nn
-
-from mmcv.utils import Registry
-
-
-def _options_assigner(cfg, options_node):
-    # set popart.options by config
-    # cfg: dict, python data type
-    # options_node: python module or function
-    if isinstance(cfg, dict):
-        for key in cfg:
-            _options_assigner(cfg[key], getattr(options_node, key))
-    elif isinstance(cfg, (int, float, str, list)):
-        if callable(options_node):
-            options_node(cfg)
-        else:
-            error_msg = f'options_node type {type(options_node)} not supported'
-            raise NotImplementedError(error_msg)
-    else:
-        error_msg = f'cfg type {type(cfg)} not supported'
-        raise NotImplementedError(error_msg)
-
-
-def cfg2options(cfg):
-    """Parse a dictionary into IPU options.
-
-    Args:
-        cfg (dict): A dictionary of IPU settings.
-
-    Returns:
-        dict[str, poptorch.Options]: Training options and inference options
-        of IPU.
-    """
-    # set ipu options for inference and training by config
-    train_cfg = cfg.pop('train_cfg', {})
-    eval_cfg = cfg.pop('eval_cfg', {})
-    eval_cfg['replicationFactor'] = 1  # eval mode only uses one replica
-    eval_cfg['executionStrategy'] = 'ShardedExecution'
-    # overwrite the default ipu cfg with the specified train cfgs
-    training_ipu_cfg = {**cfg, **train_cfg}
-    # overwrite the default ipu cfg with the specified eval cfgs
-    inference_ipu_cfg = {**cfg, **eval_cfg}
-
-    ipu_options = {
-        'training': _cast_to_options(training_ipu_cfg),
-        'inference': _cast_to_options(inference_ipu_cfg)
-    }
-
-    # TODO make these settings configurable
-    ipu_options['training']._Popart.set('disableGradAccumulationTensorStreams',
-                                        True)
-    ipu_options['training']._Popart.set(
-        'accumulateOuterFragmentSettings.schedule',
-        int(popart.AccumulateOuterFragmentSchedule.OverlapMemoryOptimized))
-    ipu_options['training'].Precision.enableStochasticRounding(True)
-
-    return ipu_options
-
-
-def _cast_to_options(cfg):
-    # If it cannot be directly assigned, use an if statement to parse it;
-    # if it can be directly assigned, use _options_assigner to assign it.
-    options = poptorch.Options()
-
-    if 'availableMemoryProportion' in cfg:
-        available_memory_proportion = cfg.pop('availableMemoryProportion')
-        mem_props = {}
-        for i, mem_prop in enumerate(available_memory_proportion):
-            mem_props[f'IPU{i}'] = mem_prop
-        options.setAvailableMemoryProportion(mem_props)
-
-    if 'executionStrategy' in cfg:
-        execution_strategy = cfg.pop('executionStrategy')
-        if execution_strategy == 'SameAsIpu':
-            options.setExecutionStrategy(
-                poptorch.PipelinedExecution(
-                    getattr(poptorch.AutoStage, execution_strategy)))
-        elif execution_strategy == 'ShardedExecution':
-            options.setExecutionStrategy(poptorch.ShardedExecution())
-        else:
-            raise NotImplementedError(
-                'executionStrategy should be "SameAsIpu" or "ShardedExecution"'
-                f', but got {execution_strategy}')
-
-    if 'partialsType' in cfg:
-        partials_type = cfg.pop('partialsType')
-        options.Precision.setPartialsType(getattr(
-            torch, partials_type))  # half or float
-
-    _options_assigner(cfg, options)
-    return options
-
-
-def model_sharding(model, split_edges):
-    """Split a model in-place across multiple IPUs.
-
-    Args:
-        model (nn.Module): The target model to be split.
-        split_edges (list of dict): Model layer names or layer numbers
-            of the split edges. Each item of ``split_edges`` is a dictionary,
-            which may contain the following key-value pairs:
-
-            - layer_to_call: PyTorch module to assign to the block
-            - user_id (optional): A user defined identifier for the block.
-            - ipu_id: The id of the IPU to run on.
-
-    Examples:
-        >>> split_edges = [
-        ...     dict(layer_to_call='model.conv1', ipu_id=0),
-        ...     dict(layer_to_call='model.conv3', ipu_id=1)]
-        >>> sharding_model = model_sharding(torch_model, split_edges)
-
-    Returns:
-        nn.Module: Split model.
-    """
-    if len(split_edges) == 0:
-        return model
-    assert isinstance(split_edges, list)
-    split_edges_dict = {edge['layer_to_call']: edge for edge in split_edges}
-
-    for idx, (name, module) in enumerate(model.named_modules()):
-        if idx in split_edges_dict and name in split_edges_dict:
-            raise ValueError(
-                'The same layer is referenced twice while doing model'
-                f' partition: idx is {idx} and name is {name}')
-
-        edge = split_edges_dict.pop(name, None)
-        edge = split_edges_dict.pop(idx, edge)
-        if edge is not None:
-            poptorch.BeginBlock(module, edge.get('user_id', name),
-                                edge['ipu_id'])
-
-    # ensure all split_edges are used
-    if len(split_edges_dict) > 0:
-        split_edge_names = list(split_edges_dict.keys())
-        raise RuntimeError(
-            f'split_edges: {split_edge_names} are not contained in the model')
-    return model
-
-
-def recomputation_checkpoint(model: nn.Module, module_names: list):
-    """Annotates the output of a module to be checkpointed instead of
-    recomputed.
-
-    If recomputation mode is enabled, the IPU will release the activations
-    of intermediate layers to save memory. During the backward pass, the
-    released activations are then recalculated. This function declares the
-    activations of some intermediate layers that should be saved, so that
-    their recomputation is skipped.
-
-    Args:
-        model (nn.Module): The target model to apply recomputation
-            checkpointing to.
-        module_names (list): Layer names of the modules to checkpoint.
-    """
-
-    def recompute_outputs(module, inputs, outputs):
-        if isinstance(outputs, tuple):
-            return tuple(poptorch.recomputationCheckpoint(y) for y in outputs)
-        else:
-            return poptorch.recomputationCheckpoint(outputs)
-
-    for name, module in model.named_modules():
-        if name in module_names:
-            module.register_forward_hook(recompute_outputs)
-            module_names.remove(name)
-
-    # check all module_names are used
-    assert len(module_names) == 0,\
-        f'recomputed nodes: {module_names} are not contained in the model'
-
-
-def compare_ndarray(featA, featB, rtol=1e-3, atol=1e-5):
-    """Align data between two activations or weights."""
-    try:
-        np.testing.assert_allclose(featA, featB, rtol=rtol, atol=atol)
-    except AssertionError as e:
-        print(e)
-
-
-def build_from_cfg_with_wrapper(cfg,
-                                registry,
-                                wrapper_func=None,
-                                default_args=None):
-    """Build a module from a config dict and wrap it with "wrapper_func".
-
-    Args:
-        cfg (dict): Config dict. It should at least contain the key "type".
-        registry (:obj:`Registry`): The registry to search the type from.
-        default_args (dict, optional): Default initialization arguments.
-        wrapper_func (function): Used to wrap the class.
-
-    Returns:
-        object: The constructed object.
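The sharding and recomputation helpers above were typically used together. A hedged sketch follows; it runs only where poptorch is available, and the toy network and block ids are illustrative assumptions.

# Hedged sketch; assumes poptorch. nn.Sequential names children '0', '1', ...
import torch.nn as nn

net = nn.Sequential(nn.Conv2d(3, 8, 3), nn.ReLU(), nn.Conv2d(8, 8, 3))
# Start a new block on IPU 1 at the submodule named '2'; everything before
# it stays on IPU 0.
net = model_sharding(net, [dict(layer_to_call='2', ipu_id=1)])
# Keep (rather than recompute) the first conv's activations. Note that the
# helper consumes the list it is given.
recomputation_checkpoint(net, ['0'])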
- """ - if not isinstance(cfg, dict): - raise TypeError(f'cfg must be a dict, but got {type(cfg)}') - if 'type' not in cfg: - if default_args is None or 'type' not in default_args: - raise KeyError( - '`cfg` or `default_args` must contain the key "type", ' - f'but got {cfg}\n{default_args}') - if not isinstance(registry, Registry): - raise TypeError('registry must be an mmcv.Registry object, ' - f'but got {type(registry)}') - if not (isinstance(default_args, dict) or default_args is None): - raise TypeError('default_args must be a dict or None, ' - f'but got {type(default_args)}') - - args = cfg.copy() - - if default_args is not None: - for name, value in default_args.items(): - args.setdefault(name, value) - - obj_type = args.pop('type') - if isinstance(obj_type, str): - obj_cls = registry.get(obj_type) - if obj_cls is None: - raise KeyError( - f'{obj_type} is not in the {registry.name} registry') - elif inspect.isclass(obj_type): - obj_cls = obj_type - else: - raise TypeError( - f'type must be a str or valid type, but got {type(obj_type)}') - - if wrapper_func is None: - wrapped_obj_cls = obj_cls - else: - wrapped_obj_cls = wrapper_func(obj_cls) - try: - return wrapped_obj_cls(**args) - except Exception as e: - # Normal TypeError does not print class name. - raise type(e)(f'{wrapped_obj_cls.__name__}: {e}') diff --git a/mmcv/device/mlu/__init__.py b/mmcv/device/mlu/__init__.py deleted file mode 100644 index 77c71ccf3c..0000000000 --- a/mmcv/device/mlu/__init__.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .data_parallel import MLUDataParallel -from .distributed import MLUDistributedDataParallel - -__all__ = ['MLUDataParallel', 'MLUDistributedDataParallel'] diff --git a/mmcv/device/mlu/_functions.py b/mmcv/device/mlu/_functions.py deleted file mode 100644 index 75660fa9b3..0000000000 --- a/mmcv/device/mlu/_functions.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Union - -import torch - - -def scatter(input: Union[List, torch.Tensor], devices: List) -> List: - """scatter copies tensor to MLU directly.""" - if isinstance(input, list): - outputs = [scatter(_input, devices) for _input in input] - return outputs - elif isinstance(input, torch.Tensor): - output = input.contiguous() - return output.to('mlu') if devices != [-1] else output - else: - raise Exception(f'Unknown type {type(input)}.') - - -class Scatter: - - @staticmethod - def forward(target_mlus, input): - outputs = scatter(input, target_mlus) - return tuple(outputs) if isinstance(outputs, list) else (outputs, ) diff --git a/mmcv/device/mlu/data_parallel.py b/mmcv/device/mlu/data_parallel.py deleted file mode 100644 index ebe14c0a55..0000000000 --- a/mmcv/device/mlu/data_parallel.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. - -import torch - -from mmcv.parallel import MMDataParallel -from .scatter_gather import scatter_kwargs - - -class MLUDataParallel(MMDataParallel): - """The MLUDataParallel module that supports DataContainer. - - MLUDataParallel is a class inherited from MMDataParall, which supports - MLU training and inference only. - - The main differences with MMDataParallel: - - - It only supports single-card of MLU, and only use first card to - run training and inference. - - - It uses direct host-to-device copy instead of stream-background - scatter. - - .. 
warning::
-        MLUDataParallel only supports single MLU training. If you need to
-        train with multiple MLUs, please use MLUDistributedDataParallel
-        instead. If you have multiple MLUs, you can set the environment
-        variable ``MLU_VISIBLE_DEVICES=0`` (or any other card number(s))
-        to specify the running device.
-
-    Args:
-        module (:class:`nn.Module`): Module to be encapsulated.
-        dim (int): Dimension used to scatter the data. Defaults to 0.
-    """
-
-    def __init__(self, *args, dim=0, **kwargs):
-        super().__init__(*args, dim=dim, **kwargs)
-        self.device_ids = [0]
-        self.src_device_obj = torch.device('mlu:0')
-
-    def scatter(self, inputs, kwargs, device_ids):
-        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/mlu/distributed.py b/mmcv/device/mlu/distributed.py
deleted file mode 100644
index 3768c754c9..0000000000
--- a/mmcv/device/mlu/distributed.py
+++ /dev/null
@@ -1,20 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-from mmcv.parallel import MMDistributedDataParallel
-from .scatter_gather import scatter_kwargs
-
-
-class MLUDistributedDataParallel(MMDistributedDataParallel):
-    """The DDP module that supports DataContainer.
-
-    MLUDDP differs from MMDDP in one respect: it moves data to the MLU by
-    copying instead of scattering.
-    """
-
-    def to_kwargs(self, inputs, kwargs, device_id):
-        # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8
-        # to move all tensors to device_id
-        return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim)
-
-    def scatter(self, inputs, kwargs, device_ids):
-        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/mlu/scatter_gather.py b/mmcv/device/mlu/scatter_gather.py
deleted file mode 100644
index 0b0c9b96f5..0000000000
--- a/mmcv/device/mlu/scatter_gather.py
+++ /dev/null
@@ -1,59 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import torch
-
-from mmcv.parallel.data_container import DataContainer
-from ._functions import Scatter
-
-
-def scatter(inputs, target_mlus, dim=0):
-    """Scatter inputs to the target MLU.
-
-    The only difference from the original :func:`scatter` is the added
-    support for :type:`~mmcv.parallel.DataContainer`.
-    """
-
-    def scatter_map(obj):
-        if isinstance(obj, torch.Tensor):
-            if target_mlus != [-1]:
-                obj = obj.to('mlu')
-                return [obj]
-            else:
-                # for CPU inference we use a self-implemented scatter
-                return Scatter.forward(target_mlus, obj)
-        if isinstance(obj, DataContainer):
-            if obj.cpu_only:
-                return obj.data
-            else:
-                return Scatter.forward(target_mlus, obj.data)
-        if isinstance(obj, tuple) and len(obj) > 0:
-            return list(zip(*map(scatter_map, obj)))
-        if isinstance(obj, list) and len(obj) > 0:
-            out = list(map(list, zip(*map(scatter_map, obj))))
-            return out
-        if isinstance(obj, dict) and len(obj) > 0:
-            out = list(map(type(obj), zip(*map(scatter_map, obj.items()))))
-            return out
-        return [obj for _ in target_mlus]
-
-    # After scatter_map is called, a scatter_map cell will exist. This cell
-    # has a reference to the actual function scatter_map, which has references
-    # to a closure that has a reference to the scatter_map cell (because the
-    # fn is recursive).
To avoid this reference cycle, we set the function to
-    # None, clearing the cell
-    try:
-        return scatter_map(inputs)
-    finally:
-        scatter_map = None
-
-
-def scatter_kwargs(inputs, kwargs, target_mlus, dim=0):
-    """Scatter with support for kwargs dictionary."""
-    inputs = scatter(inputs, target_mlus, dim) if inputs else []
-    kwargs = scatter(kwargs, target_mlus, dim) if kwargs else []
-    if len(inputs) < len(kwargs):
-        inputs.extend([() for _ in range(len(kwargs) - len(inputs))])
-    elif len(kwargs) < len(inputs):
-        kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))])
-    inputs = tuple(inputs)
-    kwargs = tuple(kwargs)
-    return inputs, kwargs
diff --git a/mmcv/device/mps/__init__.py b/mmcv/device/mps/__init__.py
deleted file mode 100644
index e28144ef0a..0000000000
--- a/mmcv/device/mps/__init__.py
+++ /dev/null
@@ -1,4 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from .data_parallel import MPSDataParallel
-
-__all__ = ['MPSDataParallel']
diff --git a/mmcv/device/mps/data_parallel.py b/mmcv/device/mps/data_parallel.py
deleted file mode 100644
index 7ae5396d24..0000000000
--- a/mmcv/device/mps/data_parallel.py
+++ /dev/null
@@ -1,34 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-
-import torch
-
-from mmcv.parallel import MMDataParallel
-from ..scatter_gather import scatter_kwargs
-
-
-class MPSDataParallel(MMDataParallel):
-    """The MPSDataParallel module that supports DataContainer.
-
-    MPSDataParallel is a class inherited from MMDataParallel, which supports
-    MPS training and inference only.
-
-    The main differences from MMDataParallel:
-
-    - It only supports a single MPS card, and only uses the first card to
-      run training and inference.
-
-    - It uses direct host-to-device copy instead of stream-background
-      scatter.
-
-    Args:
-        module (:class:`nn.Module`): Module to be encapsulated.
-        dim (int): Dimension used to scatter the data. Defaults to 0.
-    """
-
-    def __init__(self, *args, dim=0, **kwargs):
-        super().__init__(*args, dim=dim, **kwargs)
-        self.device_ids = [0]
-        self.src_device_obj = torch.device('mps:0')
-
-    def scatter(self, inputs, kwargs, device_ids):
-        return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim)
diff --git a/mmcv/device/scatter_gather.py b/mmcv/device/scatter_gather.py
deleted file mode 100644
index 744b0ca51e..0000000000
--- a/mmcv/device/scatter_gather.py
+++ /dev/null
@@ -1,64 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import torch
-
-from mmcv.parallel.data_container import DataContainer
-from mmcv.utils import deprecated_api_warning
-from ._functions import Scatter
-from .utils import get_device
-
-
-@deprecated_api_warning({'target_mlus': 'target_devices'})
-def scatter(inputs, target_devices, dim=0):
-    """Scatter inputs to the target devices.
-
-    The only difference from the original :func:`scatter` is the added
-    support for :type:`~mmcv.parallel.DataContainer`.
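A small, hedged sketch of the device-agnostic helpers defined here; the import path is the pre-removal one, and the payloads are illustrative. It runs on whatever get_device() reports, including plain CPU.

# Hedged sketch; cpu_only data must be pre-chunked per target device.
import torch

from mmcv.device.scatter_gather import scatter_kwargs
from mmcv.parallel import DataContainer

img = torch.randn(2, 3, 4, 4)
meta = DataContainer([[dict(filename='a.jpg')]], cpu_only=True)
inputs, kwargs = scatter_kwargs((img,), dict(img_metas=meta), [0])
print(inputs[0][0].device)     # the current device
print(kwargs[0]['img_metas'])  # [{'filename': 'a.jpg'}], left on CPU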
- """ - current_device = get_device() - - def scatter_map(obj): - if isinstance(obj, torch.Tensor): - if target_devices != [-1]: - obj = obj.to(current_device) - return [obj] - else: - # for CPU inference we use self-implemented scatter - return Scatter.forward(target_devices, obj) - if isinstance(obj, DataContainer): - if obj.cpu_only: - return obj.data - else: - return Scatter.forward(target_devices, obj.data) - if isinstance(obj, tuple) and len(obj) > 0: - return list(zip(*map(scatter_map, obj))) - if isinstance(obj, list) and len(obj) > 0: - out = list(map(list, zip(*map(scatter_map, obj)))) - return out - if isinstance(obj, dict) and len(obj) > 0: - out = list(map(type(obj), zip(*map(scatter_map, obj.items())))) - return out - return [obj for _ in target_devices] - - # After scatter_map is called, a scatter_map cell will exist. This cell - # has a reference to the actual function scatter_map, which has references - # to a closure that has a reference to the scatter_map cell (because the - # fn is recursive). To avoid this reference cycle, we set the function to - # None, clearing the cell - try: - return scatter_map(inputs) - finally: - scatter_map = None - - -@deprecated_api_warning({'target_mlus': 'target_devices'}) -def scatter_kwargs(inputs, kwargs, target_devices, dim=0): - """Scatter with support for kwargs dictionary.""" - inputs = scatter(inputs, target_devices, dim) if inputs else [] - kwargs = scatter(kwargs, target_devices, dim) if kwargs else [] - if len(inputs) < len(kwargs): - inputs.extend([() for _ in range(len(kwargs) - len(inputs))]) - elif len(kwargs) < len(inputs): - kwargs.extend([{} for _ in range(len(inputs) - len(kwargs))]) - inputs = tuple(inputs) - kwargs = tuple(kwargs) - return inputs, kwargs diff --git a/mmcv/device/utils.py b/mmcv/device/utils.py deleted file mode 100644 index e2adec08dd..0000000000 --- a/mmcv/device/utils.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from mmcv.utils import IS_CUDA_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE - - -def get_device() -> str: - """Returns the currently existing device type. - - Returns: - str: cuda | mlu | mps | cpu. - """ - if IS_CUDA_AVAILABLE: - return 'cuda' - elif IS_MLU_AVAILABLE: - return 'mlu' - elif IS_MPS_AVAILABLE: - return 'mps' - else: - return 'cpu' diff --git a/mmcv/engine/__init__.py b/mmcv/engine/__init__.py deleted file mode 100644 index 3193b7f664..0000000000 --- a/mmcv/engine/__init__.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .test import (collect_results_cpu, collect_results_gpu, multi_gpu_test, - single_gpu_test) - -__all__ = [ - 'collect_results_cpu', 'collect_results_gpu', 'multi_gpu_test', - 'single_gpu_test' -] diff --git a/mmcv/engine/test.py b/mmcv/engine/test.py deleted file mode 100644 index 9baad4e0bf..0000000000 --- a/mmcv/engine/test.py +++ /dev/null @@ -1,214 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import pickle -import shutil -import tempfile -import time -from typing import Optional - -import mmengine -import torch -import torch.distributed as dist -import torch.nn as nn -from torch.utils.data import DataLoader - -import mmcv -from mmcv.runner import get_dist_info - - -def single_gpu_test(model: nn.Module, data_loader: DataLoader) -> list: - """Test model with a single gpu. - - This method tests model with a single gpu and displays test progress bar. - - Args: - model (nn.Module): Model to be tested. 
-        data_loader (DataLoader): PyTorch data loader.
-
-    Returns:
-        list: The prediction results.
-    """
-    model.eval()
-    results = []
-    dataset = data_loader.dataset
-    prog_bar = mmcv.ProgressBar(len(dataset))
-    for data in data_loader:
-        with torch.no_grad():
-            result = model(return_loss=False, **data)
-        results.extend(result)
-
-        # Assume result has the same length as batch_size
-        # refer to https://github.com/open-mmlab/mmcv/issues/985
-        batch_size = len(result)
-        for _ in range(batch_size):
-            prog_bar.update()
-    return results
-
-
-def multi_gpu_test(model: nn.Module,
-                   data_loader: DataLoader,
-                   tmpdir: Optional[str] = None,
-                   gpu_collect: bool = False) -> Optional[list]:
-    """Test a model with multiple gpus.
-
-    This method tests a model with multiple gpus and collects the results
-    under two different modes: gpu and cpu modes. By setting
-    ``gpu_collect=True``, it encodes results to gpu tensors and uses gpu
-    communication for results collection. In cpu mode it saves the results on
-    different gpus to ``tmpdir`` and collects them by the rank 0 worker.
-
-    Args:
-        model (nn.Module): Model to be tested.
-        data_loader (DataLoader): PyTorch data loader.
-        tmpdir (str): Path of the directory to save the temporary results from
-            different gpus under cpu mode.
-        gpu_collect (bool): Option to use either gpu or cpu to collect results.
-
-    Returns:
-        list: The prediction results.
-    """
-    model.eval()
-    results = []
-    dataset = data_loader.dataset
-    rank, world_size = get_dist_info()
-    if rank == 0:
-        prog_bar = mmcv.ProgressBar(len(dataset))
-    time.sleep(2)  # This line can prevent deadlock problems in some cases.
-    for i, data in enumerate(data_loader):
-        with torch.no_grad():
-            result = model(return_loss=False, **data)
-        results.extend(result)
-
-        if rank == 0:
-            batch_size = len(result)
-            batch_size_all = batch_size * world_size
-            if batch_size_all + prog_bar.completed > len(dataset):
-                batch_size_all = len(dataset) - prog_bar.completed
-            for _ in range(batch_size_all):
-                prog_bar.update()
-
-    # collect results from all ranks
-    if gpu_collect:
-        result_from_ranks = collect_results_gpu(results, len(dataset))
-    else:
-        result_from_ranks = collect_results_cpu(results, len(dataset), tmpdir)
-    return result_from_ranks
-
-
-def collect_results_cpu(result_part: list,
-                        size: int,
-                        tmpdir: Optional[str] = None) -> Optional[list]:
-    """Collect results under cpu mode.
-
-    In cpu mode, this function will save the results on different gpus to
-    ``tmpdir`` and collect them by the rank 0 worker.
-
-    Args:
-        result_part (list): Result list containing result parts
-            to be collected.
-        size (int): Size of the results, commonly equal to the length of
-            the results.
-        tmpdir (str | None): Temporary directory for the collected results to
-            be stored in. If set to None, it will create a random temporary
-            directory for it.
-
-    Returns:
-        list: The collected results.
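For reference, a hedged sketch of the gpu collection path; it assumes a CUDA machine and a process group started by a distributed launcher, and the string payloads are illustrative.

# Hedged demo; launch with e.g.: torchrun --nproc_per_node=2 demo.py
import torch
import torch.distributed as dist

from mmcv.engine import collect_results_gpu

dist.init_process_group('nccl')
torch.cuda.set_device(dist.get_rank())
rank = dist.get_rank()
part = [f'pred_{rank}_{i}' for i in range(3)]  # this rank's predictions
merged = collect_results_gpu(part, size=6)     # non-zero ranks get None
if rank == 0:
    print(merged)  # parts interleaved rank by rank, truncated to `size`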
- """ - rank, world_size = get_dist_info() - # create a tmp dir if it is not specified - if tmpdir is None: - MAX_LEN = 512 - # 32 is whitespace - dir_tensor = torch.full((MAX_LEN, ), - 32, - dtype=torch.uint8, - device='cuda') - if rank == 0: - mmcv.mkdir_or_exist('.dist_test') - tmpdir = tempfile.mkdtemp(dir='.dist_test') - tmpdir = torch.tensor( - bytearray(tmpdir.encode()), dtype=torch.uint8, device='cuda') - dir_tensor[:len(tmpdir)] = tmpdir - dist.broadcast(dir_tensor, 0) - tmpdir = dir_tensor.cpu().numpy().tobytes().decode().rstrip() - else: - mmcv.mkdir_or_exist(tmpdir) - # dump the part result to the dir - part_file = osp.join(tmpdir, f'part_{rank}.pkl') # type: ignore - mmengine.dump(result_part, part_file) - dist.barrier() - # collect all parts - if rank != 0: - return None - else: - # load results of all parts from tmp dir - part_list = [] - for i in range(world_size): - part_file = osp.join(tmpdir, f'part_{i}.pkl') # type: ignore - part_result = mmengine.load(part_file) - # When data is severely insufficient, an empty part_result - # on a certain gpu could makes the overall outputs empty. - if part_result: - part_list.append(part_result) - # sort the results - ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] - # remove tmp dir - shutil.rmtree(tmpdir) # type: ignore - return ordered_results - - -def collect_results_gpu(result_part: list, size: int) -> Optional[list]: - """Collect results under gpu mode. - - On gpu mode, this function will encode results to gpu tensors and use gpu - communication for results collection. - - Args: - result_part (list): Result list containing result parts - to be collected. - size (int): Size of the results, commonly equal to length of - the results. - - Returns: - list: The collected results. - """ - rank, world_size = get_dist_info() - # dump result part to tensor with pickle - part_tensor = torch.tensor( - bytearray(pickle.dumps(result_part)), dtype=torch.uint8, device='cuda') - # gather all result part tensor shape - shape_tensor = torch.tensor(part_tensor.shape, device='cuda') - shape_list = [shape_tensor.clone() for _ in range(world_size)] - dist.all_gather(shape_list, shape_tensor) - # padding result part tensor to max length - shape_max = torch.tensor(shape_list).max() - part_send = torch.zeros(shape_max, dtype=torch.uint8, device='cuda') - part_send[:shape_tensor[0]] = part_tensor - part_recv_list = [ - part_tensor.new_zeros(shape_max) for _ in range(world_size) - ] - # gather all result part - dist.all_gather(part_recv_list, part_send) - - if rank == 0: - part_list = [] - for recv, shape in zip(part_recv_list, shape_list): - part_result = pickle.loads(recv[:shape[0]].cpu().numpy().tobytes()) - # When data is severely insufficient, an empty part_result - # on a certain gpu could makes the overall outputs empty. 
- if part_result: - part_list.append(part_result) - # sort the results - ordered_results = [] - for res in zip(*part_list): - ordered_results.extend(list(res)) - # the dataloader may pad some samples - ordered_results = ordered_results[:size] - return ordered_results - else: - return None diff --git a/mmcv/model_zoo/deprecated.json b/mmcv/model_zoo/deprecated.json deleted file mode 100644 index 25cf6f28ca..0000000000 --- a/mmcv/model_zoo/deprecated.json +++ /dev/null @@ -1,6 +0,0 @@ -{ - "resnet50_caffe": "detectron/resnet50_caffe", - "resnet50_caffe_bgr": "detectron2/resnet50_caffe_bgr", - "resnet101_caffe": "detectron/resnet101_caffe", - "resnet101_caffe_bgr": "detectron2/resnet101_caffe_bgr" -} diff --git a/mmcv/model_zoo/mmcls.json b/mmcv/model_zoo/mmcls.json deleted file mode 100644 index c073a41d0a..0000000000 --- a/mmcv/model_zoo/mmcls.json +++ /dev/null @@ -1,59 +0,0 @@ -{ - "vgg11": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_batch256_imagenet_20210208-4271cd6c.pth", - "vgg13": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_batch256_imagenet_20210208-4d1d6080.pth", - "vgg16": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_batch256_imagenet_20210208-db26f1a5.pth", - "vgg19": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_batch256_imagenet_20210208-e6920e4a.pth", - "vgg11_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg11_bn_batch256_imagenet_20210207-f244902c.pth", - "vgg13_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg13_bn_batch256_imagenet_20210207-1a8b7864.pth", - "vgg16_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg16_bn_batch256_imagenet_20210208-7e55cd29.pth", - "vgg19_bn": "https://download.openmmlab.com/mmclassification/v0/vgg/vgg19_bn_batch256_imagenet_20210208-da620c4f.pth", - "resnet18": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet18_8xb32_in1k_20210831-fbbb1da6.pth", - "resnet34": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet34_8xb32_in1k_20210831-f257d4e6.pth", - "resnet50": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet50_8xb32_in1k_20210831-ea4938fc.pth", - "resnet101": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet101_8xb32_in1k_20210831-539c63f8.pth", - "resnet152": "https://download.openmmlab.com/mmclassification/v0/resnet/resnet152_8xb32_in1k_20210901-4d7582fa.pth", - "resnet50_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d50_b32x8_imagenet_20210531-db14775a.pth", - "resnet101_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d101_b32x8_imagenet_20210531-6e13bcd3.pth", - "resnet152_v1d": "https://download.openmmlab.com/mmclassification/v0/resnet/resnetv1d152_b32x8_imagenet_20210531-278cf22a.pth", - "resnext50_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext50_32x4d_b32x8_imagenet_20210429-56066e27.pth", - "resnext101_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x4d_b32x8_imagenet_20210506-e0fa3dd5.pth", - "resnext101_32x8d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext101_32x8d_b32x8_imagenet_20210506-23a247d5.pth", - "resnext152_32x4d": "https://download.openmmlab.com/mmclassification/v0/resnext/resnext152_32x4d_b32x8_imagenet_20210524-927787be.pth", - "se-resnet50": "https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet50_batch256_imagenet_20200804-ae206104.pth", - "se-resnet101": 
"https://download.openmmlab.com/mmclassification/v0/se-resnet/se-resnet101_batch256_imagenet_20200804-ba5b51d4.pth", - "resnest50": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest50_imagenet_converted-1ebf0afe.pth", - "resnest101": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest101_imagenet_converted-032caa52.pth", - "resnest200": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest200_imagenet_converted-581a60f2.pth", - "resnest269": "https://download.openmmlab.com/mmclassification/v0/resnest/resnest269_imagenet_converted-59930960.pth", - "shufflenet_v1": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v1/shufflenet_v1_batch1024_imagenet_20200804-5d6cec73.pth", - "shufflenet_v2": "https://download.openmmlab.com/mmclassification/v0/shufflenet_v2/shufflenet_v2_batch1024_imagenet_20200812-5bf4721e.pth", - "mobilenet_v2": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v2/mobilenet_v2_batch256_imagenet_20200708-3b2dc3af.pth", - "mobilenet_v3_small": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_small-8427ecf0.pth", - "mobilenet_v3_large": "https://download.openmmlab.com/mmclassification/v0/mobilenet_v3/convert/mobilenet_v3_large-3ea3c186.pth", - "repvgg_A0": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A0_3rdparty_4xb64-coslr-120e_in1k_20210909-883ab98c.pth", - "repvgg_A1": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A1_3rdparty_4xb64-coslr-120e_in1k_20210909-24003a24.pth", - "repvgg_A2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-A2_3rdparty_4xb64-coslr-120e_in1k_20210909-97d7695a.pth", - "repvgg_B0": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B0_3rdparty_4xb64-coslr-120e_in1k_20210909-446375f4.pth", - "repvgg_B1": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1_3rdparty_4xb64-coslr-120e_in1k_20210909-750cdf67.pth", - "repvgg_B1g2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g2_3rdparty_4xb64-coslr-120e_in1k_20210909-344f6422.pth", - "repvgg_B1g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B1g4_3rdparty_4xb64-coslr-120e_in1k_20210909-d4c1a642.pth", - "repvgg_B2": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2_3rdparty_4xb64-coslr-120e_in1k_20210909-bd6b937c.pth", - "repvgg_B2g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B2g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-7b7955f0.pth", - "repvgg_B3": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-dda968bf.pth", - "repvgg_B3g4": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-B3g4_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-4e54846a.pth", - "repvgg_D2se": "https://download.openmmlab.com/mmclassification/v0/repvgg/repvgg-D2se_3rdparty_4xb64-autoaug-lbs-mixup-coslr-200e_in1k_20210909-cf3139b7.pth", - "res2net101_w26": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net101-w26-s4_3rdparty_8xb32_in1k_20210927-870b6c36.pth", - "res2net50_w14": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w14-s8_3rdparty_8xb32_in1k_20210927-bc967bf1.pth", - "res2net50_w26": "https://download.openmmlab.com/mmclassification/v0/res2net/res2net50-w26-s8_3rdparty_8xb32_in1k_20210927-f547a94b.pth", - "swin_tiny": 
"https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_tiny_224_b16x64_300e_imagenet_20210616_090925-66df6be6.pth", - "swin_small": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/swin_small_224_b16x64_300e_imagenet_20210615_110219-7f9d988b.pth", - "swin_base": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_base_patch4_window7_224_22kto1k-f967f799.pth", - "swin_large": "https://download.openmmlab.com/mmclassification/v0/swin-transformer/convert/swin_large_patch4_window7_224_22kto1k-5f0996db.pth", - "t2t_vit_t_14": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-14_3rdparty_8xb64_in1k_20210928-b7c09b62.pth", - "t2t_vit_t_19": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-19_3rdparty_8xb64_in1k_20210928-7f1478d5.pth", - "t2t_vit_t_24": "https://download.openmmlab.com/mmclassification/v0/t2t-vit/t2t-vit-t-24_3rdparty_8xb64_in1k_20210928-fe95a61b.pth", - "tnt_small": "https://download.openmmlab.com/mmclassification/v0/tnt/tnt-small-p16_3rdparty_in1k_20210903-c56ee7df.pth", - "vit_base_p16": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-98e8652b.pth", - "vit_base_p32": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-base-p32_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-9cea8599.pth", - "vit_large_p16": "https://download.openmmlab.com/mmclassification/v0/vit/finetune/vit-large-p16_in21k-pre-3rdparty_ft-64xb64_in1k-384_20210928-b20ba619.pth" -} diff --git a/mmcv/model_zoo/open_mmlab.json b/mmcv/model_zoo/open_mmlab.json deleted file mode 100644 index 8311db4fee..0000000000 --- a/mmcv/model_zoo/open_mmlab.json +++ /dev/null @@ -1,50 +0,0 @@ -{ - "vgg16_caffe": "https://download.openmmlab.com/pretrain/third_party/vgg16_caffe-292e1171.pth", - "detectron/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_caffe-788b5fa3.pth", - "detectron2/resnet50_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet50_msra-5891d200.pth", - "detectron/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_caffe-3ad79236.pth", - "detectron2/resnet101_caffe": "https://download.openmmlab.com/pretrain/third_party/resnet101_msra-6cc46731.pth", - "detectron2/resnext101_32x8d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x8d-1516f1aa.pth", - "resnext50_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext50-32x4d-0ab1a123.pth", - "resnext101_32x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d-a5af3160.pth", - "resnext101_64x4d": "https://download.openmmlab.com/pretrain/third_party/resnext101_64x4d-ee2c6f71.pth", - "contrib/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_thangvubk-ad1730dd.pth", - "detectron/resnet50_gn": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn-9186a21c.pth", - "detectron/resnet101_gn": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn-cac0ab98.pth", - "jhu/resnet50_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet50_gn_ws-15beedd8.pth", - "jhu/resnet101_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnet101_gn_ws-3e3c308c.pth", - "jhu/resnext50_32x4d_gn_ws": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn_ws-0d87ac85.pth", - "jhu/resnext101_32x4d_gn_ws": 
"https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn_ws-34ac1a9e.pth", - "jhu/resnext50_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext50_32x4d_gn-c7e8b754.pth", - "jhu/resnext101_32x4d_gn": "https://download.openmmlab.com/pretrain/third_party/resnext101_32x4d_gn-ac3bb84e.pth", - "msra/hrnetv2_w18_small": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18_small-b5a04e21.pth", - "msra/hrnetv2_w18": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w18-00eb2006.pth", - "msra/hrnetv2_w32": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w32-dc9eeb4f.pth", - "msra/hrnetv2_w40": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w40-ed0b031c.pth", - "msra/hrnetv2_w48": "https://download.openmmlab.com/pretrain/third_party/hrnetv2_w48-d2186c55.pth", - "bninception_caffe": "https://download.openmmlab.com/pretrain/third_party/bn_inception_caffe-ed2e8665.pth", - "kin400/i3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/i3d_r50_f32s2_k400-2c57e077.pth", - "kin400/nl3d_r50_f32s2_k400": "https://download.openmmlab.com/pretrain/third_party/nl3d_r50_f32s2_k400-fa7e7caa.pth", - "res2net101_v1d_26w_4s": "https://download.openmmlab.com/pretrain/third_party/res2net101_v1d_26w_4s_mmdetv2-f0a600f9.pth", - "regnetx_400mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_400mf-a5b10d96.pth", - "regnetx_800mf": "https://download.openmmlab.com/pretrain/third_party/regnetx_800mf-1f4be4c7.pth", - "regnetx_1.6gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_1.6gf-5791c176.pth", - "regnetx_3.2gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_3.2gf-c2599b0f.pth", - "regnetx_4.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_4.0gf-a88f671e.pth", - "regnetx_6.4gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_6.4gf-006af45d.pth", - "regnetx_8.0gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_8.0gf-3c68abe7.pth", - "regnetx_12gf": "https://download.openmmlab.com/pretrain/third_party/regnetx_12gf-4c2a3350.pth", - "resnet18_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet18_v1c-b5776b93.pth", - "resnet50_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet50_v1c-2cccc1ad.pth", - "resnet101_v1c": "https://download.openmmlab.com/pretrain/third_party/resnet101_v1c-e67eebb6.pth", - "mmedit/vgg16": "https://download.openmmlab.com/mmediting/third_party/vgg_state_dict.pth", - "mmedit/res34_en_nomixup": "https://download.openmmlab.com/mmediting/third_party/model_best_resnet34_En_nomixup.pth", - "mmedit/mobilenet_v2": "https://download.openmmlab.com/mmediting/third_party/mobilenet_v2.pth", - "contrib/mobilenet_v3_large": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_large-bc2c3fd3.pth", - "contrib/mobilenet_v3_small": "https://download.openmmlab.com/pretrain/third_party/mobilenet_v3_small-47085aa1.pth", - "resnest50": "https://download.openmmlab.com/pretrain/third_party/resnest50_d2-7497a55b.pth", - "resnest101": "https://download.openmmlab.com/pretrain/third_party/resnest101_d2-f3b931b2.pth", - "resnest200": "https://download.openmmlab.com/pretrain/third_party/resnest200_d2-ca88e41f.pth", - "darknet53": "https://download.openmmlab.com/pretrain/third_party/darknet53-a628ea1b.pth", - "mmdet/mobilenet_v2": "https://download.openmmlab.com/mmdetection/v2.0/third_party/mobilenet_v2_batch256_imagenet-ff34753d.pth" -} diff --git a/mmcv/model_zoo/torchvision_0.12.json 
b/mmcv/model_zoo/torchvision_0.12.json deleted file mode 100644 index 06defe6748..0000000000 --- a/mmcv/model_zoo/torchvision_0.12.json +++ /dev/null @@ -1,57 +0,0 @@ -{ - "alexnet": "https://download.pytorch.org/models/alexnet-owt-7be5be79.pth", - "densenet121": "https://download.pytorch.org/models/densenet121-a639ec97.pth", - "densenet169": "https://download.pytorch.org/models/densenet169-b2777c0a.pth", - "densenet201": "https://download.pytorch.org/models/densenet201-c1103571.pth", - "densenet161": "https://download.pytorch.org/models/densenet161-8d451a50.pth", - "efficientnet_b0": "https://download.pytorch.org/models/efficientnet_b0_rwightman-3dd342df.pth", - "efficientnet_b1": "https://download.pytorch.org/models/efficientnet_b1_rwightman-533bc792.pth", - "efficientnet_b2": "https://download.pytorch.org/models/efficientnet_b2_rwightman-bcdf34b7.pth", - "efficientnet_b3": "https://download.pytorch.org/models/efficientnet_b3_rwightman-cf984f9c.pth", - "efficientnet_b4": "https://download.pytorch.org/models/efficientnet_b4_rwightman-7eb33cd5.pth", - "efficientnet_b5": "https://download.pytorch.org/models/efficientnet_b5_lukemelas-b6417697.pth", - "efficientnet_b6": "https://download.pytorch.org/models/efficientnet_b6_lukemelas-c76e70fd.pth", - "efficientnet_b7": "https://download.pytorch.org/models/efficientnet_b7_lukemelas-dcc49843.pth", - "googlenet": "https://download.pytorch.org/models/googlenet-1378be20.pth", - "inception_v3_google": "https://download.pytorch.org/models/inception_v3_google-0cc3c7bd.pth", - "mobilenet_v2": "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth", - "mobilenet_v3_large": "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth", - "mobilenet_v3_small": "https://download.pytorch.org/models/mobilenet_v3_small-047dcff4.pth", - "regnet_y_400mf": "https://download.pytorch.org/models/regnet_y_400mf-c65dace8.pth", - "regnet_y_800mf": "https://download.pytorch.org/models/regnet_y_800mf-1b27b58c.pth", - "regnet_y_1_6gf": "https://download.pytorch.org/models/regnet_y_1_6gf-b11a554e.pth", - "regnet_y_3_2gf": "https://download.pytorch.org/models/regnet_y_3_2gf-b5a9779c.pth", - "regnet_y_8gf": "https://download.pytorch.org/models/regnet_y_8gf-d0d0e4a8.pth", - "regnet_y_16gf": "https://download.pytorch.org/models/regnet_y_16gf-9e6ed7dd.pth", - "regnet_y_32gf": "https://download.pytorch.org/models/regnet_y_32gf-4dee3f7a.pth", - "regnet_x_400mf": "https://download.pytorch.org/models/regnet_x_400mf-adf1edd5.pth", - "regnet_x_800mf": "https://download.pytorch.org/models/regnet_x_800mf-ad17e45c.pth", - "regnet_x_1_6gf": "https://download.pytorch.org/models/regnet_x_1_6gf-e3633e7f.pth", - "regnet_x_3_2gf": "https://download.pytorch.org/models/regnet_x_3_2gf-f342aeae.pth", - "regnet_x_8gf": "https://download.pytorch.org/models/regnet_x_8gf-03ceed89.pth", - "regnet_x_16gf": "https://download.pytorch.org/models/regnet_x_16gf-2007eb11.pth", - "regnet_x_32gf": "https://download.pytorch.org/models/regnet_x_32gf-9d47f8d0.pth", - "resnet18": "https://download.pytorch.org/models/resnet18-f37072fd.pth", - "resnet34": "https://download.pytorch.org/models/resnet34-b627a593.pth", - "resnet50": "https://download.pytorch.org/models/resnet50-0676ba61.pth", - "resnet101": "https://download.pytorch.org/models/resnet101-63fe2227.pth", - "resnet152": "https://download.pytorch.org/models/resnet152-394f9c45.pth", - "resnext50_32x4d": "https://download.pytorch.org/models/resnext50_32x4d-7cdf4587.pth", - "resnext101_32x8d": 
"https://download.pytorch.org/models/resnext101_32x8d-8ba56ff5.pth", - "wide_resnet50_2": "https://download.pytorch.org/models/wide_resnet50_2-95faca4d.pth", - "wide_resnet101_2": "https://download.pytorch.org/models/wide_resnet101_2-32ee1156.pth", - "shufflenetv2_x0.5": "https://download.pytorch.org/models/shufflenetv2_x0.5-f707e7126e.pth", - "shufflenetv2_x1.0": "https://download.pytorch.org/models/shufflenetv2_x1-5666bf0f80.pth", - "shufflenetv2_x1.5": null, - "shufflenetv2_x2.0": null, - "squeezenet1_0": "https://download.pytorch.org/models/squeezenet1_0-b66bff10.pth", - "squeezenet1_1": "https://download.pytorch.org/models/squeezenet1_1-b8a52dc0.pth", - "vgg11": "https://download.pytorch.org/models/vgg11-8a719046.pth", - "vgg13": "https://download.pytorch.org/models/vgg13-19584684.pth", - "vgg16": "https://download.pytorch.org/models/vgg16-397923af.pth", - "vgg19": "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth", - "vgg11_bn": "https://download.pytorch.org/models/vgg11_bn-6002323d.pth", - "vgg13_bn": "https://download.pytorch.org/models/vgg13_bn-abd245e5.pth", - "vgg16_bn": "https://download.pytorch.org/models/vgg16_bn-6c64b313.pth", - "vgg19_bn": "https://download.pytorch.org/models/vgg19_bn-c79401a0.pth" -} diff --git a/mmcv/ops/points_sampler.py b/mmcv/ops/points_sampler.py index e1fd376051..f321195f57 100644 --- a/mmcv/ops/points_sampler.py +++ b/mmcv/ops/points_sampler.py @@ -4,7 +4,6 @@ from torch import Tensor from torch import nn as nn -from mmcv.runner import force_fp32 from .furthest_point_sample import (furthest_point_sample, furthest_point_sample_with_dist) @@ -91,7 +90,6 @@ def __init__(self, self.samplers.append(get_sampler_cls(fps_mod)()) self.fp16_enabled = False - @force_fp32() def forward(self, points_xyz: Tensor, features: Tensor) -> Tensor: """ Args: @@ -102,6 +100,11 @@ def forward(self, points_xyz: Tensor, features: Tensor) -> Tensor: Returns: torch.Tensor: (B, npoint, sample_num) Indices of sampled points. """ + if points_xyz.dtype == torch.half: + points_xyz = points_xyz.to(torch.float32) + if features.dtype == torch.half: + features = features.to(torch.float32) + indices = [] last_fps_end_index = 0 for fps_sample_range, sampler, npoint in zip( diff --git a/mmcv/parallel/__init__.py b/mmcv/parallel/__init__.py deleted file mode 100644 index 2ed2c17ad3..0000000000 --- a/mmcv/parallel/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .collate import collate -from .data_container import DataContainer -from .data_parallel import MMDataParallel -from .distributed import MMDistributedDataParallel -from .registry import MODULE_WRAPPERS -from .scatter_gather import scatter, scatter_kwargs -from .utils import is_module_wrapper - -__all__ = [ - 'collate', 'DataContainer', 'MMDataParallel', 'MMDistributedDataParallel', - 'scatter', 'scatter_kwargs', 'is_module_wrapper', 'MODULE_WRAPPERS' -] diff --git a/mmcv/parallel/_functions.py b/mmcv/parallel/_functions.py deleted file mode 100644 index 43580b46f9..0000000000 --- a/mmcv/parallel/_functions.py +++ /dev/null @@ -1,82 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from typing import List, Optional, Union - -import torch -from torch import Tensor -from torch.nn.parallel._functions import _get_stream - - -def scatter(input: Union[List, Tensor], - devices: List, - streams: Optional[List] = None) -> Union[List, Tensor]: - """Scatters tensor across multiple GPUs.""" - if streams is None: - streams = [None] * len(devices) - - if isinstance(input, list): - chunk_size = (len(input) - 1) // len(devices) + 1 - outputs = [ - scatter(input[i], [devices[i // chunk_size]], - [streams[i // chunk_size]]) for i in range(len(input)) - ] - return outputs - elif isinstance(input, Tensor): - output = input.contiguous() - # TODO: copy to a pinned buffer first (if copying from CPU) - stream = streams[0] if output.numel() > 0 else None - if devices != [-1]: - with torch.cuda.device(devices[0]), torch.cuda.stream(stream): - output = output.cuda(devices[0], non_blocking=True) - - return output - else: - raise Exception(f'Unknown type {type(input)}.') - - -def synchronize_stream(output: Union[List, Tensor], devices: List, - streams: List) -> None: - if isinstance(output, list): - chunk_size = len(output) // len(devices) - for i in range(len(devices)): - for j in range(chunk_size): - synchronize_stream(output[i * chunk_size + j], [devices[i]], - [streams[i]]) - elif isinstance(output, Tensor): - if output.numel() != 0: - with torch.cuda.device(devices[0]): - main_stream = torch.cuda.current_stream() - main_stream.wait_stream(streams[0]) - output.record_stream(main_stream) - else: - raise Exception(f'Unknown type {type(output)}.') - - -def get_input_device(input: Union[List, Tensor]) -> int: - if isinstance(input, list): - for item in input: - input_device = get_input_device(item) - if input_device != -1: - return input_device - return -1 - elif isinstance(input, Tensor): - return input.get_device() if input.is_cuda else -1 - else: - raise Exception(f'Unknown type {type(input)}.') - - -class Scatter: - - @staticmethod - def forward(target_gpus: List[int], input: Union[List, Tensor]) -> tuple: - input_device = get_input_device(input) - streams = None - if input_device == -1 and target_gpus != [-1]: - # Perform CPU to GPU copies in a background stream - streams = [_get_stream(device) for device in target_gpus] - - outputs = scatter(input, target_gpus, streams) - # Synchronize with the copy stream - if streams is not None: - synchronize_stream(outputs, target_gpus, streams) - - return tuple(outputs) if isinstance(outputs, list) else (outputs, ) diff --git a/mmcv/parallel/collate.py b/mmcv/parallel/collate.py deleted file mode 100644 index 50c408bedc..0000000000 --- a/mmcv/parallel/collate.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from collections.abc import Mapping, Sequence - -import torch -import torch.nn.functional as F -from torch.utils.data.dataloader import default_collate - -from .data_container import DataContainer - - -def collate(batch: Sequence, samples_per_gpu: int = 1): - """Puts each data field into a tensor/DataContainer with outer dimension - batch size. - - Extend default_collate to add support for - :type:`~mmcv.parallel.DataContainer`. There are 3 cases. - - 1. cpu_only = True, e.g., meta data - 2. cpu_only = False, stack = True, e.g., images tensors - 3. 
cpu_only = False, stack = False, e.g., gt bboxes - """ - - if not isinstance(batch, Sequence): - raise TypeError(f'{batch.dtype} is not supported.') - - if isinstance(batch[0], DataContainer): - stacked = [] - if batch[0].cpu_only: - for i in range(0, len(batch), samples_per_gpu): - stacked.append( - [sample.data for sample in batch[i:i + samples_per_gpu]]) - return DataContainer( - stacked, batch[0].stack, batch[0].padding_value, cpu_only=True) - elif batch[0].stack: - for i in range(0, len(batch), samples_per_gpu): - assert isinstance(batch[i].data, torch.Tensor) - - if batch[i].pad_dims is not None: - ndim = batch[i].dim() - assert ndim > batch[i].pad_dims - max_shape = [0 for _ in range(batch[i].pad_dims)] - for dim in range(1, batch[i].pad_dims + 1): - max_shape[dim - 1] = batch[i].size(-dim) - for sample in batch[i:i + samples_per_gpu]: - for dim in range(0, ndim - batch[i].pad_dims): - assert batch[i].size(dim) == sample.size(dim) - for dim in range(1, batch[i].pad_dims + 1): - max_shape[dim - 1] = max(max_shape[dim - 1], - sample.size(-dim)) - padded_samples = [] - for sample in batch[i:i + samples_per_gpu]: - pad = [0 for _ in range(batch[i].pad_dims * 2)] - for dim in range(1, batch[i].pad_dims + 1): - pad[2 * dim - - 1] = max_shape[dim - 1] - sample.size(-dim) - padded_samples.append( - F.pad( - sample.data, pad, value=sample.padding_value)) - stacked.append(default_collate(padded_samples)) - elif batch[i].pad_dims is None: - stacked.append( - default_collate([ - sample.data - for sample in batch[i:i + samples_per_gpu] - ])) - else: - raise ValueError( - 'pad_dims should be either None or integers (1-3)') - - else: - for i in range(0, len(batch), samples_per_gpu): - stacked.append( - [sample.data for sample in batch[i:i + samples_per_gpu]]) - return DataContainer(stacked, batch[0].stack, batch[0].padding_value) - elif isinstance(batch[0], Sequence): - transposed = zip(*batch) - return [collate(samples, samples_per_gpu) for samples in transposed] - elif isinstance(batch[0], Mapping): - return { - key: collate([d[key] for d in batch], samples_per_gpu) - for key in batch[0] - } - else: - return default_collate(batch) diff --git a/mmcv/parallel/data_container.py b/mmcv/parallel/data_container.py deleted file mode 100644 index 62f2573110..0000000000 --- a/mmcv/parallel/data_container.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import functools -from typing import Callable, Type, Union - -import numpy as np -import torch - - -def assert_tensor_type(func: Callable) -> Callable: - - @functools.wraps(func) - def wrapper(*args, **kwargs): - if not isinstance(args[0].data, torch.Tensor): - raise AttributeError( - f'{args[0].__class__.__name__} has no attribute ' - f'{func.__name__} for type {args[0].datatype}') - return func(*args, **kwargs) - - return wrapper - - -class DataContainer: - """A container for any type of objects. - - Typically tensors will be stacked in the collate function and sliced along - some dimension in the scatter function. This behavior has some limitations. - 1. All tensors have to be the same size. - 2. Types are limited (numpy array or Tensor). - - We design `DataContainer` and `MMDataParallel` to overcome these - limitations. The behavior can be either of the following. 
-
-    - copy to GPU, pad all tensors to the same size and stack them
-    - copy to GPU without stacking
-    - leave the objects as-is and pass them to the model
-
-    pad_dims specifies the number of trailing dimensions to pad
-    """
-
-    def __init__(self,
-                 data: Union[torch.Tensor, np.ndarray],
-                 stack: bool = False,
-                 padding_value: int = 0,
-                 cpu_only: bool = False,
-                 pad_dims: int = 2):
-        self._data = data
-        self._cpu_only = cpu_only
-        self._stack = stack
-        self._padding_value = padding_value
-        assert pad_dims in [None, 1, 2, 3]
-        self._pad_dims = pad_dims
-
-    def __repr__(self) -> str:
-        return f'{self.__class__.__name__}({repr(self.data)})'
-
-    def __len__(self) -> int:
-        return len(self._data)
-
-    @property
-    def data(self) -> Union[torch.Tensor, np.ndarray]:
-        return self._data
-
-    @property
-    def datatype(self) -> Union[Type, str]:
-        if isinstance(self.data, torch.Tensor):
-            return self.data.type()
-        else:
-            return type(self.data)
-
-    @property
-    def cpu_only(self) -> bool:
-        return self._cpu_only
-
-    @property
-    def stack(self) -> bool:
-        return self._stack
-
-    @property
-    def padding_value(self) -> int:
-        return self._padding_value
-
-    @property
-    def pad_dims(self) -> int:
-        return self._pad_dims
-
-    @assert_tensor_type
-    def size(self, *args, **kwargs) -> torch.Size:
-        return self.data.size(*args, **kwargs)
-
-    @assert_tensor_type
-    def dim(self) -> int:
-        return self.data.dim()
diff --git a/mmcv/parallel/data_parallel.py b/mmcv/parallel/data_parallel.py
deleted file mode 100644
index eea088fa0c..0000000000
--- a/mmcv/parallel/data_parallel.py
+++ /dev/null
@@ -1,99 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from itertools import chain
-from typing import List, Tuple
-
-from torch.nn.parallel import DataParallel
-
-from .scatter_gather import ScatterInputs, scatter_kwargs
-
-
-class MMDataParallel(DataParallel):
-    """The DataParallel module that supports DataContainer.
-
-    MMDataParallel has two main differences from PyTorch DataParallel:
-
-    - It supports a custom type :class:`DataContainer` which allows more
-      flexible control of input data during both GPU and CPU inference.
-    - It implements two more APIs ``train_step()`` and ``val_step()``.
-
-    .. warning::
-        MMDataParallel only supports single GPU training. If you need to
-        train with multiple GPUs, please use MMDistributedDataParallel
-        instead. If you have multiple GPUs and you just want to use
-        MMDataParallel, you can set the environment variable
-        ``CUDA_VISIBLE_DEVICES=0`` or instantiate ``MMDataParallel`` with
-        ``device_ids=[0]``.
-
-    Args:
-        module (:class:`nn.Module`): Module to be encapsulated.
-        device_ids (list[int]): Device IDs of modules to be scattered to.
-            Defaults to None when GPU is not available.
-        output_device (str | int): Device ID for output. Defaults to None.
-        dim (int): Dimension used to scatter the data. Defaults to 0.
-    """
-
-    def __init__(self, *args, dim: int = 0, **kwargs):
-        super().__init__(*args, dim=dim, **kwargs)
-        self.dim = dim
-
-    def forward(self, *inputs, **kwargs):
-        """Override the original forward function.
-
-        The main difference lies in the CPU inference where the data in
-        :class:`DataContainers` will still be gathered.
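Since DataContainer only shows its effect through collate, a short, hedged sketch may help while reviewing the removal; the tensor sizes are illustrative and it assumes the pre-removal mmcv.parallel exports.

# Hedged sketch of DataContainer + collate on the pre-removal mmcv.
import torch

from mmcv.parallel import DataContainer, collate

batch = [
    dict(img=DataContainer(torch.randn(3, 10, 12), stack=True, pad_dims=2)),
    dict(img=DataContainer(torch.randn(3, 9, 15), stack=True, pad_dims=2)),
]
out = collate(batch, samples_per_gpu=2)
# Both samples are zero-padded to (3, 10, 15) and stacked per GPU chunk.
print(out['img'].data[0].shape)  # torch.Size([2, 3, 10, 15])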
- """ - if not self.device_ids: - # We add the following line thus the module could gather and - # convert data containers as those in GPU inference - inputs, kwargs = self.scatter(inputs, kwargs, [-1]) - return self.module(*inputs[0], **kwargs[0]) - else: - return super().forward(*inputs, **kwargs) - - def scatter(self, inputs: ScatterInputs, kwargs: ScatterInputs, - device_ids: List[int]) -> Tuple[tuple, tuple]: - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def train_step(self, *inputs, **kwargs): - if not self.device_ids: - # We add the following line thus the module could gather and - # convert data containers as those in GPU inference - inputs, kwargs = self.scatter(inputs, kwargs, [-1]) - return self.module.train_step(*inputs[0], **kwargs[0]) - - assert len(self.device_ids) == 1, \ - ('MMDataParallel only supports single GPU training, if you need to' - ' train with multiple GPUs, please use MMDistributedDataParallel' - ' instead.') - - for t in chain(self.module.parameters(), self.module.buffers()): - if t.device != self.src_device_obj: - raise RuntimeError( - 'module must have its parameters and buffers ' - f'on device {self.src_device_obj} (device_ids[0]) but ' - f'found one of them on device: {t.device}') - - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - return self.module.train_step(*inputs[0], **kwargs[0]) - - def val_step(self, *inputs, **kwargs): - if not self.device_ids: - # We add the following line thus the module could gather and - # convert data containers as those in GPU inference - inputs, kwargs = self.scatter(inputs, kwargs, [-1]) - return self.module.val_step(*inputs[0], **kwargs[0]) - - assert len(self.device_ids) == 1, \ - ('MMDataParallel only supports single GPU training, if you need to' - ' train with multiple GPUs, please use MMDistributedDataParallel' - ' instead.') - - for t in chain(self.module.parameters(), self.module.buffers()): - if t.device != self.src_device_obj: - raise RuntimeError( - 'module must have its parameters and buffers ' - f'on device {self.src_device_obj} (device_ids[0]) but ' - f'found one of them on device: {t.device}') - - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - return self.module.val_step(*inputs[0], **kwargs[0]) diff --git a/mmcv/parallel/distributed.py b/mmcv/parallel/distributed.py deleted file mode 100644 index bf34cb5906..0000000000 --- a/mmcv/parallel/distributed.py +++ /dev/null @@ -1,167 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Any, List, Tuple - -import torch -from torch.nn.parallel.distributed import (DistributedDataParallel, - _find_tensors) - -from mmcv import print_log -from mmcv.utils import TORCH_VERSION, digit_version -from .scatter_gather import ScatterInputs, scatter_kwargs - - -class MMDistributedDataParallel(DistributedDataParallel): - """The DDP module that supports DataContainer. - - MMDDP has two main differences with PyTorch DDP: - - - It supports a custom type :class:`DataContainer` which allows more - flexible control of input data. - - It implement two APIs ``train_step()`` and ``val_step()``. 
- """ - - def to_kwargs(self, inputs: ScatterInputs, kwargs: ScatterInputs, - device_id: int) -> Tuple[tuple, tuple]: - # Use `self.to_kwargs` instead of `self.scatter` in pytorch1.8 - # to move all tensors to device_id - return scatter_kwargs(inputs, kwargs, [device_id], dim=self.dim) - - def scatter(self, inputs: ScatterInputs, kwargs: ScatterInputs, - device_ids: List[int]) -> Tuple[tuple, tuple]: - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def train_step(self, *inputs, **kwargs): - """train_step() API for module wrapped by DistributedDataParallel. - - This method is basically the same as - ``DistributedDataParallel.forward()``, while replacing - ``self.module.forward()`` with ``self.module.train_step()``. - It is compatible with PyTorch 1.1 - 1.5. - """ - - # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the - # end of backward to the beginning of forward. - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) >= digit_version('1.7') - and self.reducer._rebuild_buckets()): - print_log( - 'Reducer buckets have been rebuilt in this iteration.', - logger='mmcv') - - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')): - if self._check_sync_bufs_pre_fwd(): - self._sync_buffers() - else: - if (getattr(self, 'require_forward_param_sync', False) - and self.require_forward_param_sync): - self._sync_params() - - if self.device_ids: - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - if len(self.device_ids) == 1: - output = self.module.train_step(*inputs[0], **kwargs[0]) - else: - outputs = self.parallel_apply( - self._module_copies[:len(inputs)], inputs, kwargs) - output = self.gather(outputs, self.output_device) - else: - output = self.module.train_step(*inputs, **kwargs) - - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')): - if self._check_sync_bufs_post_fwd(): - self._sync_buffers() - - if (torch.is_grad_enabled() - and getattr(self, 'require_backward_grad_sync', False) - and self.require_backward_grad_sync): - if self.find_unused_parameters: - self.reducer.prepare_for_backward(list(_find_tensors(output))) - else: - self.reducer.prepare_for_backward([]) - else: - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) > digit_version('1.2')): - self.require_forward_param_sync = False - return output - - def val_step(self, *inputs, **kwargs): - """val_step() API for module wrapped by DistributedDataParallel. - - This method is basically the same as - ``DistributedDataParallel.forward()``, while replacing - ``self.module.forward()`` with ``self.module.val_step()``. - It is compatible with PyTorch 1.1 - 1.5. - """ - # In PyTorch >= 1.7, ``reducer._rebuild_buckets()`` is moved from the - # end of backward to the beginning of forward. 
- if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) >= digit_version('1.7') - and self.reducer._rebuild_buckets()): - print_log( - 'Reducer buckets have been rebuilt in this iteration.', - logger='mmcv') - - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')): - if self._check_sync_bufs_pre_fwd(): - self._sync_buffers() - else: - if (getattr(self, 'require_forward_param_sync', False) - and self.require_forward_param_sync): - self._sync_params() - - if self.device_ids: - inputs, kwargs = self.scatter(inputs, kwargs, self.device_ids) - if len(self.device_ids) == 1: - output = self.module.val_step(*inputs[0], **kwargs[0]) - else: - outputs = self.parallel_apply( - self._module_copies[:len(inputs)], inputs, kwargs) - output = self.gather(outputs, self.output_device) - else: - output = self.module.val_step(*inputs, **kwargs) - - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) >= digit_version('1.11.0a0')): - if self._check_sync_bufs_post_fwd(): - self._sync_buffers() - - if (torch.is_grad_enabled() - and getattr(self, 'require_backward_grad_sync', False) - and self.require_backward_grad_sync): - if self.find_unused_parameters: - self.reducer.prepare_for_backward(list(_find_tensors(output))) - else: - self.reducer.prepare_for_backward([]) - else: - if ('parrots' not in TORCH_VERSION - and digit_version(TORCH_VERSION) > digit_version('1.2')): - self.require_forward_param_sync = False - return output - - def _run_ddp_forward(self, *inputs, **kwargs) -> Any: - """Processes inputs and runs ``self.module.forward``. - - Pytorch 1.12.0 performs ``self.module.forward`` in ``_run_ddp_forward`` - and deprecates using ``DistributedDataParallel.to_kwargs`` to - process inputs, which leads to inputs cannot be processed by - :meth:`MMDistributedDataParallel.to_kwargs` anymore. Therefore, - ``MMDistributedDataParallel`` overrides this method to call - :meth:`to_kwargs` explicitly. - - See more information in ``_. # noqa: E501 - - Returns: - Any: Forward result of :attr:`module`. - """ - module_to_run = self._replicated_tensor_module if \ - self._use_replicated_tensor_module else self.module - - if self.device_ids: - inputs, kwargs = self.to_kwargs( # type: ignore - inputs, kwargs, self.device_ids[0]) - return module_to_run(*inputs[0], **kwargs[0]) # type: ignore - else: - return module_to_run(*inputs, **kwargs) diff --git a/mmcv/parallel/distributed_deprecated.py b/mmcv/parallel/distributed_deprecated.py deleted file mode 100644 index 21b6c4ec15..0000000000 --- a/mmcv/parallel/distributed_deprecated.py +++ /dev/null @@ -1,74 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
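-# (editor's note) this module keeps the deprecated MMDistributedDataParallel
-# implementation; the maintained version lives in mmcv/parallel/distributed.py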
-from typing import List, Sequence, Tuple - -import torch -import torch.distributed as dist -import torch.nn as nn -from torch._utils import (_flatten_dense_tensors, _take_tensors, - _unflatten_dense_tensors) - -from mmcv.utils import TORCH_VERSION, digit_version -from .registry import MODULE_WRAPPERS -from .scatter_gather import ScatterInputs, scatter_kwargs - - -@MODULE_WRAPPERS.register_module() -class MMDistributedDataParallel(nn.Module): - - def __init__(self, - module: nn.Module, - dim: int = 0, - broadcast_buffers: bool = True, - bucket_cap_mb: int = 25): - super().__init__() - self.module = module - self.dim = dim - self.broadcast_buffers = broadcast_buffers - - self.broadcast_bucket_size = bucket_cap_mb * 1024 * 1024 - self._sync_params() - - def _dist_broadcast_coalesced(self, tensors: Sequence[torch.Tensor], - buffer_size: int) -> None: - for tensors in _take_tensors(tensors, buffer_size): - flat_tensors = _flatten_dense_tensors(tensors) - dist.broadcast(flat_tensors, 0) - for tensor, synced in zip( - tensors, _unflatten_dense_tensors(flat_tensors, tensors)): - tensor.copy_(synced) - - def _sync_params(self) -> None: - module_states = list(self.module.state_dict().values()) - if len(module_states) > 0: - self._dist_broadcast_coalesced(module_states, - self.broadcast_bucket_size) - if self.broadcast_buffers: - if (TORCH_VERSION != 'parrots' - and digit_version(TORCH_VERSION) < digit_version('1.0')): - buffers = [b.data for b in self.module._all_buffers()] - else: - buffers = [b.data for b in self.module.buffers()] - if len(buffers) > 0: - self._dist_broadcast_coalesced(buffers, - self.broadcast_bucket_size) - - def scatter(self, inputs: ScatterInputs, kwargs: ScatterInputs, - device_ids: List[int]) -> Tuple[tuple, tuple]: - return scatter_kwargs(inputs, kwargs, device_ids, dim=self.dim) - - def forward(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, - [torch.cuda.current_device()]) - return self.module(*inputs[0], **kwargs[0]) - - def train_step(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, - [torch.cuda.current_device()]) - output = self.module.train_step(*inputs[0], **kwargs[0]) - return output - - def val_step(self, *inputs, **kwargs): - inputs, kwargs = self.scatter(inputs, kwargs, - [torch.cuda.current_device()]) - output = self.module.val_step(*inputs[0], **kwargs[0]) - return output diff --git a/mmcv/parallel/registry.py b/mmcv/parallel/registry.py deleted file mode 100644 index 144f9fb168..0000000000 --- a/mmcv/parallel/registry.py +++ /dev/null @@ -1,8 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from torch.nn.parallel import DataParallel, DistributedDataParallel - -from mmcv.utils import Registry - -MODULE_WRAPPERS = Registry('module wrapper') -MODULE_WRAPPERS.register_module(module=DataParallel) -MODULE_WRAPPERS.register_module(module=DistributedDataParallel) diff --git a/mmcv/parallel/scatter_gather.py b/mmcv/parallel/scatter_gather.py deleted file mode 100644 index 3133b253c9..0000000000 --- a/mmcv/parallel/scatter_gather.py +++ /dev/null @@ -1,70 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import List, Tuple, Union - -from torch import Tensor -from torch.nn.parallel._functions import Scatter as OrigScatter - -from ._functions import Scatter -from .data_container import DataContainer - -ScatterInputs = Union[Tensor, DataContainer, tuple, list, dict] - - -def scatter(inputs: ScatterInputs, - target_gpus: List[int], - dim: int = 0) -> list: - """Scatter inputs to target gpus. 
- - The only difference from original :func:`scatter` is to add support for - :type:`~mmcv.parallel.DataContainer`. - """ - - def scatter_map(obj): - if isinstance(obj, Tensor): - if target_gpus != [-1]: - return OrigScatter.apply(target_gpus, None, dim, obj) - else: - # for CPU inference we use self-implemented scatter - return Scatter.forward(target_gpus, obj) - if isinstance(obj, DataContainer): - if obj.cpu_only: - return obj.data - else: - return Scatter.forward(target_gpus, obj.data) - if isinstance(obj, tuple) and len(obj) > 0: - return list(zip(*map(scatter_map, obj))) - if isinstance(obj, list) and len(obj) > 0: - out = list(map(list, zip(*map(scatter_map, obj)))) - return out - if isinstance(obj, dict) and len(obj) > 0: - out = list(map(type(obj), zip(*map(scatter_map, obj.items())))) - return out - return [obj for _ in target_gpus] - - # After scatter_map is called, a scatter_map cell will exist. This cell - # has a reference to the actual function scatter_map, which has references - # to a closure that has a reference to the scatter_map cell (because the - # fn is recursive). To avoid this reference cycle, we set the function to - # None, clearing the cell - try: - return scatter_map(inputs) - finally: - scatter_map = None # type: ignore - - -def scatter_kwargs(inputs: ScatterInputs, - kwargs: ScatterInputs, - target_gpus: List[int], - dim: int = 0) -> Tuple[tuple, tuple]: - """Scatter with support for kwargs dictionary.""" - inputs = scatter(inputs, target_gpus, dim) if inputs else [] - kwargs = scatter(kwargs, target_gpus, dim) if kwargs else [] - if len(inputs) < len(kwargs): - length = len(kwargs) - len(inputs) - inputs.extend([() for _ in range(length)]) # type: ignore - elif len(kwargs) < len(inputs): - length = len(inputs) - len(kwargs) - kwargs.extend([{} for _ in range(length)]) # type: ignore - inputs = tuple(inputs) - kwargs = tuple(kwargs) - return inputs, kwargs diff --git a/mmcv/parallel/utils.py b/mmcv/parallel/utils.py deleted file mode 100644 index bd52622b1b..0000000000 --- a/mmcv/parallel/utils.py +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from torch import nn - -from .registry import MODULE_WRAPPERS - - -def is_module_wrapper(module: nn.Module) -> bool: - """Check if a module is a module wrapper. - - The following 3 modules in MMCV (and their subclasses) are regarded as - module wrappers: DataParallel, DistributedDataParallel, - MMDistributedDataParallel (the deprecated version). You may add you own - module wrapper by registering it to mmcv.parallel.MODULE_WRAPPERS or - its children registries. - - Args: - module (nn.Module): The module to be checked. - - Returns: - bool: True if the input module is a module wrapper. - """ - - def is_module_in_wrapper(module, module_wrapper): - module_wrappers = tuple(module_wrapper.module_dict.values()) - if isinstance(module, module_wrappers): - return True - for child in module_wrapper.children.values(): - if is_module_in_wrapper(module, child): - return True - return False - - return is_module_in_wrapper(module, MODULE_WRAPPERS) diff --git a/mmcv/runner/__init__.py b/mmcv/runner/__init__.py deleted file mode 100644 index 5c4e2a5dfc..0000000000 --- a/mmcv/runner/__init__.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .base_runner import BaseRunner
-from .builder import RUNNERS, build_runner
-from .checkpoint import (CheckpointLoader, _load_checkpoint,
-                         _load_checkpoint_with_prefix, load_checkpoint,
-                         load_state_dict, save_checkpoint, weights_to_cpu)
-from .default_constructor import DefaultRunnerConstructor
-from .dist_utils import (allreduce_grads, allreduce_params, get_dist_info,
-                         init_dist, master_only)
-from .epoch_based_runner import EpochBasedRunner, Runner
-from .fp16_utils import LossScaler, auto_fp16, force_fp32, wrap_fp16_model
-from .hooks import (HOOKS, CheckpointHook, ClearMLLoggerHook, ClosureHook,
-                    DistEvalHook, DistSamplerSeedHook, DvcliveLoggerHook,
-                    EMAHook, EvalHook, Fp16OptimizerHook,
-                    GradientCumulativeFp16OptimizerHook,
-                    GradientCumulativeOptimizerHook, Hook, IterTimerHook,
-                    LoggerHook, MlflowLoggerHook, NeptuneLoggerHook,
-                    OptimizerHook, PaviLoggerHook, SegmindLoggerHook,
-                    SyncBuffersHook, TensorboardLoggerHook, TextLoggerHook,
-                    WandbLoggerHook)
-from .hooks.lr_updater import StepLrUpdaterHook  # noqa
-from .hooks.lr_updater import (CosineAnnealingLrUpdaterHook,
-                               CosineRestartLrUpdaterHook, CyclicLrUpdaterHook,
-                               ExpLrUpdaterHook, FixedLrUpdaterHook,
-                               FlatCosineAnnealingLrUpdaterHook,
-                               InvLrUpdaterHook, LinearAnnealingLrUpdaterHook,
-                               LrUpdaterHook, OneCycleLrUpdaterHook,
-                               PolyLrUpdaterHook)
-from .hooks.momentum_updater import (CosineAnnealingMomentumUpdaterHook,
-                                     CyclicMomentumUpdaterHook,
-                                     LinearAnnealingMomentumUpdaterHook,
-                                     MomentumUpdaterHook,
-                                     OneCycleMomentumUpdaterHook,
-                                     StepMomentumUpdaterHook)
-from .iter_based_runner import IterBasedRunner, IterLoader
-from .log_buffer import LogBuffer
-from .optimizer import (OPTIMIZER_BUILDERS, OPTIMIZERS,
-                        DefaultOptimizerConstructor, build_optimizer,
-                        build_optimizer_constructor)
-from .priority import Priority, get_priority
-from .utils import get_host_info, get_time_str, obj_from_dict, set_random_seed
-
-# initialize ipu to register the ipu runner to RUNNERS
-from mmcv.device import ipu  # isort:skip # noqa
-
-__all__ = [
-    'BaseRunner', 'Runner', 'EpochBasedRunner', 'IterBasedRunner', 'LogBuffer',
-    'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook',
-    'FixedLrUpdaterHook', 'StepLrUpdaterHook', 'ExpLrUpdaterHook',
-    'PolyLrUpdaterHook', 'InvLrUpdaterHook', 'CosineAnnealingLrUpdaterHook',
-    'FlatCosineAnnealingLrUpdaterHook', 'CosineRestartLrUpdaterHook',
-    'CyclicLrUpdaterHook', 'OneCycleLrUpdaterHook', 'MomentumUpdaterHook',
-    'StepMomentumUpdaterHook', 'CosineAnnealingMomentumUpdaterHook',
-    'CyclicMomentumUpdaterHook', 'OneCycleMomentumUpdaterHook',
-    'OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', 'LoggerHook',
-    'PaviLoggerHook', 'TextLoggerHook', 'TensorboardLoggerHook',
-    'NeptuneLoggerHook', 'WandbLoggerHook', 'MlflowLoggerHook',
-    'DvcliveLoggerHook', '_load_checkpoint', 'load_state_dict',
-    'load_checkpoint', 'weights_to_cpu', 'save_checkpoint', 'Priority',
-    'get_priority', 'get_host_info', 'get_time_str', 'obj_from_dict',
-    'init_dist', 'get_dist_info', 'master_only', 'OPTIMIZER_BUILDERS',
-    'OPTIMIZERS', 'DefaultOptimizerConstructor', 'build_optimizer',
-    'build_optimizer_constructor', 'IterLoader', 'set_random_seed',
-    'auto_fp16', 'force_fp32', 'wrap_fp16_model', 'Fp16OptimizerHook',
-    'SyncBuffersHook', 'EMAHook', 'build_runner', 'RUNNERS', 'allreduce_grads',
-    'allreduce_params', 'LossScaler', 'CheckpointLoader',
-    '_load_checkpoint_with_prefix', 'EvalHook', 'DistEvalHook',
-    'GradientCumulativeOptimizerHook', 'GradientCumulativeFp16OptimizerHook',
-    'DefaultRunnerConstructor', 'SegmindLoggerHook',
-    'LinearAnnealingMomentumUpdaterHook', 'LinearAnnealingLrUpdaterHook',
-    'ClearMLLoggerHook'
-]
diff --git a/mmcv/runner/base_runner.py b/mmcv/runner/base_runner.py
deleted file mode 100644
index 2c5a9ddd00..0000000000
--- a/mmcv/runner/base_runner.py
+++ /dev/null
@@ -1,566 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import copy
-import logging
-import os.path as osp
-import warnings
-from abc import ABCMeta, abstractmethod
-from collections import OrderedDict
-from typing import (Any, Callable, Dict, List, Optional, Tuple, Union,
-                    no_type_check)
-
-import torch
-from torch.optim import Optimizer
-from torch.utils.data import DataLoader
-
-import mmcv
-from ..parallel import is_module_wrapper
-from .checkpoint import load_checkpoint
-from .dist_utils import get_dist_info
-from .hooks import HOOKS, Hook
-from .log_buffer import LogBuffer
-from .priority import Priority, get_priority
-from .utils import get_time_str
-
-
-class BaseRunner(metaclass=ABCMeta):
-    """The base class of Runner, a training helper for PyTorch.
-
-    All subclasses should implement the following APIs:
-
-    - ``run()``
-    - ``train()``
-    - ``val()``
-    - ``save_checkpoint()``
-
-    Args:
-        model (:obj:`torch.nn.Module`): The model to be run.
-        batch_processor (callable): A callable method that processes a data
-            batch. The interface of this method should be
-            `batch_processor(model, data, train_mode) -> dict`
-        optimizer (dict or :obj:`torch.optim.Optimizer`): It can be either an
-            optimizer (in most cases) or a dict of optimizers (in models that
-            require more than one optimizer, e.g., GAN).
-        work_dir (str, optional): The working directory to save checkpoints
-            and logs. Defaults to None.
-        logger (:obj:`logging.Logger`): Logger used during training.
-            Defaults to None. (The default value is just for backward
-            compatibility)
-        meta (dict | None): A dict that records some important information
-            such as environment info and seed, which will be logged in the
-            logger hook. Defaults to None.
-        max_epochs (int, optional): Total training epochs.
-        max_iters (int, optional): Total training iterations.
-    """
-
-    def __init__(self,
-                 model: torch.nn.Module,
-                 batch_processor: Optional[Callable] = None,
-                 optimizer: Union[Dict, torch.optim.Optimizer, None] = None,
-                 work_dir: Optional[str] = None,
-                 logger: Optional[logging.Logger] = None,
-                 meta: Optional[Dict] = None,
-                 max_iters: Optional[int] = None,
-                 max_epochs: Optional[int] = None) -> None:
-        if batch_processor is not None:
-            if not callable(batch_processor):
-                raise TypeError('batch_processor must be callable, '
-                                f'but got {type(batch_processor)}')
-            warnings.warn(
-                'batch_processor is deprecated, please implement '
-                'train_step() and val_step() in the model instead.',
-                DeprecationWarning)
-            # raise an error if `batch_processor` is not None and
-            # `model.train_step()` exists.
- if is_module_wrapper(model): - _model = model.module - else: - _model = model - if hasattr(_model, 'train_step') or hasattr(_model, 'val_step'): - raise RuntimeError( - 'batch_processor and model.train_step()/model.val_step() ' - 'cannot be both available.') - else: - assert hasattr(model, 'train_step') - - # check the type of `optimizer` - if isinstance(optimizer, dict): - for name, optim in optimizer.items(): - if not isinstance(optim, Optimizer): - raise TypeError( - f'optimizer must be a dict of torch.optim.Optimizers, ' - f'but optimizer["{name}"] is a {type(optim)}') - elif not isinstance(optimizer, Optimizer) and optimizer is not None: - raise TypeError( - f'optimizer must be a torch.optim.Optimizer object ' - f'or dict or None, but got {type(optimizer)}') - - # check the type of `logger` - if not isinstance(logger, logging.Logger): - raise TypeError(f'logger must be a logging.Logger object, ' - f'but got {type(logger)}') - - # check the type of `meta` - if meta is not None and not isinstance(meta, dict): - raise TypeError( - f'meta must be a dict or None, but got {type(meta)}') - - self.model = model - self.batch_processor = batch_processor - self.optimizer = optimizer - self.logger = logger - self.meta = meta - # create work_dir - if isinstance(work_dir, str): - self.work_dir: Optional[str] = osp.abspath(work_dir) - mmcv.mkdir_or_exist(self.work_dir) - elif work_dir is None: - self.work_dir = None - else: - raise TypeError('"work_dir" must be a str or None') - - # get model name from the model class - if hasattr(self.model, 'module'): - self._model_name = self.model.module.__class__.__name__ - else: - self._model_name = self.model.__class__.__name__ - - self._rank, self._world_size = get_dist_info() - self.timestamp = get_time_str() - self.mode: Optional[str] = None - self._hooks: List[Hook] = [] - self._epoch = 0 - self._iter = 0 - self._inner_iter = 0 - - if max_epochs is not None and max_iters is not None: - raise ValueError( - 'Only one of `max_epochs` or `max_iters` can be set.') - - self._max_epochs = max_epochs - self._max_iters = max_iters - # TODO: Redesign LogBuffer, it is not flexible and elegant enough - self.log_buffer = LogBuffer() - - @property - def model_name(self) -> str: - """str: Name of the model, usually the module class name.""" - return self._model_name - - @property - def rank(self) -> int: - """int: Rank of current process. (distributed training)""" - return self._rank - - @property - def world_size(self) -> int: - """int: Number of processes participating in the job. 
- (distributed training)""" - return self._world_size - - @property - def hooks(self) -> List[Hook]: - """list[:obj:`Hook`]: A list of registered hooks.""" - return self._hooks - - @property - def epoch(self) -> int: - """int: Current epoch.""" - return self._epoch - - @property - def iter(self) -> int: - """int: Current iteration.""" - return self._iter - - @property - def inner_iter(self) -> int: - """int: Iteration in an epoch.""" - return self._inner_iter - - @property - def max_epochs(self): - """int: Maximum training epochs.""" - return self._max_epochs - - @property - def max_iters(self): - """int: Maximum training iterations.""" - return self._max_iters - - @abstractmethod - def train(self): - pass - - @abstractmethod - def val(self): - pass - - @abstractmethod - def run(self, data_loaders: List[DataLoader], - workflow: List[Tuple[str, int]], **kwargs) -> Any: - pass - - @abstractmethod - def save_checkpoint(self, - out_dir: str, - filename_tmpl: str, - save_optimizer: bool = True, - meta: Optional[Dict] = None, - create_symlink: bool = True) -> None: - pass - - def current_lr(self) -> Union[List[float], Dict[str, List[float]]]: - """Get current learning rates. - - Returns: - list[float] | dict[str, list[float]]: Current learning rates of all - param groups. If the runner has a dict of optimizers, this method - will return a dict. - """ - lr: Union[List[float], Dict[str, List[float]]] - if isinstance(self.optimizer, torch.optim.Optimizer): - lr = [group['lr'] for group in self.optimizer.param_groups] - elif isinstance(self.optimizer, dict): - lr = dict() - for name, optim in self.optimizer.items(): - lr[name] = [group['lr'] for group in optim.param_groups] - else: - raise RuntimeError( - 'lr is not applicable because optimizer does not exist.') - return lr - - def current_momentum(self) -> Union[List[float], Dict[str, List[float]]]: - """Get current momentums. - - Returns: - list[float] | dict[str, list[float]]: Current momentums of all - param groups. If the runner has a dict of optimizers, this method - will return a dict. - """ - - def _get_momentum(optimizer): - momentums = [] - for group in optimizer.param_groups: - if 'momentum' in group.keys(): - momentums.append(group['momentum']) - elif 'betas' in group.keys(): - momentums.append(group['betas'][0]) - else: - momentums.append(0) - return momentums - - if self.optimizer is None: - raise RuntimeError( - 'momentum is not applicable because optimizer does not exist.') - elif isinstance(self.optimizer, torch.optim.Optimizer): - momentums = _get_momentum(self.optimizer) - elif isinstance(self.optimizer, dict): - momentums = dict() - for name, optim in self.optimizer.items(): - momentums[name] = _get_momentum(optim) - return momentums - - def register_hook(self, - hook: Hook, - priority: Union[int, str, Priority] = 'NORMAL') -> None: - """Register a hook into the hook list. - - The hook will be inserted into a priority queue, with the specified - priority (See :class:`Priority` for details of priorities). - For hooks with the same priority, they will be triggered in the same - order as they are registered. - - Args: - hook (:obj:`Hook`): The hook to be registered. - priority (int or str or :obj:`Priority`): Hook priority. - Lower value means higher priority. 
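-
-        Example (editor's illustrative sketch; ``MyHook`` is a hypothetical
-        :obj:`Hook` subclass):
-            >>> runner.register_hook(MyHook(), priority='NORMAL')
-            >>> runner.register_hook(MyHook(), priority=30)  # same as 'HIGH'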
- """ - assert isinstance(hook, Hook) - if hasattr(hook, 'priority'): - raise ValueError('"priority" is a reserved attribute for hooks') - priority = get_priority(priority) - hook.priority = priority # type: ignore - # insert the hook to a sorted list - inserted = False - for i in range(len(self._hooks) - 1, -1, -1): - if priority >= self._hooks[i].priority: # type: ignore - self._hooks.insert(i + 1, hook) - inserted = True - break - if not inserted: - self._hooks.insert(0, hook) - - def register_hook_from_cfg(self, hook_cfg: Dict) -> None: - """Register a hook from its cfg. - - Args: - hook_cfg (dict): Hook config. It should have at least keys 'type' - and 'priority' indicating its type and priority. - - Note: - The specific hook class to register should not use 'type' and - 'priority' arguments during initialization. - """ - hook_cfg = hook_cfg.copy() - priority = hook_cfg.pop('priority', 'NORMAL') - hook = mmcv.build_from_cfg(hook_cfg, HOOKS) - self.register_hook(hook, priority=priority) - - def call_hook(self, fn_name: str) -> None: - """Call all hooks. - - Args: - fn_name (str): The function name in each hook to be called, such as - "before_train_epoch". - """ - for hook in self._hooks: - getattr(hook, fn_name)(self) - - def get_hook_info(self) -> str: - # Get hooks info in each stage - stage_hook_map: Dict[str, list] = {stage: [] for stage in Hook.stages} - for hook in self.hooks: - try: - priority = Priority(hook.priority).name # type: ignore - except ValueError: - priority = hook.priority # type: ignore - classname = hook.__class__.__name__ - hook_info = f'({priority:<12}) {classname:<35}' - for trigger_stage in hook.get_triggered_stages(): - stage_hook_map[trigger_stage].append(hook_info) - - stage_hook_infos = [] - for stage in Hook.stages: - hook_infos = stage_hook_map[stage] - if len(hook_infos) > 0: - info = f'{stage}:\n' - info += '\n'.join(hook_infos) - info += '\n -------------------- ' - stage_hook_infos.append(info) - return '\n'.join(stage_hook_infos) - - def load_checkpoint( - self, - filename: str, - map_location: Union[str, Callable] = 'cpu', - strict: bool = False, - revise_keys: List = [(r'^module.', '')], - ) -> Union[Dict, OrderedDict]: - return load_checkpoint( - self.model, - filename, - map_location, - strict, - self.logger, - revise_keys=revise_keys) - - @no_type_check - def resume(self, - checkpoint: str, - resume_optimizer: bool = True, - map_location: Union[str, Callable] = 'default') -> None: - if map_location == 'default': - if torch.cuda.is_available(): - device_id = torch.cuda.current_device() - checkpoint = self.load_checkpoint( - checkpoint, - map_location=lambda storage, loc: storage.cuda(device_id)) - else: - checkpoint = self.load_checkpoint(checkpoint) - else: - checkpoint = self.load_checkpoint( - checkpoint, map_location=map_location) - - self._epoch = checkpoint['meta']['epoch'] - self._iter = checkpoint['meta']['iter'] - if self.meta is None: - self.meta = {} - self.meta.setdefault('hook_msgs', {}) - # load `last_ckpt`, `best_score`, `best_ckpt`, etc. 
for hook messages - self.meta['hook_msgs'].update(checkpoint['meta'].get('hook_msgs', {})) - - # Re-calculate the number of iterations when resuming - # models with different number of GPUs - if 'config' in checkpoint['meta']: - config = mmcv.Config.fromstring( - checkpoint['meta']['config'], file_format='.py') - previous_gpu_ids = config.get('gpu_ids', None) - if previous_gpu_ids and len(previous_gpu_ids) > 0 and len( - previous_gpu_ids) != self.world_size: - self._iter = int(self._iter * len(previous_gpu_ids) / - self.world_size) - self.logger.info('the iteration number is changed due to ' - 'change of GPU number') - - # resume meta information meta - self.meta = checkpoint['meta'] - - if 'optimizer' in checkpoint and resume_optimizer: - if isinstance(self.optimizer, Optimizer): - self.optimizer.load_state_dict(checkpoint['optimizer']) - elif isinstance(self.optimizer, dict): - for k in self.optimizer.keys(): - self.optimizer[k].load_state_dict( - checkpoint['optimizer'][k]) - else: - raise TypeError( - 'Optimizer should be dict or torch.optim.Optimizer ' - f'but got {type(self.optimizer)}') - - self.logger.info('resumed epoch %d, iter %d', self.epoch, self.iter) - - def register_lr_hook(self, lr_config: Union[Dict, Hook, None]) -> None: - if lr_config is None: - return - elif isinstance(lr_config, dict): - assert 'policy' in lr_config - policy_type = lr_config.pop('policy') - # If the type of policy is all in lower case, e.g., 'cyclic', - # then its first letter will be capitalized, e.g., to be 'Cyclic'. - # This is for the convenient usage of Lr updater. - # Since this is not applicable for ` - # CosineAnnealingLrUpdater`, - # the string will not be changed if it contains capital letters. - if policy_type == policy_type.lower(): - policy_type = policy_type.title() - hook_type = policy_type + 'LrUpdaterHook' - lr_config['type'] = hook_type - hook = mmcv.build_from_cfg(lr_config, HOOKS) - else: - hook = lr_config - self.register_hook(hook, priority='VERY_HIGH') - - def register_momentum_hook( - self, momentum_config: Union[Dict, Hook, None]) -> None: - if momentum_config is None: - return - if isinstance(momentum_config, dict): - assert 'policy' in momentum_config - policy_type = momentum_config.pop('policy') - # If the type of policy is all in lower case, e.g., 'cyclic', - # then its first letter will be capitalized, e.g., to be 'Cyclic'. - # This is for the convenient usage of momentum updater. - # Since this is not applicable for - # `CosineAnnealingMomentumUpdater`, - # the string will not be changed if it contains capital letters. 
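-            # (editor's note) e.g., policy='cyclic' yields the hook type
-            # 'CyclicMomentumUpdaterHook', while 'OneCycle' stays unchanged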
- if policy_type == policy_type.lower(): - policy_type = policy_type.title() - hook_type = policy_type + 'MomentumUpdaterHook' - momentum_config['type'] = hook_type - hook = mmcv.build_from_cfg(momentum_config, HOOKS) - else: - hook = momentum_config - self.register_hook(hook, priority='HIGH') - - def register_optimizer_hook( - self, optimizer_config: Union[Dict, Hook, None]) -> None: - if optimizer_config is None: - return - if isinstance(optimizer_config, dict): - optimizer_config.setdefault('type', 'OptimizerHook') - hook = mmcv.build_from_cfg(optimizer_config, HOOKS) - else: - hook = optimizer_config - self.register_hook(hook, priority='ABOVE_NORMAL') - - def register_checkpoint_hook( - self, checkpoint_config: Union[Dict, Hook, None]) -> None: - if checkpoint_config is None: - return - if isinstance(checkpoint_config, dict): - checkpoint_config.setdefault('type', 'CheckpointHook') - hook = mmcv.build_from_cfg(checkpoint_config, HOOKS) - else: - hook = checkpoint_config - self.register_hook(hook, priority='NORMAL') - - def register_logger_hooks(self, log_config: Optional[Dict]) -> None: - if log_config is None: - return - log_interval = log_config['interval'] - for info in log_config['hooks']: - logger_hook = mmcv.build_from_cfg( - info, HOOKS, default_args=dict(interval=log_interval)) - self.register_hook(logger_hook, priority='VERY_LOW') - - def register_timer_hook( - self, - timer_config: Union[Dict, Hook, None], - ) -> None: - if timer_config is None: - return - if isinstance(timer_config, dict): - timer_config_ = copy.deepcopy(timer_config) - hook = mmcv.build_from_cfg(timer_config_, HOOKS) - else: - hook = timer_config - self.register_hook(hook, priority='LOW') - - def register_custom_hooks( - self, custom_config: Union[List, Dict, Hook, None]) -> None: - if custom_config is None: - return - - if not isinstance(custom_config, list): - custom_config = [custom_config] - - for item in custom_config: - if isinstance(item, dict): - self.register_hook_from_cfg(item) - else: - self.register_hook(item, priority='NORMAL') - - def register_profiler_hook( - self, - profiler_config: Union[Dict, Hook, None], - ) -> None: - if profiler_config is None: - return - if isinstance(profiler_config, dict): - profiler_config.setdefault('type', 'ProfilerHook') - hook = mmcv.build_from_cfg(profiler_config, HOOKS) - else: - hook = profiler_config - self.register_hook(hook) - - def register_training_hooks( - self, - lr_config: Union[Dict, Hook, None], - optimizer_config: Union[Dict, Hook, None] = None, - checkpoint_config: Union[Dict, Hook, None] = None, - log_config: Optional[Dict] = None, - momentum_config: Union[Dict, Hook, None] = None, - timer_config: Union[Dict, Hook] = dict(type='IterTimerHook'), - custom_hooks_config: Union[List, Dict, Hook, None] = None) -> None: - """Register default and custom hooks for training. 
- - Default and custom hooks include: - - +----------------------+-------------------------+ - | Hooks | Priority | - +======================+=========================+ - | LrUpdaterHook | VERY_HIGH (10) | - +----------------------+-------------------------+ - | MomentumUpdaterHook | HIGH (30) | - +----------------------+-------------------------+ - | OptimizerStepperHook | ABOVE_NORMAL (40) | - +----------------------+-------------------------+ - | CheckpointSaverHook | NORMAL (50) | - +----------------------+-------------------------+ - | IterTimerHook | LOW (70) | - +----------------------+-------------------------+ - | LoggerHook(s) | VERY_LOW (90) | - +----------------------+-------------------------+ - | CustomHook(s) | defaults to NORMAL (50) | - +----------------------+-------------------------+ - - If custom hooks have same priority with default hooks, custom hooks - will be triggered after default hooks. - """ - self.register_lr_hook(lr_config) - self.register_momentum_hook(momentum_config) - self.register_optimizer_hook(optimizer_config) - self.register_checkpoint_hook(checkpoint_config) - self.register_timer_hook(timer_config) - self.register_logger_hooks(log_config) - self.register_custom_hooks(custom_hooks_config) diff --git a/mmcv/runner/builder.py b/mmcv/runner/builder.py deleted file mode 100644 index 008da32aa0..0000000000 --- a/mmcv/runner/builder.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -from typing import Optional - -from ..utils import Registry - -RUNNERS = Registry('runner') -RUNNER_BUILDERS = Registry('runner builder') - - -def build_runner_constructor(cfg: dict): - return RUNNER_BUILDERS.build(cfg) - - -def build_runner(cfg: dict, default_args: Optional[dict] = None): - runner_cfg = copy.deepcopy(cfg) - constructor_type = runner_cfg.pop('constructor', - 'DefaultRunnerConstructor') - runner_constructor = build_runner_constructor( - dict( - type=constructor_type, - runner_cfg=runner_cfg, - default_args=default_args)) - runner = runner_constructor() - return runner diff --git a/mmcv/runner/checkpoint.py b/mmcv/runner/checkpoint.py deleted file mode 100644 index 1e1d44dad8..0000000000 --- a/mmcv/runner/checkpoint.py +++ /dev/null @@ -1,811 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import io -import logging -import os -import os.path as osp -import pkgutil -import re -import time -import warnings -from collections import OrderedDict -from importlib import import_module -from tempfile import TemporaryDirectory -from typing import Callable, Dict, List, Optional, Tuple, Union - -import mmengine -import torch -import torch.nn as nn -import torchvision -from mmengine.fileio import FileClient -from mmengine.fileio import load as load_file -from torch.optim import Optimizer - -import mmcv -from ..parallel import is_module_wrapper -from ..utils import digit_version, load_url, mkdir_or_exist -from .dist_utils import get_dist_info - -ENV_MMCV_HOME = 'MMCV_HOME' -ENV_XDG_CACHE_HOME = 'XDG_CACHE_HOME' -DEFAULT_CACHE_DIR = '~/.cache' - - -def _get_mmcv_home() -> str: - mmcv_home = os.path.expanduser( - os.getenv( - ENV_MMCV_HOME, - os.path.join( - os.getenv(ENV_XDG_CACHE_HOME, DEFAULT_CACHE_DIR), 'mmcv'))) - - mkdir_or_exist(mmcv_home) - return mmcv_home - - -def load_state_dict(module: nn.Module, - state_dict: Union[dict, OrderedDict], - strict: bool = False, - logger: Optional[logging.Logger] = None) -> None: - """Load state_dict to a module. 
- - This method is modified from :meth:`torch.nn.Module.load_state_dict`. - Default value for ``strict`` is set to ``False`` and the message for - param mismatch will be shown even if strict is False. - - Args: - module (Module): Module that receives the state_dict. - state_dict (dict or OrderedDict): Weights. - strict (bool): whether to strictly enforce that the keys - in :attr:`state_dict` match the keys returned by this module's - :meth:`~torch.nn.Module.state_dict` function. Default: ``False``. - logger (:obj:`logging.Logger`, optional): Logger to log the error - message. If not specified, print function will be used. - """ - unexpected_keys: List[str] = [] - all_missing_keys: List[str] = [] - err_msg: List[str] = [] - - metadata = getattr(state_dict, '_metadata', None) - state_dict = state_dict.copy() # type: ignore - if metadata is not None: - state_dict._metadata = metadata # type: ignore - - # use _load_from_state_dict to enable checkpoint version control - def load(module, prefix=''): - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - local_metadata = {} if metadata is None else metadata.get( - prefix[:-1], {}) - module._load_from_state_dict(state_dict, prefix, local_metadata, True, - all_missing_keys, unexpected_keys, - err_msg) - for name, child in module._modules.items(): - if child is not None: - load(child, prefix + name + '.') - - load(module) - # break load->load reference cycle - load = None # type: ignore - - # ignore "num_batches_tracked" of BN layers - missing_keys = [ - key for key in all_missing_keys if 'num_batches_tracked' not in key - ] - - if unexpected_keys: - err_msg.append('unexpected key in source ' - f'state_dict: {", ".join(unexpected_keys)}\n') - if missing_keys: - err_msg.append( - f'missing keys in source state_dict: {", ".join(missing_keys)}\n') - - rank, _ = get_dist_info() - if len(err_msg) > 0 and rank == 0: - err_msg.insert( - 0, 'The model and loaded state dict do not match exactly\n') - err_msg = '\n'.join(err_msg) # type: ignore - if strict: - raise RuntimeError(err_msg) - elif logger is not None: - logger.warning(err_msg) - else: - print(err_msg) - - -def get_torchvision_models(): - if digit_version(torchvision.__version__) < digit_version('0.13.0a0'): - model_urls = dict() - # When the version of torchvision is lower than 0.13, the model url is - # not declared in `torchvision.model.__init__.py`, so we need to - # iterate through `torchvision.models.__path__` to get the url for each - # model. - for _, name, ispkg in pkgutil.walk_packages( - torchvision.models.__path__): - if ispkg: - continue - _zoo = import_module(f'torchvision.models.{name}') - if hasattr(_zoo, 'model_urls'): - _urls = getattr(_zoo, 'model_urls') - model_urls.update(_urls) - else: - # Since torchvision bumps to v0.13, the weight loading logic, - # model keys and model urls have been changed. Here the URLs of old - # version is loaded to avoid breaking back compatibility. If the - # torchvision version>=0.13.0, new URLs will be added. Users can get - # the resnet50 checkpoint by setting 'resnet50.imagent1k_v1', - # 'resnet50' or 'ResNet50_Weights.IMAGENET1K_V1' in the config. 
- json_path = osp.join(mmcv.__path__[0], - 'model_zoo/torchvision_0.12.json') - model_urls = mmengine.load(json_path) - for cls_name, cls in torchvision.models.__dict__.items(): - # The name of torchvision model weights classes ends with - # `_Weights` such as `ResNet18_Weights`. However, some model weight - # classes, such as `MNASNet0_75_Weights` does not have any urls in - # torchvision 0.13.0 and cannot be iterated. Here we simply check - # `DEFAULT` attribute to ensure the class is not empty. - if (not cls_name.endswith('_Weights') - or not hasattr(cls, 'DEFAULT')): - continue - # Since `cls.DEFAULT` can not be accessed by iterating cls, we set - # default urls explicitly. - cls_key = cls_name.replace('_Weights', '').lower() - model_urls[f'{cls_key}.default'] = cls.DEFAULT.url - for weight_enum in cls: - cls_key = cls_name.replace('_Weights', '').lower() - cls_key = f'{cls_key}.{weight_enum.name.lower()}' - model_urls[cls_key] = weight_enum.url - - return model_urls - - -def get_external_models(): - mmcv_home = _get_mmcv_home() - default_json_path = osp.join(mmcv.__path__[0], 'model_zoo/open_mmlab.json') - default_urls = load_file(default_json_path) - assert isinstance(default_urls, dict) - external_json_path = osp.join(mmcv_home, 'open_mmlab.json') - if osp.exists(external_json_path): - external_urls = load_file(external_json_path) - assert isinstance(external_urls, dict) - default_urls.update(external_urls) - - return default_urls - - -def get_mmcls_models(): - mmcls_json_path = osp.join(mmcv.__path__[0], 'model_zoo/mmcls.json') - mmcls_urls = load_file(mmcls_json_path) - - return mmcls_urls - - -def get_deprecated_model_names(): - deprecate_json_path = osp.join(mmcv.__path__[0], - 'model_zoo/deprecated.json') - deprecate_urls = load_file(deprecate_json_path) - assert isinstance(deprecate_urls, dict) - - return deprecate_urls - - -def _process_mmcls_checkpoint(checkpoint: Dict) -> Dict: - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - # Some checkpoints converted from 3rd-party repo don't - # have the "state_dict" key. - state_dict = checkpoint - new_state_dict = OrderedDict() - for k, v in state_dict.items(): - if k.startswith('backbone.'): - new_state_dict[k[9:]] = v - new_checkpoint = dict(state_dict=new_state_dict) - - return new_checkpoint - - -class CheckpointLoader: - """A general checkpoint loader to manage all schemes.""" - - _schemes: dict = {} - - @classmethod - def _register_scheme(cls, - prefixes: Union[str, List, Tuple], - loader: Callable, - force: bool = False) -> None: - if isinstance(prefixes, str): - prefixes = [prefixes] - else: - assert isinstance(prefixes, (list, tuple)) - for prefix in prefixes: - if (prefix not in cls._schemes) or force: - cls._schemes[prefix] = loader - else: - raise KeyError( - f'{prefix} is already registered as a loader backend, ' - 'add "force=True" if you want to override it') - # sort, longer prefixes take priority - cls._schemes = OrderedDict( - sorted(cls._schemes.items(), key=lambda t: t[0], reverse=True)) - - @classmethod - def register_scheme(cls, - prefixes: Union[str, List[str], Tuple[str, ...]], - loader: Optional[Callable] = None, - force: bool = False) -> Callable: - """Register a loader to CheckpointLoader. - - This method can be used as a normal class method or a decorator. - - Args: - prefixes (str or Sequence[str]): - The prefix of the registered loader. - loader (function, optional): The loader function to be registered. - When this method is used as a decorator, loader is None. 
- Defaults to None. - force (bool, optional): Whether to override the loader - if the prefix has already been registered. Defaults to False. - """ - - if loader is not None: - cls._register_scheme(prefixes, loader, force=force) - return # type: ignore - - def _register(loader_cls): - cls._register_scheme(prefixes, loader_cls, force=force) - return loader_cls - - return _register - - @classmethod - def _get_checkpoint_loader(cls, path: str): - """Finds a loader that supports the given path. Falls back to the local - loader if no other loader is found. - - Args: - path (str): checkpoint path - - Returns: - callable: checkpoint loader - """ - for p in cls._schemes: - # use regular match to handle some cases that where the prefix of - # loader has a prefix. For example, both 's3://path' and - # 'open-mmlab:s3://path' should return `load_from_ceph` - if re.match(p, path) is not None: - return cls._schemes[p] - - @classmethod - def load_checkpoint( - cls, - filename: str, - map_location: Union[str, Callable, None] = None, - logger: Optional[logging.Logger] = None - ) -> Union[dict, OrderedDict]: - """load checkpoint through URL scheme path. - - Args: - filename (str): checkpoint file name with given prefix - map_location (str, optional): Same as :func:`torch.load`. - Default: None - logger (:mod:`logging.Logger`, optional): The logger for message. - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - checkpoint_loader = cls._get_checkpoint_loader(filename) - class_name = checkpoint_loader.__name__ # type: ignore - mmcv.print_log( - f'load checkpoint from {class_name[10:]} path: {filename}', logger) - return checkpoint_loader(filename, map_location) # type: ignore - - -@CheckpointLoader.register_scheme(prefixes='') -def load_from_local( - filename: str, - map_location: Union[str, Callable, None] = None, -) -> Union[dict, OrderedDict]: - """load checkpoint by local file path. - - Args: - filename (str): local checkpoint file path - map_location (str, optional): Same as :func:`torch.load`. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - filename = osp.expanduser(filename) - if not osp.isfile(filename): - raise FileNotFoundError(f'{filename} can not be found.') - checkpoint = torch.load(filename, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes=('http://', 'https://')) -def load_from_http( - filename: str, - map_location: Union[str, Callable, None] = None, - model_dir: Optional[str] = None) -> Union[dict, OrderedDict]: - """load checkpoint through HTTP or HTTPS scheme path. In distributed - setting, this function only download checkpoint at local rank 0. - - Args: - filename (str): checkpoint file path with modelzoo or - torchvision prefix - map_location (str, optional): Same as :func:`torch.load`. - model_dir (str, optional): directory in which to save the object, - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - rank, world_size = get_dist_info() - if rank == 0: - checkpoint = load_url( - filename, model_dir=model_dir, map_location=map_location) - if world_size > 1: - torch.distributed.barrier() - if rank > 0: - checkpoint = load_url( - filename, model_dir=model_dir, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes='pavi://') -def load_from_pavi( - filename: str, - map_location: Union[str, Callable, None] = None, -) -> Union[dict, OrderedDict]: - """load checkpoint through the file path prefixed with pavi. 
In distributed - setting, this function download ckpt at all ranks to different temporary - directories. - - Args: - filename (str): checkpoint file path with pavi prefix - map_location (str, optional): Same as :func:`torch.load`. - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - assert filename.startswith('pavi://'), \ - f'Expected filename startswith `pavi://`, but get {filename}' - model_path = filename[7:] - - try: - from pavi import modelcloud - except ImportError: - raise ImportError( - 'Please install pavi to load checkpoint from modelcloud.') - - model = modelcloud.get(model_path) - with TemporaryDirectory() as tmp_dir: - downloaded_file = osp.join(tmp_dir, model.name) - model.download(downloaded_file) - checkpoint = torch.load(downloaded_file, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes=r'(\S+\:)?s3://') -def load_from_ceph(filename: str, - map_location: Union[str, Callable, None] = None, - backend: str = 'petrel') -> Union[dict, OrderedDict]: - """load checkpoint through the file path prefixed with s3. In distributed - setting, this function download ckpt at all ranks to different temporary - directories. - - Note: - Since v1.4.1, the registered scheme prefixes have been enhanced to - support bucket names in the path prefix, e.g. 's3://xx.xx/xx.path', - 'bucket1:s3://xx.xx/xx.path'. - - Args: - filename (str): checkpoint file path with s3 prefix - map_location (str, optional): Same as :func:`torch.load`. - backend (str): The storage backend type. Options are 'ceph', - 'petrel'. Default: 'petrel'. - - .. warning:: - :class:`mmengine.fileio.file_client.CephBackend` will be deprecated, - please use :class:`mmengine.fileio.file_client.PetrelBackend` instead. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - allowed_backends = ['ceph', 'petrel'] - if backend not in allowed_backends: - raise ValueError(f'Load from Backend {backend} is not supported.') - - if backend == 'ceph': - warnings.warn( - 'CephBackend will be deprecated, please use PetrelBackend instead', - DeprecationWarning) - - # CephClient and PetrelBackend have the same prefix 's3://' and the latter - # will be chosen as default. If PetrelBackend can not be instantiated - # successfully, the CephClient will be chosen. - try: - file_client = FileClient(backend=backend) - except ImportError: - allowed_backends.remove(backend) - file_client = FileClient(backend=allowed_backends[0]) - - with io.BytesIO(file_client.get(filename)) as buffer: - checkpoint = torch.load(buffer, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes=('modelzoo://', 'torchvision://')) -def load_from_torchvision( - filename: str, - map_location: Union[str, Callable, None] = None, -) -> Union[dict, OrderedDict]: - """load checkpoint through the file path prefixed with modelzoo or - torchvision. - - Args: - filename (str): checkpoint file path with modelzoo or - torchvision prefix - map_location (str, optional): Same as :func:`torch.load`. - - Returns: - dict or OrderedDict: The loaded checkpoint. 
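-
-    Example (editor's illustrative sketch; fetches and caches the weights on
-    first call):
-        >>> checkpoint = load_from_torchvision('torchvision://resnet18')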
- """ - model_urls = get_torchvision_models() - if filename.startswith('modelzoo://'): - warnings.warn( - 'The URL scheme of "modelzoo://" is deprecated, please ' - 'use "torchvision://" instead', DeprecationWarning) - model_name = filename[11:] - else: - model_name = filename[14:] - - # Support getting model urls in the same way as torchvision - # `ResNet50_Weights.IMAGENET1K_V1` will be mapped to - # resnet50.imagenet1k_v1. - model_name = model_name.lower().replace('_weights', '') - return load_from_http(model_urls[model_name], map_location=map_location) - - -@CheckpointLoader.register_scheme(prefixes=('open-mmlab://', 'openmmlab://')) -def load_from_openmmlab( - filename: str, - map_location: Union[str, Callable, None] = None, -) -> Union[dict, OrderedDict]: - """load checkpoint through the file path prefixed with open-mmlab or - openmmlab. - - Args: - filename (str): checkpoint file path with open-mmlab or - openmmlab prefix - map_location (str, optional): Same as :func:`torch.load`. - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - model_urls = get_external_models() - prefix_str = 'open-mmlab://' - if filename.startswith(prefix_str): - model_name = filename[13:] - else: - model_name = filename[12:] - prefix_str = 'openmmlab://' - - deprecated_urls = get_deprecated_model_names() - if model_name in deprecated_urls: - warnings.warn( - f'{prefix_str}{model_name} is deprecated in favor ' - f'of {prefix_str}{deprecated_urls[model_name]}', - DeprecationWarning) - model_name = deprecated_urls[model_name] - model_url = model_urls[model_name] - # check if is url - if model_url.startswith(('http://', 'https://')): - checkpoint = load_from_http(model_url, map_location=map_location) - else: - filename = osp.join(_get_mmcv_home(), model_url) - if not osp.isfile(filename): - raise FileNotFoundError(f'{filename} can not be found.') - checkpoint = torch.load(filename, map_location=map_location) - return checkpoint - - -@CheckpointLoader.register_scheme(prefixes='mmcls://') -def load_from_mmcls( - filename: str, - map_location: Union[str, Callable, None] = None, -) -> Union[dict, OrderedDict]: - """load checkpoint through the file path prefixed with mmcls. - - Args: - filename (str): checkpoint file path with mmcls prefix - map_location (str, optional): Same as :func:`torch.load`. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - model_urls = get_mmcls_models() - model_name = filename[8:] - checkpoint = load_from_http( - model_urls[model_name], map_location=map_location) - checkpoint = _process_mmcls_checkpoint(checkpoint) - return checkpoint - - -def _load_checkpoint( - filename: str, - map_location: Union[str, Callable, None] = None, - logger: Optional[logging.Logger] = None) -> Union[dict, OrderedDict]: - """Load checkpoint from somewhere (modelzoo, file, url). - - Args: - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str, optional): Same as :func:`torch.load`. - Default: None. - logger (:mod:`logging.Logger`, optional): The logger for error message. - Default: None - - Returns: - dict or OrderedDict: The loaded checkpoint. It can be either an - OrderedDict storing model weights or a dict containing other - information, which depends on the checkpoint. 
- """ - return CheckpointLoader.load_checkpoint(filename, map_location, logger) - - -def _load_checkpoint_with_prefix( - prefix: str, - filename: str, - map_location: Union[str, Callable, None] = None, -) -> Union[dict, OrderedDict]: - """Load partial pretrained model with specific prefix. - - Args: - prefix (str): The prefix of sub-module. - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str | None): Same as :func:`torch.load`. Default: None. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - - checkpoint = _load_checkpoint(filename, map_location=map_location) - - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - state_dict = checkpoint - if not prefix.endswith('.'): - prefix += '.' - prefix_len = len(prefix) - - state_dict = { - k[prefix_len:]: v - for k, v in state_dict.items() if k.startswith(prefix) - } - - assert state_dict, f'{prefix} is not in the pretrained model' - return state_dict - - -def load_checkpoint( - model: torch.nn.Module, - filename: str, - map_location: Union[str, Callable, None] = None, - strict: bool = False, - logger: Optional[logging.Logger] = None, - revise_keys: list = [(r'^module\.', '')]) -> Union[dict, OrderedDict]: - """Load checkpoint from a file or URI. - - Args: - model (Module): Module to load checkpoint. - filename (str): Accept local filepath, URL, ``torchvision://xxx``, - ``open-mmlab://xxx``. Please refer to ``docs/model_zoo.md`` for - details. - map_location (str): Same as :func:`torch.load`. - strict (bool): Whether to allow different params for the model and - checkpoint. - logger (:mod:`logging.Logger` or None): The logger for error message. - revise_keys (list): A list of customized keywords to modify the - state_dict in checkpoint. Each item is a (pattern, replacement) - pair of the regular expression operations. Default: strip - the prefix 'module.' by [(r'^module\\.', '')]. - - Returns: - dict or OrderedDict: The loaded checkpoint. - """ - checkpoint = _load_checkpoint(filename, map_location, logger) - # OrderedDict is a subclass of dict - if not isinstance(checkpoint, dict): - raise RuntimeError( - f'No state_dict found in checkpoint file {filename}') - # get state_dict from checkpoint - if 'state_dict' in checkpoint: - state_dict = checkpoint['state_dict'] - else: - state_dict = checkpoint - - # strip prefix of state_dict - metadata = getattr(state_dict, '_metadata', OrderedDict()) - for p, r in revise_keys: - state_dict = OrderedDict( - {re.sub(p, r, k): v - for k, v in state_dict.items()}) - # Keep metadata in state_dict - state_dict._metadata = metadata - - # load state_dict - load_state_dict(model, state_dict, strict, logger) - return checkpoint - - -def weights_to_cpu(state_dict: OrderedDict) -> OrderedDict: - """Copy a model state_dict to cpu. - - Args: - state_dict (OrderedDict): Model weights on GPU. - - Returns: - OrderedDict: Model weights on GPU. - """ - state_dict_cpu = OrderedDict() - for key, val in state_dict.items(): - state_dict_cpu[key] = val.cpu() - # Keep metadata in state_dict - state_dict_cpu._metadata = getattr( # type: ignore - state_dict, '_metadata', OrderedDict()) - return state_dict_cpu - - -def _save_to_state_dict(module: torch.nn.Module, destination: dict, - prefix: str, keep_vars: bool) -> None: - """Saves module state to `destination` dictionary. - - This method is modified from :meth:`torch.nn.Module._save_to_state_dict`. 
- - Args: - module (nn.Module): The module to generate state_dict. - destination (dict): A dict where state will be stored. - prefix (str): The prefix for parameters and buffers used in this - module. - """ - for name, param in module._parameters.items(): - if param is not None: - destination[prefix + name] = param if keep_vars else param.detach() - for name, buf in module._buffers.items(): - # remove check of _non_persistent_buffers_set to allow nn.BatchNorm2d - if buf is not None: - destination[prefix + name] = buf if keep_vars else buf.detach() - - -def get_state_dict(module: torch.nn.Module, - destination: Optional[OrderedDict] = None, - prefix: str = '', - keep_vars: bool = False) -> OrderedDict: - """Returns a dictionary containing a whole state of the module. - - Both parameters and persistent buffers (e.g. running averages) are - included. Keys are corresponding parameter and buffer names. - - This method is modified from :meth:`torch.nn.Module.state_dict` to - recursively check parallel module in case that the model has a complicated - structure, e.g., nn.Module(nn.Module(DDP)). - - Args: - module (nn.Module): The module to generate state_dict. - destination (OrderedDict): Returned dict for the state of the - module. - prefix (str): Prefix of the key. - keep_vars (bool): Whether to keep the variable property of the - parameters. Default: False. - - Returns: - dict: A dictionary containing a whole state of the module. - """ - # recursively check parallel module in case that the model has a - # complicated structure, e.g., nn.Module(nn.Module(DDP)) - if is_module_wrapper(module): - module = module.module - - # below is the same as torch.nn.Module.state_dict() - if destination is None: - destination = OrderedDict() - destination._metadata = OrderedDict() # type: ignore - destination._metadata[prefix[:-1]] = local_metadata = dict( # type: ignore - version=module._version) - _save_to_state_dict(module, destination, prefix, keep_vars) # type: ignore - for name, child in module._modules.items(): - if child is not None: - get_state_dict( - child, destination, prefix + name + '.', keep_vars=keep_vars) - for hook in module._state_dict_hooks.values(): - hook_result = hook(module, destination, prefix, local_metadata) - if hook_result is not None: - destination = hook_result - return destination # type: ignore - - -def save_checkpoint(model: torch.nn.Module, - filename: str, - optimizer: Optional[Optimizer] = None, - meta: Optional[dict] = None, - file_client_args: Optional[dict] = None) -> None: - """Save checkpoint to file. - - The checkpoint will have 3 fields: ``meta``, ``state_dict`` and - ``optimizer``. By default ``meta`` will contain version and time info. - - Args: - model (Module): Module whose params are to be saved. - filename (str): Checkpoint filename. - optimizer (:obj:`Optimizer`, optional): Optimizer to be saved. - meta (dict, optional): Metadata to be saved in checkpoint. - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmengine.fileio.FileClient` for details. - Default: None. 
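As a usage sketch of the save/load pair (the file name and toy model are placeholders; this assumes the pre-removal ``mmcv.runner`` API):

    import torch
    import torch.nn as nn
    from mmcv.runner import load_checkpoint, save_checkpoint

    model = nn.Linear(2, 2)
    # Writes the meta, state_dict and optimizer fields into one file.
    save_checkpoint(
        model,
        'epoch_1.pth',
        optimizer=torch.optim.SGD(model.parameters(), lr=0.1),
        meta=dict(note='demo'))
    # Reload; the default revise_keys strips a leading 'module.'
    # left behind by DataParallel wrappers.
    load_checkpoint(model, 'epoch_1.pth', map_location='cpu')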
-            `New in version 1.3.16.`
-    """
-    if meta is None:
-        meta = {}
-    elif not isinstance(meta, dict):
-        raise TypeError(f'meta must be a dict or None, but got {type(meta)}')
-    meta.update(mmcv_version=mmcv.__version__, time=time.asctime())
-
-    if is_module_wrapper(model):
-        model = model.module
-
-    if hasattr(model, 'CLASSES') and model.CLASSES is not None:
-        # save class name to the meta
-        meta.update(CLASSES=model.CLASSES)
-
-    checkpoint = {
-        'meta': meta,
-        'state_dict': weights_to_cpu(get_state_dict(model))  # type: ignore
-    }
-    # save optimizer state dict in the checkpoint
-    if isinstance(optimizer, Optimizer):
-        checkpoint['optimizer'] = optimizer.state_dict()
-    elif isinstance(optimizer, dict):
-        checkpoint['optimizer'] = {}
-        for name, optim in optimizer.items():
-            checkpoint['optimizer'][name] = optim.state_dict()
-
-    if filename.startswith('pavi://'):
-        if file_client_args is not None:
-            raise ValueError(
-                'file_client_args should be "None" if filename starts with '
-                f'"pavi://", but got {file_client_args}')
-        try:
-            from pavi import exception, modelcloud
-        except ImportError:
-            raise ImportError(
-                'Please install pavi to load checkpoint from modelcloud.')
-        model_path = filename[7:]
-        root = modelcloud.Folder()
-        model_dir, model_name = osp.split(model_path)
-        try:
-            model = modelcloud.get(model_dir)
-        except exception.NodeNotFoundError:
-            model = root.create_training_model(model_dir)
-        with TemporaryDirectory() as tmp_dir:
-            checkpoint_file = osp.join(tmp_dir, model_name)
-            with open(checkpoint_file, 'wb') as f:
-                torch.save(checkpoint, f)
-                f.flush()
-            model.create_file(checkpoint_file, name=model_name)
-    else:
-        file_client = FileClient.infer_client(file_client_args, filename)
-        with io.BytesIO() as f:
-            torch.save(checkpoint, f)
-            file_client.put(f.getvalue(), filename)
diff --git a/mmcv/runner/default_constructor.py b/mmcv/runner/default_constructor.py
deleted file mode 100644
index 394b51cfd7..0000000000
--- a/mmcv/runner/default_constructor.py
+++ /dev/null
@@ -1,47 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-from typing import Optional
-
-from .builder import RUNNER_BUILDERS, RUNNERS
-
-
-@RUNNER_BUILDERS.register_module()
-class DefaultRunnerConstructor:
-    """Default constructor for runners.
-
-    Customize an existing `Runner` such as `EpochBasedRunner` through a
-    `RunnerConstructor`. For example, we can inject new properties and
-    functions into a `Runner`.
-
-    Example:
-        >>> from mmcv.runner import RUNNER_BUILDERS, build_runner
-        >>> # Define a new RunnerConstructor
-        >>> @RUNNER_BUILDERS.register_module()
-        >>> class MyRunnerConstructor:
-        ...     def __init__(self, runner_cfg, default_args=None):
-        ...         if not isinstance(runner_cfg, dict):
-        ...             raise TypeError('runner_cfg should be a dict',
-        ...                             f'but got {type(runner_cfg)}')
-        ...         self.runner_cfg = runner_cfg
-        ...         self.default_args = default_args
-        ...
-        ...     def __call__(self):
-        ...         runner = RUNNERS.build(self.runner_cfg,
-        ...                                default_args=self.default_args)
-        ...         # Add new properties for existing runner
-        ...         runner.my_name = 'my_runner'
-        ...         runner.my_function = lambda self: print(self.my_name)
-        ...         ...
-        >>> # build your runner
-        >>> runner_cfg = dict(type='EpochBasedRunner', max_epochs=40,
-        ...
constructor='MyRunnerConstructor') - >>> runner = build_runner(runner_cfg) - """ - - def __init__(self, runner_cfg: dict, default_args: Optional[dict] = None): - if not isinstance(runner_cfg, dict): - raise TypeError('runner_cfg should be a dict', - f'but got {type(runner_cfg)}') - self.runner_cfg = runner_cfg - self.default_args = default_args - - def __call__(self): - return RUNNERS.build(self.runner_cfg, default_args=self.default_args) diff --git a/mmcv/runner/dist_utils.py b/mmcv/runner/dist_utils.py deleted file mode 100644 index ee55dfda36..0000000000 --- a/mmcv/runner/dist_utils.py +++ /dev/null @@ -1,211 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved -import functools -import os -import socket -import subprocess -from collections import OrderedDict -from typing import Callable, List, Optional, Tuple - -import torch -import torch.multiprocessing as mp -from torch import distributed as dist -from torch._utils import (_flatten_dense_tensors, _take_tensors, - _unflatten_dense_tensors) - -from mmcv.utils import IS_MLU_AVAILABLE - - -def _find_free_port() -> str: - # Copied from https://github.com/facebookresearch/detectron2/blob/main/detectron2/engine/launch.py # noqa: E501 - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - # Binding to port 0 will cause the OS to find an available port for us - sock.bind(('', 0)) - port = sock.getsockname()[1] - sock.close() - # NOTE: there is still a chance the port could be taken by other processes. - return port - - -def _is_free_port(port: int) -> bool: - ips = socket.gethostbyname_ex(socket.gethostname())[-1] - ips.append('localhost') - with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s: - return all(s.connect_ex((ip, port)) != 0 for ip in ips) - - -def init_dist(launcher: str, backend: str = 'nccl', **kwargs) -> None: - if mp.get_start_method(allow_none=True) is None: - mp.set_start_method('spawn') - if launcher == 'pytorch': - _init_dist_pytorch(backend, **kwargs) - elif launcher == 'mpi': - _init_dist_mpi(backend, **kwargs) - elif launcher == 'slurm': - _init_dist_slurm(backend, **kwargs) - else: - raise ValueError(f'Invalid launcher type: {launcher}') - - -def _init_dist_pytorch(backend: str, **kwargs) -> None: - # TODO: use local_rank instead of rank % num_gpus - rank = int(os.environ['RANK']) - if IS_MLU_AVAILABLE: - import torch_mlu # noqa: F401 - torch.mlu.set_device(rank) - dist.init_process_group( - backend='cncl', - rank=rank, - world_size=int(os.environ['WORLD_SIZE']), - **kwargs) - else: - num_gpus = torch.cuda.device_count() - torch.cuda.set_device(rank % num_gpus) - dist.init_process_group(backend=backend, **kwargs) - - -def _init_dist_mpi(backend: str, **kwargs) -> None: - local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK']) - torch.cuda.set_device(local_rank) - if 'MASTER_PORT' not in os.environ: - # 29500 is torch.distributed default port - os.environ['MASTER_PORT'] = '29500' - if 'MASTER_ADDR' not in os.environ: - raise KeyError('The environment variable MASTER_ADDR is not set') - os.environ['WORLD_SIZE'] = os.environ['OMPI_COMM_WORLD_SIZE'] - os.environ['RANK'] = os.environ['OMPI_COMM_WORLD_RANK'] - dist.init_process_group(backend=backend, **kwargs) - - -def _init_dist_slurm(backend: str, port: Optional[int] = None) -> None: - """Initialize slurm distributed training environment. - - If argument ``port`` is not specified, then the master port will be system - environment variable ``MASTER_PORT``. 
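A typical Slurm launch, sketched against the pre-removal API (the script name and srun arguments are illustrative):

    # Launched once per task, e.g. via:
    #   srun -N2 --ntasks-per-node=8 python train.py
    from mmcv.runner import get_dist_info, init_dist

    # Reads SLURM_PROCID / SLURM_NTASKS / SLURM_NODELIST and fills in
    # MASTER_ADDR, MASTER_PORT, RANK, WORLD_SIZE and LOCAL_RANK.
    init_dist('slurm', backend='nccl', port=29500)
    rank, world_size = get_dist_info()
    print(f'rank {rank} of {world_size}')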
If ``MASTER_PORT`` is not in system - environment variable, then a default port ``29500`` will be used. - - Args: - backend (str): Backend of torch.distributed. - port (int, optional): Master port. Defaults to None. - """ - proc_id = int(os.environ['SLURM_PROCID']) - ntasks = int(os.environ['SLURM_NTASKS']) - node_list = os.environ['SLURM_NODELIST'] - num_gpus = torch.cuda.device_count() - torch.cuda.set_device(proc_id % num_gpus) - addr = subprocess.getoutput( - f'scontrol show hostname {node_list} | head -n1') - # specify master port - if port is not None: - os.environ['MASTER_PORT'] = str(port) - elif 'MASTER_PORT' in os.environ: - pass # use MASTER_PORT in the environment variable - else: - # if torch.distributed default port(29500) is available - # then use it, else find a free port - if _is_free_port(29500): - os.environ['MASTER_PORT'] = '29500' - else: - os.environ['MASTER_PORT'] = str(_find_free_port()) - # use MASTER_ADDR in the environment variable if it already exists - if 'MASTER_ADDR' not in os.environ: - os.environ['MASTER_ADDR'] = addr - os.environ['WORLD_SIZE'] = str(ntasks) - os.environ['LOCAL_RANK'] = str(proc_id % num_gpus) - os.environ['RANK'] = str(proc_id) - dist.init_process_group(backend=backend) - - -def get_dist_info() -> Tuple[int, int]: - if dist.is_available() and dist.is_initialized(): - rank = dist.get_rank() - world_size = dist.get_world_size() - else: - rank = 0 - world_size = 1 - return rank, world_size - - -def master_only(func: Callable) -> Callable: - - @functools.wraps(func) - def wrapper(*args, **kwargs): - rank, _ = get_dist_info() - if rank == 0: - return func(*args, **kwargs) - - return wrapper - - -def allreduce_params(params: List[torch.nn.Parameter], - coalesce: bool = True, - bucket_size_mb: int = -1) -> None: - """Allreduce parameters. - - Args: - params (list[torch.nn.Parameter]): List of parameters or buffers - of a model. - coalesce (bool, optional): Whether allreduce parameters as a whole. - Defaults to True. - bucket_size_mb (int, optional): Size of bucket, the unit is MB. - Defaults to -1. - """ - _, world_size = get_dist_info() - if world_size == 1: - return - params = [param.data for param in params] - if coalesce: - _allreduce_coalesced(params, world_size, bucket_size_mb) - else: - for tensor in params: - dist.all_reduce(tensor.div_(world_size)) - - -def allreduce_grads(params: List[torch.nn.Parameter], - coalesce: bool = True, - bucket_size_mb: int = -1) -> None: - """Allreduce gradients. - - Args: - params (list[torch.nn.Parameter]): List of parameters of a model. - coalesce (bool, optional): Whether allreduce parameters as a whole. - Defaults to True. - bucket_size_mb (int, optional): Size of bucket, the unit is MB. - Defaults to -1. 
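How these helpers combine in practice, as a sketch (the BatchNorm module is a stand-in for a real model):

    import torch.nn as nn
    from mmcv.runner import get_dist_info, master_only
    from mmcv.runner.dist_utils import allreduce_params


    @master_only
    def log_once(msg):
        # Runs on rank 0 only; a silent no-op on every other rank.
        print(msg)


    model = nn.BatchNorm2d(8)
    # Average the BN running statistics across ranks; this returns
    # immediately when the world size is 1, so it is also safe to call
    # in single-process runs.
    allreduce_params(model.buffers())
    rank, world_size = get_dist_info()
    log_once(f'synchronized buffers across {world_size} process(es)')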
- """ - grads = [ - param.grad.data for param in params - if param.requires_grad and param.grad is not None - ] - _, world_size = get_dist_info() - if world_size == 1: - return - if coalesce: - _allreduce_coalesced(grads, world_size, bucket_size_mb) - else: - for tensor in grads: - dist.all_reduce(tensor.div_(world_size)) - - -def _allreduce_coalesced(tensors: torch.Tensor, - world_size: int, - bucket_size_mb: int = -1) -> None: - if bucket_size_mb > 0: - bucket_size_bytes = bucket_size_mb * 1024 * 1024 - buckets = _take_tensors(tensors, bucket_size_bytes) - else: - buckets = OrderedDict() - for tensor in tensors: - tp = tensor.type() - if tp not in buckets: - buckets[tp] = [] - buckets[tp].append(tensor) - buckets = buckets.values() - - for bucket in buckets: - flat_tensors = _flatten_dense_tensors(bucket) - dist.all_reduce(flat_tensors) - flat_tensors.div_(world_size) - for tensor, synced in zip( - bucket, _unflatten_dense_tensors(flat_tensors, bucket)): - tensor.copy_(synced) diff --git a/mmcv/runner/epoch_based_runner.py b/mmcv/runner/epoch_based_runner.py deleted file mode 100644 index d6e9069289..0000000000 --- a/mmcv/runner/epoch_based_runner.py +++ /dev/null @@ -1,197 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import platform -import shutil -import time -import warnings -from typing import Any, Dict, List, Optional, Tuple - -import torch -from torch.utils.data import DataLoader - -import mmcv -from .base_runner import BaseRunner -from .builder import RUNNERS -from .checkpoint import save_checkpoint -from .utils import get_host_info - - -@RUNNERS.register_module() -class EpochBasedRunner(BaseRunner): - """Epoch-based Runner. - - This runner train models epoch by epoch. - """ - - def run_iter(self, data_batch: Any, train_mode: bool, **kwargs) -> None: - if self.batch_processor is not None: - outputs = self.batch_processor( - self.model, data_batch, train_mode=train_mode, **kwargs) - elif train_mode: - outputs = self.model.train_step(data_batch, self.optimizer, - **kwargs) - else: - outputs = self.model.val_step(data_batch, self.optimizer, **kwargs) - if not isinstance(outputs, dict): - raise TypeError('"batch_processor()" or "model.train_step()"' - 'and "model.val_step()" must return a dict') - if 'log_vars' in outputs: - self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) - self.outputs = outputs - - def train(self, data_loader, **kwargs): - self.model.train() - self.mode = 'train' - self.data_loader = data_loader - self._max_iters = self._max_epochs * len(self.data_loader) - self.call_hook('before_train_epoch') - time.sleep(2) # Prevent possible deadlock during epoch transition - for i, data_batch in enumerate(self.data_loader): - self.data_batch = data_batch - self._inner_iter = i - self.call_hook('before_train_iter') - self.run_iter(data_batch, train_mode=True, **kwargs) - self.call_hook('after_train_iter') - del self.data_batch - self._iter += 1 - - self.call_hook('after_train_epoch') - self._epoch += 1 - - @torch.no_grad() - def val(self, data_loader, **kwargs): - self.model.eval() - self.mode = 'val' - self.data_loader = data_loader - self.call_hook('before_val_epoch') - time.sleep(2) # Prevent possible deadlock during epoch transition - for i, data_batch in enumerate(self.data_loader): - self.data_batch = data_batch - self._inner_iter = i - self.call_hook('before_val_iter') - self.run_iter(data_batch, train_mode=False) - self.call_hook('after_val_iter') - del self.data_batch - self.call_hook('after_val_epoch') - - def 
run(self, - data_loaders: List[DataLoader], - workflow: List[Tuple[str, int]], - max_epochs: Optional[int] = None, - **kwargs) -> None: - """Start running. - - Args: - data_loaders (list[:obj:`DataLoader`]): Dataloaders for training - and validation. - workflow (list[tuple]): A list of (phase, epochs) to specify the - running order and epochs. E.g, [('train', 2), ('val', 1)] means - running 2 epochs for training and 1 epoch for validation, - iteratively. - """ - assert isinstance(data_loaders, list) - assert mmcv.is_list_of(workflow, tuple) - assert len(data_loaders) == len(workflow) - if max_epochs is not None: - warnings.warn( - 'setting max_epochs in run is deprecated, ' - 'please set max_epochs in runner_config', DeprecationWarning) - self._max_epochs = max_epochs - - assert self._max_epochs is not None, ( - 'max_epochs must be specified during instantiation') - - for i, flow in enumerate(workflow): - mode, epochs = flow - if mode == 'train': - self._max_iters = self._max_epochs * len(data_loaders[i]) - break - - work_dir = self.work_dir if self.work_dir is not None else 'NONE' - self.logger.info('Start running, host: %s, work_dir: %s', - get_host_info(), work_dir) - self.logger.info('Hooks will be executed in the following order:\n%s', - self.get_hook_info()) - self.logger.info('workflow: %s, max: %d epochs', workflow, - self._max_epochs) - self.call_hook('before_run') - - while self.epoch < self._max_epochs: - for i, flow in enumerate(workflow): - mode, epochs = flow - if isinstance(mode, str): # self.train() - if not hasattr(self, mode): - raise ValueError( - f'runner has no method named "{mode}" to run an ' - 'epoch') - epoch_runner = getattr(self, mode) - else: - raise TypeError( - 'mode in workflow must be a str, but got {}'.format( - type(mode))) - - for _ in range(epochs): - if mode == 'train' and self.epoch >= self._max_epochs: - break - epoch_runner(data_loaders[i], **kwargs) - - time.sleep(1) # wait for some hooks like loggers to finish - self.call_hook('after_run') - - def save_checkpoint(self, - out_dir: str, - filename_tmpl: str = 'epoch_{}.pth', - save_optimizer: bool = True, - meta: Optional[Dict] = None, - create_symlink: bool = True) -> None: - """Save the checkpoint. - - Args: - out_dir (str): The directory that checkpoints are saved. - filename_tmpl (str, optional): The checkpoint filename template, - which contains a placeholder for the epoch number. - Defaults to 'epoch_{}.pth'. - save_optimizer (bool, optional): Whether to save the optimizer to - the checkpoint. Defaults to True. - meta (dict, optional): The meta information to be saved in the - checkpoint. Defaults to None. - create_symlink (bool, optional): Whether to create a symlink - "latest.pth" to point to the latest checkpoint. - Defaults to True. - """ - if meta is None: - meta = {} - elif not isinstance(meta, dict): - raise TypeError( - f'meta should be a dict or None, but got {type(meta)}') - if self.meta is not None: - meta.update(self.meta) - # Note: meta.update(self.meta) should be done before - # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise - # there will be problems with resumed checkpoints. 
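Wiring the pieces above together, a minimal end-to-end run might look like the sketch below. The toy model folds the optimizer step into ``train_step`` because no ``OptimizerHook`` is registered here; in a real setup that hook performs the backward pass from a returned ``loss`` key.

    import torch
    import torch.nn as nn
    from torch.utils.data import DataLoader, TensorDataset

    from mmcv.runner import build_runner
    from mmcv.utils import get_logger


    class ToyModel(nn.Module):

        def __init__(self):
            super().__init__()
            self.fc = nn.Linear(2, 1)

        def train_step(self, data_batch, optimizer, **kwargs):
            x, y = data_batch
            loss = ((self.fc(x) - y)**2).mean()
            # Optimizer step folded in for brevity (normally OptimizerHook).
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            return dict(log_vars=dict(loss=loss.item()), num_samples=x.size(0))


    model = ToyModel()
    runner = build_runner(
        dict(type='EpochBasedRunner', max_epochs=2),
        default_args=dict(
            model=model,
            optimizer=torch.optim.SGD(model.parameters(), lr=0.1),
            work_dir='./toy_work_dir',  # hypothetical scratch directory
            logger=get_logger('toy')))
    data = DataLoader(
        TensorDataset(torch.randn(16, 2), torch.randn(16, 1)), batch_size=4)
    runner.run([data], workflow=[('train', 1)])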
-        # More details in https://github.com/open-mmlab/mmcv/pull/1108
-        meta.update(epoch=self.epoch + 1, iter=self.iter)
-
-        filename = filename_tmpl.format(self.epoch + 1)
-        filepath = osp.join(out_dir, filename)
-        optimizer = self.optimizer if save_optimizer else None
-        save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta)
-        # in some environments, `os.symlink` is not supported, you may need to
-        # set `create_symlink` to False
-        if create_symlink:
-            dst_file = osp.join(out_dir, 'latest.pth')
-            if platform.system() != 'Windows':
-                mmcv.symlink(filename, dst_file)
-            else:
-                shutil.copy(filepath, dst_file)
-
-
-@RUNNERS.register_module()
-class Runner(EpochBasedRunner):
-    """Deprecated name of EpochBasedRunner."""
-
-    def __init__(self, *args, **kwargs):
-        warnings.warn(
-            'Runner was deprecated, please use EpochBasedRunner instead',
-            DeprecationWarning)
-        super().__init__(*args, **kwargs)
diff --git a/mmcv/runner/fp16_utils.py b/mmcv/runner/fp16_utils.py
deleted file mode 100644
index 4674d27a44..0000000000
--- a/mmcv/runner/fp16_utils.py
+++ /dev/null
@@ -1,435 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import functools
-import warnings
-from collections import abc
-from inspect import getfullargspec
-from typing import Callable, Iterable, List, Optional
-
-import numpy as np
-import torch
-import torch.nn as nn
-from torch.nn.parameter import Parameter
-
-from mmcv.utils import TORCH_VERSION, digit_version
-from .dist_utils import allreduce_grads as _allreduce_grads
-
-try:
-    # If PyTorch version >= 1.6.0, torch.cuda.amp.autocast would be imported
-    # and used; otherwise, auto fp16 will adopt mmcv's implementation.
-    # Note that when PyTorch >= 1.6.0, we still cast tensor types to fp16
-    # manually, so the behavior may not be consistent with real amp.
-    from torch.cuda.amp import autocast
-except ImportError:
-    pass
-
-
-def cast_tensor_type(inputs, src_type: torch.dtype, dst_type: torch.dtype):
-    """Recursively convert Tensor in inputs from src_type to dst_type.
-
-    Note:
-        In v1.4.4 and later, ``cast_tensor_type`` will only convert the
-        torch.Tensor which is consistent with ``src_type`` to the ``dst_type``.
-        Before v1.4.4, it ignores the ``src_type`` argument, leading to some
-        potential problems. For example,
-        ``cast_tensor_type(inputs, torch.float, torch.half)`` will convert all
-        tensors in inputs to ``torch.half`` including those originally in
-        ``torch.int`` or other types, which is not expected.
-
-    Args:
-        inputs: Inputs to be cast.
-        src_type (torch.dtype): Source type.
-        dst_type (torch.dtype): Destination type.
-
-    Returns:
-        The same type with inputs, but all contained Tensors have been cast.
-    """
-    if isinstance(inputs, nn.Module):
-        return inputs
-    elif isinstance(inputs, torch.Tensor):
-        # we need to ensure that the type of inputs to be cast is the same
-        # as the argument `src_type`.
- return inputs.to(dst_type) if inputs.dtype == src_type else inputs - elif isinstance(inputs, str): - return inputs - elif isinstance(inputs, np.ndarray): - return inputs - elif isinstance(inputs, abc.Mapping): - return type(inputs)({ # type: ignore - k: cast_tensor_type(v, src_type, dst_type) - for k, v in inputs.items() - }) - elif isinstance(inputs, abc.Iterable): - return type(inputs)( # type: ignore - cast_tensor_type(item, src_type, dst_type) for item in inputs) - else: - return inputs - - -def auto_fp16( - apply_to: Optional[Iterable] = None, - out_fp32: bool = False, - supported_types: tuple = (nn.Module, ), -) -> Callable: - """Decorator to enable fp16 training automatically. - - This decorator is useful when you write custom modules and want to support - mixed precision training. If inputs arguments are fp32 tensors, they will - be converted to fp16 automatically. Arguments other than fp32 tensors are - ignored. If you are using PyTorch >= 1.6, torch.cuda.amp is used as the - backend, otherwise, original mmcv implementation will be adopted. - - Args: - apply_to (Iterable, optional): The argument names to be converted. - `None` indicates all arguments. - out_fp32 (bool): Whether to convert the output back to fp32. - supported_types (tuple): Classes can be decorated by ``auto_fp16``. - `New in version 1.5.0.` - Example: - - >>> import torch.nn as nn - >>> class MyModule1(nn.Module): - >>> - >>> # Convert x and y to fp16 - >>> @auto_fp16() - >>> def forward(self, x, y): - >>> pass - - >>> import torch.nn as nn - >>> class MyModule2(nn.Module): - >>> - >>> # convert pred to fp16 - >>> @auto_fp16(apply_to=('pred', )) - >>> def do_something(self, pred, others): - >>> pass - """ - - def auto_fp16_wrapper(old_func: Callable) -> Callable: - - @functools.wraps(old_func) - def new_func(*args, **kwargs) -> Callable: - # check if the module has set the attribute `fp16_enabled`, if not, - # just fallback to the original method. 
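Concretely, the decorator only activates once the module carries a truthy ``fp16_enabled`` flag (normally set by ``wrap_fp16_model``); a sketch, assuming a CUDA device is available:

    import torch
    import torch.nn as nn
    from mmcv.runner import auto_fp16


    class Head(nn.Module):

        def __init__(self):
            super().__init__()
            self.fp16_enabled = False  # wrap_fp16_model flips this to True
            self.fc = nn.Linear(4, 2)

        @auto_fp16(apply_to=('x', ))
        def forward(self, x):
            return self.fc(x)


    if torch.cuda.is_available():
        head = Head().cuda()
        head.fp16_enabled = True
        # x is cast fp32 -> fp16 before the call; with PyTorch >= 1.6 the
        # body additionally runs under torch.cuda.amp.autocast.
        out = head(torch.randn(2, 4, device='cuda'))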
- if not isinstance(args[0], supported_types): - raise TypeError('@auto_fp16 can only be used to decorate the ' - f'method of those classes {supported_types}') - if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): - return old_func(*args, **kwargs) - - # get the arg spec of the decorated method - args_info = getfullargspec(old_func) - # get the argument names to be casted - args_to_cast = args_info.args if apply_to is None else apply_to - # convert the args that need to be processed - new_args = [] - # NOTE: default args are not taken into consideration - if args: - arg_names = args_info.args[:len(args)] - for i, arg_name in enumerate(arg_names): - if arg_name in args_to_cast: - new_args.append( - cast_tensor_type(args[i], torch.float, torch.half)) - else: - new_args.append(args[i]) - # convert the kwargs that need to be processed - new_kwargs = {} - if kwargs: - for arg_name, arg_value in kwargs.items(): - if arg_name in args_to_cast: - new_kwargs[arg_name] = cast_tensor_type( - arg_value, torch.float, torch.half) - else: - new_kwargs[arg_name] = arg_value - # apply converted arguments to the decorated method - if (TORCH_VERSION != 'parrots' and - digit_version(TORCH_VERSION) >= digit_version('1.6.0')): - with autocast(enabled=True): - output = old_func(*new_args, **new_kwargs) - else: - output = old_func(*new_args, **new_kwargs) - # cast the results back to fp32 if necessary - if out_fp32: - output = cast_tensor_type(output, torch.half, torch.float) - return output - - return new_func - - return auto_fp16_wrapper - - -def force_fp32(apply_to: Optional[Iterable] = None, - out_fp16: bool = False) -> Callable: - """Decorator to convert input arguments to fp32 in force. - - This decorator is useful when you write custom modules and want to support - mixed precision training. If there are some inputs that must be processed - in fp32 mode, then this decorator can handle it. If inputs arguments are - fp16 tensors, they will be converted to fp32 automatically. Arguments other - than fp16 tensors are ignored. If you are using PyTorch >= 1.6, - torch.cuda.amp is used as the backend, otherwise, original mmcv - implementation will be adopted. - - Args: - apply_to (Iterable, optional): The argument names to be converted. - `None` indicates all arguments. - out_fp16 (bool): Whether to convert the output back to fp16. - - Example: - - >>> import torch.nn as nn - >>> class MyModule1(nn.Module): - >>> - >>> # Convert x and y to fp32 - >>> @force_fp32() - >>> def loss(self, x, y): - >>> pass - - >>> import torch.nn as nn - >>> class MyModule2(nn.Module): - >>> - >>> # convert pred to fp32 - >>> @force_fp32(apply_to=('pred', )) - >>> def post_process(self, pred, others): - >>> pass - """ - - def force_fp32_wrapper(old_func): - - @functools.wraps(old_func) - def new_func(*args, **kwargs) -> Callable: - # check if the module has set the attribute `fp16_enabled`, if not, - # just fallback to the original method. 
- if not isinstance(args[0], torch.nn.Module): - raise TypeError('@force_fp32 can only be used to decorate the ' - 'method of nn.Module') - if not (hasattr(args[0], 'fp16_enabled') and args[0].fp16_enabled): - return old_func(*args, **kwargs) - # get the arg spec of the decorated method - args_info = getfullargspec(old_func) - # get the argument names to be casted - args_to_cast = args_info.args if apply_to is None else apply_to - # convert the args that need to be processed - new_args = [] - if args: - arg_names = args_info.args[:len(args)] - for i, arg_name in enumerate(arg_names): - if arg_name in args_to_cast: - new_args.append( - cast_tensor_type(args[i], torch.half, torch.float)) - else: - new_args.append(args[i]) - # convert the kwargs that need to be processed - new_kwargs = dict() - if kwargs: - for arg_name, arg_value in kwargs.items(): - if arg_name in args_to_cast: - new_kwargs[arg_name] = cast_tensor_type( - arg_value, torch.half, torch.float) - else: - new_kwargs[arg_name] = arg_value - # apply converted arguments to the decorated method - if (TORCH_VERSION != 'parrots' and - digit_version(TORCH_VERSION) >= digit_version('1.6.0')): - with autocast(enabled=False): - output = old_func(*new_args, **new_kwargs) - else: - output = old_func(*new_args, **new_kwargs) - # cast the results back to fp32 if necessary - if out_fp16: - output = cast_tensor_type(output, torch.float, torch.half) - return output - - return new_func - - return force_fp32_wrapper - - -def allreduce_grads(params: List[Parameter], - coalesce: bool = True, - bucket_size_mb: int = -1) -> None: - warnings.warn( - '"mmcv.runner.fp16_utils.allreduce_grads" is deprecated, and will be ' - 'removed in v2.8. Please switch to "mmcv.runner.allreduce_grads', - DeprecationWarning) - _allreduce_grads(params, coalesce=coalesce, bucket_size_mb=bucket_size_mb) - - -def wrap_fp16_model(model: nn.Module) -> None: - """Wrap the FP32 model to FP16. - - If you are using PyTorch >= 1.6, torch.cuda.amp is used as the - backend, otherwise, original mmcv implementation will be adopted. - - For PyTorch >= 1.6, this function will - 1. Set fp16 flag inside the model to True. - - Otherwise: - 1. Convert FP32 model to FP16. - 2. Remain some necessary layers to be FP32, e.g., normalization layers. - 3. Set `fp16_enabled` flag inside the model to True. - - Args: - model (nn.Module): Model in FP32. - """ - if (TORCH_VERSION == 'parrots' - or digit_version(TORCH_VERSION) < digit_version('1.6.0')): - # convert model to fp16 - model.half() - # patch the normalization layers to make it work in fp32 mode - patch_norm_fp32(model) - # set `fp16_enabled` flag - for m in model.modules(): - if hasattr(m, 'fp16_enabled'): - m.fp16_enabled = True - - -def patch_norm_fp32(module: nn.Module) -> nn.Module: - """Recursively convert normalization layers from FP16 to FP32. - - Args: - module (nn.Module): The modules to be converted in FP16. - - Returns: - nn.Module: The converted module, the normalization layers have been - converted to FP32. - """ - if isinstance(module, (nn.modules.batchnorm._BatchNorm, nn.GroupNorm)): - module.float() - if isinstance(module, nn.GroupNorm) or torch.__version__ < '1.3': - module.forward = patch_forward_method(module.forward, torch.half, - torch.float) - for child in module.children(): - patch_norm_fp32(child) - return module - - -def patch_forward_method(func: Callable, - src_type: torch.dtype, - dst_type: torch.dtype, - convert_output: bool = True) -> Callable: - """Patch the forward method of a module. 
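Putting ``wrap_fp16_model`` and ``patch_norm_fp32`` together, a sketch of the intended call site:

    import torch.nn as nn
    from mmcv.runner import wrap_fp16_model

    model = nn.Sequential(nn.Conv2d(3, 8, 3), nn.BatchNorm2d(8), nn.ReLU())
    wrap_fp16_model(model)
    # PyTorch >= 1.6: weights stay fp32 and any `fp16_enabled` flags are
    # set, so @auto_fp16/@force_fp32 methods run under autocast.
    # Older PyTorch: model.half() is applied and patch_norm_fp32 keeps the
    # BatchNorm in fp32, casting its inputs and outputs around forward().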
- - Args: - func (callable): The original forward method. - src_type (torch.dtype): Type of input arguments to be converted from. - dst_type (torch.dtype): Type of input arguments to be converted to. - convert_output (bool): Whether to convert the output back to src_type. - - Returns: - callable: The patched forward method. - """ - - def new_forward(*args, **kwargs): - output = func(*cast_tensor_type(args, src_type, dst_type), - **cast_tensor_type(kwargs, src_type, dst_type)) - if convert_output: - output = cast_tensor_type(output, dst_type, src_type) - return output - - return new_forward - - -class LossScaler: - """Class that manages loss scaling in mixed precision training which - supports both dynamic or static mode. - - The implementation refers to - https://github.com/NVIDIA/apex/blob/master/apex/fp16_utils/loss_scaler.py. - Indirectly, by supplying ``mode='dynamic'`` for dynamic loss scaling. - It's important to understand how :class:`LossScaler` operates. - Loss scaling is designed to combat the problem of underflowing - gradients encountered at long times when training fp16 networks. - Dynamic loss scaling begins by attempting a very high loss - scale. Ironically, this may result in OVERflowing gradients. - If overflowing gradients are encountered, :class:`FP16_Optimizer` then - skips the update step for this particular iteration/minibatch, - and :class:`LossScaler` adjusts the loss scale to a lower value. - If a certain number of iterations occur without overflowing gradients - detected,:class:`LossScaler` increases the loss scale once more. - In this way :class:`LossScaler` attempts to "ride the edge" of always - using the highest loss scale possible without incurring overflow. - - Args: - init_scale (float): Initial loss scale value, default: 2**32. - scale_factor (float): Factor used when adjusting the loss scale. - Default: 2. - mode (str): Loss scaling mode. 'dynamic' or 'static' - scale_window (int): Number of consecutive iterations without an - overflow to wait before increasing the loss scale. Default: 1000. 
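The update cycle the class implements, as a sketch (the model, data and hyper-parameters are stand-ins; real training drives this from ``Fp16OptimizerHook``):

    import torch
    import torch.nn as nn
    from mmcv.runner.fp16_utils import LossScaler

    model = nn.Linear(4, 1)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
    scaler = LossScaler(init_scale=2**16, mode='dynamic')

    for _ in range(10):
        optimizer.zero_grad()
        loss = model(torch.randn(8, 4)).pow(2).mean()
        (loss * scaler.loss_scale).backward()  # scale up before backward
        params = list(model.parameters())
        overflow = scaler.has_overflow(params)
        if not overflow:
            for p in params:  # unscale gradients before stepping
                if p.grad is not None:
                    p.grad.div_(scaler.loss_scale)
            optimizer.step()
        scaler.update_scale(overflow)  # grow or shrink the scale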
- """ - - def __init__(self, - init_scale: float = 2**32, - mode: str = 'dynamic', - scale_factor: float = 2., - scale_window: int = 1000): - self.cur_scale = init_scale - self.cur_iter = 0 - assert mode in ('dynamic', - 'static'), 'mode can only be dynamic or static' - self.mode = mode - self.last_overflow_iter = -1 - self.scale_factor = scale_factor - self.scale_window = scale_window - - def has_overflow(self, params: List[Parameter]) -> bool: - """Check if params contain overflow.""" - if self.mode != 'dynamic': - return False - for p in params: - if p.grad is not None and LossScaler._has_inf_or_nan(p.grad.data): - return True - return False - - def _has_inf_or_nan(x: torch.Tensor) -> bool: - """Check if params contain NaN.""" - try: - cpu_sum = float(x.float().sum()) - except RuntimeError as instance: - if 'value cannot be converted' not in instance.args[0]: - raise - return True - else: - if cpu_sum == float('inf') or cpu_sum == -float('inf') \ - or cpu_sum != cpu_sum: - return True - return False - - def update_scale(self, overflow: bool) -> None: - """update the current loss scale value when overflow happens.""" - if self.mode != 'dynamic': - return - if overflow: - self.cur_scale = max(self.cur_scale / self.scale_factor, 1) - self.last_overflow_iter = self.cur_iter - else: - if (self.cur_iter - self.last_overflow_iter) % \ - self.scale_window == 0: - self.cur_scale *= self.scale_factor - self.cur_iter += 1 - - def state_dict(self) -> dict: - """Returns the state of the scaler as a :class:`dict`.""" - return dict( - cur_scale=self.cur_scale, - cur_iter=self.cur_iter, - mode=self.mode, - last_overflow_iter=self.last_overflow_iter, - scale_factor=self.scale_factor, - scale_window=self.scale_window) - - def load_state_dict(self, state_dict: dict) -> None: - """Loads the loss_scaler state dict. - - Args: - state_dict (dict): scaler state. - """ - self.cur_scale = state_dict['cur_scale'] - self.cur_iter = state_dict['cur_iter'] - self.mode = state_dict['mode'] - self.last_overflow_iter = state_dict['last_overflow_iter'] - self.scale_factor = state_dict['scale_factor'] - self.scale_window = state_dict['scale_window'] - - @property - def loss_scale(self) -> float: - return self.cur_scale diff --git a/mmcv/runner/hooks/__init__.py b/mmcv/runner/hooks/__init__.py deleted file mode 100644 index 03e2a619e8..0000000000 --- a/mmcv/runner/hooks/__init__.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from .checkpoint import CheckpointHook -from .closure import ClosureHook -from .ema import EMAHook -from .evaluation import DistEvalHook, EvalHook -from .hook import HOOKS, Hook -from .iter_timer import IterTimerHook -from .logger import (ClearMLLoggerHook, DvcliveLoggerHook, LoggerHook, - MlflowLoggerHook, NeptuneLoggerHook, PaviLoggerHook, - SegmindLoggerHook, TensorboardLoggerHook, TextLoggerHook, - WandbLoggerHook) -from .lr_updater import (CosineAnnealingLrUpdaterHook, - CosineRestartLrUpdaterHook, CyclicLrUpdaterHook, - ExpLrUpdaterHook, FixedLrUpdaterHook, - FlatCosineAnnealingLrUpdaterHook, InvLrUpdaterHook, - LinearAnnealingLrUpdaterHook, LrUpdaterHook, - OneCycleLrUpdaterHook, PolyLrUpdaterHook, - StepLrUpdaterHook) -from .memory import EmptyCacheHook -from .momentum_updater import (CosineAnnealingMomentumUpdaterHook, - CyclicMomentumUpdaterHook, - LinearAnnealingMomentumUpdaterHook, - MomentumUpdaterHook, - OneCycleMomentumUpdaterHook, - StepMomentumUpdaterHook) -from .optimizer import (Fp16OptimizerHook, GradientCumulativeFp16OptimizerHook, - GradientCumulativeOptimizerHook, OptimizerHook) -from .profiler import ProfilerHook -from .sampler_seed import DistSamplerSeedHook -from .sync_buffer import SyncBuffersHook - -__all__ = [ - 'HOOKS', 'Hook', 'CheckpointHook', 'ClosureHook', 'LrUpdaterHook', - 'FixedLrUpdaterHook', 'StepLrUpdaterHook', 'ExpLrUpdaterHook', - 'PolyLrUpdaterHook', 'InvLrUpdaterHook', 'CosineAnnealingLrUpdaterHook', - 'FlatCosineAnnealingLrUpdaterHook', 'CosineRestartLrUpdaterHook', - 'CyclicLrUpdaterHook', 'OneCycleLrUpdaterHook', 'OptimizerHook', - 'Fp16OptimizerHook', 'IterTimerHook', 'DistSamplerSeedHook', - 'EmptyCacheHook', 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', - 'TextLoggerHook', 'TensorboardLoggerHook', 'NeptuneLoggerHook', - 'WandbLoggerHook', 'DvcliveLoggerHook', 'MomentumUpdaterHook', - 'StepMomentumUpdaterHook', 'CosineAnnealingMomentumUpdaterHook', - 'CyclicMomentumUpdaterHook', 'OneCycleMomentumUpdaterHook', - 'SyncBuffersHook', 'EMAHook', 'EvalHook', 'DistEvalHook', 'ProfilerHook', - 'GradientCumulativeOptimizerHook', 'GradientCumulativeFp16OptimizerHook', - 'SegmindLoggerHook', 'LinearAnnealingLrUpdaterHook', - 'LinearAnnealingMomentumUpdaterHook', 'ClearMLLoggerHook' -] diff --git a/mmcv/runner/hooks/checkpoint.py b/mmcv/runner/hooks/checkpoint.py deleted file mode 100644 index 8a74c7229f..0000000000 --- a/mmcv/runner/hooks/checkpoint.py +++ /dev/null @@ -1,169 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import warnings -from typing import Optional - -from mmengine.fileio import FileClient - -from ..dist_utils import allreduce_params, master_only -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class CheckpointHook(Hook): - """Save checkpoints periodically. - - Args: - interval (int): The saving period. If ``by_epoch=True``, interval - indicates epochs, otherwise it indicates iterations. - Default: -1, which means "never". - by_epoch (bool): Saving checkpoints by epoch or by iteration. - Default: True. - save_optimizer (bool): Whether to save optimizer state_dict in the - checkpoint. It is usually used for resuming experiments. - Default: True. - out_dir (str, optional): The root directory to save checkpoints. If not - specified, ``runner.work_dir`` will be used by default. If - specified, the ``out_dir`` will be the concatenation of ``out_dir`` - and the last level directory of ``runner.work_dir``. 
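All of these hooks register through the same ``HOOKS`` registry, so downstream code can add its own; a sketch with a hypothetical hook name:

    from mmcv.runner import HOOKS, Hook


    @HOOKS.register_module()
    class LossWatchHook(Hook):  # hypothetical example hook
        """Log the latest loss after every training iteration."""

        def after_train_iter(self, runner):
            loss = runner.log_buffer.output.get('loss')
            if loss is not None:
                runner.logger.info(f'iter {runner.iter + 1}: loss={loss}')

In a config-driven setup it would then be enabled with ``custom_hooks = [dict(type='LossWatchHook')]``.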
-            `Changed in version 1.3.16.`
-        max_keep_ckpts (int, optional): The maximum checkpoints to keep.
-            In some cases we want only the latest few checkpoints and would
-            like to delete old ones to save the disk space.
-            Default: -1, which means unlimited.
-        save_last (bool, optional): Whether to force the last checkpoint to be
-            saved regardless of interval. Default: True.
-        sync_buffer (bool, optional): Whether to synchronize buffers in
-            different gpus. Default: False.
-        file_client_args (dict, optional): Arguments to instantiate a
-            FileClient. See :class:`mmengine.fileio.FileClient` for details.
-            Default: None.
-            `New in version 1.3.16.`
-
-    .. warning::
-        Before v1.3.16, the ``out_dir`` argument indicates the path where the
-        checkpoint is stored. However, since v1.3.16, ``out_dir`` indicates the
-        root directory and the final path to save checkpoint is the
-        concatenation of ``out_dir`` and the last level directory of
-        ``runner.work_dir``. Suppose the value of ``out_dir`` is "/path/of/A"
-        and the value of ``runner.work_dir`` is "/path/of/B", then the final
-        path will be "/path/of/A/B".
-    """
-
-    def __init__(self,
-                 interval: int = -1,
-                 by_epoch: bool = True,
-                 save_optimizer: bool = True,
-                 out_dir: Optional[str] = None,
-                 max_keep_ckpts: int = -1,
-                 save_last: bool = True,
-                 sync_buffer: bool = False,
-                 file_client_args: Optional[dict] = None,
-                 **kwargs):
-        self.interval = interval
-        self.by_epoch = by_epoch
-        self.save_optimizer = save_optimizer
-        self.out_dir = out_dir
-        self.max_keep_ckpts = max_keep_ckpts
-        self.save_last = save_last
-        self.args = kwargs
-        self.sync_buffer = sync_buffer
-        self.file_client_args = file_client_args
-
-    def before_run(self, runner):
-        if not self.out_dir:
-            self.out_dir = runner.work_dir
-
-        self.file_client = FileClient.infer_client(self.file_client_args,
-                                                   self.out_dir)
-
-        # if `self.out_dir` is not equal to `runner.work_dir`, it means that
-        # `self.out_dir` is set so the final `self.out_dir` is the
-        # concatenation of `self.out_dir` and the last level directory of
-        # `runner.work_dir`
-        if self.out_dir != runner.work_dir:
-            basename = osp.basename(runner.work_dir.rstrip(osp.sep))
-            self.out_dir = self.file_client.join_path(self.out_dir, basename)
-
-        runner.logger.info(f'Checkpoints will be saved to {self.out_dir} by '
-                           f'{self.file_client.name}.')
-
-        # disable the create_symlink option because some file backends do not
-        # allow creating a symlink
-        if 'create_symlink' in self.args:
-            if self.args[
-                    'create_symlink'] and not self.file_client.allow_symlink:
-                self.args['create_symlink'] = False
-                warnings.warn(
-                    'create_symlink is set as True by the user but is changed '
-                    'to be False because creating symbolic link is not '
-                    f'allowed in {self.file_client.name}')
-        else:
-            self.args['create_symlink'] = self.file_client.allow_symlink
-
-    def after_train_epoch(self, runner):
-        if not self.by_epoch:
-            return
-
-        # save checkpoint for following cases:
-        # 1. every ``self.interval`` epochs
-        # 2.
reach the last epoch of training - if self.every_n_epochs( - runner, self.interval) or (self.save_last - and self.is_last_epoch(runner)): - runner.logger.info( - f'Saving checkpoint at {runner.epoch + 1} epochs') - if self.sync_buffer: - allreduce_params(runner.model.buffers()) - self._save_checkpoint(runner) - - @master_only - def _save_checkpoint(self, runner): - """Save the current checkpoint and delete unwanted checkpoint.""" - runner.save_checkpoint( - self.out_dir, save_optimizer=self.save_optimizer, **self.args) - if runner.meta is not None: - if self.by_epoch: - cur_ckpt_filename = self.args.get( - 'filename_tmpl', 'epoch_{}.pth').format(runner.epoch + 1) - else: - cur_ckpt_filename = self.args.get( - 'filename_tmpl', 'iter_{}.pth').format(runner.iter + 1) - runner.meta.setdefault('hook_msgs', dict()) - runner.meta['hook_msgs']['last_ckpt'] = self.file_client.join_path( - self.out_dir, cur_ckpt_filename) - # remove other checkpoints - if self.max_keep_ckpts > 0: - if self.by_epoch: - name = 'epoch_{}.pth' - current_ckpt = runner.epoch + 1 - else: - name = 'iter_{}.pth' - current_ckpt = runner.iter + 1 - redundant_ckpts = range( - current_ckpt - self.max_keep_ckpts * self.interval, 0, - -self.interval) - filename_tmpl = self.args.get('filename_tmpl', name) - for _step in redundant_ckpts: - ckpt_path = self.file_client.join_path( - self.out_dir, filename_tmpl.format(_step)) - if self.file_client.isfile(ckpt_path): - self.file_client.remove(ckpt_path) - else: - break - - def after_train_iter(self, runner): - if self.by_epoch: - return - - # save checkpoint for following cases: - # 1. every ``self.interval`` iterations - # 2. reach the last iteration of training - if self.every_n_iters( - runner, self.interval) or (self.save_last - and self.is_last_iter(runner)): - runner.logger.info( - f'Saving checkpoint at {runner.iter + 1} iterations') - if self.sync_buffer: - allreduce_params(runner.model.buffers()) - self._save_checkpoint(runner) diff --git a/mmcv/runner/hooks/closure.py b/mmcv/runner/hooks/closure.py deleted file mode 100644 index 73a3e6a90e..0000000000 --- a/mmcv/runner/hooks/closure.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable - -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class ClosureHook(Hook): - - def __init__(self, fn_name: str, fn: Callable): - assert hasattr(self, fn_name) - assert callable(fn) - setattr(self, fn_name, fn) diff --git a/mmcv/runner/hooks/ema.py b/mmcv/runner/hooks/ema.py deleted file mode 100644 index b5b578e5e3..0000000000 --- a/mmcv/runner/hooks/ema.py +++ /dev/null @@ -1,91 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Optional - -from ...parallel import is_module_wrapper -from ..hooks.hook import HOOKS, Hook - - -@HOOKS.register_module() -class EMAHook(Hook): - r"""Exponential Moving Average Hook. - - Use Exponential Moving Average on all parameters of model in training - process. All parameters have a ema backup, which update by the formula - as below. EMAHook takes priority over EvalHook and CheckpointSaverHook. - - .. math:: - - Xema\_{t+1} = (1 - \text{momentum}) \times - Xema\_{t} + \text{momentum} \times X_t - - Args: - momentum (float): The momentum used for updating ema parameter. - Defaults to 0.0002. - interval (int): Update ema parameter every interval iteration. - Defaults to 1. - warm_up (int): During first warm_up steps, we may use smaller momentum - to update ema parameters more slowly. Defaults to 100. 
- resume_from (str, optional): The checkpoint path. Defaults to None. - """ - - def __init__(self, - momentum: float = 0.0002, - interval: int = 1, - warm_up: int = 100, - resume_from: Optional[str] = None): - assert isinstance(interval, int) and interval > 0 - self.warm_up = warm_up - self.interval = interval - assert momentum > 0 and momentum < 1 - self.momentum = momentum**interval - self.checkpoint = resume_from - - def before_run(self, runner): - """To resume model with it's ema parameters more friendly. - - Register ema parameter as ``named_buffer`` to model - """ - model = runner.model - if is_module_wrapper(model): - model = model.module - self.param_ema_buffer = {} - self.model_parameters = dict(model.named_parameters(recurse=True)) - for name, value in self.model_parameters.items(): - # "." is not allowed in module's buffer name - buffer_name = f"ema_{name.replace('.', '_')}" - self.param_ema_buffer[name] = buffer_name - model.register_buffer(buffer_name, value.data.clone()) - self.model_buffers = dict(model.named_buffers(recurse=True)) - if self.checkpoint is not None: - runner.resume(self.checkpoint) - - def after_train_iter(self, runner): - """Update ema parameter every self.interval iterations.""" - curr_step = runner.iter - # We warm up the momentum considering the instability at beginning - momentum = min(self.momentum, - (1 + curr_step) / (self.warm_up + curr_step)) - if curr_step % self.interval != 0: - return - for name, parameter in self.model_parameters.items(): - buffer_name = self.param_ema_buffer[name] - buffer_parameter = self.model_buffers[buffer_name] - buffer_parameter.mul_(1 - momentum).add_(momentum, parameter.data) - - def after_train_epoch(self, runner): - """We load parameter values from ema backup to model before the - EvalHook.""" - self._swap_ema_parameters() - - def before_train_epoch(self, runner): - """We recover model's parameter from ema backup after last epoch's - EvalHook.""" - self._swap_ema_parameters() - - def _swap_ema_parameters(self): - """Swap the parameter of model with parameter in ema_buffer.""" - for name, value in self.model_parameters.items(): - temp = value.data.clone() - ema_buffer = self.model_buffers[self.param_ema_buffer[name]] - value.data.copy_(ema_buffer.data) - ema_buffer.data.copy_(temp) diff --git a/mmcv/runner/hooks/evaluation.py b/mmcv/runner/hooks/evaluation.py deleted file mode 100644 index 3437cd40d7..0000000000 --- a/mmcv/runner/hooks/evaluation.py +++ /dev/null @@ -1,515 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import warnings -from math import inf -from typing import Callable, List, Optional - -import torch.distributed as dist -from mmengine.fileio import FileClient -from torch.nn.modules.batchnorm import _BatchNorm -from torch.utils.data import DataLoader - -from mmcv.utils import is_seq_of -from .hook import Hook -from .logger import LoggerHook - - -class EvalHook(Hook): - """Non-Distributed evaluation hook. - - This hook will regularly perform evaluation in a given interval when - performing in non-distributed environment. - - Args: - dataloader (DataLoader): A PyTorch dataloader, whose dataset has - implemented ``evaluate`` function. - start (int | None, optional): Evaluation starting epoch. It enables - evaluation before the training starts if ``start`` <= the resuming - epoch. If None, whether to evaluate is merely decided by - ``interval``. Default: None. - interval (int): Evaluation interval. Default: 1. 
- by_epoch (bool): Determine perform evaluation by epoch or by iteration. - If set to True, it will perform by epoch. Otherwise, by iteration. - Default: True. - save_best (str, optional): If a metric is specified, it would measure - the best checkpoint during evaluation. The information about best - checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep - best score value and best checkpoint path, which will be also - loaded when resume checkpoint. Options are the evaluation metrics - on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox - detection and instance segmentation. ``AR@100`` for proposal - recall. If ``save_best`` is ``auto``, the first key of the returned - ``OrderedDict`` result will be used. Default: None. - rule (str | None, optional): Comparison rule for best score. If set to - None, it will infer a reasonable rule. Keys such as 'acc', 'top' - .etc will be inferred by 'greater' rule. Keys contain 'loss' will - be inferred by 'less' rule. Options are 'greater', 'less', None. - Default: None. - test_fn (callable, optional): test a model with samples from a - dataloader, and return the test results. If ``None``, the default - test function ``mmcv.engine.single_gpu_test`` will be used. - (default: ``None``) - greater_keys (List[str] | None, optional): Metric keys that will be - inferred by 'greater' comparison rule. If ``None``, - _default_greater_keys will be used. (default: ``None``) - less_keys (List[str] | None, optional): Metric keys that will be - inferred by 'less' comparison rule. If ``None``, _default_less_keys - will be used. (default: ``None``) - out_dir (str, optional): The root directory to save checkpoints. If not - specified, `runner.work_dir` will be used by default. If specified, - the `out_dir` will be the concatenation of `out_dir` and the last - level directory of `runner.work_dir`. - `New in version 1.3.16.` - file_client_args (dict): Arguments to instantiate a FileClient. - See :class:`mmengine.fileio.FileClient` for details. Default: None. - `New in version 1.3.16.` - **eval_kwargs: Evaluation arguments fed into the evaluate function of - the dataset. - - Note: - If new arguments are added for EvalHook, tools/test.py, - tools/eval_metric.py may be affected. - """ - - # Since the key for determine greater or less is related to the downstream - # tasks, downstream repos may need to overwrite the following inner - # variable accordingly. 
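The inference described here reduces to membership and substring tests against the two key lists defined just below; a standalone sketch that mirrors (but simplifies) ``_init_rule``:

    greater_keys = ['acc', 'top', 'AR@', 'auc', 'precision', 'mAP',
                    'mDice', 'mIoU', 'mAcc', 'aAcc']
    less_keys = ['loss']


    def infer_rule(key_indicator: str) -> str:
        key = key_indicator.lower()
        if any(k.lower() in key for k in greater_keys):
            return 'greater'  # e.g. 'bbox_mAP' -> higher is better
        if any(k.lower() in key for k in less_keys):
            return 'less'  # e.g. 'val_loss' -> lower is better
        raise ValueError(f'cannot infer rule for {key_indicator}')


    assert infer_rule('bbox_mAP') == 'greater'
    assert infer_rule('val_loss') == 'less'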
- - rule_map = {'greater': lambda x, y: x > y, 'less': lambda x, y: x < y} - init_value_map = {'greater': -inf, 'less': inf} - _default_greater_keys = [ - 'acc', 'top', 'AR@', 'auc', 'precision', 'mAP', 'mDice', 'mIoU', - 'mAcc', 'aAcc' - ] - _default_less_keys = ['loss'] - - def __init__(self, - dataloader: DataLoader, - start: Optional[int] = None, - interval: int = 1, - by_epoch: bool = True, - save_best: Optional[str] = None, - rule: Optional[str] = None, - test_fn: Optional[Callable] = None, - greater_keys: Optional[List[str]] = None, - less_keys: Optional[List[str]] = None, - out_dir: Optional[str] = None, - file_client_args: Optional[dict] = None, - **eval_kwargs): - if not isinstance(dataloader, DataLoader): - raise TypeError(f'dataloader must be a pytorch DataLoader, ' - f'but got {type(dataloader)}') - - if interval <= 0: - raise ValueError(f'interval must be a positive number, ' - f'but got {interval}') - - assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean' - - if start is not None and start < 0: - raise ValueError(f'The evaluation start epoch {start} is smaller ' - f'than 0') - - self.dataloader = dataloader - self.interval = interval - self.start = start - self.by_epoch = by_epoch - - assert isinstance(save_best, str) or save_best is None, \ - '""save_best"" should be a str or None ' \ - f'rather than {type(save_best)}' - self.save_best = save_best - self.eval_kwargs = eval_kwargs - self.initial_flag = True - - if test_fn is None: - from mmcv.engine import single_gpu_test - self.test_fn = single_gpu_test - else: - self.test_fn = test_fn - - if greater_keys is None: - self.greater_keys = self._default_greater_keys - else: - if not isinstance(greater_keys, (list, tuple)): - assert isinstance(greater_keys, str) - greater_keys = (greater_keys, ) - assert is_seq_of(greater_keys, str) - self.greater_keys = greater_keys - - if less_keys is None: - self.less_keys = self._default_less_keys - else: - if not isinstance(less_keys, (list, tuple)): - assert isinstance(greater_keys, str) - less_keys = (less_keys, ) - assert is_seq_of(less_keys, str) - self.less_keys = less_keys - - if self.save_best is not None: - self.best_ckpt_path = None - self._init_rule(rule, self.save_best) - - self.out_dir = out_dir - self.file_client_args = file_client_args - - def _init_rule(self, rule: Optional[str], key_indicator: str): - """Initialize rule, key_indicator, comparison_func, and best score. - - Here is the rule to determine which rule is used for key indicator - when the rule is not specific (note that the key indicator matching - is case-insensitive): - 1. If the key indicator is in ``self.greater_keys``, the rule will be - specified as 'greater'. - 2. Or if the key indicator is in ``self.less_keys``, the rule will be - specified as 'less'. - 3. Or if any one item in ``self.greater_keys`` is a substring of - key_indicator , the rule will be specified as 'greater'. - 4. Or if any one item in ``self.less_keys`` is a substring of - key_indicator , the rule will be specified as 'less'. - - Args: - rule (str | None): Comparison rule for best score. - key_indicator (str | None): Key indicator to determine the - comparison rule. 
- """ - if rule not in self.rule_map and rule is not None: - raise KeyError(f'rule must be greater, less or None, ' - f'but got {rule}.') - - if rule is None: - if key_indicator != 'auto': - # `_lc` here means we use the lower case of keys for - # case-insensitive matching - assert isinstance(key_indicator, str) - key_indicator_lc = key_indicator.lower() - greater_keys = [key.lower() for key in self.greater_keys] - less_keys = [key.lower() for key in self.less_keys] - - if key_indicator_lc in greater_keys: - rule = 'greater' - elif key_indicator_lc in less_keys: - rule = 'less' - elif any(key in key_indicator_lc for key in greater_keys): - rule = 'greater' - elif any(key in key_indicator_lc for key in less_keys): - rule = 'less' - else: - raise ValueError(f'Cannot infer the rule for key ' - f'{key_indicator}, thus a specific rule ' - f'must be specified.') - self.rule = rule - self.key_indicator = key_indicator - if self.rule is not None: - self.compare_func = self.rule_map[self.rule] - - def before_run(self, runner): - if not self.out_dir: - self.out_dir = runner.work_dir - - self.file_client = FileClient.infer_client(self.file_client_args, - self.out_dir) - - # if `self.out_dir` is not equal to `runner.work_dir`, it means that - # `self.out_dir` is set so the final `self.out_dir` is the - # concatenation of `self.out_dir` and the last level directory of - # `runner.work_dir` - if self.out_dir != runner.work_dir: - basename = osp.basename(runner.work_dir.rstrip(osp.sep)) - self.out_dir = self.file_client.join_path(self.out_dir, basename) - runner.logger.info( - f'The best checkpoint will be saved to {self.out_dir} by ' - f'{self.file_client.name}') - - if self.save_best is not None: - if runner.meta is None: - warnings.warn('runner.meta is None. Creating an empty one.') - runner.meta = dict() - runner.meta.setdefault('hook_msgs', dict()) - self.best_ckpt_path = runner.meta['hook_msgs'].get( - 'best_ckpt', None) - - def before_train_iter(self, runner): - """Evaluate the model only at the start of training by iteration.""" - if self.by_epoch or not self.initial_flag: - return - if self.start is not None and runner.iter >= self.start: - self.after_train_iter(runner) - self.initial_flag = False - - def before_train_epoch(self, runner): - """Evaluate the model only at the start of training by epoch.""" - if not (self.by_epoch and self.initial_flag): - return - if self.start is not None and runner.epoch >= self.start: - self.after_train_epoch(runner) - self.initial_flag = False - - def after_train_iter(self, runner): - """Called after every training iter to evaluate the results.""" - if not self.by_epoch and self._should_evaluate(runner): - # Because the priority of EvalHook is higher than LoggerHook, the - # training log and the evaluating log are mixed. Therefore, - # we need to dump the training log and clear it before evaluating - # log is generated. In addition, this problem will only appear in - # `IterBasedRunner` whose `self.by_epoch` is False, because - # `EpochBasedRunner` whose `self.by_epoch` is True calls - # `_do_evaluate` in `after_train_epoch` stage, and at this stage - # the training log has been printed, so it will not cause any - # problem. 
more details at - # https://github.com/open-mmlab/mmsegmentation/issues/694 - for hook in runner._hooks: - if isinstance(hook, LoggerHook): - hook.after_train_iter(runner) - runner.log_buffer.clear() - - self._do_evaluate(runner) - - def after_train_epoch(self, runner): - """Called after every training epoch to evaluate the results.""" - if self.by_epoch and self._should_evaluate(runner): - self._do_evaluate(runner) - - def _do_evaluate(self, runner): - """perform evaluation and save ckpt.""" - results = self.test_fn(runner.model, self.dataloader) - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.evaluate(runner, results) - # the key_score may be `None` so it needs to skip the action to save - # the best checkpoint - if self.save_best and key_score: - self._save_ckpt(runner, key_score) - - def _should_evaluate(self, runner): - """Judge whether to perform evaluation. - - Here is the rule to judge whether to perform evaluation: - 1. It will not perform evaluation during the epoch/iteration interval, - which is determined by ``self.interval``. - 2. It will not perform evaluation if the start time is larger than - current time. - 3. It will not perform evaluation when current time is larger than - the start time but during epoch/iteration interval. - - Returns: - bool: The flag indicating whether to perform evaluation. - """ - if self.by_epoch: - current = runner.epoch - check_time = self.every_n_epochs - else: - current = runner.iter - check_time = self.every_n_iters - - if self.start is None: - if not check_time(runner, self.interval): - # No evaluation during the interval. - return False - elif (current + 1) < self.start: - # No evaluation if start is larger than the current time. - return False - else: - # Evaluation only at epochs/iters 3, 5, 7... - # if start==3 and interval==2 - if (current + 1 - self.start) % self.interval: - return False - return True - - def _save_ckpt(self, runner, key_score): - """Save the best checkpoint. - - It will compare the score according to the compare function, write - related information (best score, best checkpoint path) and save the - best checkpoint into ``work_dir``. - """ - if self.by_epoch: - current = f'epoch_{runner.epoch + 1}' - cur_type, cur_time = 'epoch', runner.epoch + 1 - else: - current = f'iter_{runner.iter + 1}' - cur_type, cur_time = 'iter', runner.iter + 1 - - best_score = runner.meta['hook_msgs'].get( - 'best_score', self.init_value_map[self.rule]) - if self.compare_func(key_score, best_score): - best_score = key_score - runner.meta['hook_msgs']['best_score'] = best_score - - if self.best_ckpt_path and self.file_client.isfile( - self.best_ckpt_path): - self.file_client.remove(self.best_ckpt_path) - runner.logger.info( - f'The previous best checkpoint {self.best_ckpt_path} was ' - 'removed') - - best_ckpt_name = f'best_{self.key_indicator}_{current}.pth' - self.best_ckpt_path = self.file_client.join_path( - self.out_dir, best_ckpt_name) - runner.meta['hook_msgs']['best_ckpt'] = self.best_ckpt_path - - runner.save_checkpoint( - self.out_dir, - filename_tmpl=best_ckpt_name, - create_symlink=False) - runner.logger.info( - f'Now best checkpoint is saved as {best_ckpt_name}.') - runner.logger.info( - f'Best {self.key_indicator} is {best_score:0.4f} ' - f'at {cur_time} {cur_type}.') - - def evaluate(self, runner, results): - """Evaluate the results. - - Args: - runner (:obj:`mmcv.Runner`): The underlined training runner. - results (list): Output results. 
- """ - eval_res = self.dataloader.dataset.evaluate( - results, logger=runner.logger, **self.eval_kwargs) - - for name, val in eval_res.items(): - runner.log_buffer.output[name] = val - runner.log_buffer.ready = True - - if self.save_best is not None: - # If the performance of model is pool, the `eval_res` may be an - # empty dict and it will raise exception when `self.save_best` is - # not None. More details at - # https://github.com/open-mmlab/mmdetection/issues/6265. - if not eval_res: - warnings.warn( - 'Since `eval_res` is an empty dict, the behavior to save ' - 'the best checkpoint will be skipped in this evaluation.') - return None - - if self.key_indicator == 'auto': - # infer from eval_results - self._init_rule(self.rule, list(eval_res.keys())[0]) - return eval_res[self.key_indicator] - - return None - - -class DistEvalHook(EvalHook): - """Distributed evaluation hook. - - This hook will regularly perform evaluation in a given interval when - performing in distributed environment. - - Args: - dataloader (DataLoader): A PyTorch dataloader, whose dataset has - implemented ``evaluate`` function. - start (int | None, optional): Evaluation starting epoch. It enables - evaluation before the training starts if ``start`` <= the resuming - epoch. If None, whether to evaluate is merely decided by - ``interval``. Default: None. - interval (int): Evaluation interval. Default: 1. - by_epoch (bool): Determine perform evaluation by epoch or by iteration. - If set to True, it will perform by epoch. Otherwise, by iteration. - default: True. - save_best (str, optional): If a metric is specified, it would measure - the best checkpoint during evaluation. The information about best - checkpoint would be saved in ``runner.meta['hook_msgs']`` to keep - best score value and best checkpoint path, which will be also - loaded when resume checkpoint. Options are the evaluation metrics - on the test dataset. e.g., ``bbox_mAP``, ``segm_mAP`` for bbox - detection and instance segmentation. ``AR@100`` for proposal - recall. If ``save_best`` is ``auto``, the first key of the returned - ``OrderedDict`` result will be used. Default: None. - rule (str | None, optional): Comparison rule for best score. If set to - None, it will infer a reasonable rule. Keys such as 'acc', 'top' - .etc will be inferred by 'greater' rule. Keys contain 'loss' will - be inferred by 'less' rule. Options are 'greater', 'less', None. - Default: None. - test_fn (callable, optional): test a model with samples from a - dataloader in a multi-gpu manner, and return the test results. If - ``None``, the default test function ``mmcv.engine.multi_gpu_test`` - will be used. (default: ``None``) - tmpdir (str | None): Temporary directory to save the results of all - processes. Default: None. - gpu_collect (bool): Whether to use gpu or cpu to collect results. - Default: False. - broadcast_bn_buffer (bool): Whether to broadcast the - buffer(running_mean and running_var) of rank 0 to other rank - before evaluation. Default: True. - out_dir (str, optional): The root directory to save checkpoints. If not - specified, `runner.work_dir` will be used by default. If specified, - the `out_dir` will be the concatenation of `out_dir` and the last - level directory of `runner.work_dir`. - file_client_args (dict): Arguments to instantiate a FileClient. - See :class:`mmengine.fileio.FileClient` for details. Default: None. - **eval_kwargs: Evaluation arguments fed into the evaluate function of - the dataset. 
- """ - - def __init__(self, - dataloader: DataLoader, - start: Optional[int] = None, - interval: int = 1, - by_epoch: bool = True, - save_best: Optional[str] = None, - rule: Optional[str] = None, - test_fn: Optional[Callable] = None, - greater_keys: Optional[List[str]] = None, - less_keys: Optional[List[str]] = None, - broadcast_bn_buffer: bool = True, - tmpdir: Optional[str] = None, - gpu_collect: bool = False, - out_dir: Optional[str] = None, - file_client_args: Optional[dict] = None, - **eval_kwargs): - - if test_fn is None: - from mmcv.engine import multi_gpu_test - test_fn = multi_gpu_test - - super().__init__( - dataloader, - start=start, - interval=interval, - by_epoch=by_epoch, - save_best=save_best, - rule=rule, - test_fn=test_fn, - greater_keys=greater_keys, - less_keys=less_keys, - out_dir=out_dir, - file_client_args=file_client_args, - **eval_kwargs) - - self.broadcast_bn_buffer = broadcast_bn_buffer - self.tmpdir = tmpdir - self.gpu_collect = gpu_collect - - def _do_evaluate(self, runner): - """perform evaluation and save ckpt.""" - # Synchronization of BatchNorm's buffer (running_mean - # and running_var) is not supported in the DDP of pytorch, - # which may cause the inconsistent performance of models in - # different ranks, so we broadcast BatchNorm's buffers - # of rank 0 to other ranks to avoid this. - if self.broadcast_bn_buffer: - model = runner.model - for name, module in model.named_modules(): - if isinstance(module, - _BatchNorm) and module.track_running_stats: - dist.broadcast(module.running_var, 0) - dist.broadcast(module.running_mean, 0) - - tmpdir = self.tmpdir - if tmpdir is None: - tmpdir = osp.join(runner.work_dir, '.eval_hook') - - results = self.test_fn( - runner.model, - self.dataloader, - tmpdir=tmpdir, - gpu_collect=self.gpu_collect) - if runner.rank == 0: - print('\n') - runner.log_buffer.output['eval_iter_num'] = len(self.dataloader) - key_score = self.evaluate(runner, results) - # the key_score may be `None` so it needs to skip the action to - # save the best checkpoint - if self.save_best and key_score: - self._save_ckpt(runner, key_score) diff --git a/mmcv/runner/hooks/hook.py b/mmcv/runner/hooks/hook.py deleted file mode 100644 index f2d1c9865b..0000000000 --- a/mmcv/runner/hooks/hook.py +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from mmcv.utils import Registry, is_method_overridden - -HOOKS = Registry('hook') - - -class Hook: - stages = ('before_run', 'before_train_epoch', 'before_train_iter', - 'after_train_iter', 'after_train_epoch', 'before_val_epoch', - 'before_val_iter', 'after_val_iter', 'after_val_epoch', - 'after_run') - - def before_run(self, runner): - pass - - def after_run(self, runner): - pass - - def before_epoch(self, runner): - pass - - def after_epoch(self, runner): - pass - - def before_iter(self, runner): - pass - - def after_iter(self, runner): - pass - - def before_train_epoch(self, runner): - self.before_epoch(runner) - - def before_val_epoch(self, runner): - self.before_epoch(runner) - - def after_train_epoch(self, runner): - self.after_epoch(runner) - - def after_val_epoch(self, runner): - self.after_epoch(runner) - - def before_train_iter(self, runner): - self.before_iter(runner) - - def before_val_iter(self, runner): - self.before_iter(runner) - - def after_train_iter(self, runner): - self.after_iter(runner) - - def after_val_iter(self, runner): - self.after_iter(runner) - - def every_n_epochs(self, runner, n): - return (runner.epoch + 1) % n == 0 if n > 0 else False - - def every_n_inner_iters(self, runner, n): - return (runner.inner_iter + 1) % n == 0 if n > 0 else False - - def every_n_iters(self, runner, n): - return (runner.iter + 1) % n == 0 if n > 0 else False - - def end_of_epoch(self, runner): - return runner.inner_iter + 1 == len(runner.data_loader) - - def is_last_epoch(self, runner): - return runner.epoch + 1 == runner._max_epochs - - def is_last_iter(self, runner): - return runner.iter + 1 == runner._max_iters - - def get_triggered_stages(self): - trigger_stages = set() - for stage in Hook.stages: - if is_method_overridden(stage, Hook, self): - trigger_stages.add(stage) - - # some methods will be triggered in multi stages - # use this dict to map method to stages. - method_stages_map = { - 'before_epoch': ['before_train_epoch', 'before_val_epoch'], - 'after_epoch': ['after_train_epoch', 'after_val_epoch'], - 'before_iter': ['before_train_iter', 'before_val_iter'], - 'after_iter': ['after_train_iter', 'after_val_iter'], - } - - for method, map_stages in method_stages_map.items(): - if is_method_overridden(method, Hook, self): - trigger_stages.update(map_stages) - - return [stage for stage in Hook.stages if stage in trigger_stages] diff --git a/mmcv/runner/hooks/iter_timer.py b/mmcv/runner/hooks/iter_timer.py deleted file mode 100644 index cfd5002fe8..0000000000 --- a/mmcv/runner/hooks/iter_timer.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import time - -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class IterTimerHook(Hook): - - def before_epoch(self, runner): - self.t = time.time() - - def before_iter(self, runner): - runner.log_buffer.update({'data_time': time.time() - self.t}) - - def after_iter(self, runner): - runner.log_buffer.update({'time': time.time() - self.t}) - self.t = time.time() diff --git a/mmcv/runner/hooks/logger/__init__.py b/mmcv/runner/hooks/logger/__init__.py deleted file mode 100644 index 062709e704..0000000000 --- a/mmcv/runner/hooks/logger/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
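A minimal sketch of the dispatch pattern used by the ``Hook`` base class above (``MiniHook``/``TimerHook`` are hypothetical names for illustration): the stage-specific methods delegate to the generic ones, so overriding ``before_iter`` alone fires at both the train and val stages, which is exactly what ``get_triggered_stages`` accounts for via ``method_stages_map``:

    class MiniHook:
        def before_iter(self, runner):
            pass

        def before_train_iter(self, runner):
            self.before_iter(runner)

        def before_val_iter(self, runner):
            self.before_iter(runner)

    class TimerHook(MiniHook):
        def before_iter(self, runner):  # one override covers both stages
            print(f"tick at iter {runner['iter']}")

    hook = TimerHook()
    hook.before_train_iter({'iter': 0})  # -> tick at iter 0
    hook.before_val_iter({'iter': 1})    # -> tick at iter 1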
-from .base import LoggerHook -from .clearml import ClearMLLoggerHook -from .dvclive import DvcliveLoggerHook -from .mlflow import MlflowLoggerHook -from .neptune import NeptuneLoggerHook -from .pavi import PaviLoggerHook -from .segmind import SegmindLoggerHook -from .tensorboard import TensorboardLoggerHook -from .text import TextLoggerHook -from .wandb import WandbLoggerHook - -__all__ = [ - 'LoggerHook', 'MlflowLoggerHook', 'PaviLoggerHook', - 'TensorboardLoggerHook', 'TextLoggerHook', 'WandbLoggerHook', - 'NeptuneLoggerHook', 'DvcliveLoggerHook', 'SegmindLoggerHook', - 'ClearMLLoggerHook' -] diff --git a/mmcv/runner/hooks/logger/base.py b/mmcv/runner/hooks/logger/base.py deleted file mode 100644 index 416a1b7510..0000000000 --- a/mmcv/runner/hooks/logger/base.py +++ /dev/null @@ -1,172 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import numbers -from abc import ABCMeta, abstractmethod -from typing import Dict - -import numpy as np -import torch - -from ..hook import Hook - - -class LoggerHook(Hook): - """Base class for logger hooks. - - Args: - interval (int): Logging interval (every k iterations). Default 10. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. Default True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default False. - by_epoch (bool): Whether EpochBasedRunner is used. Default True. - """ - - __metaclass__ = ABCMeta - - def __init__(self, - interval: int = 10, - ignore_last: bool = True, - reset_flag: bool = False, - by_epoch: bool = True): - self.interval = interval - self.ignore_last = ignore_last - self.reset_flag = reset_flag - self.by_epoch = by_epoch - - @abstractmethod - def log(self, runner): - pass - - @staticmethod - def is_scalar(val, - include_np: bool = True, - include_torch: bool = True) -> bool: - """Tell the input variable is a scalar or not. - - Args: - val: Input variable. - include_np (bool): Whether include 0-d np.ndarray as a scalar. - include_torch (bool): Whether include 0-d torch.Tensor as a scalar. - - Returns: - bool: True or False. 
- """ - if isinstance(val, numbers.Number): - return True - elif include_np and isinstance(val, np.ndarray) and val.ndim == 0: - return True - elif include_torch and isinstance(val, torch.Tensor) and len(val) == 1: - return True - else: - return False - - def get_mode(self, runner) -> str: - if runner.mode == 'train': - if 'time' in runner.log_buffer.output: - mode = 'train' - else: - mode = 'val' - elif runner.mode == 'val': - mode = 'val' - else: - raise ValueError(f"runner mode should be 'train' or 'val', " - f'but got {runner.mode}') - return mode - - def get_epoch(self, runner) -> int: - if runner.mode == 'train': - epoch = runner.epoch + 1 - elif runner.mode == 'val': - # normal val mode - # runner.epoch += 1 has been done before val workflow - epoch = runner.epoch - else: - raise ValueError(f"runner mode should be 'train' or 'val', " - f'but got {runner.mode}') - return epoch - - def get_iter(self, runner, inner_iter: bool = False) -> int: - """Get the current training iteration step.""" - if self.by_epoch and inner_iter: - current_iter = runner.inner_iter + 1 - else: - current_iter = runner.iter + 1 - return current_iter - - def get_lr_tags(self, runner) -> Dict[str, float]: - tags = {} - lrs = runner.current_lr() - if isinstance(lrs, dict): - for name, value in lrs.items(): - tags[f'learning_rate/{name}'] = value[0] - else: - tags['learning_rate'] = lrs[0] - return tags - - def get_momentum_tags(self, runner) -> Dict[str, float]: - tags = {} - momentums = runner.current_momentum() - if isinstance(momentums, dict): - for name, value in momentums.items(): - tags[f'momentum/{name}'] = value[0] - else: - tags['momentum'] = momentums[0] - return tags - - def get_loggable_tags( - self, - runner, - allow_scalar: bool = True, - allow_text: bool = False, - add_mode: bool = True, - tags_to_skip: tuple = ('time', 'data_time') - ) -> Dict: - tags = {} - for var, val in runner.log_buffer.output.items(): - if var in tags_to_skip: - continue - if self.is_scalar(val) and not allow_scalar: - continue - if isinstance(val, str) and not allow_text: - continue - if add_mode: - var = f'{self.get_mode(runner)}/{var}' - tags[var] = val - tags.update(self.get_lr_tags(runner)) - tags.update(self.get_momentum_tags(runner)) - return tags - - def before_run(self, runner) -> None: - for hook in runner.hooks[::-1]: - if isinstance(hook, LoggerHook): - hook.reset_flag = True - break - - def before_epoch(self, runner) -> None: - runner.log_buffer.clear() # clear logs of last epoch - - def after_train_iter(self, runner) -> None: - if self.by_epoch and self.every_n_inner_iters(runner, self.interval): - runner.log_buffer.average(self.interval) - elif not self.by_epoch and self.every_n_iters(runner, self.interval): - runner.log_buffer.average(self.interval) - elif self.end_of_epoch(runner) and not self.ignore_last: - # not precise but more stable - runner.log_buffer.average(self.interval) - - if runner.log_buffer.ready: - self.log(runner) - if self.reset_flag: - runner.log_buffer.clear_output() - - def after_train_epoch(self, runner) -> None: - if runner.log_buffer.ready: - self.log(runner) - if self.reset_flag: - runner.log_buffer.clear_output() - - def after_val_epoch(self, runner) -> None: - runner.log_buffer.average() - self.log(runner) - if self.reset_flag: - runner.log_buffer.clear_output() diff --git a/mmcv/runner/hooks/logger/clearml.py b/mmcv/runner/hooks/logger/clearml.py deleted file mode 100644 index 7db651f031..0000000000 --- a/mmcv/runner/hooks/logger/clearml.py +++ /dev/null @@ -1,63 +0,0 @@ -# 
Copyright (c) OpenMMLab. All rights reserved. - -from typing import Dict, Optional - -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class ClearMLLoggerHook(LoggerHook): - """Class to log metrics with clearml. - - It requires `clearml`_ to be installed. - - - Args: - init_kwargs (dict): A dict contains the `clearml.Task.init` - initialization keys. See `taskinit`_ for more details. - interval (int): Logging interval (every k iterations). Default 10. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. Default: True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default: False. - by_epoch (bool): Whether EpochBasedRunner is used. Default: True. - - .. _clearml: - https://clear.ml/docs/latest/docs/ - .. _taskinit: - https://clear.ml/docs/latest/docs/references/sdk/task/#taskinit - """ - - def __init__(self, - init_kwargs: Optional[Dict] = None, - interval: int = 10, - ignore_last: bool = True, - reset_flag: bool = False, - by_epoch: bool = True): - super().__init__(interval, ignore_last, reset_flag, by_epoch) - self.import_clearml() - self.init_kwargs = init_kwargs - - def import_clearml(self): - try: - import clearml - except ImportError: - raise ImportError( - 'Please run "pip install clearml" to install clearml') - self.clearml = clearml - - @master_only - def before_run(self, runner) -> None: - super().before_run(runner) - task_kwargs = self.init_kwargs if self.init_kwargs else {} - self.task = self.clearml.Task.init(**task_kwargs) - self.task_logger = self.task.get_logger() - - @master_only - def log(self, runner) -> None: - tags = self.get_loggable_tags(runner) - for tag, val in tags.items(): - self.task_logger.report_scalar(tag, tag, val, - self.get_iter(runner)) diff --git a/mmcv/runner/hooks/logger/dvclive.py b/mmcv/runner/hooks/logger/dvclive.py deleted file mode 100644 index fc0a58c497..0000000000 --- a/mmcv/runner/hooks/logger/dvclive.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from pathlib import Path -from typing import Optional - -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class DvcliveLoggerHook(LoggerHook): - """Class to log metrics with dvclive. - - It requires `dvclive`_ to be installed. - - Args: - model_file (str): Default None. If not None, after each epoch the - model will be saved to {model_file}. - interval (int): Logging interval (every k iterations). Default 10. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. Default: True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default: False. - by_epoch (bool): Whether EpochBasedRunner is used. Default: True. - kwargs: Arguments for instantiating `Live`_. - - .. _dvclive: - https://dvc.org/doc/dvclive - - .. 
_Live: - https://dvc.org/doc/dvclive/api-reference/live#parameters - """ - - def __init__(self, - model_file: Optional[str] = None, - interval: int = 10, - ignore_last: bool = True, - reset_flag: bool = False, - by_epoch: bool = True, - **kwargs): - super().__init__(interval, ignore_last, reset_flag, by_epoch) - self.model_file = model_file - self.import_dvclive(**kwargs) - - def import_dvclive(self, **kwargs) -> None: - try: - from dvclive import Live - except ImportError: - raise ImportError( - 'Please run "pip install dvclive" to install dvclive') - self.dvclive = Live(**kwargs) - - @master_only - def log(self, runner) -> None: - tags = self.get_loggable_tags(runner) - if tags: - self.dvclive.set_step(self.get_iter(runner)) - for k, v in tags.items(): - self.dvclive.log(k, v) - - @master_only - def after_train_epoch(self, runner) -> None: - super().after_train_epoch(runner) - if self.model_file is not None: - runner.save_checkpoint( - Path(self.model_file).parent, - filename_tmpl=Path(self.model_file).name, - create_symlink=False, - ) diff --git a/mmcv/runner/hooks/logger/mlflow.py b/mmcv/runner/hooks/logger/mlflow.py deleted file mode 100644 index a76b0426b7..0000000000 --- a/mmcv/runner/hooks/logger/mlflow.py +++ /dev/null @@ -1,81 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Optional - -from mmcv.utils import TORCH_VERSION -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class MlflowLoggerHook(LoggerHook): - """Class to log metrics and (optionally) a trained model to MLflow. - - It requires `MLflow`_ to be installed. - - Args: - exp_name (str, optional): Name of the experiment to be used. - Default None. If not None, set the active experiment. - If experiment does not exist, an experiment with provided name - will be created. - tags (Dict[str], optional): Tags for the current run. - Default None. If not None, set tags for the current run. - log_model (bool, optional): Whether to log an MLflow artifact. - Default True. If True, log runner.model as an MLflow artifact - for the current run. - interval (int): Logging interval (every k iterations). Default: 10. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. Default: True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default: False. - by_epoch (bool): Whether EpochBasedRunner is used. Default: True. - - .. 
_MLflow: - https://www.mlflow.org/docs/latest/index.html - """ - - def __init__(self, - exp_name: Optional[str] = None, - tags: Optional[Dict] = None, - log_model: bool = True, - interval: int = 10, - ignore_last: bool = True, - reset_flag: bool = False, - by_epoch: bool = True): - super().__init__(interval, ignore_last, reset_flag, by_epoch) - self.import_mlflow() - self.exp_name = exp_name - self.tags = tags - self.log_model = log_model - - def import_mlflow(self) -> None: - try: - import mlflow - import mlflow.pytorch as mlflow_pytorch - except ImportError: - raise ImportError( - 'Please run "pip install mlflow" to install mlflow') - self.mlflow = mlflow - self.mlflow_pytorch = mlflow_pytorch - - @master_only - def before_run(self, runner) -> None: - super().before_run(runner) - if self.exp_name is not None: - self.mlflow.set_experiment(self.exp_name) - if self.tags is not None: - self.mlflow.set_tags(self.tags) - - @master_only - def log(self, runner) -> None: - tags = self.get_loggable_tags(runner) - if tags: - self.mlflow.log_metrics(tags, step=self.get_iter(runner)) - - @master_only - def after_run(self, runner) -> None: - if self.log_model: - self.mlflow_pytorch.log_model( - runner.model, - 'models', - pip_requirements=[f'torch=={TORCH_VERSION}']) diff --git a/mmcv/runner/hooks/logger/neptune.py b/mmcv/runner/hooks/logger/neptune.py deleted file mode 100644 index e398fe1e79..0000000000 --- a/mmcv/runner/hooks/logger/neptune.py +++ /dev/null @@ -1,89 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Dict, Optional - -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class NeptuneLoggerHook(LoggerHook): - """Class to log metrics to NeptuneAI. - - It requires `Neptune`_ to be installed. - - Args: - init_kwargs (dict): a dict contains the initialization keys as below: - - - project (str): Name of a project in a form of - namespace/project_name. If None, the value of NEPTUNE_PROJECT - environment variable will be taken. - - api_token (str): User’s API token. If None, the value of - NEPTUNE_API_TOKEN environment variable will be taken. Note: It is - strongly recommended to use NEPTUNE_API_TOKEN environment - variable rather than placing your API token in plain text in your - source code. - - name (str, optional, default is 'Untitled'): Editable name of the - run. Name is displayed in the run's Details and in Runs table as - a column. - - Check https://docs.neptune.ai/api-reference/neptune#init for more - init arguments. - interval (int): Logging interval (every k iterations). Default: 10. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than ``interval``. Default: True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default: True. - with_step (bool): If True, the step will be logged from - ``self.get_iters``. Otherwise, step will not be logged. - Default: True. - by_epoch (bool): Whether EpochBasedRunner is used. Default: True. - - .. 
_Neptune:
-        https://docs.neptune.ai
-    """
-
-    def __init__(self,
-                 init_kwargs: Optional[Dict] = None,
-                 interval: int = 10,
-                 ignore_last: bool = True,
-                 reset_flag: bool = True,
-                 with_step: bool = True,
-                 by_epoch: bool = True):
-
-        super().__init__(interval, ignore_last, reset_flag, by_epoch)
-        self.import_neptune()
-        self.init_kwargs = init_kwargs
-        self.with_step = with_step
-
-    def import_neptune(self) -> None:
-        try:
-            import neptune.new as neptune
-        except ImportError:
-            raise ImportError(
-                'Please run "pip install neptune-client" to install neptune')
-        self.neptune = neptune
-        self.run = None
-
-    @master_only
-    def before_run(self, runner) -> None:
-        if self.init_kwargs:
-            self.run = self.neptune.init(**self.init_kwargs)
-        else:
-            self.run = self.neptune.init()
-
-    @master_only
-    def log(self, runner) -> None:
-        tags = self.get_loggable_tags(runner)
-        if tags:
-            if not self.with_step:
-                # record the step as a tag of its own; doing this before the
-                # loop avoids mutating `tags` while iterating over it
-                tags['global_step'] = self.get_iter(runner)
-            for tag_name, tag_value in tags.items():
-                if self.with_step:
-                    self.run[tag_name].log(  # type: ignore
-                        tag_value, step=self.get_iter(runner))
-                else:
-                    self.run[tag_name].log(tag_value)  # type: ignore
-
-    @master_only
-    def after_run(self, runner) -> None:
-        self.run.stop()  # type: ignore
diff --git a/mmcv/runner/hooks/logger/pavi.py b/mmcv/runner/hooks/logger/pavi.py
deleted file mode 100644
index 3263b3cfa2..0000000000
--- a/mmcv/runner/hooks/logger/pavi.py
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import json
-import os
-import os.path as osp
-from typing import Dict, Optional
-
-import mmengine
-import torch
-import yaml
-
-import mmcv
-from ....parallel.utils import is_module_wrapper
-from ...dist_utils import master_only
-from ..hook import HOOKS
-from .base import LoggerHook
-
-
-@HOOKS.register_module()
-class PaviLoggerHook(LoggerHook):
-    """Class to visualize the model and log metrics (for internal use).
-
-    Args:
-        init_kwargs (dict): A dict contains the initialization keys as below:
-
-            - name (str, optional): Custom training name. Defaults to None,
-              which means current work_dir.
-            - project (str, optional): Project name. Defaults to "default".
-            - model (str, optional): Training model name. Defaults to current
-              model.
-            - session_text (str, optional): Session string in YAML format.
-              Defaults to current config.
-            - training_id (int, optional): Training ID in PAVI, if you want to
-              use an existing training. Defaults to None.
-            - compare_id (int, optional): Compare ID in PAVI, if you want to
-              add the task to an existing compare. Defaults to None.
-            - overwrite_last_training (bool, optional): Whether to upload data
-              to the training with the same name in the same project, rather
-              than creating a new one. Defaults to False.
-        add_graph (bool): Whether to visualize the model graph.
-            Default: False.
-        add_last_ckpt (bool): Whether to save the checkpoint after run.
-            Default: False.
-        interval (int): Logging interval (every k iterations). Default: 10.
-        ignore_last (bool): Ignore the log of last iterations in each epoch
-            if less than `interval`. Default: True.
-        reset_flag (bool): Whether to clear the output buffer after logging.
-            Default: False.
-        by_epoch (bool): Whether EpochBasedRunner is used. Default: True.
-        img_key (str): Get image data from Dataset. Default: 'img_info'.
- """ - - def __init__(self, - init_kwargs: Optional[Dict] = None, - add_graph: bool = False, - add_last_ckpt: bool = False, - interval: int = 10, - ignore_last: bool = True, - reset_flag: bool = False, - by_epoch: bool = True, - img_key: str = 'img_info'): - super().__init__(interval, ignore_last, reset_flag, by_epoch) - self.init_kwargs = init_kwargs - self.add_graph = add_graph - self.add_last_ckpt = add_last_ckpt - self.img_key = img_key - - @master_only - def before_run(self, runner) -> None: - super().before_run(runner) - try: - from pavi import SummaryWriter - except ImportError: - raise ImportError( - 'No module named pavi, please contact pavi team or visit' - 'document for pavi installation instructions.') - - self.run_name = runner.work_dir.split('/')[-1] - - if not self.init_kwargs: - self.init_kwargs = dict() - self.init_kwargs.setdefault('name', self.run_name) - self.init_kwargs.setdefault('model', runner._model_name) - if runner.meta is not None: - if 'config_dict' in runner.meta: - config_dict = runner.meta['config_dict'] - assert isinstance( - config_dict, - dict), ('meta["config_dict"] has to be of a dict, ' - f'but got {type(config_dict)}') - elif 'config_file' in runner.meta: - config_file = runner.meta['config_file'] - config_dict = dict(mmcv.Config.fromfile(config_file)) - else: - config_dict = None - if config_dict is not None: - # 'max_.*iter' is parsed in pavi sdk as the maximum iterations - # to properly set up the progress bar. - config_dict = config_dict.copy() - config_dict.setdefault('max_iter', runner.max_iters) - # non-serializable values are first converted in - # mmengine.dump to json - config_dict = json.loads( - mmengine.dump(config_dict, file_format='json')) - session_text = yaml.dump(config_dict) - self.init_kwargs.setdefault('session_text', session_text) - self.writer = SummaryWriter(**self.init_kwargs) - - def get_step(self, runner) -> int: - """Get the total training step/epoch.""" - if self.get_mode(runner) == 'val' and self.by_epoch: - return self.get_epoch(runner) - else: - return self.get_iter(runner) - - @master_only - def log(self, runner) -> None: - tags = self.get_loggable_tags(runner, add_mode=False) - if tags: - self.writer.add_scalars( - self.get_mode(runner), tags, self.get_step(runner)) - - @master_only - def after_run(self, runner) -> None: - if self.add_last_ckpt: - ckpt_path = osp.join(runner.work_dir, 'latest.pth') - if osp.islink(ckpt_path): - ckpt_path = osp.join(runner.work_dir, os.readlink(ckpt_path)) - - if osp.isfile(ckpt_path): - # runner.epoch += 1 has been done before `after_run`. - iteration = runner.epoch if self.by_epoch else runner.iter - return self.writer.add_snapshot_file( - tag=self.run_name, - snapshot_file_path=ckpt_path, - iteration=iteration) - - # flush the buffer and send a task ending signal to Pavi - self.writer.close() - - @master_only - def before_epoch(self, runner) -> None: - if runner.epoch == 0 and self.add_graph: - if is_module_wrapper(runner.model): - _model = runner.model.module - else: - _model = runner.model - device = next(_model.parameters()).device - data = next(iter(runner.data_loader)) - image = data[self.img_key][0:1].to(device) - with torch.no_grad(): - self.writer.add_graph(_model, image) diff --git a/mmcv/runner/hooks/logger/segmind.py b/mmcv/runner/hooks/logger/segmind.py deleted file mode 100644 index ecb3751ed7..0000000000 --- a/mmcv/runner/hooks/logger/segmind.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class SegmindLoggerHook(LoggerHook): - """Class to log metrics to Segmind. - - It requires `Segmind`_ to be installed. - - Args: - interval (int): Logging interval (every k iterations). Default: 10. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. Default True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default False. - by_epoch (bool): Whether EpochBasedRunner is used. Default True. - - .. _Segmind: - https://docs.segmind.com/python-library - """ - - def __init__(self, - interval: int = 10, - ignore_last: bool = True, - reset_flag: bool = False, - by_epoch=True): - super().__init__(interval, ignore_last, reset_flag, by_epoch) - self.import_segmind() - - def import_segmind(self) -> None: - try: - import segmind - except ImportError: - raise ImportError( - "Please run 'pip install segmind' to install segmind") - self.log_metrics = segmind.tracking.fluent.log_metrics - self.mlflow_log = segmind.utils.logging_utils.try_mlflow_log - - @master_only - def log(self, runner) -> None: - tags = self.get_loggable_tags(runner) - if tags: - # logging metrics to segmind - self.mlflow_log( - self.log_metrics, tags, step=runner.epoch, epoch=runner.epoch) diff --git a/mmcv/runner/hooks/logger/tensorboard.py b/mmcv/runner/hooks/logger/tensorboard.py deleted file mode 100644 index 11d0799112..0000000000 --- a/mmcv/runner/hooks/logger/tensorboard.py +++ /dev/null @@ -1,69 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -from typing import Optional - -from mmcv.utils import TORCH_VERSION, digit_version -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class TensorboardLoggerHook(LoggerHook): - """Class to log metrics to Tensorboard. - - Args: - log_dir (string): Save directory location. Default: None. If default - values are used, directory location is ``runner.work_dir``/tf_logs. - interval (int): Logging interval (every k iterations). Default: True. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. Default: True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default: False. - by_epoch (bool): Whether EpochBasedRunner is used. Default: True. 
- """ - - def __init__(self, - log_dir: Optional[str] = None, - interval: int = 10, - ignore_last: bool = True, - reset_flag: bool = False, - by_epoch: bool = True): - super().__init__(interval, ignore_last, reset_flag, by_epoch) - self.log_dir = log_dir - - @master_only - def before_run(self, runner) -> None: - super().before_run(runner) - if (TORCH_VERSION == 'parrots' - or digit_version(TORCH_VERSION) < digit_version('1.1')): - try: - from tensorboardX import SummaryWriter - except ImportError: - raise ImportError('Please install tensorboardX to use ' - 'TensorboardLoggerHook.') - else: - try: - from torch.utils.tensorboard import SummaryWriter - except ImportError: - raise ImportError( - 'Please run "pip install future tensorboard" to install ' - 'the dependencies to use torch.utils.tensorboard ' - '(applicable to PyTorch 1.1 or higher)') - - if self.log_dir is None: - self.log_dir = osp.join(runner.work_dir, 'tf_logs') - self.writer = SummaryWriter(self.log_dir) - - @master_only - def log(self, runner) -> None: - tags = self.get_loggable_tags(runner, allow_text=True) - for tag, val in tags.items(): - if isinstance(val, str): - self.writer.add_text(tag, val, self.get_iter(runner)) - else: - self.writer.add_scalar(tag, val, self.get_iter(runner)) - - @master_only - def after_run(self, runner) -> None: - self.writer.close() diff --git a/mmcv/runner/hooks/logger/text.py b/mmcv/runner/hooks/logger/text.py deleted file mode 100644 index 33e32ffeab..0000000000 --- a/mmcv/runner/hooks/logger/text.py +++ /dev/null @@ -1,256 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import datetime -import os -import os.path as osp -from collections import OrderedDict -from typing import Dict, Optional, Union - -import mmengine -import torch -import torch.distributed as dist -from mmengine.fileio.file_client import FileClient - -from mmcv.utils import is_tuple_of, scandir -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class TextLoggerHook(LoggerHook): - """Logger hook in text. - - In this logger hook, the information will be printed on terminal and - saved in json file. - - Args: - by_epoch (bool, optional): Whether EpochBasedRunner is used. - Default: True. - interval (int, optional): Logging interval (every k iterations). - Default: 10. - ignore_last (bool, optional): Ignore the log of last iterations in each - epoch if less than :attr:`interval`. Default: True. - reset_flag (bool, optional): Whether to clear the output buffer after - logging. Default: False. - interval_exp_name (int, optional): Logging interval for experiment - name. This feature is to help users conveniently get the experiment - information from screen or log file. Default: 1000. - out_dir (str, optional): Logs are saved in ``runner.work_dir`` default. - If ``out_dir`` is specified, logs will be copied to a new directory - which is the concatenation of ``out_dir`` and the last level - directory of ``runner.work_dir``. Default: None. - `New in version 1.3.16.` - out_suffix (str or tuple[str], optional): Those filenames ending with - ``out_suffix`` will be copied to ``out_dir``. - Default: ('.log.json', '.log', '.py'). - `New in version 1.3.16.` - keep_local (bool, optional): Whether to keep local log when - :attr:`out_dir` is specified. If False, the local log will be - removed. Default: True. - `New in version 1.3.16.` - file_client_args (dict, optional): Arguments to instantiate a - FileClient. See :class:`mmengine.fileio.FileClient` for details. - Default: None. 
-            `New in version 1.3.16.`
-    """
-
-    def __init__(self,
-                 by_epoch: bool = True,
-                 interval: int = 10,
-                 ignore_last: bool = True,
-                 reset_flag: bool = False,
-                 interval_exp_name: int = 1000,
-                 out_dir: Optional[str] = None,
-                 out_suffix: Union[str, tuple] = ('.log.json', '.log', '.py'),
-                 keep_local: bool = True,
-                 file_client_args: Optional[Dict] = None):
-        super().__init__(interval, ignore_last, reset_flag, by_epoch)
-        self.by_epoch = by_epoch
-        self.time_sec_tot = 0
-        self.interval_exp_name = interval_exp_name
-
-        if out_dir is None and file_client_args is not None:
-            raise ValueError(
-                'file_client_args should be "None" when `out_dir` is not '
-                'specified.')
-        self.out_dir = out_dir
-
-        if not (out_dir is None or isinstance(out_dir, str)
-                or is_tuple_of(out_dir, str)):
-            raise TypeError('out_dir should be "None" or string or tuple of '
-                            f'string, but got {out_dir}')
-        self.out_suffix = out_suffix
-
-        self.keep_local = keep_local
-        self.file_client_args = file_client_args
-        if self.out_dir is not None:
-            self.file_client = FileClient.infer_client(file_client_args,
-                                                       self.out_dir)
-
-    def before_run(self, runner) -> None:
-        super().before_run(runner)
-
-        if self.out_dir is not None:
-            self.file_client = FileClient.infer_client(self.file_client_args,
-                                                       self.out_dir)
-            # The final `self.out_dir` is the concatenation of `self.out_dir`
-            # and the last level directory of `runner.work_dir`
-            basename = osp.basename(runner.work_dir.rstrip(osp.sep))
-            self.out_dir = self.file_client.join_path(self.out_dir, basename)
-            runner.logger.info(
-                f'Text logs will be saved to {self.out_dir} by '
-                f'{self.file_client.name} after the training process.')
-
-        self.start_iter = runner.iter
-        self.json_log_path = osp.join(runner.work_dir,
-                                      f'{runner.timestamp}.log.json')
-        if runner.meta is not None:
-            self._dump_log(runner.meta, runner)
-
-    def _get_max_memory(self, runner) -> int:
-        device = getattr(runner.model, 'output_device', None)
-        mem = torch.cuda.max_memory_allocated(device=device)
-        mem_mb = torch.tensor([int(mem) // (1024 * 1024)],
-                              dtype=torch.int,
-                              device=device)
-        if runner.world_size > 1:
-            dist.reduce(mem_mb, 0, op=dist.ReduceOp.MAX)
-        return mem_mb.item()
-
-    def _log_info(self, log_dict: Dict, runner) -> None:
-        # print exp name for users to distinguish experiments
-        # at every ``interval_exp_name`` iterations and the end of each epoch
-        if runner.meta is not None and 'exp_name' in runner.meta:
-            if (self.every_n_iters(runner, self.interval_exp_name)) or (
-                    self.by_epoch and self.end_of_epoch(runner)):
-                exp_info = f'Exp name: {runner.meta["exp_name"]}'
-                runner.logger.info(exp_info)
-
-        if log_dict['mode'] == 'train':
-            if isinstance(log_dict['lr'], dict):
-                lr_str = []
-                for k, val in log_dict['lr'].items():
-                    lr_str.append(f'lr_{k}: {val:.3e}')
-                lr_str = ' '.join(lr_str)  # type: ignore
-            else:
-                lr_str = f'lr: {log_dict["lr"]:.3e}'  # type: ignore
-
-            # by epoch: Epoch [4][100/1000]
-            # by iter: Iter [100/100000]
-            if self.by_epoch:
-                log_str = f'Epoch [{log_dict["epoch"]}]' \
-                          f'[{log_dict["iter"]}/{len(runner.data_loader)}]\t'
-            else:
-                log_str = f'Iter [{log_dict["iter"]}/{runner.max_iters}]\t'
-            log_str += f'{lr_str}, '
-
-            if 'time' in log_dict.keys():
-                self.time_sec_tot += (log_dict['time'] * self.interval)
-                time_sec_avg = self.time_sec_tot / (
-                    runner.iter - self.start_iter + 1)
-                eta_sec = time_sec_avg * (runner.max_iters - runner.iter - 1)
-                eta_str = str(datetime.timedelta(seconds=int(eta_sec)))
-                log_str += f'eta: {eta_str}, '
-                log_str += f'time: 
{log_dict["time"]:.3f}, ' \ - f'data_time: {log_dict["data_time"]:.3f}, ' - # statistic memory - if torch.cuda.is_available(): - log_str += f'memory: {log_dict["memory"]}, ' - else: - # val/test time - # here 1000 is the length of the val dataloader - # by epoch: Epoch[val] [4][1000] - # by iter: Iter[val] [1000] - if self.by_epoch: - log_str = f'Epoch({log_dict["mode"]}) ' \ - f'[{log_dict["epoch"]}][{log_dict["iter"]}]\t' - else: - log_str = f'Iter({log_dict["mode"]}) [{log_dict["iter"]}]\t' - - log_items = [] - for name, val in log_dict.items(): - # TODO: resolve this hack - # these items have been in log_str - if name in [ - 'mode', 'Epoch', 'iter', 'lr', 'time', 'data_time', - 'memory', 'epoch' - ]: - continue - if isinstance(val, float): - val = f'{val:.4f}' - log_items.append(f'{name}: {val}') - log_str += ', '.join(log_items) - - runner.logger.info(log_str) - - def _dump_log(self, log_dict: Dict, runner) -> None: - # dump log in json format - json_log = OrderedDict() - for k, v in log_dict.items(): - json_log[k] = self._round_float(v) - # only append log at last line - if runner.rank == 0: - with open(self.json_log_path, 'a+') as f: - mmengine.dump(json_log, f, file_format='json') - f.write('\n') - - def _round_float(self, items): - if isinstance(items, list): - return [self._round_float(item) for item in items] - elif isinstance(items, float): - return round(items, 5) - else: - return items - - def log(self, runner) -> OrderedDict: - if 'eval_iter_num' in runner.log_buffer.output: - # this doesn't modify runner.iter and is regardless of by_epoch - cur_iter = runner.log_buffer.output.pop('eval_iter_num') - else: - cur_iter = self.get_iter(runner, inner_iter=True) - - log_dict = OrderedDict( - mode=self.get_mode(runner), - epoch=self.get_epoch(runner), - iter=cur_iter) - - # only record lr of the first param group - cur_lr = runner.current_lr() - if isinstance(cur_lr, list): - log_dict['lr'] = cur_lr[0] - else: - assert isinstance(cur_lr, dict) - log_dict['lr'] = {} - for k, lr_ in cur_lr.items(): - assert isinstance(lr_, list) - log_dict['lr'].update({k: lr_[0]}) - - if 'time' in runner.log_buffer.output: - # statistic memory - if torch.cuda.is_available(): - log_dict['memory'] = self._get_max_memory(runner) - - log_dict = dict(log_dict, **runner.log_buffer.output) # type: ignore - - self._log_info(log_dict, runner) - self._dump_log(log_dict, runner) - return log_dict - - def after_run(self, runner) -> None: - # copy or upload logs to self.out_dir - if self.out_dir is not None: - for filename in scandir(runner.work_dir, self.out_suffix, True): - local_filepath = osp.join(runner.work_dir, filename) - out_filepath = self.file_client.join_path( - self.out_dir, filename) - with open(local_filepath) as f: - self.file_client.put_text(f.read(), out_filepath) - - runner.logger.info( - f'The file {local_filepath} has been uploaded to ' - f'{out_filepath}.') - - if not self.keep_local: - os.remove(local_filepath) - runner.logger.info( - f'{local_filepath} was removed due to the ' - '`self.keep_local=False`') diff --git a/mmcv/runner/hooks/logger/wandb.py b/mmcv/runner/hooks/logger/wandb.py deleted file mode 100644 index 1cf165507e..0000000000 --- a/mmcv/runner/hooks/logger/wandb.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-import os.path as osp -from typing import Dict, Optional, Union - -from mmcv.utils import scandir -from ...dist_utils import master_only -from ..hook import HOOKS -from .base import LoggerHook - - -@HOOKS.register_module() -class WandbLoggerHook(LoggerHook): - """Class to log metrics with wandb. - - It requires `wandb`_ to be installed. - - - Args: - init_kwargs (dict): A dict contains the initialization keys. Check - https://docs.wandb.ai/ref/python/init for more init arguments. - interval (int): Logging interval (every k iterations). - Default 10. - ignore_last (bool): Ignore the log of last iterations in each epoch - if less than `interval`. - Default: True. - reset_flag (bool): Whether to clear the output buffer after logging. - Default: False. - commit (bool): Save the metrics dict to the wandb server and increment - the step. If false ``wandb.log`` just updates the current metrics - dict with the row argument and metrics won't be saved until - ``wandb.log`` is called with ``commit=True``. - Default: True. - by_epoch (bool): Whether EpochBasedRunner is used. - Default: True. - with_step (bool): If True, the step will be logged from - ``self.get_iters``. Otherwise, step will not be logged. - Default: True. - log_artifact (bool): If True, artifacts in {work_dir} will be uploaded - to wandb after training ends. - Default: True - `New in version 1.4.3.` - out_suffix (str or tuple[str], optional): Those filenames ending with - ``out_suffix`` will be uploaded to wandb. - Default: ('.log.json', '.log', '.py'). - `New in version 1.4.3.` - - .. _wandb: - https://docs.wandb.ai - """ - - def __init__(self, - init_kwargs: Optional[Dict] = None, - interval: int = 10, - ignore_last: bool = True, - reset_flag: bool = False, - commit: bool = True, - by_epoch: bool = True, - with_step: bool = True, - log_artifact: bool = True, - out_suffix: Union[str, tuple] = ('.log.json', '.log', '.py')): - super().__init__(interval, ignore_last, reset_flag, by_epoch) - self.import_wandb() - self.init_kwargs = init_kwargs - self.commit = commit - self.with_step = with_step - self.log_artifact = log_artifact - self.out_suffix = out_suffix - - def import_wandb(self) -> None: - try: - import wandb - except ImportError: - raise ImportError( - 'Please run "pip install wandb" to install wandb') - self.wandb = wandb - - @master_only - def before_run(self, runner) -> None: - super().before_run(runner) - if self.wandb is None: - self.import_wandb() - if self.init_kwargs: - self.wandb.init(**self.init_kwargs) # type: ignore - else: - self.wandb.init() # type: ignore - - @master_only - def log(self, runner) -> None: - tags = self.get_loggable_tags(runner) - if tags: - if self.with_step: - self.wandb.log( - tags, step=self.get_iter(runner), commit=self.commit) - else: - tags['global_step'] = self.get_iter(runner) - self.wandb.log(tags, commit=self.commit) - - @master_only - def after_run(self, runner) -> None: - if self.log_artifact: - wandb_artifact = self.wandb.Artifact( - name='artifacts', type='model') - for filename in scandir(runner.work_dir, self.out_suffix, True): - local_filepath = osp.join(runner.work_dir, filename) - wandb_artifact.add_file(local_filepath) - self.wandb.log_artifact(wandb_artifact) - self.wandb.join() diff --git a/mmcv/runner/hooks/lr_updater.py b/mmcv/runner/hooks/lr_updater.py deleted file mode 100644 index e0be405596..0000000000 --- a/mmcv/runner/hooks/lr_updater.py +++ /dev/null @@ -1,754 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
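Before the LR updater hooks below, a quick standalone sketch of the three warmup modes that ``LrUpdaterHook.get_warmup_lr`` implements; the returned factor multiplies each regular LR, and the names here are illustrative (warmup_iters > 0 is assumed, as the hook asserts):

    def warmup_factor(mode: str, cur_iters: int, warmup_iters: int,
                      warmup_ratio: float) -> float:
        # factor starts near `warmup_ratio` and reaches 1.0 at `warmup_iters`
        if mode == 'constant':
            return warmup_ratio
        if mode == 'linear':
            k = (1 - cur_iters / warmup_iters) * (1 - warmup_ratio)
            return 1 - k
        if mode == 'exp':
            return warmup_ratio**(1 - cur_iters / warmup_iters)
        raise ValueError(f'unsupported warmup mode {mode}')

    # linear warmup with warmup_ratio=0.1 over 500 iterations:
    assert [round(warmup_factor('linear', i, 500, 0.1), 2)
            for i in (0, 250, 500)] == [0.1, 0.55, 1.0]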
-import numbers -from math import cos, pi -from typing import Callable, List, Optional, Union - -import mmcv -from mmcv import runner -from .hook import HOOKS, Hook - - -class LrUpdaterHook(Hook): - """LR Scheduler in MMCV. - - Args: - by_epoch (bool): LR changes epoch by epoch - warmup (string): Type of warmup used. It can be None(use no warmup), - 'constant', 'linear' or 'exp' - warmup_iters (int): The number of iterations or epochs that warmup - lasts - warmup_ratio (float): LR used at the beginning of warmup equals to - warmup_ratio * initial_lr - warmup_by_epoch (bool): When warmup_by_epoch == True, warmup_iters - means the number of epochs that warmup lasts, otherwise means the - number of iteration that warmup lasts - """ - - def __init__(self, - by_epoch: bool = True, - warmup: Optional[str] = None, - warmup_iters: int = 0, - warmup_ratio: float = 0.1, - warmup_by_epoch: bool = False) -> None: - # validate the "warmup" argument - if warmup is not None: - if warmup not in ['constant', 'linear', 'exp']: - raise ValueError( - f'"{warmup}" is not a supported type for warming up, valid' - ' types are "constant", "linear" and "exp"') - if warmup is not None: - assert warmup_iters > 0, \ - '"warmup_iters" must be a positive integer' - assert 0 < warmup_ratio <= 1.0, \ - '"warmup_ratio" must be in range (0,1]' - - self.by_epoch = by_epoch - self.warmup = warmup - self.warmup_iters: Optional[int] = warmup_iters - self.warmup_ratio = warmup_ratio - self.warmup_by_epoch = warmup_by_epoch - - if self.warmup_by_epoch: - self.warmup_epochs: Optional[int] = self.warmup_iters - self.warmup_iters = None - else: - self.warmup_epochs = None - - self.base_lr: Union[list, dict] = [] # initial lr for all param groups - self.regular_lr: list = [] # expected lr if no warming up is performed - - def _set_lr(self, runner, lr_groups): - if isinstance(runner.optimizer, dict): - for k, optim in runner.optimizer.items(): - for param_group, lr in zip(optim.param_groups, lr_groups[k]): - param_group['lr'] = lr - else: - for param_group, lr in zip(runner.optimizer.param_groups, - lr_groups): - param_group['lr'] = lr - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - raise NotImplementedError - - def get_regular_lr(self, runner: 'runner.BaseRunner'): - if isinstance(runner.optimizer, dict): - lr_groups = {} - for k in runner.optimizer.keys(): - _lr_group = [ - self.get_lr(runner, _base_lr) - for _base_lr in self.base_lr[k] - ] - lr_groups.update({k: _lr_group}) - - return lr_groups - else: - return [self.get_lr(runner, _base_lr) for _base_lr in self.base_lr] - - def get_warmup_lr(self, cur_iters: int): - - def _get_warmup_lr(cur_iters, regular_lr): - if self.warmup == 'constant': - warmup_lr = [_lr * self.warmup_ratio for _lr in regular_lr] - elif self.warmup == 'linear': - k = (1 - cur_iters / self.warmup_iters) * (1 - - self.warmup_ratio) - warmup_lr = [_lr * (1 - k) for _lr in regular_lr] - elif self.warmup == 'exp': - k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters) - warmup_lr = [_lr * k for _lr in regular_lr] - return warmup_lr - - if isinstance(self.regular_lr, dict): - lr_groups = {} - for key, regular_lr in self.regular_lr.items(): - lr_groups[key] = _get_warmup_lr(cur_iters, regular_lr) - return lr_groups - else: - return _get_warmup_lr(cur_iters, self.regular_lr) - - def before_run(self, runner: 'runner.BaseRunner'): - # NOTE: when resuming from a checkpoint, if 'initial_lr' is not saved, - # it will be set according to the optimizer params - if isinstance(runner.optimizer, 
dict): - self.base_lr = {} - for k, optim in runner.optimizer.items(): - for group in optim.param_groups: - group.setdefault('initial_lr', group['lr']) - _base_lr = [ - group['initial_lr'] for group in optim.param_groups - ] - self.base_lr.update({k: _base_lr}) - else: - for group in runner.optimizer.param_groups: # type: ignore - group.setdefault('initial_lr', group['lr']) - self.base_lr = [ - group['initial_lr'] - for group in runner.optimizer.param_groups # type: ignore - ] - - def before_train_epoch(self, runner: 'runner.BaseRunner'): - if self.warmup_iters is None: - epoch_len = len(runner.data_loader) # type: ignore - self.warmup_iters = self.warmup_epochs * epoch_len # type: ignore - - if not self.by_epoch: - return - - self.regular_lr = self.get_regular_lr(runner) - self._set_lr(runner, self.regular_lr) - - def before_train_iter(self, runner: 'runner.BaseRunner'): - cur_iter = runner.iter - assert isinstance(self.warmup_iters, int) - if not self.by_epoch: - self.regular_lr = self.get_regular_lr(runner) - if self.warmup is None or cur_iter >= self.warmup_iters: - self._set_lr(runner, self.regular_lr) - else: - warmup_lr = self.get_warmup_lr(cur_iter) - self._set_lr(runner, warmup_lr) - elif self.by_epoch: - if self.warmup is None or cur_iter > self.warmup_iters: - return - elif cur_iter == self.warmup_iters: - self._set_lr(runner, self.regular_lr) - else: - warmup_lr = self.get_warmup_lr(cur_iter) - self._set_lr(runner, warmup_lr) - - -@HOOKS.register_module() -class FixedLrUpdaterHook(LrUpdaterHook): - - def __init__(self, **kwargs): - super().__init__(**kwargs) - - def get_lr(self, runner, base_lr): - return base_lr - - -@HOOKS.register_module() -class StepLrUpdaterHook(LrUpdaterHook): - """Step LR scheduler with min_lr clipping. - - Args: - step (int | list[int]): Step to decay the LR. If an int value is given, - regard it as the decay interval. If a list is given, decay LR at - these steps. - gamma (float): Decay LR ratio. Defaults to 0.1. - min_lr (float, optional): Minimum LR value to keep. If LR after decay - is lower than `min_lr`, it will be clipped to this value. If None - is given, we don't perform lr clipping. Default: None. 
- """ - - def __init__(self, - step: Union[int, List[int]], - gamma: float = 0.1, - min_lr: Optional[float] = None, - **kwargs) -> None: - if isinstance(step, list): - assert mmcv.is_list_of(step, int) - assert all([s > 0 for s in step]) - elif isinstance(step, int): - assert step > 0 - else: - raise TypeError('"step" must be a list or integer') - self.step = step - self.gamma = gamma - self.min_lr = min_lr - super().__init__(**kwargs) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - progress = runner.epoch if self.by_epoch else runner.iter - - # calculate exponential term - if isinstance(self.step, int): - exp = progress // self.step - else: - exp = len(self.step) - for i, s in enumerate(self.step): - if progress < s: - exp = i - break - - lr = base_lr * (self.gamma**exp) - if self.min_lr is not None: - # clip to a minimum value - lr = max(lr, self.min_lr) - return lr - - -@HOOKS.register_module() -class ExpLrUpdaterHook(LrUpdaterHook): - - def __init__(self, gamma: float, **kwargs) -> None: - self.gamma = gamma - super().__init__(**kwargs) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - progress = runner.epoch if self.by_epoch else runner.iter - return base_lr * self.gamma**progress - - -@HOOKS.register_module() -class PolyLrUpdaterHook(LrUpdaterHook): - - def __init__(self, - power: float = 1., - min_lr: float = 0., - **kwargs) -> None: - self.power = power - self.min_lr = min_lr - super().__init__(**kwargs) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - if self.by_epoch: - progress = runner.epoch - max_progress = runner.max_epochs - else: - progress = runner.iter - max_progress = runner.max_iters - coeff = (1 - progress / max_progress)**self.power - return (base_lr - self.min_lr) * coeff + self.min_lr - - -@HOOKS.register_module() -class InvLrUpdaterHook(LrUpdaterHook): - - def __init__(self, gamma: float, power: float = 1., **kwargs) -> None: - self.gamma = gamma - self.power = power - super().__init__(**kwargs) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - progress = runner.epoch if self.by_epoch else runner.iter - return base_lr * (1 + self.gamma * progress)**(-self.power) - - -@HOOKS.register_module() -class CosineAnnealingLrUpdaterHook(LrUpdaterHook): - """CosineAnnealing LR scheduler. - - Args: - min_lr (float, optional): The minimum lr. Default: None. - min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. - Either `min_lr` or `min_lr_ratio` should be specified. - Default: None. - """ - - def __init__(self, - min_lr: Optional[float] = None, - min_lr_ratio: Optional[float] = None, - **kwargs) -> None: - assert (min_lr is None) ^ (min_lr_ratio is None) - self.min_lr = min_lr - self.min_lr_ratio = min_lr_ratio - super().__init__(**kwargs) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - if self.by_epoch: - progress = runner.epoch - max_progress = runner.max_epochs - else: - progress = runner.iter - max_progress = runner.max_iters - - if self.min_lr_ratio is not None: - target_lr = base_lr * self.min_lr_ratio - else: - target_lr = self.min_lr # type:ignore - return annealing_cos(base_lr, target_lr, progress / max_progress) - - -@HOOKS.register_module() -class FlatCosineAnnealingLrUpdaterHook(LrUpdaterHook): - """Flat + Cosine lr schedule. 
- - Modified from https://github.com/fastai/fastai/blob/master/fastai/callback/schedule.py#L128 # noqa: E501 - - Args: - start_percent (float): The fraction of the total training steps after - which the learning rate starts to be annealed. - The value should be in range [0, 1). - Default: 0.75 - min_lr (float, optional): The minimum lr. Default: None. - min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. - Either `min_lr` or `min_lr_ratio` should be specified. - Default: None. - """ - - def __init__(self, - start_percent: float = 0.75, - min_lr: Optional[float] = None, - min_lr_ratio: Optional[float] = None, - **kwargs) -> None: - assert (min_lr is None) ^ (min_lr_ratio is None) - if start_percent < 0 or start_percent > 1 or not isinstance( - start_percent, float): - raise ValueError( - 'expected a float between 0 and 1 for start_percent, but ' - f'got {start_percent}') - self.start_percent = start_percent - self.min_lr = min_lr - self.min_lr_ratio = min_lr_ratio - super().__init__(**kwargs) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - if self.by_epoch: - start = round(runner.max_epochs * self.start_percent) - progress = runner.epoch - start - max_progress = runner.max_epochs - start - else: - start = round(runner.max_iters * self.start_percent) - progress = runner.iter - start - max_progress = runner.max_iters - start - - if self.min_lr_ratio is not None: - target_lr = base_lr * self.min_lr_ratio - else: - target_lr = self.min_lr # type:ignore - - if progress < 0: - return base_lr - else: - return annealing_cos(base_lr, target_lr, progress / max_progress) - - -@HOOKS.register_module() -class CosineRestartLrUpdaterHook(LrUpdaterHook): - """Cosine annealing with restarts learning rate scheme. - - Args: - periods (list[int]): Periods for each cosine annealing cycle. - restart_weights (list[float]): Restart weights at each - restart iteration. Defaults to [1]. - min_lr (float, optional): The minimum lr. Default: None. - min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. - Either `min_lr` or `min_lr_ratio` should be specified. - Default: None. - """ - - def __init__(self, - periods: List[int], - restart_weights: List[float] = [1], - min_lr: Optional[float] = None, - min_lr_ratio: Optional[float] = None, - **kwargs) -> None: - assert (min_lr is None) ^ (min_lr_ratio is None) - self.periods = periods - self.min_lr = min_lr - self.min_lr_ratio = min_lr_ratio - self.restart_weights = restart_weights - assert (len(self.periods) == len(self.restart_weights) - ), 'periods and restart_weights should have the same length.' - super().__init__(**kwargs) - - self.cumulative_periods = [ - sum(self.periods[0:i + 1]) for i in range(0, len(self.periods)) - ] - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - if self.by_epoch: - progress = runner.epoch - else: - progress = runner.iter - - if self.min_lr_ratio is not None: - target_lr = base_lr * self.min_lr_ratio - else: - target_lr = self.min_lr # type:ignore - - idx = get_position_from_periods(progress, self.cumulative_periods) - current_weight = self.restart_weights[idx] - nearest_restart = 0 if idx == 0 else self.cumulative_periods[idx - 1] - current_periods = self.periods[idx] - - alpha = min((progress - nearest_restart) / current_periods, 1) - return annealing_cos(base_lr, target_lr, alpha, current_weight) - - -def get_position_from_periods(iteration: int, cumulative_periods: List[int]): - """Get the position from a period list.
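The restart bookkeeping in `CosineRestartLrUpdaterHook.get_lr` can be traced by hand. A small worked sketch with illustrative numbers (no mmcv imports; the generator expression stands in for `get_position_from_periods`):

    periods = [10, 20]
    cumulative_periods = [10, 30]  # sum(periods[:i + 1]) for each i
    progress = 15
    idx = next(i for i, p in enumerate(cumulative_periods) if progress < p)  # -> 1
    nearest_restart = 0 if idx == 0 else cumulative_periods[idx - 1]         # -> 10
    alpha = min((progress - nearest_restart) / periods[idx], 1)              # -> 0.25

`alpha` is then fed to `annealing_cos` together with the restart weight of the current cycle.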
- - It will return the index of the right-closest number in the period list. - For example, the cumulative_periods = [100, 200, 300, 400], - if iteration == 50, return 0; - if iteration == 210, return 2; - if iteration == 300, return 3. - - Args: - iteration (int): Current iteration. - cumulative_periods (list[int]): Cumulative period list. - - Returns: - int: The position of the right-closest number in the period list. - """ - for i, period in enumerate(cumulative_periods): - if iteration < period: - return i - raise ValueError(f'Current iteration {iteration} exceeds ' - f'cumulative_periods {cumulative_periods}') - - -@HOOKS.register_module() -class CyclicLrUpdaterHook(LrUpdaterHook): - """Cyclic LR Scheduler. - - Implement the cyclical learning rate policy (CLR) described in - https://arxiv.org/pdf/1506.01186.pdf - - Different from the original paper, we use cosine annealing rather than - triangular policy inside a cycle. This improves the performance in the - 3D detection area. - - Args: - by_epoch (bool, optional): Whether to update LR by epoch. - target_ratio (tuple[float], optional): Relative ratio of the highest LR - and the lowest LR to the initial LR. - cyclic_times (int, optional): Number of cycles during training - step_ratio_up (float, optional): The ratio of the increasing process of - LR in the total cycle. - anneal_strategy (str, optional): {'cos', 'linear'} - Specifies the annealing strategy: 'cos' for cosine annealing, - 'linear' for linear annealing. Default: 'cos'. - gamma (float, optional): Cycle decay ratio. Default: 1. - It takes values in the range (0, 1]. The difference between the - maximum learning rate and the minimum learning rate decreases - periodically when it is less than 1. `New in version 1.4.4.` - """ - - def __init__(self, - by_epoch: bool = False, - target_ratio: Union[float, tuple] = (10, 1e-4), - cyclic_times: int = 1, - step_ratio_up: float = 0.4, - anneal_strategy: str = 'cos', - gamma: float = 1, - **kwargs) -> None: - if isinstance(target_ratio, float): - target_ratio = (target_ratio, target_ratio / 1e5) - elif isinstance(target_ratio, tuple): - target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ - if len(target_ratio) == 1 else target_ratio - else: - raise ValueError('target_ratio should be either float ' - f'or tuple, got {type(target_ratio)}') - - assert len(target_ratio) == 2, \ - '"target_ratio" must be list or tuple of two floats' - assert 0 <= step_ratio_up < 1.0, \ - '"step_ratio_up" must be in range [0,1)' - assert 0 < gamma <= 1, \ - '"gamma" must be in range (0, 1]' - - self.target_ratio = target_ratio - self.cyclic_times = cyclic_times - self.step_ratio_up = step_ratio_up - self.gamma = gamma - self.max_iter_per_phase = None - self.lr_phases: list = [] # init lr_phases - # validate anneal_strategy - if anneal_strategy not in ['cos', 'linear']: - raise ValueError('anneal_strategy must be one of "cos" or ' - f'"linear", instead got {anneal_strategy}') - elif anneal_strategy == 'cos': - self.anneal_func: Callable[[float, float, float], - float] = annealing_cos - elif anneal_strategy == 'linear': - self.anneal_func = annealing_linear - - assert not by_epoch, \ - 'currently only support "by_epoch" = False' - super().__init__(by_epoch, **kwargs) - - def before_run(self, runner: 'runner.BaseRunner'): - super().before_run(runner) - # initiate lr_phases - # total lr_phases are separated as up and down - self.max_iter_per_phase = runner.max_iters // self.cyclic_times - iter_up_phase = int(self.step_ratio_up * - self.max_iter_per_phase) # 
type: ignore - self.lr_phases.append([0, iter_up_phase, 1, self.target_ratio[0]]) - self.lr_phases.append([ - iter_up_phase, self.max_iter_per_phase, self.target_ratio[0], - self.target_ratio[1] - ]) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - curr_iter = runner.iter % self.max_iter_per_phase # type: ignore - curr_cycle = runner.iter // self.max_iter_per_phase # type: ignore - # Update weight decay - scale = self.gamma**curr_cycle - - for (start_iter, end_iter, start_ratio, end_ratio) in self.lr_phases: - if start_iter <= curr_iter < end_iter: - # Apply cycle scaling to gradually reduce the difference - # between max_lr and base lr. The target end_ratio can be - # expressed as: - # end_ratio = (base_lr + scale * (max_lr - base_lr)) / base_lr - # iteration: 0-iter_up_phase: - if start_iter == 0: - end_ratio = 1 - scale + end_ratio * scale - # iteration: iter_up_phase-self.max_iter_per_phase - else: - start_ratio = 1 - scale + start_ratio * scale - progress = curr_iter - start_iter - return self.anneal_func(base_lr * start_ratio, - base_lr * end_ratio, - progress / (end_iter - start_iter)) - - -@HOOKS.register_module() -class OneCycleLrUpdaterHook(LrUpdaterHook): - """One Cycle LR Scheduler. - - The 1cycle learning rate policy changes the learning rate after every - batch. The one cycle learning rate policy is described in - https://arxiv.org/pdf/1708.07120.pdf - - Args: - max_lr (float or list): Upper learning rate boundaries in the cycle - for each parameter group. - total_steps (int, optional): The total number of steps in the cycle. - Note that if a value is not provided here, it will be the max_iter - of runner. Default: None. - pct_start (float): The percentage of the cycle (in number of steps) - spent increasing the learning rate. - Default: 0.3 - anneal_strategy (str): {'cos', 'linear'} - Specifies the annealing strategy: 'cos' for cosine annealing, - 'linear' for linear annealing. - Default: 'cos' - div_factor (float): Determines the initial learning rate via - initial_lr = max_lr/div_factor - Default: 25 - final_div_factor (float): Determines the minimum learning rate via - min_lr = initial_lr/final_div_factor - Default: 1e4 - three_phase (bool): If three_phase is True, use a third phase of the - schedule to annihilate the learning rate according to - final_div_factor instead of modifying the second phase (the first - two phases will be symmetrical about the step indicated by - pct_start). 
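From the docstring above, the actual LR bounds of the one-cycle policy follow from `max_lr` and the two divisors. A quick arithmetic sketch with illustrative values:

    max_lr, div_factor, final_div_factor = 0.01, 25, 1e4
    initial_lr = max_lr / div_factor        # 4e-04, the LR the cycle starts from
    min_lr = initial_lr / final_div_factor  # 4e-08, the LR the cycle ends at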
- Default: False - """ - - def __init__(self, - max_lr: Union[float, List], - total_steps: Optional[int] = None, - pct_start: float = 0.3, - anneal_strategy: str = 'cos', - div_factor: float = 25, - final_div_factor: float = 1e4, - three_phase: bool = False, - **kwargs) -> None: - # validate by_epoch, currently only support by_epoch = False - if 'by_epoch' not in kwargs: - kwargs['by_epoch'] = False - else: - assert not kwargs['by_epoch'], \ - 'currently only support "by_epoch" = False' - if not isinstance(max_lr, (numbers.Number, list, dict)): - raise ValueError('the type of max_lr must be one of number, list or ' - f'dict, but got {type(max_lr)}') - self._max_lr = max_lr - if total_steps is not None: - if not isinstance(total_steps, int): - raise ValueError('the type of total_steps must be int, but ' - f'got {type(total_steps)}') - self.total_steps = total_steps - # validate pct_start - if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): - raise ValueError('expected a float between 0 and 1 for pct_start, but ' - f'got {pct_start}') - self.pct_start = pct_start - # validate anneal_strategy - if anneal_strategy not in ['cos', 'linear']: - raise ValueError('anneal_strategy must be one of "cos" or ' - f'"linear", instead got {anneal_strategy}') - elif anneal_strategy == 'cos': - self.anneal_func: Callable[[float, float, float], - float] = annealing_cos - elif anneal_strategy == 'linear': - self.anneal_func = annealing_linear - self.div_factor = div_factor - self.final_div_factor = final_div_factor - self.three_phase = three_phase - self.lr_phases: list = [] # init lr_phases - super().__init__(**kwargs) - - def before_run(self, runner: 'runner.BaseRunner'): - if hasattr(self, 'total_steps'): - total_steps = self.total_steps - else: - total_steps = runner.max_iters - if total_steps < runner.max_iters: - raise ValueError( - 'The total steps must be greater than or equal to max ' - f'iterations {runner.max_iters} of runner, but total steps ' - f'is {total_steps}.') - - if isinstance(runner.optimizer, dict): - self.base_lr = {} - for k, optim in runner.optimizer.items(): - _max_lr = format_param(k, optim, self._max_lr) - self.base_lr[k] = [lr / self.div_factor for lr in _max_lr] - for group, lr in zip(optim.param_groups, self.base_lr[k]): - group.setdefault('initial_lr', lr) - else: - k = type(runner.optimizer).__name__ - _max_lr = format_param(k, runner.optimizer, self._max_lr) - self.base_lr = [lr / self.div_factor for lr in _max_lr] - optim_param_groups = runner.optimizer.param_groups # type: ignore - for group, lr in zip(optim_param_groups, self.base_lr): - group.setdefault('initial_lr', lr) - - if self.three_phase: - self.lr_phases.append( - [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) - self.lr_phases.append([ - float(2 * self.pct_start * total_steps) - 2, self.div_factor, 1 - ]) - self.lr_phases.append( - [total_steps - 1, 1, 1 / self.final_div_factor]) - else: - self.lr_phases.append( - [float(self.pct_start * total_steps) - 1, 1, self.div_factor]) - self.lr_phases.append( - [total_steps - 1, self.div_factor, 1 / self.final_div_factor]) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - curr_iter = runner.iter - start_iter = 0 - for i, (end_iter, start_lr, end_lr) in enumerate(self.lr_phases): - if curr_iter <= end_iter: - pct = (curr_iter - start_iter) / (end_iter - start_iter) - lr = self.anneal_func(base_lr * start_lr, base_lr * end_lr, - pct) - break - start_iter = end_iter - return lr - - -@HOOKS.register_module() -class 
LinearAnnealingLrUpdaterHook(LrUpdaterHook): - """Linear annealing LR Scheduler decays the learning rate of each parameter - group linearly. - - Args: - min_lr (float, optional): The minimum lr. Default: None. - min_lr_ratio (float, optional): The ratio of minimum lr to the base lr. - Either `min_lr` or `min_lr_ratio` should be specified. - Default: None. - """ - - def __init__(self, - min_lr: Optional[float] = None, - min_lr_ratio: Optional[float] = None, - **kwargs): - assert (min_lr is None) ^ (min_lr_ratio is None) - self.min_lr = min_lr - self.min_lr_ratio = min_lr_ratio - super().__init__(**kwargs) - - def get_lr(self, runner: 'runner.BaseRunner', base_lr: float): - if self.by_epoch: - progress = runner.epoch - max_progress = runner.max_epochs - else: - progress = runner.iter - max_progress = runner.max_iters - if self.min_lr_ratio is not None: - target_lr = base_lr * self.min_lr_ratio - else: - target_lr = self.min_lr # type:ignore - return annealing_linear(base_lr, target_lr, progress / max_progress) - - -def annealing_cos(start: float, - end: float, - factor: float, - weight: float = 1.) -> float: - """Calculate annealing cos learning rate. - - Cosine anneal from `weight * start + (1 - weight) * end` to `end` as - percentage goes from 0.0 to 1.0. - - Args: - start (float): The starting learning rate of the cosine annealing. - end (float): The ending learning rate of the cosine annealing. - factor (float): The coefficient of `pi` when calculating the current - percentage. Range from 0.0 to 1.0. - weight (float, optional): The combination factor of `start` and `end` - when calculating the actual starting learning rate. Default to 1. - """ - cos_out = cos(pi * factor) + 1 - return end + 0.5 * weight * (start - end) * cos_out - - -def annealing_linear(start: float, end: float, factor: float) -> float: - """Calculate annealing linear learning rate. - - Linear anneal from `start` to `end` as percentage goes from 0.0 to 1.0. - - Args: - start (float): The starting learning rate of the linear annealing. - end (float): The ending learning rate of the linear annealing. - factor (float): The annealing progress as a percentage. - Range from 0.0 to 1.0. - """ - return start + (end - start) * factor - - -def format_param(name, optim, param): - if isinstance(param, numbers.Number): - return [param] * len(optim.param_groups) - elif isinstance(param, (list, tuple)): # multi param groups - if len(param) != len(optim.param_groups): - raise ValueError(f'expected {len(optim.param_groups)} ' - f'values for {name}, got {len(param)}') - return param - else: # multi optimizers - if name not in param: - raise KeyError(f'{name} is not found in {param.keys()}') - return param[name] diff --git a/mmcv/runner/hooks/memory.py b/mmcv/runner/hooks/memory.py deleted file mode 100644 index 78d1a7e368..0000000000 --- a/mmcv/runner/hooks/memory.py +++ /dev/null @@ -1,28 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved.
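Since `annealing_cos` and `annealing_linear` are pure functions, their endpoint behaviour is easy to verify standalone. A minimal sketch (the function body below is copied from this diff; the asserts are added for illustration):

    from math import cos, isclose, pi

    def annealing_cos(start, end, factor, weight=1.):
        cos_out = cos(pi * factor) + 1
        return end + 0.5 * weight * (start - end) * cos_out

    assert isclose(annealing_cos(0.1, 0.0, 0.0), 0.1)                # factor 0 -> start
    assert isclose(annealing_cos(0.1, 0.0, 1.0), 0.0, abs_tol=1e-12)  # factor 1 -> end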
-import torch - -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class EmptyCacheHook(Hook): - - def __init__(self, - before_epoch: bool = False, - after_epoch: bool = True, - after_iter: bool = False): - self._before_epoch = before_epoch - self._after_epoch = after_epoch - self._after_iter = after_iter - - def after_iter(self, runner): - if self._after_iter: - torch.cuda.empty_cache() - - def before_epoch(self, runner): - if self._before_epoch: - torch.cuda.empty_cache() - - def after_epoch(self, runner): - if self._after_epoch: - torch.cuda.empty_cache() diff --git a/mmcv/runner/hooks/momentum_updater.py b/mmcv/runner/hooks/momentum_updater.py deleted file mode 100644 index fd9bc4834b..0000000000 --- a/mmcv/runner/hooks/momentum_updater.py +++ /dev/null @@ -1,594 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from typing import Callable, Dict, List, Optional, Tuple, Union - -import mmcv -from .hook import HOOKS, Hook -from .lr_updater import annealing_cos, annealing_linear, format_param - - -class MomentumUpdaterHook(Hook): - - def __init__(self, - by_epoch: bool = True, - warmup: Optional[str] = None, - warmup_iters: int = 0, - warmup_ratio: float = 0.9): - # validate the "warmup" argument - if warmup is not None: - if warmup not in ['constant', 'linear', 'exp']: - raise ValueError( - f'"{warmup}" is not a supported type for warming up, valid' - ' types are "constant", "linear" and "exp"') - if warmup is not None: - assert warmup_iters > 0, \ - '"warmup_iters" must be a positive integer' - assert 0 < warmup_ratio <= 1.0, \ - '"warmup_ratio" must be in range (0,1]' - - self.by_epoch = by_epoch - self.warmup = warmup - self.warmup_iters = warmup_iters - self.warmup_ratio = warmup_ratio - - # initial momentum for all param groups - self.base_momentum: Union[list, dict] = [] - # expected momentum if no warming up is performed - self.regular_momentum: Union[list, dict] = [] - - def _set_momentum(self, runner, momentum_groups): - if isinstance(runner.optimizer, dict): - for k, optim in runner.optimizer.items(): - for param_group, mom in zip(optim.param_groups, - momentum_groups[k]): - if 'momentum' in param_group.keys(): - param_group['momentum'] = mom - elif 'betas' in param_group.keys(): - param_group['betas'] = (mom, param_group['betas'][1]) - else: - for param_group, mom in zip(runner.optimizer.param_groups, - momentum_groups): - if 'momentum' in param_group.keys(): - param_group['momentum'] = mom - elif 'betas' in param_group.keys(): - param_group['betas'] = (mom, param_group['betas'][1]) - - def get_momentum(self, runner, base_momentum) -> float: - raise NotImplementedError - - def get_regular_momentum(self, runner) -> Union[list, Dict[str, list]]: - if isinstance(runner.optimizer, dict): - assert isinstance(self.base_momentum, dict) - momentum_groups: Dict[str, List[float]] = {} - for k in runner.optimizer.keys(): - _momentum_group: List[float] = [ - self.get_momentum(runner, _base_momentum) - for _base_momentum in self.base_momentum[k] - ] - momentum_groups.update({k: _momentum_group}) - return momentum_groups - else: - assert isinstance(self.base_momentum, list) - return [ - self.get_momentum(runner, _base_momentum) - for _base_momentum in self.base_momentum - ] - - def get_warmup_momentum( - self, - cur_iters: int) -> Union[List[float], Dict[str, List[float]]]: - - def _get_warmup_momentum(cur_iters, regular_momentum): - if self.warmup == 'constant': - warmup_momentum = [ - _momentum / self.warmup_ratio - for _momentum in regular_momentum - ] - elif self.warmup 
== 'linear': - k = (1 - cur_iters / self.warmup_iters) * (1 - - self.warmup_ratio) - warmup_momentum = [ - _momentum / (1 - k) for _momentum in regular_momentum - ] - elif self.warmup == 'exp': - k = self.warmup_ratio**(1 - cur_iters / self.warmup_iters) - warmup_momentum = [ - _momentum / k for _momentum in regular_momentum - ] - else: - raise ValueError( - 'Expected values of `self.warmup` to be "constant", ' - f'"linear", or "exp", got {self.warmup}') - return warmup_momentum - - if isinstance(self.regular_momentum, dict): - momentum_groups = {} - for key, regular_momentum in self.regular_momentum.items(): - momentum_groups[key] = _get_warmup_momentum( - cur_iters, regular_momentum) - return momentum_groups - else: - return _get_warmup_momentum(cur_iters, self.regular_momentum) - - def before_run(self, runner): - # NOTE: when resuming from a checkpoint, - # if 'initial_momentum' is not saved, - # it will be set according to the optimizer params - if isinstance(runner.optimizer, dict): - self.base_momentum = {} - for k, optim in runner.optimizer.items(): - for group in optim.param_groups: - if 'momentum' in group.keys(): - group.setdefault('initial_momentum', group['momentum']) - else: - group.setdefault('initial_momentum', group['betas'][0]) - _base_momentum = [ - group['initial_momentum'] for group in optim.param_groups - ] - self.base_momentum.update({k: _base_momentum}) - else: - for group in runner.optimizer.param_groups: - if 'momentum' in group.keys(): - group.setdefault('initial_momentum', group['momentum']) - else: - group.setdefault('initial_momentum', group['betas'][0]) - self.base_momentum = [ - group['initial_momentum'] - for group in runner.optimizer.param_groups - ] - - def before_train_epoch(self, runner): - if not self.by_epoch: - return - self.regular_momentum = self.get_regular_momentum(runner) - self._set_momentum(runner, self.regular_momentum) - - def before_train_iter(self, runner): - cur_iter = runner.iter - if not self.by_epoch: - self.regular_momentum = self.get_regular_momentum(runner) - if self.warmup is None or cur_iter >= self.warmup_iters: - self._set_momentum(runner, self.regular_momentum) - else: - warmup_momentum = self.get_warmup_momentum(cur_iter) - self._set_momentum(runner, warmup_momentum) - elif self.by_epoch: - if self.warmup is None or cur_iter > self.warmup_iters: - return - elif cur_iter == self.warmup_iters: - self._set_momentum(runner, self.regular_momentum) - else: - warmup_momentum = self.get_warmup_momentum(cur_iter) - self._set_momentum(runner, warmup_momentum) - - -@HOOKS.register_module() -class StepMomentumUpdaterHook(MomentumUpdaterHook): - """Step momentum scheduler with min value clipping. - - Args: - step (int | list[int]): Step to decay the momentum. If an int value is - given, regard it as the decay interval. If a list is given, decay - momentum at these steps. - gamma (float, optional): Decay momentum ratio. Default: 0.5. - min_momentum (float, optional): Minimum momentum value to keep. If - momentum after decay is lower than this value, it will be clipped - accordingly. If None is given, we don't perform momentum clipping. - Default: None. 
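The three warmup rules in `_get_warmup_momentum` divide the regular momentum rather than multiply it, so momentum is warmed from above. A standalone sketch of the same arithmetic with illustrative numbers:

    warmup_ratio, warmup_iters, cur_iters = 0.9, 100, 50
    regular = 0.95

    constant = regular / warmup_ratio                        # flat offset above regular
    k = (1 - cur_iters / warmup_iters) * (1 - warmup_ratio)
    linear = regular / (1 - k)                               # shrinks to regular at warmup end
    k = warmup_ratio ** (1 - cur_iters / warmup_iters)
    exp = regular / k                                        # shrinks to regular at warmup end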
- """ - - def __init__(self, - step: Union[int, List[int]], - gamma: float = 0.5, - min_momentum: Optional[float] = None, - **kwargs): - if isinstance(step, list): - assert mmcv.is_list_of(step, int) - assert all([s > 0 for s in step]) - elif isinstance(step, int): - assert step > 0 - else: - raise TypeError('"step" must be a list or integer') - self.step = step - self.gamma = gamma - self.min_momentum = min_momentum - super().__init__(**kwargs) - - def get_momentum(self, runner, base_momentum: float) -> float: - progress = runner.epoch if self.by_epoch else runner.iter - - # calculate exponential term - if isinstance(self.step, int): - exp = progress // self.step - else: - exp = len(self.step) - for i, s in enumerate(self.step): - if progress < s: - exp = i - break - - momentum = base_momentum * (self.gamma**exp) - if self.min_momentum is not None: - # clip to a minimum value - momentum = max(momentum, self.min_momentum) - return momentum - - -@HOOKS.register_module() -class CosineAnnealingMomentumUpdaterHook(MomentumUpdaterHook): - """Cosine annealing LR Momentum decays the Momentum of each parameter group - linearly. - - Args: - min_momentum (float, optional): The minimum momentum. Default: None. - min_momentum_ratio (float, optional): The ratio of minimum momentum to - the base momentum. Either `min_momentum` or `min_momentum_ratio` - should be specified. Default: None. - """ - - def __init__(self, - min_momentum: Optional[float] = None, - min_momentum_ratio: Optional[float] = None, - **kwargs): - assert (min_momentum is None) ^ (min_momentum_ratio is None) - self.min_momentum = min_momentum - self.min_momentum_ratio = min_momentum_ratio - super().__init__(**kwargs) - - def get_momentum(self, runner, base_momentum: float) -> float: - if self.by_epoch: - progress = runner.epoch - max_progress = runner.max_epochs - else: - progress = runner.iter - max_progress = runner.max_iters - if self.min_momentum_ratio is not None: - target_momentum = base_momentum * self.min_momentum_ratio - else: - assert self.min_momentum is not None - target_momentum = self.min_momentum - return annealing_cos(base_momentum, target_momentum, - progress / max_progress) - - -@HOOKS.register_module() -class LinearAnnealingMomentumUpdaterHook(MomentumUpdaterHook): - """Linear annealing LR Momentum decays the Momentum of each parameter group - linearly. - - Args: - min_momentum (float, optional): The minimum momentum. Default: None. - min_momentum_ratio (float, optional): The ratio of minimum momentum to - the base momentum. Either `min_momentum` or `min_momentum_ratio` - should be specified. Default: None. - """ - - def __init__(self, - min_momentum: Optional[float] = None, - min_momentum_ratio: Optional[float] = None, - **kwargs): - assert (min_momentum is None) ^ (min_momentum_ratio is None) - self.min_momentum = min_momentum - self.min_momentum_ratio = min_momentum_ratio - super().__init__(**kwargs) - - def get_momentum(self, runner, base_momentum: float) -> float: - if self.by_epoch: - progress = runner.epoch - max_progress = runner.max_epochs - else: - progress = runner.iter - max_progress = runner.max_iters - if self.min_momentum_ratio is not None: - target_momentum = base_momentum * self.min_momentum_ratio - else: - assert self.min_momentum is not None - target_momentum = self.min_momentum - return annealing_linear(base_momentum, target_momentum, - progress / max_progress) - - -@HOOKS.register_module() -class CyclicMomentumUpdaterHook(MomentumUpdaterHook): - """Cyclic momentum Scheduler. 
- - Implement the cyclical momentum scheduler policy described in - https://arxiv.org/pdf/1708.07120.pdf - - This momentum scheduler is usually used together with the CyclicLRUpdater - to improve the performance in the 3D detection area. - - Args: - target_ratio (tuple[float]): Relative ratio of the lowest momentum and - the highest momentum to the initial momentum. - cyclic_times (int): Number of cycles during training. - step_ratio_up (float): The ratio of the increasing process of momentum - in the total cycle. - by_epoch (bool): Whether to update momentum by epoch. - anneal_strategy (str, optional): {'cos', 'linear'} - Specifies the annealing strategy: 'cos' for cosine annealing, - 'linear' for linear annealing. Default: 'cos'. - gamma (float, optional): Cycle decay ratio. Default: 1. - It takes values in the range (0, 1]. The difference between the - maximum momentum and the minimum momentum decreases - periodically when it is less than 1. `New in version 1.4.4.` - """ - - def __init__(self, - by_epoch: bool = False, - target_ratio: Tuple[float, float] = (0.85 / 0.95, 1.), - cyclic_times: int = 1, - step_ratio_up: float = 0.4, - anneal_strategy: str = 'cos', - gamma: float = 1., - **kwargs): - if isinstance(target_ratio, float): - target_ratio = (target_ratio, target_ratio / 1e5) - elif isinstance(target_ratio, tuple): - target_ratio = (target_ratio[0], target_ratio[0] / 1e5) \ - if len(target_ratio) == 1 else target_ratio - else: - raise ValueError('target_ratio should be either float ' - f'or tuple, got {type(target_ratio)}') - - assert len(target_ratio) == 2, \ - '"target_ratio" must be list or tuple of two floats' - assert 0 <= step_ratio_up < 1.0, \ - '"step_ratio_up" must be in range [0,1)' - - self.target_ratio = target_ratio - self.cyclic_times = cyclic_times - self.step_ratio_up = step_ratio_up - self.gamma = gamma - self.momentum_phases: List[list] = [] # init momentum_phases - - self.anneal_func: Callable[[float, float, float], float] - if anneal_strategy not in ['cos', 'linear']: - raise ValueError('anneal_strategy must be one of "cos" or ' - f'"linear", instead got {anneal_strategy}') - elif anneal_strategy == 'cos': - self.anneal_func = annealing_cos - elif anneal_strategy == 'linear': - self.anneal_func = annealing_linear - # currently only support by_epoch=False - assert not by_epoch, \ - 'currently only support "by_epoch" = False' - super().__init__(by_epoch, **kwargs) - - def before_run(self, runner): - super().before_run(runner) - # initiate momentum_phases - # total momentum_phases are separated as up and down - max_iter_per_phase = runner.max_iters // self.cyclic_times - iter_up_phase = int(self.step_ratio_up * max_iter_per_phase) - self.max_iter_per_phase = max_iter_per_phase - self.momentum_phases.append( - [0, iter_up_phase, 1, self.target_ratio[0]]) - self.momentum_phases.append([ - iter_up_phase, max_iter_per_phase, self.target_ratio[0], - self.target_ratio[1] - ]) - - def get_momentum(self, runner, base_momentum: float) -> float: - curr_iter = runner.iter % self.max_iter_per_phase - curr_cycle = runner.iter // self.max_iter_per_phase - scale = self.gamma**curr_cycle - for (start_iter, end_iter, start_ratio, end_ratio) \ - in self.momentum_phases: - if start_iter <= curr_iter < end_iter: - # Apply cycle scaling to gradually reduce the difference - # between max_momentum and base momentum. 
The target end_ratio - # can be expressed as: - # end_ratio = (base_momentum + scale * \ - # (max_momentum - base_momentum)) / base_momentum - # iteration: 0-iter_up_phase: - if start_iter == 0: - end_ratio = 1 - scale + end_ratio * scale - # iteration: iter_up_phase-self.max_iter_per_phase - else: - start_ratio = 1 - scale + start_ratio * scale - progress = curr_iter - start_iter - return self.anneal_func(base_momentum * start_ratio, - base_momentum * end_ratio, - progress / (end_iter - start_iter)) - raise RuntimeError('The method should return within the for-loop ' - 'and should never reach this line') - - -@HOOKS.register_module() -class OneCycleMomentumUpdaterHook(MomentumUpdaterHook): - """OneCycle momentum Scheduler. - - This momentum scheduler is usually used together with the OneCycleLrUpdater - to improve the performance. - - Args: - base_momentum (float or list): Lower momentum boundaries in the cycle - for each parameter group. Note that momentum is cycled inversely - to learning rate; at the peak of a cycle, momentum is - 'base_momentum' and learning rate is 'max_lr'. - Default: 0.85 - max_momentum (float or list): Upper momentum boundaries in the cycle - for each parameter group. Functionally, - it defines the cycle amplitude (max_momentum - base_momentum). - Note that momentum is cycled inversely - to learning rate; at the start of a cycle, momentum is - 'max_momentum' and learning rate is 'base_lr'. - Default: 0.95 - pct_start (float): The percentage of the cycle (in number of steps) - spent increasing the learning rate. - Default: 0.3 - anneal_strategy (str): {'cos', 'linear'} - Specifies the annealing strategy: 'cos' for cosine annealing, - 'linear' for linear annealing. - Default: 'cos' - three_phase (bool): If three_phase is True, use a third phase of the - schedule to annihilate the learning rate according to - final_div_factor instead of modifying the second phase (the first - two phases will be symmetrical about the step indicated by - pct_start). 
- Default: False - """ - - def __init__(self, - base_momentum: Union[float, list, dict] = 0.85, - max_momentum: Union[float, list, dict] = 0.95, - pct_start: float = 0.3, - anneal_strategy: str = 'cos', - three_phase: bool = False, - **kwargs): - # validate by_epoch, currently only support by_epoch=False - if 'by_epoch' not in kwargs: - kwargs['by_epoch'] = False - else: - assert not kwargs['by_epoch'], \ - 'currently only support "by_epoch" = False' - if not isinstance(base_momentum, (float, list, dict)): - raise ValueError('base_momentum must be a float, ' - 'list or dict.') - self._base_momentum = base_momentum - if not isinstance(max_momentum, (float, list, dict)): - raise ValueError('max_momentum must be a float, ' - 'list or dict.') - self._max_momentum = max_momentum - # validate pct_start - if pct_start < 0 or pct_start > 1 or not isinstance(pct_start, float): - raise ValueError('Expected a float between 0 and 1 for pct_start, but ' - f'got {pct_start}') - self.pct_start = pct_start - # validate anneal_strategy - self.anneal_func: Callable[[float, float, float], float] - if anneal_strategy not in ['cos', 'linear']: - raise ValueError('anneal_strategy must be one of "cos" or ' - f'"linear", instead got {anneal_strategy}') - elif anneal_strategy == 'cos': - self.anneal_func = annealing_cos - elif anneal_strategy == 'linear': - self.anneal_func = annealing_linear - self.three_phase = three_phase - self.momentum_phases: List[dict] = [] # init momentum_phases - super().__init__(**kwargs) - - def before_run(self, runner): - if isinstance(runner.optimizer, dict): - for k, optim in runner.optimizer.items(): - if ('momentum' not in optim.defaults - and 'betas' not in optim.defaults): - raise ValueError('optimizer must support momentum with ' - 'option enabled') - self.use_beta1 = 'betas' in optim.defaults - _base_momentum = format_param(k, optim, self._base_momentum) - _max_momentum = format_param(k, optim, self._max_momentum) - for group, b_momentum, m_momentum in zip( - optim.param_groups, _base_momentum, _max_momentum): - if self.use_beta1: - _, beta2 = group['betas'] - group['betas'] = (m_momentum, beta2) - else: - group['momentum'] = m_momentum - group['base_momentum'] = b_momentum - group['max_momentum'] = m_momentum - else: - optim = runner.optimizer - if ('momentum' not in optim.defaults - and 'betas' not in optim.defaults): - raise ValueError('optimizer must support momentum with ' - 'option enabled') - self.use_beta1 = 'betas' in optim.defaults - k = type(optim).__name__ - _base_momentum = format_param(k, optim, self._base_momentum) - _max_momentum = format_param(k, optim, self._max_momentum) - for group, b_momentum, m_momentum in zip(optim.param_groups, - _base_momentum, - _max_momentum): - if self.use_beta1: - _, beta2 = group['betas'] - group['betas'] = (m_momentum, beta2) - else: - group['momentum'] = m_momentum - group['base_momentum'] = b_momentum - group['max_momentum'] = m_momentum - - if self.three_phase: - self.momentum_phases.append({ - 'end_iter': - float(self.pct_start * runner.max_iters) - 1, - 'start_momentum': - 'max_momentum', - 'end_momentum': - 'base_momentum' - }) - self.momentum_phases.append({ - 'end_iter': - float(2 * self.pct_start * runner.max_iters) - 2, - 'start_momentum': - 'base_momentum', - 'end_momentum': - 'max_momentum' - }) - self.momentum_phases.append({ - 'end_iter': runner.max_iters - 1, - 'start_momentum': 'max_momentum', - 'end_momentum': 'max_momentum' - }) - else: - self.momentum_phases.append({ - 'end_iter': - 
float(self.pct_start * runner.max_iters) - 1, - 'start_momentum': - 'max_momentum', - 'end_momentum': - 'base_momentum' - }) - self.momentum_phases.append({ - 'end_iter': runner.max_iters - 1, - 'start_momentum': 'base_momentum', - 'end_momentum': 'max_momentum' - }) - - def _set_momentum(self, runner, momentum_groups): - if isinstance(runner.optimizer, dict): - for k, optim in runner.optimizer.items(): - for param_group, mom in zip(optim.param_groups, - momentum_groups[k]): - if 'momentum' in param_group.keys(): - param_group['momentum'] = mom - elif 'betas' in param_group.keys(): - param_group['betas'] = (mom, param_group['betas'][1]) - else: - for param_group, mom in zip(runner.optimizer.param_groups, - momentum_groups): - if 'momentum' in param_group.keys(): - param_group['momentum'] = mom - elif 'betas' in param_group.keys(): - param_group['betas'] = (mom, param_group['betas'][1]) - - def get_momentum(self, runner, param_group: Dict[str, float]) -> float: - curr_iter = runner.iter - start_iter = 0 - momentum = 0. - for i, phase in enumerate(self.momentum_phases): - end_iter = phase['end_iter'] - if curr_iter <= end_iter or i == len(self.momentum_phases) - 1: - pct = (curr_iter - start_iter) / (end_iter - start_iter) - momentum = self.anneal_func( - param_group[phase['start_momentum']], - param_group[phase['end_momentum']], pct) - break - start_iter = end_iter - return momentum - - def get_regular_momentum(self, runner): - if isinstance(runner.optimizer, dict): - momentum_groups = {} - for k, optim in runner.optimizer.items(): - _momentum_group = [ - self.get_momentum(runner, param_group) - for param_group in optim.param_groups - ] - momentum_groups.update({k: _momentum_group}) - return momentum_groups - else: - momentum_groups = [] - for param_group in runner.optimizer.param_groups: - momentum_groups.append(self.get_momentum(runner, param_group)) - return momentum_groups diff --git a/mmcv/runner/hooks/optimizer.py b/mmcv/runner/hooks/optimizer.py deleted file mode 100644 index fb3f90e656..0000000000 --- a/mmcv/runner/hooks/optimizer.py +++ /dev/null @@ -1,563 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import logging -from collections import defaultdict -from itertools import chain -from typing import Optional, Union - -import torch.nn as nn -from torch import Tensor -from torch.nn.utils import clip_grad - -from mmcv.utils import TORCH_VERSION, _BatchNorm, digit_version -from ..dist_utils import allreduce_grads -from ..fp16_utils import LossScaler, wrap_fp16_model -from .hook import HOOKS, Hook - -try: - # If PyTorch version >= 1.6.0, torch.cuda.amp.GradScaler would be imported - # and used; otherwise, auto fp16 will adopt mmcv's implementation. - from torch.cuda.amp import GradScaler -except ImportError: - pass - - -@HOOKS.register_module() -class OptimizerHook(Hook): - """A hook contains custom operations for the optimizer. - - Args: - grad_clip (dict, optional): A config dict to control the clip_grad. - Default: None. - detect_anomalous_params (bool): This option is only used for - debugging which will slow down the training speed. - Detect anomalous parameters that are not included in - the computational graph with `loss` as the root. - There are two cases - - - Parameters were not used during - forward pass. - - Parameters were not used to produce - loss. - Default: False. 
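Since `grad_clip` is forwarded verbatim to `torch.nn.utils.clip_grad.clip_grad_norm_` (see `clip_grads` below), its keys simply mirror that function's keyword arguments. A typical (illustrative) config:

    optimizer_config = dict(grad_clip=dict(max_norm=35, norm_type=2))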
- """ - - def __init__(self, - grad_clip: Optional[dict] = None, - detect_anomalous_params: bool = False): - self.grad_clip = grad_clip - self.detect_anomalous_params = detect_anomalous_params - - def clip_grads(self, params): - params = list( - filter(lambda p: p.requires_grad and p.grad is not None, params)) - if len(params) > 0: - return clip_grad.clip_grad_norm_(params, **self.grad_clip) - - def after_train_iter(self, runner): - runner.optimizer.zero_grad() - if self.detect_anomalous_params: - self.detect_anomalous_parameters(runner.outputs['loss'], runner) - runner.outputs['loss'].backward() - - if self.grad_clip is not None: - grad_norm = self.clip_grads(runner.model.parameters()) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update({'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - runner.optimizer.step() - - def detect_anomalous_parameters(self, loss: Tensor, runner) -> None: - logger = runner.logger - parameters_in_graph = set() - visited = set() - - def traverse(grad_fn): - if grad_fn is None: - return - if grad_fn not in visited: - visited.add(grad_fn) - if hasattr(grad_fn, 'variable'): - parameters_in_graph.add(grad_fn.variable) - parents = grad_fn.next_functions - if parents is not None: - for parent in parents: - grad_fn = parent[0] - traverse(grad_fn) - - traverse(loss.grad_fn) - for n, p in runner.model.named_parameters(): - if p not in parameters_in_graph and p.requires_grad: - logger.log( - level=logging.ERROR, - msg=f'{n} with shape {p.size()} is not ' - f'in the computational graph \n') - - -@HOOKS.register_module() -class GradientCumulativeOptimizerHook(OptimizerHook): - """Optimizer Hook implements multi-iters gradient cumulating. - - Args: - cumulative_iters (int, optional): Num of gradient cumulative iters. - The optimizer will step every `cumulative_iters` iters. - Defaults to 1. - - Examples: - >>> # Use cumulative_iters to simulate a large batch size - >>> # It is helpful when the hardware cannot handle a large batch size. - >>> loader = DataLoader(data, batch_size=64) - >>> optim_hook = GradientCumulativeOptimizerHook(cumulative_iters=4) - >>> # almost equals to - >>> loader = DataLoader(data, batch_size=256) - >>> optim_hook = OptimizerHook() - """ - - def __init__(self, cumulative_iters: int = 1, **kwargs): - super().__init__(**kwargs) - - assert isinstance(cumulative_iters, int) and cumulative_iters > 0, \ - f'cumulative_iters only accepts positive int, but got ' \ - f'{type(cumulative_iters)} instead.' - - self.cumulative_iters = cumulative_iters - self.divisible_iters = 0 - self.remainder_iters = 0 - self.initialized = False - - def has_batch_norm(self, module: nn.Module) -> bool: - if isinstance(module, _BatchNorm): - return True - for m in module.children(): - if self.has_batch_norm(m): - return True - return False - - def _init(self, runner): - if runner.iter % self.cumulative_iters != 0: - runner.logger.warning( - 'Resume iter number is not divisible by cumulative_iters in ' - 'GradientCumulativeOptimizerHook, which means the gradient of ' - 'some iters is lost and the result may be influenced slightly.' 
- ) - - if self.has_batch_norm(runner.model) and self.cumulative_iters > 1: - runner.logger.warning( - 'GradientCumulativeOptimizerHook may slightly decrease ' - 'performance if the model has BatchNorm layers.') - - residual_iters = runner.max_iters - runner.iter - - self.divisible_iters = ( - residual_iters // self.cumulative_iters * self.cumulative_iters) - self.remainder_iters = residual_iters - self.divisible_iters - - self.initialized = True - - def after_train_iter(self, runner): - if not self.initialized: - self._init(runner) - - if runner.iter < self.divisible_iters: - loss_factor = self.cumulative_iters - else: - loss_factor = self.remainder_iters - loss = runner.outputs['loss'] - loss = loss / loss_factor - loss.backward() - - if (self.every_n_iters(runner, self.cumulative_iters) - or self.is_last_iter(runner)): - - if self.grad_clip is not None: - grad_norm = self.clip_grads(runner.model.parameters()) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update({'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - runner.optimizer.step() - runner.optimizer.zero_grad() - - -if (TORCH_VERSION != 'parrots' - and digit_version(TORCH_VERSION) >= digit_version('1.6.0')): - - @HOOKS.register_module() - class Fp16OptimizerHook(OptimizerHook): - """FP16 optimizer hook (using PyTorch's implementation). - - If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, - to take care of the optimization procedure. - - Args: - loss_scale (float | str | dict): Scale factor configuration. - If loss_scale is a float, static loss scaling will be used with - the specified scale. If loss_scale is a string, it must be - 'dynamic', then dynamic loss scaling will be used. - It can also be a dict containing arguments of GradScaler. - Defaults to 512. For PyTorch >= 1.6, mmcv uses the official - implementation of GradScaler. If you use a dict version of - loss_scale to create GradScaler, please refer to: - https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler - for the parameters. - - Examples: - >>> loss_scale = dict( - ... init_scale=65536.0, - ... growth_factor=2.0, - ... backoff_factor=0.5, - ... growth_interval=2000 - ... 
) - >>> optimizer_hook = Fp16OptimizerHook(loss_scale=loss_scale) - """ - - def __init__(self, - grad_clip: Optional[dict] = None, - coalesce: bool = True, - bucket_size_mb: int = -1, - loss_scale: Union[float, str, dict] = 512., - distributed: bool = True): - self.grad_clip = grad_clip - self.coalesce = coalesce - self.bucket_size_mb = bucket_size_mb - self.distributed = distributed - self._scale_update_param = None - if loss_scale == 'dynamic': - self.loss_scaler = GradScaler() - elif isinstance(loss_scale, float): - self._scale_update_param = loss_scale - self.loss_scaler = GradScaler(init_scale=loss_scale) - elif isinstance(loss_scale, dict): - self.loss_scaler = GradScaler(**loss_scale) - else: - raise ValueError('loss_scale must be of type float, dict, or ' - f'"dynamic", got {loss_scale}') - - def before_run(self, runner) -> None: - """Preparing steps before Mixed Precision Training.""" - # wrap model mode to fp16 - wrap_fp16_model(runner.model) - # resume from state dict - if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: - scaler_state_dict = runner.meta['fp16']['loss_scaler'] - self.loss_scaler.load_state_dict(scaler_state_dict) - - def copy_grads_to_fp32(self, fp16_net: nn.Module, - fp32_weights: Tensor) -> None: - """Copy gradients from fp16 model to fp32 weight copy.""" - for fp32_param, fp16_param in zip(fp32_weights, - fp16_net.parameters()): - if fp16_param.grad is not None: - if fp32_param.grad is None: - fp32_param.grad = fp32_param.data.new( - fp32_param.size()) - fp32_param.grad.copy_(fp16_param.grad) - - def copy_params_to_fp16(self, fp16_net: nn.Module, - fp32_weights: Tensor) -> None: - """Copy updated params from fp32 weight copy to fp16 model.""" - for fp16_param, fp32_param in zip(fp16_net.parameters(), - fp32_weights): - fp16_param.data.copy_(fp32_param.data) - - def after_train_iter(self, runner) -> None: - """Backward optimization steps for Mixed Precision Training. For - dynamic loss scaling, please refer to - https://pytorch.org/docs/stable/amp.html#torch.cuda.amp.GradScaler. - - 1. Scale the loss by a scale factor. - 2. Backward the loss to obtain the gradients. - 3. Unscale the optimizer’s gradient tensors. - 4. Call optimizer.step() and update scale factor. - 5. Save loss_scaler state_dict for resume purpose. - """ - # clear grads of last iteration - runner.model.zero_grad() - runner.optimizer.zero_grad() - - self.loss_scaler.scale(runner.outputs['loss']).backward() - self.loss_scaler.unscale_(runner.optimizer) - # grad clip - if self.grad_clip is not None: - grad_norm = self.clip_grads(runner.model.parameters()) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update({'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - # backward and update scaler - self.loss_scaler.step(runner.optimizer) - self.loss_scaler.update(self._scale_update_param) - - # save state_dict of loss_scaler - runner.meta.setdefault( - 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() - - @HOOKS.register_module() - class GradientCumulativeFp16OptimizerHook(GradientCumulativeOptimizerHook, - Fp16OptimizerHook): - """Fp16 optimizer Hook (using PyTorch's implementation) implements - multi-iters gradient cumulating. - - If you are using PyTorch >= 1.6, torch.cuda.amp is used as the backend, - to take care of the optimization procedure. 
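Those five steps map one-to-one onto the public `torch.cuda.amp.GradScaler` API. A bare sketch outside any runner (`loss` and `optimizer` are assumed to come from an existing training loop):

    import torch

    scaler = torch.cuda.amp.GradScaler(init_scale=512.)
    optimizer.zero_grad()              # loss/optimizer: assumed training state
    scaler.scale(loss).backward()      # steps 1-2: scale the loss, then backward
    scaler.unscale_(optimizer)         # step 3: unscale grads so clipping sees true values
    scaler.step(optimizer)             # step 4: skips the update if grads contain inf/nan
    scaler.update()                    # step 4: adjust the scale for the next iteration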
- """ - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def after_train_iter(self, runner) -> None: - if not self.initialized: - self._init(runner) - - if runner.iter < self.divisible_iters: - loss_factor = self.cumulative_iters - else: - loss_factor = self.remainder_iters - loss = runner.outputs['loss'] - loss = loss / loss_factor - - self.loss_scaler.scale(loss).backward() - - if (self.every_n_iters(runner, self.cumulative_iters) - or self.is_last_iter(runner)): - - # copy fp16 grads in the model to fp32 params in the optimizer - self.loss_scaler.unscale_(runner.optimizer) - - if self.grad_clip is not None: - grad_norm = self.clip_grads(runner.model.parameters()) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update( - {'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - - # backward and update scaler - self.loss_scaler.step(runner.optimizer) - self.loss_scaler.update(self._scale_update_param) - - # save state_dict of loss_scaler - runner.meta.setdefault( - 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() - - # clear grads - runner.model.zero_grad() - runner.optimizer.zero_grad() - -else: - - @HOOKS.register_module() - class Fp16OptimizerHook(OptimizerHook): # type: ignore - """FP16 optimizer hook (mmcv's implementation). - - The steps of fp16 optimizer is as follows. - 1. Scale the loss value. - 2. BP in the fp16 model. - 2. Copy gradients from fp16 model to fp32 weights. - 3. Update fp32 weights. - 4. Copy updated parameters from fp32 weights to fp16 model. - - Refer to https://arxiv.org/abs/1710.03740 for more details. - - Args: - loss_scale (float | str | dict): Scale factor configuration. - If loss_scale is a float, static loss scaling will be used with - the specified scale. If loss_scale is a string, it must be - 'dynamic', then dynamic loss scaling will be used. - It can also be a dict containing arguments of LossScaler. - Defaults to 512. - """ - - def __init__(self, - grad_clip: Optional[dict] = None, - coalesce: bool = True, - bucket_size_mb: int = -1, - loss_scale: Union[float, str, dict] = 512., - distributed: bool = True): - self.grad_clip = grad_clip - self.coalesce = coalesce - self.bucket_size_mb = bucket_size_mb - self.distributed = distributed - if loss_scale == 'dynamic': - self.loss_scaler = LossScaler(mode='dynamic') - elif isinstance(loss_scale, float): - self.loss_scaler = LossScaler( - init_scale=loss_scale, mode='static') - elif isinstance(loss_scale, dict): - self.loss_scaler = LossScaler(**loss_scale) - else: - raise ValueError('loss_scale must be of type float, dict, or ' - f'"dynamic", got {loss_scale}') - - def before_run(self, runner) -> None: - """Preparing steps before Mixed Precision Training. - - 1. Make a master copy of fp32 weights for optimization. - 2. Convert the main model from fp32 to fp16. 
- """ - # keep a copy of fp32 weights - old_groups = runner.optimizer.param_groups - runner.optimizer.param_groups = copy.deepcopy( - runner.optimizer.param_groups) - state: defaultdict = defaultdict(dict) - p_map = { - old_p: p - for old_p, p in zip( - chain(*(g['params'] for g in old_groups)), - chain(*(g['params'] - for g in runner.optimizer.param_groups))) - } - for k, v in runner.optimizer.state.items(): - state[p_map[k]] = v - runner.optimizer.state = state - # convert model to fp16 - wrap_fp16_model(runner.model) - # resume from state dict - if 'fp16' in runner.meta and 'loss_scaler' in runner.meta['fp16']: - scaler_state_dict = runner.meta['fp16']['loss_scaler'] - self.loss_scaler.load_state_dict(scaler_state_dict) - - def copy_grads_to_fp32(self, fp16_net: nn.Module, - fp32_weights: Tensor) -> None: - """Copy gradients from fp16 model to fp32 weight copy.""" - for fp32_param, fp16_param in zip(fp32_weights, - fp16_net.parameters()): - if fp16_param.grad is not None: - if fp32_param.grad is None: - fp32_param.grad = fp32_param.data.new( - fp32_param.size()) - fp32_param.grad.copy_(fp16_param.grad) - - def copy_params_to_fp16(self, fp16_net: nn.Module, - fp32_weights: Tensor) -> None: - """Copy updated params from fp32 weight copy to fp16 model.""" - for fp16_param, fp32_param in zip(fp16_net.parameters(), - fp32_weights): - fp16_param.data.copy_(fp32_param.data) - - def after_train_iter(self, runner) -> None: - """Backward optimization steps for Mixed Precision Training. For - dynamic loss scaling, please refer `loss_scalar.py` - - 1. Scale the loss by a scale factor. - 2. Backward the loss to obtain the gradients (fp16). - 3. Copy gradients from the model to the fp32 weight copy. - 4. Scale the gradients back and update the fp32 weight copy. - 5. Copy back the params from fp32 weight copy to the fp16 model. - 6. Save loss_scaler state_dict for resume purpose. 
- """ - # clear grads of last iteration - runner.model.zero_grad() - runner.optimizer.zero_grad() - # scale the loss value - scaled_loss = runner.outputs['loss'] * self.loss_scaler.loss_scale - scaled_loss.backward() - # copy fp16 grads in the model to fp32 params in the optimizer - - fp32_weights = [] - for param_group in runner.optimizer.param_groups: - fp32_weights += param_group['params'] - self.copy_grads_to_fp32(runner.model, fp32_weights) - # allreduce grads - if self.distributed: - allreduce_grads(fp32_weights, self.coalesce, - self.bucket_size_mb) - - has_overflow = self.loss_scaler.has_overflow(fp32_weights) - # if has overflow, skip this iteration - if not has_overflow: - # scale the gradients back - for param in fp32_weights: - if param.grad is not None: - param.grad.div_(self.loss_scaler.loss_scale) - if self.grad_clip is not None: - grad_norm = self.clip_grads(fp32_weights) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update( - {'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - # update fp32 params - runner.optimizer.step() - # copy fp32 params to the fp16 model - self.copy_params_to_fp16(runner.model, fp32_weights) - self.loss_scaler.update_scale(has_overflow) - if has_overflow: - runner.logger.warning('Check overflow, downscale loss scale ' - f'to {self.loss_scaler.cur_scale}') - - # save state_dict of loss_scaler - runner.meta.setdefault( - 'fp16', {})['loss_scaler'] = self.loss_scaler.state_dict() - - @HOOKS.register_module() - class GradientCumulativeFp16OptimizerHook( # type: ignore - GradientCumulativeOptimizerHook, Fp16OptimizerHook): - """Fp16 optimizer Hook (using mmcv implementation) implements multi- - iters gradient cumulating.""" - - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - def after_train_iter(self, runner) -> None: - if not self.initialized: - self._init(runner) - - if runner.iter < self.divisible_iters: - loss_factor = self.cumulative_iters - else: - loss_factor = self.remainder_iters - - loss = runner.outputs['loss'] - loss = loss / loss_factor - - # scale the loss value - scaled_loss = loss * self.loss_scaler.loss_scale - scaled_loss.backward() - - if (self.every_n_iters(runner, self.cumulative_iters) - or self.is_last_iter(runner)): - - # copy fp16 grads in the model to fp32 params in the optimizer - fp32_weights = [] - for param_group in runner.optimizer.param_groups: - fp32_weights += param_group['params'] - self.copy_grads_to_fp32(runner.model, fp32_weights) - # allreduce grads - if self.distributed: - allreduce_grads(fp32_weights, self.coalesce, - self.bucket_size_mb) - - has_overflow = self.loss_scaler.has_overflow(fp32_weights) - # if has overflow, skip this iteration - if not has_overflow: - # scale the gradients back - for param in fp32_weights: - if param.grad is not None: - param.grad.div_(self.loss_scaler.loss_scale) - if self.grad_clip is not None: - grad_norm = self.clip_grads(fp32_weights) - if grad_norm is not None: - # Add grad norm to the logger - runner.log_buffer.update( - {'grad_norm': float(grad_norm)}, - runner.outputs['num_samples']) - # update fp32 params - runner.optimizer.step() - # copy fp32 params to the fp16 model - self.copy_params_to_fp16(runner.model, fp32_weights) - else: - runner.logger.warning( - 'Check overflow, downscale loss scale ' - f'to {self.loss_scaler.cur_scale}') - - self.loss_scaler.update_scale(has_overflow) - - # save state_dict of loss_scaler - runner.meta.setdefault( - 'fp16', {})['loss_scaler'] = 
self.loss_scaler.state_dict() - - # clear grads - runner.model.zero_grad() - runner.optimizer.zero_grad() diff --git a/mmcv/runner/hooks/profiler.py b/mmcv/runner/hooks/profiler.py deleted file mode 100644 index 6b0fc4b864..0000000000 --- a/mmcv/runner/hooks/profiler.py +++ /dev/null @@ -1,190 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os.path as osp -import warnings -from typing import Callable, List, Optional, Union - -import torch - -from ..dist_utils import master_only -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class ProfilerHook(Hook): - """Profiler to analyze performance during training. - - PyTorch Profiler is a tool that allows the collection of performance - metrics during training. More details on Profiler can be found at - https://pytorch.org/docs/1.8.1/profiler.html#torch.profiler.profile - - Args: - by_epoch (bool): Profile performance by epoch or by iteration. - Default: True. - profile_iters (int): Number of iterations for profiling. - If ``by_epoch=True``, profile_iters indicates that they are the - first profile_iters epochs at the beginning of the - training, otherwise it indicates the first profile_iters - iterations. Default: 1. - activities (list[str]): List of activity groups (CPU, CUDA) to use in - profiling. Default: ['cpu', 'cuda']. - schedule (dict, optional): Config for generating the callable schedule. - If schedule is None, profiler will not add step markers into the - trace and table view. Default: None. - on_trace_ready (callable, dict): Either a handler or a dict used to - generate a handler. Default: None. - record_shapes (bool): Save information about operator's input shapes. - Default: False. - profile_memory (bool): Track tensor memory allocation/deallocation. - Default: False. - with_stack (bool): Record source information (file and line number) - for the ops. Default: False. - with_flops (bool): Use formula to estimate the FLOPS of specific - operators (matrix multiplication and 2D convolution). - Default: False. - json_trace_path (str, optional): Exports the collected trace in Chrome - JSON format. Default: None. - - Example: - >>> runner = ... # instantiate a Runner - >>> # tensorboard trace - >>> trace_config = dict(type='tb_trace', dir_name='work_dir') - >>> profiler_config = dict(on_trace_ready=trace_config) - >>> runner.register_profiler_hook(profiler_config) - >>> runner.run(data_loaders=[trainloader], workflow=[('train', 1)]) - """ - - def __init__(self, - by_epoch: bool = True, - profile_iters: int = 1, - activities: List[str] = ['cpu', 'cuda'], - schedule: Optional[dict] = None, - on_trace_ready: Optional[Union[Callable, dict]] = None, - record_shapes: bool = False, - profile_memory: bool = False, - with_stack: bool = False, - with_flops: bool = False, - json_trace_path: Optional[str] = None) -> None: - try: - from torch import profiler # torch version >= 1.8.1 - except ImportError: - raise ImportError('profiler is a new feature of torch 1.8.1, ' - f'but your version is {torch.__version__}') - - assert isinstance(by_epoch, bool), '``by_epoch`` should be a boolean.' 
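Since the `schedule` dict is expanded with `torch.profiler.schedule(**schedule)` a few lines below, its keys follow that function's signature. An illustrative hook config:

    profiler_config = dict(
        on_trace_ready=dict(type='tb_trace', dir_name='work_dir'),
        schedule=dict(wait=1, warmup=1, active=2, repeat=1),  # torch.profiler.schedule kwargs
    )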
- self.by_epoch = by_epoch - - if profile_iters < 1: - raise ValueError('profile_iters should be greater than 0, but got ' - f'{profile_iters}') - self.profile_iters = profile_iters - - if not isinstance(activities, list): - raise ValueError( - f'activities should be list, but got {type(activities)}') - self.activities = [] - for activity in activities: - activity = activity.lower() - if activity == 'cpu': - self.activities.append(profiler.ProfilerActivity.CPU) - elif activity == 'cuda': - self.activities.append(profiler.ProfilerActivity.CUDA) - else: - raise ValueError( - f'activity should be "cpu" or "cuda", but got {activity}') - - if schedule is not None: - self.schedule = profiler.schedule(**schedule) - else: - self.schedule = None - - self.on_trace_ready = on_trace_ready - self.record_shapes = record_shapes - self.profile_memory = profile_memory - self.with_stack = with_stack - self.with_flops = with_flops - self.json_trace_path = json_trace_path - - @master_only - def before_run(self, runner): - if self.by_epoch and runner.max_epochs < self.profile_iters: - raise ValueError('self.profile_iters should not be greater than ' - f'{runner.max_epochs}') - - if not self.by_epoch and runner.max_iters < self.profile_iters: - raise ValueError('self.profile_iters should not be greater than ' - f'{runner.max_iters}') - - if callable(self.on_trace_ready): # handler - _on_trace_ready = self.on_trace_ready - elif isinstance(self.on_trace_ready, dict): # config of handler - trace_cfg = self.on_trace_ready.copy() - trace_type = trace_cfg.pop('type') # log_trace handler - if trace_type == 'log_trace': - - def _log_handler(prof): - print(prof.key_averages().table(**trace_cfg)) - - _on_trace_ready = _log_handler - elif trace_type == 'tb_trace': # tensorboard_trace handler - try: - import torch_tb_profiler # noqa: F401 - except ImportError: - raise ImportError('please run "pip install ' - 'torch-tb-profiler" to install ' - 'torch_tb_profiler') - if 'dir_name' not in trace_cfg: - trace_cfg['dir_name'] = osp.join(runner.work_dir, - 'tf_tracing_logs') - elif not osp.isabs(trace_cfg['dir_name']): - trace_cfg['dir_name'] = osp.join(runner.work_dir, - trace_cfg['dir_name']) - runner.logger.info( - 'tracing files of ProfilerHook will be saved to ' - f"{trace_cfg['dir_name']}.") - _on_trace_ready = torch.profiler.tensorboard_trace_handler( - **trace_cfg) - else: - raise ValueError('trace_type should be "log_trace" or ' - f'"tb_trace", but got {trace_type}') - elif self.on_trace_ready is None: - _on_trace_ready = None # type: ignore - else: - raise ValueError('on_trace_ready should be handler, dict or None, ' - f'but got {type(self.on_trace_ready)}') - - if self.by_epoch and runner.max_epochs > 1: - warnings.warn(f'profiler will profile {runner.max_epochs} epochs ' - 'instead of 1 epoch. Since profiler will slow down ' - 'the training, it is recommended to train 1 epoch ' - 'with ProfilerHook and adjust your setting according' - ' to the profiler summary. 
During normal training ' - '(epoch > 1), you may disable the ProfilerHook.') - - self.profiler = torch.profiler.profile( - activities=self.activities, - schedule=self.schedule, - on_trace_ready=_on_trace_ready, - record_shapes=self.record_shapes, - profile_memory=self.profile_memory, - with_stack=self.with_stack, - with_flops=self.with_flops) - - self.profiler.__enter__() - runner.logger.info('profiler is profiling...') - - @master_only - def after_train_epoch(self, runner): - if self.by_epoch and runner.epoch == self.profile_iters - 1: - runner.logger.info('profiler may take a few minutes...') - self.profiler.__exit__(None, None, None) - if self.json_trace_path is not None: - self.profiler.export_chrome_trace(self.json_trace_path) - - @master_only - def after_train_iter(self, runner): - self.profiler.step() - if not self.by_epoch and runner.iter == self.profile_iters - 1: - runner.logger.info('profiler may take a few minutes...') - self.profiler.__exit__(None, None, None) - if self.json_trace_path is not None: - self.profiler.export_chrome_trace(self.json_trace_path) diff --git a/mmcv/runner/hooks/sampler_seed.py b/mmcv/runner/hooks/sampler_seed.py deleted file mode 100644 index ee0dc6bdd8..0000000000 --- a/mmcv/runner/hooks/sampler_seed.py +++ /dev/null @@ -1,20 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class DistSamplerSeedHook(Hook): - """Set the epoch of the data-loading sampler in distributed training. - - In distributed training, it is only useful in conjunction with - :obj:`EpochBasedRunner`, while :obj:`IterBasedRunner` achieves the same - purpose with :obj:`IterLoader`. - """ - - def before_epoch(self, runner): - if hasattr(runner.data_loader.sampler, 'set_epoch'): - # in case the data loader uses `SequentialSampler` in PyTorch - runner.data_loader.sampler.set_epoch(runner.epoch) - elif hasattr(runner.data_loader.batch_sampler.sampler, 'set_epoch'): - # the batch sampler in PyTorch wraps the sampler as its attribute. - runner.data_loader.batch_sampler.sampler.set_epoch(runner.epoch) diff --git a/mmcv/runner/hooks/sync_buffer.py b/mmcv/runner/hooks/sync_buffer.py deleted file mode 100644 index 5f07ae656a..0000000000 --- a/mmcv/runner/hooks/sync_buffer.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from ..dist_utils import allreduce_params -from .hook import HOOKS, Hook - - -@HOOKS.register_module() -class SyncBuffersHook(Hook): - """Synchronize model buffers such as running_mean and running_var in BN at - the end of each epoch. - - Args: - distributed (bool): Whether distributed training is used. It is - effective only for distributed training. Defaults to True. - """ - - def __init__(self, distributed: bool = True): - self.distributed = distributed - - def after_epoch(self, runner): - """All-reduce model buffers at the end of each epoch.""" - if self.distributed: - allreduce_params(runner.model.buffers()) diff --git a/mmcv/runner/iter_based_runner.py b/mmcv/runner/iter_based_runner.py deleted file mode 100644 index 06b4b7d2a0..0000000000 --- a/mmcv/runner/iter_based_runner.py +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved.
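The DistSamplerSeedHook removed above matters because PyTorch's DistributedSampler only reshuffles when its epoch changes; if set_epoch is never called, every epoch replays the same permutation. A minimal, self-contained sketch of that behavior (plain PyTorch; num_replicas and rank are passed explicitly here so no process group is needed, and the variable names are illustrative only):

import torch
from torch.utils.data import DistributedSampler, TensorDataset

dataset = TensorDataset(torch.arange(8))
sampler = DistributedSampler(dataset, num_replicas=2, rank=0, shuffle=True)

sampler.set_epoch(0)
first = list(sampler)          # this rank's shuffled indices for epoch 0
sampler.set_epoch(0)
assert list(sampler) == first  # same epoch -> identical shuffle order
sampler.set_epoch(1)
print(list(sampler) != first)  # True in practice: a new epoch reseeds the shuffle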
-import os.path as osp -import platform -import shutil -import time -import warnings -from typing import Callable, Dict, List, Optional, Tuple, Union, no_type_check - -import torch -from torch.optim import Optimizer -from torch.utils.data import DataLoader - -import mmcv -from .base_runner import BaseRunner -from .builder import RUNNERS -from .checkpoint import save_checkpoint -from .hooks import IterTimerHook -from .utils import get_host_info - - -class IterLoader: - - def __init__(self, dataloader: DataLoader): - self._dataloader = dataloader - self.iter_loader = iter(self._dataloader) - self._epoch = 0 - - @property - def epoch(self) -> int: - return self._epoch - - def __next__(self): - try: - data = next(self.iter_loader) - except StopIteration: - self._epoch += 1 - if hasattr(self._dataloader.sampler, 'set_epoch'): - self._dataloader.sampler.set_epoch(self._epoch) - time.sleep(2) # Prevent possible deadlock during epoch transition - self.iter_loader = iter(self._dataloader) - data = next(self.iter_loader) - - return data - - def __len__(self): - return len(self._dataloader) - - -@RUNNERS.register_module() -class IterBasedRunner(BaseRunner): - """Iteration-based Runner. - - This runner trains models iteration by iteration. - """ - - def train(self, data_loader, **kwargs): - self.model.train() - self.mode = 'train' - self.data_loader = data_loader - self._epoch = data_loader.epoch - data_batch = next(data_loader) - self.data_batch = data_batch - self.call_hook('before_train_iter') - outputs = self.model.train_step(data_batch, self.optimizer, **kwargs) - if not isinstance(outputs, dict): - raise TypeError('model.train_step() must return a dict') - if 'log_vars' in outputs: - self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) - self.outputs = outputs - self.call_hook('after_train_iter') - del self.data_batch - self._inner_iter += 1 - self._iter += 1 - - @torch.no_grad() - def val(self, data_loader, **kwargs): - self.model.eval() - self.mode = 'val' - self.data_loader = data_loader - data_batch = next(data_loader) - self.data_batch = data_batch - self.call_hook('before_val_iter') - outputs = self.model.val_step(data_batch, **kwargs) - if not isinstance(outputs, dict): - raise TypeError('model.val_step() must return a dict') - if 'log_vars' in outputs: - self.log_buffer.update(outputs['log_vars'], outputs['num_samples']) - self.outputs = outputs - self.call_hook('after_val_iter') - del self.data_batch - self._inner_iter += 1 - - def run(self, - data_loaders: List[DataLoader], - workflow: List[Tuple[str, int]], - max_iters: Optional[int] = None, - **kwargs) -> None: - """Start running. - - Args: - data_loaders (list[:obj:`DataLoader`]): Dataloaders for training - and validation. - workflow (list[tuple]): A list of (phase, iters) to specify the - running order and iterations. E.g., [('train', 10000), - ('val', 1000)] means running 10000 iterations for training and - 1000 iterations for validation, iteratively.
- """ - assert isinstance(data_loaders, list) - assert mmcv.is_list_of(workflow, tuple) - assert len(data_loaders) == len(workflow) - if max_iters is not None: - warnings.warn( - 'setting max_iters in run is deprecated, ' - 'please set max_iters in runner_config', DeprecationWarning) - self._max_iters = max_iters - assert self._max_iters is not None, ( - 'max_iters must be specified during instantiation') - - work_dir = self.work_dir if self.work_dir is not None else 'NONE' - self.logger.info('Start running, host: %s, work_dir: %s', - get_host_info(), work_dir) - self.logger.info('Hooks will be executed in the following order:\n%s', - self.get_hook_info()) - self.logger.info('workflow: %s, max: %d iters', workflow, - self._max_iters) - self.call_hook('before_run') - - iter_loaders = [IterLoader(x) for x in data_loaders] - - self.call_hook('before_epoch') - - while self.iter < self._max_iters: - for i, flow in enumerate(workflow): - self._inner_iter = 0 - mode, iters = flow - if not isinstance(mode, str) or not hasattr(self, mode): - raise ValueError( - 'runner has no method named "{}" to run a workflow'. - format(mode)) - iter_runner = getattr(self, mode) - for _ in range(iters): - if mode == 'train' and self.iter >= self._max_iters: - break - iter_runner(iter_loaders[i], **kwargs) - - time.sleep(1) # wait for some hooks like loggers to finish - self.call_hook('after_epoch') - self.call_hook('after_run') - - @no_type_check - def resume(self, - checkpoint: str, - resume_optimizer: bool = True, - map_location: Union[str, Callable] = 'default') -> None: - """Resume model from checkpoint. - - Args: - checkpoint (str): Checkpoint to resume from. - resume_optimizer (bool, optional): Whether resume the optimizer(s) - if the checkpoint file includes optimizer(s). Default to True. - map_location (str, optional): Same as :func:`torch.load`. - Default to 'default'. - """ - if map_location == 'default': - device_id = torch.cuda.current_device() - checkpoint = self.load_checkpoint( - checkpoint, - map_location=lambda storage, loc: storage.cuda(device_id)) - else: - checkpoint = self.load_checkpoint( - checkpoint, map_location=map_location) - - self._epoch = checkpoint['meta']['epoch'] - self._iter = checkpoint['meta']['iter'] - self._inner_iter = checkpoint['meta']['iter'] - if 'optimizer' in checkpoint and resume_optimizer: - if isinstance(self.optimizer, Optimizer): - self.optimizer.load_state_dict(checkpoint['optimizer']) - elif isinstance(self.optimizer, dict): - for k in self.optimizer.keys(): - self.optimizer[k].load_state_dict( - checkpoint['optimizer'][k]) - else: - raise TypeError( - 'Optimizer should be dict or torch.optim.Optimizer ' - f'but got {type(self.optimizer)}') - - self.logger.info(f'resumed from epoch: {self.epoch}, iter {self.iter}') - - def save_checkpoint( # type: ignore - self, - out_dir: str, - filename_tmpl: str = 'iter_{}.pth', - meta: Optional[Dict] = None, - save_optimizer: bool = True, - create_symlink: bool = True) -> None: - """Save checkpoint to file. - - Args: - out_dir (str): Directory to save checkpoint files. - filename_tmpl (str, optional): Checkpoint file template. - Defaults to 'iter_{}.pth'. - meta (dict, optional): Metadata to be saved in checkpoint. - Defaults to None. - save_optimizer (bool, optional): Whether save optimizer. - Defaults to True. - create_symlink (bool, optional): Whether create symlink to the - latest checkpoint file. Defaults to True. 
- """ - if meta is None: - meta = {} - elif not isinstance(meta, dict): - raise TypeError( - f'meta should be a dict or None, but got {type(meta)}') - if self.meta is not None: - meta.update(self.meta) - # Note: meta.update(self.meta) should be done before - # meta.update(epoch=self.epoch + 1, iter=self.iter) otherwise - # there will be problems with resumed checkpoints. - # More details in https://github.com/open-mmlab/mmcv/pull/1108 - meta.update(epoch=self.epoch + 1, iter=self.iter) - - filename = filename_tmpl.format(self.iter + 1) - filepath = osp.join(out_dir, filename) - optimizer = self.optimizer if save_optimizer else None - save_checkpoint(self.model, filepath, optimizer=optimizer, meta=meta) - # in some environments, `os.symlink` is not supported, you may need to - # set `create_symlink` to False - if create_symlink: - dst_file = osp.join(out_dir, 'latest.pth') - if platform.system() != 'Windows': - mmcv.symlink(filename, dst_file) - else: - shutil.copy(filepath, dst_file) - - def register_training_hooks(self, - lr_config, - optimizer_config=None, - checkpoint_config=None, - log_config=None, - momentum_config=None, - custom_hooks_config=None): - """Register default hooks for iter-based training. - - Checkpoint hook, optimizer stepper hook and logger hooks will be set to - `by_epoch=False` by default. - - Default hooks include: - - +----------------------+-------------------------+ - | Hooks | Priority | - +======================+=========================+ - | LrUpdaterHook | VERY_HIGH (10) | - +----------------------+-------------------------+ - | MomentumUpdaterHook | HIGH (30) | - +----------------------+-------------------------+ - | OptimizerStepperHook | ABOVE_NORMAL (40) | - +----------------------+-------------------------+ - | CheckpointSaverHook | NORMAL (50) | - +----------------------+-------------------------+ - | IterTimerHook | LOW (70) | - +----------------------+-------------------------+ - | LoggerHook(s) | VERY_LOW (90) | - +----------------------+-------------------------+ - | CustomHook(s) | defaults to NORMAL (50) | - +----------------------+-------------------------+ - - If custom hooks have same priority with default hooks, custom hooks - will be triggered after default hooks. - """ - if checkpoint_config is not None: - checkpoint_config.setdefault('by_epoch', False) # type: ignore - if lr_config is not None: - lr_config.setdefault('by_epoch', False) # type: ignore - if log_config is not None: - for info in log_config['hooks']: - info.setdefault('by_epoch', False) - super().register_training_hooks( - lr_config=lr_config, - momentum_config=momentum_config, - optimizer_config=optimizer_config, - checkpoint_config=checkpoint_config, - log_config=log_config, - timer_config=IterTimerHook(), - custom_hooks_config=custom_hooks_config) diff --git a/mmcv/runner/log_buffer.py b/mmcv/runner/log_buffer.py deleted file mode 100644 index 3c9f379637..0000000000 --- a/mmcv/runner/log_buffer.py +++ /dev/null @@ -1,41 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
-from collections import OrderedDict - -import numpy as np - - -class LogBuffer: - - def __init__(self): - self.val_history = OrderedDict() - self.n_history = OrderedDict() - self.output = OrderedDict() - self.ready = False - - def clear(self) -> None: - self.val_history.clear() - self.n_history.clear() - self.clear_output() - - def clear_output(self) -> None: - self.output.clear() - self.ready = False - - def update(self, vars: dict, count: int = 1) -> None: - assert isinstance(vars, dict) - for key, var in vars.items(): - if key not in self.val_history: - self.val_history[key] = [] - self.n_history[key] = [] - self.val_history[key].append(var) - self.n_history[key].append(count) - - def average(self, n: int = 0) -> None: - """Average latest n values or all values.""" - assert n >= 0 - for key in self.val_history: - values = np.array(self.val_history[key][-n:]) - nums = np.array(self.n_history[key][-n:]) - avg = np.sum(values * nums) / np.sum(nums) - self.output[key] = avg - self.ready = True diff --git a/mmcv/runner/optimizer/__init__.py b/mmcv/runner/optimizer/__init__.py deleted file mode 100644 index 53c34d0470..0000000000 --- a/mmcv/runner/optimizer/__init__.py +++ /dev/null @@ -1,9 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from .builder import (OPTIMIZER_BUILDERS, OPTIMIZERS, build_optimizer, - build_optimizer_constructor) -from .default_constructor import DefaultOptimizerConstructor - -__all__ = [ - 'OPTIMIZER_BUILDERS', 'OPTIMIZERS', 'DefaultOptimizerConstructor', - 'build_optimizer', 'build_optimizer_constructor' -] diff --git a/mmcv/runner/optimizer/builder.py b/mmcv/runner/optimizer/builder.py deleted file mode 100644 index 49d8f05a2c..0000000000 --- a/mmcv/runner/optimizer/builder.py +++ /dev/null @@ -1,45 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import copy -import inspect -from typing import Dict, List - -import torch - -from ...utils import Registry, build_from_cfg - -OPTIMIZERS = Registry('optimizer') -OPTIMIZER_BUILDERS = Registry('optimizer builder') - - -def register_torch_optimizers() -> List: - torch_optimizers = [] - for module_name in dir(torch.optim): - if module_name.startswith('__'): - continue - _optim = getattr(torch.optim, module_name) - if inspect.isclass(_optim) and issubclass(_optim, - torch.optim.Optimizer): - OPTIMIZERS.register_module()(_optim) - torch_optimizers.append(module_name) - return torch_optimizers - - -TORCH_OPTIMIZERS = register_torch_optimizers() - - -def build_optimizer_constructor(cfg: Dict): - return build_from_cfg(cfg, OPTIMIZER_BUILDERS) - - -def build_optimizer(model, cfg: Dict): - optimizer_cfg = copy.deepcopy(cfg) - constructor_type = optimizer_cfg.pop('constructor', - 'DefaultOptimizerConstructor') - paramwise_cfg = optimizer_cfg.pop('paramwise_cfg', None) - optim_constructor = build_optimizer_constructor( - dict( - type=constructor_type, - optimizer_cfg=optimizer_cfg, - paramwise_cfg=paramwise_cfg)) - optimizer = optim_constructor(model) - return optimizer diff --git a/mmcv/runner/optimizer/default_constructor.py b/mmcv/runner/optimizer/default_constructor.py deleted file mode 100644 index c82b56e52f..0000000000 --- a/mmcv/runner/optimizer/default_constructor.py +++ /dev/null @@ -1,258 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
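Since register_torch_optimizers above mirrors every torch.optim optimizer class into the OPTIMIZERS registry, a config's type string resolves directly to e.g. torch.optim.SGD. A minimal usage sketch of the removed build_optimizer API (assuming the function is importable; the cfg keys follow the code above):

import torch.nn as nn

model = nn.Conv2d(3, 8, 3)
cfg = dict(type='SGD', lr=0.01, momentum=0.9, weight_decay=1e-4)
# 'constructor' defaults to DefaultOptimizerConstructor and 'paramwise_cfg'
# is popped from the config and forwarded to it (both omitted here).
optimizer = build_optimizer(model, cfg)
assert optimizer.defaults['lr'] == 0.01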
-import warnings -from typing import Dict, List, Optional, Union - -import torch -import torch.nn as nn -from torch.nn import GroupNorm, LayerNorm - -from mmcv.utils import _BatchNorm, _InstanceNorm, build_from_cfg, is_list_of -from mmcv.utils.ext_loader import check_ops_exist -from .builder import OPTIMIZER_BUILDERS, OPTIMIZERS - - -@OPTIMIZER_BUILDERS.register_module() -class DefaultOptimizerConstructor: - """Default constructor for optimizers. - - By default, each parameter shares the same optimizer settings, and we - provide an argument ``paramwise_cfg`` to specify parameter-wise settings. - It is a dict and may contain the following fields: - - - ``custom_keys`` (dict): Specify parameter-wise settings by keys. If - one of the keys in ``custom_keys`` is a substring of the name of one - parameter, then the setting of the parameter will be specified by - ``custom_keys[key]`` and other settings like ``bias_lr_mult`` etc. will - be ignored. It should be noted that the aforementioned ``key`` is the - longest key that is a substring of the name of the parameter. If there - are multiple matched keys with the same length, then the key with the - lower alphabetical order will be chosen. - ``custom_keys[key]`` should be a dict and may contain fields ``lr_mult`` - and ``decay_mult``. See Example 2 below. - - ``bias_lr_mult`` (float): The learning rate for all bias parameters - (except for those in normalization layers and offset layers of DCN) - will be multiplied by this value. - - ``bias_decay_mult`` (float): The weight decay for all bias parameters - (except for those in normalization layers, depthwise conv layers, and - offset layers of DCN) will be multiplied by this value. - - ``norm_decay_mult`` (float): The weight decay for all weight and bias - parameters of normalization layers will be multiplied by this value. - - ``dwconv_decay_mult`` (float): The weight decay for all weight and - bias parameters of depthwise conv layers will be multiplied by this - value. - - ``dcn_offset_lr_mult`` (float): The learning rate for the parameters - of the offset layers in a model's deformable convs will be multiplied - by this value. - - ``bypass_duplicate`` (bool): If true, duplicate parameters will not - be added to the optimizer. Default: False. - - Note: - - 1. If the option ``dcn_offset_lr_mult`` is used, the constructor will - override the effect of ``bias_lr_mult`` in the bias of the offset - layer. So be careful when using both ``bias_lr_mult`` and - ``dcn_offset_lr_mult``. If you wish to apply both of them to the offset - layer in deformable convs, set ``dcn_offset_lr_mult`` to the original - ``dcn_offset_lr_mult`` * ``bias_lr_mult``. - - 2. If the option ``dcn_offset_lr_mult`` is used, the constructor will - apply it to all the DCN layers in the model. So be careful when the - model contains multiple DCN layers in places other than the backbone. - - Args: - model (:obj:`nn.Module`): The model with parameters to be optimized. - optimizer_cfg (dict): The config dict of the optimizer. - Positional fields are - - - `type`: class name of the optimizer. - - Optional fields are - - - any arguments of the corresponding optimizer type, e.g., - lr, weight_decay, momentum, etc. - paramwise_cfg (dict, optional): Parameter-wise options. - - Example 1: - >>> model = torch.nn.modules.Conv1d(1, 1, 1) - >>> optimizer_cfg = dict(type='SGD', lr=0.01, momentum=0.9, - >>> weight_decay=0.0001) - >>> paramwise_cfg = dict(norm_decay_mult=0.)
- >>> optim_builder = DefaultOptimizerConstructor( - >>> optimizer_cfg, paramwise_cfg) - >>> optimizer = optim_builder(model) - - Example 2: - >>> # assume model have attribute model.backbone and model.cls_head - >>> optimizer_cfg = dict(type='SGD', lr=0.01, weight_decay=0.95) - >>> paramwise_cfg = dict(custom_keys={ - 'backbone': dict(lr_mult=0.1, decay_mult=0.9)}) - >>> optim_builder = DefaultOptimizerConstructor( - >>> optimizer_cfg, paramwise_cfg) - >>> optimizer = optim_builder(model) - >>> # Then the `lr` and `weight_decay` for model.backbone is - >>> # (0.01 * 0.1, 0.95 * 0.9). `lr` and `weight_decay` for - >>> # model.cls_head is (0.01, 0.95). - """ - - def __init__(self, - optimizer_cfg: Dict, - paramwise_cfg: Optional[Dict] = None): - if not isinstance(optimizer_cfg, dict): - raise TypeError('optimizer_cfg should be a dict', - f'but got {type(optimizer_cfg)}') - self.optimizer_cfg = optimizer_cfg - self.paramwise_cfg = {} if paramwise_cfg is None else paramwise_cfg - self.base_lr = optimizer_cfg.get('lr', None) - self.base_wd = optimizer_cfg.get('weight_decay', None) - self._validate_cfg() - - def _validate_cfg(self) -> None: - if not isinstance(self.paramwise_cfg, dict): - raise TypeError('paramwise_cfg should be None or a dict, ' - f'but got {type(self.paramwise_cfg)}') - - if 'custom_keys' in self.paramwise_cfg: - if not isinstance(self.paramwise_cfg['custom_keys'], dict): - raise TypeError( - 'If specified, custom_keys must be a dict, ' - f'but got {type(self.paramwise_cfg["custom_keys"])}') - if self.base_wd is None: - for key in self.paramwise_cfg['custom_keys']: - if 'decay_mult' in self.paramwise_cfg['custom_keys'][key]: - raise ValueError('base_wd should not be None') - - # get base lr and weight decay - # weight_decay must be explicitly specified if mult is specified - if ('bias_decay_mult' in self.paramwise_cfg - or 'norm_decay_mult' in self.paramwise_cfg - or 'dwconv_decay_mult' in self.paramwise_cfg): - if self.base_wd is None: - raise ValueError('base_wd should not be None') - - def _is_in(self, param_group: Dict, param_group_list: List) -> bool: - assert is_list_of(param_group_list, dict) - param = set(param_group['params']) - param_set = set() - for group in param_group_list: - param_set.update(set(group['params'])) - - return not param.isdisjoint(param_set) - - def add_params(self, - params: List[Dict], - module: nn.Module, - prefix: str = '', - is_dcn_module: Union[int, float, None] = None) -> None: - """Add all parameters of module to the params list. - - The parameters of the given module will be added to the list of param - groups, with specific rules defined by paramwise_cfg. - - Args: - params (list[dict]): A list of param groups, it will be modified - in place. - module (nn.Module): The module to be added. - prefix (str): The prefix of the module - is_dcn_module (int|float|None): If the current module is a - submodule of DCN, `is_dcn_module` will be passed to - control conv_offset layer's learning rate. Defaults to None. - """ - # get param-wise options - custom_keys = self.paramwise_cfg.get('custom_keys', {}) - # first sort with alphabet order and then sort with reversed len of str - sorted_keys = sorted(sorted(custom_keys.keys()), key=len, reverse=True) - - bias_lr_mult = self.paramwise_cfg.get('bias_lr_mult', 1.) - bias_decay_mult = self.paramwise_cfg.get('bias_decay_mult', 1.) - norm_decay_mult = self.paramwise_cfg.get('norm_decay_mult', 1.) - dwconv_decay_mult = self.paramwise_cfg.get('dwconv_decay_mult', 1.) 
- bypass_duplicate = self.paramwise_cfg.get('bypass_duplicate', False) - dcn_offset_lr_mult = self.paramwise_cfg.get('dcn_offset_lr_mult', 1.) - - # special rules for norm layers and depth-wise conv layers - is_norm = isinstance(module, - (_BatchNorm, _InstanceNorm, GroupNorm, LayerNorm)) - is_dwconv = ( - isinstance(module, torch.nn.Conv2d) - and module.in_channels == module.groups) - - for name, param in module.named_parameters(recurse=False): - param_group = {'params': [param]} - if not param.requires_grad: - params.append(param_group) - continue - if bypass_duplicate and self._is_in(param_group, params): - warnings.warn(f'{prefix} is duplicate. It is skipped since ' - f'bypass_duplicate={bypass_duplicate}') - continue - # if the parameter match one of the custom keys, ignore other rules - is_custom = False - for key in sorted_keys: - if key in f'{prefix}.{name}': - is_custom = True - lr_mult = custom_keys[key].get('lr_mult', 1.) - param_group['lr'] = self.base_lr * lr_mult - if self.base_wd is not None: - decay_mult = custom_keys[key].get('decay_mult', 1.) - param_group['weight_decay'] = self.base_wd * decay_mult - break - - if not is_custom: - # bias_lr_mult affects all bias parameters - # except for norm.bias dcn.conv_offset.bias - if name == 'bias' and not (is_norm or is_dcn_module): - param_group['lr'] = self.base_lr * bias_lr_mult - - if (prefix.find('conv_offset') != -1 and is_dcn_module - and isinstance(module, torch.nn.Conv2d)): - # deal with both dcn_offset's bias & weight - param_group['lr'] = self.base_lr * dcn_offset_lr_mult - - # apply weight decay policies - if self.base_wd is not None: - # norm decay - if is_norm: - param_group[ - 'weight_decay'] = self.base_wd * norm_decay_mult - # depth-wise conv - elif is_dwconv: - param_group[ - 'weight_decay'] = self.base_wd * dwconv_decay_mult - # bias lr and decay - elif name == 'bias' and not is_dcn_module: - # TODO: current bias_decay_mult will have affect on DCN - param_group[ - 'weight_decay'] = self.base_wd * bias_decay_mult - params.append(param_group) - - if check_ops_exist(): - from mmcv.ops import DeformConv2d, ModulatedDeformConv2d - is_dcn_module = isinstance(module, - (DeformConv2d, ModulatedDeformConv2d)) - else: - is_dcn_module = False - for child_name, child_mod in module.named_children(): - child_prefix = f'{prefix}.{child_name}' if prefix else child_name - self.add_params( - params, - child_mod, - prefix=child_prefix, - is_dcn_module=is_dcn_module) - - def __call__(self, model: nn.Module): - if hasattr(model, 'module'): - model = model.module - - optimizer_cfg = self.optimizer_cfg.copy() - # if no paramwise option is specified, just use the global setting - if not self.paramwise_cfg: - optimizer_cfg['params'] = model.parameters() - return build_from_cfg(optimizer_cfg, OPTIMIZERS) - - # set param-wise lr and weight decay recursively - params: List[Dict] = [] - self.add_params(params, model) - optimizer_cfg['params'] = params - - return build_from_cfg(optimizer_cfg, OPTIMIZERS) diff --git a/mmcv/runner/priority.py b/mmcv/runner/priority.py deleted file mode 100644 index ff644043b8..0000000000 --- a/mmcv/runner/priority.py +++ /dev/null @@ -1,61 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from enum import Enum -from typing import Union - - -class Priority(Enum): - """Hook priority levels. 
- - +--------------+------------+ - | Level | Value | - +==============+============+ - | HIGHEST | 0 | - +--------------+------------+ - | VERY_HIGH | 10 | - +--------------+------------+ - | HIGH | 30 | - +--------------+------------+ - | ABOVE_NORMAL | 40 | - +--------------+------------+ - | NORMAL | 50 | - +--------------+------------+ - | BELOW_NORMAL | 60 | - +--------------+------------+ - | LOW | 70 | - +--------------+------------+ - | VERY_LOW | 90 | - +--------------+------------+ - | LOWEST | 100 | - +--------------+------------+ - """ - - HIGHEST = 0 - VERY_HIGH = 10 - HIGH = 30 - ABOVE_NORMAL = 40 - NORMAL = 50 - BELOW_NORMAL = 60 - LOW = 70 - VERY_LOW = 90 - LOWEST = 100 - - -def get_priority(priority: Union[int, str, Priority]) -> int: - """Get priority value. - - Args: - priority (int or str or :obj:`Priority`): Priority. - - Returns: - int: The priority value. - """ - if isinstance(priority, int): - if priority < 0 or priority > 100: - raise ValueError('priority must be between 0 and 100') - return priority - elif isinstance(priority, Priority): - return priority.value - elif isinstance(priority, str): - return Priority[priority.upper()].value - else: - raise TypeError('priority must be an integer or Priority enum value') diff --git a/mmcv/runner/utils.py b/mmcv/runner/utils.py deleted file mode 100644 index 8cdc6faddb..0000000000 --- a/mmcv/runner/utils.py +++ /dev/null @@ -1,99 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import os -import random -import sys -import time -import warnings -from getpass import getuser -from socket import gethostname -from types import ModuleType -from typing import Optional - -import numpy as np -import torch - -import mmcv - - -def get_host_info() -> str: - """Get hostname and username. - - Return empty string if exception raised, e.g. ``getpass.getuser()`` will - lead to error in docker container - """ - host = '' - try: - host = f'{getuser()}@{gethostname()}' - except Exception as e: - warnings.warn(f'Host or user not found: {str(e)}') - finally: - return host - - -def get_time_str() -> str: - return time.strftime('%Y%m%d_%H%M%S', time.localtime()) - - -def obj_from_dict(info: dict, - parent: Optional[ModuleType] = None, - default_args: Optional[dict] = None): - """Initialize an object from dict. - - The dict must contain the key "type", which indicates the object type, it - can be either a string or type, such as "list" or ``list``. Remaining - fields are treated as the arguments for constructing the object. - - Args: - info (dict): Object types and arguments. - parent (:class:`module`): Module which may containing expected object - classes. - default_args (dict, optional): Default arguments for initializing the - object. - - Returns: - any type: Object built from the dict. - """ - assert isinstance(info, dict) and 'type' in info - assert isinstance(default_args, dict) or default_args is None - args = info.copy() - obj_type = args.pop('type') - if mmcv.is_str(obj_type): - if parent is not None: - obj_type = getattr(parent, obj_type) - else: - obj_type = sys.modules[obj_type] - elif not isinstance(obj_type, type): - raise TypeError('type must be a str or valid type, but ' - f'got {type(obj_type)}') - if default_args is not None: - for name, value in default_args.items(): - args.setdefault(name, value) - return obj_type(**args) - - -def set_random_seed(seed: int, - deterministic: bool = False, - use_rank_shift: bool = False) -> None: - """Set random seed. - - Args: - seed (int): Seed to be used. 
- deterministic (bool): Whether to set the deterministic option for - CUDNN backend, i.e., set `torch.backends.cudnn.deterministic` - to True and `torch.backends.cudnn.benchmark` to False. - Default: False. - use_rank_shift (bool): Whether to add the rank number to the random - seed so that different ranks use different random seeds. - Default: False. - """ - if use_rank_shift: - rank, _ = mmcv.runner.get_dist_info() - seed += rank - random.seed(seed) - np.random.seed(seed) - torch.manual_seed(seed) - torch.cuda.manual_seed(seed) - torch.cuda.manual_seed_all(seed) - os.environ['PYTHONHASHSEED'] = str(seed) - if deterministic: - torch.backends.cudnn.deterministic = True - torch.backends.cudnn.benchmark = False diff --git a/mmcv/utils/__init__.py b/mmcv/utils/__init__.py index 8bb5a8173d..cf7f0d60bb 100644 --- a/mmcv/utils/__init__.py +++ b/mmcv/utils/__init__.py @@ -39,7 +39,6 @@ from .device_type import (IS_IPU_AVAILABLE, IS_MLU_AVAILABLE, IS_MPS_AVAILABLE) from .env import collect_env - from .hub import load_url from .logging import get_logger, print_log from .parrots_jit import jit, skip_no_elena # yapf: disable @@ -75,7 +74,7 @@ 'assert_dict_has_keys', 'assert_keys_equal', 'assert_is_norm_layer', 'assert_params_all_zeros', 'check_python_script', 'is_method_overridden', 'is_jit_tracing', 'is_rocm_pytorch', - '_get_cuda_home', 'load_url', 'has_method', 'IS_CUDA_AVAILABLE', + '_get_cuda_home', 'has_method', 'IS_CUDA_AVAILABLE', 'worker_init_fn', 'IS_MLU_AVAILABLE', 'IS_IPU_AVAILABLE', 'IS_MPS_AVAILABLE', 'torch_meshgrid' ] diff --git a/mmcv/utils/hub.py b/mmcv/utils/hub.py deleted file mode 100644 index a9cbbc95ba..0000000000 --- a/mmcv/utils/hub.py +++ /dev/null @@ -1,131 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -# The 1.6 release of PyTorch switched torch.save to use a new zipfile-based -# file format. It will cause RuntimeError when a checkpoint was saved in -# torch >= 1.6.0 but loaded in torch < 1.7.0. -# More details at https://github.com/open-mmlab/mmpose/issues/904 -from .parrots_wrapper import TORCH_VERSION -from .path import mkdir_or_exist -from .version_utils import digit_version - -if TORCH_VERSION != 'parrots' and digit_version(TORCH_VERSION) < digit_version( - '1.7.0'): - # Modified from https://github.com/pytorch/pytorch/blob/master/torch/hub.py - import os - import sys - import warnings - import zipfile - from urllib.parse import urlparse - - import torch - from torch.hub import HASH_REGEX, _get_torch_home, download_url_to_file - - # Hub used to automatically extract zipfiles that users compressed - # manually. The legacy zip format expects only one file from - # torch.save() < 1.6 in the zip. We should remove this support since - # zipfile is now the default format for torch.save(). - def _is_legacy_zip_format(filename): - if zipfile.is_zipfile(filename): - infolist = zipfile.ZipFile(filename).infolist() - return len(infolist) == 1 and not infolist[0].is_dir() - return False - - def _legacy_zip_load(filename, model_dir, map_location): - warnings.warn( - 'Falling back to the old format < 1.6. This support will' - ' be deprecated in favor of default zipfile format ' - 'introduced in 1.6. Please redo torch.save() to save it ' - 'in the new zipfile format.', DeprecationWarning) - # Note: extractall() defaults to overwrite file if exists. No need to - # clean up beforehand. We deliberately don't handle tarfile here - # since our legacy serialization format was in tar. - # E.g. resnet18-5c106cde.pth which is widely used. - with zipfile.ZipFile(filename) as f: - members = f.infolist() - if len(members) != 1: - raise RuntimeError( - 'Only one file (not dir) is allowed in the zipfile') - f.extractall(model_dir) - extracted_name = members[0].filename - extracted_file = os.path.join(model_dir, extracted_name) - return torch.load(extracted_file, map_location=map_location) - - def load_url(url, - model_dir=None, - map_location=None, - progress=True, - check_hash=False, - file_name=None): - r"""Loads the Torch serialized object at the given URL. - - If the downloaded file is a zip file, it will be automatically - decompressed. - - If the object is already present in `model_dir`, it's deserialized and - returned. - The default value of ``model_dir`` is ``<hub_dir>/checkpoints`` where - ``hub_dir`` is the directory returned by :func:`~torch.hub.get_dir`. - - Args: - url (str): URL of the object to download - model_dir (str, optional): directory in which to save the object - map_location (optional): a function or a dict specifying how to - remap storage locations (see torch.load) - progress (bool, optional): whether or not to display a progress bar - to stderr. Default: True - check_hash(bool, optional): If True, the filename part of the URL - should follow the naming convention ``filename-<sha256>.ext`` - where ``<sha256>`` is the first eight or more digits of the - SHA256 hash of the contents of the file. The hash is used to - ensure unique names and to verify the contents of the file. - Default: False - file_name (str, optional): name for the downloaded file. Filename - from ``url`` will be used if not set. Default: None. - - Example: - >>> url = ('https://s3.amazonaws.com/pytorch/models/resnet18-5c106' - ... 'cde.pth') - >>> state_dict = torch.hub.load_state_dict_from_url(url) - """ - # Issue warning to move data if old env is set - if os.getenv('TORCH_MODEL_ZOO'): - warnings.warn( - 'TORCH_MODEL_ZOO is deprecated, please use env ' - 'TORCH_HOME instead', DeprecationWarning) - - if model_dir is None: - torch_home = _get_torch_home() - model_dir = os.path.join(torch_home, 'checkpoints') - - mkdir_or_exist(model_dir) - - parts = urlparse(url) - filename = os.path.basename(parts.path) - if file_name is not None: - filename = file_name - cached_file = os.path.join(model_dir, filename) - if not os.path.exists(cached_file): - sys.stderr.write('Downloading: "{}" to {}\n'.format( - url, cached_file)) - hash_prefix = None - if check_hash: - r = HASH_REGEX.search(filename) # r is Optional[Match[str]] - hash_prefix = r.group(1) if r else None - download_url_to_file( - url, cached_file, hash_prefix, progress=progress) - - if _is_legacy_zip_format(cached_file): - return _legacy_zip_load(cached_file, model_dir, map_location) - - try: - return torch.load(cached_file, map_location=map_location) - except RuntimeError as error: - if digit_version(TORCH_VERSION) < digit_version('1.5.0'): - warnings.warn( - f'If the error is the same as "{cached_file} is a zip ' - 'archive (did you mean to use torch.jit.load()?)", you can' - ' upgrade your torch to 1.5.0 or higher (current torch ' - f'version is {TORCH_VERSION}).
The error was raised ' - ' because the checkpoint was saved in torch>=1.6.0 but ' - 'loaded in torch<1.5.') - raise error -else: - from torch.utils.model_zoo import load_url # type: ignore # noqa: F401 diff --git a/tests/data/model_zoo/deprecated.json b/tests/data/model_zoo/deprecated.json deleted file mode 100644 index 7c2d3e4584..0000000000 --- a/tests/data/model_zoo/deprecated.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "train_old": "train", - "test_old": "test" -} \ No newline at end of file diff --git a/tests/data/model_zoo/mmcv_home/open_mmlab.json b/tests/data/model_zoo/mmcv_home/open_mmlab.json deleted file mode 100644 index c9ee238373..0000000000 --- a/tests/data/model_zoo/mmcv_home/open_mmlab.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "test": "test.pth", - "val": "val.pth", - "train_empty": "train.pth" -} \ No newline at end of file diff --git a/tests/data/model_zoo/mmcv_home/test.pth b/tests/data/model_zoo/mmcv_home/test.pth deleted file mode 100644 index 2060ac6024475731200015104b40995cdbb8b10b..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 341 zcmZusJxc>Y5WTN>x}w&b=jBS#EE3=K>0WG=lmY`~m(o ze}Z7)#8^7Tz`S|$-kVkQ#?1Ho3+Sb2#7jsrJ|NJ~twR`85szuoEy7ln@MNuF+Zm)) z()Z*k(K|hQLq?f0o~{LnST6&%ZkR0hc{sTmS$7 diff --git a/tests/data/model_zoo/mmcv_home/val.pth b/tests/data/model_zoo/mmcv_home/val.pth deleted file mode 100644 index 57faf4d3fc748ddbf55f73a5b12e2e79d7175eb6..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 341 zcmZusJxc>Y5WTN>x}w^>@cLPKh&PZ7`G7#bv<_iVMm(lTHwZgr!jr9rZ5EK0 zN!yaAMC;@-6D?}vnQ-c%)utAx8gHFgj~Qhe@^mXm%xa+^>`&O#*b47@kU_)`i8IN^ zc~MM8MSgZZDj@#zma7($GQ6TRw`EX= digit_version('0.13.0a0'): - # Test load new format torchvision models. - assert ( - _load_checkpoint('torchvision://resnet50.imagenet1k_v1') == - 'url:https://download.pytorch.org/models/resnet50-0676ba61.pth') - - assert ( - _load_checkpoint('torchvision://ResNet50_Weights.IMAGENET1K_V1') == - 'url:https://download.pytorch.org/models/resnet50-0676ba61.pth') - - _load_checkpoint('torchvision://resnet50.default') - - # test open-mmlab:// with default MMCV_HOME - os.environ.pop(ENV_MMCV_HOME, None) - os.environ.pop(ENV_XDG_CACHE_HOME, None) - url = _load_checkpoint('open-mmlab://train') - assert url == 'url:https://localhost/train.pth' - - # test open-mmlab:// with deprecated model name - os.environ.pop(ENV_MMCV_HOME, None) - os.environ.pop(ENV_XDG_CACHE_HOME, None) - with pytest.warns( - Warning, - match='open-mmlab://train_old is deprecated in favor of ' - 'open-mmlab://train'): - url = _load_checkpoint('open-mmlab://train_old') - assert url == 'url:https://localhost/train.pth' - - # test openmmlab:// with deprecated model name - os.environ.pop(ENV_MMCV_HOME, None) - os.environ.pop(ENV_XDG_CACHE_HOME, None) - with pytest.warns( - Warning, - match='openmmlab://train_old is deprecated in favor of ' - 'openmmlab://train'): - url = _load_checkpoint('openmmlab://train_old') - assert url == 'url:https://localhost/train.pth' - - # test open-mmlab:// with user-defined MMCV_HOME - os.environ.pop(ENV_MMCV_HOME, None) - mmcv_home = osp.join(osp.dirname(__file__), 'data/model_zoo/mmcv_home') - os.environ[ENV_MMCV_HOME] = mmcv_home - url = _load_checkpoint('open-mmlab://train') - assert url == 'url:https://localhost/train.pth' - with pytest.raises(FileNotFoundError, match='train.pth can not be found.'): - _load_checkpoint('open-mmlab://train_empty') - url = _load_checkpoint('open-mmlab://test') - assert url == f'local:{osp.join(_get_mmcv_home(), 
"test.pth")}' - url = _load_checkpoint('open-mmlab://val') - assert url == f'local:{osp.join(_get_mmcv_home(), "val.pth")}' - - # test http:// https:// - url = _load_checkpoint('http://localhost/train.pth') - assert url == 'url:http://localhost/train.pth' - - # test local file - with pytest.raises(FileNotFoundError, match='train.pth can not be found.'): - _load_checkpoint('train.pth') - url = _load_checkpoint(osp.join(_get_mmcv_home(), 'test.pth')) - assert url == f'local:{osp.join(_get_mmcv_home(), "test.pth")}' diff --git a/tests/test_parallel.py b/tests/test_parallel.py deleted file mode 100644 index 814aaeadfb..0000000000 --- a/tests/test_parallel.py +++ /dev/null @@ -1,188 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -from unittest.mock import MagicMock, patch - -import pytest -import torch -import torch.nn as nn -from torch.nn.parallel import DataParallel, DistributedDataParallel - -from mmcv.parallel import (MODULE_WRAPPERS, MMDataParallel, - MMDistributedDataParallel, is_module_wrapper) -from mmcv.parallel._functions import Scatter, get_input_device, scatter -from mmcv.parallel.distributed_deprecated import \ - MMDistributedDataParallel as DeprecatedMMDDP -from mmcv.utils import Registry - - -def mock(*args, **kwargs): - pass - - -@pytest.mark.skipif( - torch.__version__ == 'parrots', reason='not supported in parrots now') -@patch('torch.distributed._broadcast_coalesced', mock) -@patch('torch.distributed.broadcast', mock) -@patch('torch.nn.parallel.DistributedDataParallel._ddp_init_helper', mock) -def test_is_module_wrapper(): - - class Model(nn.Module): - - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(2, 2, 1) - - def forward(self, x): - return self.conv(x) - - # _verify_model_across_ranks is added in torch1.9.0, - # _verify_params_across_processes is added in torch1.11.0, - # so we should check whether _verify_model_across_ranks - # and _verify_params_across_processes are the member of - # torch.distributed before mocking - if hasattr(torch.distributed, '_verify_model_across_ranks'): - torch.distributed._verify_model_across_ranks = mock - if hasattr(torch.distributed, '_verify_params_across_processes'): - torch.distributed._verify_params_across_processes = mock - - model = Model() - assert not is_module_wrapper(model) - - dp = DataParallel(model) - assert is_module_wrapper(dp) - - mmdp = MMDataParallel(model) - assert is_module_wrapper(mmdp) - - ddp = DistributedDataParallel(model, process_group=MagicMock()) - assert is_module_wrapper(ddp) - - mmddp = MMDistributedDataParallel(model, process_group=MagicMock()) - assert is_module_wrapper(mmddp) - - deprecated_mmddp = DeprecatedMMDDP(model) - assert is_module_wrapper(deprecated_mmddp) - - # test module wrapper registry - @MODULE_WRAPPERS.register_module() - class ModuleWrapper: - - def __init__(self, module): - self.module = module - - def forward(self, *args, **kwargs): - return self.module(*args, **kwargs) - - module_wraper = ModuleWrapper(model) - assert is_module_wrapper(module_wraper) - - # test module wrapper registry in downstream repo - MMRAZOR_MODULE_WRAPPERS = Registry( - 'mmrazor module wrapper', parent=MODULE_WRAPPERS, scope='mmrazor') - MMPOSE_MODULE_WRAPPERS = Registry( - 'mmpose module wrapper', parent=MODULE_WRAPPERS, scope='mmpose') - - @MMRAZOR_MODULE_WRAPPERS.register_module() - class ModuleWrapperInRazor: - - def __init__(self, module): - self.module = module - - def forward(self, *args, **kwargs): - return self.module(*args, **kwargs) - - @MMPOSE_MODULE_WRAPPERS.register_module() 
- class ModuleWrapperInPose: - - def __init__(self, module): - self.module = module - - def forward(self, *args, **kwargs): - return self.module(*args, **kwargs) - - wrapped_module = ModuleWrapperInRazor(model) - assert is_module_wrapper(wrapped_module) - - wrapped_module = ModuleWrapperInPose(model) - assert is_module_wrapper(wrapped_module) - - -def test_get_input_device(): - # if the device is CPU, return -1 - input = torch.zeros([1, 3, 3, 3]) - assert get_input_device(input) == -1 - inputs = [torch.zeros([1, 3, 3, 3]), torch.zeros([1, 4, 4, 4])] - assert get_input_device(inputs) == -1 - - # if the device is GPU, return the index of device - if torch.cuda.is_available(): - input = torch.zeros([1, 3, 3, 3]).cuda() - assert get_input_device(input) == 0 - inputs = [ - torch.zeros([1, 3, 3, 3]).cuda(), - torch.zeros([1, 4, 4, 4]).cuda() - ] - assert get_input_device(inputs) == 0 - - # input should be a tensor or list of tensor - with pytest.raises(Exception): - get_input_device(5) - - -def test_scatter(): - # if the device is CPU, just return the input - input = torch.zeros([1, 3, 3, 3]) - output = scatter(input=input, devices=[-1]) - assert torch.allclose(input, output) - - inputs = [torch.zeros([1, 3, 3, 3]), torch.zeros([1, 4, 4, 4])] - outputs = scatter(input=inputs, devices=[-1]) - for input, output in zip(inputs, outputs): - assert torch.allclose(input, output) - - # if the device is GPU, copy the input from CPU to GPU - if torch.cuda.is_available(): - input = torch.zeros([1, 3, 3, 3]) - output = scatter(input=input, devices=[0]) - assert torch.allclose(input.cuda(), output) - - inputs = [torch.zeros([1, 3, 3, 3]), torch.zeros([1, 4, 4, 4])] - outputs = scatter(input=inputs, devices=[0]) - for input, output in zip(inputs, outputs): - assert torch.allclose(input.cuda(), output) - - # input should be a tensor or list of tensor - with pytest.raises(Exception): - scatter(5, [-1]) - - -@pytest.mark.skipif( - torch.__version__ == 'parrots', reason='not supported in parrots now') -def test_Scatter(): - # if the device is CPU, just return the input - target_gpus = [-1] - input = torch.zeros([1, 3, 3, 3]) - outputs = Scatter.forward(target_gpus, input) - assert isinstance(outputs, tuple) - assert torch.allclose(input, outputs[0]) - - target_gpus = [-1] - inputs = [torch.zeros([1, 3, 3, 3]), torch.zeros([1, 4, 4, 4])] - outputs = Scatter.forward(target_gpus, inputs) - assert isinstance(outputs, tuple) - for input, output in zip(inputs, outputs): - assert torch.allclose(input, output) - - # if the device is GPU, copy the input from CPU to GPU - if torch.cuda.is_available(): - target_gpus = [0] - input = torch.zeros([1, 3, 3, 3]) - outputs = Scatter.forward(target_gpus, input) - assert isinstance(outputs, tuple) - assert torch.allclose(input.cuda(), outputs[0]) - - target_gpus = [0] - inputs = [torch.zeros([1, 3, 3, 3]), torch.zeros([1, 4, 4, 4])] - outputs = Scatter.forward(target_gpus, inputs) - assert isinstance(outputs, tuple) - for input, output in zip(inputs, outputs): - assert torch.allclose(input.cuda(), output[0]) diff --git a/tests/test_runner/test_checkpoint.py b/tests/test_runner/test_checkpoint.py deleted file mode 100644 index 6b842e0e61..0000000000 --- a/tests/test_runner/test_checkpoint.py +++ /dev/null @@ -1,452 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
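The registry tests above work because is_module_wrapper (removed together with mmcv.parallel) checks a module's type against every wrapper registered in MODULE_WRAPPERS and, recursively, in the child registries contributed by downstream repos. A rough sketch of that check, not the exact removed implementation, assuming the Registry exposes module_dict and children as in the code above:

def is_module_wrapper_sketch(module, registry) -> bool:
    # A module counts as a wrapper if its type is registered in this
    # registry or in any child registry (e.g. one from a downstream repo).
    wrapper_types = tuple(registry.module_dict.values())
    if isinstance(module, wrapper_types):
        return True
    return any(
        is_module_wrapper_sketch(module, child)
        for child in registry.children.values())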
-import sys -from collections import OrderedDict -from tempfile import TemporaryDirectory -from unittest.mock import MagicMock, patch - -import pytest -import torch -import torch.nn as nn -import torch.optim as optim -from mmengine.fileio.file_client import PetrelBackend -from torch.nn.parallel import DataParallel - -from mmcv.parallel.registry import MODULE_WRAPPERS -from mmcv.runner.checkpoint import (_load_checkpoint_with_prefix, - get_state_dict, load_checkpoint, - load_from_local, load_from_pavi, - save_checkpoint) - -sys.modules['petrel_client'] = MagicMock() -sys.modules['petrel_client.client'] = MagicMock() - - -@MODULE_WRAPPERS.register_module() -class DDPWrapper: - - def __init__(self, module): - self.module = module - - -class Block(nn.Module): - - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(3, 3, 1) - self.norm = nn.BatchNorm2d(3) - - -class Model(nn.Module): - - def __init__(self): - super().__init__() - self.block = Block() - self.conv = nn.Conv2d(3, 3, 1) - - -class Mockpavimodel: - - def __init__(self, name='fakename'): - self.name = name - - def download(self, file): - pass - - -def assert_tensor_equal(tensor_a, tensor_b): - assert tensor_a.eq(tensor_b).all() - - -def test_get_state_dict(): - if torch.__version__ == 'parrots': - state_dict_keys = { - 'block.conv.weight', 'block.conv.bias', 'block.norm.weight', - 'block.norm.bias', 'block.norm.running_mean', - 'block.norm.running_var', 'conv.weight', 'conv.bias' - } - else: - state_dict_keys = { - 'block.conv.weight', 'block.conv.bias', 'block.norm.weight', - 'block.norm.bias', 'block.norm.running_mean', - 'block.norm.running_var', 'block.norm.num_batches_tracked', - 'conv.weight', 'conv.bias' - } - - model = Model() - state_dict = get_state_dict(model) - assert isinstance(state_dict, OrderedDict) - assert set(state_dict.keys()) == state_dict_keys - - assert_tensor_equal(state_dict['block.conv.weight'], - model.block.conv.weight) - assert_tensor_equal(state_dict['block.conv.bias'], model.block.conv.bias) - assert_tensor_equal(state_dict['block.norm.weight'], - model.block.norm.weight) - assert_tensor_equal(state_dict['block.norm.bias'], model.block.norm.bias) - assert_tensor_equal(state_dict['block.norm.running_mean'], - model.block.norm.running_mean) - assert_tensor_equal(state_dict['block.norm.running_var'], - model.block.norm.running_var) - if torch.__version__ != 'parrots': - assert_tensor_equal(state_dict['block.norm.num_batches_tracked'], - model.block.norm.num_batches_tracked) - assert_tensor_equal(state_dict['conv.weight'], model.conv.weight) - assert_tensor_equal(state_dict['conv.bias'], model.conv.bias) - - wrapped_model = DDPWrapper(model) - state_dict = get_state_dict(wrapped_model) - assert isinstance(state_dict, OrderedDict) - assert set(state_dict.keys()) == state_dict_keys - assert_tensor_equal(state_dict['block.conv.weight'], - wrapped_model.module.block.conv.weight) - assert_tensor_equal(state_dict['block.conv.bias'], - wrapped_model.module.block.conv.bias) - assert_tensor_equal(state_dict['block.norm.weight'], - wrapped_model.module.block.norm.weight) - assert_tensor_equal(state_dict['block.norm.bias'], - wrapped_model.module.block.norm.bias) - assert_tensor_equal(state_dict['block.norm.running_mean'], - wrapped_model.module.block.norm.running_mean) - assert_tensor_equal(state_dict['block.norm.running_var'], - wrapped_model.module.block.norm.running_var) - if torch.__version__ != 'parrots': - assert_tensor_equal( - state_dict['block.norm.num_batches_tracked'], - 
wrapped_model.module.block.norm.num_batches_tracked)
-    assert_tensor_equal(state_dict['conv.weight'],
-                        wrapped_model.module.conv.weight)
-    assert_tensor_equal(state_dict['conv.bias'],
-                        wrapped_model.module.conv.bias)
-
-    # wrapped inner module
-    for name, module in wrapped_model.module._modules.items():
-        module = DataParallel(module)
-        wrapped_model.module._modules[name] = module
-    state_dict = get_state_dict(wrapped_model)
-    assert isinstance(state_dict, OrderedDict)
-    assert set(state_dict.keys()) == state_dict_keys
-    assert_tensor_equal(state_dict['block.conv.weight'],
-                        wrapped_model.module.block.module.conv.weight)
-    assert_tensor_equal(state_dict['block.conv.bias'],
-                        wrapped_model.module.block.module.conv.bias)
-    assert_tensor_equal(state_dict['block.norm.weight'],
-                        wrapped_model.module.block.module.norm.weight)
-    assert_tensor_equal(state_dict['block.norm.bias'],
-                        wrapped_model.module.block.module.norm.bias)
-    assert_tensor_equal(state_dict['block.norm.running_mean'],
-                        wrapped_model.module.block.module.norm.running_mean)
-    assert_tensor_equal(state_dict['block.norm.running_var'],
-                        wrapped_model.module.block.module.norm.running_var)
-    if torch.__version__ != 'parrots':
-        assert_tensor_equal(
-            state_dict['block.norm.num_batches_tracked'],
-            wrapped_model.module.block.module.norm.num_batches_tracked)
-    assert_tensor_equal(state_dict['conv.weight'],
-                        wrapped_model.module.conv.module.weight)
-    assert_tensor_equal(state_dict['conv.bias'],
-                        wrapped_model.module.conv.module.bias)
-
-
-def test_load_pavimodel_dist():
-
-    sys.modules['pavi'] = MagicMock()
-    sys.modules['pavi.modelcloud'] = MagicMock()
-    pavimodel = Mockpavimodel()
-    import pavi
-    pavi.modelcloud.get = MagicMock(return_value=pavimodel)
-    with pytest.raises(AssertionError):
-        # test pavi prefix
-        _ = load_from_pavi('MyPaviFolder/checkpoint.pth')
-
-    with pytest.raises(FileNotFoundError):
-        # there is no such checkpoint to load
-        _ = load_from_pavi('pavi://checkpoint.pth')
-
-
-def test_load_checkpoint_with_prefix():
-
-    class FooModule(nn.Module):
-
-        def __init__(self):
-            super().__init__()
-            self.linear = nn.Linear(1, 2)
-            self.conv2d = nn.Conv2d(3, 1, 3)
-            self.conv2d_2 = nn.Conv2d(3, 2, 3)
-
-    model = FooModule()
-    nn.init.constant_(model.linear.weight, 1)
-    nn.init.constant_(model.linear.bias, 2)
-    nn.init.constant_(model.conv2d.weight, 3)
-    nn.init.constant_(model.conv2d.bias, 4)
-    nn.init.constant_(model.conv2d_2.weight, 5)
-    nn.init.constant_(model.conv2d_2.bias, 6)
-
-    with TemporaryDirectory() as tmp_dir:
-        # save the checkpoint inside the temporary directory so that it is
-        # cleaned up automatically instead of leaking into the CWD
-        ckpt_path = f'{tmp_dir}/model.pth'
-        torch.save(model.state_dict(), ckpt_path)
-        prefix = 'conv2d'
-        state_dict = _load_checkpoint_with_prefix(prefix, ckpt_path)
-        assert torch.equal(model.conv2d.state_dict()['weight'],
-                           state_dict['weight'])
-        assert torch.equal(model.conv2d.state_dict()['bias'],
-                           state_dict['bias'])
-
-        # a prefix absent from the pretrained model should raise an error
-        with pytest.raises(AssertionError):
-            prefix = 'back'
-            _load_checkpoint_with_prefix(prefix, ckpt_path)
-
-
-def test_load_checkpoint():
-    import os
-    import re
-    import tempfile
-
-    class PrefixModel(nn.Module):
-
-        def __init__(self):
-            super().__init__()
-            self.backbone = Model()
-
-    pmodel = PrefixModel()
-    model = Model()
-    checkpoint_path = os.path.join(tempfile.gettempdir(), 'checkpoint.pth')
-
-    # add prefix
-    torch.save(model.state_dict(), checkpoint_path)
-    state_dict = load_checkpoint(
-        pmodel, checkpoint_path, revise_keys=[(r'^', 'backbone.')])
-    for key in pmodel.backbone.state_dict().keys():
-        assert torch.equal(pmodel.backbone.state_dict()[key],
-                           state_dict[key])
-    # strip prefix
-    torch.save(pmodel.state_dict(), checkpoint_path)
-    state_dict = load_checkpoint(
-        model, checkpoint_path, revise_keys=[(r'^backbone\.', '')])
-
-    for key in state_dict.keys():
-        key_stripped = re.sub(r'^backbone\.', '', key)
-        assert torch.equal(model.state_dict()[key_stripped], state_dict[key])
-    os.remove(checkpoint_path)
-
-
-def test_load_checkpoint_metadata():
-    import os
-    import tempfile
-
-    from mmcv.runner import load_checkpoint, save_checkpoint
-
-    class ModelV1(nn.Module):
-
-        def __init__(self):
-            super().__init__()
-            self.block = Block()
-            self.conv1 = nn.Conv2d(3, 3, 1)
-            self.conv2 = nn.Conv2d(3, 3, 1)
-            nn.init.normal_(self.conv1.weight)
-            nn.init.normal_(self.conv2.weight)
-
-    class ModelV2(nn.Module):
-        _version = 2
-
-        def __init__(self):
-            super().__init__()
-            self.block = Block()
-            self.conv0 = nn.Conv2d(3, 3, 1)
-            self.conv1 = nn.Conv2d(3, 3, 1)
-            nn.init.normal_(self.conv0.weight)
-            nn.init.normal_(self.conv1.weight)
-
-        def _load_from_state_dict(self, state_dict, prefix, local_metadata,
-                                  *args, **kwargs):
-            """load checkpoints."""
-
-            # Names of some parameters have been changed.
-            version = local_metadata.get('version', None)
-            if version is None or version < 2:
-                state_dict_keys = list(state_dict.keys())
-                convert_map = {'conv1': 'conv0', 'conv2': 'conv1'}
-                for k in state_dict_keys:
-                    for ori_str, new_str in convert_map.items():
-                        if k.startswith(prefix + ori_str):
-                            new_key = k.replace(ori_str, new_str)
-                            state_dict[new_key] = state_dict[k]
-                            del state_dict[k]
-
-            super()._load_from_state_dict(state_dict, prefix, local_metadata,
-                                          *args, **kwargs)
-
-    model_v1 = ModelV1()
-    model_v1_conv0_weight = model_v1.conv1.weight.detach()
-    model_v1_conv1_weight = model_v1.conv2.weight.detach()
-    model_v2 = ModelV2()
-    model_v2_conv0_weight = model_v2.conv0.weight.detach()
-    model_v2_conv1_weight = model_v2.conv1.weight.detach()
-    ckpt_v1_path = os.path.join(tempfile.gettempdir(), 'checkpoint_v1.pth')
-    ckpt_v2_path = os.path.join(tempfile.gettempdir(), 'checkpoint_v2.pth')
-
-    # Save checkpoint
-    save_checkpoint(model_v1, ckpt_v1_path)
-    save_checkpoint(model_v2, ckpt_v2_path)
-
-    # test load v1 model
-    load_checkpoint(model_v2, ckpt_v1_path)
-    assert torch.allclose(model_v2.conv0.weight, model_v1_conv0_weight)
-    assert torch.allclose(model_v2.conv1.weight, model_v1_conv1_weight)
-
-    # test load v2 model
-    load_checkpoint(model_v2, ckpt_v2_path)
-    assert torch.allclose(model_v2.conv0.weight, model_v2_conv0_weight)
-    assert torch.allclose(model_v2.conv1.weight, model_v2_conv1_weight)
-
-
-def test_load_classes_name():
-    import os
-    import tempfile
-
-    from mmcv.runner import load_checkpoint, save_checkpoint
-    checkpoint_path = os.path.join(tempfile.gettempdir(), 'checkpoint.pth')
-    model = Model()
-    save_checkpoint(model, checkpoint_path)
-    checkpoint = load_checkpoint(model, checkpoint_path)
-    assert 'meta' in checkpoint and 'CLASSES' not in checkpoint['meta']
-
-    model.CLASSES = ('class1', 'class2')
-    save_checkpoint(model, checkpoint_path)
-    checkpoint = load_checkpoint(model, checkpoint_path)
-    assert 'meta' in checkpoint and 'CLASSES' in checkpoint['meta']
-    assert checkpoint['meta']['CLASSES'] == ('class1', 'class2')
-
-    model = Model()
-    wrapped_model = DDPWrapper(model)
-    save_checkpoint(wrapped_model, checkpoint_path)
-    checkpoint = load_checkpoint(wrapped_model, checkpoint_path)
-    assert 'meta' in checkpoint and 'CLASSES' not in checkpoint['meta']
-
-    wrapped_model.module.CLASSES = ('class1', 'class2')
-    
save_checkpoint(wrapped_model, checkpoint_path) - checkpoint = load_checkpoint(wrapped_model, checkpoint_path) - assert 'meta' in checkpoint and 'CLASSES' in checkpoint['meta'] - assert checkpoint['meta']['CLASSES'] == ('class1', 'class2') - - # remove the temp file - os.remove(checkpoint_path) - - -def test_checkpoint_loader(): - import os - import tempfile - - from mmcv.runner import CheckpointLoader, _load_checkpoint, save_checkpoint - checkpoint_path = os.path.join(tempfile.gettempdir(), 'checkpoint.pth') - model = Model() - save_checkpoint(model, checkpoint_path) - checkpoint = _load_checkpoint(checkpoint_path) - assert 'meta' in checkpoint and 'CLASSES' not in checkpoint['meta'] - # remove the temp file - os.remove(checkpoint_path) - - filenames = [ - 'http://xx.xx/xx.pth', 'https://xx.xx/xx.pth', - 'modelzoo://xx.xx/xx.pth', 'torchvision://xx.xx/xx.pth', - 'open-mmlab://xx.xx/xx.pth', 'openmmlab://xx.xx/xx.pth', - 'mmcls://xx.xx/xx.pth', 'pavi://xx.xx/xx.pth', 's3://xx.xx/xx.pth', - 'ss3://xx.xx/xx.pth', ' s3://xx.xx/xx.pth', - 'open-mmlab:s3://xx.xx/xx.pth', 'openmmlab:s3://xx.xx/xx.pth', - 'openmmlabs3://xx.xx/xx.pth', ':s3://xx.xx/xx.path' - ] - fn_names = [ - 'load_from_http', 'load_from_http', 'load_from_torchvision', - 'load_from_torchvision', 'load_from_openmmlab', 'load_from_openmmlab', - 'load_from_mmcls', 'load_from_pavi', 'load_from_ceph', - 'load_from_local', 'load_from_local', 'load_from_ceph', - 'load_from_ceph', 'load_from_local', 'load_from_local' - ] - - for filename, fn_name in zip(filenames, fn_names): - loader = CheckpointLoader._get_checkpoint_loader(filename) - assert loader.__name__ == fn_name - - @CheckpointLoader.register_scheme(prefixes='ftp://') - def load_from_ftp(filename, map_location): - return dict(filename=filename) - - # test register_loader - filename = 'ftp://xx.xx/xx.pth' - loader = CheckpointLoader._get_checkpoint_loader(filename) - assert loader.__name__ == 'load_from_ftp' - - def load_from_ftp1(filename, map_location): - return dict(filename=filename) - - # test duplicate registered error - with pytest.raises(KeyError): - CheckpointLoader.register_scheme('ftp://', load_from_ftp1) - - # test force param - CheckpointLoader.register_scheme('ftp://', load_from_ftp1, force=True) - checkpoint = CheckpointLoader.load_checkpoint(filename) - assert checkpoint['filename'] == filename - - # test print function name - loader = CheckpointLoader._get_checkpoint_loader(filename) - assert loader.__name__ == 'load_from_ftp1' - - # test sort - @CheckpointLoader.register_scheme(prefixes='a/b') - def load_from_ab(filename, map_location): - return dict(filename=filename) - - @CheckpointLoader.register_scheme(prefixes='a/b/c') - def load_from_abc(filename, map_location): - return dict(filename=filename) - - filename = 'a/b/c/d' - loader = CheckpointLoader._get_checkpoint_loader(filename) - assert loader.__name__ == 'load_from_abc' - - -def test_save_checkpoint(tmp_path): - model = Model() - optimizer = optim.SGD(model.parameters(), lr=0.1, momentum=0.9) - # meta is not a dict - with pytest.raises(TypeError): - save_checkpoint(model, '/path/of/your/filename', meta='invalid type') - - # 1. 
save to disk
-    filename = str(tmp_path / 'checkpoint1.pth')
-    save_checkpoint(model, filename)
-
-    filename = str(tmp_path / 'checkpoint2.pth')
-    save_checkpoint(model, filename, optimizer)
-
-    filename = str(tmp_path / 'checkpoint3.pth')
-    save_checkpoint(model, filename, meta={'test': 'test'})
-
-    filename = str(tmp_path / 'checkpoint4.pth')
-    save_checkpoint(model, filename, file_client_args={'backend': 'disk'})
-
-    # 2. save to petrel oss
-    with patch.object(PetrelBackend, 'put') as mock_method:
-        filename = 's3://path/of/your/checkpoint1.pth'
-        save_checkpoint(model, filename)
-    mock_method.assert_called()
-
-    with patch.object(PetrelBackend, 'put') as mock_method:
-        filename = 's3://path//of/your/checkpoint2.pth'
-        save_checkpoint(
-            model, filename, file_client_args={'backend': 'petrel'})
-    mock_method.assert_called()
-
-
-def test_load_from_local():
-    import os
-    home_path = os.path.expanduser('~')
-    checkpoint_path = os.path.join(
-        home_path, 'dummy_checkpoint_used_to_test_load_from_local.pth')
-    model = Model()
-    save_checkpoint(model, checkpoint_path)
-    checkpoint = load_from_local(
-        '~/dummy_checkpoint_used_to_test_load_from_local.pth',
-        map_location=None)
-    assert_tensor_equal(checkpoint['state_dict']['block.conv.weight'],
-                        model.block.conv.weight)
-    os.remove(checkpoint_path)
diff --git a/tests/test_runner/test_dist_utils.py b/tests/test_runner/test_dist_utils.py
deleted file mode 100644
index 979c2e4f3f..0000000000
--- a/tests/test_runner/test_dist_utils.py
+++ /dev/null
@@ -1,53 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import os
-from unittest.mock import patch
-
-import pytest
-
-from mmcv.runner import init_dist
-
-
-@patch('torch.cuda.device_count', return_value=1)
-@patch('torch.cuda.set_device')
-@patch('torch.distributed.init_process_group')
-@patch('subprocess.getoutput', return_value='127.0.0.1')
-def test_init_dist(mock_getoutput, mock_dist_init, mock_set_device,
-                   mock_device_count):
-    with pytest.raises(ValueError):
-        # launcher must be one of {'pytorch', 'mpi', 'slurm'}
-        init_dist('invalid_launcher')
-
-    # test initialize with slurm launcher
-    os.environ['SLURM_PROCID'] = '0'
-    os.environ['SLURM_NTASKS'] = '1'
-    os.environ['SLURM_NODELIST'] = '[0]'  # haven't checked the correct form
-
-    init_dist('slurm')
-    # no port is specified, use default port 29500
-    assert os.environ['MASTER_PORT'] == '29500'
-    assert os.environ['MASTER_ADDR'] == '127.0.0.1'
-    assert os.environ['WORLD_SIZE'] == '1'
-    assert os.environ['RANK'] == '0'
-    mock_set_device.assert_called_with(0)
-    mock_getoutput.assert_called_with('scontrol show hostname [0] | head -n1')
-    mock_dist_init.assert_called_with(backend='nccl')
-
-    init_dist('slurm', port=29505)
-    # port is specified with argument 'port'
-    assert os.environ['MASTER_PORT'] == '29505'
-    assert os.environ['MASTER_ADDR'] == '127.0.0.1'
-    assert os.environ['WORLD_SIZE'] == '1'
-    assert os.environ['RANK'] == '0'
-    mock_set_device.assert_called_with(0)
-    mock_getoutput.assert_called_with('scontrol show hostname [0] | head -n1')
-    mock_dist_init.assert_called_with(backend='nccl')
-
-    init_dist('slurm')
-    # port is specified by environment variable 'MASTER_PORT'
-    assert os.environ['MASTER_PORT'] == '29505'
-    assert os.environ['MASTER_ADDR'] == '127.0.0.1'
-    assert os.environ['WORLD_SIZE'] == '1'
-    assert os.environ['RANK'] == '0'
-    mock_set_device.assert_called_with(0)
-    mock_getoutput.assert_called_with('scontrol show hostname [0] | head -n1')
-    mock_dist_init.assert_called_with(backend='nccl')
diff --git 
a/tests/test_runner/test_eval_hook.py b/tests/test_runner/test_eval_hook.py
deleted file mode 100644
index 7cab166f83..0000000000
--- a/tests/test_runner/test_eval_hook.py
+++ /dev/null
@@ -1,483 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import json
-import os.path as osp
-import sys
-import tempfile
-import unittest.mock as mock
-from collections import OrderedDict
-from unittest.mock import MagicMock, patch
-
-import pytest
-import torch
-import torch.nn as nn
-import torch.optim as optim
-from mmengine.fileio.file_client import PetrelBackend
-from torch.utils.data import DataLoader, Dataset
-
-from mmcv.runner import DistEvalHook as BaseDistEvalHook
-from mmcv.runner import EpochBasedRunner
-from mmcv.runner import EvalHook as BaseEvalHook
-from mmcv.runner import IterBasedRunner
-from mmcv.utils import get_logger, scandir
-
-sys.modules['petrel_client'] = MagicMock()
-sys.modules['petrel_client.client'] = MagicMock()
-
-
-class ExampleDataset(Dataset):
-
-    def __init__(self):
-        self.index = 0
-        self.eval_result = [1, 4, 3, 7, 2, -3, 4, 6]
-
-    def __getitem__(self, idx):
-        results = dict(x=torch.tensor([1]))
-        return results
-
-    def __len__(self):
-        return 1
-
-    @mock.create_autospec
-    def evaluate(self, results, logger=None):
-        pass
-
-
-class EvalDataset(ExampleDataset):
-
-    def evaluate(self, results, logger=None):
-        acc = self.eval_result[self.index]
-        output = OrderedDict(
-            acc=acc, index=self.index, score=acc, loss_top=acc)
-        self.index += 1
-        return output
-
-
-class Model(nn.Module):
-
-    def __init__(self):
-        super().__init__()
-        self.param = nn.Parameter(torch.tensor([1.0]))
-
-    def forward(self, x, **kwargs):
-        return self.param * x
-
-    def train_step(self, data_batch, optimizer, **kwargs):
-        return {'loss': torch.sum(self(data_batch['x']))}
-
-    def val_step(self, data_batch, optimizer, **kwargs):
-        return {'loss': torch.sum(self(data_batch['x']))}
-
-
-def _build_epoch_runner():
-
-    model = Model()
-    tmp_dir = tempfile.mkdtemp()
-
-    runner = EpochBasedRunner(
-        model=model, work_dir=tmp_dir, logger=get_logger('demo'))
-    return runner
-
-
-def _build_iter_runner():
-
-    model = Model()
-    tmp_dir = tempfile.mkdtemp()
-
-    runner = IterBasedRunner(
-        model=model, work_dir=tmp_dir, logger=get_logger('demo'))
-    return runner
-
-
-class EvalHook(BaseEvalHook):
-
-    _default_greater_keys = ['acc', 'top']
-    _default_less_keys = ['loss', 'loss_top']
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-
-class DistEvalHook(BaseDistEvalHook):
-
-    greater_keys = ['acc', 'top']
-    less_keys = ['loss', 'loss_top']
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-
-
-def test_eval_hook():
-    with pytest.raises(AssertionError):
-        # `save_best` should be a str
-        test_dataset = Model()
-        data_loader = DataLoader(test_dataset)
-        EvalHook(data_loader, save_best=True)
-
-    with pytest.raises(TypeError):
-        # dataloader must be a pytorch DataLoader
-        test_dataset = Model()
-        data_loader = [DataLoader(test_dataset)]
-        EvalHook(data_loader)
-
-    with pytest.raises(ValueError):
-        # key_indicator must be valid when rule_map is None
-        test_dataset = ExampleDataset()
-        data_loader = DataLoader(test_dataset)
-        EvalHook(data_loader, save_best='unsupport')
-
-    with pytest.raises(KeyError):
-        # rule must be in keys of rule_map
-        test_dataset = ExampleDataset()
-        data_loader = DataLoader(test_dataset)
-        EvalHook(data_loader, save_best='auto', rule='unsupport')
-
-    # if eval_res is an empty dict, print a warning message
-    with pytest.warns(UserWarning) as record_warnings:
-
-        class _EvalDataset(ExampleDataset):
-
-            def evaluate(self, results, logger=None):
-                return {}
-
-        test_dataset = _EvalDataset()
-        data_loader = DataLoader(test_dataset)
-        eval_hook = EvalHook(data_loader, save_best='auto')
-        runner = _build_epoch_runner()
-        runner.register_hook(eval_hook)
-        runner.run([data_loader], [('train', 1)], 1)
-    # Since many warnings will be thrown, we just need to check that the
-    # expected warning is raised
-    expected_message = ('Since `eval_res` is an empty dict, the behavior to '
-                        'save the best checkpoint will be skipped in this '
-                        'evaluation.')
-    for warning in record_warnings:
-        if str(warning.message) == expected_message:
-            break
-    else:
-        assert False
-
-    test_dataset = ExampleDataset()
-    loader = DataLoader(test_dataset)
-    model = Model()
-    data_loader = DataLoader(test_dataset)
-    eval_hook = EvalHook(data_loader, save_best=None)
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-
-        # total_epochs = 1
-        logger = get_logger('test_eval')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_hook(eval_hook)
-        runner.run([loader], [('train', 1)], 1)
-        test_dataset.evaluate.assert_called_with(
-            test_dataset, [torch.tensor([1])], logger=runner.logger)
-        assert runner.meta is None or 'best_score' not in runner.meta[
-            'hook_msgs']
-        assert runner.meta is None or 'best_ckpt' not in runner.meta[
-            'hook_msgs']
-
-    # when `save_best` is set to 'auto', the first metric will be used.
-    loader = DataLoader(EvalDataset())
-    model = Model()
-    data_loader = DataLoader(EvalDataset())
-    eval_hook = EvalHook(data_loader, interval=1, save_best='auto')
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        logger = get_logger('test_eval')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_checkpoint_hook(dict(interval=1))
-        runner.register_hook(eval_hook)
-        runner.run([loader], [('train', 1)], 8)
-
-        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')
-
-        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
-        assert osp.exists(ckpt_path)
-        assert runner.meta['hook_msgs']['best_score'] == 7
-
-    # total_epochs = 8, return the best acc and corresponding epoch
-    loader = DataLoader(EvalDataset())
-    model = Model()
-    data_loader = DataLoader(EvalDataset())
-    eval_hook = EvalHook(data_loader, interval=1, save_best='acc')
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        logger = get_logger('test_eval')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_checkpoint_hook(dict(interval=1))
-        runner.register_hook(eval_hook)
-        runner.run([loader], [('train', 1)], 8)
-
-        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')
-
-        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
-        assert osp.exists(ckpt_path)
-        assert runner.meta['hook_msgs']['best_score'] == 7
-
-    # total_epochs = 8, return the best loss_top and corresponding epoch
-    loader = DataLoader(EvalDataset())
-    model = Model()
-    data_loader = DataLoader(EvalDataset())
-    eval_hook = EvalHook(data_loader, interval=1, save_best='loss_top')
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        logger = get_logger('test_eval')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_checkpoint_hook(dict(interval=1))
-        runner.register_hook(eval_hook)
-        runner.run([loader], [('train', 1)], 8)
-
-        ckpt_path = osp.join(tmpdir, 'best_loss_top_epoch_6.pth')
-
-        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
-        assert osp.exists(ckpt_path)
-        assert runner.meta['hook_msgs']['best_score'] == -3
-
-    # total_epochs = 8, return the best score and corresponding epoch
-    data_loader = DataLoader(EvalDataset())
-    eval_hook = EvalHook(
-        data_loader, interval=1, save_best='score', rule='greater')
-    with tempfile.TemporaryDirectory() as tmpdir:
-        logger = get_logger('test_eval')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_checkpoint_hook(dict(interval=1))
-        runner.register_hook(eval_hook)
-        runner.run([loader], [('train', 1)], 8)
-
-        ckpt_path = osp.join(tmpdir, 'best_score_epoch_4.pth')
-
-        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
-        assert osp.exists(ckpt_path)
-        assert runner.meta['hook_msgs']['best_score'] == 7
-
-    # total_epochs = 8, return the best score using the 'less' compare func
-    # and indicate the corresponding epoch
-    data_loader = DataLoader(EvalDataset())
-    eval_hook = EvalHook(data_loader, save_best='acc', rule='less')
-    with tempfile.TemporaryDirectory() as tmpdir:
-        logger = get_logger('test_eval')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_checkpoint_hook(dict(interval=1))
-        runner.register_hook(eval_hook)
-        runner.run([loader], [('train', 1)], 8)
-
-        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')
-
-        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
-        assert osp.exists(ckpt_path)
-        assert runner.meta['hook_msgs']['best_score'] == -3
-
-    # test the EvalHook when resuming from a checkpoint
-    data_loader = DataLoader(EvalDataset())
-    eval_hook = EvalHook(data_loader, save_best='acc')
-    with tempfile.TemporaryDirectory() as tmpdir:
-        logger = get_logger('test_eval')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_checkpoint_hook(dict(interval=1))
-        runner.register_hook(eval_hook)
-        runner.run([loader], [('train', 1)], 2)
-
-        old_ckpt_path = osp.join(tmpdir, 'best_acc_epoch_2.pth')
-
-        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
-        assert osp.exists(old_ckpt_path)
-        assert runner.meta['hook_msgs']['best_score'] == 4
-
-        resume_from = old_ckpt_path
-        loader = DataLoader(ExampleDataset())
-        eval_hook = EvalHook(data_loader, save_best='acc')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_checkpoint_hook(dict(interval=1))
-        runner.register_hook(eval_hook)
-
-        runner.resume(resume_from)
-        assert runner.meta['hook_msgs']['best_ckpt'] == old_ckpt_path
-        assert osp.exists(old_ckpt_path)
-        assert runner.meta['hook_msgs']['best_score'] == 4
-
-        runner.run([loader], [('train', 1)], 8)
-
-        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_4.pth')
-
-        assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path
-        assert osp.exists(ckpt_path)
-        assert runner.meta['hook_msgs']['best_score'] == 7
-        assert not osp.exists(old_ckpt_path)
-
-    # test EvalHook with custom test_fn and greater/less keys
-    loader = DataLoader(EvalDataset())
-    model = Model()
-    data_loader = DataLoader(EvalDataset())
-
-    eval_hook = EvalHook(
-        data_loader,
-        save_best='acc',
-        test_fn=mock.MagicMock(return_value={}),
-        greater_keys=[],
-        less_keys=['acc'])
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        logger = get_logger('test_eval')
-        runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger)
-        runner.register_checkpoint_hook(dict(interval=1))
-        runner.register_hook(eval_hook)
-        runner.run([loader], [('train', 1)], 8)
-
-        ckpt_path = osp.join(tmpdir, 'best_acc_epoch_6.pth')
-
-        assert 
runner.meta['hook_msgs']['best_ckpt'] == ckpt_path - assert osp.exists(ckpt_path) - assert runner.meta['hook_msgs']['best_score'] == -3 - - # test EvalHook with specified `out_dir` - loader = DataLoader(EvalDataset()) - model = Model() - data_loader = DataLoader(EvalDataset()) - out_dir = 's3://user/data' - eval_hook = EvalHook( - data_loader, interval=1, save_best='auto', out_dir=out_dir) - - with patch.object(PetrelBackend, 'put') as mock_put, \ - patch.object(PetrelBackend, 'remove') as mock_remove, \ - patch.object(PetrelBackend, 'isfile') as mock_isfile, \ - tempfile.TemporaryDirectory() as tmpdir: - logger = get_logger('test_eval') - runner = EpochBasedRunner(model=model, work_dir=tmpdir, logger=logger) - runner.register_checkpoint_hook(dict(interval=1)) - runner.register_hook(eval_hook) - runner.run([loader], [('train', 1)], 8) - - basename = osp.basename(runner.work_dir.rstrip(osp.sep)) - ckpt_path = f'{out_dir}/{basename}/best_acc_epoch_4.pth' - - assert runner.meta['hook_msgs']['best_ckpt'] == ckpt_path - assert runner.meta['hook_msgs']['best_score'] == 7 - - assert mock_put.call_count == 3 - assert mock_remove.call_count == 2 - assert mock_isfile.call_count == 2 - - -@patch('mmcv.engine.single_gpu_test', MagicMock) -@patch('mmcv.engine.multi_gpu_test', MagicMock) -@pytest.mark.parametrize('EvalHookParam', [EvalHook, DistEvalHook]) -@pytest.mark.parametrize('_build_demo_runner,by_epoch', - [(_build_epoch_runner, True), - (_build_iter_runner, False)]) -def test_start_param(EvalHookParam, _build_demo_runner, by_epoch): - # create dummy data - dataloader = DataLoader(EvalDataset()) - - # 0.1. dataloader is not a DataLoader object - with pytest.raises(TypeError): - EvalHookParam(dataloader=MagicMock(), interval=-1) - - # 0.2. negative interval - with pytest.raises(ValueError): - EvalHookParam(dataloader, interval=-1) - - # 0.3. negative start - with pytest.raises(ValueError): - EvalHookParam(dataloader, start=-1) - - # 1. start=None, interval=1: perform evaluation after each epoch. - runner = _build_demo_runner() - evalhook = EvalHookParam(dataloader, interval=1, by_epoch=by_epoch) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 2) - assert evalhook.evaluate.call_count == 2 # after epoch 1 & 2 - - # 2. start=1, interval=1: perform evaluation after each epoch. - runner = _build_demo_runner() - evalhook = EvalHookParam( - dataloader, start=1, interval=1, by_epoch=by_epoch) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 2) - assert evalhook.evaluate.call_count == 2 # after epoch 1 & 2 - - # 3. start=None, interval=2: perform evaluation after epoch 2, 4, 6, etc - runner = _build_demo_runner() - evalhook = EvalHookParam(dataloader, interval=2, by_epoch=by_epoch) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 2) - assert evalhook.evaluate.call_count == 1 # after epoch 2 - - # 4. start=1, interval=2: perform evaluation after epoch 1, 3, 5, etc - runner = _build_demo_runner() - evalhook = EvalHookParam( - dataloader, start=1, interval=2, by_epoch=by_epoch) - evalhook.evaluate = MagicMock() - runner.register_hook(evalhook) - runner.run([dataloader], [('train', 1)], 3) - assert evalhook.evaluate.call_count == 2 # after epoch 1 & 3 - - # 5. start=0, interval=1: perform evaluation after each epoch and - # before epoch 1. 
-    runner = _build_demo_runner()
-    evalhook = EvalHookParam(dataloader, start=0, by_epoch=by_epoch)
-    evalhook.evaluate = MagicMock()
-    runner.register_hook(evalhook)
-    runner.run([dataloader], [('train', 1)], 2)
-    assert evalhook.evaluate.call_count == 3  # before epoch1 and after e1 & e2
-
-    # 6. resuming from epoch i, start = x (x<=i), interval = 1: perform
-    # evaluation after each epoch and before the first epoch.
-    runner = _build_demo_runner()
-    evalhook = EvalHookParam(dataloader, start=1, by_epoch=by_epoch)
-    evalhook.evaluate = MagicMock()
-    runner.register_hook(evalhook)
-    if by_epoch:
-        runner._epoch = 2
-    else:
-        runner._iter = 2
-    runner.run([dataloader], [('train', 1)], 3)
-    assert evalhook.evaluate.call_count == 2  # before & after epoch 3
-
-    # 7. resuming from epoch i, start = i+1/None, interval = 1: perform
-    # evaluation after each epoch.
-    runner = _build_demo_runner()
-    evalhook = EvalHookParam(dataloader, start=2, by_epoch=by_epoch)
-    evalhook.evaluate = MagicMock()
-    runner.register_hook(evalhook)
-    if by_epoch:
-        runner._epoch = 1
-    else:
-        runner._iter = 1
-    runner.run([dataloader], [('train', 1)], 3)
-    assert evalhook.evaluate.call_count == 2  # after epoch 2 & 3
-
-
-@pytest.mark.parametrize('runner,by_epoch,eval_hook_priority',
-                         [(EpochBasedRunner, True, 'NORMAL'),
-                          (EpochBasedRunner, True, 'LOW'),
-                          (IterBasedRunner, False, 'LOW')])
-def test_logger(runner, by_epoch, eval_hook_priority):
-    loader = DataLoader(EvalDataset())
-    model = Model()
-    data_loader = DataLoader(EvalDataset())
-    eval_hook = EvalHook(
-        data_loader, interval=1, by_epoch=by_epoch, save_best='acc')
-
-    with tempfile.TemporaryDirectory() as tmpdir:
-        logger = get_logger('test_logger')
-        optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
-        runner = EpochBasedRunner(
-            model=model, optimizer=optimizer, work_dir=tmpdir, logger=logger)
-        runner.register_logger_hooks(
-            dict(
-                interval=1,
-                hooks=[dict(type='TextLoggerHook', by_epoch=by_epoch)]))
-        runner.register_timer_hook(dict(type='IterTimerHook'))
-        runner.register_hook(eval_hook, priority=eval_hook_priority)
-        runner.run([loader], [('train', 1)], 1)
-
-        path = osp.join(tmpdir, next(scandir(tmpdir, '.json')))
-        with open(path) as fr:
-            fr.readline()  # skip the first line which is `hook_msg`
-            train_log = json.loads(fr.readline())
-            assert train_log['mode'] == 'train' and 'time' in train_log
-            val_log = json.loads(fr.readline())
-            assert val_log['mode'] == 'val' and 'time' not in val_log
diff --git a/tests/test_runner/test_fp16.py b/tests/test_runner/test_fp16.py
deleted file mode 100644
index e34c909cb9..0000000000
--- a/tests/test_runner/test_fp16.py
+++ /dev/null
@@ -1,317 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import numpy as np -import pytest -import torch -import torch.nn as nn - -from mmcv.runner.fp16_utils import auto_fp16, cast_tensor_type, force_fp32 - - -def test_cast_tensor_type(): - inputs = torch.FloatTensor([5.]) - src_type = torch.float32 - dst_type = torch.int32 - outputs = cast_tensor_type(inputs, src_type, dst_type) - assert isinstance(outputs, torch.Tensor) - assert outputs.dtype == dst_type - - # convert torch.float to torch.half - inputs = torch.FloatTensor([5.]) - src_type = torch.float - dst_type = torch.half - outputs = cast_tensor_type(inputs, src_type, dst_type) - assert isinstance(outputs, torch.Tensor) - assert outputs.dtype == dst_type - - # skip the conversion when the type of input is not the same as src_type - inputs = torch.IntTensor([5]) - src_type = torch.float - dst_type = torch.half - outputs = cast_tensor_type(inputs, src_type, dst_type) - assert isinstance(outputs, torch.Tensor) - assert outputs.dtype == inputs.dtype - - inputs = 'tensor' - src_type = str - dst_type = str - outputs = cast_tensor_type(inputs, src_type, dst_type) - assert isinstance(outputs, str) - - inputs = np.array([5.]) - src_type = np.ndarray - dst_type = np.ndarray - outputs = cast_tensor_type(inputs, src_type, dst_type) - assert isinstance(outputs, np.ndarray) - - inputs = dict( - tensor_a=torch.FloatTensor([1.]), tensor_b=torch.FloatTensor([2.])) - src_type = torch.float32 - dst_type = torch.int32 - outputs = cast_tensor_type(inputs, src_type, dst_type) - assert isinstance(outputs, dict) - assert outputs['tensor_a'].dtype == dst_type - assert outputs['tensor_b'].dtype == dst_type - - inputs = [torch.FloatTensor([1.]), torch.FloatTensor([2.])] - src_type = torch.float32 - dst_type = torch.int32 - outputs = cast_tensor_type(inputs, src_type, dst_type) - assert isinstance(outputs, list) - assert outputs[0].dtype == dst_type - assert outputs[1].dtype == dst_type - - inputs = 5 - outputs = cast_tensor_type(inputs, None, None) - assert isinstance(outputs, int) - - -def test_auto_fp16(): - - with pytest.raises(TypeError): - # ExampleObject is not a subclass of nn.Module - - class ExampleObject: - - @auto_fp16() - def __call__(self, x): - return x - - model = ExampleObject() - input_x = torch.ones(1, dtype=torch.float32) - model(input_x) - - # apply to all input args - class ExampleModule(nn.Module): - - @auto_fp16() - def forward(self, x, y): - return x, y - - model = ExampleModule() - input_x = torch.ones(1, dtype=torch.float32) - input_y = torch.ones(1, dtype=torch.float32) - output_x, output_y = model(input_x, input_y) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - - model.fp16_enabled = True - output_x, output_y = model(input_x, input_y) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.half - - if torch.cuda.is_available(): - model.cuda() - output_x, output_y = model(input_x.cuda(), input_y.cuda()) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.half - - # apply to specified input args - class ExampleModule(nn.Module): - - @auto_fp16(apply_to=('x', )) - def forward(self, x, y): - return x, y - - model = ExampleModule() - input_x = torch.ones(1, dtype=torch.float32) - input_y = torch.ones(1, dtype=torch.float32) - output_x, output_y = model(input_x, input_y) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - - model.fp16_enabled = True - output_x, output_y = model(input_x, input_y) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.float32 - - if 
torch.cuda.is_available(): - model.cuda() - output_x, output_y = model(input_x.cuda(), input_y.cuda()) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.float32 - - # apply to optional input args - class ExampleModule(nn.Module): - - @auto_fp16(apply_to=('x', 'y')) - def forward(self, x, y=None, z=None): - return x, y, z - - model = ExampleModule() - input_x = torch.ones(1, dtype=torch.float32) - input_y = torch.ones(1, dtype=torch.float32) - input_z = torch.ones(1, dtype=torch.float32) - output_x, output_y, output_z = model(input_x, y=input_y, z=input_z) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - assert output_z.dtype == torch.float32 - - model.fp16_enabled = True - output_x, output_y, output_z = model(input_x, y=input_y, z=input_z) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.half - assert output_z.dtype == torch.float32 - - if torch.cuda.is_available(): - model.cuda() - output_x, output_y, output_z = model( - input_x.cuda(), y=input_y.cuda(), z=input_z.cuda()) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.half - assert output_z.dtype == torch.float32 - - # out_fp32=True - class ExampleModule(nn.Module): - - @auto_fp16(apply_to=('x', 'y'), out_fp32=True) - def forward(self, x, y=None, z=None): - return x, y, z - - model = ExampleModule() - input_x = torch.ones(1, dtype=torch.half) - input_y = torch.ones(1, dtype=torch.float32) - input_z = torch.ones(1, dtype=torch.float32) - output_x, output_y, output_z = model(input_x, y=input_y, z=input_z) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.float32 - assert output_z.dtype == torch.float32 - - model.fp16_enabled = True - output_x, output_y, output_z = model(input_x, y=input_y, z=input_z) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - assert output_z.dtype == torch.float32 - - if torch.cuda.is_available(): - model.cuda() - output_x, output_y, output_z = model( - input_x.cuda(), y=input_y.cuda(), z=input_z.cuda()) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - assert output_z.dtype == torch.float32 - - -def test_force_fp32(): - - with pytest.raises(TypeError): - # ExampleObject is not a subclass of nn.Module - - class ExampleObject: - - @force_fp32() - def __call__(self, x): - return x - - model = ExampleObject() - input_x = torch.ones(1, dtype=torch.float32) - model(input_x) - - # apply to all input args - class ExampleModule(nn.Module): - - @force_fp32() - def forward(self, x, y): - return x, y - - model = ExampleModule() - input_x = torch.ones(1, dtype=torch.half) - input_y = torch.ones(1, dtype=torch.half) - output_x, output_y = model(input_x, input_y) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.half - - model.fp16_enabled = True - output_x, output_y = model(input_x, input_y) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - - if torch.cuda.is_available(): - model.cuda() - output_x, output_y = model(input_x.cuda(), input_y.cuda()) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - - # apply to specified input args - class ExampleModule(nn.Module): - - @force_fp32(apply_to=('x', )) - def forward(self, x, y): - return x, y - - model = ExampleModule() - input_x = torch.ones(1, dtype=torch.half) - input_y = torch.ones(1, dtype=torch.half) - output_x, output_y = model(input_x, input_y) - assert output_x.dtype == torch.half - assert 
output_y.dtype == torch.half - - model.fp16_enabled = True - output_x, output_y = model(input_x, input_y) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.half - - if torch.cuda.is_available(): - model.cuda() - output_x, output_y = model(input_x.cuda(), input_y.cuda()) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.half - - # apply to optional input args - class ExampleModule(nn.Module): - - @force_fp32(apply_to=('x', 'y')) - def forward(self, x, y=None, z=None): - return x, y, z - - model = ExampleModule() - input_x = torch.ones(1, dtype=torch.half) - input_y = torch.ones(1, dtype=torch.half) - input_z = torch.ones(1, dtype=torch.half) - output_x, output_y, output_z = model(input_x, y=input_y, z=input_z) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.half - assert output_z.dtype == torch.half - - model.fp16_enabled = True - output_x, output_y, output_z = model(input_x, y=input_y, z=input_z) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - assert output_z.dtype == torch.half - - if torch.cuda.is_available(): - model.cuda() - output_x, output_y, output_z = model( - input_x.cuda(), y=input_y.cuda(), z=input_z.cuda()) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.float32 - assert output_z.dtype == torch.half - - # out_fp16=True - class ExampleModule(nn.Module): - - @force_fp32(apply_to=('x', 'y'), out_fp16=True) - def forward(self, x, y=None, z=None): - return x, y, z - - model = ExampleModule() - input_x = torch.ones(1, dtype=torch.float32) - input_y = torch.ones(1, dtype=torch.half) - input_z = torch.ones(1, dtype=torch.half) - output_x, output_y, output_z = model(input_x, y=input_y, z=input_z) - assert output_x.dtype == torch.float32 - assert output_y.dtype == torch.half - assert output_z.dtype == torch.half - - model.fp16_enabled = True - output_x, output_y, output_z = model(input_x, y=input_y, z=input_z) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.half - assert output_z.dtype == torch.half - - if torch.cuda.is_available(): - model.cuda() - output_x, output_y, output_z = model( - input_x.cuda(), y=input_y.cuda(), z=input_z.cuda()) - assert output_x.dtype == torch.half - assert output_y.dtype == torch.half - assert output_z.dtype == torch.half diff --git a/tests/test_runner/test_hooks.py b/tests/test_runner/test_hooks.py deleted file mode 100644 index 391a7865f0..0000000000 --- a/tests/test_runner/test_hooks.py +++ /dev/null @@ -1,1923 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -"""Tests the hooks with runners. 
-
-CommandLine:
-    pytest tests/test_runner/test_hooks.py
-    xdoctest tests/test_hooks.py zero
-"""
-import logging
-import os.path as osp
-import platform
-import random
-import re
-import shutil
-import sys
-import tempfile
-from unittest.mock import MagicMock, Mock, call, patch
-
-import pytest
-import torch
-import torch.nn as nn
-from mmengine.fileio.file_client import PetrelBackend
-from torch.nn.init import constant_
-from torch.utils.data import DataLoader
-
-# yapf: disable
-from mmcv.runner import (CheckpointHook, ClearMLLoggerHook, DvcliveLoggerHook,
-                         EMAHook, Fp16OptimizerHook,
-                         GradientCumulativeFp16OptimizerHook,
-                         GradientCumulativeOptimizerHook, IterTimerHook,
-                         MlflowLoggerHook, NeptuneLoggerHook, OptimizerHook,
-                         PaviLoggerHook, SegmindLoggerHook, WandbLoggerHook,
-                         build_runner)
-# yapf: enable
-from mmcv.runner.fp16_utils import auto_fp16
-from mmcv.runner.hooks.hook import HOOKS, Hook
-from mmcv.runner.hooks.lr_updater import (CosineRestartLrUpdaterHook,
-                                          CyclicLrUpdaterHook,
-                                          FlatCosineAnnealingLrUpdaterHook,
-                                          OneCycleLrUpdaterHook,
-                                          StepLrUpdaterHook)
-from mmcv.utils import TORCH_VERSION
-
-sys.modules['petrel_client'] = MagicMock()
-sys.modules['petrel_client.client'] = MagicMock()
-
-
-@pytest.mark.skipif(
-    torch.__version__ == 'parrots', reason='not supported in parrots now')
-def test_optimizerhook():
-
-    class Model(nn.Module):
-
-        def __init__(self):
-            super().__init__()
-            self.conv1 = nn.Conv2d(
-                in_channels=1,
-                out_channels=2,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                dilation=1)
-            self.conv2 = nn.Conv2d(
-                in_channels=2,
-                out_channels=2,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                dilation=1)
-            self.conv3 = nn.Conv2d(
-                in_channels=1,
-                out_channels=2,
-                kernel_size=3,
-                stride=1,
-                padding=1,
-                dilation=1)
-
-        def forward(self, x):
-            x1 = self.conv1(x)
-            x2 = self.conv2(x1)
-            return x1, x2
-
-    model = Model()
-    x = torch.rand(1, 1, 3, 3)
-
-    dummy_runner = Mock()
-    dummy_runner.optimizer.zero_grad = Mock(return_value=None)
-    dummy_runner.optimizer.step = Mock(return_value=None)
-    dummy_runner.model = model
-    dummy_runner.outputs = dict()
-
-    dummy_runner.outputs['num_samples'] = 0
-
-    class DummyLogger:
-
-        def __init__(self):
-            self.msg = ''
-
-        def log(self, msg=None, **kwargs):
-            self.msg += msg
-
-    dummy_runner.logger = DummyLogger()
-    optimizer_hook = OptimizerHook(
-        dict(max_norm=2), detect_anomalous_params=True)
-
-    dummy_runner.outputs['loss'] = model(x)[0].sum()
-    optimizer_hook.after_train_iter(dummy_runner)
-    # assert the parameters of conv2 and conv3 are not in the
-    # computational graph whose root is x1.sum().
- assert 'conv2.weight' in dummy_runner.logger.msg - assert 'conv2.bias' in dummy_runner.logger.msg - assert 'conv3.weight' in dummy_runner.logger.msg - assert 'conv3.bias' in dummy_runner.logger.msg - assert 'conv1.weight' not in dummy_runner.logger.msg - assert 'conv1.bias' not in dummy_runner.logger.msg - - dummy_runner.outputs['loss'] = model(x)[1].sum() - dummy_runner.logger.msg = '' - optimizer_hook.after_train_iter(dummy_runner) - # assert the parameters of conv3 are not in the computational graph - assert 'conv3.weight' in dummy_runner.logger.msg - assert 'conv3.bias' in dummy_runner.logger.msg - assert 'conv2.weight' not in dummy_runner.logger.msg - assert 'conv2.bias' not in dummy_runner.logger.msg - assert 'conv1.weight' not in dummy_runner.logger.msg - assert 'conv1.bias' not in dummy_runner.logger.msg - - -def test_checkpoint_hook(tmp_path): - """xdoctest -m tests/test_runner/test_hooks.py test_checkpoint_hook.""" - - # test epoch based runner - loader = DataLoader(torch.ones((5, 2))) - runner = _build_demo_runner('EpochBasedRunner', max_epochs=1) - runner.meta = dict() - checkpointhook = CheckpointHook(interval=1, by_epoch=True) - runner.register_hook(checkpointhook) - runner.run([loader], [('train', 1)]) - assert runner.meta['hook_msgs']['last_ckpt'] == osp.join( - runner.work_dir, 'epoch_1.pth') - shutil.rmtree(runner.work_dir) - - # test petrel oss when the type of runner is `EpochBasedRunner` - runner = _build_demo_runner('EpochBasedRunner', max_epochs=4) - runner.meta = dict() - out_dir = 's3://user/data' - with patch.object(PetrelBackend, 'put') as mock_put, \ - patch.object(PetrelBackend, 'remove') as mock_remove, \ - patch.object(PetrelBackend, 'isfile') as mock_isfile: - checkpointhook = CheckpointHook( - interval=1, out_dir=out_dir, by_epoch=True, max_keep_ckpts=2) - runner.register_hook(checkpointhook) - runner.run([loader], [('train', 1)]) - basename = osp.basename(runner.work_dir.rstrip(osp.sep)) - assert runner.meta['hook_msgs']['last_ckpt'] == \ - '/'.join([out_dir, basename, 'epoch_4.pth']) - mock_put.assert_called() - mock_remove.assert_called() - mock_isfile.assert_called() - shutil.rmtree(runner.work_dir) - - # test iter based runner - runner = _build_demo_runner( - 'IterBasedRunner', max_iters=1, max_epochs=None) - runner.meta = dict() - checkpointhook = CheckpointHook(interval=1, by_epoch=False) - runner.register_hook(checkpointhook) - runner.run([loader], [('train', 1)]) - assert runner.meta['hook_msgs']['last_ckpt'] == osp.join( - runner.work_dir, 'iter_1.pth') - shutil.rmtree(runner.work_dir) - - # test petrel oss when the type of runner is `IterBasedRunner` - runner = _build_demo_runner( - 'IterBasedRunner', max_iters=4, max_epochs=None) - runner.meta = dict() - out_dir = 's3://user/data' - with patch.object(PetrelBackend, 'put') as mock_put, \ - patch.object(PetrelBackend, 'remove') as mock_remove, \ - patch.object(PetrelBackend, 'isfile') as mock_isfile: - checkpointhook = CheckpointHook( - interval=1, out_dir=out_dir, by_epoch=False, max_keep_ckpts=2) - runner.register_hook(checkpointhook) - runner.run([loader], [('train', 1)]) - basename = osp.basename(runner.work_dir.rstrip(osp.sep)) - assert runner.meta['hook_msgs']['last_ckpt'] == \ - '/'.join([out_dir, basename, 'iter_4.pth']) - mock_put.assert_called() - mock_remove.assert_called() - mock_isfile.assert_called() - shutil.rmtree(runner.work_dir) - - -def test_ema_hook(): - """xdoctest -m tests/test_hooks.py test_ema_hook.""" - - class DemoModel(nn.Module): - - def __init__(self): - 
super().__init__() - self.conv = nn.Conv2d( - in_channels=1, - out_channels=2, - kernel_size=1, - padding=1, - bias=True) - self._init_weight() - - def _init_weight(self): - constant_(self.conv.weight, 0) - constant_(self.conv.bias, 0) - - def forward(self, x): - return self.conv(x).sum() - - def train_step(self, x, optimizer, **kwargs): - return dict(loss=self(x)) - - def val_step(self, x, optimizer, **kwargs): - return dict(loss=self(x)) - - loader = DataLoader(torch.ones((1, 1, 1, 1))) - runner = _build_demo_runner() - demo_model = DemoModel() - runner.model = demo_model - emahook = EMAHook(momentum=0.1, interval=2, warm_up=100, resume_from=None) - checkpointhook = CheckpointHook(interval=1, by_epoch=True) - runner.register_hook(emahook, priority='HIGHEST') - runner.register_hook(checkpointhook) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - checkpoint = torch.load(f'{runner.work_dir}/epoch_1.pth') - contain_ema_buffer = False - for name, value in checkpoint['state_dict'].items(): - if 'ema' in name: - contain_ema_buffer = True - assert value.sum() == 0 - value.fill_(1) - else: - assert value.sum() == 0 - assert contain_ema_buffer - torch.save(checkpoint, f'{runner.work_dir}/epoch_1.pth') - work_dir = runner.work_dir - resume_ema_hook = EMAHook( - momentum=0.5, warm_up=0, resume_from=f'{work_dir}/epoch_1.pth') - runner = _build_demo_runner(max_epochs=2) - runner.model = demo_model - runner.register_hook(resume_ema_hook, priority='HIGHEST') - checkpointhook = CheckpointHook(interval=1, by_epoch=True) - runner.register_hook(checkpointhook) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - checkpoint = torch.load(f'{runner.work_dir}/epoch_2.pth') - contain_ema_buffer = False - for name, value in checkpoint['state_dict'].items(): - if 'ema' in name: - contain_ema_buffer = True - assert value.sum() == 2 - else: - assert value.sum() == 1 - assert contain_ema_buffer - shutil.rmtree(runner.work_dir) - shutil.rmtree(work_dir) - - -def test_custom_hook(): - - @HOOKS.register_module() - class ToyHook(Hook): - - def __init__(self, info, *args, **kwargs): - super().__init__() - self.info = info - - runner = _build_demo_runner_without_hook('EpochBasedRunner', max_epochs=1) - # test if custom_hooks is None - runner.register_custom_hooks(None) - assert len(runner.hooks) == 0 - # test if custom_hooks is dict list - custom_hooks_cfg = [ - dict(type='ToyHook', priority=51, info=51), - dict(type='ToyHook', priority=49, info=49) - ] - runner.register_custom_hooks(custom_hooks_cfg) - assert [hook.info for hook in runner.hooks] == [49, 51] - # test if custom_hooks is object and without priority - runner.register_custom_hooks(ToyHook(info='default')) - assert len(runner.hooks) == 3 and runner.hooks[1].info == 'default' - shutil.rmtree(runner.work_dir) - - runner = _build_demo_runner_without_hook('EpochBasedRunner', max_epochs=1) - # test custom_hooks with string priority setting - priority_ranks = [ - 'HIGHEST', 'VERY_HIGH', 'HIGH', 'ABOVE_NORMAL', 'NORMAL', - 'BELOW_NORMAL', 'LOW', 'VERY_LOW', 'LOWEST' - ] - random_priority_ranks = priority_ranks.copy() - random.shuffle(random_priority_ranks) - custom_hooks_cfg = [ - dict(type='ToyHook', priority=rank, info=rank) - for rank in random_priority_ranks - ] - runner.register_custom_hooks(custom_hooks_cfg) - assert [hook.info for hook in runner.hooks] == priority_ranks - shutil.rmtree(runner.work_dir) - - runner = _build_demo_runner_without_hook('EpochBasedRunner', max_epochs=1) - # test register_training_hooks order - custom_hooks_cfg = [ 
- dict(type='ToyHook', priority=1, info='custom 1'), - dict(type='ToyHook', priority='NORMAL', info='custom normal'), - dict(type='ToyHook', priority=89, info='custom 89') - ] - runner.register_training_hooks( - lr_config=ToyHook('lr'), - optimizer_config=ToyHook('optimizer'), - checkpoint_config=ToyHook('checkpoint'), - log_config=dict(interval=1, hooks=[dict(type='ToyHook', info='log')]), - momentum_config=ToyHook('momentum'), - timer_config=ToyHook('timer'), - custom_hooks_config=custom_hooks_cfg) - # If custom hooks have same priority with default hooks, custom hooks - # will be triggered after default hooks. - hooks_order = [ - 'custom 1', 'lr', 'momentum', 'optimizer', 'checkpoint', - 'custom normal', 'timer', 'custom 89', 'log' - ] - assert [hook.info for hook in runner.hooks] == hooks_order - shutil.rmtree(runner.work_dir) - - -def test_pavi_hook(): - sys.modules['pavi'] = MagicMock() - - loader = DataLoader(torch.ones((5, 2))) - runner = _build_demo_runner() - runner.meta = dict(config_dict=dict(lr=0.02, gpu_ids=range(1))) - hook = PaviLoggerHook(add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - shutil.rmtree(runner.work_dir) - - assert hasattr(hook, 'writer') - hook.writer.add_scalars.assert_called_with('val', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 1) - # in Windows environment, the latest checkpoint is copied from epoch_1.pth - if platform.system() == 'Windows': - snapshot_file_path = osp.join(runner.work_dir, 'latest.pth') - else: - snapshot_file_path = osp.join(runner.work_dir, 'epoch_1.pth') - hook.writer.add_snapshot_file.assert_called_with( - tag=runner.work_dir.split('/')[-1], - snapshot_file_path=snapshot_file_path, - iteration=1) - - -def test_sync_buffers_hook(): - loader = DataLoader(torch.ones((5, 2))) - runner = _build_demo_runner() - runner.register_hook_from_cfg(dict(type='SyncBuffersHook')) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - shutil.rmtree(runner.work_dir) - - -@pytest.mark.parametrize('multi_optimizers, max_iters, gamma, cyclic_times', - [(True, 8, 1, 1), (False, 8, 0.5, 2)]) -def test_momentum_runner_hook(multi_optimizers, max_iters, gamma, - cyclic_times): - """xdoctest -m tests/test_hooks.py test_momentum_runner_hook.""" - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - hook_cfg = dict( - type='CyclicMomentumUpdaterHook', - by_epoch=False, - target_ratio=(0.85 / 0.95, 1), - cyclic_times=cyclic_times, - step_ratio_up=0.4, - gamma=gamma) - runner.register_hook_from_cfg(hook_cfg) - - # add momentum LR scheduler - hook_cfg = dict( - type='CyclicLrUpdaterHook', - by_epoch=False, - target_ratio=(10, 1), - cyclic_times=1, - step_ratio_up=0.4) - runner.register_hook_from_cfg(hook_cfg) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - # TODO: use a more elegant way to check values - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.01999999999999999, - 'learning_rate/model2': 0.009999999999999995, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.2, - 'learning_rate/model2': 0.1, - 'momentum/model1': 0.85, 
- 'momentum/model2': 0.8052631578947369, - }, 5), - call( - 'train', { - 'learning_rate/model1': 0.155, - 'learning_rate/model2': 0.0775, - 'momentum/model1': 0.875, - 'momentum/model2': 0.8289473684210527, - }, 7) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.01999999999999999, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.11, - 'momentum': 0.85 - }, 3), - call('train', { - 'learning_rate': 0.1879422863405995, - 'momentum': 0.95 - }, 6), - call('train', { - 'learning_rate': 0.11000000000000001, - 'momentum': 0.9 - }, 8), - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - # test constant momentum warmup - sys.modules['pavi'] = MagicMock() - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - hook_cfg = dict( - type='StepMomentumUpdaterHook', - by_epoch=False, - warmup='constant', - warmup_iters=5, - warmup_ratio=0.5, - step=[10], - ) - runner.register_hook_from_cfg(hook_cfg) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 1.9, - 'momentum/model2': 1.8, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 1.9, - 'momentum/model2': 1.8, - }, 5), - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 10), - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.02, - 'momentum': 1.9 - }, 1), - call('train', { - 'learning_rate': 0.02, - 'momentum': 1.9 - }, 5), - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 10), - ] - - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - # test linear momentum warmup - sys.modules['pavi'] = MagicMock() - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - hook_cfg = dict( - type='StepMomentumUpdaterHook', - by_epoch=False, - warmup='linear', - warmup_iters=5, - warmup_ratio=0.5, - step=[10], - ) - runner.register_hook_from_cfg(hook_cfg) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 1.9, - 'momentum/model2': 1.8, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 1.3571428571428572, - 'momentum/model2': 1.2857142857142858, - }, 3), - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 10), - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.02, - 'momentum': 1.9 - }, 1), - call('train', { - 'learning_rate': 0.02, - 'momentum': 1.3571428571428572 - }, 3), - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 10), - ] - - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - # test exponentially momentum warmup - 
sys.modules['pavi'] = MagicMock() - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - hook_cfg = dict( - type='StepMomentumUpdaterHook', - by_epoch=False, - warmup='exp', - warmup_iters=5, - warmup_ratio=0.5, - step=[10], - ) - runner.register_hook_from_cfg(hook_cfg) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 1.9, - 'momentum/model2': 1.8, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 1.4399307381848783, - 'momentum/model2': 1.3641449098593583, - }, 3), - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 10), - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.02, - 'momentum': 1.9 - }, 1), - call('train', { - 'learning_rate': 0.02, - 'momentum': 1.4399307381848783 - }, 3), - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 10), - ] - - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - -@pytest.mark.parametrize('multi_optimizers', (True, False)) -def test_cosine_runner_hook(multi_optimizers): - """xdoctest -m tests/test_hooks.py test_cosine_runner_hook.""" - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - hook_cfg = dict( - type='CosineAnnealingMomentumUpdaterHook', - min_momentum_ratio=0.99 / 0.95, - by_epoch=False, - warmup_iters=2, - warmup_ratio=0.9 / 0.95) - runner.register_hook_from_cfg(hook_cfg) - - # add momentum LR scheduler - hook_cfg = dict( - type='CosineAnnealingLrUpdaterHook', - by_epoch=False, - min_lr_ratio=0, - warmup_iters=2, - warmup_ratio=0.9) - runner.register_hook_from_cfg(hook_cfg) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - runner.register_hook(IterTimerHook()) - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - # TODO: use a more elegant way to check values - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.01, - 'learning_rate/model2': 0.005, - 'momentum/model1': 0.97, - 'momentum/model2': 0.9189473684210527, - }, 6), - call( - 'train', { - 'learning_rate/model1': 0.0004894348370484647, - 'learning_rate/model2': 0.00024471741852423234, - 'momentum/model1': 0.9890211303259032, - 'momentum/model2': 0.9369673866245399, - }, 10) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.01, - 'momentum': 0.97 - }, 6), - call( - 'train', { - 'learning_rate': 0.0004894348370484647, - 'momentum': 0.9890211303259032 - }, 10) - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - -@pytest.mark.parametrize('multi_optimizers', (True, False)) -def 
test_linear_runner_hook(multi_optimizers): - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - - hook_cfg = dict( - type='LinearAnnealingMomentumUpdaterHook', - min_momentum_ratio=0.99 / 0.95, - by_epoch=False, - warmup_iters=2, - warmup_ratio=0.9 / 0.95) - runner.register_hook_from_cfg(hook_cfg) - - # add momentum LR scheduler - hook_cfg = dict( - type='LinearAnnealingLrUpdaterHook', - by_epoch=False, - min_lr_ratio=0, - warmup_iters=2, - warmup_ratio=0.9) - runner.register_hook_from_cfg(hook_cfg) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - runner.register_hook(IterTimerHook()) - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - # TODO: use a more elegant way to check values - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.01, - 'learning_rate/model2': 0.005, - 'momentum/model1': 0.97, - 'momentum/model2': 0.9189473684210527, - }, 6), - call( - 'train', { - 'learning_rate/model1': 0.0019999999999999983, - 'learning_rate/model2': 0.0009999999999999992, - 'momentum/model1': 0.9860000000000001, - 'momentum/model2': 0.9341052631578949, - }, 10) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.01, - 'momentum': 0.97 - }, 6), - call( - 'train', { - 'learning_rate': 0.0019999999999999983, - 'momentum': 0.9860000000000001 - }, 10) - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - -@pytest.mark.parametrize('multi_optimizers, by_epoch', [(False, False), - (True, False), - (False, True), - (True, True)]) -def test_flat_cosine_runner_hook(multi_optimizers, by_epoch): - """xdoctest -m tests/test_hooks.py test_flat_cosine_runner_hook.""" - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - max_epochs = 10 if by_epoch else 1 - runner = _build_demo_runner( - multi_optimizers=multi_optimizers, max_epochs=max_epochs) - - with pytest.raises(ValueError): - # start_percent: expected float between 0 and 1 - FlatCosineAnnealingLrUpdaterHook(start_percent=-0.1, min_lr_ratio=0) - - # add LR scheduler - hook_cfg = dict( - type='FlatCosineAnnealingLrUpdaterHook', - by_epoch=by_epoch, - min_lr_ratio=0, - warmup='linear', - warmup_iters=10 if by_epoch else 2, - warmup_ratio=0.9, - start_percent=0.5) - runner.register_hook_from_cfg(hook_cfg) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - runner.register_hook(IterTimerHook()) - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - # TODO: use a more elegant way to check values - assert hasattr(hook, 'writer') - if multi_optimizers: - if by_epoch: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.018000000000000002, - 'learning_rate/model2': 0.009000000000000001, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 11), - call( - 
'train', { - 'learning_rate/model1': 0.018090169943749474, - 'learning_rate/model2': 0.009045084971874737, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 61), - call( - 'train', { - 'learning_rate/model1': 0.0019098300562505265, - 'learning_rate/model2': 0.0009549150281252633, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 100) - ] - else: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.018000000000000002, - 'learning_rate/model2': 0.009000000000000001, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9 - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9 - }, 6), - call( - 'train', { - 'learning_rate/model1': 0.018090169943749474, - 'learning_rate/model2': 0.009045084971874737, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9 - }, 7), - call( - 'train', { - 'learning_rate/model1': 0.0019098300562505265, - 'learning_rate/model2': 0.0009549150281252633, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9 - }, 10) - ] - else: - if by_epoch: - calls = [ - call('train', { - 'learning_rate': 0.018000000000000002, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 11), - call('train', { - 'learning_rate': 0.018090169943749474, - 'momentum': 0.95 - }, 61), - call('train', { - 'learning_rate': 0.0019098300562505265, - 'momentum': 0.95 - }, 100) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.018000000000000002, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 6), - call('train', { - 'learning_rate': 0.018090169943749474, - 'momentum': 0.95 - }, 7), - call('train', { - 'learning_rate': 0.0019098300562505265, - 'momentum': 0.95 - }, 10) - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - -@pytest.mark.skipif( - torch.__version__ == 'parrots', reason='not supported in parrots now') -@pytest.mark.parametrize('multi_optimizers, max_iters', [(True, 10), (True, 2), - (False, 10), - (False, 2)]) -def test_one_cycle_runner_hook(multi_optimizers, max_iters): - """Test OneCycleLrUpdaterHook and OneCycleMomentumUpdaterHook.""" - with pytest.raises(AssertionError): - # by_epoch should be False - OneCycleLrUpdaterHook(max_lr=0.1, by_epoch=True) - - with pytest.raises(ValueError): - # expected float between 0 and 1 - OneCycleLrUpdaterHook(max_lr=0.1, pct_start=-0.1) - - with pytest.raises(ValueError): - # anneal_strategy should be either 'cos' or 'linear' - OneCycleLrUpdaterHook(max_lr=0.1, anneal_strategy='sin') - - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - hook_cfg = dict( - type='OneCycleMomentumUpdaterHook', - base_momentum=0.85, - max_momentum=0.95, - pct_start=0.5, - anneal_strategy='cos', - three_phase=False) - runner.register_hook_from_cfg(hook_cfg) - - # add LR scheduler - hook_cfg = dict( - type='OneCycleLrUpdaterHook', - max_lr=0.01, - pct_start=0.5, - anneal_strategy='cos', - div_factor=25, - final_div_factor=1e4, - three_phase=False) - runner.register_hook_from_cfg(hook_cfg) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - runner.register_hook(IterTimerHook()) - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - # TODO: use a more elegant way to check values - 
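    # Where the endpoint values asserted below come from (standard OneCycle
    # arithmetic with three_phase=False, assumed to match
    # torch.optim.lr_scheduler.OneCycleLR):
    max_lr, div_factor, final_div_factor = 0.01, 25, 1e4
    initial_lr = max_lr / div_factor          # 4e-04, the lr seen near step 1
    final_lr = initial_lr / final_div_factor  # 4e-08, the lr at the last step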
assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.0003999999999999993, - 'learning_rate/model2': 0.0003999999999999993, - 'momentum/model1': 0.95, - 'momentum/model2': 0.95, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.00904508879153485, - 'learning_rate/model2': 0.00904508879153485, - 'momentum/model1': 0.8595491502812526, - 'momentum/model2': 0.8595491502812526, - }, 6), - call( - 'train', { - 'learning_rate/model1': 4e-08, - 'learning_rate/model2': 4e-08, - 'momentum/model1': 0.95, - 'momentum/model2': 0.95, - }, 10) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.0003999999999999993, - 'momentum': 0.95 - }, 1), - call( - 'train', { - 'learning_rate': 0.00904508879153485, - 'momentum': 0.8595491502812526 - }, 6), - call('train', { - 'learning_rate': 4e-08, - 'momentum': 0.95 - }, 10) - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - # Test OneCycleLrUpdaterHook - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner( - runner_type='IterBasedRunner', max_epochs=None, max_iters=max_iters) - - args = dict( - max_lr=0.01, - total_steps=5, - pct_start=0.5, - anneal_strategy='linear', - div_factor=25, - final_div_factor=1e4, - ) - hook = OneCycleLrUpdaterHook(**args) - runner.register_hook(hook) - if max_iters == 10: - # test total_steps < max_iters - with pytest.raises(ValueError): - runner.run([loader], [('train', 1)]) - else: - # test total_steps > max_iters - runner.run([loader], [('train', 1)]) - lr_last = runner.current_lr() - t = torch.tensor([0.0], requires_grad=True) - optim = torch.optim.SGD([t], lr=0.01) - lr_scheduler = torch.optim.lr_scheduler.OneCycleLR(optim, **args) - lr_target = [] - for _ in range(max_iters): - optim.step() - lr_target.append(optim.param_groups[0]['lr']) - lr_scheduler.step() - assert lr_target[-1] == lr_last[0] - - -@pytest.mark.parametrize('multi_optimizers', (True, False)) -def test_cosine_restart_lr_update_hook(multi_optimizers): - """Test CosineRestartLrUpdaterHook.""" - with pytest.raises(AssertionError): - # either `min_lr` or `min_lr_ratio` should be specified - CosineRestartLrUpdaterHook( - by_epoch=False, - periods=[2, 10], - restart_weights=[0.5, 0.5], - min_lr=0.1, - min_lr_ratio=0) - - with pytest.raises(AssertionError): - # periods and restart_weights should have the same length - CosineRestartLrUpdaterHook( - by_epoch=False, - periods=[2, 10], - restart_weights=[0.5], - min_lr_ratio=0) - - with pytest.raises(ValueError): - # the last cumulative_periods 7 (out of [5, 7]) should >= 10 - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner() - - # add cosine restart LR scheduler - hook = CosineRestartLrUpdaterHook( - by_epoch=False, - periods=[5, 2], # cumulative_periods [5, 7 (5 + 2)] - restart_weights=[0.5, 0.5], - min_lr=0.0001) - runner.register_hook(hook) - runner.register_hook(IterTimerHook()) - - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add cosine restart LR scheduler - hook = CosineRestartLrUpdaterHook( - by_epoch=False, - periods=[5, 5], - restart_weights=[0.5, 0.5], - min_lr_ratio=0) - runner.register_hook(hook) - 
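    # Sketch of the expected cosine-restart shape (an assumed reading of the
    # hook: restart i re-runs cosine annealing from base_lr * restart_weights[i]):
    from math import cos, pi
    start_lr = 0.02 * 0.5                            # 0.01 at the start of each restart
    lr_iter9 = start_lr * (1 + cos(pi * 4 / 5)) / 2  # iter 9 = 4th iter of period 2
    assert abs(lr_iter9 - 0.0009549150281252633) < 1e-12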
runner.register_hook(IterTimerHook()) - - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - # TODO: use a more elegant way to check values - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.01, - 'learning_rate/model2': 0.005, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.01, - 'learning_rate/model2': 0.005, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 6), - call( - 'train', { - 'learning_rate/model1': 0.0009549150281252633, - 'learning_rate/model2': 0.00047745751406263163, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 10) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.01, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.01, - 'momentum': 0.95 - }, 6), - call('train', { - 'learning_rate': 0.0009549150281252633, - 'momentum': 0.95 - }, 10) - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - -@pytest.mark.parametrize('multi_optimizers', (True, False)) -def test_step_runner_hook(multi_optimizers): - """Test StepLrUpdaterHook.""" - with pytest.raises(TypeError): - # `step` should be specified - StepLrUpdaterHook() - with pytest.raises(AssertionError): - # if `step` is int, should be positive - StepLrUpdaterHook(-10) - with pytest.raises(AssertionError): - # if `step` is list of int, should all be positive - StepLrUpdaterHook([10, 16, -20]) - - # test StepLrUpdaterHook with int `step` value - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((30, 2))) - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - hook_cfg = dict( - type='StepMomentumUpdaterHook', - by_epoch=False, - step=5, - gamma=0.5, - min_momentum=0.05) - runner.register_hook_from_cfg(hook_cfg) - - # add step LR scheduler - hook = StepLrUpdaterHook(by_epoch=False, step=5, gamma=0.5, min_lr=1e-3) - runner.register_hook(hook) - runner.register_hook(IterTimerHook()) - - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - # TODO: use a more elegant way to check values - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9 - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.01, - 'learning_rate/model2': 0.005, - 'momentum/model1': 0.475, - 'momentum/model2': 0.45 - }, 6), - call( - 'train', { - 'learning_rate/model1': 0.0025, - 'learning_rate/model2': 0.00125, - 'momentum/model1': 0.11875, - 'momentum/model2': 0.1125 - }, 16), - call( - 'train', { - 'learning_rate/model1': 0.00125, - 'learning_rate/model2': 0.001, - 'momentum/model1': 0.059375, - 'momentum/model2': 0.05625 - }, 21), - call( - 'train', { - 'learning_rate/model1': 0.001, - 'learning_rate/model2': 0.001, - 'momentum/model1': 0.05, - 'momentum/model2': 0.05 - }, 26), - call( - 'train', { - 'learning_rate/model1': 0.001, - 'learning_rate/model2': 0.001, - 'momentum/model1': 0.05, - 'momentum/model2': 0.05 - }, 30) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.01, - 'momentum': 0.475 - }, 
6), - call('train', { - 'learning_rate': 0.0025, - 'momentum': 0.11875 - }, 16), - call('train', { - 'learning_rate': 0.00125, - 'momentum': 0.059375 - }, 21), - call('train', { - 'learning_rate': 0.001, - 'momentum': 0.05 - }, 26), - call('train', { - 'learning_rate': 0.001, - 'momentum': 0.05 - }, 30) - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - # test StepLrUpdaterHook with list[int] `step` value - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner(multi_optimizers=multi_optimizers) - - # add momentum scheduler - hook_cfg = dict( - type='StepMomentumUpdaterHook', - by_epoch=False, - step=[4, 6, 8], - gamma=0.1) - runner.register_hook_from_cfg(hook_cfg) - - # add step LR scheduler - hook = StepLrUpdaterHook(by_epoch=False, step=[4, 6, 8], gamma=0.1) - runner.register_hook(hook) - runner.register_hook(IterTimerHook()) - - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - # TODO: use a more elegant way to check values - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9 - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.002, - 'learning_rate/model2': 0.001, - 'momentum/model1': 9.5e-2, - 'momentum/model2': 9.000000000000001e-2 - }, 5), - call( - 'train', { - 'learning_rate/model1': 2.0000000000000004e-4, - 'learning_rate/model2': 1.0000000000000002e-4, - 'momentum/model1': 9.500000000000001e-3, - 'momentum/model2': 9.000000000000003e-3 - }, 7), - call( - 'train', { - 'learning_rate/model1': 2.0000000000000005e-05, - 'learning_rate/model2': 1.0000000000000003e-05, - 'momentum/model1': 9.500000000000002e-4, - 'momentum/model2': 9.000000000000002e-4 - }, 9) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.002, - 'momentum': 0.095 - }, 5), - call( - 'train', { - 'learning_rate': 2.0000000000000004e-4, - 'momentum': 9.500000000000001e-3 - }, 7), - call( - 'train', { - 'learning_rate': 2.0000000000000005e-05, - 'momentum': 9.500000000000002e-4 - }, 9) - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - -@pytest.mark.parametrize('multi_optimizers, max_iters, gamma, cyclic_times', - [(True, 8, 1, 1), (False, 8, 0.5, 2)]) -def test_cyclic_lr_update_hook(multi_optimizers, max_iters, gamma, - cyclic_times): - """Test CyclicLrUpdateHook.""" - with pytest.raises(AssertionError): - # by_epoch should be False - CyclicLrUpdaterHook(by_epoch=True) - - with pytest.raises(AssertionError): - # target_ratio must be either float or tuple/list of two floats - CyclicLrUpdaterHook(by_epoch=False, target_ratio=(10.0, 0.1, 0.2)) - - with pytest.raises(AssertionError): - # step_ratio_up must be in range [0,1) - CyclicLrUpdaterHook(by_epoch=False, step_ratio_up=1.4) - - with pytest.raises(ValueError): - # anneal_strategy must be one of "cos" or "linear" - CyclicLrUpdaterHook(by_epoch=False, anneal_strategy='sin') - - with pytest.raises(AssertionError): - # gamma must be in range (0, 1] - CyclicLrUpdaterHook(by_epoch=False, gamma=0) - - sys.modules['pavi'] = MagicMock() - loader = DataLoader(torch.ones((10, 2))) - runner = _build_demo_runner( - runner_type='IterBasedRunner', - max_epochs=None, - max_iters=max_iters, - 
multi_optimizers=multi_optimizers) - - # add cyclic LR scheduler - schedule_hook = CyclicLrUpdaterHook( - by_epoch=False, - target_ratio=(10.0, 1.0), - cyclic_times=cyclic_times, - step_ratio_up=0.5, - anneal_strategy='linear', - gamma=gamma) - runner.register_hook(schedule_hook) - runner.register_hook_from_cfg(dict(type='IterTimerHook')) - runner.register_hook(IterTimerHook()) - # add pavi hook - hook = PaviLoggerHook(interval=1, add_graph=False, add_last_ckpt=True) - runner.register_hook(hook) - runner.run([loader], [('train', 1)]) - shutil.rmtree(runner.work_dir) - - assert hasattr(hook, 'writer') - if multi_optimizers: - calls = [ - call( - 'train', { - 'learning_rate/model1': 0.02, - 'learning_rate/model2': 0.01, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 1), - call( - 'train', { - 'learning_rate/model1': 0.155, - 'learning_rate/model2': 0.0775, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 4), - call( - 'train', { - 'learning_rate/model1': 0.155, - 'learning_rate/model2': 0.0775, - 'momentum/model1': 0.95, - 'momentum/model2': 0.9, - }, 6) - ] - else: - calls = [ - call('train', { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, 1), - call('train', { - 'learning_rate': 0.11, - 'momentum': 0.95 - }, 4), - call('train', { - 'learning_rate': 0.065, - 'momentum': 0.95 - }, 6), - call('train', { - 'learning_rate': 0.11, - 'momentum': 0.95 - }, 7), - ] - hook.writer.add_scalars.assert_has_calls(calls, any_order=True) - - -@pytest.mark.parametrize('log_model', (True, False)) -def test_mlflow_hook(log_model): - sys.modules['mlflow'] = MagicMock() - sys.modules['mlflow.pytorch'] = MagicMock() - - runner = _build_demo_runner() - loader = DataLoader(torch.ones((5, 2))) - - hook = MlflowLoggerHook(exp_name='test', log_model=log_model) - runner.register_hook(hook) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - shutil.rmtree(runner.work_dir) - - hook.mlflow.set_experiment.assert_called_with('test') - hook.mlflow.log_metrics.assert_called_with( - { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, step=6) - if log_model: - hook.mlflow_pytorch.log_model.assert_called_with( - runner.model, - 'models', - pip_requirements=[f'torch=={TORCH_VERSION}']) - else: - assert not hook.mlflow_pytorch.log_model.called - - -def test_segmind_hook(): - sys.modules['segmind'] = MagicMock() - runner = _build_demo_runner() - hook = SegmindLoggerHook() - loader = DataLoader(torch.ones((5, 2))) - - runner.register_hook(hook) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - shutil.rmtree(runner.work_dir) - - hook.mlflow_log.assert_called_with( - hook.log_metrics, { - 'learning_rate': 0.02, - 'momentum': 0.95 - }, - step=runner.epoch, - epoch=runner.epoch) - - -def test_wandb_hook(): - sys.modules['wandb'] = MagicMock() - runner = _build_demo_runner() - hook = WandbLoggerHook(log_artifact=True) - loader = DataLoader(torch.ones((5, 2))) - - runner.register_hook(hook) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - - shutil.rmtree(runner.work_dir) - - hook.wandb.init.assert_called_with() - hook.wandb.log.assert_called_with({ - 'learning_rate': 0.02, - 'momentum': 0.95 - }, - step=6, - commit=True) - hook.wandb.log_artifact.assert_called() - hook.wandb.join.assert_called_with() - - -def test_neptune_hook(): - sys.modules['neptune'] = MagicMock() - sys.modules['neptune.new'] = MagicMock() - runner = _build_demo_runner() - hook = NeptuneLoggerHook() - - loader = DataLoader(torch.ones((5, 2))) - - runner.register_hook(hook) - runner.run([loader, loader], [('train', 
1), ('val', 1)]) - shutil.rmtree(runner.work_dir) - - hook.neptune.init.assert_called_with() - hook.run['momentum'].log.assert_called_with(0.95, step=6) - hook.run.stop.assert_called_with() - - -def test_dvclive_hook(): - sys.modules['dvclive'] = MagicMock() - runner = _build_demo_runner() - - hook = DvcliveLoggerHook() - dvclive_mock = hook.dvclive - loader = DataLoader(torch.ones((5, 2))) - - runner.register_hook(hook) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - shutil.rmtree(runner.work_dir) - - dvclive_mock.set_step.assert_called_with(6) - dvclive_mock.log.assert_called_with('momentum', 0.95) - - -def test_dvclive_hook_model_file(tmp_path): - sys.modules['dvclive'] = MagicMock() - runner = _build_demo_runner() - - hook = DvcliveLoggerHook(model_file=osp.join(runner.work_dir, 'model.pth')) - runner.register_hook(hook) - - loader = torch.utils.data.DataLoader(torch.ones((5, 2))) - loader = DataLoader(torch.ones((5, 2))) - - runner.run([loader, loader], [('train', 1), ('val', 1)]) - - assert osp.exists(osp.join(runner.work_dir, 'model.pth')) - - shutil.rmtree(runner.work_dir) - - -def test_clearml_hook(): - sys.modules['clearml'] = MagicMock() - runner = _build_demo_runner() - hook = ClearMLLoggerHook(init_kwargs={ - 'project_name': 'proj', - 'task_name': 'task', - }) - - loader = DataLoader(torch.ones((5, 2))) - - runner.register_hook(hook) - runner.run([loader, loader], [('train', 1), ('val', 1)]) - shutil.rmtree(runner.work_dir) - - hook.clearml.Task.init.assert_called_with( - project_name='proj', task_name='task') - hook.task.get_logger.assert_called_with() - report_scalar_calls = [ - call('momentum', 'momentum', 0.95, 6), - call('learning_rate', 'learning_rate', 0.02, 6), - ] - hook.task_logger.report_scalar.assert_has_calls( - report_scalar_calls, any_order=True) - - -def _build_demo_runner_without_hook(runner_type='EpochBasedRunner', - max_epochs=1, - max_iters=None, - multi_optimizers=False): - - class Model(nn.Module): - - def __init__(self): - super().__init__() - self.linear = nn.Linear(2, 1) - self.conv = nn.Conv2d(3, 3, 3) - - def forward(self, x): - return self.linear(x) - - def train_step(self, x, optimizer, **kwargs): - return dict(loss=self(x)) - - def val_step(self, x, optimizer, **kwargs): - return dict(loss=self(x)) - - model = Model() - - if multi_optimizers: - optimizer = { - 'model1': - torch.optim.SGD(model.linear.parameters(), lr=0.02, momentum=0.95), - 'model2': - torch.optim.SGD(model.conv.parameters(), lr=0.01, momentum=0.9), - } - else: - optimizer = torch.optim.SGD(model.parameters(), lr=0.02, momentum=0.95) - - tmp_dir = tempfile.mkdtemp() - runner = build_runner( - dict(type=runner_type), - default_args=dict( - model=model, - work_dir=tmp_dir, - optimizer=optimizer, - logger=logging.getLogger(), - max_epochs=max_epochs, - max_iters=max_iters)) - return runner - - -def _build_demo_runner(runner_type='EpochBasedRunner', - max_epochs=1, - max_iters=None, - multi_optimizers=False): - log_config = dict( - interval=1, hooks=[ - dict(type='TextLoggerHook'), - ]) - - runner = _build_demo_runner_without_hook(runner_type, max_epochs, - max_iters, multi_optimizers) - - runner.register_checkpoint_hook(dict(interval=1)) - runner.register_logger_hooks(log_config) - return runner - - -def test_runner_with_revise_keys(): - import os - - class Model(nn.Module): - - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(3, 3, 1) - - class PrefixModel(nn.Module): - - def __init__(self): - super().__init__() - self.backbone = Model() - - pmodel = 
PrefixModel() - model = Model() - checkpoint_path = os.path.join(tempfile.gettempdir(), 'checkpoint.pth') - - # add prefix - torch.save(model.state_dict(), checkpoint_path) - runner = _build_demo_runner(runner_type='EpochBasedRunner') - runner.model = pmodel - state_dict = runner.load_checkpoint( - checkpoint_path, revise_keys=[(r'^', 'backbone.')]) - for key in pmodel.backbone.state_dict().keys(): - assert torch.equal(pmodel.backbone.state_dict()[key], state_dict[key]) - # strip prefix - torch.save(pmodel.state_dict(), checkpoint_path) - runner.model = model - state_dict = runner.load_checkpoint( - checkpoint_path, revise_keys=[(r'^backbone\.', '')]) - for key in state_dict.keys(): - key_stripped = re.sub(r'^backbone\.', '', key) - assert torch.equal(model.state_dict()[key_stripped], state_dict[key]) - os.remove(checkpoint_path) - - -def test_get_triggered_stages(): - - class ToyHook(Hook): - # test normal stage - def before_run(): - pass - - # test the method mapped to multi stages. - def after_epoch(): - pass - - hook = ToyHook() - # stages output have order, so here is list instead of set. - expected_stages = ['before_run', 'after_train_epoch', 'after_val_epoch'] - assert hook.get_triggered_stages() == expected_stages - - -def test_gradient_cumulative_optimizer_hook(): - - class ToyModel(nn.Module): - - def __init__(self, with_norm=False): - super().__init__() - self.fp16_enabled = False - self.fc = nn.Linear(3, 2) - nn.init.constant_(self.fc.weight, 1.) - nn.init.constant_(self.fc.bias, 1.) - self.with_norm = with_norm - if with_norm: - self.norm = nn.BatchNorm1d(2) - - def forward(self, x): - x = self.fc(x) - if self.with_norm: - x = self.norm(x) - return x - - def train_step(self, x, optimizer, **kwargs): - return dict(loss=self(x).mean(), num_samples=x.shape[0]) - - def val_step(self, x, optimizer, **kwargs): - return dict(loss=self(x).mean(), num_samples=x.shape[0]) - - def build_toy_runner(config=dict(type='EpochBasedRunner', max_epochs=3)): - model = ToyModel() - optimizer = torch.optim.SGD(model.parameters(), lr=0.02) - tmp_dir = tempfile.mkdtemp() - - runner = build_runner( - config, - default_args=dict( - model=model, - work_dir=tmp_dir, - optimizer=optimizer, - logger=logging.getLogger(), - meta=dict())) - return runner - - with pytest.raises(AssertionError): - # cumulative_iters only accepts int - GradientCumulativeOptimizerHook(cumulative_iters='str') - - with pytest.raises(AssertionError): - # cumulative_iters only accepts positive number - GradientCumulativeOptimizerHook(cumulative_iters=-1) - - # test epoch based runner - data = torch.rand((6, 3)) - # optimize with cumulative_iters - loader_1 = DataLoader(data, batch_size=1) - runner_1 = build_toy_runner() - optimizer_hook = GradientCumulativeOptimizerHook( - grad_clip=dict(max_norm=0.2), cumulative_iters=3) - runner_1.register_hook(optimizer_hook) - runner_1.run([loader_1], [('train', 1)]) - - # optimize without cumulative_iters - loader_2 = DataLoader(data, batch_size=3) - runner_2 = build_toy_runner() - optimizer_hook = OptimizerHook(grad_clip=dict(max_norm=0.2)) - runner_2.register_hook(optimizer_hook) - runner_2.run([loader_2], [('train', 1)]) - - # test optimizer works well - assert (runner_1.model.fc.weight < 1).all() - assert (runner_1.model.fc.bias < 1).all() - # test optimizer with cumulative_iters gets the same results - assert torch.allclose(runner_1.model.fc.weight, runner_2.model.fc.weight) - assert torch.allclose(runner_1.model.fc.bias, runner_2.model.fc.bias) - shutil.rmtree(runner_1.work_dir) - 
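    # Why the two runs should agree (a sketch; the division below is an
    # assumption about how GradientCumulativeOptimizerHook normalises the
    # loss): with cumulative_iters=3 and batch_size=1 each backward
    # contributes grad_i / 3, whose sum equals the mean gradient of one
    # batch_size=3 step.
    g = torch.tensor([1.0, 2.0, 3.0])  # stand-in per-sample gradients
    assert torch.allclose(sum(gi / 3 for gi in g), g.mean())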
-    shutil.rmtree(runner_2.work_dir)
-
-    # test iter based runner
-    data = torch.rand((8, 3))
-    # optimize with cumulative_iters
-    loader_1 = DataLoader(data, batch_size=1)
-    runner_1 = build_toy_runner(dict(type='IterBasedRunner', max_iters=8))
-    optimizer_hook = GradientCumulativeOptimizerHook(
-        grad_clip=dict(max_norm=0.2), cumulative_iters=3)
-    runner_1.register_hook(optimizer_hook)
-    runner_1.run([loader_1], [('train', 1)])
-
-    # optimize without cumulative_iters
-    loader_2_divisible = DataLoader(data[:6], batch_size=3)
-    loader_2_remainder = DataLoader(data[6:], batch_size=2)
-    runner_2 = build_toy_runner(dict(type='IterBasedRunner', max_iters=3))
-    optimizer_hook = OptimizerHook(grad_clip=dict(max_norm=0.2))
-    runner_2.register_hook(optimizer_hook)
-    runner_2.run([loader_2_divisible, loader_2_remainder], [('train', 2),
-                                                            ('train', 1)])
-
-    # test optimizer works well
-    assert (runner_1.model.fc.weight < 1).all()
-    assert (runner_1.model.fc.bias < 1).all()
-    # test optimizer with cumulative_iters gets the same results
-    assert torch.allclose(runner_1.model.fc.weight, runner_2.model.fc.weight)
-    assert torch.allclose(runner_1.model.fc.bias, runner_2.model.fc.bias)
-    shutil.rmtree(runner_1.work_dir)
-    shutil.rmtree(runner_2.work_dir)
-
-    # test has_batch_norm
-    model = ToyModel(with_norm=True)
-    optimizer_hook = GradientCumulativeOptimizerHook(
-        grad_clip=dict(max_norm=0.2), cumulative_iters=3)
-    assert optimizer_hook.has_batch_norm(model)
-
-
-@pytest.mark.skipif(
-    not torch.cuda.is_available(), reason='requires CUDA support')
-def test_gradient_cumulative_fp16_optimizer_hook():
-
-    class ToyModel(nn.Module):
-
-        def __init__(self):
-            super().__init__()
-            self.fp16_enabled = False
-            self.fc = nn.Linear(3, 2)
-            nn.init.constant_(self.fc.weight, 1.)
-            nn.init.constant_(self.fc.bias, 1.)
- - @auto_fp16(apply_to=('x', )) - def forward(self, x): - x = self.fc(x) - return x - - def train_step(self, x, optimizer, **kwargs): - return dict(loss=self(x).mean(), num_samples=x.shape[0]) - - def val_step(self, x, optimizer, **kwargs): - return dict(loss=self(x).mean(), num_samples=x.shape[0]) - - def build_toy_runner(config=dict(type='EpochBasedRunner', max_epochs=3)): - model = ToyModel().cuda() - optimizer = torch.optim.SGD(model.parameters(), lr=0.02) - tmp_dir = tempfile.mkdtemp() - - runner = build_runner( - config, - default_args=dict( - model=model, - work_dir=tmp_dir, - optimizer=optimizer, - logger=logging.getLogger(), - meta=dict())) - return runner - - # test epoch based runner - data = torch.rand((6, 3)).cuda() - # optimize with cumulative_iters - loader_1 = DataLoader(data, batch_size=1) - runner_1 = build_toy_runner() - optimizer_hook = GradientCumulativeFp16OptimizerHook( - grad_clip=dict(max_norm=0.2), cumulative_iters=3) - runner_1.register_hook(optimizer_hook) - runner_1.run([loader_1], [('train', 1)]) - - # optimize without cumulative_iters - loader_2 = DataLoader(data, batch_size=3) - runner_2 = build_toy_runner() - optimizer_hook = Fp16OptimizerHook(grad_clip=dict(max_norm=0.2)) - runner_2.register_hook(optimizer_hook) - runner_2.run([loader_2], [('train', 1)]) - - # test optimizer works well - assert (runner_1.model.fc.weight < 1).all() - assert (runner_1.model.fc.bias < 1).all() - # test optimizer with cumulative_iters gets the same results - assert torch.allclose(runner_1.model.fc.weight, runner_2.model.fc.weight) - assert torch.allclose(runner_1.model.fc.bias, runner_2.model.fc.bias) - shutil.rmtree(runner_1.work_dir) - shutil.rmtree(runner_2.work_dir) - - # test iter based runner - data = torch.rand((8, 3)).cuda() - # optimize with cumulative_iters - loader_1 = DataLoader(data, batch_size=1) - runner_1 = build_toy_runner(dict(type='IterBasedRunner', max_iters=8)) - optimizer_hook = GradientCumulativeFp16OptimizerHook( - grad_clip=dict(max_norm=0.2), cumulative_iters=3) - runner_1.register_hook(optimizer_hook) - runner_1.run([loader_1], [('train', 1)]) - - # optimize without cumulative_iters - loader_2_divisible = DataLoader(data[:6], batch_size=3) - loader_2_remainder = DataLoader(data[6:], batch_size=2) - runner_2 = build_toy_runner(dict(type='IterBasedRunner', max_iters=3)) - optimizer_hook = Fp16OptimizerHook(grad_clip=dict(max_norm=0.2)) - runner_2.register_hook(optimizer_hook) - runner_2.run([loader_2_divisible, loader_2_remainder], [('train', 2), - ('train', 1)]) - - # test optimizer works well - assert (runner_1.model.fc.weight < 1).all() - assert (runner_1.model.fc.bias < 1).all() - # test optimizer with cumulative_iters gets the same results - assert torch.allclose(runner_1.model.fc.weight, runner_2.model.fc.weight) - assert torch.allclose(runner_1.model.fc.bias, runner_2.model.fc.bias) - shutil.rmtree(runner_1.work_dir) - shutil.rmtree(runner_2.work_dir) diff --git a/tests/test_runner/test_optimizer.py b/tests/test_runner/test_optimizer.py deleted file mode 100644 index 724f45db96..0000000000 --- a/tests/test_runner/test_optimizer.py +++ /dev/null @@ -1,640 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
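# The tests in this file exercise DefaultOptimizerConstructor, which expands a
# flat optimizer_cfg into per-parameter groups. A minimal sketch of the idea
# (the bias multiplier and module names here are illustrative, not the
# constructor's actual code):
import torch
import torch.nn as nn

net = nn.Sequential(nn.Conv2d(3, 4, 1), nn.BatchNorm2d(4))
lr, bias_lr_mult = 0.01, 2
groups = []
for name, p in net.named_parameters():
    group = {'params': [p]}
    if name.endswith('.bias'):
        group['lr'] = lr * bias_lr_mult  # bias params get a scaled lr
    groups.append(group)
sgd = torch.optim.SGD(groups, lr=lr, momentum=0.9)  # group lr overrides default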
-import sys -import warnings -from unittest.mock import MagicMock - -import pytest -import torch -import torch.nn as nn - -from mmcv.runner import OPTIMIZER_BUILDERS, DefaultOptimizerConstructor -from mmcv.runner.optimizer import build_optimizer, build_optimizer_constructor -from mmcv.runner.optimizer.builder import TORCH_OPTIMIZERS -from mmcv.utils.ext_loader import check_ops_exist - -OPS_AVAILABLE = check_ops_exist() -if not OPS_AVAILABLE: - sys.modules['mmcv.ops'] = MagicMock( - DeformConv2d=dict, ModulatedDeformConv2d=dict) - - -class SubModel(nn.Module): - - def __init__(self): - super().__init__() - self.conv1 = nn.Conv2d(2, 2, kernel_size=1, groups=2) - self.gn = nn.GroupNorm(2, 2) - self.param1 = nn.Parameter(torch.ones(1)) - - def forward(self, x): - return x - - -class ExampleModel(nn.Module): - - def __init__(self): - super().__init__() - self.param1 = nn.Parameter(torch.ones(1)) - self.conv1 = nn.Conv2d(3, 4, kernel_size=1, bias=False) - self.conv2 = nn.Conv2d(4, 2, kernel_size=1) - self.bn = nn.BatchNorm2d(2) - self.sub = SubModel() - if OPS_AVAILABLE: - from mmcv.ops import DeformConv2dPack - self.dcn = DeformConv2dPack( - 3, 4, kernel_size=3, deformable_groups=1) - - def forward(self, x): - return x - - -class ExampleDuplicateModel(nn.Module): - - def __init__(self): - super().__init__() - self.param1 = nn.Parameter(torch.ones(1)) - self.conv1 = nn.Sequential(nn.Conv2d(3, 4, kernel_size=1, bias=False)) - self.conv2 = nn.Sequential(nn.Conv2d(4, 2, kernel_size=1)) - self.bn = nn.BatchNorm2d(2) - self.sub = SubModel() - self.conv3 = nn.Sequential(nn.Conv2d(3, 4, kernel_size=1, bias=False)) - self.conv3[0] = self.conv1[0] - if OPS_AVAILABLE: - from mmcv.ops import DeformConv2dPack - self.dcn = DeformConv2dPack( - 3, 4, kernel_size=3, deformable_groups=1) - - def forward(self, x): - return x - - -class PseudoDataParallel(nn.Module): - - def __init__(self): - super().__init__() - self.module = ExampleModel() - - def forward(self, x): - return x - - -base_lr = 0.01 -base_wd = 0.0001 -momentum = 0.9 - - -def check_default_optimizer(optimizer, model, prefix=''): - assert isinstance(optimizer, torch.optim.SGD) - assert optimizer.defaults['lr'] == base_lr - assert optimizer.defaults['momentum'] == momentum - assert optimizer.defaults['weight_decay'] == base_wd - param_groups = optimizer.param_groups[0] - if OPS_AVAILABLE: - param_names = [ - 'param1', 'conv1.weight', 'conv2.weight', 'conv2.bias', - 'bn.weight', 'bn.bias', 'sub.param1', 'sub.conv1.weight', - 'sub.conv1.bias', 'sub.gn.weight', 'sub.gn.bias', 'dcn.weight', - 'dcn.conv_offset.weight', 'dcn.conv_offset.bias' - ] - else: - param_names = [ - 'param1', 'conv1.weight', 'conv2.weight', 'conv2.bias', - 'bn.weight', 'bn.bias', 'sub.param1', 'sub.conv1.weight', - 'sub.conv1.bias', 'sub.gn.weight', 'sub.gn.bias' - ] - param_dict = dict(model.named_parameters()) - assert len(param_groups['params']) == len(param_names) - for i in range(len(param_groups['params'])): - assert torch.equal(param_groups['params'][i], - param_dict[prefix + param_names[i]]) - - -def check_sgd_optimizer(optimizer, - model, - prefix='', - bias_lr_mult=1, - bias_decay_mult=1, - norm_decay_mult=1, - dwconv_decay_mult=1, - dcn_offset_lr_mult=1, - bypass_duplicate=False): - param_groups = optimizer.param_groups - assert isinstance(optimizer, torch.optim.SGD) - assert optimizer.defaults['lr'] == base_lr - assert optimizer.defaults['momentum'] == momentum - assert optimizer.defaults['weight_decay'] == base_wd - model_parameters = list(model.parameters()) - assert 
len(param_groups) == len(model_parameters) - for i, param in enumerate(model_parameters): - param_group = param_groups[i] - assert torch.equal(param_group['params'][0], param) - assert param_group['momentum'] == momentum - - # param1 - param1 = param_groups[0] - assert param1['lr'] == base_lr - assert param1['weight_decay'] == base_wd - # conv1.weight - conv1_weight = param_groups[1] - assert conv1_weight['lr'] == base_lr - assert conv1_weight['weight_decay'] == base_wd - # conv2.weight - conv2_weight = param_groups[2] - assert conv2_weight['lr'] == base_lr - assert conv2_weight['weight_decay'] == base_wd - # conv2.bias - conv2_bias = param_groups[3] - assert conv2_bias['lr'] == base_lr * bias_lr_mult - assert conv2_bias['weight_decay'] == base_wd * bias_decay_mult - # bn.weight - bn_weight = param_groups[4] - assert bn_weight['lr'] == base_lr - assert bn_weight['weight_decay'] == base_wd * norm_decay_mult - # bn.bias - bn_bias = param_groups[5] - assert bn_bias['lr'] == base_lr - assert bn_bias['weight_decay'] == base_wd * norm_decay_mult - # sub.param1 - sub_param1 = param_groups[6] - assert sub_param1['lr'] == base_lr - assert sub_param1['weight_decay'] == base_wd - # sub.conv1.weight - sub_conv1_weight = param_groups[7] - assert sub_conv1_weight['lr'] == base_lr - assert sub_conv1_weight['weight_decay'] == base_wd * dwconv_decay_mult - # sub.conv1.bias - sub_conv1_bias = param_groups[8] - assert sub_conv1_bias['lr'] == base_lr * bias_lr_mult - assert sub_conv1_bias['weight_decay'] == base_wd * dwconv_decay_mult - # sub.gn.weight - sub_gn_weight = param_groups[9] - assert sub_gn_weight['lr'] == base_lr - assert sub_gn_weight['weight_decay'] == base_wd * norm_decay_mult - # sub.gn.bias - sub_gn_bias = param_groups[10] - assert sub_gn_bias['lr'] == base_lr - assert sub_gn_bias['weight_decay'] == base_wd * norm_decay_mult - - if torch.cuda.is_available(): - dcn_conv_weight = param_groups[11] - assert dcn_conv_weight['lr'] == base_lr - assert dcn_conv_weight['weight_decay'] == base_wd - - dcn_offset_weight = param_groups[12] - assert dcn_offset_weight['lr'] == base_lr * dcn_offset_lr_mult - assert dcn_offset_weight['weight_decay'] == base_wd - - dcn_offset_bias = param_groups[13] - assert dcn_offset_bias['lr'] == base_lr * dcn_offset_lr_mult - assert dcn_offset_bias['weight_decay'] == base_wd - - -def test_default_optimizer_constructor(): - model = ExampleModel() - - with pytest.raises(TypeError): - # optimizer_cfg must be a dict - optimizer_cfg = [] - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg) - optim_constructor(model) - - with pytest.raises(TypeError): - # paramwise_cfg must be a dict or None - optimizer_cfg = dict(lr=0.0001) - paramwise_cfg = ['error'] - optim_constructor = DefaultOptimizerConstructor( - optimizer_cfg, paramwise_cfg) - optim_constructor(model) - - with pytest.raises(ValueError): - # bias_decay_mult/norm_decay_mult is specified but weight_decay is None - optimizer_cfg = dict(lr=0.0001, weight_decay=None) - paramwise_cfg = dict(bias_decay_mult=1, norm_decay_mult=1) - optim_constructor = DefaultOptimizerConstructor( - optimizer_cfg, paramwise_cfg) - optim_constructor(model) - - # basic config with ExampleModel - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg) - optimizer = optim_constructor(model) - check_default_optimizer(optimizer, model) - - # basic config with pseudo data parallel - model = PseudoDataParallel() - optimizer_cfg = dict( - 
type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = None - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg) - optimizer = optim_constructor(model) - check_default_optimizer(optimizer, model, prefix='module.') - - # basic config with DataParallel - if torch.cuda.is_available(): - model = torch.nn.DataParallel(ExampleModel()) - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = None - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg) - optimizer = optim_constructor(model) - check_default_optimizer(optimizer, model, prefix='module.') - - # Empty paramwise_cfg with ExampleModel - model = ExampleModel() - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = dict() - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg, - paramwise_cfg) - optimizer = optim_constructor(model) - check_default_optimizer(optimizer, model) - - # Empty paramwise_cfg with ExampleModel and no grad - model = ExampleModel() - for param in model.parameters(): - param.requires_grad = False - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = dict() - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg) - optimizer = optim_constructor(model) - check_default_optimizer(optimizer, model) - - # paramwise_cfg with ExampleModel - model = ExampleModel() - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = dict( - bias_lr_mult=2, - bias_decay_mult=0.5, - norm_decay_mult=0, - dwconv_decay_mult=0.1, - dcn_offset_lr_mult=0.1) - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg, - paramwise_cfg) - optimizer = optim_constructor(model) - check_sgd_optimizer(optimizer, model, **paramwise_cfg) - - # paramwise_cfg with ExampleModel, weight decay is None - model = ExampleModel() - optimizer_cfg = dict(type='Rprop', lr=base_lr) - paramwise_cfg = dict(bias_lr_mult=2) - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg, - paramwise_cfg) - optimizer = optim_constructor(model) - - param_groups = optimizer.param_groups - assert isinstance(optimizer, torch.optim.Rprop) - assert optimizer.defaults['lr'] == base_lr - model_parameters = list(model.parameters()) - assert len(param_groups) == len(model_parameters) - for i, param in enumerate(model_parameters): - param_group = param_groups[i] - assert torch.equal(param_group['params'][0], param) - # param1 - assert param_groups[0]['lr'] == base_lr - # conv1.weight - assert param_groups[1]['lr'] == base_lr - # conv2.weight - assert param_groups[2]['lr'] == base_lr - # conv2.bias - assert param_groups[3]['lr'] == base_lr * paramwise_cfg['bias_lr_mult'] - # bn.weight - assert param_groups[4]['lr'] == base_lr - # bn.bias - assert param_groups[5]['lr'] == base_lr - # sub.param1 - assert param_groups[6]['lr'] == base_lr - # sub.conv1.weight - assert param_groups[7]['lr'] == base_lr - # sub.conv1.bias - assert param_groups[8]['lr'] == base_lr * paramwise_cfg['bias_lr_mult'] - # sub.gn.weight - assert param_groups[9]['lr'] == base_lr - # sub.gn.bias - assert param_groups[10]['lr'] == base_lr - - if OPS_AVAILABLE: - # dcn.weight - assert param_groups[11]['lr'] == base_lr - # dcn.conv_offset.weight - assert param_groups[12]['lr'] == base_lr - # dcn.conv_offset.bias - assert param_groups[13]['lr'] == base_lr - - # paramwise_cfg with pseudo data parallel - model = PseudoDataParallel() - optimizer_cfg 
= dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = dict( - bias_lr_mult=2, - bias_decay_mult=0.5, - norm_decay_mult=0, - dwconv_decay_mult=0.1, - dcn_offset_lr_mult=0.1) - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg, - paramwise_cfg) - optimizer = optim_constructor(model) - check_sgd_optimizer(optimizer, model, prefix='module.', **paramwise_cfg) - - # paramwise_cfg with DataParallel - if torch.cuda.is_available(): - model = torch.nn.DataParallel(ExampleModel()) - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = dict( - bias_lr_mult=2, - bias_decay_mult=0.5, - norm_decay_mult=0, - dwconv_decay_mult=0.1, - dcn_offset_lr_mult=0.1) - optim_constructor = DefaultOptimizerConstructor( - optimizer_cfg, paramwise_cfg) - optimizer = optim_constructor(model) - check_sgd_optimizer( - optimizer, model, prefix='module.', **paramwise_cfg) - - # paramwise_cfg with ExampleModel and no grad - for param in model.parameters(): - param.requires_grad = False - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg, - paramwise_cfg) - optimizer = optim_constructor(model) - param_groups = optimizer.param_groups - assert isinstance(optimizer, torch.optim.SGD) - assert optimizer.defaults['lr'] == base_lr - assert optimizer.defaults['momentum'] == momentum - assert optimizer.defaults['weight_decay'] == base_wd - for i, (name, param) in enumerate(model.named_parameters()): - param_group = param_groups[i] - assert torch.equal(param_group['params'][0], param) - assert param_group['momentum'] == momentum - assert param_group['lr'] == base_lr - assert param_group['weight_decay'] == base_wd - - # paramwise_cfg with bypass_duplicate option - model = ExampleDuplicateModel() - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = dict( - bias_lr_mult=2, - bias_decay_mult=0.5, - norm_decay_mult=0, - dwconv_decay_mult=0.1) - with pytest.raises(ValueError) as excinfo: - optim_constructor = DefaultOptimizerConstructor( - optimizer_cfg, paramwise_cfg) - optim_constructor(model) - assert 'some parameters appear in more than one parameter ' \ - 'group' == excinfo.value - - paramwise_cfg = dict( - bias_lr_mult=2, - bias_decay_mult=0.5, - norm_decay_mult=0, - dwconv_decay_mult=0.1, - dcn_offset_lr_mult=0.1, - bypass_duplicate=True) - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg, - paramwise_cfg) - with warnings.catch_warnings(record=True) as w: - optimizer = optim_constructor(model) - warnings.simplefilter('always') - assert len(w) == 1 - assert str(w[0].message) == 'conv3.0 is duplicate. 
It is skipped ' \ - 'since bypass_duplicate=True' - model_parameters = list(model.parameters()) - num_params = 14 if OPS_AVAILABLE else 11 - assert len(optimizer.param_groups) == len(model_parameters) == num_params - check_sgd_optimizer(optimizer, model, **paramwise_cfg) - - # test DefaultOptimizerConstructor with custom_keys and ExampleModel - model = ExampleModel() - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = dict( - custom_keys={ - 'param1': dict(lr_mult=10), - 'sub': dict(lr_mult=0.1, decay_mult=0), - 'sub.gn': dict(lr_mult=0.01), - 'non_exist_key': dict(lr_mult=0.0) - }, - norm_decay_mult=0.5) - - with pytest.raises(TypeError): - # custom_keys should be a dict - paramwise_cfg_ = dict(custom_keys=[0.1, 0.0001]) - optim_constructor = DefaultOptimizerConstructor( - optimizer_cfg, paramwise_cfg_) - optimizer = optim_constructor(model) - - with pytest.raises(ValueError): - # if 'decay_mult' is specified in custom_keys, weight_decay should be - # specified - optimizer_cfg_ = dict(type='SGD', lr=0.01) - paramwise_cfg_ = dict(custom_keys={'.backbone': dict(decay_mult=0.5)}) - optim_constructor = DefaultOptimizerConstructor( - optimizer_cfg_, paramwise_cfg_) - optimizer = optim_constructor(model) - - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg, - paramwise_cfg) - optimizer = optim_constructor(model) - # check optimizer type and default config - assert isinstance(optimizer, torch.optim.SGD) - assert optimizer.defaults['lr'] == base_lr - assert optimizer.defaults['momentum'] == momentum - assert optimizer.defaults['weight_decay'] == base_wd - - # check params groups - param_groups = optimizer.param_groups - - groups = [] - group_settings = [] - # group 1, matches of 'param1' - # 'param1' is the longest match for 'sub.param1' - groups.append(['param1', 'sub.param1']) - group_settings.append({ - 'lr': base_lr * 10, - 'momentum': momentum, - 'weight_decay': base_wd, - }) - # group 2, matches of 'sub.gn' - groups.append(['sub.gn.weight', 'sub.gn.bias']) - group_settings.append({ - 'lr': base_lr * 0.01, - 'momentum': momentum, - 'weight_decay': base_wd, - }) - # group 3, matches of 'sub' - groups.append(['sub.conv1.weight', 'sub.conv1.bias']) - group_settings.append({ - 'lr': base_lr * 0.1, - 'momentum': momentum, - 'weight_decay': 0, - }) - # group 4, bn is configured by 'norm_decay_mult' - groups.append(['bn.weight', 'bn.bias']) - group_settings.append({ - 'lr': base_lr, - 'momentum': momentum, - 'weight_decay': base_wd * 0.5, - }) - # group 5, default group - groups.append(['conv1.weight', 'conv2.weight', 'conv2.bias']) - group_settings.append({ - 'lr': base_lr, - 'momentum': momentum, - 'weight_decay': base_wd - }) - - num_params = 14 if OPS_AVAILABLE else 11 - assert len(param_groups) == num_params - for i, (name, param) in enumerate(model.named_parameters()): - assert torch.equal(param_groups[i]['params'][0], param) - for group, settings in zip(groups, group_settings): - if name in group: - for setting in settings: - assert param_groups[i][setting] == settings[ - setting], f'{name} {setting}' - - # test DefaultOptimizerConstructor with custom_keys and ExampleModel 2 - model = ExampleModel() - optimizer_cfg = dict(type='SGD', lr=base_lr, momentum=momentum) - paramwise_cfg = dict(custom_keys={'param1': dict(lr_mult=10)}) - - optim_constructor = DefaultOptimizerConstructor(optimizer_cfg, - paramwise_cfg) - optimizer = optim_constructor(model) - # check optimizer type and default config - assert isinstance(optimizer, 
torch.optim.SGD) - assert optimizer.defaults['lr'] == base_lr - assert optimizer.defaults['momentum'] == momentum - assert optimizer.defaults['weight_decay'] == 0 - - # check params groups - param_groups = optimizer.param_groups - - groups = [] - group_settings = [] - # group 1, matches of 'param1' - groups.append(['param1', 'sub.param1']) - group_settings.append({ - 'lr': base_lr * 10, - 'momentum': momentum, - 'weight_decay': 0, - }) - # group 2, default group - groups.append([ - 'sub.conv1.weight', 'sub.conv1.bias', 'sub.gn.weight', 'sub.gn.bias', - 'conv1.weight', 'conv2.weight', 'conv2.bias', 'bn.weight', 'bn.bias' - ]) - group_settings.append({ - 'lr': base_lr, - 'momentum': momentum, - 'weight_decay': 0 - }) - - num_params = 14 if OPS_AVAILABLE else 11 - assert len(param_groups) == num_params - for i, (name, param) in enumerate(model.named_parameters()): - assert torch.equal(param_groups[i]['params'][0], param) - for group, settings in zip(groups, group_settings): - if name in group: - for setting in settings: - assert param_groups[i][setting] == settings[ - setting], f'{name} {setting}' - - -def test_torch_optimizers(): - torch_optimizers = [ - 'ASGD', 'Adadelta', 'Adagrad', 'Adam', 'AdamW', 'Adamax', 'LBFGS', - 'Optimizer', 'RMSprop', 'Rprop', 'SGD', 'SparseAdam' - ] - assert set(torch_optimizers).issubset(set(TORCH_OPTIMIZERS)) - - -def test_build_optimizer_constructor(): - model = ExampleModel() - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - paramwise_cfg = dict( - bias_lr_mult=2, - bias_decay_mult=0.5, - norm_decay_mult=0, - dwconv_decay_mult=0.1, - dcn_offset_lr_mult=0.1) - optim_constructor_cfg = dict( - type='DefaultOptimizerConstructor', - optimizer_cfg=optimizer_cfg, - paramwise_cfg=paramwise_cfg) - optim_constructor = build_optimizer_constructor(optim_constructor_cfg) - optimizer = optim_constructor(model) - check_sgd_optimizer(optimizer, model, **paramwise_cfg) - - from mmcv.runner import OPTIMIZERS - from mmcv.utils import build_from_cfg - - @OPTIMIZER_BUILDERS.register_module() - class MyOptimizerConstructor(DefaultOptimizerConstructor): - - def __call__(self, model): - if hasattr(model, 'module'): - model = model.module - - conv1_lr_mult = self.paramwise_cfg.get('conv1_lr_mult', 1.) 
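            # one group per parameter below; only conv1.* receives the custom
            # lr multiplier, all other parameters keep the optimizer_cfg defaults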
- - params = [] - for name, param in model.named_parameters(): - param_group = {'params': [param]} - if name.startswith('conv1') and param.requires_grad: - param_group['lr'] = self.base_lr * conv1_lr_mult - params.append(param_group) - optimizer_cfg['params'] = params - - return build_from_cfg(optimizer_cfg, OPTIMIZERS) - - paramwise_cfg = dict(conv1_lr_mult=5) - optim_constructor_cfg = dict( - type='MyOptimizerConstructor', - optimizer_cfg=optimizer_cfg, - paramwise_cfg=paramwise_cfg) - optim_constructor = build_optimizer_constructor(optim_constructor_cfg) - optimizer = optim_constructor(model) - - param_groups = optimizer.param_groups - assert isinstance(optimizer, torch.optim.SGD) - assert optimizer.defaults['lr'] == base_lr - assert optimizer.defaults['momentum'] == momentum - assert optimizer.defaults['weight_decay'] == base_wd - for i, param in enumerate(model.parameters()): - param_group = param_groups[i] - assert torch.equal(param_group['params'][0], param) - assert param_group['momentum'] == momentum - # conv1.weight - assert param_groups[1]['lr'] == base_lr * paramwise_cfg['conv1_lr_mult'] - assert param_groups[1]['weight_decay'] == base_wd - - -def test_build_optimizer(): - model = ExampleModel() - optimizer_cfg = dict( - type='SGD', lr=base_lr, weight_decay=base_wd, momentum=momentum) - optimizer = build_optimizer(model, optimizer_cfg) - check_default_optimizer(optimizer, model) - - model = ExampleModel() - optimizer_cfg = dict( - type='SGD', - lr=base_lr, - weight_decay=base_wd, - momentum=momentum, - paramwise_cfg=dict( - bias_lr_mult=2, - bias_decay_mult=0.5, - norm_decay_mult=0, - dwconv_decay_mult=0.1, - dcn_offset_lr_mult=0.1)) - optimizer = build_optimizer(model, optimizer_cfg) - check_sgd_optimizer(optimizer, model, **optimizer_cfg['paramwise_cfg']) diff --git a/tests/test_runner/test_runner.py b/tests/test_runner/test_runner.py deleted file mode 100644 index d75f20d45f..0000000000 --- a/tests/test_runner/test_runner.py +++ /dev/null @@ -1,289 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. 
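# The tests below build runners through the registry, following the pattern
# used throughout this file:
#
#     cfg = dict(type='EpochBasedRunner', max_epochs=1)
#     runner = build_runner(cfg, default_args=dict(
#         model=model, work_dir=work_dir, logger=logging.getLogger()))
#
# i.e. `type` is looked up in RUNNERS and default_args fill the constructor.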
-import logging -import os -import os.path as osp -import platform -import random -import string -import tempfile - -import pytest -import torch -import torch.nn as nn - -from mmcv.parallel import MMDataParallel -from mmcv.runner import (RUNNERS, EpochBasedRunner, IterBasedRunner, - build_runner) -from mmcv.runner.hooks import IterTimerHook - - -class OldStyleModel(nn.Module): - - def __init__(self): - super().__init__() - self.conv = nn.Conv2d(3, 3, 1) - - -class Model(OldStyleModel): - - def train_step(self): - pass - - def val_step(self): - pass - - -def test_build_runner(): - temp_root = tempfile.gettempdir() - dir_name = ''.join( - [random.choice(string.ascii_letters) for _ in range(10)]) - - default_args = dict( - model=Model(), - work_dir=osp.join(temp_root, dir_name), - logger=logging.getLogger()) - cfg = dict(type='EpochBasedRunner', max_epochs=1) - runner = build_runner(cfg, default_args=default_args) - assert runner._max_epochs == 1 - cfg = dict(type='IterBasedRunner', max_iters=1) - runner = build_runner(cfg, default_args=default_args) - assert runner._max_iters == 1 - - with pytest.raises(ValueError, match='Only one of'): - cfg = dict(type='IterBasedRunner', max_epochs=1, max_iters=1) - runner = build_runner(cfg, default_args=default_args) - - -@pytest.mark.parametrize('runner_class', RUNNERS.module_dict.values()) -def test_epoch_based_runner(runner_class): - - with pytest.warns(DeprecationWarning): - # batch_processor is deprecated - model = OldStyleModel() - - def batch_processor(): - pass - - _ = runner_class(model, batch_processor, logger=logging.getLogger()) - - with pytest.raises(TypeError): - # batch_processor must be callable - model = OldStyleModel() - _ = runner_class(model, batch_processor=0, logger=logging.getLogger()) - - with pytest.raises(TypeError): - # optimizer must be a optimizer or a dict of optimizers - model = Model() - optimizer = 'NotAOptimizer' - _ = runner_class( - model, optimizer=optimizer, logger=logging.getLogger()) - - with pytest.raises(TypeError): - # optimizer must be a optimizer or a dict of optimizers - model = Model() - optimizers = dict(optim1=torch.optim.Adam(), optim2='NotAOptimizer') - _ = runner_class( - model, optimizer=optimizers, logger=logging.getLogger()) - - with pytest.raises(TypeError): - # logger must be a logging.Logger - model = Model() - _ = runner_class(model, logger=None) - - with pytest.raises(TypeError): - # meta must be a dict or None - model = Model() - _ = runner_class(model, logger=logging.getLogger(), meta=['list']) - - with pytest.raises(AssertionError): - # model must implement the method train_step() - model = OldStyleModel() - _ = runner_class(model, logger=logging.getLogger()) - - with pytest.raises(TypeError): - # work_dir must be a str or None - model = Model() - _ = runner_class(model, work_dir=1, logger=logging.getLogger()) - - with pytest.raises(RuntimeError): - # batch_processor and train_step() cannot be both set - - def batch_processor(): - pass - - model = Model() - _ = runner_class(model, batch_processor, logger=logging.getLogger()) - - # test work_dir - model = Model() - temp_root = tempfile.gettempdir() - dir_name = ''.join( - [random.choice(string.ascii_letters) for _ in range(10)]) - work_dir = osp.join(temp_root, dir_name) - _ = runner_class(model, work_dir=work_dir, logger=logging.getLogger()) - assert osp.isdir(work_dir) - _ = runner_class(model, work_dir=work_dir, logger=logging.getLogger()) - assert osp.isdir(work_dir) - os.removedirs(work_dir) - - -@pytest.mark.parametrize('runner_class', 
-@pytest.mark.parametrize('runner_class', RUNNERS.module_dict.values())
-def test_runner_with_parallel(runner_class):
-
-    def batch_processor():
-        pass
-
-    model = MMDataParallel(OldStyleModel())
-    _ = runner_class(model, batch_processor, logger=logging.getLogger())
-
-    model = MMDataParallel(Model())
-    _ = runner_class(model, logger=logging.getLogger())
-
-    with pytest.raises(RuntimeError):
-        # batch_processor and train_step() cannot both be set
-
-        def batch_processor():
-            pass
-
-        model = MMDataParallel(Model())
-        _ = runner_class(model, batch_processor, logger=logging.getLogger())
-
-
-@pytest.mark.parametrize('runner_class', RUNNERS.module_dict.values())
-def test_save_checkpoint(runner_class):
-    model = Model()
-    runner = runner_class(model=model, logger=logging.getLogger())
-
-    with pytest.raises(TypeError):
-        # meta should be None or dict
-        runner.save_checkpoint('.', meta=list())
-
-    with tempfile.TemporaryDirectory() as root:
-        runner.save_checkpoint(root)
-
-        latest_path = osp.join(root, 'latest.pth')
-        assert osp.exists(latest_path)
-
-        if isinstance(runner, EpochBasedRunner):
-            first_ckp_path = osp.join(root, 'epoch_1.pth')
-        elif isinstance(runner, IterBasedRunner):
-            first_ckp_path = osp.join(root, 'iter_1.pth')
-
-        assert osp.exists(first_ckp_path)
-
-        if platform.system() != 'Windows':
-            assert osp.realpath(latest_path) == osp.realpath(first_ckp_path)
-        else:
-            # use copy instead of symlink on windows
-            pass
-
-        torch.load(latest_path)
-
-
-@pytest.mark.parametrize('runner_class', RUNNERS.module_dict.values())
-def test_build_lr_momentum_hook(runner_class):
-    model = Model()
-    runner = runner_class(model=model, logger=logging.getLogger())
-
-    # test a policy name that is already in title case
-    lr_config = dict(
-        policy='CosineAnnealing',
-        by_epoch=False,
-        min_lr_ratio=0,
-        warmup_iters=2,
-        warmup_ratio=0.9)
-    runner.register_lr_hook(lr_config)
-    assert len(runner.hooks) == 1
-
-    # test a policy name that is already in title case
-    lr_config = dict(
-        policy='Cyclic',
-        by_epoch=False,
-        target_ratio=(10, 1),
-        cyclic_times=1,
-        step_ratio_up=0.4)
-    runner.register_lr_hook(lr_config)
-    assert len(runner.hooks) == 2
-
-    # test a lowercase policy name
-    lr_config = dict(
-        policy='cyclic',
-        by_epoch=False,
-        target_ratio=(0.85 / 0.95, 1),
-        cyclic_times=1,
-        step_ratio_up=0.4)
-    runner.register_lr_hook(lr_config)
-    assert len(runner.hooks) == 3
-
-    # test a policy name that is already in title case
-    lr_config = dict(
-        policy='Step',
-        warmup='linear',
-        warmup_iters=500,
-        warmup_ratio=1.0 / 3,
-        step=[8, 11])
-    runner.register_lr_hook(lr_config)
-    assert len(runner.hooks) == 4
-
-    # test a lowercase policy name
-    lr_config = dict(
-        policy='step',
-        warmup='linear',
-        warmup_iters=500,
-        warmup_ratio=1.0 / 3,
-        step=[8, 11])
-    runner.register_lr_hook(lr_config)
-    assert len(runner.hooks) == 5
-
-    # test a policy name that is already in title case
-    mom_config = dict(
-        policy='CosineAnnealing',
-        min_momentum_ratio=0.99 / 0.95,
-        by_epoch=False,
-        warmup_iters=2,
-        warmup_ratio=0.9 / 0.95)
-    runner.register_momentum_hook(mom_config)
-    assert len(runner.hooks) == 6
-
-    # test a policy name that is already in title case
-    mom_config = dict(
-        policy='Cyclic',
-        by_epoch=False,
-        target_ratio=(0.85 / 0.95, 1),
-        cyclic_times=1,
-        step_ratio_up=0.4)
-    runner.register_momentum_hook(mom_config)
-    assert len(runner.hooks) == 7
-
-    # test a lowercase policy name
-    mom_config = dict(
-        policy='cyclic',
-        by_epoch=False,
-        target_ratio=(0.85 / 0.95, 1),
-        cyclic_times=1,
-        step_ratio_up=0.4)
-    runner.register_momentum_hook(mom_config)
-    assert len(runner.hooks) == 8
-
-
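The pairs of title-case and lowercase configs above pass because the removed `register_lr_hook` normalized lowercase policy names before resolving the hook class. Approximately (a sketch of the resolution rule as the tests imply it, not the removed code verbatim):

```python
def resolve_lr_hook_type(policy: str) -> str:
    # Only an all-lowercase name is title-cased, so 'cyclic' and 'Cyclic'
    # resolve to the same hook while 'CosineAnnealing' is left untouched.
    if policy == policy.lower():
        policy = policy.title()
    return policy + 'LrUpdaterHook'


assert resolve_lr_hook_type('cyclic') == 'CyclicLrUpdaterHook'
assert resolve_lr_hook_type('Cyclic') == 'CyclicLrUpdaterHook'
assert resolve_lr_hook_type('CosineAnnealing') == 'CosineAnnealingLrUpdaterHook'
```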
-@pytest.mark.parametrize('runner_class', RUNNERS.module_dict.values())
-def test_register_timer_hook(runner_class):
-    model = Model()
-    runner = runner_class(model=model, logger=logging.getLogger())
-
-    # test register None
-    timer_config = None
-    runner.register_timer_hook(timer_config)
-    assert len(runner.hooks) == 0
-
-    # test register IterTimerHook with config
-    timer_config = dict(type='IterTimerHook')
-    runner.register_timer_hook(timer_config)
-    assert len(runner.hooks) == 1
-    assert isinstance(runner.hooks[0], IterTimerHook)
-
-    # test register IterTimerHook
-    timer_config = IterTimerHook()
-    runner.register_timer_hook(timer_config)
-    assert len(runner.hooks) == 2
-    assert isinstance(runner.hooks[1], IterTimerHook)
diff --git a/tests/test_runner/test_utils.py b/tests/test_runner/test_utils.py
deleted file mode 100644
index 3d2d18146c..0000000000
--- a/tests/test_runner/test_utils.py
+++ /dev/null
@@ -1,39 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import os
-import random
-
-import numpy as np
-import torch
-
-from mmcv.runner import set_random_seed
-from mmcv.utils import TORCH_VERSION, digit_version
-
-is_rocm_pytorch = False
-if digit_version(TORCH_VERSION) >= digit_version('1.5'):
-    from torch.utils.cpp_extension import ROCM_HOME
-    is_rocm_pytorch = True if ((torch.version.hip is not None) and
-                               (ROCM_HOME is not None)) else False
-
-
-def test_set_random_seed():
-    set_random_seed(0)
-    a_random = random.randint(0, 10)
-    a_np_random = np.random.rand(2, 2)
-    a_torch_random = torch.rand(2, 2)
-    assert torch.backends.cudnn.deterministic is False
-    assert torch.backends.cudnn.benchmark is False
-    assert os.environ['PYTHONHASHSEED'] == str(0)
-
-    set_random_seed(0, True)
-    b_random = random.randint(0, 10)
-    b_np_random = np.random.rand(2, 2)
-    b_torch_random = torch.rand(2, 2)
-    assert torch.backends.cudnn.deterministic is True
-    if is_rocm_pytorch:
-        assert torch.backends.cudnn.benchmark is True
-    else:
-        assert torch.backends.cudnn.benchmark is False
-
-    assert a_random == b_random
-    assert np.equal(a_np_random, b_np_random).all()
-    assert torch.equal(a_torch_random, b_torch_random)
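The assertions in the deleted test pin down what `set_random_seed` did; its behaviour can be approximated as follows (a sketch inferred from those assertions, ignoring the ROCm special case):

```python
import os
import random

import numpy as np
import torch


def set_random_seed(seed: int, deterministic: bool = False) -> None:
    """Seed python, numpy and torch; optionally force deterministic cudnn."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
```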
diff --git a/tests/test_utils/test_hub.py b/tests/test_utils/test_hub.py
deleted file mode 100644
index b44ee9be06..0000000000
--- a/tests/test_utils/test_hub.py
+++ /dev/null
@@ -1,36 +0,0 @@
-# Copyright (c) OpenMMLab. All rights reserved.
-import pytest
-import torch
-from torch.utils import model_zoo
-
-from mmcv.utils import TORCH_VERSION, digit_version, load_url
-
-
-@pytest.mark.skipif(
-    torch.__version__ == 'parrots', reason='not necessary in parrots test')
-def test_load_url():
-    url1 = 'https://download.openmmlab.com/mmcv/test_data/saved_in_pt1.5.pth'
-    url2 = 'https://download.openmmlab.com/mmcv/test_data/saved_in_pt1.6.pth'
-
-    # The 1.6 release of PyTorch switched torch.save to a new zipfile-based
-    # file format. Loading a checkpoint saved in torch >= 1.6.0 with
-    # torch < 1.7.0 therefore raises a RuntimeError.
-    # More details at https://github.com/open-mmlab/mmpose/issues/904
-    if digit_version(TORCH_VERSION) < digit_version('1.7.0'):
-        model_zoo.load_url(url1)
-        with pytest.raises(RuntimeError):
-            model_zoo.load_url(url2)
-    else:
-        # newer versions of PyTorch can load checkpoints from a url
-        # regardless of the version they were saved in
-        model_zoo.load_url(url1)
-        model_zoo.load_url(url2)
-
-    load_url(url1)
-    # if a checkpoint was saved in torch >= 1.6.0 but is loaded in
-    # torch < 1.5.0, a RuntimeError is raised
-    if digit_version(TORCH_VERSION) < digit_version('1.5.0'):
-        with pytest.raises(RuntimeError):
-            load_url(url2)
-    else:
-        load_url(url2)
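The removed `load_url` mainly papered over the pre-1.7 zipfile-format gap; on current PyTorch the stock hub call is enough. A sketch using one of the URLs from the deleted test:

```python
import torch

# On PyTorch >= 1.7 the zipfile incompatibility noted above is gone, so a
# checkpoint saved by torch >= 1.6 loads directly; map_location keeps it
# CPU-safe on machines without a GPU.
url = 'https://download.openmmlab.com/mmcv/test_data/saved_in_pt1.6.pth'
state_dict = torch.hub.load_state_dict_from_url(url, map_location='cpu')
```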
From 768da86463f38e8300f9287b3d31eb8482ffc477 Mon Sep 17 00:00:00 2001
From: zhouzaida
Date: Fri, 19 Aug 2022 17:36:57 +0800
Subject: [PATCH 2/3] fix format

---
 mmcv/cnn/alexnet.py    | 2 +-
 mmcv/cnn/resnet.py     | 2 +-
 mmcv/cnn/vgg.py        | 2 +-
 mmcv/utils/__init__.py | 6 +++---
 4 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/mmcv/cnn/alexnet.py b/mmcv/cnn/alexnet.py
index dd6f9d4d02..309be24b66 100644
--- a/mmcv/cnn/alexnet.py
+++ b/mmcv/cnn/alexnet.py
@@ -2,9 +2,9 @@
 import logging
 from typing import Optional
 
-from mmengine.runner import load_checkpoint
 import torch
 import torch.nn as nn
+from mmengine.runner import load_checkpoint
 
 
 class AlexNet(nn.Module):
diff --git a/mmcv/cnn/resnet.py b/mmcv/cnn/resnet.py
index f469b2efd6..d3a5ddc4d7 100644
--- a/mmcv/cnn/resnet.py
+++ b/mmcv/cnn/resnet.py
@@ -5,8 +5,8 @@
 import torch.nn as nn
 import torch.utils.checkpoint as cp
 from mmengine.model.utils import constant_init, kaiming_init
-from torch import Tensor
 from mmengine.runner import load_checkpoint
+from torch import Tensor
 
 
 def conv3x3(in_planes: int,
diff --git a/mmcv/cnn/vgg.py b/mmcv/cnn/vgg.py
index 29618117ba..412fd3ac4b 100644
--- a/mmcv/cnn/vgg.py
+++ b/mmcv/cnn/vgg.py
@@ -4,8 +4,8 @@
 import torch.nn as nn
 from mmengine.model.utils import constant_init, kaiming_init, normal_init
-from torch import Tensor
 from mmengine.runner import load_checkpoint
+from torch import Tensor
 
 
 def conv3x3(in_planes: int, out_planes: int, dilation: int = 1) -> nn.Module:
diff --git a/mmcv/utils/__init__.py b/mmcv/utils/__init__.py
index cf7f0d60bb..6bd3d3c8b6 100644
--- a/mmcv/utils/__init__.py
+++ b/mmcv/utils/__init__.py
@@ -74,7 +74,7 @@
     'assert_dict_has_keys', 'assert_keys_equal', 'assert_is_norm_layer',
     'assert_params_all_zeros', 'check_python_script', 'is_method_overridden',
     'is_jit_tracing', 'is_rocm_pytorch',
-    '_get_cuda_home', 'has_method', 'IS_CUDA_AVAILABLE',
-    'worker_init_fn', 'IS_MLU_AVAILABLE', 'IS_IPU_AVAILABLE',
-    'IS_MPS_AVAILABLE', 'torch_meshgrid'
+    '_get_cuda_home', 'has_method', 'IS_CUDA_AVAILABLE', 'worker_init_fn',
+    'IS_MLU_AVAILABLE', 'IS_IPU_AVAILABLE', 'IS_MPS_AVAILABLE',
+    'torch_meshgrid'
 ]

From 00af24f68ef7ffaaa48fe8a96ea514df08a96923 Mon Sep 17 00:00:00 2001
From: zhouzaida
Date: Fri, 19 Aug 2022 17:40:22 +0800
Subject: [PATCH 3/3] remove outdated docs

---
 docs/en/api.rst                      |  10 --
 docs/en/index.rst                    |   1 -
 docs/en/understand_mmcv/runner.md    | 163 ---------------------------
 docs/zh_cn/api.rst                   |  10 --
 docs/zh_cn/index.rst                 |   1 -
 docs/zh_cn/understand_mmcv/runner.md | 159 --------------------------
 examples/train.py                    |  84 --------------
 7 files changed, 428 deletions(-)
 delete mode 100644 docs/en/understand_mmcv/runner.md
 delete mode 100644 docs/zh_cn/understand_mmcv/runner.md
 delete mode 100644 examples/train.py

diff --git a/docs/en/api.rst b/docs/en/api.rst
index ab75ca6b7b..9345446f80 100644
--- a/docs/en/api.rst
+++ b/docs/en/api.rst
@@ -28,16 +28,6 @@ cnn
 .. automodule:: mmcv.cnn
    :members:
 
-runner
-------
-.. automodule:: mmcv.runner
-   :members:
-
-engine
-------
-.. automodule:: mmcv.engine
-   :members:
-
 ops
 ------
 .. automodule:: mmcv.ops
diff --git a/docs/en/index.rst b/docs/en/index.rst
index cec2c46e98..1e5193ac30 100644
--- a/docs/en/index.rst
+++ b/docs/en/index.rst
@@ -17,7 +17,6 @@ You can switch between Chinese and English documents in the lower-left corner of the
 
    understand_mmcv/config.md
    understand_mmcv/registry.md
-   understand_mmcv/runner.md
    understand_mmcv/data_process.md
    understand_mmcv/visualization.md
    understand_mmcv/cnn.md
diff --git a/docs/en/understand_mmcv/runner.md b/docs/en/understand_mmcv/runner.md
deleted file mode 100644
index eeeb859ee8..0000000000
--- a/docs/en/understand_mmcv/runner.md
+++ /dev/null
@@ -1,163 +0,0 @@
-## Runner
-
-The runner class is designed to manage the training loop. It eases the training process, demanding less code from users while staying flexible and configurable. Its main features are:
-
-- Support `EpochBasedRunner` and `IterBasedRunner` for different scenarios. Customized runners can also be implemented to meet specific needs.
-- Support customized workflows that allow switching between different modes while training. Currently, the supported modes are train and val.
-- Enable extensibility through various hooks, including hooks defined in MMCV and customized ones.
-
-### EpochBasedRunner
-
-As its name indicates, the workflow in `EpochBasedRunner` is set in units of epochs. For example, \[('train', 2), ('val', 1)\] means running 2 epochs for training and 1 epoch for validation, iteratively, and each epoch may contain multiple iterations. MMDetection currently uses `EpochBasedRunner` by default.
-
-Let's take a look at its core logic:
-
-```python
-# the condition to stop training
-while curr_epoch < max_epochs:
-    # traverse the workflow, e.g. workflow = [('train', 2), ('val', 1)]
-    for i, flow in enumerate(workflow):
-        # mode (e.g. train) determines which function to run
-        mode, epochs = flow
-        # epoch_runner will be either self.train() or self.val()
-        epoch_runner = getattr(self, mode)
-        # execute the corresponding function
-        for _ in range(epochs):
-            epoch_runner(data_loaders[i], **kwargs)
-```
-
-Currently, two modes are supported: train and val. Let's take the train function as an example and look at its core logic:
-
-```python
-# Currently, epoch_runner could be either train or val
-def train(self, data_loader, **kwargs):
-    # traverse the dataset and get batch data for 1 epoch
-    for i, data_batch in enumerate(data_loader):
-        # execute every before_train_iter function registered in the hooks;
-        # watch out for their order
-        self.call_hook('before_train_iter')
-        # train_mode is set to False in the val function
-        self.run_iter(data_batch, train_mode=True, **kwargs)
-        self.call_hook('after_train_iter')
-    self.call_hook('after_train_epoch')
-```
-
-### IterBasedRunner
-
-Different from `EpochBasedRunner`, the workflow in `IterBasedRunner` is set in units of iterations. For example, \[('train', 2), ('val', 1)\] means running 2 iters for training and 1 iter for validation, iteratively. MMSegmentation currently uses `IterBasedRunner` by default.
-
-Let's take a look at its core logic:
-
-```python
-# Although we set the workflow by iters here, we might also need info on the
-# epochs in some use cases. That can be provided by IterLoader.
-iter_loaders = [IterLoader(x) for x in data_loaders]
-# the condition to stop training
-while curr_iter < max_iters:
-    # traverse the workflow, e.g. workflow = [('train', 2), ('val', 1)]
-    for i, flow in enumerate(workflow):
-        # mode (e.g. train) determines which function to run
-        mode, iters = flow
-        # iter_runner will be either self.train() or self.val()
-        iter_runner = getattr(self, mode)
-        # execute the corresponding function
-        for _ in range(iters):
-            iter_runner(iter_loaders[i], **kwargs)
-```
-
-Currently, two modes are supported: train and val. Let's take the val function as an example and look at its core logic:
-
-```python
-# Currently, iter_runner could be either train or val
-def val(self, data_loader, **kwargs):
-    # get batch data for 1 iter
-    data_batch = next(data_loader)
-    # execute every before_val_iter function registered in the hooks;
-    # watch out for their order
-    self.call_hook('before_val_iter')
-    outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)
-    self.outputs = outputs
-    self.call_hook('after_val_iter')
-```
-
-Other than the basic functionality explained above, `EpochBasedRunner` and `IterBasedRunner` provide methods such as `resume`, `save_checkpoint` and `register_hook`. In case you are not familiar with the term hook, a tutorial about it will also be provided (coming soon). Essentially, a hook is a mechanism that alters or augments code behavior through predefined APIs. It allows users to have their own code called under certain circumstances, which makes code extensible in a non-intrusive manner.
-
-### A Simple Example
-
-We will walk you through the usage of the runner with a classification task. The following code contains only the essential steps for demonstration purposes; these steps are necessary for any training task.
-
-**(1) Initialize dataloader, model, optimizer, etc.**
-
-```python
-# initialize model
-model = ...
-# initialize optimizer; typically, we set:
-# cfg.optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001)
-optimizer = build_optimizer(model, cfg.optimizer)
-# initialize the dataloaders corresponding to the workflows (train/val)
-data_loaders = [
-    build_dataloader(
-        ds,
-        cfg.data.samples_per_gpu,
-        cfg.data.workers_per_gpu,
-        ...) for ds in dataset
-]
-```
-
-**(2) Initialize runner**
-
-```python
-runner = build_runner(
-    # cfg.runner is typically set as:
-    # runner = dict(type='EpochBasedRunner', max_epochs=200)
-    cfg.runner,
-    default_args=dict(
-        model=model,
-        batch_processor=None,
-        optimizer=optimizer,
-        logger=logger))
-```
-
-**(3) Register training hooks and customized hooks.**
-
-```python
-# register default hooks necessary for training
-runner.register_training_hooks(
-    # config of the learning rate, typically set as:
-    # lr_config = dict(policy='step', step=[100, 150])
-    cfg.lr_config,
-    # configuration of the optimizer, e.g. grad_clip
-    optimizer_config,
-    # configuration of saving checkpoints, typically set as:
-    # checkpoint_config = dict(interval=1), saving checkpoints every epoch
-    cfg.checkpoint_config,
-    # configuration of logs
-    cfg.log_config,
-    ...)
-
-# register customized hooks
-# say we want to enable ema, then we could set custom_hooks=[dict(type='EMAHook')]
-if cfg.get('custom_hooks', None):
-    custom_hooks = cfg.custom_hooks
-    for hook_cfg in cfg.custom_hooks:
-        hook_cfg = hook_cfg.copy()
-        priority = hook_cfg.pop('priority', 'NORMAL')
-        hook = build_from_cfg(hook_cfg, HOOKS)
-        runner.register_hook(hook, priority=priority)
-```
-
-Then, we can use `resume` or `load_checkpoint` to load existing weights.
-
-**(4) Start training**
-
-```python
-# workflow is typically set as: workflow = [('train', 1)]
-# the training begins here
-runner.run(data_loaders, cfg.workflow)
-```
-
-Let's take `EpochBasedRunner` as an example and go into a little more detail about setting the workflow:
-
-- Say we only want to put train in the workflow; then we can set workflow = \[('train', 1)\]. The runner will only execute train iteratively in this case.
-- Say we want to put both train and val in the workflow; then we can set workflow = \[('train', 3), ('val', 1)\]. The runner will first execute train for 3 epochs, then switch to val mode and execute val for 1 epoch. The workflow is repeated until the current epoch hits max_epochs.
-- The workflow is highly flexible, so you can set workflow = \[('val', 1), ('train', 1)\] if you would like the runner to validate first and train afterwards.
-
-The code we demonstrated above is already in `train.py` in MM repositories. Simply modify the corresponding keys in the configuration files and the script will execute the expected workflow automatically.
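The workflow dispatch that both runner docs describe is just a `getattr`-based mode switch over (mode, count) pairs. A self-contained toy version (illustrative only; training is reduced to prints):

```python
class ToyRunner:
    """Minimal imitation of the epoch-based workflow loop shown above."""

    def __init__(self, max_epochs):
        self.curr_epoch = 0
        self.max_epochs = max_epochs

    def train(self, loader):
        self.curr_epoch += 1
        print(f'train epoch {self.curr_epoch} on {loader}')

    def val(self, loader):
        print(f'val after epoch {self.curr_epoch} on {loader}')

    def run(self, loaders, workflow):
        while self.curr_epoch < self.max_epochs:
            for (mode, epochs), loader in zip(workflow, loaders):
                epoch_runner = getattr(self, mode)  # self.train or self.val
                for _ in range(epochs):
                    epoch_runner(loader)


# Train for 2 epochs, validate once, and repeat until 4 epochs are done.
ToyRunner(max_epochs=4).run(['train_set', 'val_set'],
                            [('train', 2), ('val', 1)])
```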
diff --git a/docs/zh_cn/api.rst b/docs/zh_cn/api.rst
index 6d2c744204..0f232d9815 100644
--- a/docs/zh_cn/api.rst
+++ b/docs/zh_cn/api.rst
@@ -28,16 +28,6 @@ cnn
 .. automodule:: mmcv.cnn
    :members:
 
-runner
-------
-.. automodule:: mmcv.runner
-   :members:
-
-engine
-------
-.. automodule:: mmcv.engine
-   :members:
-
 ops
 ------
 .. automodule:: mmcv.ops
diff --git a/docs/zh_cn/index.rst b/docs/zh_cn/index.rst
index 3427f0a7a6..5c067a9eb6 100644
--- a/docs/zh_cn/index.rst
+++ b/docs/zh_cn/index.rst
@@ -17,7 +17,6 @@
 
    understand_mmcv/config.md
    understand_mmcv/registry.md
-   understand_mmcv/runner.md
    understand_mmcv/data_process.md
    understand_mmcv/data_transform.md
    understand_mmcv/visualization.md
diff --git a/docs/zh_cn/understand_mmcv/runner.md b/docs/zh_cn/understand_mmcv/runner.md
deleted file mode 100644
index 7098eb977f..0000000000
--- a/docs/zh_cn/understand_mmcv/runner.md
+++ /dev/null
@@ -1,159 +0,0 @@
-## Runner
-
-The runner module is responsible for scheduling the model training process. Its main purpose is to let users start training with less code while staying flexible and configurable. Its core features are:
-
-- Support iteration modes based on `EpochBasedRunner` and `IterBasedRunner` to cover different scenarios
-- Support customized workflows so that states can be switched freely during training; currently the train and val workflows are supported. A workflow can simply be understood as one complete training-and-validation iteration process.
-- Work with various default and custom hooks to provide flexible extensibility
-
-### EpochBasedRunner
-
-As the name suggests, `EpochBasedRunner` runs the workflow in units of epochs. For example, setting workflow = \[('train', 2), ('val', 1)\] means iteratively training for 2 epochs and then validating for 1 epoch. The MMDetection object detection framework uses `EpochBasedRunner` by default.
-
-Its abstract logic is as follows:
-
-```python
-# the condition to stop training
-while curr_epoch < max_epochs:
-    # traverse the user-defined workflow, e.g. workflow = [('train', 2), ('val', 1)]
-    for i, flow in enumerate(workflow):
-        # mode is the workflow function, e.g. train; epochs is the number of iterations
-        mode, epochs = flow
-        # either self.train() or self.val() is called
-        epoch_runner = getattr(self, mode)
-        # run the corresponding workflow function
-        for _ in range(epochs):
-            epoch_runner(data_loaders[i], **kwargs)
-```
-
-Currently the train and val workflows are supported. Taking the train function as an example, its abstract logic is:
-
-```python
-# epoch_runner can currently be train or val
-def train(self, data_loader, **kwargs):
-    # traverse the dataset, yielding one epoch of batch data
-    for i, data_batch in enumerate(data_loader):
-        self.call_hook('before_train_iter')
-        # train_mode=False during validation
-        self.run_iter(data_batch, train_mode=True, **kwargs)
-        self.call_hook('after_train_iter')
-    self.call_hook('after_train_epoch')
-```
-
-### IterBasedRunner
-
-Unlike `EpochBasedRunner`, `IterBasedRunner` runs the workflow in units of iterations. For example, setting workflow = \[('train', 2), ('val', 1)\] means iteratively training for 2 iters and then validating for 1 iter. The MMSegmentation semantic segmentation framework uses `IterBasedRunner` by default.
-
-Its abstract logic is as follows:
-
-```python
-# although the unit is iters, epoch info is needed in some cases; it is provided by IterLoader
-iter_loaders = [IterLoader(x) for x in data_loaders]
-# the condition to stop training
-while curr_iter < max_iters:
-    # traverse the user-defined workflow, e.g. workflow = [('train', 2), ('val', 1)]
-    for i, flow in enumerate(workflow):
-        # mode is the workflow function, e.g. train; iters is the number of iterations
-        mode, iters = flow
-        # either self.train() or self.val() is called
-        iter_runner = getattr(self, mode)
-        # run the corresponding workflow function
-        for _ in range(iters):
-            iter_runner(iter_loaders[i], **kwargs)
-```
-
-Currently the train and val workflows are supported. Taking the val function as an example, its abstract logic is:
-
-```python
-# iter_runner can currently be train or val
-def val(self, data_loader, **kwargs):
-    # fetch batch data for one iteration
-    data_batch = next(data_loader)
-    self.call_hook('before_val_iter')
-    outputs = self.model.val_step(data_batch, self.optimizer, **kwargs)
-    self.outputs = outputs
-    self.call_hook('after_val_iter')
-```
-
-In addition to the basic functionality above, `EpochBasedRunner` and `IterBasedRunner` also provide resume, save_checkpoint and hook-registration functionality.
-
-### A Simple Example
-
-Taking the most common classification task as an example, this section explains how to use the runner in detail. Starting any training task requires the following steps:
-
-**(1) Initialize the dataloader, model, optimizer, etc.**
-
-```python
-# initialize the model
-model = ...
-# initialize the optimizer; a typical value is
-# cfg.optimizer = dict(type='SGD', lr=0.1, momentum=0.9, weight_decay=0.0001)
-optimizer = build_optimizer(model, cfg.optimizer)
-# initialize the dataloaders corresponding to the workflows
-data_loaders = [
-    build_dataloader(
-        ds,
-        cfg.data.samples_per_gpu,
-        cfg.data.workers_per_gpu,
-        ...) for ds in dataset
-]
-```
-
-**(2) Initialize the runner**
-
-```python
-runner = build_runner(
-    # a typical cfg.runner configuration is
-    # runner = dict(type='EpochBasedRunner', max_epochs=200)
-    cfg.runner,
-    default_args=dict(
-        model=model,
-        batch_processor=None,
-        optimizer=optimizer,
-        logger=logger))
-```
-
-**(3) Register the default hooks required for training as well as custom hooks**
-
-```python
-# register the hooks required for training
-runner.register_training_hooks(
-    # lr config, typically
-    # lr_config = dict(policy='step', step=[100, 150])
-    cfg.lr_config,
-    # optimizer config, e.g. grad_clip
-    optimizer_config,
-    # checkpoint config, typically
-    # checkpoint_config = dict(interval=1), saving a checkpoint every unit
-    cfg.checkpoint_config,
-    # log config
-    cfg.log_config,
-    ...)
-
-# register custom hooks
-# e.g. to enable EMA, set custom_hooks=[dict(type='EMAHook')]
-if cfg.get('custom_hooks', None):
-    custom_hooks = cfg.custom_hooks
-    for hook_cfg in cfg.custom_hooks:
-        hook_cfg = hook_cfg.copy()
-        priority = hook_cfg.pop('priority', 'NORMAL')
-        hook = build_from_cfg(hook_cfg, HOOKS)
-        runner.register_hook(hook, priority=priority)
-```
-
-Then resume or load_checkpoint can be used to load existing weights.
-
-**(4) Start training**
-
-```python
-# workflow is typically workflow = [('train', 1)]
-# training actually starts here
-runner.run(data_loaders, cfg.workflow)
-```
-
-Regarding the workflow setting, taking `EpochBasedRunner` as an example:
-
-- To run only the training workflow, set workflow = \[('train', 1)\], which means only iterative training is performed
-- To run both the training and validation workflows, set workflow = \[('train', 3), ('val', 1)\], which first trains for 3 epochs, then switches to the val workflow for 1 epoch, and repeats until the number of training epochs reaches the specified value
-- The workflow can also be freely customized; for example, you can validate first and then train with workflow = \[('val', 1), ('train', 1)\]
-
-The code above is already encapsulated in train.py of each codebase; users only need to set the corresponding configuration and the above pipeline runs automatically.
diff --git a/examples/train.py b/examples/train.py
deleted file mode 100644
index b08d36bf62..0000000000
--- a/examples/train.py
+++ /dev/null
@@ -1,84 +0,0 @@
-import torch
-import torch.nn as nn
-import torch.nn.functional as F
-import torch.optim as optim
-import torchvision.transforms as transforms
-from torch.utils.data import DataLoader
-from torchvision.datasets import CIFAR10
-
-from mmcv.parallel import MMDataParallel
-from mmcv.runner import EpochBasedRunner
-from mmcv.utils import get_logger
-
-
-class Model(nn.Module):
-
-    def __init__(self):
-        super().__init__()
-        self.conv1 = nn.Conv2d(3, 6, 5)
-        self.pool = nn.MaxPool2d(2, 2)
-        self.conv2 = nn.Conv2d(6, 16, 5)
-        self.fc1 = nn.Linear(16 * 5 * 5, 120)
-        self.fc2 = nn.Linear(120, 84)
-        self.fc3 = nn.Linear(84, 10)
-        self.loss_fn = nn.CrossEntropyLoss()
-
-    def forward(self, x):
-        x = self.pool(F.relu(self.conv1(x)))
-        x = self.pool(F.relu(self.conv2(x)))
-        x = x.view(-1, 16 * 5 * 5)
-        x = F.relu(self.fc1(x))
-        x = F.relu(self.fc2(x))
-        x = self.fc3(x)
-        return x
-
-    def train_step(self, data, optimizer):
-        images, labels = data
-        predicts = self(images)  # -> self.__call__() -> self.forward()
-        loss = self.loss_fn(predicts, labels)
-        return {'loss': loss}
-
-
-if __name__ == '__main__':
-    model = Model()
-    if torch.cuda.is_available():
-        # only use gpu:0 to train
-        # Solved issue https://github.com/open-mmlab/mmcv/issues/1470
-        model = MMDataParallel(model.cuda(), device_ids=[0])
-
-    # dataset and dataloader
-    transform = transforms.Compose([
-        transforms.ToTensor(),
-        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
-    ])
-    trainset = CIFAR10(
-        root='data', train=True, download=True, transform=transform)
-    trainloader = DataLoader(
-        trainset, batch_size=128, shuffle=True, num_workers=2)
-
-    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
-    logger = get_logger('mmcv')
-    # the runner is a scheduler that manages the training
-    runner = EpochBasedRunner(
-        model,
-        optimizer=optimizer,
-        work_dir='./work_dir',
-        logger=logger,
-        max_epochs=4)
-
-    # learning rate scheduler config
-    lr_config = dict(policy='step', step=[2, 3])
-    # configuration of the optimizer
-    optimizer_config = dict(grad_clip=None)
-    # configuration for saving checkpoints periodically
-    checkpoint_config = dict(interval=1)
-    # save logs periodically; multiple hooks can be used simultaneously
-    log_config = dict(interval=100, hooks=[dict(type='TextLoggerHook')])
-    # register hooks to the runner; they will be invoked automatically
-    runner.register_training_hooks(
-        lr_config=lr_config,
-        optimizer_config=optimizer_config,
-        checkpoint_config=checkpoint_config,
-        log_config=log_config)
-
-    runner.run([trainloader], [('train', 1)])
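With `EpochBasedRunner` and `MMDataParallel` removed and nothing added in-tree, the training entry point this example demonstrated now lives in mmengine. A rough equivalent, assuming mmengine's documented `Runner` keys (`optim_wrapper`, `param_scheduler`, `train_cfg`, `default_hooks`); `MyModel` is a hypothetical `BaseModel` subclass and the dataloader dict is illustrative, not a drop-in replacement:

```python
from mmengine.runner import Runner

# Sketch of the deleted examples/train.py rewritten against mmengine.
# MyModel must implement mmengine.model.BaseModel and is assumed to exist.
runner = Runner(
    model=MyModel(),
    work_dir='./work_dir',
    train_dataloader=dict(
        dataset=dict(type='CIFAR10', root='data', train=True),
        sampler=dict(type='DefaultSampler', shuffle=True),
        batch_size=128,
        num_workers=2),
    optim_wrapper=dict(optimizer=dict(type='SGD', lr=0.001, momentum=0.9)),
    param_scheduler=dict(type='MultiStepLR', milestones=[2, 3]),
    train_cfg=dict(by_epoch=True, max_epochs=4),
    default_hooks=dict(checkpoint=dict(type='CheckpointHook', interval=1)))
runner.train()
```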