From b96c22511508a05556c2650d42011731ab78594f Mon Sep 17 00:00:00 2001
From: Zhiqiang Wang
Date: Sat, 5 Mar 2022 15:43:57 +0800
Subject: [PATCH] Construct YOLOv5 models with TorchVision MobileNetV3 backbone
 (#342)

* Add yolov5n and yolov5lite for training with ultralytics

* Apply pre-commit

* Minor fixes for docstrings

* Use frcnn layout

* Fix configurations for mobilenetv3 with yolov5

* Apply pre-commit

* Move yolov5lite into yolort.models

* Minor fixes

* Apply pre-commit

* Change to mobilenet_v3_small
---
 yolort/models/box_head.py  |   2 +
 yolort/models/yolo_lite.py | 160 +++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+)
 create mode 100644 yolort/models/yolo_lite.py

diff --git a/yolort/models/box_head.py b/yolort/models/box_head.py
index 18506a75..4eb6f73b 100644
--- a/yolort/models/box_head.py
+++ b/yolort/models/box_head.py
@@ -19,6 +19,8 @@ def __init__(
         num_classes: int,
     ):
         super().__init__()
+        if not isinstance(in_channels, list):
+            in_channels = [in_channels] * num_anchors
         self.num_anchors = num_anchors  # anchors
         self.num_classes = num_classes
         self.num_outputs = num_classes + 5  # number of outputs per anchor

diff --git a/yolort/models/yolo_lite.py b/yolort/models/yolo_lite.py
new file mode 100644
index 00000000..0d3b15bf
--- /dev/null
+++ b/yolort/models/yolo_lite.py
@@ -0,0 +1,160 @@
+from torch import nn
+from torchvision.models import mobilenet
+from torchvision.models._utils import IntermediateLayerGetter
+from torchvision.models.detection.backbone_utils import _validate_trainable_layers
+from torchvision.ops import misc as misc_nn_ops
+from torchvision.ops.feature_pyramid_network import FeaturePyramidNetwork, LastLevelMaxPool
+
+from .yolo import YOLO
+
+__all__ = ["yolov5_mobilenet_v3_small_fpn"]
+
+
+class BackboneWithFPN(nn.Module):
+    """
+    Adds a FPN on top of a model.
+    Internally, it uses torchvision.models._utils.IntermediateLayerGetter to
+    extract a submodel that returns the feature maps specified in return_layers.
+    The same limitations of IntermediateLayerGetter apply here.
+
+    Args:
+        backbone (nn.Module)
+        return_layers (Dict[name, new_name]): a dict containing the names
+            of the modules for which the activations will be returned as
+            the key of the dict, and the value of the dict is the name
+            of the returned activation (which the user can specify).
+        in_channels_list (List[int]): number of channels for each feature map
+            that is returned, in the order they are present in the OrderedDict
+        out_channels (int): number of channels in the FPN.
+
+    Attributes:
+        out_channels (int): the number of channels in the FPN
+    """
+
+    def __init__(self, backbone, return_layers, in_channels_list, out_channels, extra_blocks=None):
+        super().__init__()
+
+        if extra_blocks is None:
+            extra_blocks = LastLevelMaxPool()
+
+        self.body = IntermediateLayerGetter(backbone, return_layers=return_layers)
+        self.fpn = FeaturePyramidNetwork(
+            in_channels_list=in_channels_list,
+            out_channels=out_channels,
+            extra_blocks=extra_blocks,
+        )
+        self.out_channels = out_channels
+
+    def forward(self, x):
+        x = self.body(x)
+        x = self.fpn(x)
+
+        return list(x.values())  # unpack the OrderedDict into a list for easier handling
+
+
+def mobilenet_backbone(
+    backbone_name,
+    pretrained,
+    norm_layer=misc_nn_ops.FrozenBatchNorm2d,
+    trainable_layers=2,
+    returned_layers=None,
+):
+    backbone = mobilenet.__dict__[backbone_name](pretrained=pretrained, norm_layer=norm_layer).features
+
+    # Gather the indices of blocks which are strided. These are the locations of C1, ..., Cn-1 blocks.
+    # The first and last blocks are always included because they are the C0 (conv1) and Cn.
+    stage_indices = (
+        [0] + [i for i, b in enumerate(backbone) if getattr(b, "_is_cn", False)] + [len(backbone) - 1]
+    )
+    num_stages = len(stage_indices)
+
+    # find the index of the layer from which we won't freeze
+    assert 0 <= trainable_layers <= num_stages
+    freeze_before = len(backbone) if trainable_layers == 0 else stage_indices[num_stages - trainable_layers]
+
+    for b in backbone[:freeze_before]:
+        for parameter in b.parameters():
+            parameter.requires_grad_(False)
+
+    out_channels = 256
+
+    if returned_layers is None:
+        returned_layers = [num_stages - 2, num_stages - 1]
+    assert min(returned_layers) >= 0 and max(returned_layers) < num_stages
+    return_layers = {f"{stage_indices[k]}": str(v) for v, k in enumerate(returned_layers)}
+
+    in_channels_list = [backbone[stage_indices[i]].out_channels for i in returned_layers]
+    return BackboneWithFPN(
+        backbone,
+        return_layers,
+        in_channels_list,
+        out_channels,
+        extra_blocks=LastLevelMaxPool(),
+    )
+
+
+def _yolov5_mobilenet_v3_small_fpn(
+    weights_name,
+    pretrained=False,
+    progress=True,
+    num_classes=80,
+    pretrained_backbone=True,
+    trainable_backbone_layers=None,
+    **kwargs,
+):
+    trainable_backbone_layers = _validate_trainable_layers(
+        pretrained or pretrained_backbone, trainable_backbone_layers, 6, 3
+    )
+
+    if pretrained:
+        pretrained_backbone = False
+    backbone = mobilenet_backbone(
+        "mobilenet_v3_small",
+        pretrained_backbone,
+        trainable_layers=trainable_backbone_layers,
+    )
+
+    model = YOLO(backbone, num_classes, **kwargs)
+
+    return model
+
+
+def yolov5_mobilenet_v3_small_fpn(
+    pretrained=False,
+    progress=True,
+    num_classes=80,
+    pretrained_backbone=True,
+    trainable_backbone_layers=None,
+    **kwargs,
+):
+    """
+    Constructs a high resolution YOLOv5 model with a MobileNetV3-Small FPN backbone.
+    It works similarly to Faster R-CNN with ResNet-50 FPN backbone. See
+    :func:`~torchvision.models.detection.fasterrcnn_resnet50_fpn` for more
+    details.
+
+    Note:
+        We do not provide a pre-trained model with mobilenet as the backbone now; this function
+        is just used as an example of how to construct a YOLOv5 model with TorchVision's pre-trained
+        MobileNetV3-Small FPN backbone.
+
+    Args:
+        pretrained (bool): If True, returns a model pre-trained on COCO train2017
+        progress (bool): If True, displays a progress bar of the download to stderr
+        num_classes (int): number of output classes of the model (including the background)
+        pretrained_backbone (bool): If True, returns a model with backbone pre-trained on Imagenet
+        trainable_backbone_layers (int): number of trainable (not frozen) backbone layers starting
+            from final block. Valid values are between 0 and 6, with 6 meaning all backbone layers
+            are trainable.
+    """
+    weights_name = "yolov5_mobilenet_v3_small_fpn_coco"
+
+    return _yolov5_mobilenet_v3_small_fpn(
+        weights_name,
+        pretrained=pretrained,
+        progress=progress,
+        num_classes=num_classes,
+        pretrained_backbone=pretrained_backbone,
+        trainable_backbone_layers=trainable_backbone_layers,
+        **kwargs,
+    )