import numpy as np
import torch

from mmdet.core.bbox import BaseBBoxCoder
from mmdet.core.bbox.builder import BBOX_CODERS
from ..structures import limit_period


@BBOX_CODERS.register_module()
class FCOS3DBBoxCoder(BaseBBoxCoder):
    """Bounding box coder for FCOS3D.

    The coder turns the raw regression map of one feature level into
    decoded box attributes. Channels ``0:2`` are treated as offsets,
    channel ``2`` as depth, channels ``3:6`` as sizes and channel ``6`` as
    yaw (see :meth:`decode` and :meth:`decode_yaw`).

    Args:
        base_depths (tuple[tuple[float]]): Depth references for decode box
            depth. Every entry is a ``(mean, std)`` pair. A single entry is
            shared by all categories; several entries are interpreted as
            one pair per category. Defaults to None.
        base_dims (tuple[tuple[float]]): Dimension references for decode box
            dimension, one entry per category. Defaults to None.
        code_size (int): The dimension of boxes to be encoded. It is stored
            as ``bbox_code_size``. Defaults to 7.
        norm_on_bbox (bool): Whether to apply normalization on the bounding
            box 2D attributes. Defaults to True.
    """

    def __init__(self,
                 base_depths=None,
                 base_dims=None,
                 code_size=7,
                 norm_on_bbox=True):
        super(FCOS3DBBoxCoder, self).__init__()
        self.base_depths = base_depths
        self.base_dims = base_dims
        self.bbox_code_size = code_size
        self.norm_on_bbox = norm_on_bbox

    def encode(self, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels):
        """Placeholder for target encoding.

        This method is intentionally a no-op and returns None. The TODO
        below indicates that the encoding logic has not been moved out of
        the FCOS3D and PGD heads yet.
        """
        # TODO: refactor the encoder in the FCOS3D and PGD head
        pass

    @staticmethod
    def _gather_class_priors(priors, cls_score, mismatch_msg):
        """Pick, for every location, the prior of its top-scoring class.

        Args:
            priors (tuple[tuple[float]]): One prior vector per category.
            cls_score (torch.Tensor): Classification score map in shape
                [N, num_classes, H, W].
            mismatch_msg (str): Assertion message used when the number of
                priors differs from the number of categories.

        Returns:
            torch.Tensor: Selected priors in shape [N, K, H, W], where K is
                the length of a single prior vector.
        """
        # ``cls_score`` is optional in ``decode``; fail with a clear message
        # instead of an AttributeError on ``None.shape``.
        assert cls_score is not None, \
            'cls_score is required to select class-wise priors.'
        assert len(priors) == cls_score.shape[1], mismatch_msg
        indices = cls_score.max(dim=1)[1]
        return cls_score.new_tensor(priors)[indices, :].permute(0, 3, 1, 2)

    def decode(self, bbox, scale, stride, training, cls_score=None):
        """Decode regressed results into 3D predictions.

        Note that offsets are not transformed to the projected 3D centers.
        Also note that ``bbox`` is modified in place and the very same
        tensor is returned.

        Args:
            bbox (torch.Tensor): Raw bounding box predictions in shape
                [N, C, H, W].
            scale (Sequence[`Scale`]): Learnable scale modules of one
                feature level. Only the first three are used, for offset,
                depth and size respectively.
            stride (int): Stride for a specific feature level.
            training (bool): Whether the decoding is in the training
                procedure.
            cls_score (torch.Tensor): Classification score map for deciding
                which base depth or dim is used. It is required whenever
                class-wise priors are configured. Defaults to None.

        Returns:
            torch.Tensor: Decoded boxes.
        """
        # scale the bbox of different level
        # only apply to offset, depth and size prediction
        scale_offset, scale_depth, scale_size = scale[0:3]

        # read from a copy so that the in-place writes below never feed a
        # partially updated tensor into the scale modules
        clone_bbox = bbox.clone()
        bbox[:, :2] = scale_offset(clone_bbox[:, :2]).float()
        bbox[:, 2] = scale_depth(clone_bbox[:, 2]).float()
        bbox[:, 3:6] = scale_size(clone_bbox[:, 3:6]).float()

        # depth: exponential by default, otherwise mean + pred * std
        if self.base_depths is None:
            bbox[:, 2] = bbox[:, 2].exp()
        else:
            if len(self.base_depths) == 1:  # only single prior
                mean, std = self.base_depths[0][0], self.base_depths[0][1]
            else:  # multi-class priors
                depth_priors = self._gather_class_priors(
                    self.base_depths, cls_score,
                    'The number of multi-class depth priors should be '
                    'equal to the number of categories.')
                mean = depth_priors[:, 0]
                std = depth_priors[:, 1]
            # cloning only the depth channel is enough to decouple the
            # right-hand side from the tensor that is written in place
            bbox[:, 2] = mean + bbox[:, 2].clone() * std

        # size: exponential, optionally multiplied by the class-wise prior
        bbox[:, 3:6] = bbox[:, 3:6].exp()
        if self.base_dims is not None:
            size_priors = self._gather_class_priors(
                self.base_dims, cls_score,
                'The number of anchor sizes should be equal to the number '
                'of categories.')
            bbox[:, 3:6] = size_priors * bbox[:, 3:6].clone()

        assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\
            'has not been thoroughly tested for FCOS3D.'
        if self.norm_on_bbox and not training:
            # Note that this line is conducted only when testing
            bbox[:, :2] *= stride

        return bbox

    @staticmethod
    def decode_yaw(bbox, centers2d, dir_cls, dir_offset, cam2img):
        """Decode yaw angle and change it from local to global.

        ``bbox`` is modified in place and returned.

        Args:
            bbox (torch.Tensor): Bounding box predictions in shape
                [N, C] with yaws to be decoded.
            centers2d (torch.Tensor): Projected 3D-center on the image planes
                corresponding to the box predictions.
            dir_cls (torch.Tensor): Predicted direction classes.
            dir_offset (float): Direction offset before dividing all the
                directions into several classes.
            cam2img (torch.Tensor): Camera intrinsic matrix in shape [4, 4].

        Returns:
            torch.Tensor: Bounding boxes with decoded yaws.
        """
        if bbox.shape[0] > 0:
            # wrap the regressed angle into one period of length pi and
            # use the direction class to select the half it belongs to
            dir_rot = limit_period(bbox[..., 6] - dir_offset, 0, np.pi)
            bbox[..., 6] = \
                dir_rot + dir_offset + np.pi * dir_cls.to(bbox.dtype)

        # local -> global: add the angle derived from the horizontal pixel
        # position of the projected center and the intrinsics
        # (cam2img[0, 2] and cam2img[0, 0])
        bbox[:, 6] = torch.atan2(centers2d[:, 0] - cam2img[0, 2],
                                 cam2img[0, 0]) + bbox[:, 6]

        return bbox
for offset, depth and size regression + nn.ModuleList([Scale(1.0) for _ in range(self.scale_dim)]) + for _ in self.strides + ]) def forward(self, feats): """Forward features from the upstream network. @@ -138,8 +144,9 @@ def forward(self, feats): centernesses (list[Tensor]): Centerness for each scale level, each is a 4D-tensor, the channel number is num_points * 1. """ + # Note: we use [:5] to filter feats and only return predictions return multi_apply(self.forward_single, feats, self.scales, - self.strides) + self.strides)[:5] def forward_single(self, x, scale, stride): """Forward features of a single scale levle. @@ -169,26 +176,12 @@ def forward_single(self, x, scale, stride): for conv_centerness_prev_layer in self.conv_centerness_prev: clone_cls_feat = conv_centerness_prev_layer(clone_cls_feat) centerness = self.conv_centerness(clone_cls_feat) - # scale the bbox_pred of different level - # only apply to offset, depth and size prediction - scale_offset, scale_depth, scale_size = scale[0:3] - clone_bbox_pred = bbox_pred.clone() - bbox_pred[:, :2] = scale_offset(clone_bbox_pred[:, :2]).float() - bbox_pred[:, 2] = scale_depth(clone_bbox_pred[:, 2]).float() - bbox_pred[:, 3:6] = scale_size(clone_bbox_pred[:, 3:6]).float() + bbox_pred = self.bbox_coder.decode(bbox_pred, scale, stride, + self.training, cls_score) - bbox_pred[:, 2] = bbox_pred[:, 2].exp() - bbox_pred[:, 3:6] = bbox_pred[:, 3:6].exp() + 1e-6 # avoid size=0 - - assert self.norm_on_bbox is True, 'Setting norm_on_bbox to False '\ - 'has not been thoroughly tested for FCOS3D.' 
- if self.norm_on_bbox: - if not self.training: - # Note that this line is conducted only when testing - bbox_pred[:, :2] *= stride - - return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness + return cls_score, bbox_pred, dir_cls_pred, attr_pred, centerness, \ + cls_feat, reg_feat @staticmethod def add_sin_difference(boxes1, boxes2): @@ -652,19 +645,13 @@ def _get_bboxes_single(self, mlvl_dir_scores = torch.cat(mlvl_dir_scores) # change local yaw to global yaw for 3D nms - if mlvl_bboxes.shape[0] > 0: - dir_rot = limit_period(mlvl_bboxes[..., 6] - self.dir_offset, 0, - np.pi) - mlvl_bboxes[..., 6] = ( - dir_rot + self.dir_offset + - np.pi * mlvl_dir_scores.to(mlvl_bboxes.dtype)) - - cam_intrinsic = mlvl_centers2d.new_zeros((4, 4)) - cam_intrinsic[:view.shape[0], :view.shape[1]] = \ + cam2img = mlvl_centers2d.new_zeros((4, 4)) + cam2img[:view.shape[0], :view.shape[1]] = \ mlvl_centers2d.new_tensor(view) - mlvl_bboxes[:, 6] = torch.atan2( - mlvl_centers2d[:, 0] - cam_intrinsic[0, 2], - cam_intrinsic[0, 0]) + mlvl_bboxes[:, 6] + mlvl_bboxes = self.bbox_coder.decode_yaw(mlvl_bboxes, mlvl_centers2d, + mlvl_dir_scores, + self.dir_offset, cam2img) + mlvl_bboxes_for_nms = xywhr2xyxyr(input_meta['box_type_3d']( mlvl_bboxes, box_dim=self.bbox_code_size, origin=(0.5, 0.5, 0.5)).bev) diff --git a/tests/test_utils/test_bbox_coders.py b/tests/test_utils/test_bbox_coders.py index 1e9e236d8..385f27609 100644 --- a/tests/test_utils/test_bbox_coders.py +++ b/tests/test_utils/test_bbox_coders.py @@ -1,5 +1,7 @@ # Copyright (c) OpenMMLab. All rights reserved. 
def test_fcos3d_bbox_coder():
    """Check ``FCOS3DBBoxCoder.decode`` and ``decode_yaw`` on fixed inputs.

    The same raw predictions are decoded once without priors and once with
    class-wise depth / dimension priors; the prior-based result is then fed
    into ``decode_yaw``.
    """

    def make_raw_preds():
        # ``decode`` writes into its input, so every case gets a fresh
        # tensor; shape is [2, 7, 1, 1]
        rows = [[0.3130, 0.7094, 0.8743, 0.0570, 0.5579, 0.1593, 0.4553],
                [0.7758, 0.2298, 0.3925, 0.6307, 0.4377, 0.3339, 0.1966]]
        return torch.tensor(rows).view(2, 7, 1, 1)

    def make_scales():
        # one identity-initialised scale each for offset, depth and size
        return nn.ModuleList([Scale(1.0) for _ in range(3)])

    # ---- config without priors ----
    plain_coder = build_bbox_coder({
        'type': 'FCOS3DBBoxCoder',
        'base_depths': None,
        'base_dims': None,
        'code_size': 7,
        'norm_on_bbox': True,
    })

    raw = make_raw_preds()
    scales = make_scales()
    scores = torch.randn([2, 2, 1, 1]).sigmoid()
    decoded = plain_coder.decode(raw, scales, 2, False, scores)

    expected = torch.tensor(
        [[0.6261, 1.4188, 2.3971, 1.0586, 1.7470, 1.1727, 0.4553],
         [1.5516, 0.4596, 1.4806, 1.8790, 1.5492, 1.3965,
          0.1966]]).view(2, 7, 1, 1)
    assert torch.allclose(decoded, expected, atol=1e-3)

    # ---- config with priors ----
    prior_coder = build_bbox_coder({
        'type': 'FCOS3DBBoxCoder',
        'base_depths': ((28., 13.), (25., 12.)),
        'base_dims': ((2., 3., 1.), (1., 2., 3.)),
        'code_size': 7,
        'norm_on_bbox': True,
    })

    raw = make_raw_preds()
    scales = make_scales()
    scores = torch.tensor([[0.5811, 0.6198],
                           [0.4889, 0.8142]]).view(2, 2, 1, 1)
    decoded = prior_coder.decode(raw, scales, 2, False, scores)

    expected = torch.tensor(
        [[0.6260, 1.4188, 35.4916, 1.0587, 3.4940, 3.5181, 0.4553],
         [1.5516, 0.4596, 29.7100, 1.8789, 3.0983, 4.1892,
          0.1966]]).view(2, 7, 1, 1)
    assert torch.allclose(decoded, expected, atol=1e-3)

    # ---- decode_yaw on the prior-based result ----
    flat_boxes = decoded.permute(0, 2, 3, 1).view(-1, 7)
    centers2d = torch.tensor([[100., 150.], [200., 100.]])
    dir_cls = torch.tensor([0., 1.])
    intrinsics = torch.tensor([[700., 0., 450., 0.],
                               [0., 700., 200., 0.],
                               [0., 0., 1., 0.],
                               [0., 0., 0., 1.]])
    with_yaw = prior_coder.decode_yaw(flat_boxes, centers2d, dir_cls, 0.7854,
                                      intrinsics)

    expected = torch.tensor(
        [[0.6260, 1.4188, 35.4916, 1.0587, 3.4940, 3.5181, 3.1332],
         [1.5516, 0.4596, 29.7100, 1.8789, 3.0983, 4.1892, 6.1368]])
    assert torch.allclose(with_yaw, expected, atol=1e-3)