From 07846673bd25a3859c4ff8503e3bda764845f6e6 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 17 Mar 2023 17:55:45 +0800 Subject: [PATCH 01/29] support train on nus --- projects/BEVFusion/bevfusion/__init__.py | 4 +- projects/BEVFusion/bevfusion/bevfusion.py | 11 ++- projects/BEVFusion/bevfusion/depth_lss.py | 1 + projects/BEVFusion/bevfusion/transforms_3d.py | 34 +++++++++ projects/BEVFusion/bevfusion/utils.py | 53 ++++++++++++- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 75 ++++++++++--------- 6 files changed, 137 insertions(+), 41 deletions(-) diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py index c36fc641c8..faf7fa2d9a 100644 --- a/projects/BEVFusion/bevfusion/__init__.py +++ b/projects/BEVFusion/bevfusion/__init__.py @@ -4,7 +4,7 @@ from .loading import BEVLoadMultiViewImageFromFiles from .sparse_encoder import BEVFusionSparseEncoder from .transformer import TransformerDecoderLayer -from .transforms_3d import GridMask, ImageAug3D +from .transforms_3d import BEVFusionRandomFlip3D, GridMask, ImageAug3D from .transfusion_head import ConvFuser, TransFusionHead from .utils import (BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D, IoU3DCost) @@ -14,5 +14,5 @@ 'GeneralizedLSSFPN', 'HungarianAssigner3D', 'BBoxBEVL1Cost', 'IoU3DCost', 'HeuristicAssigner3D', 'DepthLSSTransform', 'BEVLoadMultiViewImageFromFiles', 'BEVFusionSparseEncoder', - 'TransformerDecoderLayer' + 'TransformerDecoderLayer', 'BEVFusionRandomFlip3D' ] diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index a823528207..e13e4b6c19 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -239,4 +239,13 @@ def extract_feat( def loss(self, batch_inputs_dict: Dict[str, Optional[Tensor]], batch_data_samples: List[Det3DDataSample], **kwargs) -> List[Det3DDataSample]: - pass + batch_input_metas = [item.metainfo for item in batch_data_samples] + feats = self.extract_feat(batch_inputs_dict, batch_input_metas) + + losses = dict() + if self.with_bbox_head: + bbox_loss = self.bbox_head.loss(feats, batch_data_samples) + + losses.update(bbox_loss) + + return losses diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index f336c9289b..072375dd37 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -241,6 +241,7 @@ def forward( for c in range(on_img.shape[0]): masked_coords = cur_coords[c, on_img[c]].long() masked_dist = dist[c, on_img[c]] + depth = depth.to(masked_dist.dtype) depth[b, c, 0, masked_coords[:, 0], masked_coords[:, 1]] = masked_dist diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index 35116f591f..1941f21142 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -1,4 +1,5 @@ # modify from https://github.com/mit-han-lab/bevfusion +import random from typing import Any, Dict import numpy as np @@ -107,6 +108,39 @@ def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: return data +@TRANSFORMS.register_module() +class BEVFusionRandomFlip3D: + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + flip_horizontal = random.choice([0, 1]) + flip_vertical = random.choice([0, 1]) + + rotation = np.eye(3) + if flip_horizontal: + rotation = np.array([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) @ rotation + if 'points' in data: + 
data['points'].flip('horizontal')
+            if 'gt_bboxes_3d' in data:
+                data['gt_bboxes_3d'].flip('horizontal')
+            if 'gt_masks_bev' in data:
+                data['gt_masks_bev'] = data['gt_masks_bev'][:, :, ::-1].copy()
+
+        if flip_vertical:
+            rotation = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) @ rotation
+            if 'points' in data:
+                data['points'].flip('vertical')
+            if 'gt_bboxes_3d' in data:
+                data['gt_bboxes_3d'].flip('vertical')
+            if 'gt_masks_bev' in data:
+                data['gt_masks_bev'] = data['gt_masks_bev'][:, ::-1, :].copy()
+
+        if 'lidar_aug_matrix' not in data:
+            data['lidar_aug_matrix'] = np.eye(4)
+        data['lidar_aug_matrix'][:3, :] = rotation @ data[
+            'lidar_aug_matrix'][:3, :]
+        return data
+
+
 @TRANSFORMS.register_module()
 class GridMask(BaseTransform):
 
diff --git a/projects/BEVFusion/bevfusion/utils.py b/projects/BEVFusion/bevfusion/utils.py
index 0ce5472615..66847df5b1 100644
--- a/projects/BEVFusion/bevfusion/utils.py
+++ b/projects/BEVFusion/bevfusion/utils.py
@@ -1,5 +1,9 @@
 # modify from https://github.com/mit-han-lab/bevfusion
+from collections import abc
+
+import numpy as np
 import torch
+import torch.nn as nn
 from mmdet.models.task_modules import AssignResult, BaseAssigner, BaseBBoxCoder
 
 try:
@@ -7,6 +11,8 @@
 except ImportError:
     linear_sum_assignment = None
 
+from mmengine.structures import InstanceData
+
 from mmdet3d.registry import TASK_UTILS
 
 
@@ -273,8 +279,11 @@ def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg):
                 num_gts, assigned_gt_inds, None, labels=assigned_labels)
 
         # 2. compute the weighted costs
-        # see mmdetection/mmdet/core/bbox/match_costs/match_cost.py
-        cls_cost = self.cls_cost(cls_pred[0].T, gt_labels)
+        # Hard code here to be compatible with the interface of
+        # `ClassificationCost` in mmdet.
+        gt_instances, pred_instances = InstanceData(
+            labels=gt_labels), InstanceData(scores=cls_pred[0].T)
+        cls_cost = self.cls_cost(pred_instances, gt_instances)
         reg_cost = self.reg_cost(bboxes, gt_bboxes, train_cfg)
         iou = self.iou_calculator(bboxes, gt_bboxes)
         iou_cost = self.iou_cost(iou)
@@ -304,3 +313,43 @@ def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg):
             # max_overlaps = iou.max(1).values
         return AssignResult(
             num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+
+def cast_tensor_type(inputs, src_type: torch.dtype, dst_type: torch.dtype):
+    """Recursively convert Tensor in inputs from src_type to dst_type.
+
+    Note:
+        In v1.4.4 and later, ``cast_tensor_type`` will only convert the
+        torch.Tensor which is consistent with ``src_type`` to the ``dst_type``.
+        Before v1.4.4, it ignores the ``src_type`` argument, leading to some
+        potential problems. For example,
+        ``cast_tensor_type(inputs, torch.float, torch.half)`` will convert all
+        tensors in inputs to ``torch.half`` including those originally in
+        ``torch.int`` or other types, which is not expected.
+    Args:
+        inputs: Inputs to be cast.
+        src_type (torch.dtype): Source type.
+        dst_type (torch.dtype): Destination type.
+    Returns:
+        The same type as inputs, but all contained Tensors have been cast.
+    """
+    if isinstance(inputs, nn.Module):
+        return inputs
+    elif isinstance(inputs, torch.Tensor):
+        # we need to ensure that the type of the inputs to be cast is the same
+        # as the argument `src_type`.
+ return inputs.to(dst_type) if inputs.dtype == src_type else inputs + elif isinstance(inputs, str): + return inputs + elif isinstance(inputs, np.ndarray): + return inputs + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ # type: ignore + k: cast_tensor_type(v, src_type, dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( # type: ignore + cast_tensor_type(item, src_type, dst_type) for item in inputs) + else: + return diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 8f12892372..865952342c 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -14,9 +14,9 @@ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] -metainfo = dict(classes=class_names) +metainfo = dict(classes=class_names, version='v1.0-mini') dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' +data_root = 'data/nuscenes_mini/' data_prefix = dict( pts='samples/LIDAR_TOP', CAM_FRONT='samples/CAM_FRONT', @@ -194,24 +194,22 @@ filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), classes=class_names, sample_groups=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5), + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], - reduce_beams=32, - backend_args=backend_args), - backend_args=backend_args) + backend_args=backend_args)) train_pipeline = [ dict( @@ -224,18 +222,14 @@ coord_type='LIDAR', load_dim=5, use_dim=5, - reduce_beams=32, - load_augmented=None, backend_args=backend_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, load_dim=5, use_dim=5, - reduce_beams=32, pad_empty_sweeps=True, remove_close=True, - load_augmented=None, backend_args=backend_args), dict( type='LoadAnnotations3D', @@ -253,11 +247,10 @@ is_train=True), dict( type='GlobalRotScaleTrans', - resize_lim=[0.9, 1.1], - rot_lim=[-0.78539816, 0.78539816], - trans_lim=0.5, - is_train=True), - dict(type='RandomFlip3D'), + scale_ratio_range=[0.9, 1.1], + rot_range=[-0.78539816, 0.78539816], + translation_std=0.5), + dict(type='BEVFusionRandomFlip3D'), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict( @@ -283,6 +276,13 @@ keys=[ 'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels' + ], + meta_keys=[ + 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', + 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', + 'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix', + 'lidar_aug_matrix' ]) ] @@ -333,18 +333,19 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='nuscenes_infos_train.pkl', - pipeline=train_pipeline, - metainfo=metainfo, - modality=input_modality, - test_mode=False, - data_prefix=data_prefix, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and 
scannet dataset. - box_type_3d='LiDAR', - backend_args=backend_args)) + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'))) val_dataloader = dict( batch_size=1, num_workers=0, @@ -428,3 +429,5 @@ default_hooks = dict( logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=5)) + +load_from = 'checkpoints/bevfusion_init_converted.pth' From 2dbf063632647b27fd02223d472da4c4be62e52e Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 20 Mar 2023 11:38:11 +0800 Subject: [PATCH 02/29] refactor transfusion head --- .../BEVFusion/bevfusion/transfusion_head.py | 89 +++++++++++++------ ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 6 +- 2 files changed, 64 insertions(+), 31 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index 59dbdf891f..f4e918ae68 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -1,6 +1,6 @@ # modify from https://github.com/mit-han-lab/bevfusion import copy -from typing import List +from typing import List, Tuple import numpy as np import torch @@ -498,12 +498,23 @@ def predict_by_feat(self, return rets[0] - def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_dict): + def get_targets(self, batch_gt_instances_3d: List[InstanceData], + preds_dict: List[dict]): """Generate training targets. Args: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. - gt_labels_3d (torch.Tensor): Labels of boxes. - preds_dicts (tuple of dict): first index by layer (default 1) + batch_gt_instances_3d (List[InstanceData]): + preds_dict (list[dict]): The prediction results. The index of the + list is the index of layers. The inner dict contains + predictions of one mini-batch: + - center: (bs, 2, num_proposals) + - height: (bs, 1, num_proposals) + - dim: (bs, 3, num_proposals) + - rot: (bs, 2, num_proposals) + - vel: (bs, 2, num_proposals) + - cls_logit: (bs, num_classes, num_proposals) + - query_score: (bs, num_classes, num_proposals) + - heatmap: The original heatmap before fed into transformer + decoder, with shape (bs, 10, h, w) Returns: tuple[torch.Tensor]: Tuple of target including \ the following results in order. 
@@ -516,20 +527,23 @@ def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_dict): # change preds_dict into list of dict (index by batch_id) # preds_dict[0]['center'].shape [bs, 3, num_proposal] list_of_pred_dict = [] - for batch_idx in range(len(gt_bboxes_3d)): + for batch_idx in range(len(batch_gt_instances_3d)): pred_dict = {} for key in preds_dict[0].keys(): - pred_dict[key] = preds_dict[0][key][batch_idx:batch_idx + 1] + preds = [] + for i in range(self.num_decoder_layers): + pred_one_layer = preds_dict[i][key][batch_idx:batch_idx + + 1] + preds.append(pred_one_layer) + pred_dict[key] = torch.cat(preds) list_of_pred_dict.append(pred_dict) - assert len(gt_bboxes_3d) == len(list_of_pred_dict) - + assert len(batch_gt_instances_3d) == len(list_of_pred_dict) res_tuple = multi_apply( self.get_targets_single, - gt_bboxes_3d, - gt_labels_3d, + batch_gt_instances_3d, list_of_pred_dict, - np.arange(len(gt_labels_3d)), + np.arange(len(batch_gt_instances_3d)), ) labels = torch.cat(res_tuple[0], dim=0) label_weights = torch.cat(res_tuple[1], dim=0) @@ -550,23 +564,26 @@ def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_dict): heatmap, ) - def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, preds_dict, - batch_idx): + def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx): """Generate training targets for a single sample. Args: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. - gt_labels_3d (torch.Tensor): Labels of boxes. - preds_dict (dict): dict of prediction result for a single sample + gt_instances_3d (:obj:`InstanceData`): ground truth of instances. + preds_dict (dict): dict of prediction result for a single sample. Returns: tuple[torch.Tensor]: Tuple of target including \ the following results in order. - torch.Tensor: classification target. [1, num_proposals] - - torch.Tensor: classification weights (mask) [1, num_proposals] # noqa: E501 + - torch.Tensor: classification weights (mask) [1, + num_proposals] # noqa: E501 - torch.Tensor: regression target. [1, num_proposals, 8] - torch.Tensor: regression weights. [1, num_proposals, 8] - torch.Tensor: iou target. [1, num_proposals] - int: number of positive proposals + - torch.Tensor: heatmap targets. """ + # 1. Assignment + gt_bboxes_3d = gt_instances_3d.bboxes_3d + gt_labels_3d = gt_instances_3d.labels_3d num_proposals = preds_dict['center'].shape[-1] # get pred boxes, carefully ! don't change the network outputs @@ -628,14 +645,19 @@ def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, preds_dict, [res.max_overlaps for res in assign_result_list]), labels=torch.cat([res.labels for res in assign_result_list]), ) + + # 2. Sampling. Compatible with the interface of `PseudoSampler` in + # mmdet. + gt_instances, pred_instances = InstanceData( + bboxes=gt_bboxes_tensor), InstanceData(priors=bboxes_tensor) sampling_result = self.bbox_sampler.sample(assign_result_ensemble, - bboxes_tensor, - gt_bboxes_tensor) + pred_instances, + gt_instances) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds assert len(pos_inds) + len(neg_inds) == num_proposals - # create target for loss computation + # 3. 
Create target for loss computation
         bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size
                                     ]).to(center.device)
         bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size
@@ -723,17 +745,28 @@ def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx):
             heatmap[None],
         )
 
-    def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
+    def loss(self, batch_feats, batch_data_samples):
         """Loss function for CenterHead.
         Args:
-            gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
-                truth gt boxes.
-            gt_labels_3d (list[torch.Tensor]): Labels of boxes.
-            preds_dicts (list[list[dict]]): Output of forward function.
-
+            batch_feats (): Features in a batch.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
         Returns:
             dict[str:torch.Tensor]: Loss of heatmap and bbox of each task.
         """
+        batch_input_metas, batch_gt_instances_3d = [], []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+        preds_dicts = self(batch_feats, batch_input_metas)
+        loss = self.loss_by_feat(preds_dicts, batch_gt_instances_3d)
+
+        return loss
+
+    def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
+                     batch_gt_instances_3d: List[InstanceData], *args,
+                     **kwargs):
         (
             labels,
             label_weights,
@@ -743,7 +776,7 @@ def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
             num_pos,
             matched_ious,
             heatmap,
-        ) = self.get_targets(gt_bboxes_3d, gt_labels_3d, preds_dicts[0])
+        ) = self.get_targets(batch_gt_instances_3d, preds_dicts[0])
         if hasattr(self, 'on_the_image_mask'):
             label_weights = label_weights * self.on_the_image_mask
             bbox_weights = bbox_weights * self.on_the_image_mask[:, :, None]
diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
index 865952342c..5514a2fcbe 100644
--- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
+++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
@@ -27,7 +27,7 @@
     CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
     sweeps='sweeps/LIDAR_TOP')
 input_modality = dict(use_lidar=True, use_camera=True)
-backend_args = None
+backend_args = dict(backend='petrel', path_mapping={'data/nuscenes_mini/':'s3://openmmlab/datasets/detection3d/nuscenes/'})
 
 model = dict(
     type='BEVFusion',
@@ -329,8 +329,8 @@
 
 train_dataloader = dict(
     batch_size=4,
-    num_workers=4,
-    persistent_workers=True,
+    num_workers=0,
+    # persistent_workers=True,
     sampler=dict(type='DefaultSampler', shuffle=True),
     dataset=dict(
         type='CBGSDataset',

From 149150daa6111ce76ea07a956d59d0e79f604c0b Mon Sep 17 00:00:00 2001
From: JingweiZhang12
Date: Mon, 20 Mar 2023 13:58:36 +0800
Subject: [PATCH 03/29] img branch optional

---
 projects/BEVFusion/bevfusion/bevfusion.py       | 15 +++++++++------
 .../BEVFusion/bevfusion/transfusion_head.py     |  3 ++-
 ...0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 17 +++++++++++------
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py
index e13e4b6c19..6cf709ca93 100644
--- a/projects/BEVFusion/bevfusion/bevfusion.py
+++ b/projects/BEVFusion/bevfusion/bevfusion.py
@@ -217,13 +217,16 @@ def extract_feat(
         camera2lidar = imgs.new_tensor(np.asarray(camera2lidar))
         img_aug_matrix =
imgs.new_tensor(np.asarray(img_aug_matrix)) lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) - img_feature = self.extract_img_feat(imgs, points, lidar2image, - camera_intrinsics, camera2lidar, - img_aug_matrix, lidar_aug_matrix, - batch_input_metas) + features = [] + if imgs is not None: + img_feature = self.extract_img_feat(imgs, points, lidar2image, + camera_intrinsics, + camera2lidar, img_aug_matrix, + lidar_aug_matrix, + batch_input_metas) + features.append(img_feature) pts_feature = self.extract_pts_feat(batch_inputs_dict) - - features = [img_feature, pts_feature] + features.append(pts_feature) if self.fusion_layer is not None: x = self.fusion_layer(features) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index f4e918ae68..7b9e3fe0ca 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -1,6 +1,6 @@ # modify from https://github.com/mit-han-lab/bevfusion import copy -from typing import List, Tuple +from typing import List, Tuple import numpy as np import torch @@ -747,6 +747,7 @@ def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx): def loss(self, batch_feats, batch_data_samples): """Loss function for CenterHead. + Args: batch_feats (): Features in a batch. batch_data_samples (List[:obj:`Det3DDataSample`]): The Data diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 5514a2fcbe..fc25c26bae 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -16,7 +16,7 @@ metainfo = dict(classes=class_names, version='v1.0-mini') dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes_mini/' +data_root = 'data/nuscenes/' data_prefix = dict( pts='samples/LIDAR_TOP', CAM_FRONT='samples/CAM_FRONT', @@ -27,7 +27,12 @@ CAM_BACK_LEFT='samples/CAM_BACK_LEFT', sweeps='sweeps/LIDAR_TOP') input_modality = dict(use_lidar=True, use_camera=True) -backend_args = dict(backend='petrel', path_mapping={'data/nuscenes_mini/':'s3://openmmlab/datasets/detection3d/nuscenes/'}) +backend_args = dict( + backend='petrel', + path_mapping={ + 'data/nuscenes_mini/': 's3://openmmlab/datasets/detection3d/nuscenes/', + 'data/nuscenes/': 's3://openmmlab/datasets/detection3d/nuscenes/' + }) model = dict( type='BEVFusion', @@ -329,8 +334,8 @@ train_dataloader = dict( batch_size=4, - num_workers=0, - # persistent_workers=True, + num_workers=4, + persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type='CBGSDataset', @@ -348,8 +353,8 @@ box_type_3d='LiDAR'))) val_dataloader = dict( batch_size=1, - num_workers=0, - # persistent_workers=True, + num_workers=4, + persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( From 1a9c6815054321b958c9bee241067820a6a5a1d3 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 20 Mar 2023 14:08:10 +0800 Subject: [PATCH 04/29] support nuscenes_mini in replace_ceph_backend --- mmdet3d/utils/misc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mmdet3d/utils/misc.py b/mmdet3d/utils/misc.py index e5f4b47d33..8210e7a91d 100644 --- a/mmdet3d/utils/misc.py +++ b/mmdet3d/utils/misc.py @@ -40,6 +40,9 @@ def replace_ceph_backend(cfg): elif 'nuimages' in 
cfg_pretty_text: replace_strs = replace_strs.replace('DATA', 'nuimages') replace_strs = replace_strs.replace('CEPH', 'nuimages') + elif 'nuscenes_mini' in cfg_pretty_text: + replace_strs = replace_strs.replace('DATA', 'nuscenes_mini') + replace_strs = replace_strs.replace('CEPH', 'nuscenes_mini') else: NotImplemented('Does not support global replacement') From eb8c69d7a693b6777b3c7615f99ab6e214f852c4 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 20 Mar 2023 16:01:03 +0800 Subject: [PATCH 05/29] use replace_ceph --- ...ion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index fc25c26bae..9741bbe861 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -14,7 +14,7 @@ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] -metainfo = dict(classes=class_names, version='v1.0-mini') +metainfo = dict(classes=class_names) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' data_prefix = dict( @@ -27,12 +27,7 @@ CAM_BACK_LEFT='samples/CAM_BACK_LEFT', sweeps='sweeps/LIDAR_TOP') input_modality = dict(use_lidar=True, use_camera=True) -backend_args = dict( - backend='petrel', - path_mapping={ - 'data/nuscenes_mini/': 's3://openmmlab/datasets/detection3d/nuscenes/', - 'data/nuscenes/': 's3://openmmlab/datasets/detection3d/nuscenes/' - }) +backend_args = None model = dict( type='BEVFusion', From 4223023d3931eb25f343f1af5fe1e0c9c0577729 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 20 Mar 2023 19:43:01 +0800 Subject: [PATCH 06/29] add only-lidar --- mmdet3d/utils/misc.py | 3 - projects/BEVFusion/bevfusion/bevfusion.py | 44 +- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 383 ++++++++++++++++++ ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 21 +- 4 files changed, 422 insertions(+), 29 deletions(-) create mode 100644 projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py diff --git a/mmdet3d/utils/misc.py b/mmdet3d/utils/misc.py index 8210e7a91d..e5f4b47d33 100644 --- a/mmdet3d/utils/misc.py +++ b/mmdet3d/utils/misc.py @@ -40,9 +40,6 @@ def replace_ceph_backend(cfg): elif 'nuimages' in cfg_pretty_text: replace_strs = replace_strs.replace('DATA', 'nuimages') replace_strs = replace_strs.replace('CEPH', 'nuimages') - elif 'nuscenes_mini' in cfg_pretty_text: - replace_strs = replace_strs.replace('DATA', 'nuscenes_mini') - replace_strs = replace_strs.replace('CEPH', 'nuscenes_mini') else: NotImplemented('Does not support global replacement') diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 6cf709ca93..431ecf89f7 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -40,12 +40,16 @@ def __init__( self.pts_voxel_encoder = MODELS.build(pts_voxel_encoder) - self.img_backbone = MODELS.build(img_backbone) - self.img_neck = MODELS.build(img_neck) - self.vtransform = MODELS.build(vtransform) + self.img_backbone = MODELS.build( + img_backbone) if img_backbone is not None else None + self.img_neck = MODELS.build( + img_neck) if img_neck is not None else None + self.vtransform = MODELS.build( + vtransform) if vtransform is not None else None 
self.pts_middle_encoder = MODELS.build(pts_middle_encoder) - self.fusion_layer = MODELS.build(fusion_layer) + self.fusion_layer = MODELS.build( + fusion_layer) if fusion_layer is not None else None self.pts_backbone = MODELS.build(pts_backbone) self.pts_neck = MODELS.build(pts_neck) @@ -53,7 +57,7 @@ def __init__( self.bbox_head = MODELS.build(bbox_head) # hard code here where using converted checkpoint of original # implementation of `BEVFusion` - self.use_converted_checkpoint = True + self.use_converted_checkpoint = False self.init_weights() @@ -202,23 +206,23 @@ def extract_feat( ): imgs = batch_inputs_dict.get('imgs', None) points = batch_inputs_dict.get('points', None) - - lidar2image, camera_intrinsics, camera2lidar = [], [], [] - img_aug_matrix, lidar_aug_matrix = [], [] - for i, meta in enumerate(batch_input_metas): - lidar2image.append(meta['lidar2img']) - camera_intrinsics.append(meta['cam2img']) - camera2lidar.append(meta['cam2lidar']) - img_aug_matrix.append(meta.get('img_aug_matrix', np.eye(4))) - lidar_aug_matrix.append(meta.get('lidar_aug_matrix', np.eye(4))) - - lidar2image = imgs.new_tensor(np.asarray(lidar2image)) - camera_intrinsics = imgs.new_tensor(np.array(camera_intrinsics)) - camera2lidar = imgs.new_tensor(np.asarray(camera2lidar)) - img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix)) - lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) features = [] if imgs is not None: + lidar2image, camera_intrinsics, camera2lidar = [], [], [] + img_aug_matrix, lidar_aug_matrix = [], [] + for i, meta in enumerate(batch_input_metas): + lidar2image.append(meta['lidar2img']) + camera_intrinsics.append(meta['cam2img']) + camera2lidar.append(meta['cam2lidar']) + img_aug_matrix.append(meta.get('img_aug_matrix', np.eye(4))) + lidar_aug_matrix.append( + meta.get('lidar_aug_matrix', np.eye(4))) + + lidar2image = imgs.new_tensor(np.asarray(lidar2image)) + camera_intrinsics = imgs.new_tensor(np.array(camera_intrinsics)) + camera2lidar = imgs.new_tensor(np.asarray(camera2lidar)) + img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix)) + lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) img_feature = self.extract_img_feat(imgs, points, lidar2image, camera_intrinsics, camera2lidar, img_aug_matrix, diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py new file mode 100644 index 0000000000..bee8274a3e --- /dev/null +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -0,0 +1,383 @@ +_base_ = ['mmdet3d::_base_/default_runtime.py'] +custom_imports = dict( + imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) + +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
+voxel_size = [0.075, 0.075, 0.2] +point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +metainfo = dict(classes=class_names) +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +data_prefix = dict( + pts='samples/LIDAR_TOP', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT', + sweeps='sweeps/LIDAR_TOP') +input_modality = dict(use_lidar=True, use_camera=False) +backend_args = dict( + backend='petrel', + path_mapping=dict({ + './data/nuscenes/': + 's3://openmmlab/datasets/detection3d/nuscenes/', + 'data/nuscenes/': + 's3://openmmlab/datasets/detection3d/nuscenes/', + './data/nuscenes_mini/': + 's3://openmmlab/datasets/detection3d/nuscenes/', + 'data/nuscenes_mini/': + 's3://openmmlab/datasets/detection3d/nuscenes/' + })) + +model = dict( + type='BEVFusion', + data_preprocessor=dict( + type='Det3DDataPreprocessor', + pad_size_divisor=32, + voxelize_cfg=dict( + max_num_points=10, + point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], + voxel_size=[0.075, 0.075, 0.2], + max_voxels=[120000, 160000], + voxelize_reduce=True)), + pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + pts_middle_encoder=dict( + type='BEVFusionSparseEncoder', + in_channels=5, + sparse_shape=[1440, 1440, 41], + order=('conv', 'norm', 'act'), + norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, + 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + layer_strides=[1, 2], + norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + bbox_head=dict( + type='TransFusionHead', + num_proposals=200, + auxiliary=True, + in_channels=512, + hidden_channel=128, + num_classes=10, + nms_kernel_size=3, + bn_momentum=0.1, + num_decoder_layers=1, + decoder_layer=dict( + type='TransformerDecoderLayer', + self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), + cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), + ffn_cfg=dict( + embed_dims=128, + feedforward_channels=256, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True), + ), + norm_cfg=dict(type='LN'), + pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), + train_cfg=dict( + dataset='nuScenes', + point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], + grid_size=[1440, 1440, 41], + voxel_size=[0.075, 0.075, 0.2], + out_size_factor=8, + gaussian_overlap=0.1, + min_radius=2, + pos_weight=-1, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type='HungarianAssigner3D', + iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), + cls_cost=dict( + type='mmdet.FocalLossCost', + gamma=2.0, + alpha=0.25, + weight=0.15), + reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), + iou_cost=dict(type='IoU3DCost', weight=0.25))), + 
test_cfg=dict( + dataset='nuScenes', + grid_size=[1440, 1440, 41], + out_size_factor=8, + voxel_size=[0.075, 0.075], + pc_range=[-54.0, -54.0], + nms_type=None), + common_heads=dict( + center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), + bbox_coder=dict( + type='TransFusionBBoxCoder', + pc_range=[-54.0, -54.0], + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + score_threshold=0.0, + out_size_factor=8, + voxel_size=[0.075, 0.075], + code_size=10), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0), + loss_heatmap=dict( + type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), + loss_bbox=dict( + type='mmdet.L1Loss', reduction='mean', loss_weight=0.25))) + +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + backend_args=backend_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=9, + load_dim=5, + use_dim=5, + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='GlobalRotScaleTrans', + scale_ratio_range=[0.9, 1.1], + rot_range=[-0.78539816, 0.78539816], + translation_std=0.5), + dict(type='BEVFusionRandomFlip3D'), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=[ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', + 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' + ]), + dict(type='PointShuffle'), + dict( + type='Pack3DDetInputs', + keys=[ + 'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', + 'gt_labels' + ], + meta_keys=[ + 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', + 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', + 'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix', + 'lidar_aug_matrix' + ]) +] + +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + backend_args=backend_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=9, + load_dim=5, + use_dim=5, + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args), + dict( + type='PointsRangeFilter', + point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]), + dict( + type='Pack3DDetInputs', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'], + meta_keys=[ + 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', + 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', + 'lidar_path', 'img_path' + ]) +] + +train_dataloader = dict( + 
batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'))) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + data_prefix=data_prefix, + test_mode=True, + box_type_3d='LiDAR', + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# learning rate +lr = 0.0001 +param_scheduler = [ + # learning rate scheduler + # During the first 8 epochs, learning rate increases from 0 to lr * 10 + # during the next 12 epochs, learning rate decreases from lr * 10 to + # lr * 1e-4 + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=lr * 10, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=12, + eta_min=lr * 1e-4, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True), + # momentum scheduler + # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95 + # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1 + dict( + type='CosineAnnealingMomentum', + T_max=8, + eta_min=0.85 / 0.95, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=12, + eta_min=1, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=1) +val_cfg = dict() +test_cfg = dict() + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01), + clip_grad=dict(max_norm=35, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (4 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=32) + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=1), + checkpoint=dict(type='CheckpointHook', interval=5)) +custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)] + +load_from = 'checkpoints/bevfusion_init_converted.pth' diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 9741bbe861..fe5d86e727 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -191,7 +191,17 @@ rate=1.0, prepare=dict( filter_by_difficulty=[-1], - filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), classes=class_names, sample_groups=dict( car=2, @@ -208,8 +218,7 @@ type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, - use_dim=[0, 1, 2, 3, 4], - backend_args=backend_args)) + use_dim=[0, 1, 2, 3, 4])) train_pipeline = [ dict( @@ -236,7 +245,7 @@ with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - # dict(type='ObjectSampling', db_sampler=db_sampler), + # dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ImageAug3D', final_dim=[256, 704], @@ -423,8 +432,8 @@ # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically # or not by default. -# - `base_batch_size` = (4 GPUs) x (4 samples per GPU). -auto_scale_lr = dict(enable=False, base_batch_size=16) +# - `base_batch_size` = (8 GPUs) x (4 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=32) default_hooks = dict( logger=dict(type='LoggerHook', interval=50), From 5c5d55501aea57aa4e7599572f2384f1fe3d7483 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 21 Mar 2023 11:58:04 +0800 Subject: [PATCH 07/29] use valid_flag in dataset filter --- projects/BEVFusion/bevfusion/bevfusion.py | 1 + projects/BEVFusion/bevfusion/depth_lss.py | 2 -- .../BEVFusion/bevfusion/transfusion_head.py | 2 -- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 30 ++++++++++--------- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 1 + 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 431ecf89f7..c0cd62cf54 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -208,6 +208,7 @@ def extract_feat( points = batch_inputs_dict.get('points', None) features = [] if imgs is not None: + imgs = imgs.contiguous() lidar2image, camera_intrinsics, camera2lidar = [], [], [] img_aug_matrix, lidar_aug_matrix = [], [] for i, meta in enumerate(batch_input_metas): diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index 072375dd37..069f4ea558 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -202,8 +202,6 @@ def forward( camera2lidar_rots = camera2lidar[..., :3, :3] camera2lidar_trans = camera2lidar[..., :3, 3] - # print(img.shape, self.image_size, self.feature_size) - batch_size = len(points) depth = torch.zeros(batch_size, img.shape[1], 1, *self.image_size).to(points[0].device) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index 7b9e3fe0ca..5e8ffeff31 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -75,8 +75,6 @@ def __init__( ): super(TransFusionHead, self).__init__() - self.fp16_enabled = False - self.num_classes = num_classes self.num_proposals = num_proposals self.auxiliary = auxiliary diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index bee8274a3e..6f17bb67e2 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -27,18 +27,19 @@ CAM_BACK_LEFT='samples/CAM_BACK_LEFT', sweeps='sweeps/LIDAR_TOP') input_modality = dict(use_lidar=True, use_camera=False) -backend_args = dict( - backend='petrel', - path_mapping=dict({ - './data/nuscenes/': - 's3://openmmlab/datasets/detection3d/nuscenes/', - 'data/nuscenes/': - 's3://openmmlab/datasets/detection3d/nuscenes/', - './data/nuscenes_mini/': - 's3://openmmlab/datasets/detection3d/nuscenes/', - 'data/nuscenes_mini/': - 's3://openmmlab/datasets/detection3d/nuscenes/' - })) +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': +# 's3://openmmlab/datasets/detection3d/nuscenes/', +# 'data/nuscenes/': +# 's3://openmmlab/datasets/detection3d/nuscenes/', +# './data/nuscenes_mini/': +# 's3://openmmlab/datasets/detection3d/nuscenes/', +# 'data/nuscenes_mini/': +# 's3://openmmlab/datasets/detection3d/nuscenes/' +# })) +backend_args = None model = dict( type='BEVFusion', @@ -281,6 +282,7 @@ 
modality=input_modality, test_mode=False, data_prefix=data_prefix, + use_valid_flag=True, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'))) @@ -360,7 +362,7 @@ ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=1) +train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=5) val_cfg = dict() test_cfg = dict() @@ -376,7 +378,7 @@ auto_scale_lr = dict(enable=False, base_batch_size=32) default_hooks = dict( - logger=dict(type='LoggerHook', interval=1), + logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=5)) custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)] diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index fe5d86e727..8ff4dbab78 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -352,6 +352,7 @@ modality=input_modality, test_mode=False, data_prefix=data_prefix, + use_valid_flag=True, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'))) From 8078f57cac1c95ed0bab02ec88baedfd2a3e0f37 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 24 Mar 2023 16:37:23 +0800 Subject: [PATCH 08/29] support lidar-only training 69 --- .../hooks/disable_object_sample_hook.py | 7 ++- projects/BEVFusion/bevfusion/__init__.py | 6 ++- projects/BEVFusion/bevfusion/bevfusion.py | 2 +- projects/BEVFusion/bevfusion/transforms_3d.py | 48 +++++++++++++++++-- .../BEVFusion/bevfusion/transfusion_head.py | 4 +- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 8 ++-- 6 files changed, 61 insertions(+), 14 deletions(-) diff --git a/mmdet3d/engine/hooks/disable_object_sample_hook.py b/mmdet3d/engine/hooks/disable_object_sample_hook.py index d1f3c2a09d..07d12762be 100644 --- a/mmdet3d/engine/hooks/disable_object_sample_hook.py +++ b/mmdet3d/engine/hooks/disable_object_sample_hook.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.dataset import BaseDataset from mmengine.hooks import Hook from mmengine.model import is_model_wrapper from mmengine.runner import Runner @@ -35,7 +36,11 @@ def before_train_epoch(self, runner: Runner): model = model.module if epoch == self.disable_after_epoch: runner.logger.info('Disable ObjectSample') - for transform in runner.train_dataloader.dataset.pipeline.transforms: # noqa: E501 + dataset = runner.train_dataloader.dataset + # handle dataset wrapper + if not isinstance(dataset, BaseDataset): + dataset = dataset.dataset + for transform in dataset.pipeline.transforms: # noqa: E501 if isinstance(transform, ObjectSample): assert hasattr(transform, 'disabled') transform.disabled = True diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py index faf7fa2d9a..07988ff597 100644 --- a/projects/BEVFusion/bevfusion/__init__.py +++ b/projects/BEVFusion/bevfusion/__init__.py @@ -4,7 +4,8 @@ from .loading import BEVLoadMultiViewImageFromFiles from .sparse_encoder import BEVFusionSparseEncoder from .transformer import TransformerDecoderLayer -from .transforms_3d import BEVFusionRandomFlip3D, GridMask, ImageAug3D +from .transforms_3d import (BEVFusionGlobalRotScaleTrans, + BEVFusionRandomFlip3D, GridMask, ImageAug3D) from .transfusion_head import ConvFuser, TransFusionHead from .utils import (BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D, IoU3DCost) @@ -14,5 +15,6 @@ 'GeneralizedLSSFPN', 'HungarianAssigner3D', 'BBoxBEVL1Cost', 'IoU3DCost', 'HeuristicAssigner3D', 'DepthLSSTransform', 'BEVLoadMultiViewImageFromFiles', 'BEVFusionSparseEncoder', - 'TransformerDecoderLayer', 'BEVFusionRandomFlip3D' + 'TransformerDecoderLayer', 'BEVFusionRandomFlip3D', + 'BEVFusionGlobalRotScaleTrans' ] diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index c0cd62cf54..1cde750c01 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -98,7 +98,7 @@ def extract_img_feat( img_metas, ) -> torch.Tensor: B, N, C, H, W = x.size() - x = x.view(B * N, C, H, W) + x = x.view(B * N, C, H, W).contiguous() x = self.img_backbone(x) x = self.img_neck(x) diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index 1941f21142..d7104bdafa 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -1,5 +1,4 @@ # modify from https://github.com/mit-han-lab/bevfusion -import random from typing import Any, Dict import numpy as np @@ -7,6 +6,7 @@ from mmcv.transforms import BaseTransform from PIL import Image +from mmdet3d.datasets import GlobalRotScaleTrans from mmdet3d.registry import TRANSFORMS @@ -112,8 +112,8 @@ def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: class BEVFusionRandomFlip3D: def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - flip_horizontal = random.choice([0, 1]) - flip_vertical = random.choice([0, 1]) + flip_horizontal = np.random.choice([0, 1]) + flip_vertical = np.random.choice([0, 1]) rotation = np.eye(3) if flip_horizontal: @@ -141,6 +141,48 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: return data +@TRANSFORMS.register_module() +class BEVFusionGlobalRotScaleTrans(GlobalRotScaleTrans): + + def transform(self, input_dict: dict) -> dict: + """Private function to rotate, scale and translate bounding boxes and + points. + + Args: + input_dict (dict): Result dict from loading pipeline. 
+ + Returns: + dict: Results after scaling, 'points', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated + in the result dict. + """ + if 'transformation_3d_flow' not in input_dict: + input_dict['transformation_3d_flow'] = [] + + self._rot_bbox_points(input_dict) + + if 'pcd_scale_factor' not in input_dict: + self._random_scale(input_dict) + self._scale_bbox_points(input_dict) + + self._trans_bbox_points(input_dict) + + input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) + + lidar_augs = np.eye(4) + lidar_augs[:3, :3] = input_dict['pcd_rotation'].T * input_dict[ + 'pcd_scale_factor'] + lidar_augs[:3, 3] = input_dict['pcd_trans'] * \ + input_dict['pcd_scale_factor'] + + if 'lidar_aug_matrix' not in input_dict: + input_dict['lidar_aug_matrix'] = np.eye(4) + input_dict[ + 'lidar_aug_matrix'] = lidar_augs @ input_dict['lidar_aug_matrix'] + + return input_dict + + @TRANSFORMS.register_module() class GridMask(BaseTransform): diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index 5e8ffeff31..9cd1891f5c 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -785,8 +785,8 @@ def loss_by_feat(self, preds_dicts: Tuple[List[dict]], # compute heatmap loss loss_heatmap = self.loss_heatmap( - clip_sigmoid(preds_dict['dense_heatmap']), - heatmap, + clip_sigmoid(preds_dict['dense_heatmap']).float(), + heatmap.float(), avg_factor=max(heatmap.eq(1).float().sum().item(), 1), ) loss_dict['loss_heatmap'] = loss_heatmap diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 8ff4dbab78..7c3225b641 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -255,7 +255,7 @@ rand_flip=True, is_train=True), dict( - type='GlobalRotScaleTrans', + type='BEVFusionGlobalRotScaleTrans', scale_ratio_range=[0.9, 1.1], rot_range=[-0.78539816, 0.78539816], translation_std=0.5), @@ -421,7 +421,7 @@ ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=6) +train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) val_cfg = dict() test_cfg = dict() @@ -438,6 +438,4 @@ default_hooks = dict( logger=dict(type='LoggerHook', interval=50), - checkpoint=dict(type='CheckpointHook', interval=5)) - -load_from = 'checkpoints/bevfusion_init_converted.pth' + checkpoint=dict(type='CheckpointHook', interval=1)) From 68e4f31ff57b34cad551e3b5e3faac95be1d5d7d Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 18 Apr 2023 16:26:18 +0800 Subject: [PATCH 09/29] fix RTS --- projects/BEVFusion/bevfusion/transforms_3d.py | 5 ++--- ...vfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index d7104bdafa..da259e21d3 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -163,11 +163,10 @@ def transform(self, input_dict: dict) -> dict: if 'pcd_scale_factor' not in input_dict: self._random_scale(input_dict) - self._scale_bbox_points(input_dict) - self._trans_bbox_points(input_dict) + self._scale_bbox_points(input_dict) - 
input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) + input_dict['transformation_3d_flow'].extend(['R', 'T', 'S']) lidar_augs = np.eye(4) lidar_augs[:3, :3] = input_dict['pcd_rotation'].T * input_dict[ diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 7c3225b641..632876cb62 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -435,6 +435,7 @@ # or not by default. # - `base_batch_size` = (8 GPUs) x (4 samples per GPU). auto_scale_lr = dict(enable=False, base_batch_size=32) +log_processor = dict(window_size=50) default_hooks = dict( logger=dict(type='LoggerHook', interval=50), From cf39e0713259121c51b9f5c608fe3c6b9f39b1f2 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 24 Apr 2023 17:41:54 +0800 Subject: [PATCH 10/29] fix rotation in ImgAug3D --- projects/BEVFusion/bevfusion/transforms_3d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index da259e21d3..ec2d442b5c 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -68,8 +68,8 @@ def img_transform(self, img, rotation, translation, resize, resize_dims, translation = A.matmul(translation) + b theta = rotate / 180 * np.pi A = torch.Tensor([ - [np.cos(theta), np.sin(theta)], - [-np.sin(theta), np.cos(theta)], + [np.cos(theta), -np.sin(theta)], + [np.sin(theta), np.cos(theta)], ]) b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 b = A.matmul(-b) + b From 710a23def4f130cfd93c9d615d839295495af393 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 9 May 2023 10:16:09 +0800 Subject: [PATCH 11/29] revert to original rotation in ImgAug3D --- projects/BEVFusion/bevfusion/transforms_3d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index ec2d442b5c..da259e21d3 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -68,8 +68,8 @@ def img_transform(self, img, rotation, translation, resize, resize_dims, translation = A.matmul(translation) + b theta = rotate / 180 * np.pi A = torch.Tensor([ - [np.cos(theta), -np.sin(theta)], - [np.sin(theta), np.cos(theta)], + [np.cos(theta), np.sin(theta)], + [-np.sin(theta), np.cos(theta)], ]) b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 b = A.matmul(-b) + b From 041b2889e77925bc026cb15120e0de9cdfb0876f Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 9 May 2023 16:52:13 +0800 Subject: [PATCH 12/29] add LSSDepthTransform and parse_losses --- projects/BEVFusion/bevfusion/__init__.py | 4 +- projects/BEVFusion/bevfusion/bevfusion.py | 45 +++++++++++++- projects/BEVFusion/bevfusion/depth_lss.py | 71 +++++++++++++++++++++++ 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py index 07988ff597..db06d3afa4 100644 --- a/projects/BEVFusion/bevfusion/__init__.py +++ b/projects/BEVFusion/bevfusion/__init__.py @@ -1,6 +1,6 @@ from .bevfusion import BEVFusion from .bevfusion_necks import GeneralizedLSSFPN -from .depth_lss import DepthLSSTransform 
+from .depth_lss import DepthLSSTransform, LSSTransform from .loading import BEVLoadMultiViewImageFromFiles from .sparse_encoder import BEVFusionSparseEncoder from .transformer import TransformerDecoderLayer @@ -13,7 +13,7 @@ __all__ = [ 'BEVFusion', 'TransFusionHead', 'ConvFuser', 'ImageAug3D', 'GridMask', 'GeneralizedLSSFPN', 'HungarianAssigner3D', 'BBoxBEVL1Cost', 'IoU3DCost', - 'HeuristicAssigner3D', 'DepthLSSTransform', + 'HeuristicAssigner3D', 'DepthLSSTransform', 'LSSTransform', 'BEVLoadMultiViewImageFromFiles', 'BEVFusionSparseEncoder', 'TransformerDecoderLayer', 'BEVFusionRandomFlip3D', 'BEVFusionGlobalRotScaleTrans' diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 1cde750c01..527c83fce0 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -1,7 +1,10 @@ -from typing import Dict, List, Optional +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple import numpy as np import torch +import torch.distributed as dist +from mmengine.utils import is_list_of from torch import Tensor from torch.nn import functional as F @@ -71,6 +74,46 @@ def _forward(self, """ pass + def parse_losses( + self, losses: Dict[str, torch.Tensor] + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """Parses the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: There are two elements. The first is the + loss tensor passed to optim_wrapper which may be a weighted sum + of all losses, and the second is log_vars which will be sent to + the logger. + """ + log_vars = [] + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars.append([loss_name, loss_value.mean()]) + elif is_list_of(loss_value, torch.Tensor): + log_vars.append( + [loss_name, + sum(_loss.mean() for _loss in loss_value)]) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(value for key, value in log_vars if 'loss' in key) + log_vars.insert(0, ['loss', loss]) + log_vars = OrderedDict(log_vars) # type: ignore + + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars # type: ignore + def init_weights(self) -> None: if self.img_backbone is not None: self.img_backbone.init_weights() diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index 069f4ea558..fb0c21b878 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -156,6 +156,7 @@ def forward( camera2lidar, img_aug_matrix, lidar_aug_matrix, + metas, **kwargs, ): intrins = camera_intrinsics[..., :3, :3] @@ -182,6 +183,76 @@ def forward( return x +@MODELS.register_module() +class LSSTransform(BaseTransform): + + def __init__( + self, + in_channels: int, + out_channels: int, + image_size: Tuple[int, int], + feature_size: Tuple[int, int], + xbound: Tuple[float, float, float], + ybound: Tuple[float, float, float], + zbound: Tuple[float, float, float], + dbound: Tuple[float, float, float], + downsample: int = 1, + ) -> None: + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + 
image_size=image_size, + feature_size=feature_size, + xbound=xbound, + ybound=ybound, + zbound=zbound, + dbound=dbound, + ) + self.depthnet = nn.Conv2d(in_channels, self.D + self.C, 1) + if downsample > 1: + assert downsample == 2, downsample + self.downsample = nn.Sequential( + nn.Conv2d( + out_channels, out_channels, 3, padding=1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(True), + nn.Conv2d( + out_channels, + out_channels, + 3, + stride=downsample, + padding=1, + bias=False, + ), + nn.BatchNorm2d(out_channels), + nn.ReLU(True), + nn.Conv2d( + out_channels, out_channels, 3, padding=1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(True), + ) + else: + self.downsample = nn.Identity() + + def get_cam_feats(self, x): + B, N, C, fH, fW = x.shape + + x = x.view(B * N, C, fH, fW) + + x = self.depthnet(x) + depth = x[:, :self.D].softmax(dim=1) + x = depth.unsqueeze(1) * x[:, self.D:(self.D + self.C)].unsqueeze(2) + + x = x.view(B, N, self.C, self.D, fH, fW) + x = x.permute(0, 1, 3, 4, 5, 2) + return x + + def forward(self, *args, **kwargs): + x = super().forward(*args, **kwargs) + x = self.downsample(x) + return x + + class BaseDepthTransform(BaseTransform): def forward( From ab27ea1941f80c5d7fb3071005109f4794c9c9cb Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 16 May 2023 14:15:14 +0800 Subject: [PATCH 13/29] fix LoadMultiSweeps --- mmdet3d/datasets/transforms/loading.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mmdet3d/datasets/transforms/loading.py b/mmdet3d/datasets/transforms/loading.py index efb54c2fff..0a55a1f43f 100644 --- a/mmdet3d/datasets/transforms/loading.py +++ b/mmdet3d/datasets/transforms/loading.py @@ -442,7 +442,9 @@ def transform(self, results: dict) -> dict: lidar2sensor = np.array(sweep['lidar_points']['lidar2sensor']) points_sweep[:, : 3] = points_sweep[:, :3] @ lidar2sensor[:3, :3] - points_sweep[:, :3] -= lidar2sensor[:3, 3] + points_sweep[:, :3] += -1 * np.matmul( + lidar2sensor[:3, :3].T, lidar2sensor[:3, 3].reshape( + 3, 1)).squeeze() points_sweep[:, 4] = ts - sweep_ts points_sweep = points.new_point(points_sweep) sweep_points_list.append(points_sweep) From 40a97e21cde40ec0fd6da3b806b8e2693c24c157 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 16 May 2023 15:31:14 +0800 Subject: [PATCH 14/29] fix bug about points in-place operations --- projects/BEVFusion/bevfusion/bevfusion.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 527c83fce0..c48e20b187 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -1,4 +1,5 @@ from collections import OrderedDict +from copy import deepcopy from typing import Dict, List, Optional, Tuple import numpy as np @@ -267,8 +268,8 @@ def extract_feat( camera2lidar = imgs.new_tensor(np.asarray(camera2lidar)) img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix)) lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) - img_feature = self.extract_img_feat(imgs, points, lidar2image, - camera_intrinsics, + img_feature = self.extract_img_feat(imgs, deepcopy(points), + lidar2image, camera_intrinsics, camera2lidar, img_aug_matrix, lidar_aug_matrix, batch_input_metas) From f17d03bbeaf59df59c90b0ce0fd3c0d617c2238e Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Wed, 17 May 2023 15:31:48 +0800 Subject: [PATCH 15/29] support amp and replace syncBN by BN --- projects/BEVFusion/bevfusion/bevfusion.py | 27 
++++++++++--------- .../BEVFusion/bevfusion/transfusion_head.py | 3 ++- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 8 +++--- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 8 +++--- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index c48e20b187..98dc3b37e1 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -153,22 +153,25 @@ def extract_img_feat( BN, C, H, W = x.size() x = x.view(B, int(BN / B), C, H, W) - x = self.vtransform( - x, - points, - lidar2image, - camera_intrinsics, - camera2lidar, - img_aug_matrix, - lidar_aug_matrix, - img_metas, - ) + with torch.autocast(device_type='cuda', dtype=torch.float32): + x = self.vtransform( + x, + points, + lidar2image, + camera_intrinsics, + camera2lidar, + img_aug_matrix, + lidar_aug_matrix, + img_metas, + ) return x def extract_pts_feat(self, batch_inputs_dict) -> torch.Tensor: points = batch_inputs_dict['points'] - feats, coords, sizes = self.voxelize(points) - batch_size = coords[-1, 0] + 1 + with torch.autocast('cuda', enabled=False): + points = [point.float() for point in points] + feats, coords, sizes = self.voxelize(points) + batch_size = coords[-1, 0] + 1 x = self.pts_middle_encoder(feats, coords, batch_size) return x diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index 9cd1891f5c..b95ce96de1 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -224,7 +224,8 @@ def forward_single(self, inputs, metas): ################################# # query initialization ################################# - dense_heatmap = self.heatmap_head(fusion_feat) + with torch.autocast('cuda', enabled=False): + dense_heatmap = self.heatmap_head(fusion_feat.float()) heatmap = dense_heatmap.detach().sigmoid() padding = self.nms_kernel_size // 2 local_max = torch.zeros_like(heatmap) diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 6f17bb67e2..ace7f47171 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -1,4 +1,4 @@ -_base_ = ['mmdet3d::_base_/default_runtime.py'] +_base_ = ['../../../configs/_base_/default_runtime.py'] custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) @@ -58,7 +58,7 @@ in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), @@ -69,14 +69,14 @@ out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), 
use_conv_for_no_stride=True), bbox_head=dict( diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 632876cb62..188986786a 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -1,4 +1,4 @@ -_base_ = ['mmdet3d::_base_/default_runtime.py'] +_base_ = ['../../../configs/_base_/default_runtime.py'] custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) @@ -90,7 +90,7 @@ in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), @@ -103,14 +103,14 @@ out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), bbox_head=dict( From dc4b7be0cf1211f373b8de6ab82d41b4ab12b75c Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Thu, 18 May 2023 20:33:13 +0800 Subject: [PATCH 16/29] add amp config --- ...dar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py new file mode 100644 index 0000000000..e3fc470a47 --- /dev/null +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py @@ -0,0 +1,5 @@ +_base_ = [ + './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py' +] + +optim_wrapper = dict(type='AmpOptimWrapper') From a059517f2d1d26895c859375b39d46d4aa28dd5c Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 19 May 2023 11:53:23 +0800 Subject: [PATCH 17/29] set growth-interval in amp --- ...lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py index e3fc470a47..63ce85e1ac 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py @@ -2,4 +2,5 @@ './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py' ] -optim_wrapper = dict(type='AmpOptimWrapper') +optim_wrapper = dict( + type='AmpOptimWrapper', loss_scale=dict(growth_interval=2000)) From 9457729b652512fbfa0d6d11d9798a4ff416a911 Mon Sep 17 00:00:00 2001 From: 
JingweiZhang12 Date: Sat, 20 May 2023 23:22:37 +0800 Subject: [PATCH 18/29] Revert "fix LoadMultiSweeps" This reverts commit ab27ea1941f80c5d7fb3071005109f4794c9c9cb. --- mmdet3d/datasets/transforms/loading.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mmdet3d/datasets/transforms/loading.py b/mmdet3d/datasets/transforms/loading.py index 0a55a1f43f..efb54c2fff 100644 --- a/mmdet3d/datasets/transforms/loading.py +++ b/mmdet3d/datasets/transforms/loading.py @@ -442,9 +442,7 @@ def transform(self, results: dict) -> dict: lidar2sensor = np.array(sweep['lidar_points']['lidar2sensor']) points_sweep[:, : 3] = points_sweep[:, :3] @ lidar2sensor[:3, :3] - points_sweep[:, :3] += -1 * np.matmul( - lidar2sensor[:3, :3].T, lidar2sensor[:3, 3].reshape( - 3, 1)).squeeze() + points_sweep[:, :3] -= lidar2sensor[:3, 3] points_sweep[:, 4] = ts - sweep_ts points_sweep = points.new_point(points_sweep) sweep_points_list.append(points_sweep) From 836c775321ae9455768edc96f1212cce8fdd5544 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 23 May 2023 23:15:56 +0800 Subject: [PATCH 19/29] add float in cls loss --- projects/BEVFusion/bevfusion/transfusion_head.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index b95ce96de1..a70023ae14 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -814,7 +814,7 @@ def loss_by_feat(self, preds_dicts: Tuple[List[dict]], layer_cls_score = layer_score.permute(0, 2, 1).reshape( -1, self.num_classes) layer_loss_cls = self.loss_cls( - layer_cls_score, + layer_cls_score.float(), layer_labels, layer_label_weights, avg_factor=max(num_pos, 1), From 839f05ebdc97940510d0dcac6198d58b93caebaf Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Wed, 24 May 2023 11:33:53 +0800 Subject: [PATCH 20/29] iter_based lr in fusion stage --- ...bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 188986786a..dd82377076 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -400,7 +400,8 @@ T_max=6, end=6, by_epoch=True, - eta_min_ratio=1e-3), + eta_min_ratio=1e-3, + convert_to_iter_based=True), # momentum scheduler # During the first 8 epochs, momentum increases from 1 to 0.85 / 0.95 # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1 From 0b32c0a93d23c985cf7ee1230e95feaca1f03aa0 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 26 May 2023 14:13:47 +0800 Subject: [PATCH 21/29] rename config --- ...n_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename projects/BEVFusion/configs/{bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py => bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py} (100%) diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py similarity index 100% rename from 
projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py rename to projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py From 30091188d4a665beaadcd5a9b49e04c9da47139e Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 26 May 2023 14:15:57 +0800 Subject: [PATCH 22/29] use normalization query pos for stable training --- projects/BEVFusion/bevfusion/transfusion_head.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index a70023ae14..8d243a4b3d 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -166,8 +166,8 @@ def create_2D_grid(self, x_size, y_size): # NOTE: modified batch_x, batch_y = torch.meshgrid( *[torch.linspace(it[0], it[1], it[2]) for it in meshgrid]) - batch_x = batch_x + 0.5 - batch_y = batch_y + 0.5 + batch_x = (batch_x + 0.5) / x_size + batch_y = (batch_y + 0.5) / y_size coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0)[None] coord_base = coord_base.view(1, 2, -1).permute(0, 2, 1) return coord_base @@ -288,8 +288,11 @@ def forward_single(self, inputs, metas): # Prediction res_layer = self.prediction_heads[i](query_feat) + xy_size = torch.tensor( + [fusion_feat.size(-1), + fusion_feat.size(-2)]).to(query_pos) res_layer['center'] = res_layer['center'] + query_pos.permute( - 0, 2, 1) + 0, 2, 1) * xy_size.reshape(2, -1) ret_dicts.append(res_layer) # for next level positional embedding From 5fee2e05246378b9e47cc44432d63d4803bcee38 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 26 May 2023 14:49:16 +0800 Subject: [PATCH 23/29] remove unnecessary code & simplify config & train 5 epoch --- projects/BEVFusion/bevfusion/bevfusion.py | 8 - projects/BEVFusion/bevfusion/utils.py | 44 ---- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 236 ++---------------- ...econd_secfpn_8xb4-amp-cyclic-20e_nus-3d.py | 6 - ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 3 +- 5 files changed, 15 insertions(+), 282 deletions(-) delete mode 100644 projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 98dc3b37e1..32fe74cc5a 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -59,9 +59,6 @@ def __init__( self.pts_neck = MODELS.build(pts_neck) self.bbox_head = MODELS.build(bbox_head) - # hard code here where using converted checkpoint of original - # implementation of `BEVFusion` - self.use_converted_checkpoint = False self.init_weights() @@ -235,11 +232,6 @@ def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]], if self.with_bbox_head: outputs = self.bbox_head.predict(feats, batch_input_metas) - if self.use_converted_checkpoint: - outputs[0]['bboxes_3d'].tensor[:, 6] = -outputs[0][ - 'bboxes_3d'].tensor[:, 6] - np.pi / 2 - outputs[0]['bboxes_3d'].tensor[:, 3:5] = outputs[0][ - 'bboxes_3d'].tensor[:, [4, 3]] res = self.add_pred_to_datasample(batch_data_samples, outputs) diff --git a/projects/BEVFusion/bevfusion/utils.py b/projects/BEVFusion/bevfusion/utils.py index 66847df5b1..40f7412bfd 100644 --- a/projects/BEVFusion/bevfusion/utils.py +++ b/projects/BEVFusion/bevfusion/utils.py @@ -1,9 +1,5 @@ # modify from https://github.com/mit-han-lab/bevfusion -from collections import abc - -import numpy as np import 
torch -import torch.nn as nn from mmdet.models.task_modules import AssignResult, BaseAssigner, BaseBBoxCoder try: @@ -313,43 +309,3 @@ def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg): # max_overlaps = iou.max(1).values return AssignResult( num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels) - - -def cast_tensor_type(inputs, src_type: torch.dtype, dst_type: torch.dtype): - """Recursively convert Tensor in inputs from src_type to dst_type. - - Note: - In v1.4.4 and later, ``cast_tersor_type`` will only convert the - torch.Tensor which is consistent with ``src_type`` to the ``dst_type``. - Before v1.4.4, it ignores the ``src_type`` argument, leading to some - potential problems. For example, - ``cast_tensor_type(inputs, torch.float, torch.half)`` will convert all - tensors in inputs to ``torch.half`` including those originally in - ``torch.Int`` or other types, which is not expected. - Args: - inputs: Inputs that to be casted. - src_type (torch.dtype): Source type.. - dst_type (torch.dtype): Destination type. - Returns: - The same type with inputs, but all contained Tensors have been cast. - """ - if isinstance(inputs, nn.Module): - return inputs - elif isinstance(inputs, torch.Tensor): - # we need to ensure that the type of inputs to be casted are the same - # as the argument `src_type`. - return inputs.to(dst_type) if inputs.dtype == src_type else inputs - elif isinstance(inputs, str): - return inputs - elif isinstance(inputs, np.ndarray): - return inputs - elif isinstance(inputs, abc.Mapping): - return type(inputs)({ # type: ignore - k: cast_tensor_type(v, src_type, dst_type) - for k, v in inputs.items() - }) - elif isinstance(inputs, abc.Iterable): - return type(inputs)( # type: ignore - cast_tensor_type(item, src_type, dst_type) for item in inputs) - else: - return diff --git a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index dd82377076..64493e33b1 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -1,31 +1,7 @@ -_base_ = ['../../../configs/_base_/default_runtime.py'] -custom_imports = dict( - imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) - -# model settings -# Voxel size for voxel encoder -# Usually voxel size is changed consistently with the point cloud range -# If point cloud range is modified, do remember to change all related -# keys in the config. 
-voxel_size = [0.075, 0.075, 0.2] -point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +_base_ = [ + './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py' ] - -metainfo = dict(classes=class_names) -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' -data_prefix = dict( - pts='samples/LIDAR_TOP', - CAM_FRONT='samples/CAM_FRONT', - CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', - CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', - CAM_BACK='samples/CAM_BACK', - CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', - CAM_BACK_LEFT='samples/CAM_BACK_LEFT', - sweeps='sweeps/LIDAR_TOP') +point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] input_modality = dict(use_lidar=True, use_camera=True) backend_args = None @@ -35,14 +11,7 @@ type='Det3DDataPreprocessor', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], - bgr_to_rgb=False, - pad_size_divisor=32, - voxelize_cfg=dict( - max_num_points=10, - point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], - voxel_size=[0.075, 0.075, 0.2], - max_voxels=[120000, 160000], - voxelize_reduce=True)), + bgr_to_rgb=False), img_backbone=dict( type='mmdet.SwinTransformer', embed_dims=96, @@ -84,141 +53,8 @@ zbound=[-10.0, 10.0, 20.0], dbound=[1.0, 60.0, 0.5], downsample=2), - pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), - pts_middle_encoder=dict( - type='BEVFusionSparseEncoder', - in_channels=5, - sparse_shape=[1440, 1440, 41], - order=('conv', 'norm', 'act'), - norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), - encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, - 128)), - encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), - block_type='basicblock'), fusion_layer=dict( - type='ConvFuser', in_channels=[80, 256], out_channels=256), - pts_backbone=dict( - type='SECOND', - in_channels=256, - out_channels=[128, 256], - layer_nums=[5, 5], - layer_strides=[1, 2], - norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False)), - pts_neck=dict( - type='SECONDFPN', - in_channels=[128, 256], - out_channels=[256, 256], - upsample_strides=[1, 2], - norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), - upsample_cfg=dict(type='deconv', bias=False), - use_conv_for_no_stride=True), - bbox_head=dict( - type='TransFusionHead', - num_proposals=200, - auxiliary=True, - in_channels=512, - hidden_channel=128, - num_classes=10, - nms_kernel_size=3, - bn_momentum=0.1, - num_decoder_layers=1, - decoder_layer=dict( - type='TransformerDecoderLayer', - self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), - cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), - ffn_cfg=dict( - embed_dims=128, - feedforward_channels=256, - num_fcs=2, - ffn_drop=0.1, - act_cfg=dict(type='ReLU', inplace=True), - ), - norm_cfg=dict(type='LN'), - pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), - train_cfg=dict( - dataset='nuScenes', - point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], - grid_size=[1440, 1440, 41], - voxel_size=[0.075, 0.075, 0.2], - out_size_factor=8, - gaussian_overlap=0.1, - min_radius=2, - pos_weight=-1, - code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], - assigner=dict( - type='HungarianAssigner3D', - iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), - cls_cost=dict( - type='mmdet.FocalLossCost', - gamma=2.0, - alpha=0.25, - weight=0.15), - 
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), - iou_cost=dict(type='IoU3DCost', weight=0.25))), - test_cfg=dict( - dataset='nuScenes', - grid_size=[1440, 1440, 41], - out_size_factor=8, - voxel_size=[0.075, 0.075], - pc_range=[-54.0, -54.0], - nms_type=None), - common_heads=dict( - center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), - bbox_coder=dict( - type='TransFusionBBoxCoder', - pc_range=[-54.0, -54.0], - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - score_threshold=0.0, - out_size_factor=8, - voxel_size=[0.075, 0.075], - code_size=10), - loss_cls=dict( - type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - reduction='mean', - loss_weight=1.0), - loss_heatmap=dict( - type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), - loss_bbox=dict( - type='mmdet.L1Loss', reduction='mean', loss_weight=0.25))) - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4])) + type='ConvFuser', in_channels=[80, 256], out_channels=256)) train_pipeline = [ dict( @@ -245,7 +81,6 @@ with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - # dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ImageAug3D', final_dim=[256, 704], @@ -268,11 +103,12 @@ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ]), + # Actually, 'GridMask' is not used here dict( type='GridMask', use_h=True, use_w=True, - max_epoch=6, + max_epoch=5, rotate=1, offset=False, ratio=0.5, @@ -337,56 +173,12 @@ ] train_dataloader = dict( - batch_size=4, - num_workers=4, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='nuscenes_infos_train.pkl', - pipeline=train_pipeline, - metainfo=metainfo, - modality=input_modality, - test_mode=False, - data_prefix=data_prefix, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'))) + dataset=dict(pipeline=train_pipeline, modality=input_modality))) val_dataloader = dict( - batch_size=1, - num_workers=4, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='nuscenes_infos_val.pkl', - pipeline=test_pipeline, - metainfo=metainfo, - modality=input_modality, - data_prefix=data_prefix, - test_mode=True, - box_type_3d='LiDAR', - backend_args=backend_args)) + dataset=dict(pipeline=test_pipeline, modality=input_modality)) test_dataloader = val_dataloader -val_evaluator = dict( - type='NuScenesMetric', - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_val.pkl', - metric='bbox', - backend_args=backend_args) -test_evaluator = val_evaluator - -vis_backends = [dict(type='LocalVisBackend')] -visualizer = dict( - type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') - param_scheduler = [ dict( type='LinearLR', @@ -397,8 +189,8 @@ dict( type='CosineAnnealingLR', begin=0, - T_max=6, - end=6, + T_max=5, + end=5, by_epoch=True, eta_min_ratio=1e-3, convert_to_iter_based=True), @@ -416,13 +208,13 @@ type='CosineAnnealingMomentum', eta_min=1, begin=2.4, - end=6, + end=5, by_epoch=True, convert_to_iter_based=True) ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) +train_cfg = dict(by_epoch=True, max_epochs=5, val_interval=1) val_cfg = dict() test_cfg = dict() @@ -436,8 +228,8 @@ # or not by default. # - `base_batch_size` = (8 GPUs) x (4 samples per GPU). auto_scale_lr = dict(enable=False, base_batch_size=32) -log_processor = dict(window_size=50) default_hooks = dict( logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=1)) +del _base_.custom_hooks diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py deleted file mode 100644 index 63ce85e1ac..0000000000 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = [ - './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py' -] - -optim_wrapper = dict( - type='AmpOptimWrapper', loss_scale=dict(growth_interval=2000)) diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index ace7f47171..e56189e2f5 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -376,10 +376,9 @@ # or not by default. # - `base_batch_size` = (8 GPUs) x (4 samples per GPU). 
auto_scale_lr = dict(enable=False, base_batch_size=32) +log_processor = dict(window_size=50) default_hooks = dict( logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=5)) custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)] - -load_from = 'checkpoints/bevfusion_init_converted.pth' From 51f59855f40656cd59e598ff65f0cdc6b2b751c3 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 29 May 2023 09:48:03 +0800 Subject: [PATCH 24/29] smaller ete_min_ratio --- ...voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 64493e33b1..eb6755d8c4 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -108,7 +108,7 @@ type='GridMask', use_h=True, use_w=True, - max_epoch=5, + max_epoch=6, rotate=1, offset=False, ratio=0.5, @@ -189,10 +189,10 @@ dict( type='CosineAnnealingLR', begin=0, - T_max=5, - end=5, + T_max=6, + end=6, by_epoch=True, - eta_min_ratio=1e-3, + eta_min_ratio=1e-4, convert_to_iter_based=True), # momentum scheduler # During the first 8 epochs, momentum increases from 1 to 0.85 / 0.95 @@ -208,13 +208,13 @@ type='CosineAnnealingMomentum', eta_min=1, begin=2.4, - end=5, + end=6, by_epoch=True, convert_to_iter_based=True) ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=5, val_interval=1) +train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) val_cfg = dict() test_cfg = dict() From 09311fecbd5eea5b8ff729aee8280067faebb281 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 29 May 2023 15:50:07 +0800 Subject: [PATCH 25/29] polish code --- projects/BEVFusion/README.md | 18 +++++++++--------- projects/BEVFusion/bevfusion/bevfusion.py | 8 ++++---- projects/BEVFusion/bevfusion/depth_lss.py | 8 +++++--- projects/BEVFusion/bevfusion/transforms_3d.py | 4 ++++ ...075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 2 +- 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/projects/BEVFusion/README.md b/projects/BEVFusion/README.md index dd4ee70212..b5a6b80593 100644 --- a/projects/BEVFusion/README.md +++ b/projects/BEVFusion/README.md @@ -15,7 +15,7 @@ results is available at https://github.com/mit-han-lab/bevfusion. ## Introduction -We implement BEVFusion and provide the results and pretrained checkpoints on NuScenes dataset. +We implement BEVFusion and support training and testing on NuScenes dataset. 
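A quick note on the `eta_min_ratio` change in PATCH 24/29 above: in mmengine's `CosineAnnealingLR`, `eta_min_ratio` sets the learning-rate floor as a fraction of the base LR, and `convert_to_iter_based=True` steps the cosine curve per iteration instead of per epoch. The sketch below is illustrative only; the base LR of 1e-4 is an assumption, since the optimizer settings are defined elsewhere in the config.

```python
import math

base_lr = 1e-4        # assumed for illustration; not taken from this patch
eta_min_ratio = 1e-4  # the new value from PATCH 24/29
eta_min = base_lr * eta_min_ratio


def cosine_lr(t: float, t_max: float) -> float:
    """LR after t of t_max schedule units (iterations here, given
    convert_to_iter_based=True)."""
    return eta_min + 0.5 * (base_lr - eta_min) * (
        1 + math.cos(math.pi * t / t_max))


print(cosine_lr(0.0, 6.0))  # base_lr at the start of training
print(cosine_lr(6.0, 6.0))  # base_lr * eta_min_ratio at the end
```

Dropping the ratio from 1e-3 to 1e-4 deepens the final decay by an order of magnitude without touching the warmup phase.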
## Usage @@ -34,21 +34,21 @@ python projects/BEVFusion/setup.py develop Run a demo on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link): ```shell -python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show +python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show ``` ### Training commands -In MMDetection3D's root directory, run the following command to train the model: +1. You should train the lidar-only detector first: ```bash -python tools/train.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +python tools/train.py projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ``` -For multi-gpu training, run: +2. Download the [Swin pre-trained model](<>). Given the image pre-trained backbone and the lidar-only pre-trained detector, you could train the lidar-camera fusion model: ```bash -python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${NUM_GPUS} --master_port=29506 --master_addr="127.0.0.1" tools/train.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +python tools/train.py projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE} ``` ### Testing commands @@ -56,7 +56,7 @@ python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${N In MMDetection3D's root directory, run the following command to test the model: ```bash -python tools/test.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} +python tools/test.py projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} ``` ## Results and models @@ -103,9 +103,9 @@ A project does not necessarily have to be finished in a single PR, but it's esse -- [ ] Milestone 2: Indicates a successful model implementation. +- [x] Milestone 2: Indicates a successful model implementation. 
- - [ ] Training-time correctness + - [x] Training-time correctness diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 32fe74cc5a..9f56934e66 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -27,7 +27,7 @@ def __init__( fusion_layer: Optional[dict] = None, img_backbone: Optional[dict] = None, pts_backbone: Optional[dict] = None, - vtransform: Optional[dict] = None, + view_transform: Optional[dict] = None, img_neck: Optional[dict] = None, pts_neck: Optional[dict] = None, bbox_head: Optional[dict] = None, @@ -48,8 +48,8 @@ def __init__( img_backbone) if img_backbone is not None else None self.img_neck = MODELS.build( img_neck) if img_neck is not None else None - self.vtransform = MODELS.build( - vtransform) if vtransform is not None else None + self.view_transform = MODELS.build( + view_transform) if view_transform is not None else None self.pts_middle_encoder = MODELS.build(pts_middle_encoder) self.fusion_layer = MODELS.build( @@ -151,7 +151,7 @@ def extract_img_feat( x = x.view(B, int(BN / B), C, H, W) with torch.autocast(device_type='cuda', dtype=torch.float32): - x = self.vtransform( + x = self.view_transform( x, points, lidar2image, diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index fb0c21b878..6cc0cc1606 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -17,7 +17,7 @@ def gen_dx_bx(xbound, ybound, zbound): return dx, bx, nx -class BaseTransform(nn.Module): +class BaseViewTransform(nn.Module): def __init__( self, @@ -184,7 +184,7 @@ def forward( @MODELS.register_module() -class LSSTransform(BaseTransform): +class LSSTransform(BaseViewTransform): def __init__( self, @@ -253,7 +253,7 @@ def forward(self, *args, **kwargs): return x -class BaseDepthTransform(BaseTransform): +class BaseDepthTransform(BaseViewTransform): def forward( self, @@ -346,6 +346,8 @@ def __init__( dbound: Tuple[float, float, float], downsample: int = 1, ) -> None: + """Compared with `LSSTransform`, `DepthLSSTransform` adds sparse depth + information from lidar points into the inputs of the `depthnet`.""" super().__init__( in_channels=in_channels, out_channels=out_channels, diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index da259e21d3..6d2929512d 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -110,6 +110,8 @@ def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: @TRANSFORMS.register_module() class BEVFusionRandomFlip3D: + """Compared with `RandomFlip3D`, this class directly records the lidar + augmentation matrix in the `data`.""" def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: flip_horizontal = np.random.choice([0, 1]) @@ -143,6 +145,8 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: @TRANSFORMS.register_module() class BEVFusionGlobalRotScaleTrans(GlobalRotScaleTrans): + """Compared with `GlobalRotScaleTrans`, the augmentation order in this + class is rotation, translation and scaling (RTS).""" def transform(self, input_dict: dict) -> dict: """Private function to rotate, scale and translate bounding boxes and diff --git a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 
eb6755d8c4..f0b6eeba30 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -42,7 +42,7 @@ norm_cfg=dict(type='BN2d', requires_grad=True), act_cfg=dict(type='ReLU', inplace=True), upsample_cfg=dict(mode='bilinear', align_corners=False)), - vtransform=dict( + view_transform=dict( type='DepthLSSTransform', in_channels=256, out_channels=80, From 9ac3e03128979d27457dff340b8970d3f0eb9881 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 30 May 2023 15:45:38 +0800 Subject: [PATCH 26/29] fix UT --- .../test_engine/test_hooks/test_disable_object_sample_hook.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_engine/test_hooks/test_disable_object_sample_hook.py b/tests/test_engine/test_hooks/test_disable_object_sample_hook.py index fcc1e3c8d8..cbe7cd65d5 100644 --- a/tests/test_engine/test_hooks/test_disable_object_sample_hook.py +++ b/tests/test_engine/test_hooks/test_disable_object_sample_hook.py @@ -2,6 +2,8 @@ from unittest import TestCase from unittest.mock import Mock +from mmengine.dataset import BaseDataset + from mmdet3d.datasets.transforms import ObjectSample from mmdet3d.engine.hooks import DisableObjectSampleHook @@ -10,7 +12,7 @@ class TestDisableObjectSampleHook(TestCase): runner = Mock() runner.train_dataloader = Mock() - runner.train_dataloader.dataset = Mock() + runner.train_dataloader.dataset = Mock(spec=BaseDataset) runner.train_dataloader.dataset.pipeline = Mock() runner.train_dataloader._DataLoader__initialized = True runner.train_dataloader.dataset.pipeline.transforms = [ From 411105b8a7046643074e1b39a4d6f1668ccd29c7 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 30 May 2023 17:13:16 +0800 Subject: [PATCH 27/29] Revert "use normalization query pos for stable training" This reverts commit 30091188d4a665beaadcd5a9b49e04c9da47139e. 
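For context on the revert below: it returns `create_2D_grid` to raw BEV-cell coordinates, so the head can add predicted center offsets to `query_pos` directly. A minimal sketch of the two conventions, assuming a hypothetical 180 x 180 BEV feature map (the real size comes from the voxel grid and `out_size_factor`):

```python
import torch

x_size = 180  # hypothetical BEV width, for illustration only
xs = torch.arange(x_size, dtype=torch.float32)

# Normalized convention (now reverted): positions in [0, 1], so center
# offsets must be rescaled by the feature-map size before use.
normalized = (xs + 0.5) / x_size

# Raw-grid convention (restored): positions in grid cells, so
# res_layer['center'] + query_pos works without any rescaling.
raw = xs + 0.5

assert torch.allclose(raw, normalized * x_size)
```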
---
 projects/BEVFusion/bevfusion/transfusion_head.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py
index 8d243a4b3d..a70023ae14 100644
--- a/projects/BEVFusion/bevfusion/transfusion_head.py
+++ b/projects/BEVFusion/bevfusion/transfusion_head.py
@@ -166,8 +166,8 @@ def create_2D_grid(self, x_size, y_size):
         # NOTE: modified
         batch_x, batch_y = torch.meshgrid(
             *[torch.linspace(it[0], it[1], it[2]) for it in meshgrid])
-        batch_x = (batch_x + 0.5) / x_size
-        batch_y = (batch_y + 0.5) / y_size
+        batch_x = batch_x + 0.5
+        batch_y = batch_y + 0.5
         coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0)[None]
         coord_base = coord_base.view(1, 2, -1).permute(0, 2, 1)
         return coord_base
@@ -288,11 +288,8 @@ def forward_single(self, inputs, metas):
 
             # Prediction
             res_layer = self.prediction_heads[i](query_feat)
-            xy_size = torch.tensor(
-                [fusion_feat.size(-1),
-                 fusion_feat.size(-2)]).to(query_pos)
             res_layer['center'] = res_layer['center'] + query_pos.permute(
-                0, 2, 1) * xy_size.reshape(2, -1)
+                0, 2, 1)
             ret_dicts.append(res_layer)
 
             # for next level positional embedding

From 3678fb4806016d5efbe9dab0001cd321b55adcd8 Mon Sep 17 00:00:00 2001
From: JingweiZhang12
Date: Wed, 31 May 2023 09:41:37 +0800
Subject: [PATCH 28/29] update readme

---
 projects/BEVFusion/README.md | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/projects/BEVFusion/README.md b/projects/BEVFusion/README.md
index b5a6b80593..0828bd4f8d 100644
--- a/projects/BEVFusion/README.md
+++ b/projects/BEVFusion/README.md
@@ -42,30 +42,33 @@ python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0
 1. You should train the lidar-only detector first:
 
 ```bash
-python tools/train.py projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
+bash tools/dist_train.sh projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8
 ```
 
-2. Download the [Swin pre-trained model](<>). Given the image pre-trained backbone and the lidar-only pre-trained detector, you could train the lidar-camera fusion model:
+2. Download the [Swin pre-trained model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/swint-nuimages-pretrained.pth). Given the image pre-trained backbone and the lidar-only pre-trained detector, you could train the lidar-camera fusion model:
 
 ```bash
-python tools/train.py projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE}
+bash tools/dist_train.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8 --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE}
 ```
 
+**Note** that if you want to reduce CUDA memory usage and computational overhead, you can simply append `--amp` to the above commands. The model will then be trained in fp16 mode.
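For readers wondering how `--amp` interacts with the fp32 regions added in PATCH 15/29: conceptually, AMP pairs an fp16 autocast context with a gradient scaler whose `growth_interval` matches the value set in PATCH 17/29. The sketch below is plain PyTorch and only approximates what mmengine's `AmpOptimWrapper` does internally; the tiny model and data are stand-ins.

```python
import torch

scaler = torch.cuda.amp.GradScaler(growth_interval=2000)  # cf. PATCH 17/29
model = torch.nn.Linear(8, 8).cuda()                      # stand-in model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
x = torch.randn(4, 8, device='cuda')

optimizer.zero_grad()
with torch.autocast(device_type='cuda', dtype=torch.float16):
    # In the patches above, numerically sensitive parts (view transform,
    # voxelization, heatmap head) opt back into fp32 via nested autocast
    # blocks; everything else runs in fp16 here.
    loss = model(x).pow(2).mean()

scaler.scale(loss).backward()  # scale the loss so fp16 grads do not underflow
scaler.step(optimizer)         # unscales grads; skips the step on overflow
scaler.update()                # grows the scale after enough clean steps
```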
+ ### Testing commands In MMDetection3D's root directory, run the following command to test the model: ```bash -python tools/test.py projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} +bash tools/dist_test.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} 8 ``` ## Results and models ### NuScenes -| Backbone | Voxel type (voxel size) | NMS | Mem (GB) | Inf time (fps) | NDS | mAP | Download | -| :-----------------------------------------------------------------------------: | :---------------------: | :-: | :------: | :------------: | :---: | :---: | :------------------------------------------------------------------------------------------------------: | -| [SECFPN](./configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 71.62 | 68.77 | [converted_model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link) | +| Modality | Voxel type (voxel size) | NMS | Mem (GB) | Inf time (fps) | NDS | mAP | Download | +| :------------------------------------------------------------------------------------------: | :---------------------: | :-: | :------: | :------------: | :--: | :--: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [lidar](./configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 69.6 | 64.9 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933.pth) [logs](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d_20230322_053447.log) | +| [lidar-cam](./configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 71.4 | 68.6 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af.pth) [logs](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d_20230524_001539.log) | ## Citation From e127201486dfe18da73792176431ca25f6723c1d Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Wed, 31 May 2023 14:30:38 +0800 Subject: [PATCH 29/29] fix height offset --- projects/BEVFusion/bevfusion/transfusion_head.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index a70023ae14..8a3e1750db 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -481,8 +481,6 @@ def predict_by_feat(self, ret = dict(bboxes=boxes3d, scores=scores, labels=labels) temp_instances = InstanceData() - ret['bboxes'][:, 2] = ret[ - 'bboxes'][:, 2] - ret['bboxes'][:, 5] * 0.5 # noqa: E501 temp_instances.bboxes_3d = metas[0]['box_type_3d']( ret['bboxes'], box_dim=ret['bboxes'].shape[-1]) temp_instances.scores_3d = ret['scores']
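A closing note on PATCH 29/29 above: the deleted lines shifted every decoded box down by half its height. One plausible reading, consistent with the `box_type_3d='LiDAR'` convention used throughout this series, is that the bbox coder already emits a bottom-center `z`, so the extra shift double-counted the half-height. A small sketch of the arithmetic with made-up numbers:

```python
import torch

# columns: x, y, z, dx, dy, dz, yaw (values are made up)
boxes = torch.tensor([[10.0, 2.0, -1.0, 4.0, 2.0, 1.5, 0.0]])

# Gravity center -> bottom center is a single half-height shift:
gravity_z = boxes[:, 2] + boxes[:, 5] * 0.5
bottom_z = gravity_z - boxes[:, 5] * 0.5
assert torch.allclose(bottom_z, boxes[:, 2])

# The removed line applied z -= dz * 0.5 unconditionally; on an already
# bottom-centered z it sinks the box by half its height (-1.75 vs -1.0).
wrong_z = boxes[:, 2] - boxes[:, 5] * 0.5
```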