From 07846673bd25a3859c4ff8503e3bda764845f6e6 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 17 Mar 2023 17:55:45 +0800 Subject: [PATCH 01/29] support train on nus --- projects/BEVFusion/bevfusion/__init__.py | 4 +- projects/BEVFusion/bevfusion/bevfusion.py | 11 ++- projects/BEVFusion/bevfusion/depth_lss.py | 1 + projects/BEVFusion/bevfusion/transforms_3d.py | 34 +++++++++ projects/BEVFusion/bevfusion/utils.py | 53 ++++++++++++- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 75 ++++++++++--------- 6 files changed, 137 insertions(+), 41 deletions(-) diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py index c36fc641c8..faf7fa2d9a 100644 --- a/projects/BEVFusion/bevfusion/__init__.py +++ b/projects/BEVFusion/bevfusion/__init__.py @@ -4,7 +4,7 @@ from .loading import BEVLoadMultiViewImageFromFiles from .sparse_encoder import BEVFusionSparseEncoder from .transformer import TransformerDecoderLayer -from .transforms_3d import GridMask, ImageAug3D +from .transforms_3d import BEVFusionRandomFlip3D, GridMask, ImageAug3D from .transfusion_head import ConvFuser, TransFusionHead from .utils import (BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D, IoU3DCost) @@ -14,5 +14,5 @@ 'GeneralizedLSSFPN', 'HungarianAssigner3D', 'BBoxBEVL1Cost', 'IoU3DCost', 'HeuristicAssigner3D', 'DepthLSSTransform', 'BEVLoadMultiViewImageFromFiles', 'BEVFusionSparseEncoder', - 'TransformerDecoderLayer' + 'TransformerDecoderLayer', 'BEVFusionRandomFlip3D' ] diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index a823528207..e13e4b6c19 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -239,4 +239,13 @@ def extract_feat( def loss(self, batch_inputs_dict: Dict[str, Optional[Tensor]], batch_data_samples: List[Det3DDataSample], **kwargs) -> List[Det3DDataSample]: - pass + batch_input_metas = [item.metainfo for item in batch_data_samples] + feats = self.extract_feat(batch_inputs_dict, batch_input_metas) + + losses = dict() + if self.with_bbox_head: + bbox_loss = self.bbox_head.loss(feats, batch_data_samples) + + losses.update(bbox_loss) + + return losses diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index f336c9289b..072375dd37 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -241,6 +241,7 @@ def forward( for c in range(on_img.shape[0]): masked_coords = cur_coords[c, on_img[c]].long() masked_dist = dist[c, on_img[c]] + depth = depth.to(masked_dist.dtype) depth[b, c, 0, masked_coords[:, 0], masked_coords[:, 1]] = masked_dist diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index 35116f591f..1941f21142 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -1,4 +1,5 @@ # modify from https://github.com/mit-han-lab/bevfusion +import random from typing import Any, Dict import numpy as np @@ -107,6 +108,39 @@ def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: return data +@TRANSFORMS.register_module() +class BEVFusionRandomFlip3D: + + def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: + flip_horizontal = random.choice([0, 1]) + flip_vertical = random.choice([0, 1]) + + rotation = np.eye(3) + if flip_horizontal: + rotation = np.array([[1, 0, 0], [0, -1, 0], [0, 0, 1]]) @ rotation + if 'points' in data: + 
data['points'].flip('horizontal')
+            if 'gt_bboxes_3d' in data:
+                data['gt_bboxes_3d'].flip('horizontal')
+            if 'gt_masks_bev' in data:
+                data['gt_masks_bev'] = data['gt_masks_bev'][:, :, ::-1].copy()
+
+        if flip_vertical:
+            rotation = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) @ rotation
+            if 'points' in data:
+                data['points'].flip('vertical')
+            if 'gt_bboxes_3d' in data:
+                data['gt_bboxes_3d'].flip('vertical')
+            if 'gt_masks_bev' in data:
+                data['gt_masks_bev'] = data['gt_masks_bev'][:, ::-1, :].copy()
+
+        if 'lidar_aug_matrix' not in data:
+            data['lidar_aug_matrix'] = np.eye(4)
+        data['lidar_aug_matrix'][:3, :] = rotation @ data[
+            'lidar_aug_matrix'][:3, :]
+        return data
+
+
 @TRANSFORMS.register_module()
 class GridMask(BaseTransform):
 
diff --git a/projects/BEVFusion/bevfusion/utils.py b/projects/BEVFusion/bevfusion/utils.py
index 0ce5472615..66847df5b1 100644
--- a/projects/BEVFusion/bevfusion/utils.py
+++ b/projects/BEVFusion/bevfusion/utils.py
@@ -1,5 +1,9 @@
 # modify from https://github.com/mit-han-lab/bevfusion
+from collections import abc
+
+import numpy as np
 import torch
+import torch.nn as nn
 from mmdet.models.task_modules import AssignResult, BaseAssigner, BaseBBoxCoder
 
 try:
@@ -7,6 +11,8 @@
 except ImportError:
     linear_sum_assignment = None
 
+from mmengine.structures import InstanceData
+
 from mmdet3d.registry import TASK_UTILS
 
 
@@ -273,8 +279,11 @@ def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg):
                 num_gts, assigned_gt_inds, None, labels=assigned_labels)
 
         # 2. compute the weighted costs
-        # see mmdetection/mmdet/core/bbox/match_costs/match_cost.py
-        cls_cost = self.cls_cost(cls_pred[0].T, gt_labels)
+        # Hard code here to be compatible with the interface of
+        # `ClassificationCost` in mmdet.
+        gt_instances, pred_instances = InstanceData(
+            labels=gt_labels), InstanceData(scores=cls_pred[0].T)
+        cls_cost = self.cls_cost(pred_instances, gt_instances)
         reg_cost = self.reg_cost(bboxes, gt_bboxes, train_cfg)
         iou = self.iou_calculator(bboxes, gt_bboxes)
         iou_cost = self.iou_cost(iou)
@@ -304,3 +313,43 @@ def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg):
             # max_overlaps = iou.max(1).values
         return AssignResult(
             num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels)
+
+
+def cast_tensor_type(inputs, src_type: torch.dtype, dst_type: torch.dtype):
+    """Recursively convert Tensor in inputs from src_type to dst_type.
+
+    Note:
+        In v1.4.4 and later, ``cast_tensor_type`` will only convert the
+        torch.Tensor which is consistent with ``src_type`` to the ``dst_type``.
+        Before v1.4.4, it ignores the ``src_type`` argument, leading to some
+        potential problems. For example,
+        ``cast_tensor_type(inputs, torch.float, torch.half)`` will convert all
+        tensors in inputs to ``torch.half`` including those originally in
+        ``torch.int`` or other types, which is not expected.
+    Args:
+        inputs: Inputs to be cast.
+        src_type (torch.dtype): Source type.
+        dst_type (torch.dtype): Destination type.
+    Returns:
+        The same type as inputs, but all contained Tensors have been cast.
+    """
+    if isinstance(inputs, nn.Module):
+        return inputs
+    elif isinstance(inputs, torch.Tensor):
+        # we need to ensure that the type of the inputs to be cast is the same
+        # as the argument `src_type`.
+ return inputs.to(dst_type) if inputs.dtype == src_type else inputs + elif isinstance(inputs, str): + return inputs + elif isinstance(inputs, np.ndarray): + return inputs + elif isinstance(inputs, abc.Mapping): + return type(inputs)({ # type: ignore + k: cast_tensor_type(v, src_type, dst_type) + for k, v in inputs.items() + }) + elif isinstance(inputs, abc.Iterable): + return type(inputs)( # type: ignore + cast_tensor_type(item, src_type, dst_type) for item in inputs) + else: + return diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 8f12892372..865952342c 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -14,9 +14,9 @@ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] -metainfo = dict(classes=class_names) +metainfo = dict(classes=class_names, version='v1.0-mini') dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' +data_root = 'data/nuscenes_mini/' data_prefix = dict( pts='samples/LIDAR_TOP', CAM_FRONT='samples/CAM_FRONT', @@ -194,24 +194,22 @@ filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), classes=class_names, sample_groups=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5), + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), points_loader=dict( type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, use_dim=[0, 1, 2, 3, 4], - reduce_beams=32, - backend_args=backend_args), - backend_args=backend_args) + backend_args=backend_args)) train_pipeline = [ dict( @@ -224,18 +222,14 @@ coord_type='LIDAR', load_dim=5, use_dim=5, - reduce_beams=32, - load_augmented=None, backend_args=backend_args), dict( type='LoadPointsFromMultiSweeps', sweeps_num=9, load_dim=5, use_dim=5, - reduce_beams=32, pad_empty_sweeps=True, remove_close=True, - load_augmented=None, backend_args=backend_args), dict( type='LoadAnnotations3D', @@ -253,11 +247,10 @@ is_train=True), dict( type='GlobalRotScaleTrans', - resize_lim=[0.9, 1.1], - rot_lim=[-0.78539816, 0.78539816], - trans_lim=0.5, - is_train=True), - dict(type='RandomFlip3D'), + scale_ratio_range=[0.9, 1.1], + rot_range=[-0.78539816, 0.78539816], + translation_std=0.5), + dict(type='BEVFusionRandomFlip3D'), dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), dict( @@ -283,6 +276,13 @@ keys=[ 'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', 'gt_labels' + ], + meta_keys=[ + 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', + 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', + 'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix', + 'lidar_aug_matrix' ]) ] @@ -333,18 +333,19 @@ persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='nuscenes_infos_train.pkl', - pipeline=train_pipeline, - metainfo=metainfo, - modality=input_modality, - test_mode=False, - data_prefix=data_prefix, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and 
scannet dataset. - box_type_3d='LiDAR', - backend_args=backend_args)) + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'))) val_dataloader = dict( batch_size=1, num_workers=0, @@ -428,3 +429,5 @@ default_hooks = dict( logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=5)) + +load_from = 'checkpoints/bevfusion_init_converted.pth' From 2dbf063632647b27fd02223d472da4c4be62e52e Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 20 Mar 2023 11:38:11 +0800 Subject: [PATCH 02/29] refactor transfusion head --- .../BEVFusion/bevfusion/transfusion_head.py | 89 +++++++++++++------ ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 6 +- 2 files changed, 64 insertions(+), 31 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index 59dbdf891f..f4e918ae68 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -1,6 +1,6 @@ # modify from https://github.com/mit-han-lab/bevfusion import copy -from typing import List +from typing import List, Tuple import numpy as np import torch @@ -498,12 +498,23 @@ def predict_by_feat(self, return rets[0] - def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_dict): + def get_targets(self, batch_gt_instances_3d: List[InstanceData], + preds_dict: List[dict]): """Generate training targets. Args: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. - gt_labels_3d (torch.Tensor): Labels of boxes. - preds_dicts (tuple of dict): first index by layer (default 1) + batch_gt_instances_3d (List[InstanceData]): + preds_dict (list[dict]): The prediction results. The index of the + list is the index of layers. The inner dict contains + predictions of one mini-batch: + - center: (bs, 2, num_proposals) + - height: (bs, 1, num_proposals) + - dim: (bs, 3, num_proposals) + - rot: (bs, 2, num_proposals) + - vel: (bs, 2, num_proposals) + - cls_logit: (bs, num_classes, num_proposals) + - query_score: (bs, num_classes, num_proposals) + - heatmap: The original heatmap before fed into transformer + decoder, with shape (bs, 10, h, w) Returns: tuple[torch.Tensor]: Tuple of target including \ the following results in order. 
@@ -516,20 +527,23 @@ def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_dict): # change preds_dict into list of dict (index by batch_id) # preds_dict[0]['center'].shape [bs, 3, num_proposal] list_of_pred_dict = [] - for batch_idx in range(len(gt_bboxes_3d)): + for batch_idx in range(len(batch_gt_instances_3d)): pred_dict = {} for key in preds_dict[0].keys(): - pred_dict[key] = preds_dict[0][key][batch_idx:batch_idx + 1] + preds = [] + for i in range(self.num_decoder_layers): + pred_one_layer = preds_dict[i][key][batch_idx:batch_idx + + 1] + preds.append(pred_one_layer) + pred_dict[key] = torch.cat(preds) list_of_pred_dict.append(pred_dict) - assert len(gt_bboxes_3d) == len(list_of_pred_dict) - + assert len(batch_gt_instances_3d) == len(list_of_pred_dict) res_tuple = multi_apply( self.get_targets_single, - gt_bboxes_3d, - gt_labels_3d, + batch_gt_instances_3d, list_of_pred_dict, - np.arange(len(gt_labels_3d)), + np.arange(len(batch_gt_instances_3d)), ) labels = torch.cat(res_tuple[0], dim=0) label_weights = torch.cat(res_tuple[1], dim=0) @@ -550,23 +564,26 @@ def get_targets(self, gt_bboxes_3d, gt_labels_3d, preds_dict): heatmap, ) - def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, preds_dict, - batch_idx): + def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx): """Generate training targets for a single sample. Args: - gt_bboxes_3d (:obj:`LiDARInstance3DBoxes`): Ground truth gt boxes. - gt_labels_3d (torch.Tensor): Labels of boxes. - preds_dict (dict): dict of prediction result for a single sample + gt_instances_3d (:obj:`InstanceData`): ground truth of instances. + preds_dict (dict): dict of prediction result for a single sample. Returns: tuple[torch.Tensor]: Tuple of target including \ the following results in order. - torch.Tensor: classification target. [1, num_proposals] - - torch.Tensor: classification weights (mask) [1, num_proposals] # noqa: E501 + - torch.Tensor: classification weights (mask) [1, + num_proposals] # noqa: E501 - torch.Tensor: regression target. [1, num_proposals, 8] - torch.Tensor: regression weights. [1, num_proposals, 8] - torch.Tensor: iou target. [1, num_proposals] - int: number of positive proposals + - torch.Tensor: heatmap targets. """ + # 1. Assignment + gt_bboxes_3d = gt_instances_3d.bboxes_3d + gt_labels_3d = gt_instances_3d.labels_3d num_proposals = preds_dict['center'].shape[-1] # get pred boxes, carefully ! don't change the network outputs @@ -628,14 +645,19 @@ def get_targets_single(self, gt_bboxes_3d, gt_labels_3d, preds_dict, [res.max_overlaps for res in assign_result_list]), labels=torch.cat([res.labels for res in assign_result_list]), ) + + # 2. Sampling. Compatible with the interface of `PseudoSampler` in + # mmdet. + gt_instances, pred_instances = InstanceData( + bboxes=gt_bboxes_tensor), InstanceData(priors=bboxes_tensor) sampling_result = self.bbox_sampler.sample(assign_result_ensemble, - bboxes_tensor, - gt_bboxes_tensor) + pred_instances, + gt_instances) pos_inds = sampling_result.pos_inds neg_inds = sampling_result.neg_inds assert len(pos_inds) + len(neg_inds) == num_proposals - # create target for loss computation + # 3. 
Create target for loss computation
         bbox_targets = torch.zeros([num_proposals, self.bbox_coder.code_size
                                     ]).to(center.device)
         bbox_weights = torch.zeros([num_proposals, self.bbox_coder.code_size
@@ -723,17 +745,28 @@ def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx):
             heatmap[None],
         )
 
-    def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
+    def loss(self, batch_feats, batch_data_samples):
         """Loss function for CenterHead.
         Args:
-            gt_bboxes_3d (list[:obj:`LiDARInstance3DBoxes`]): Ground
-                truth gt boxes.
-            gt_labels_3d (list[torch.Tensor]): Labels of boxes.
-            preds_dicts (list[list[dict]]): Output of forward function.
-
+            batch_feats (): Features in a batch.
+            batch_data_samples (List[:obj:`Det3DDataSample`]): The Data
+                Samples. It usually includes information such as
+                `gt_instance_3d`.
         Returns:
             dict[str:torch.Tensor]: Loss of heatmap and bbox of each task.
         """
+        batch_input_metas, batch_gt_instances_3d = [], []
+        for data_sample in batch_data_samples:
+            batch_input_metas.append(data_sample.metainfo)
+            batch_gt_instances_3d.append(data_sample.gt_instances_3d)
+        preds_dicts = self(batch_feats, batch_input_metas)
+        loss = self.loss_by_feat(preds_dicts, batch_gt_instances_3d)
+
+        return loss
+
+    def loss_by_feat(self, preds_dicts: Tuple[List[dict]],
+                     batch_gt_instances_3d: List[InstanceData], *args,
+                     **kwargs):
         (
             labels,
             label_weights,
@@ -743,7 +776,7 @@ def loss(self, gt_bboxes_3d, gt_labels_3d, preds_dicts, **kwargs):
             num_pos,
             matched_ious,
             heatmap,
-        ) = self.get_targets(gt_bboxes_3d, gt_labels_3d, preds_dicts[0])
+        ) = self.get_targets(batch_gt_instances_3d, preds_dicts[0])
         if hasattr(self, 'on_the_image_mask'):
             label_weights = label_weights * self.on_the_image_mask
             bbox_weights = bbox_weights * self.on_the_image_mask[:, :, None]
diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
index 865952342c..5514a2fcbe 100644
--- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
+++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
@@ -27,7 +27,7 @@
     CAM_BACK_LEFT='samples/CAM_BACK_LEFT',
     sweeps='sweeps/LIDAR_TOP')
 input_modality = dict(use_lidar=True, use_camera=True)
-backend_args = None
+backend_args = dict(backend='petrel', path_mapping={'data/nuscenes_mini/':'s3://openmmlab/datasets/detection3d/nuscenes/'})
 
 model = dict(
     type='BEVFusion',
@@ -329,8 +329,8 @@
 
 train_dataloader = dict(
     batch_size=4,
-    num_workers=4,
-    persistent_workers=True,
+    num_workers=0,
+    # persistent_workers=True,
     sampler=dict(type='DefaultSampler', shuffle=True),
     dataset=dict(
         type='CBGSDataset',

From 149150daa6111ce76ea07a956d59d0e79f604c0b Mon Sep 17 00:00:00 2001
From: JingweiZhang12
Date: Mon, 20 Mar 2023 13:58:36 +0800
Subject: [PATCH 03/29] img branch optional

---
 projects/BEVFusion/bevfusion/bevfusion.py       | 15 +++++++++------
 .../BEVFusion/bevfusion/transfusion_head.py     |  3 ++-
 ...0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 17 +++++++++++------
 3 files changed, 22 insertions(+), 13 deletions(-)

diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py
index e13e4b6c19..6cf709ca93 100644
--- a/projects/BEVFusion/bevfusion/bevfusion.py
+++ b/projects/BEVFusion/bevfusion/bevfusion.py
@@ -217,13 +217,16 @@ def extract_feat(
         camera2lidar = imgs.new_tensor(np.asarray(camera2lidar))
         img_aug_matrix =
imgs.new_tensor(np.asarray(img_aug_matrix)) lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) - img_feature = self.extract_img_feat(imgs, points, lidar2image, - camera_intrinsics, camera2lidar, - img_aug_matrix, lidar_aug_matrix, - batch_input_metas) + features = [] + if imgs is not None: + img_feature = self.extract_img_feat(imgs, points, lidar2image, + camera_intrinsics, + camera2lidar, img_aug_matrix, + lidar_aug_matrix, + batch_input_metas) + features.append(img_feature) pts_feature = self.extract_pts_feat(batch_inputs_dict) - - features = [img_feature, pts_feature] + features.append(pts_feature) if self.fusion_layer is not None: x = self.fusion_layer(features) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index f4e918ae68..7b9e3fe0ca 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -1,6 +1,6 @@ # modify from https://github.com/mit-han-lab/bevfusion import copy -from typing import List, Tuple +from typing import List, Tuple import numpy as np import torch @@ -747,6 +747,7 @@ def get_targets_single(self, gt_instances_3d, preds_dict, batch_idx): def loss(self, batch_feats, batch_data_samples): """Loss function for CenterHead. + Args: batch_feats (): Features in a batch. batch_data_samples (List[:obj:`Det3DDataSample`]): The Data diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 5514a2fcbe..fc25c26bae 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -16,7 +16,7 @@ metainfo = dict(classes=class_names, version='v1.0-mini') dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes_mini/' +data_root = 'data/nuscenes/' data_prefix = dict( pts='samples/LIDAR_TOP', CAM_FRONT='samples/CAM_FRONT', @@ -27,7 +27,12 @@ CAM_BACK_LEFT='samples/CAM_BACK_LEFT', sweeps='sweeps/LIDAR_TOP') input_modality = dict(use_lidar=True, use_camera=True) -backend_args = dict(backend='petrel', path_mapping={'data/nuscenes_mini/':'s3://openmmlab/datasets/detection3d/nuscenes/'}) +backend_args = dict( + backend='petrel', + path_mapping={ + 'data/nuscenes_mini/': 's3://openmmlab/datasets/detection3d/nuscenes/', + 'data/nuscenes/': 's3://openmmlab/datasets/detection3d/nuscenes/' + }) model = dict( type='BEVFusion', @@ -329,8 +334,8 @@ train_dataloader = dict( batch_size=4, - num_workers=0, - # persistent_workers=True, + num_workers=4, + persistent_workers=True, sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( type='CBGSDataset', @@ -348,8 +353,8 @@ box_type_3d='LiDAR'))) val_dataloader = dict( batch_size=1, - num_workers=0, - # persistent_workers=True, + num_workers=4, + persistent_workers=True, drop_last=False, sampler=dict(type='DefaultSampler', shuffle=False), dataset=dict( From 1a9c6815054321b958c9bee241067820a6a5a1d3 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 20 Mar 2023 14:08:10 +0800 Subject: [PATCH 04/29] support nuscenes_mini in replace_ceph_backend --- mmdet3d/utils/misc.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mmdet3d/utils/misc.py b/mmdet3d/utils/misc.py index e5f4b47d33..8210e7a91d 100644 --- a/mmdet3d/utils/misc.py +++ b/mmdet3d/utils/misc.py @@ -40,6 +40,9 @@ def replace_ceph_backend(cfg): elif 'nuimages' in 
cfg_pretty_text: replace_strs = replace_strs.replace('DATA', 'nuimages') replace_strs = replace_strs.replace('CEPH', 'nuimages') + elif 'nuscenes_mini' in cfg_pretty_text: + replace_strs = replace_strs.replace('DATA', 'nuscenes_mini') + replace_strs = replace_strs.replace('CEPH', 'nuscenes_mini') else: NotImplemented('Does not support global replacement') From eb8c69d7a693b6777b3c7615f99ab6e214f852c4 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 20 Mar 2023 16:01:03 +0800 Subject: [PATCH 05/29] use replace_ceph --- ...ion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index fc25c26bae..9741bbe861 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -14,7 +14,7 @@ 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ] -metainfo = dict(classes=class_names, version='v1.0-mini') +metainfo = dict(classes=class_names) dataset_type = 'NuScenesDataset' data_root = 'data/nuscenes/' data_prefix = dict( @@ -27,12 +27,7 @@ CAM_BACK_LEFT='samples/CAM_BACK_LEFT', sweeps='sweeps/LIDAR_TOP') input_modality = dict(use_lidar=True, use_camera=True) -backend_args = dict( - backend='petrel', - path_mapping={ - 'data/nuscenes_mini/': 's3://openmmlab/datasets/detection3d/nuscenes/', - 'data/nuscenes/': 's3://openmmlab/datasets/detection3d/nuscenes/' - }) +backend_args = None model = dict( type='BEVFusion', From 4223023d3931eb25f343f1af5fe1e0c9c0577729 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 20 Mar 2023 19:43:01 +0800 Subject: [PATCH 06/29] add only-lidar --- mmdet3d/utils/misc.py | 3 - projects/BEVFusion/bevfusion/bevfusion.py | 44 +- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 383 ++++++++++++++++++ ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 21 +- 4 files changed, 422 insertions(+), 29 deletions(-) create mode 100644 projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py diff --git a/mmdet3d/utils/misc.py b/mmdet3d/utils/misc.py index 8210e7a91d..e5f4b47d33 100644 --- a/mmdet3d/utils/misc.py +++ b/mmdet3d/utils/misc.py @@ -40,9 +40,6 @@ def replace_ceph_backend(cfg): elif 'nuimages' in cfg_pretty_text: replace_strs = replace_strs.replace('DATA', 'nuimages') replace_strs = replace_strs.replace('CEPH', 'nuimages') - elif 'nuscenes_mini' in cfg_pretty_text: - replace_strs = replace_strs.replace('DATA', 'nuscenes_mini') - replace_strs = replace_strs.replace('CEPH', 'nuscenes_mini') else: NotImplemented('Does not support global replacement') diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 6cf709ca93..431ecf89f7 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -40,12 +40,16 @@ def __init__( self.pts_voxel_encoder = MODELS.build(pts_voxel_encoder) - self.img_backbone = MODELS.build(img_backbone) - self.img_neck = MODELS.build(img_neck) - self.vtransform = MODELS.build(vtransform) + self.img_backbone = MODELS.build( + img_backbone) if img_backbone is not None else None + self.img_neck = MODELS.build( + img_neck) if img_neck is not None else None + self.vtransform = MODELS.build( + vtransform) if vtransform is not None else None 
self.pts_middle_encoder = MODELS.build(pts_middle_encoder) - self.fusion_layer = MODELS.build(fusion_layer) + self.fusion_layer = MODELS.build( + fusion_layer) if fusion_layer is not None else None self.pts_backbone = MODELS.build(pts_backbone) self.pts_neck = MODELS.build(pts_neck) @@ -53,7 +57,7 @@ def __init__( self.bbox_head = MODELS.build(bbox_head) # hard code here where using converted checkpoint of original # implementation of `BEVFusion` - self.use_converted_checkpoint = True + self.use_converted_checkpoint = False self.init_weights() @@ -202,23 +206,23 @@ def extract_feat( ): imgs = batch_inputs_dict.get('imgs', None) points = batch_inputs_dict.get('points', None) - - lidar2image, camera_intrinsics, camera2lidar = [], [], [] - img_aug_matrix, lidar_aug_matrix = [], [] - for i, meta in enumerate(batch_input_metas): - lidar2image.append(meta['lidar2img']) - camera_intrinsics.append(meta['cam2img']) - camera2lidar.append(meta['cam2lidar']) - img_aug_matrix.append(meta.get('img_aug_matrix', np.eye(4))) - lidar_aug_matrix.append(meta.get('lidar_aug_matrix', np.eye(4))) - - lidar2image = imgs.new_tensor(np.asarray(lidar2image)) - camera_intrinsics = imgs.new_tensor(np.array(camera_intrinsics)) - camera2lidar = imgs.new_tensor(np.asarray(camera2lidar)) - img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix)) - lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) features = [] if imgs is not None: + lidar2image, camera_intrinsics, camera2lidar = [], [], [] + img_aug_matrix, lidar_aug_matrix = [], [] + for i, meta in enumerate(batch_input_metas): + lidar2image.append(meta['lidar2img']) + camera_intrinsics.append(meta['cam2img']) + camera2lidar.append(meta['cam2lidar']) + img_aug_matrix.append(meta.get('img_aug_matrix', np.eye(4))) + lidar_aug_matrix.append( + meta.get('lidar_aug_matrix', np.eye(4))) + + lidar2image = imgs.new_tensor(np.asarray(lidar2image)) + camera_intrinsics = imgs.new_tensor(np.array(camera_intrinsics)) + camera2lidar = imgs.new_tensor(np.asarray(camera2lidar)) + img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix)) + lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) img_feature = self.extract_img_feat(imgs, points, lidar2image, camera_intrinsics, camera2lidar, img_aug_matrix, diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py new file mode 100644 index 0000000000..bee8274a3e --- /dev/null +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -0,0 +1,383 @@ +_base_ = ['mmdet3d::_base_/default_runtime.py'] +custom_imports = dict( + imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) + +# model settings +# Voxel size for voxel encoder +# Usually voxel size is changed consistently with the point cloud range +# If point cloud range is modified, do remember to change all related +# keys in the config. 
+voxel_size = [0.075, 0.075, 0.2] +point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] +class_names = [ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', + 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +] + +metainfo = dict(classes=class_names) +dataset_type = 'NuScenesDataset' +data_root = 'data/nuscenes/' +data_prefix = dict( + pts='samples/LIDAR_TOP', + CAM_FRONT='samples/CAM_FRONT', + CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', + CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', + CAM_BACK='samples/CAM_BACK', + CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', + CAM_BACK_LEFT='samples/CAM_BACK_LEFT', + sweeps='sweeps/LIDAR_TOP') +input_modality = dict(use_lidar=True, use_camera=False) +backend_args = dict( + backend='petrel', + path_mapping=dict({ + './data/nuscenes/': + 's3://openmmlab/datasets/detection3d/nuscenes/', + 'data/nuscenes/': + 's3://openmmlab/datasets/detection3d/nuscenes/', + './data/nuscenes_mini/': + 's3://openmmlab/datasets/detection3d/nuscenes/', + 'data/nuscenes_mini/': + 's3://openmmlab/datasets/detection3d/nuscenes/' + })) + +model = dict( + type='BEVFusion', + data_preprocessor=dict( + type='Det3DDataPreprocessor', + pad_size_divisor=32, + voxelize_cfg=dict( + max_num_points=10, + point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], + voxel_size=[0.075, 0.075, 0.2], + max_voxels=[120000, 160000], + voxelize_reduce=True)), + pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), + pts_middle_encoder=dict( + type='BEVFusionSparseEncoder', + in_channels=5, + sparse_shape=[1440, 1440, 41], + order=('conv', 'norm', 'act'), + norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, + 128)), + encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), + block_type='basicblock'), + pts_backbone=dict( + type='SECOND', + in_channels=256, + out_channels=[128, 256], + layer_nums=[5, 5], + layer_strides=[1, 2], + norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + conv_cfg=dict(type='Conv2d', bias=False)), + pts_neck=dict( + type='SECONDFPN', + in_channels=[128, 256], + out_channels=[256, 256], + upsample_strides=[1, 2], + norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + upsample_cfg=dict(type='deconv', bias=False), + use_conv_for_no_stride=True), + bbox_head=dict( + type='TransFusionHead', + num_proposals=200, + auxiliary=True, + in_channels=512, + hidden_channel=128, + num_classes=10, + nms_kernel_size=3, + bn_momentum=0.1, + num_decoder_layers=1, + decoder_layer=dict( + type='TransformerDecoderLayer', + self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), + cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), + ffn_cfg=dict( + embed_dims=128, + feedforward_channels=256, + num_fcs=2, + ffn_drop=0.1, + act_cfg=dict(type='ReLU', inplace=True), + ), + norm_cfg=dict(type='LN'), + pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), + train_cfg=dict( + dataset='nuScenes', + point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], + grid_size=[1440, 1440, 41], + voxel_size=[0.075, 0.075, 0.2], + out_size_factor=8, + gaussian_overlap=0.1, + min_radius=2, + pos_weight=-1, + code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], + assigner=dict( + type='HungarianAssigner3D', + iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), + cls_cost=dict( + type='mmdet.FocalLossCost', + gamma=2.0, + alpha=0.25, + weight=0.15), + reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), + iou_cost=dict(type='IoU3DCost', weight=0.25))), + 
test_cfg=dict( + dataset='nuScenes', + grid_size=[1440, 1440, 41], + out_size_factor=8, + voxel_size=[0.075, 0.075], + pc_range=[-54.0, -54.0], + nms_type=None), + common_heads=dict( + center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), + bbox_coder=dict( + type='TransFusionBBoxCoder', + pc_range=[-54.0, -54.0], + post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], + score_threshold=0.0, + out_size_factor=8, + voxel_size=[0.075, 0.075], + code_size=10), + loss_cls=dict( + type='mmdet.FocalLoss', + use_sigmoid=True, + gamma=2.0, + alpha=0.25, + reduction='mean', + loss_weight=1.0), + loss_heatmap=dict( + type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), + loss_bbox=dict( + type='mmdet.L1Loss', reduction='mean', loss_weight=0.25))) + +db_sampler = dict( + data_root=data_root, + info_path=data_root + 'nuscenes_dbinfos_train.pkl', + rate=1.0, + prepare=dict( + filter_by_difficulty=[-1], + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), + classes=class_names, + sample_groups=dict( + car=2, + truck=3, + construction_vehicle=7, + bus=4, + trailer=6, + barrier=2, + motorcycle=6, + bicycle=6, + pedestrian=2, + traffic_cone=2), + points_loader=dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=[0, 1, 2, 3, 4], + backend_args=backend_args)) + +train_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + backend_args=backend_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=9, + load_dim=5, + use_dim=5, + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args), + dict( + type='LoadAnnotations3D', + with_bbox_3d=True, + with_label_3d=True, + with_attr_label=False), + dict(type='ObjectSample', db_sampler=db_sampler), + dict( + type='GlobalRotScaleTrans', + scale_ratio_range=[0.9, 1.1], + rot_range=[-0.78539816, 0.78539816], + translation_std=0.5), + dict(type='BEVFusionRandomFlip3D'), + dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range), + dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range), + dict( + type='ObjectNameFilter', + classes=[ + 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', + 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' + ]), + dict(type='PointShuffle'), + dict( + type='Pack3DDetInputs', + keys=[ + 'points', 'img', 'gt_bboxes_3d', 'gt_labels_3d', 'gt_bboxes', + 'gt_labels' + ], + meta_keys=[ + 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', + 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', + 'lidar_path', 'img_path', 'transformation_3d_flow', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans', 'img_aug_matrix', + 'lidar_aug_matrix' + ]) +] + +test_pipeline = [ + dict( + type='LoadPointsFromFile', + coord_type='LIDAR', + load_dim=5, + use_dim=5, + backend_args=backend_args), + dict( + type='LoadPointsFromMultiSweeps', + sweeps_num=9, + load_dim=5, + use_dim=5, + pad_empty_sweeps=True, + remove_close=True, + backend_args=backend_args), + dict( + type='PointsRangeFilter', + point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0]), + dict( + type='Pack3DDetInputs', + keys=['img', 'points', 'gt_bboxes_3d', 'gt_labels_3d'], + meta_keys=[ + 'cam2img', 'ori_cam2img', 'lidar2cam', 'lidar2img', 'cam2lidar', + 'ori_lidar2img', 'img_aug_matrix', 'box_type_3d', 'sample_idx', + 'lidar_path', 'img_path' + ]) +] + +train_dataloader = dict( + 
batch_size=4, + num_workers=4, + persistent_workers=True, + sampler=dict(type='DefaultSampler', shuffle=True), + dataset=dict( + type='CBGSDataset', + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_train.pkl', + pipeline=train_pipeline, + metainfo=metainfo, + modality=input_modality, + test_mode=False, + data_prefix=data_prefix, + # we use box_type_3d='LiDAR' in kitti and nuscenes dataset + # and box_type_3d='Depth' in sunrgbd and scannet dataset. + box_type_3d='LiDAR'))) +val_dataloader = dict( + batch_size=1, + num_workers=4, + persistent_workers=True, + drop_last=False, + sampler=dict(type='DefaultSampler', shuffle=False), + dataset=dict( + type=dataset_type, + data_root=data_root, + ann_file='nuscenes_infos_val.pkl', + pipeline=test_pipeline, + metainfo=metainfo, + modality=input_modality, + data_prefix=data_prefix, + test_mode=True, + box_type_3d='LiDAR', + backend_args=backend_args)) +test_dataloader = val_dataloader + +val_evaluator = dict( + type='NuScenesMetric', + data_root=data_root, + ann_file=data_root + 'nuscenes_infos_val.pkl', + metric='bbox', + backend_args=backend_args) +test_evaluator = val_evaluator + +vis_backends = [dict(type='LocalVisBackend')] +visualizer = dict( + type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') + +# learning rate +lr = 0.0001 +param_scheduler = [ + # learning rate scheduler + # During the first 8 epochs, learning rate increases from 0 to lr * 10 + # during the next 12 epochs, learning rate decreases from lr * 10 to + # lr * 1e-4 + dict( + type='CosineAnnealingLR', + T_max=8, + eta_min=lr * 10, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingLR', + T_max=12, + eta_min=lr * 1e-4, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True), + # momentum scheduler + # During the first 8 epochs, momentum increases from 0 to 0.85 / 0.95 + # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1 + dict( + type='CosineAnnealingMomentum', + T_max=8, + eta_min=0.85 / 0.95, + begin=0, + end=8, + by_epoch=True, + convert_to_iter_based=True), + dict( + type='CosineAnnealingMomentum', + T_max=12, + eta_min=1, + begin=8, + end=20, + by_epoch=True, + convert_to_iter_based=True) +] + +# runtime settings +train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=1) +val_cfg = dict() +test_cfg = dict() + +optim_wrapper = dict( + type='OptimWrapper', + optimizer=dict(type='AdamW', lr=lr, weight_decay=0.01), + clip_grad=dict(max_norm=35, norm_type=2)) + +# Default setting for scaling LR automatically +# - `enable` means enable scaling LR automatically +# or not by default. +# - `base_batch_size` = (8 GPUs) x (4 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=32) + +default_hooks = dict( + logger=dict(type='LoggerHook', interval=1), + checkpoint=dict(type='CheckpointHook', interval=5)) +custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)] + +load_from = 'checkpoints/bevfusion_init_converted.pth' diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 9741bbe861..fe5d86e727 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -191,7 +191,17 @@ rate=1.0, prepare=dict( filter_by_difficulty=[-1], - filter_by_min_points=dict(Car=5, Pedestrian=5, Cyclist=5)), + filter_by_min_points=dict( + car=5, + truck=5, + bus=5, + trailer=5, + construction_vehicle=5, + traffic_cone=5, + barrier=5, + motorcycle=5, + bicycle=5, + pedestrian=5)), classes=class_names, sample_groups=dict( car=2, @@ -208,8 +218,7 @@ type='LoadPointsFromFile', coord_type='LIDAR', load_dim=5, - use_dim=[0, 1, 2, 3, 4], - backend_args=backend_args)) + use_dim=[0, 1, 2, 3, 4])) train_pipeline = [ dict( @@ -236,7 +245,7 @@ with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - # dict(type='ObjectSampling', db_sampler=db_sampler), + # dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ImageAug3D', final_dim=[256, 704], @@ -423,8 +432,8 @@ # Default setting for scaling LR automatically # - `enable` means enable scaling LR automatically # or not by default. -# - `base_batch_size` = (4 GPUs) x (4 samples per GPU). -auto_scale_lr = dict(enable=False, base_batch_size=16) +# - `base_batch_size` = (8 GPUs) x (4 samples per GPU). 
+auto_scale_lr = dict(enable=False, base_batch_size=32) default_hooks = dict( logger=dict(type='LoggerHook', interval=50), From 5c5d55501aea57aa4e7599572f2384f1fe3d7483 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 21 Mar 2023 11:58:04 +0800 Subject: [PATCH 07/29] use valid_flag in dataset filter --- projects/BEVFusion/bevfusion/bevfusion.py | 1 + projects/BEVFusion/bevfusion/depth_lss.py | 2 -- .../BEVFusion/bevfusion/transfusion_head.py | 2 -- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 30 ++++++++++--------- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 1 + 5 files changed, 18 insertions(+), 18 deletions(-) diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 431ecf89f7..c0cd62cf54 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -208,6 +208,7 @@ def extract_feat( points = batch_inputs_dict.get('points', None) features = [] if imgs is not None: + imgs = imgs.contiguous() lidar2image, camera_intrinsics, camera2lidar = [], [], [] img_aug_matrix, lidar_aug_matrix = [], [] for i, meta in enumerate(batch_input_metas): diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index 072375dd37..069f4ea558 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -202,8 +202,6 @@ def forward( camera2lidar_rots = camera2lidar[..., :3, :3] camera2lidar_trans = camera2lidar[..., :3, 3] - # print(img.shape, self.image_size, self.feature_size) - batch_size = len(points) depth = torch.zeros(batch_size, img.shape[1], 1, *self.image_size).to(points[0].device) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index 7b9e3fe0ca..5e8ffeff31 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -75,8 +75,6 @@ def __init__( ): super(TransFusionHead, self).__init__() - self.fp16_enabled = False - self.num_classes = num_classes self.num_proposals = num_proposals self.auxiliary = auxiliary diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index bee8274a3e..6f17bb67e2 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -27,18 +27,19 @@ CAM_BACK_LEFT='samples/CAM_BACK_LEFT', sweeps='sweeps/LIDAR_TOP') input_modality = dict(use_lidar=True, use_camera=False) -backend_args = dict( - backend='petrel', - path_mapping=dict({ - './data/nuscenes/': - 's3://openmmlab/datasets/detection3d/nuscenes/', - 'data/nuscenes/': - 's3://openmmlab/datasets/detection3d/nuscenes/', - './data/nuscenes_mini/': - 's3://openmmlab/datasets/detection3d/nuscenes/', - 'data/nuscenes_mini/': - 's3://openmmlab/datasets/detection3d/nuscenes/' - })) +# backend_args = dict( +# backend='petrel', +# path_mapping=dict({ +# './data/nuscenes/': +# 's3://openmmlab/datasets/detection3d/nuscenes/', +# 'data/nuscenes/': +# 's3://openmmlab/datasets/detection3d/nuscenes/', +# './data/nuscenes_mini/': +# 's3://openmmlab/datasets/detection3d/nuscenes/', +# 'data/nuscenes_mini/': +# 's3://openmmlab/datasets/detection3d/nuscenes/' +# })) +backend_args = None model = dict( type='BEVFusion', @@ -281,6 +282,7 @@ 
modality=input_modality, test_mode=False, data_prefix=data_prefix, + use_valid_flag=True, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'))) @@ -360,7 +362,7 @@ ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=1) +train_cfg = dict(by_epoch=True, max_epochs=20, val_interval=5) val_cfg = dict() test_cfg = dict() @@ -376,7 +378,7 @@ auto_scale_lr = dict(enable=False, base_batch_size=32) default_hooks = dict( - logger=dict(type='LoggerHook', interval=1), + logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=5)) custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)] diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index fe5d86e727..8ff4dbab78 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -352,6 +352,7 @@ modality=input_modality, test_mode=False, data_prefix=data_prefix, + use_valid_flag=True, # we use box_type_3d='LiDAR' in kitti and nuscenes dataset # and box_type_3d='Depth' in sunrgbd and scannet dataset. box_type_3d='LiDAR'))) From 8078f57cac1c95ed0bab02ec88baedfd2a3e0f37 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 24 Mar 2023 16:37:23 +0800 Subject: [PATCH 08/29] support lidar-only training 69 --- .../hooks/disable_object_sample_hook.py | 7 ++- projects/BEVFusion/bevfusion/__init__.py | 6 ++- projects/BEVFusion/bevfusion/bevfusion.py | 2 +- projects/BEVFusion/bevfusion/transforms_3d.py | 48 +++++++++++++++++-- .../BEVFusion/bevfusion/transfusion_head.py | 4 +- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 8 ++-- 6 files changed, 61 insertions(+), 14 deletions(-) diff --git a/mmdet3d/engine/hooks/disable_object_sample_hook.py b/mmdet3d/engine/hooks/disable_object_sample_hook.py index d1f3c2a09d..07d12762be 100644 --- a/mmdet3d/engine/hooks/disable_object_sample_hook.py +++ b/mmdet3d/engine/hooks/disable_object_sample_hook.py @@ -1,4 +1,5 @@ # Copyright (c) OpenMMLab. All rights reserved. 
+from mmengine.dataset import BaseDataset from mmengine.hooks import Hook from mmengine.model import is_model_wrapper from mmengine.runner import Runner @@ -35,7 +36,11 @@ def before_train_epoch(self, runner: Runner): model = model.module if epoch == self.disable_after_epoch: runner.logger.info('Disable ObjectSample') - for transform in runner.train_dataloader.dataset.pipeline.transforms: # noqa: E501 + dataset = runner.train_dataloader.dataset + # handle dataset wrapper + if not isinstance(dataset, BaseDataset): + dataset = dataset.dataset + for transform in dataset.pipeline.transforms: # noqa: E501 if isinstance(transform, ObjectSample): assert hasattr(transform, 'disabled') transform.disabled = True diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py index faf7fa2d9a..07988ff597 100644 --- a/projects/BEVFusion/bevfusion/__init__.py +++ b/projects/BEVFusion/bevfusion/__init__.py @@ -4,7 +4,8 @@ from .loading import BEVLoadMultiViewImageFromFiles from .sparse_encoder import BEVFusionSparseEncoder from .transformer import TransformerDecoderLayer -from .transforms_3d import BEVFusionRandomFlip3D, GridMask, ImageAug3D +from .transforms_3d import (BEVFusionGlobalRotScaleTrans, + BEVFusionRandomFlip3D, GridMask, ImageAug3D) from .transfusion_head import ConvFuser, TransFusionHead from .utils import (BBoxBEVL1Cost, HeuristicAssigner3D, HungarianAssigner3D, IoU3DCost) @@ -14,5 +15,6 @@ 'GeneralizedLSSFPN', 'HungarianAssigner3D', 'BBoxBEVL1Cost', 'IoU3DCost', 'HeuristicAssigner3D', 'DepthLSSTransform', 'BEVLoadMultiViewImageFromFiles', 'BEVFusionSparseEncoder', - 'TransformerDecoderLayer', 'BEVFusionRandomFlip3D' + 'TransformerDecoderLayer', 'BEVFusionRandomFlip3D', + 'BEVFusionGlobalRotScaleTrans' ] diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index c0cd62cf54..1cde750c01 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -98,7 +98,7 @@ def extract_img_feat( img_metas, ) -> torch.Tensor: B, N, C, H, W = x.size() - x = x.view(B * N, C, H, W) + x = x.view(B * N, C, H, W).contiguous() x = self.img_backbone(x) x = self.img_neck(x) diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index 1941f21142..d7104bdafa 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -1,5 +1,4 @@ # modify from https://github.com/mit-han-lab/bevfusion -import random from typing import Any, Dict import numpy as np @@ -7,6 +6,7 @@ from mmcv.transforms import BaseTransform from PIL import Image +from mmdet3d.datasets import GlobalRotScaleTrans from mmdet3d.registry import TRANSFORMS @@ -112,8 +112,8 @@ def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: class BEVFusionRandomFlip3D: def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: - flip_horizontal = random.choice([0, 1]) - flip_vertical = random.choice([0, 1]) + flip_horizontal = np.random.choice([0, 1]) + flip_vertical = np.random.choice([0, 1]) rotation = np.eye(3) if flip_horizontal: @@ -141,6 +141,48 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: return data +@TRANSFORMS.register_module() +class BEVFusionGlobalRotScaleTrans(GlobalRotScaleTrans): + + def transform(self, input_dict: dict) -> dict: + """Private function to rotate, scale and translate bounding boxes and + points. + + Args: + input_dict (dict): Result dict from loading pipeline. 
+ + Returns: + dict: Results after scaling, 'points', 'pcd_rotation', + 'pcd_scale_factor', 'pcd_trans' and `gt_bboxes_3d` are updated + in the result dict. + """ + if 'transformation_3d_flow' not in input_dict: + input_dict['transformation_3d_flow'] = [] + + self._rot_bbox_points(input_dict) + + if 'pcd_scale_factor' not in input_dict: + self._random_scale(input_dict) + self._scale_bbox_points(input_dict) + + self._trans_bbox_points(input_dict) + + input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) + + lidar_augs = np.eye(4) + lidar_augs[:3, :3] = input_dict['pcd_rotation'].T * input_dict[ + 'pcd_scale_factor'] + lidar_augs[:3, 3] = input_dict['pcd_trans'] * \ + input_dict['pcd_scale_factor'] + + if 'lidar_aug_matrix' not in input_dict: + input_dict['lidar_aug_matrix'] = np.eye(4) + input_dict[ + 'lidar_aug_matrix'] = lidar_augs @ input_dict['lidar_aug_matrix'] + + return input_dict + + @TRANSFORMS.register_module() class GridMask(BaseTransform): diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index 5e8ffeff31..9cd1891f5c 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -785,8 +785,8 @@ def loss_by_feat(self, preds_dicts: Tuple[List[dict]], # compute heatmap loss loss_heatmap = self.loss_heatmap( - clip_sigmoid(preds_dict['dense_heatmap']), - heatmap, + clip_sigmoid(preds_dict['dense_heatmap']).float(), + heatmap.float(), avg_factor=max(heatmap.eq(1).float().sum().item(), 1), ) loss_dict['loss_heatmap'] = loss_heatmap diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 8ff4dbab78..7c3225b641 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -255,7 +255,7 @@ rand_flip=True, is_train=True), dict( - type='GlobalRotScaleTrans', + type='BEVFusionGlobalRotScaleTrans', scale_ratio_range=[0.9, 1.1], rot_range=[-0.78539816, 0.78539816], translation_std=0.5), @@ -421,7 +421,7 @@ ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=6) +train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) val_cfg = dict() test_cfg = dict() @@ -438,6 +438,4 @@ default_hooks = dict( logger=dict(type='LoggerHook', interval=50), - checkpoint=dict(type='CheckpointHook', interval=5)) - -load_from = 'checkpoints/bevfusion_init_converted.pth' + checkpoint=dict(type='CheckpointHook', interval=1)) From 68e4f31ff57b34cad551e3b5e3faac95be1d5d7d Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 18 Apr 2023 16:26:18 +0800 Subject: [PATCH 09/29] fix RTS --- projects/BEVFusion/bevfusion/transforms_3d.py | 5 ++--- ...vfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 1 + 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index d7104bdafa..da259e21d3 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -163,11 +163,10 @@ def transform(self, input_dict: dict) -> dict: if 'pcd_scale_factor' not in input_dict: self._random_scale(input_dict) - self._scale_bbox_points(input_dict) - self._trans_bbox_points(input_dict) + self._scale_bbox_points(input_dict) - 
input_dict['transformation_3d_flow'].extend(['R', 'S', 'T']) + input_dict['transformation_3d_flow'].extend(['R', 'T', 'S']) lidar_augs = np.eye(4) lidar_augs[:3, :3] = input_dict['pcd_rotation'].T * input_dict[ diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 7c3225b641..632876cb62 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -435,6 +435,7 @@ # or not by default. # - `base_batch_size` = (8 GPUs) x (4 samples per GPU). auto_scale_lr = dict(enable=False, base_batch_size=32) +log_processor = dict(window_size=50) default_hooks = dict( logger=dict(type='LoggerHook', interval=50), From cf39e0713259121c51b9f5c608fe3c6b9f39b1f2 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 24 Apr 2023 17:41:54 +0800 Subject: [PATCH 10/29] fix rotation in ImgAug3D --- projects/BEVFusion/bevfusion/transforms_3d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index da259e21d3..ec2d442b5c 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -68,8 +68,8 @@ def img_transform(self, img, rotation, translation, resize, resize_dims, translation = A.matmul(translation) + b theta = rotate / 180 * np.pi A = torch.Tensor([ - [np.cos(theta), np.sin(theta)], - [-np.sin(theta), np.cos(theta)], + [np.cos(theta), -np.sin(theta)], + [np.sin(theta), np.cos(theta)], ]) b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 b = A.matmul(-b) + b From 710a23def4f130cfd93c9d615d839295495af393 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 9 May 2023 10:16:09 +0800 Subject: [PATCH 11/29] revert to original rotation in ImgAug3D --- projects/BEVFusion/bevfusion/transforms_3d.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index ec2d442b5c..da259e21d3 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -68,8 +68,8 @@ def img_transform(self, img, rotation, translation, resize, resize_dims, translation = A.matmul(translation) + b theta = rotate / 180 * np.pi A = torch.Tensor([ - [np.cos(theta), -np.sin(theta)], - [np.sin(theta), np.cos(theta)], + [np.cos(theta), np.sin(theta)], + [-np.sin(theta), np.cos(theta)], ]) b = torch.Tensor([crop[2] - crop[0], crop[3] - crop[1]]) / 2 b = A.matmul(-b) + b From 041b2889e77925bc026cb15120e0de9cdfb0876f Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 9 May 2023 16:52:13 +0800 Subject: [PATCH 12/29] add LSSDepthTransform and parse_losses --- projects/BEVFusion/bevfusion/__init__.py | 4 +- projects/BEVFusion/bevfusion/bevfusion.py | 45 +++++++++++++- projects/BEVFusion/bevfusion/depth_lss.py | 71 +++++++++++++++++++++++ 3 files changed, 117 insertions(+), 3 deletions(-) diff --git a/projects/BEVFusion/bevfusion/__init__.py b/projects/BEVFusion/bevfusion/__init__.py index 07988ff597..db06d3afa4 100644 --- a/projects/BEVFusion/bevfusion/__init__.py +++ b/projects/BEVFusion/bevfusion/__init__.py @@ -1,6 +1,6 @@ from .bevfusion import BEVFusion from .bevfusion_necks import GeneralizedLSSFPN -from .depth_lss import DepthLSSTransform 
+from .depth_lss import DepthLSSTransform, LSSTransform from .loading import BEVLoadMultiViewImageFromFiles from .sparse_encoder import BEVFusionSparseEncoder from .transformer import TransformerDecoderLayer @@ -13,7 +13,7 @@ __all__ = [ 'BEVFusion', 'TransFusionHead', 'ConvFuser', 'ImageAug3D', 'GridMask', 'GeneralizedLSSFPN', 'HungarianAssigner3D', 'BBoxBEVL1Cost', 'IoU3DCost', - 'HeuristicAssigner3D', 'DepthLSSTransform', + 'HeuristicAssigner3D', 'DepthLSSTransform', 'LSSTransform', 'BEVLoadMultiViewImageFromFiles', 'BEVFusionSparseEncoder', 'TransformerDecoderLayer', 'BEVFusionRandomFlip3D', 'BEVFusionGlobalRotScaleTrans' diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 1cde750c01..527c83fce0 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -1,7 +1,10 @@ -from typing import Dict, List, Optional +from collections import OrderedDict +from typing import Dict, List, Optional, Tuple import numpy as np import torch +import torch.distributed as dist +from mmengine.utils import is_list_of from torch import Tensor from torch.nn import functional as F @@ -71,6 +74,46 @@ def _forward(self, """ pass + def parse_losses( + self, losses: Dict[str, torch.Tensor] + ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]: + """Parses the raw outputs (losses) of the network. + + Args: + losses (dict): Raw output of the network, which usually contain + losses and other necessary information. + + Returns: + tuple[Tensor, dict]: There are two elements. The first is the + loss tensor passed to optim_wrapper which may be a weighted sum + of all losses, and the second is log_vars which will be sent to + the logger. + """ + log_vars = [] + for loss_name, loss_value in losses.items(): + if isinstance(loss_value, torch.Tensor): + log_vars.append([loss_name, loss_value.mean()]) + elif is_list_of(loss_value, torch.Tensor): + log_vars.append( + [loss_name, + sum(_loss.mean() for _loss in loss_value)]) + else: + raise TypeError( + f'{loss_name} is not a tensor or list of tensors') + + loss = sum(value for key, value in log_vars if 'loss' in key) + log_vars.insert(0, ['loss', loss]) + log_vars = OrderedDict(log_vars) # type: ignore + + for loss_name, loss_value in log_vars.items(): + # reduce loss when distributed training + if dist.is_available() and dist.is_initialized(): + loss_value = loss_value.data.clone() + dist.all_reduce(loss_value.div_(dist.get_world_size())) + log_vars[loss_name] = loss_value.item() + + return loss, log_vars # type: ignore + def init_weights(self) -> None: if self.img_backbone is not None: self.img_backbone.init_weights() diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index 069f4ea558..fb0c21b878 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -156,6 +156,7 @@ def forward( camera2lidar, img_aug_matrix, lidar_aug_matrix, + metas, **kwargs, ): intrins = camera_intrinsics[..., :3, :3] @@ -182,6 +183,76 @@ def forward( return x +@MODELS.register_module() +class LSSTransform(BaseTransform): + + def __init__( + self, + in_channels: int, + out_channels: int, + image_size: Tuple[int, int], + feature_size: Tuple[int, int], + xbound: Tuple[float, float, float], + ybound: Tuple[float, float, float], + zbound: Tuple[float, float, float], + dbound: Tuple[float, float, float], + downsample: int = 1, + ) -> None: + super().__init__( + in_channels=in_channels, + out_channels=out_channels, + 
image_size=image_size, + feature_size=feature_size, + xbound=xbound, + ybound=ybound, + zbound=zbound, + dbound=dbound, + ) + self.depthnet = nn.Conv2d(in_channels, self.D + self.C, 1) + if downsample > 1: + assert downsample == 2, downsample + self.downsample = nn.Sequential( + nn.Conv2d( + out_channels, out_channels, 3, padding=1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(True), + nn.Conv2d( + out_channels, + out_channels, + 3, + stride=downsample, + padding=1, + bias=False, + ), + nn.BatchNorm2d(out_channels), + nn.ReLU(True), + nn.Conv2d( + out_channels, out_channels, 3, padding=1, bias=False), + nn.BatchNorm2d(out_channels), + nn.ReLU(True), + ) + else: + self.downsample = nn.Identity() + + def get_cam_feats(self, x): + B, N, C, fH, fW = x.shape + + x = x.view(B * N, C, fH, fW) + + x = self.depthnet(x) + depth = x[:, :self.D].softmax(dim=1) + x = depth.unsqueeze(1) * x[:, self.D:(self.D + self.C)].unsqueeze(2) + + x = x.view(B, N, self.C, self.D, fH, fW) + x = x.permute(0, 1, 3, 4, 5, 2) + return x + + def forward(self, *args, **kwargs): + x = super().forward(*args, **kwargs) + x = self.downsample(x) + return x + + class BaseDepthTransform(BaseTransform): def forward( From ab27ea1941f80c5d7fb3071005109f4794c9c9cb Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 16 May 2023 14:15:14 +0800 Subject: [PATCH 13/29] fix LoadMultiSweeps --- mmdet3d/datasets/transforms/loading.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mmdet3d/datasets/transforms/loading.py b/mmdet3d/datasets/transforms/loading.py index efb54c2fff..0a55a1f43f 100644 --- a/mmdet3d/datasets/transforms/loading.py +++ b/mmdet3d/datasets/transforms/loading.py @@ -442,7 +442,9 @@ def transform(self, results: dict) -> dict: lidar2sensor = np.array(sweep['lidar_points']['lidar2sensor']) points_sweep[:, : 3] = points_sweep[:, :3] @ lidar2sensor[:3, :3] - points_sweep[:, :3] -= lidar2sensor[:3, 3] + points_sweep[:, :3] += -1 * np.matmul( + lidar2sensor[:3, :3].T, lidar2sensor[:3, 3].reshape( + 3, 1)).squeeze() points_sweep[:, 4] = ts - sweep_ts points_sweep = points.new_point(points_sweep) sweep_points_list.append(points_sweep) From 40a97e21cde40ec0fd6da3b806b8e2693c24c157 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 16 May 2023 15:31:14 +0800 Subject: [PATCH 14/29] fix bug about points in-place operations --- projects/BEVFusion/bevfusion/bevfusion.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 527c83fce0..c48e20b187 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -1,4 +1,5 @@ from collections import OrderedDict +from copy import deepcopy from typing import Dict, List, Optional, Tuple import numpy as np @@ -267,8 +268,8 @@ def extract_feat( camera2lidar = imgs.new_tensor(np.asarray(camera2lidar)) img_aug_matrix = imgs.new_tensor(np.asarray(img_aug_matrix)) lidar_aug_matrix = imgs.new_tensor(np.asarray(lidar_aug_matrix)) - img_feature = self.extract_img_feat(imgs, points, lidar2image, - camera_intrinsics, + img_feature = self.extract_img_feat(imgs, deepcopy(points), + lidar2image, camera_intrinsics, camera2lidar, img_aug_matrix, lidar_aug_matrix, batch_input_metas) From f17d03bbeaf59df59c90b0ce0fd3c0d617c2238e Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Wed, 17 May 2023 15:31:48 +0800 Subject: [PATCH 15/29] support amp and replace syncBN by BN --- projects/BEVFusion/bevfusion/bevfusion.py | 27 
++++++++++--------- .../BEVFusion/bevfusion/transfusion_head.py | 3 ++- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 8 +++--- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 8 +++--- 4 files changed, 25 insertions(+), 21 deletions(-) diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index c48e20b187..98dc3b37e1 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -153,22 +153,25 @@ def extract_img_feat( BN, C, H, W = x.size() x = x.view(B, int(BN / B), C, H, W) - x = self.vtransform( - x, - points, - lidar2image, - camera_intrinsics, - camera2lidar, - img_aug_matrix, - lidar_aug_matrix, - img_metas, - ) + with torch.autocast(device_type='cuda', dtype=torch.float32): + x = self.vtransform( + x, + points, + lidar2image, + camera_intrinsics, + camera2lidar, + img_aug_matrix, + lidar_aug_matrix, + img_metas, + ) return x def extract_pts_feat(self, batch_inputs_dict) -> torch.Tensor: points = batch_inputs_dict['points'] - feats, coords, sizes = self.voxelize(points) - batch_size = coords[-1, 0] + 1 + with torch.autocast('cuda', enabled=False): + points = [point.float() for point in points] + feats, coords, sizes = self.voxelize(points) + batch_size = coords[-1, 0] + 1 x = self.pts_middle_encoder(feats, coords, batch_size) return x diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index 9cd1891f5c..b95ce96de1 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -224,7 +224,8 @@ def forward_single(self, inputs, metas): ################################# # query initialization ################################# - dense_heatmap = self.heatmap_head(fusion_feat) + with torch.autocast('cuda', enabled=False): + dense_heatmap = self.heatmap_head(fusion_feat.float()) heatmap = dense_heatmap.detach().sigmoid() padding = self.nms_kernel_size // 2 local_max = torch.zeros_like(heatmap) diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 6f17bb67e2..ace7f47171 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -1,4 +1,4 @@ -_base_ = ['mmdet3d::_base_/default_runtime.py'] +_base_ = ['../../../configs/_base_/default_runtime.py'] custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) @@ -58,7 +58,7 @@ in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), @@ -69,14 +69,14 @@ out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), 
use_conv_for_no_stride=True), bbox_head=dict( diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 632876cb62..188986786a 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -1,4 +1,4 @@ -_base_ = ['mmdet3d::_base_/default_runtime.py'] +_base_ = ['../../../configs/_base_/default_runtime.py'] custom_imports = dict( imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) @@ -90,7 +90,7 @@ in_channels=5, sparse_shape=[1440, 1440, 41], order=('conv', 'norm', 'act'), - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, 128)), encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), @@ -103,14 +103,14 @@ out_channels=[128, 256], layer_nums=[5, 5], layer_strides=[1, 2], - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), conv_cfg=dict(type='Conv2d', bias=False)), pts_neck=dict( type='SECONDFPN', in_channels=[128, 256], out_channels=[256, 256], upsample_strides=[1, 2], - norm_cfg=dict(type='SyncBN', eps=0.001, momentum=0.01), + norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), upsample_cfg=dict(type='deconv', bias=False), use_conv_for_no_stride=True), bbox_head=dict( From dc4b7be0cf1211f373b8de6ab82d41b4ab12b75c Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Thu, 18 May 2023 20:33:13 +0800 Subject: [PATCH 16/29] add amp config --- ...dar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py | 5 +++++ 1 file changed, 5 insertions(+) create mode 100644 projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py new file mode 100644 index 0000000000..e3fc470a47 --- /dev/null +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py @@ -0,0 +1,5 @@ +_base_ = [ + './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py' +] + +optim_wrapper = dict(type='AmpOptimWrapper') From a059517f2d1d26895c859375b39d46d4aa28dd5c Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 19 May 2023 11:53:23 +0800 Subject: [PATCH 17/29] set growth-interval in amp --- ...lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py index e3fc470a47..63ce85e1ac 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py @@ -2,4 +2,5 @@ './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py' ] -optim_wrapper = dict(type='AmpOptimWrapper') +optim_wrapper = dict( + type='AmpOptimWrapper', loss_scale=dict(growth_interval=2000)) From 9457729b652512fbfa0d6d11d9798a4ff416a911 Mon Sep 17 00:00:00 2001 From: 
JingweiZhang12 Date: Sat, 20 May 2023 23:22:37 +0800 Subject: [PATCH 18/29] Revert "fix LoadMultiSweeps" This reverts commit ab27ea1941f80c5d7fb3071005109f4794c9c9cb. --- mmdet3d/datasets/transforms/loading.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mmdet3d/datasets/transforms/loading.py b/mmdet3d/datasets/transforms/loading.py index 0a55a1f43f..efb54c2fff 100644 --- a/mmdet3d/datasets/transforms/loading.py +++ b/mmdet3d/datasets/transforms/loading.py @@ -442,9 +442,7 @@ def transform(self, results: dict) -> dict: lidar2sensor = np.array(sweep['lidar_points']['lidar2sensor']) points_sweep[:, : 3] = points_sweep[:, :3] @ lidar2sensor[:3, :3] - points_sweep[:, :3] += -1 * np.matmul( - lidar2sensor[:3, :3].T, lidar2sensor[:3, 3].reshape( - 3, 1)).squeeze() + points_sweep[:, :3] -= lidar2sensor[:3, 3] points_sweep[:, 4] = ts - sweep_ts points_sweep = points.new_point(points_sweep) sweep_points_list.append(points_sweep) From 836c775321ae9455768edc96f1212cce8fdd5544 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 23 May 2023 23:15:56 +0800 Subject: [PATCH 19/29] add float in cls loss --- projects/BEVFusion/bevfusion/transfusion_head.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index b95ce96de1..a70023ae14 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -814,7 +814,7 @@ def loss_by_feat(self, preds_dicts: Tuple[List[dict]], layer_cls_score = layer_score.permute(0, 2, 1).reshape( -1, self.num_classes) layer_loss_cls = self.loss_cls( - layer_cls_score, + layer_cls_score.float(), layer_labels, layer_label_weights, avg_factor=max(num_pos, 1), From 839f05ebdc97940510d0dcac6198d58b93caebaf Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Wed, 24 May 2023 11:33:53 +0800 Subject: [PATCH 20/29] iter_based lr in fusion stage --- ...bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 188986786a..dd82377076 100644 --- a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -400,7 +400,8 @@ T_max=6, end=6, by_epoch=True, - eta_min_ratio=1e-3), + eta_min_ratio=1e-3, + convert_to_iter_based=True), # momentum scheduler # During the first 8 epochs, momentum increases from 1 to 0.85 / 0.95 # during the next 12 epochs, momentum increases from 0.85 / 0.95 to 1 From 0b32c0a93d23c985cf7ee1230e95feaca1f03aa0 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 26 May 2023 14:13:47 +0800 Subject: [PATCH 21/29] rename config --- ...n_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename projects/BEVFusion/configs/{bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py => bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py} (100%) diff --git a/projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py similarity index 100% rename from 
projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py rename to projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py From 30091188d4a665beaadcd5a9b49e04c9da47139e Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 26 May 2023 14:15:57 +0800 Subject: [PATCH 22/29] use normalization query pos for stable training --- projects/BEVFusion/bevfusion/transfusion_head.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index a70023ae14..8d243a4b3d 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -166,8 +166,8 @@ def create_2D_grid(self, x_size, y_size): # NOTE: modified batch_x, batch_y = torch.meshgrid( *[torch.linspace(it[0], it[1], it[2]) for it in meshgrid]) - batch_x = batch_x + 0.5 - batch_y = batch_y + 0.5 + batch_x = (batch_x + 0.5) / x_size + batch_y = (batch_y + 0.5) / y_size coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0)[None] coord_base = coord_base.view(1, 2, -1).permute(0, 2, 1) return coord_base @@ -288,8 +288,11 @@ def forward_single(self, inputs, metas): # Prediction res_layer = self.prediction_heads[i](query_feat) + xy_size = torch.tensor( + [fusion_feat.size(-1), + fusion_feat.size(-2)]).to(query_pos) res_layer['center'] = res_layer['center'] + query_pos.permute( - 0, 2, 1) + 0, 2, 1) * xy_size.reshape(2, -1) ret_dicts.append(res_layer) # for next level positional embedding From 5fee2e05246378b9e47cc44432d63d4803bcee38 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Fri, 26 May 2023 14:49:16 +0800 Subject: [PATCH 23/29] remove unnecessary code & simplify config & train 5 epoch --- projects/BEVFusion/bevfusion/bevfusion.py | 8 - projects/BEVFusion/bevfusion/utils.py | 44 ---- ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 236 ++---------------- ...econd_secfpn_8xb4-amp-cyclic-20e_nus-3d.py | 6 - ...75_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 3 +- 5 files changed, 15 insertions(+), 282 deletions(-) delete mode 100644 projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 98dc3b37e1..32fe74cc5a 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -59,9 +59,6 @@ def __init__( self.pts_neck = MODELS.build(pts_neck) self.bbox_head = MODELS.build(bbox_head) - # hard code here where using converted checkpoint of original - # implementation of `BEVFusion` - self.use_converted_checkpoint = False self.init_weights() @@ -235,11 +232,6 @@ def predict(self, batch_inputs_dict: Dict[str, Optional[Tensor]], if self.with_bbox_head: outputs = self.bbox_head.predict(feats, batch_input_metas) - if self.use_converted_checkpoint: - outputs[0]['bboxes_3d'].tensor[:, 6] = -outputs[0][ - 'bboxes_3d'].tensor[:, 6] - np.pi / 2 - outputs[0]['bboxes_3d'].tensor[:, 3:5] = outputs[0][ - 'bboxes_3d'].tensor[:, [4, 3]] res = self.add_pred_to_datasample(batch_data_samples, outputs) diff --git a/projects/BEVFusion/bevfusion/utils.py b/projects/BEVFusion/bevfusion/utils.py index 66847df5b1..40f7412bfd 100644 --- a/projects/BEVFusion/bevfusion/utils.py +++ b/projects/BEVFusion/bevfusion/utils.py @@ -1,9 +1,5 @@ # modify from https://github.com/mit-han-lab/bevfusion -from collections import abc - -import numpy as np import 
torch -import torch.nn as nn from mmdet.models.task_modules import AssignResult, BaseAssigner, BaseBBoxCoder try: @@ -313,43 +309,3 @@ def assign(self, bboxes, gt_bboxes, gt_labels, cls_pred, train_cfg): # max_overlaps = iou.max(1).values return AssignResult( num_gts, assigned_gt_inds, max_overlaps, labels=assigned_labels) - - -def cast_tensor_type(inputs, src_type: torch.dtype, dst_type: torch.dtype): - """Recursively convert Tensor in inputs from src_type to dst_type. - - Note: - In v1.4.4 and later, ``cast_tersor_type`` will only convert the - torch.Tensor which is consistent with ``src_type`` to the ``dst_type``. - Before v1.4.4, it ignores the ``src_type`` argument, leading to some - potential problems. For example, - ``cast_tensor_type(inputs, torch.float, torch.half)`` will convert all - tensors in inputs to ``torch.half`` including those originally in - ``torch.Int`` or other types, which is not expected. - Args: - inputs: Inputs that to be casted. - src_type (torch.dtype): Source type.. - dst_type (torch.dtype): Destination type. - Returns: - The same type with inputs, but all contained Tensors have been cast. - """ - if isinstance(inputs, nn.Module): - return inputs - elif isinstance(inputs, torch.Tensor): - # we need to ensure that the type of inputs to be casted are the same - # as the argument `src_type`. - return inputs.to(dst_type) if inputs.dtype == src_type else inputs - elif isinstance(inputs, str): - return inputs - elif isinstance(inputs, np.ndarray): - return inputs - elif isinstance(inputs, abc.Mapping): - return type(inputs)({ # type: ignore - k: cast_tensor_type(v, src_type, dst_type) - for k, v in inputs.items() - }) - elif isinstance(inputs, abc.Iterable): - return type(inputs)( # type: ignore - cast_tensor_type(item, src_type, dst_type) for item in inputs) - else: - return diff --git a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index dd82377076..64493e33b1 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -1,31 +1,7 @@ -_base_ = ['../../../configs/_base_/default_runtime.py'] -custom_imports = dict( - imports=['projects.BEVFusion.bevfusion'], allow_failed_imports=False) - -# model settings -# Voxel size for voxel encoder -# Usually voxel size is changed consistently with the point cloud range -# If point cloud range is modified, do remember to change all related -# keys in the config. 
-voxel_size = [0.075, 0.075, 0.2] -point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] -class_names = [ - 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', - 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' +_base_ = [ + './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py' ] - -metainfo = dict(classes=class_names) -dataset_type = 'NuScenesDataset' -data_root = 'data/nuscenes/' -data_prefix = dict( - pts='samples/LIDAR_TOP', - CAM_FRONT='samples/CAM_FRONT', - CAM_FRONT_LEFT='samples/CAM_FRONT_LEFT', - CAM_FRONT_RIGHT='samples/CAM_FRONT_RIGHT', - CAM_BACK='samples/CAM_BACK', - CAM_BACK_RIGHT='samples/CAM_BACK_RIGHT', - CAM_BACK_LEFT='samples/CAM_BACK_LEFT', - sweeps='sweeps/LIDAR_TOP') +point_cloud_range = [-54.0, -54.0, -5.0, 54.0, 54.0, 3.0] input_modality = dict(use_lidar=True, use_camera=True) backend_args = None @@ -35,14 +11,7 @@ type='Det3DDataPreprocessor', mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], - bgr_to_rgb=False, - pad_size_divisor=32, - voxelize_cfg=dict( - max_num_points=10, - point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], - voxel_size=[0.075, 0.075, 0.2], - max_voxels=[120000, 160000], - voxelize_reduce=True)), + bgr_to_rgb=False), img_backbone=dict( type='mmdet.SwinTransformer', embed_dims=96, @@ -84,141 +53,8 @@ zbound=[-10.0, 10.0, 20.0], dbound=[1.0, 60.0, 0.5], downsample=2), - pts_voxel_encoder=dict(type='HardSimpleVFE', num_features=5), - pts_middle_encoder=dict( - type='BEVFusionSparseEncoder', - in_channels=5, - sparse_shape=[1440, 1440, 41], - order=('conv', 'norm', 'act'), - norm_cfg=dict(type='BN1d', eps=0.001, momentum=0.01), - encoder_channels=((16, 16, 32), (32, 32, 64), (64, 64, 128), (128, - 128)), - encoder_paddings=((0, 0, 1), (0, 0, 1), (0, 0, (1, 1, 0)), (0, 0)), - block_type='basicblock'), fusion_layer=dict( - type='ConvFuser', in_channels=[80, 256], out_channels=256), - pts_backbone=dict( - type='SECOND', - in_channels=256, - out_channels=[128, 256], - layer_nums=[5, 5], - layer_strides=[1, 2], - norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), - conv_cfg=dict(type='Conv2d', bias=False)), - pts_neck=dict( - type='SECONDFPN', - in_channels=[128, 256], - out_channels=[256, 256], - upsample_strides=[1, 2], - norm_cfg=dict(type='BN', eps=0.001, momentum=0.01), - upsample_cfg=dict(type='deconv', bias=False), - use_conv_for_no_stride=True), - bbox_head=dict( - type='TransFusionHead', - num_proposals=200, - auxiliary=True, - in_channels=512, - hidden_channel=128, - num_classes=10, - nms_kernel_size=3, - bn_momentum=0.1, - num_decoder_layers=1, - decoder_layer=dict( - type='TransformerDecoderLayer', - self_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), - cross_attn_cfg=dict(embed_dims=128, num_heads=8, dropout=0.1), - ffn_cfg=dict( - embed_dims=128, - feedforward_channels=256, - num_fcs=2, - ffn_drop=0.1, - act_cfg=dict(type='ReLU', inplace=True), - ), - norm_cfg=dict(type='LN'), - pos_encoding_cfg=dict(input_channel=2, num_pos_feats=128)), - train_cfg=dict( - dataset='nuScenes', - point_cloud_range=[-54.0, -54.0, -5.0, 54.0, 54.0, 3.0], - grid_size=[1440, 1440, 41], - voxel_size=[0.075, 0.075, 0.2], - out_size_factor=8, - gaussian_overlap=0.1, - min_radius=2, - pos_weight=-1, - code_weights=[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 0.2, 0.2], - assigner=dict( - type='HungarianAssigner3D', - iou_calculator=dict(type='BboxOverlaps3D', coordinate='lidar'), - cls_cost=dict( - type='mmdet.FocalLossCost', - gamma=2.0, - alpha=0.25, - weight=0.15), - 
reg_cost=dict(type='BBoxBEVL1Cost', weight=0.25), - iou_cost=dict(type='IoU3DCost', weight=0.25))), - test_cfg=dict( - dataset='nuScenes', - grid_size=[1440, 1440, 41], - out_size_factor=8, - voxel_size=[0.075, 0.075], - pc_range=[-54.0, -54.0], - nms_type=None), - common_heads=dict( - center=[2, 2], height=[1, 2], dim=[3, 2], rot=[2, 2], vel=[2, 2]), - bbox_coder=dict( - type='TransFusionBBoxCoder', - pc_range=[-54.0, -54.0], - post_center_range=[-61.2, -61.2, -10.0, 61.2, 61.2, 10.0], - score_threshold=0.0, - out_size_factor=8, - voxel_size=[0.075, 0.075], - code_size=10), - loss_cls=dict( - type='mmdet.FocalLoss', - use_sigmoid=True, - gamma=2.0, - alpha=0.25, - reduction='mean', - loss_weight=1.0), - loss_heatmap=dict( - type='mmdet.GaussianFocalLoss', reduction='mean', loss_weight=1.0), - loss_bbox=dict( - type='mmdet.L1Loss', reduction='mean', loss_weight=0.25))) - -db_sampler = dict( - data_root=data_root, - info_path=data_root + 'nuscenes_dbinfos_train.pkl', - rate=1.0, - prepare=dict( - filter_by_difficulty=[-1], - filter_by_min_points=dict( - car=5, - truck=5, - bus=5, - trailer=5, - construction_vehicle=5, - traffic_cone=5, - barrier=5, - motorcycle=5, - bicycle=5, - pedestrian=5)), - classes=class_names, - sample_groups=dict( - car=2, - truck=3, - construction_vehicle=7, - bus=4, - trailer=6, - barrier=2, - motorcycle=6, - bicycle=6, - pedestrian=2, - traffic_cone=2), - points_loader=dict( - type='LoadPointsFromFile', - coord_type='LIDAR', - load_dim=5, - use_dim=[0, 1, 2, 3, 4])) + type='ConvFuser', in_channels=[80, 256], out_channels=256)) train_pipeline = [ dict( @@ -245,7 +81,6 @@ with_bbox_3d=True, with_label_3d=True, with_attr_label=False), - # dict(type='ObjectSample', db_sampler=db_sampler), dict( type='ImageAug3D', final_dim=[256, 704], @@ -268,11 +103,12 @@ 'car', 'truck', 'construction_vehicle', 'bus', 'trailer', 'barrier', 'motorcycle', 'bicycle', 'pedestrian', 'traffic_cone' ]), + # Actually, 'GridMask' is not used here dict( type='GridMask', use_h=True, use_w=True, - max_epoch=6, + max_epoch=5, rotate=1, offset=False, ratio=0.5, @@ -337,56 +173,12 @@ ] train_dataloader = dict( - batch_size=4, - num_workers=4, - persistent_workers=True, - sampler=dict(type='DefaultSampler', shuffle=True), dataset=dict( - type='CBGSDataset', - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='nuscenes_infos_train.pkl', - pipeline=train_pipeline, - metainfo=metainfo, - modality=input_modality, - test_mode=False, - data_prefix=data_prefix, - use_valid_flag=True, - # we use box_type_3d='LiDAR' in kitti and nuscenes dataset - # and box_type_3d='Depth' in sunrgbd and scannet dataset. 
- box_type_3d='LiDAR'))) + dataset=dict(pipeline=train_pipeline, modality=input_modality))) val_dataloader = dict( - batch_size=1, - num_workers=4, - persistent_workers=True, - drop_last=False, - sampler=dict(type='DefaultSampler', shuffle=False), - dataset=dict( - type=dataset_type, - data_root=data_root, - ann_file='nuscenes_infos_val.pkl', - pipeline=test_pipeline, - metainfo=metainfo, - modality=input_modality, - data_prefix=data_prefix, - test_mode=True, - box_type_3d='LiDAR', - backend_args=backend_args)) + dataset=dict(pipeline=test_pipeline, modality=input_modality)) test_dataloader = val_dataloader -val_evaluator = dict( - type='NuScenesMetric', - data_root=data_root, - ann_file=data_root + 'nuscenes_infos_val.pkl', - metric='bbox', - backend_args=backend_args) -test_evaluator = val_evaluator - -vis_backends = [dict(type='LocalVisBackend')] -visualizer = dict( - type='Det3DLocalVisualizer', vis_backends=vis_backends, name='visualizer') - param_scheduler = [ dict( type='LinearLR', @@ -397,8 +189,8 @@ dict( type='CosineAnnealingLR', begin=0, - T_max=6, - end=6, + T_max=5, + end=5, by_epoch=True, eta_min_ratio=1e-3, convert_to_iter_based=True), @@ -416,13 +208,13 @@ type='CosineAnnealingMomentum', eta_min=1, begin=2.4, - end=6, + end=5, by_epoch=True, convert_to_iter_based=True) ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) +train_cfg = dict(by_epoch=True, max_epochs=5, val_interval=1) val_cfg = dict() test_cfg = dict() @@ -436,8 +228,8 @@ # or not by default. # - `base_batch_size` = (8 GPUs) x (4 samples per GPU). auto_scale_lr = dict(enable=False, base_batch_size=32) -log_processor = dict(window_size=50) default_hooks = dict( logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=1)) +del _base_.custom_hooks diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py deleted file mode 100644 index 63ce85e1ac..0000000000 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-amp-cyclic-20e_nus-3d.py +++ /dev/null @@ -1,6 +0,0 @@ -_base_ = [ - './bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py' -] - -optim_wrapper = dict( - type='AmpOptimWrapper', loss_scale=dict(growth_interval=2000)) diff --git a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index ace7f47171..e56189e2f5 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -376,10 +376,9 @@ # or not by default. # - `base_batch_size` = (8 GPUs) x (4 samples per GPU). 
auto_scale_lr = dict(enable=False, base_batch_size=32) +log_processor = dict(window_size=50) default_hooks = dict( logger=dict(type='LoggerHook', interval=50), checkpoint=dict(type='CheckpointHook', interval=5)) custom_hooks = [dict(type='DisableObjectSampleHook', disable_after_epoch=15)] - -load_from = 'checkpoints/bevfusion_init_converted.pth' From 51f59855f40656cd59e598ff65f0cdc6b2b751c3 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 29 May 2023 09:48:03 +0800 Subject: [PATCH 24/29] smaller ete_min_ratio --- ...voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 64493e33b1..eb6755d8c4 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -108,7 +108,7 @@ type='GridMask', use_h=True, use_w=True, - max_epoch=5, + max_epoch=6, rotate=1, offset=False, ratio=0.5, @@ -189,10 +189,10 @@ dict( type='CosineAnnealingLR', begin=0, - T_max=5, - end=5, + T_max=6, + end=6, by_epoch=True, - eta_min_ratio=1e-3, + eta_min_ratio=1e-4, convert_to_iter_based=True), # momentum scheduler # During the first 8 epochs, momentum increases from 1 to 0.85 / 0.95 @@ -208,13 +208,13 @@ type='CosineAnnealingMomentum', eta_min=1, begin=2.4, - end=5, + end=6, by_epoch=True, convert_to_iter_based=True) ] # runtime settings -train_cfg = dict(by_epoch=True, max_epochs=5, val_interval=1) +train_cfg = dict(by_epoch=True, max_epochs=6, val_interval=1) val_cfg = dict() test_cfg = dict() From 09311fecbd5eea5b8ff729aee8280067faebb281 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Mon, 29 May 2023 15:50:07 +0800 Subject: [PATCH 25/29] polish code --- projects/BEVFusion/README.md | 18 +++++++++--------- projects/BEVFusion/bevfusion/bevfusion.py | 8 ++++---- projects/BEVFusion/bevfusion/depth_lss.py | 8 +++++--- projects/BEVFusion/bevfusion/transforms_3d.py | 4 ++++ ...075_second_secfpn_8xb4-cyclic-20e_nus-3d.py | 2 +- 5 files changed, 23 insertions(+), 17 deletions(-) diff --git a/projects/BEVFusion/README.md b/projects/BEVFusion/README.md index dd4ee70212..b5a6b80593 100644 --- a/projects/BEVFusion/README.md +++ b/projects/BEVFusion/README.md @@ -15,7 +15,7 @@ results is available at https://github.com/mit-han-lab/bevfusion. ## Introduction -We implement BEVFusion and provide the results and pretrained checkpoints on NuScenes dataset. +We implement BEVFusion and support training and testing on NuScenes dataset. 
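A quick note on the `eta_min_ratio` change in PATCH 24/29 above: in mmengine's `CosineAnnealingLR`, `eta_min_ratio` sets the learning-rate floor as a fraction of the base LR, and `convert_to_iter_based=True` steps the cosine curve per iteration instead of per epoch. The sketch below is illustrative only; the base LR of 1e-4 is an assumption, since the optimizer settings are defined elsewhere in the config.

```python
import math

base_lr = 1e-4        # assumed for illustration; not taken from this patch
eta_min_ratio = 1e-4  # the new value from PATCH 24/29
eta_min = base_lr * eta_min_ratio


def cosine_lr(t: float, t_max: float) -> float:
    """LR after t of t_max schedule units (iterations here, given
    convert_to_iter_based=True)."""
    return eta_min + 0.5 * (base_lr - eta_min) * (
        1 + math.cos(math.pi * t / t_max))


print(cosine_lr(0.0, 6.0))  # base_lr at the start of training
print(cosine_lr(6.0, 6.0))  # base_lr * eta_min_ratio at the end
```

Dropping the ratio from 1e-3 to 1e-4 deepens the final decay by an order of magnitude without touching the warmup phase.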
## Usage @@ -34,21 +34,21 @@ python projects/BEVFusion/setup.py develop Run a demo on NuScenes data using [BEVFusion model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link): ```shell -python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show +python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0800__LIDAR_TOP__1532402927647951.pcd.bin demo/data/nuscenes/ demo/data/nuscenes/n015-2018-07-24-11-22-45+0800.pkl projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_FILE} --cam-type all --score-thr 0.2 --show ``` ### Training commands -In MMDetection3D's root directory, run the following command to train the model: +1. You should train the lidar-only detector first: ```bash -python tools/train.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +python tools/train.py projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ``` -For multi-gpu training, run: +2. Download the [Swin pre-trained model](<>). Given the image pre-trained backbone and the lidar-only pre-trained detector, you could train the lidar-camera fusion model: ```bash -python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${NUM_GPUS} --master_port=29506 --master_addr="127.0.0.1" tools/train.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +python tools/train.py projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE} ``` ### Testing commands @@ -56,7 +56,7 @@ python -m torch.distributed.launch --nnodes=1 --node_rank=0 --nproc_per_node=${N In MMDetection3D's root directory, run the following command to test the model: ```bash -python tools/test.py projects/BEVFusion/configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} +python tools/test.py projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} ``` ## Results and models @@ -103,9 +103,9 @@ A project does not necessarily have to be finished in a single PR, but it's esse -- [ ] Milestone 2: Indicates a successful model implementation. +- [x] Milestone 2: Indicates a successful model implementation. 
- - [ ] Training-time correctness + - [x] Training-time correctness diff --git a/projects/BEVFusion/bevfusion/bevfusion.py b/projects/BEVFusion/bevfusion/bevfusion.py index 32fe74cc5a..9f56934e66 100644 --- a/projects/BEVFusion/bevfusion/bevfusion.py +++ b/projects/BEVFusion/bevfusion/bevfusion.py @@ -27,7 +27,7 @@ def __init__( fusion_layer: Optional[dict] = None, img_backbone: Optional[dict] = None, pts_backbone: Optional[dict] = None, - vtransform: Optional[dict] = None, + view_transform: Optional[dict] = None, img_neck: Optional[dict] = None, pts_neck: Optional[dict] = None, bbox_head: Optional[dict] = None, @@ -48,8 +48,8 @@ def __init__( img_backbone) if img_backbone is not None else None self.img_neck = MODELS.build( img_neck) if img_neck is not None else None - self.vtransform = MODELS.build( - vtransform) if vtransform is not None else None + self.view_transform = MODELS.build( + view_transform) if view_transform is not None else None self.pts_middle_encoder = MODELS.build(pts_middle_encoder) self.fusion_layer = MODELS.build( @@ -151,7 +151,7 @@ def extract_img_feat( x = x.view(B, int(BN / B), C, H, W) with torch.autocast(device_type='cuda', dtype=torch.float32): - x = self.vtransform( + x = self.view_transform( x, points, lidar2image, diff --git a/projects/BEVFusion/bevfusion/depth_lss.py b/projects/BEVFusion/bevfusion/depth_lss.py index fb0c21b878..6cc0cc1606 100644 --- a/projects/BEVFusion/bevfusion/depth_lss.py +++ b/projects/BEVFusion/bevfusion/depth_lss.py @@ -17,7 +17,7 @@ def gen_dx_bx(xbound, ybound, zbound): return dx, bx, nx -class BaseTransform(nn.Module): +class BaseViewTransform(nn.Module): def __init__( self, @@ -184,7 +184,7 @@ def forward( @MODELS.register_module() -class LSSTransform(BaseTransform): +class LSSTransform(BaseViewTransform): def __init__( self, @@ -253,7 +253,7 @@ def forward(self, *args, **kwargs): return x -class BaseDepthTransform(BaseTransform): +class BaseDepthTransform(BaseViewTransform): def forward( self, @@ -346,6 +346,8 @@ def __init__( dbound: Tuple[float, float, float], downsample: int = 1, ) -> None: + """Compared with `LSSTransform`, `DepthLSSTransform` adds sparse depth + information from lidar points into the inputs of the `depthnet`.""" super().__init__( in_channels=in_channels, out_channels=out_channels, diff --git a/projects/BEVFusion/bevfusion/transforms_3d.py b/projects/BEVFusion/bevfusion/transforms_3d.py index da259e21d3..6d2929512d 100644 --- a/projects/BEVFusion/bevfusion/transforms_3d.py +++ b/projects/BEVFusion/bevfusion/transforms_3d.py @@ -110,6 +110,8 @@ def transform(self, data: Dict[str, Any]) -> Dict[str, Any]: @TRANSFORMS.register_module() class BEVFusionRandomFlip3D: + """Compared with `RandomFlip3D`, this class directly records the lidar + augmentation matrix in the `data`.""" def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: flip_horizontal = np.random.choice([0, 1]) @@ -143,6 +145,8 @@ def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]: @TRANSFORMS.register_module() class BEVFusionGlobalRotScaleTrans(GlobalRotScaleTrans): + """Compared with `GlobalRotScaleTrans`, the augmentation order in this + class is rotation, translation and scaling (RTS).""" def transform(self, input_dict: dict) -> dict: """Private function to rotate, scale and translate bounding boxes and diff --git a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py index 
eb6755d8c4..f0b6eeba30 100644 --- a/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py +++ b/projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py @@ -42,7 +42,7 @@ norm_cfg=dict(type='BN2d', requires_grad=True), act_cfg=dict(type='ReLU', inplace=True), upsample_cfg=dict(mode='bilinear', align_corners=False)), - vtransform=dict( + view_transform=dict( type='DepthLSSTransform', in_channels=256, out_channels=80, From 9ac3e03128979d27457dff340b8970d3f0eb9881 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 30 May 2023 15:45:38 +0800 Subject: [PATCH 26/29] fix UT --- .../test_engine/test_hooks/test_disable_object_sample_hook.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_engine/test_hooks/test_disable_object_sample_hook.py b/tests/test_engine/test_hooks/test_disable_object_sample_hook.py index fcc1e3c8d8..cbe7cd65d5 100644 --- a/tests/test_engine/test_hooks/test_disable_object_sample_hook.py +++ b/tests/test_engine/test_hooks/test_disable_object_sample_hook.py @@ -2,6 +2,8 @@ from unittest import TestCase from unittest.mock import Mock +from mmengine.dataset import BaseDataset + from mmdet3d.datasets.transforms import ObjectSample from mmdet3d.engine.hooks import DisableObjectSampleHook @@ -10,7 +12,7 @@ class TestDisableObjectSampleHook(TestCase): runner = Mock() runner.train_dataloader = Mock() - runner.train_dataloader.dataset = Mock() + runner.train_dataloader.dataset = Mock(spec=BaseDataset) runner.train_dataloader.dataset.pipeline = Mock() runner.train_dataloader._DataLoader__initialized = True runner.train_dataloader.dataset.pipeline.transforms = [ From 411105b8a7046643074e1b39a4d6f1668ccd29c7 Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Tue, 30 May 2023 17:13:16 +0800 Subject: [PATCH 27/29] Revert "use normalization query pos for stable training" This reverts commit 30091188d4a665beaadcd5a9b49e04c9da47139e. 
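For context on the revert below: it returns `create_2D_grid` to raw BEV-cell coordinates, so the head can add predicted center offsets to `query_pos` directly. A minimal sketch of the two conventions, assuming a hypothetical 180 x 180 BEV feature map (the real size comes from the voxel grid and `out_size_factor`):

```python
import torch

x_size = 180  # hypothetical BEV width, for illustration only
xs = torch.arange(x_size, dtype=torch.float32)

# Normalized convention (now reverted): positions in [0, 1], so center
# offsets must be rescaled by the feature-map size before use.
normalized = (xs + 0.5) / x_size

# Raw-grid convention (restored): positions in grid cells, so
# res_layer['center'] + query_pos works without any rescaling.
raw = xs + 0.5

assert torch.allclose(raw, normalized * x_size)
```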
---
 projects/BEVFusion/bevfusion/transfusion_head.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py
index 8d243a4b3d..a70023ae14 100644
--- a/projects/BEVFusion/bevfusion/transfusion_head.py
+++ b/projects/BEVFusion/bevfusion/transfusion_head.py
@@ -166,8 +166,8 @@ def create_2D_grid(self, x_size, y_size):
         # NOTE: modified
         batch_x, batch_y = torch.meshgrid(
             *[torch.linspace(it[0], it[1], it[2]) for it in meshgrid])
-        batch_x = (batch_x + 0.5) / x_size
-        batch_y = (batch_y + 0.5) / y_size
+        batch_x = batch_x + 0.5
+        batch_y = batch_y + 0.5
         coord_base = torch.cat([batch_x[None], batch_y[None]], dim=0)[None]
         coord_base = coord_base.view(1, 2, -1).permute(0, 2, 1)
         return coord_base
@@ -288,11 +288,8 @@ def forward_single(self, inputs, metas):
 
             # Prediction
             res_layer = self.prediction_heads[i](query_feat)
-            xy_size = torch.tensor(
-                [fusion_feat.size(-1),
-                 fusion_feat.size(-2)]).to(query_pos)
             res_layer['center'] = res_layer['center'] + query_pos.permute(
-                0, 2, 1) * xy_size.reshape(2, -1)
+                0, 2, 1)
             ret_dicts.append(res_layer)
 
             # for next level positional embedding

From 3678fb4806016d5efbe9dab0001cd321b55adcd8 Mon Sep 17 00:00:00 2001
From: JingweiZhang12
Date: Wed, 31 May 2023 09:41:37 +0800
Subject: [PATCH 28/29] update readme

---
 projects/BEVFusion/README.md | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/projects/BEVFusion/README.md b/projects/BEVFusion/README.md
index b5a6b80593..0828bd4f8d 100644
--- a/projects/BEVFusion/README.md
+++ b/projects/BEVFusion/README.md
@@ -42,30 +42,33 @@ python demo/multi_modality_demo.py demo/data/nuscenes/n015-2018-07-24-11-22-45+0
 1. You should train the lidar-only detector first:
 
 ```bash
-python tools/train.py projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py
+bash tools/dist_train.sh projects/BEVFusion/configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8
 ```
 
-2. Download the [Swin pre-trained model](<>). Given the image pre-trained backbone and the lidar-only pre-trained detector, you could train the lidar-camera fusion model:
+2. Download the [Swin pre-trained model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/swint-nuimages-pretrained.pth). Given the image pre-trained backbone and the lidar-only pre-trained detector, you could train the lidar-camera fusion model:
 
 ```bash
-python tools/train.py projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE}
+bash tools/dist_train.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py 8 --cfg-options load_from=${LIDAR_PRETRAINED_CHECKPOINT} model.img_backbone.init_cfg.checkpoint=${IMAGE_PRETRAINED_BACKBONE}
 ```
 
+**Note** that if you want to reduce CUDA memory usage and computational overhead, you can simply append `--amp` to the above commands. The model will then be trained in fp16 mode.
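For readers wondering how `--amp` interacts with the fp32 regions added in PATCH 15/29: conceptually, AMP pairs an fp16 autocast context with a gradient scaler whose `growth_interval` matches the value set in PATCH 17/29. The sketch below is plain PyTorch and only approximates what mmengine's `AmpOptimWrapper` does internally; the tiny model and data are stand-ins.

```python
import torch

scaler = torch.cuda.amp.GradScaler(growth_interval=2000)  # cf. PATCH 17/29
model = torch.nn.Linear(8, 8).cuda()                      # stand-in model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
x = torch.randn(4, 8, device='cuda')

optimizer.zero_grad()
with torch.autocast(device_type='cuda', dtype=torch.float16):
    # In the patches above, numerically sensitive parts (view transform,
    # voxelization, heatmap head) opt back into fp32 via nested autocast
    # blocks; everything else runs in fp16 here.
    loss = model(x).pow(2).mean()

scaler.scale(loss).backward()  # scale the loss so fp16 grads do not underflow
scaler.step(optimizer)         # unscales grads; skips the step on overflow
scaler.update()                # grows the scale after enough clean steps
```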
+ ### Testing commands In MMDetection3D's root directory, run the following command to test the model: ```bash -python tools/test.py projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} +bash tools/dist_test.sh projects/BEVFusion/configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py ${CHECKPOINT_PATH} 8 ``` ## Results and models ### NuScenes -| Backbone | Voxel type (voxel size) | NMS | Mem (GB) | Inf time (fps) | NDS | mAP | Download | -| :-----------------------------------------------------------------------------: | :---------------------: | :-: | :------: | :------------: | :---: | :---: | :------------------------------------------------------------------------------------------------------: | -| [SECFPN](./configs/bevfusion_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 71.62 | 68.77 | [converted_model](https://drive.google.com/file/d/1QkvbYDk4G2d6SZoeJqish13qSyXA4lp3/view?usp=share_link) | +| Modality | Voxel type (voxel size) | NMS | Mem (GB) | Inf time (fps) | NDS | mAP | Download | +| :------------------------------------------------------------------------------------------: | :---------------------: | :-: | :------: | :------------: | :--: | :--: | :-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------: | +| [lidar](./configs/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 69.6 | 64.9 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-2628f933.pth) [logs](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d_20230322_053447.log) | +| [lidar-cam](./configs/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d.py) | voxel (0.075) | × | - | - | 71.4 | 68.6 | [model](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d-5239b1af.pth) [logs](https://download.openmmlab.com/mmdetection3d/v1.1.0_models/bevfusion/bevfusion_lidar-cam_voxel0075_second_secfpn_8xb4-cyclic-20e_nus-3d_20230524_001539.log) | ## Citation From e127201486dfe18da73792176431ca25f6723c1d Mon Sep 17 00:00:00 2001 From: JingweiZhang12 Date: Wed, 31 May 2023 14:30:38 +0800 Subject: [PATCH 29/29] fix height offset --- projects/BEVFusion/bevfusion/transfusion_head.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/projects/BEVFusion/bevfusion/transfusion_head.py b/projects/BEVFusion/bevfusion/transfusion_head.py index a70023ae14..8a3e1750db 100644 --- a/projects/BEVFusion/bevfusion/transfusion_head.py +++ b/projects/BEVFusion/bevfusion/transfusion_head.py @@ -481,8 +481,6 @@ def predict_by_feat(self, ret = dict(bboxes=boxes3d, scores=scores, labels=labels) temp_instances = InstanceData() - ret['bboxes'][:, 2] = ret[ - 'bboxes'][:, 2] - ret['bboxes'][:, 5] * 0.5 # noqa: E501 temp_instances.bboxes_3d = metas[0]['box_type_3d']( ret['bboxes'], box_dim=ret['bboxes'].shape[-1]) temp_instances.scores_3d = ret['scores']
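A closing note on PATCH 29/29 above: the deleted lines shifted every decoded box down by half its height. One plausible reading, consistent with the `box_type_3d='LiDAR'` convention used throughout this series, is that the bbox coder already emits a bottom-center `z`, so the extra shift double-counted the half-height. A small sketch of the arithmetic with made-up numbers:

```python
import torch

# columns: x, y, z, dx, dy, dz, yaw (values are made up)
boxes = torch.tensor([[10.0, 2.0, -1.0, 4.0, 2.0, 1.5, 0.0]])

# Gravity center -> bottom center is a single half-height shift:
gravity_z = boxes[:, 2] + boxes[:, 5] * 0.5
bottom_z = gravity_z - boxes[:, 5] * 0.5
assert torch.allclose(bottom_z, boxes[:, 2])

# The removed line applied z -= dz * 0.5 unconditionally; on an already
# bottom-centered z it sinks the box by half its height (-1.75 vs -1.0).
wrong_z = boxes[:, 2] - boxes[:, 5] * 0.5
```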