From 7a2996448642857ecb754c0f188d81d0ef473f93 Mon Sep 17 00:00:00 2001 From: Aaron Date: Wed, 17 Jan 2024 11:21:19 -0500 Subject: [PATCH 01/35] add test for coco yaml parsing --- .../tests/unit/configs/test_dataset_config.py | 44 +++++++++++++++++++ 1 file changed, 44 insertions(+) create mode 100644 rtdetr_pytorch/tests/unit/configs/test_dataset_config.py diff --git a/rtdetr_pytorch/tests/unit/configs/test_dataset_config.py b/rtdetr_pytorch/tests/unit/configs/test_dataset_config.py new file mode 100644 index 00000000..ad2b3595 --- /dev/null +++ b/rtdetr_pytorch/tests/unit/configs/test_dataset_config.py @@ -0,0 +1,44 @@ +from pathlib import Path + +from src.core import YAMLConfig + + +def test_coco_detection_yml(): + coco_detection_config_file = Path( + Path(__file__).parent.parent.parent.parent + / "configs/dataset/coco_detection.yml" + ).resolve() + assert coco_detection_config_file.is_file() + cfg = YAMLConfig(cfg_path=str(coco_detection_config_file)) + assert cfg + assert cfg.yaml_cfg == { + "task": "detection", + "num_classes": 80, + "remap_mscoco_category": True, + "train_dataloader": { + "type": "DataLoader", + "dataset": { + "type": "CocoDetection", + "img_folder": "./dataset/coco/train2017/", + "ann_file": "./dataset/coco/annotations/instances_train2017.json", + "transforms": {"type": "Compose", "ops": None}, + }, + "shuffle": True, + "batch_size": 8, + "num_workers": 4, + "drop_last": True, + }, + "val_dataloader": { + "type": "DataLoader", + "dataset": { + "type": "CocoDetection", + "img_folder": "./dataset/coco/val2017/", + "ann_file": "./dataset/coco/annotations/instances_val2017.json", + "transforms": {"type": "Compose", "ops": None}, + }, + "shuffle": False, + "batch_size": 8, + "num_workers": 4, + "drop_last": False, + }, + } From d690d9470d08a6484f7e3bfd7412cbfe0ad0a061 Mon Sep 17 00:00:00 2001 From: Aaron Date: Wed, 17 Jan 2024 12:23:08 -0500 Subject: [PATCH 02/35] add remaining config tests --- .../configs/rtdetr_r18vd_6x_coco_config.py | 191 ++++++++++++++++++ .../unit/configs/test_rtdetr_r50vd_config.py | 104 ++++++++++ .../tests/unit/configs/test_runtime_config.py | 20 ++ 3 files changed, 315 insertions(+) create mode 100644 rtdetr_pytorch/tests/unit/configs/rtdetr_r18vd_6x_coco_config.py create mode 100644 rtdetr_pytorch/tests/unit/configs/test_rtdetr_r50vd_config.py create mode 100644 rtdetr_pytorch/tests/unit/configs/test_runtime_config.py diff --git a/rtdetr_pytorch/tests/unit/configs/rtdetr_r18vd_6x_coco_config.py b/rtdetr_pytorch/tests/unit/configs/rtdetr_r18vd_6x_coco_config.py new file mode 100644 index 00000000..f540e751 --- /dev/null +++ b/rtdetr_pytorch/tests/unit/configs/rtdetr_r18vd_6x_coco_config.py @@ -0,0 +1,191 @@ +from pathlib import Path + +from src.core import YAMLConfig + + +def test_rtdetr_r18vd_6x_coco_yml(): + config_file = Path( + Path(__file__).parent.parent.parent.parent + / "configs/rtdetr/rtdetr_r18vd_6x_coco.yml" + ).resolve() + assert config_file.is_file() + cfg = YAMLConfig(cfg_path=str(config_file)) + assert cfg + assert cfg.model.multi_scale == [ + 480, + 512, + 544, + 576, + 608, + 640, + 640, + 640, + 672, + 704, + 736, + 768, + 800, + ] + assert cfg.yaml_cfg == { + "HybridEncoder": { + "act": "silu", + "depth_mult": 1, + "dim_feedforward": 1024, + "dropout": 0.0, + "enc_act": "gelu", + "eval_spatial_size": [640, 640], + "expansion": 0.5, + "feat_strides": [8, 16, 32], + "hidden_dim": 256, + "in_channels": [128, 256, 512], + "nhead": 8, + "num_encoder_layers": 1, + "pe_temperature": 10000, + "use_encoder_idx": [2], + }, + "PResNet": 
{ + "depth": 18, + "freeze_at": -1, + "freeze_norm": False, + "num_stages": 4, + "pretrained": True, + "return_idx": [1, 2, 3], + "variant": "d", + }, + "RTDETR": { + "backbone": "PResNet", + "decoder": "RTDETRTransformer", + "encoder": "HybridEncoder", + "multi_scale": [ + 480, + 512, + 544, + 576, + 608, + 640, + 640, + 640, + 672, + 704, + 736, + 768, + 800, + ], + }, + "RTDETRPostProcessor": {"num_top_queries": 300}, + "RTDETRTransformer": { + "eval_idx": -1, + "eval_spatial_size": [640, 640], + "feat_channels": [256, 256, 256], + "feat_strides": [8, 16, 32], + "hidden_dim": 256, + "num_decoder_layers": 3, + "num_denoising": 100, + "num_levels": 3, + "num_queries": 300, + }, + "SetCriterion": { + "alpha": 0.75, + "gamma": 2.0, + "losses": ["vfl", "boxes"], + "matcher": { + "alpha": 0.25, + "gamma": 2.0, + "type": "HungarianMatcher", + "weight_dict": {"cost_bbox": 5, "cost_class": 2, "cost_giou": 2}, + }, + "weight_dict": {"loss_bbox": 5, "loss_giou": 2, "loss_vfl": 1}, + }, + "__include__": [ + "../dataset/coco_detection.yml", + "../runtime.yml", + "./include/dataloader.yml", + "./include/optimizer.yml", + "./include/rtdetr_r50vd.yml", + ], + "clip_max_norm": 0.1, + "criterion": "SetCriterion", + "ema": {"decay": 0.9999, "type": "ModelEMA", "warmups": 2000}, + "epoches": 72, + "find_unused_parameters": True, + "lr_scheduler": {"gamma": 0.1, "milestones": [1000], "type": "MultiStepLR"}, + "model": "RTDETR", + "num_classes": 80, + "optimizer": { + "betas": [0.9, 0.999], + "lr": 0.0001, + "params": [ + { + "lr": 1e-05, + "params": "^(?=.*backbone)(?=.*norm).*$", + "weight_decay": 0.0, + }, + {"lr": 1e-05, "params": "^(?=.*backbone)(?!.*norm).*$"}, + { + "params": "^(?=.*(?:encoder|decoder))(?=.*(?:norm|bias)).*$", + "weight_decay": 0.0, + }, + ], + "type": "AdamW", + "weight_decay": 0.0001, + }, + "output_dir": "./output/rtdetr_r18vd_6x_coco", + "postprocessor": "RTDETRPostProcessor", + "remap_mscoco_category": True, + "scaler": {"enabled": True, "type": "GradScaler"}, + "sync_bn": True, + "task": "detection", + "train_dataloader": { + "batch_size": 4, + "collate_fn": "default_collate_fn", + "dataset": { + "ann_file": "./dataset/coco/annotations/instances_train2017.json", + "img_folder": "./dataset/coco/train2017/", + "return_masks": False, + "transforms": { + "ops": [ + {"p": 0.8, "type": "RandomPhotometricDistort"}, + {"fill": 0, "type": "RandomZoomOut"}, + {"p": 0.8, "type": "RandomIoUCrop"}, + {"min_size": 1, "type": "SanitizeBoundingBox"}, + {"type": "RandomHorizontalFlip"}, + {"size": [640, 640], "type": "Resize"}, + {"type": "ToImageTensor"}, + {"type": "ConvertDtype"}, + {"min_size": 1, "type": "SanitizeBoundingBox"}, + {"normalize": True, "out_fmt": "cxcywh", "type": "ConvertBox"}, + ], + "type": "Compose", + }, + "type": "CocoDetection", + }, + "drop_last": True, + "num_workers": 4, + "shuffle": True, + "type": "DataLoader", + }, + "use_amp": False, + "use_ema": True, + "use_focal_loss": True, + "val_dataloader": { + "batch_size": 8, + "collate_fn": "default_collate_fn", + "dataset": { + "ann_file": "./dataset/coco/annotations/instances_val2017.json", + "img_folder": "./dataset/coco/val2017/", + "transforms": { + "ops": [ + {"size": [640, 640], "type": "Resize"}, + {"type": "ToImageTensor"}, + {"type": "ConvertDtype"}, + ], + "type": "Compose", + }, + "type": "CocoDetection", + }, + "drop_last": False, + "num_workers": 4, + "shuffle": False, + "type": "DataLoader", + }, + } diff --git a/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r50vd_config.py 
b/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r50vd_config.py new file mode 100644 index 00000000..4ac2e611 --- /dev/null +++ b/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r50vd_config.py @@ -0,0 +1,104 @@ +from pathlib import Path + +from src.core import YAMLConfig + + +def test_rtdetr_r50vd_yml(): + config_file = Path( + Path(__file__).parent.parent.parent.parent + / "configs/rtdetr/include/rtdetr_r50vd.yml" + ).resolve() + assert config_file.is_file() + cfg = YAMLConfig(cfg_path=str(config_file)) + assert cfg + assert cfg.model.multi_scale == [ + 480, + 512, + 544, + 576, + 608, + 640, + 640, + 640, + 672, + 704, + 736, + 768, + 800, + ] + assert cfg.yaml_cfg == { + "task": "detection", + "model": "RTDETR", + "criterion": "SetCriterion", + "postprocessor": "RTDETRPostProcessor", + "RTDETR": { + "backbone": "PResNet", + "encoder": "HybridEncoder", + "decoder": "RTDETRTransformer", + "multi_scale": [ + 480, + 512, + 544, + 576, + 608, + 640, + 640, + 640, + 672, + 704, + 736, + 768, + 800, + ], + }, + "PResNet": { + "depth": 50, + "variant": "d", + "freeze_at": 0, + "return_idx": [1, 2, 3], + "num_stages": 4, + "freeze_norm": True, + "pretrained": True, + }, + "HybridEncoder": { + "in_channels": [512, 1024, 2048], + "feat_strides": [8, 16, 32], + "hidden_dim": 256, + "use_encoder_idx": [2], + "num_encoder_layers": 1, + "nhead": 8, + "dim_feedforward": 1024, + "dropout": 0.0, + "enc_act": "gelu", + "pe_temperature": 10000, + "expansion": 1.0, + "depth_mult": 1, + "act": "silu", + "eval_spatial_size": [640, 640], + }, + "RTDETRTransformer": { + "feat_channels": [256, 256, 256], + "feat_strides": [8, 16, 32], + "hidden_dim": 256, + "num_levels": 3, + "num_queries": 300, + "num_decoder_layers": 6, + "num_denoising": 100, + "eval_idx": -1, + "eval_spatial_size": [640, 640], + }, + "use_focal_loss": True, + "RTDETRPostProcessor": {"num_top_queries": 300}, + "SetCriterion": { + "weight_dict": {"loss_vfl": 1, "loss_bbox": 5, "loss_giou": 2}, + "losses": ["vfl", "boxes"], + "alpha": 0.75, + "gamma": 2.0, + "matcher": { + "type": "HungarianMatcher", + "weight_dict": {"cost_class": 2, "cost_bbox": 5, "cost_giou": 2}, + "alpha": 0.25, + "gamma": 2.0, + }, + }, + } diff --git a/rtdetr_pytorch/tests/unit/configs/test_runtime_config.py b/rtdetr_pytorch/tests/unit/configs/test_runtime_config.py new file mode 100644 index 00000000..83307d50 --- /dev/null +++ b/rtdetr_pytorch/tests/unit/configs/test_runtime_config.py @@ -0,0 +1,20 @@ +from pathlib import Path + +from src.core import YAMLConfig + + +def test_runtime_yml(): + config_file = Path( + Path(__file__).parent.parent.parent.parent / "configs/runtime.yml" + ).resolve() + assert config_file.is_file() + cfg = YAMLConfig(cfg_path=str(config_file)) + assert cfg + assert cfg.yaml_cfg == { + "ema": {"decay": 0.9999, "type": "ModelEMA", "warmups": 2000}, + "find_unused_parameters": False, + "scaler": {"enabled": True, "type": "GradScaler"}, + "sync_bn": True, + "use_amp": False, + "use_ema": False, + } From 3cf1a7840a87014c92ad65b406f258ce54f1f106 Mon Sep 17 00:00:00 2001 From: Aaron Date: Sun, 21 Jan 2024 16:42:18 -0500 Subject: [PATCH 03/35] Add somoe test for parsing config files --- ...18vd_6x_coco_config.py => test_rtdetr_r18vd_6x_coco_config.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename rtdetr_pytorch/tests/unit/configs/{rtdetr_r18vd_6x_coco_config.py => test_rtdetr_r18vd_6x_coco_config.py} (100%) diff --git a/rtdetr_pytorch/tests/unit/configs/rtdetr_r18vd_6x_coco_config.py 
b/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py similarity index 100% rename from rtdetr_pytorch/tests/unit/configs/rtdetr_r18vd_6x_coco_config.py rename to rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py From 6b6c5726bba9b1f6ffa4ee181a56447cc6d2fe6f Mon Sep 17 00:00:00 2001 From: Aaron Date: Sun, 21 Jan 2024 16:42:37 -0500 Subject: [PATCH 04/35] black formatting --- .../src/zoo/rtdetr/rtdetr_decoder.py | 515 +++++++++++------- 1 file changed, 325 insertions(+), 190 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index f5011617..ab33a5ec 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -1,33 +1,35 @@ """by lyuwenyu """ -import math -import copy +import copy +import math from collections import OrderedDict -import torch -import torch.nn as nn -import torch.nn.functional as F -import torch.nn.init as init - -from .denoising import get_contrastive_denoising_training_group -from .utils import deformable_attention_core_func, get_activation, inverse_sigmoid -from .utils import bias_init_with_prob - - +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.nn.init as init from src.core import register +from .denoising import get_contrastive_denoising_training_group +from .utils import ( + bias_init_with_prob, + deformable_attention_core_func, + get_activation, + inverse_sigmoid, +) -__all__ = ['RTDETRTransformer'] - +__all__ = ["RTDETRTransformer"] class MLP(nn.Module): - def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act='relu'): + def __init__(self, input_dim, hidden_dim, output_dim, num_layers, act="relu"): super().__init__() self.num_layers = num_layers h = [hidden_dim] * (num_layers - 1) - self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim])) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) self.act = nn.Identity() if act is None else get_activation(act) def forward(self, x): @@ -36,9 +38,14 @@ def forward(self, x): return x - class MSDeformableAttention(nn.Module): - def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4,): + def __init__( + self, + embed_dim=256, + num_heads=8, + num_levels=4, + num_points=4, + ): """ Multi-Scale Deformable Attention Module """ @@ -50,9 +57,14 @@ def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4,): self.total_points = num_heads * num_levels * num_points self.head_dim = embed_dim // num_heads - assert self.head_dim * num_heads == self.embed_dim, "embed_dim must be divisible by num_heads" + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" - self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2,) + self.sampling_offsets = nn.Linear( + embed_dim, + self.total_points * 2, + ) self.attention_weights = nn.Linear(embed_dim, self.total_points) self.value_proj = nn.Linear(embed_dim, embed_dim) self.output_proj = nn.Linear(embed_dim, embed_dim) @@ -61,15 +73,20 @@ def __init__(self, embed_dim=256, num_heads=8, num_levels=4, num_points=4,): self._reset_parameters() - def _reset_parameters(self): # sampling_offsets init.constant_(self.sampling_offsets.weight, 0) - thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) + thetas = torch.arange(self.num_heads, dtype=torch.float32) * ( + 2.0 * 
math.pi / self.num_heads + ) grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values - grid_init = grid_init.reshape(self.num_heads, 1, 1, 2).tile([1, self.num_levels, self.num_points, 1]) - scaling = torch.arange(1, self.num_points + 1, dtype=torch.float32).reshape(1, 1, -1, 1) + grid_init = grid_init.reshape(self.num_heads, 1, 1, 2).tile( + [1, self.num_levels, self.num_points, 1] + ) + scaling = torch.arange(1, self.num_points + 1, dtype=torch.float32).reshape( + 1, 1, -1, 1 + ) grid_init *= scaling self.sampling_offsets.bias.data[...] = grid_init.flatten() @@ -83,13 +100,9 @@ def _reset_parameters(self): init.xavier_uniform_(self.output_proj.weight) init.constant_(self.output_proj.bias, 0) - - def forward(self, - query, - reference_points, - value, - value_spatial_shapes, - value_mask=None): + def forward( + self, query, reference_points, value, value_spatial_shapes, value_mask=None + ): """ Args: query (Tensor): [bs, query_length, C] @@ -113,29 +126,42 @@ def forward(self, value = value.reshape(bs, Len_v, self.num_heads, self.head_dim) sampling_offsets = self.sampling_offsets(query).reshape( - bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2) + bs, Len_q, self.num_heads, self.num_levels, self.num_points, 2 + ) attention_weights = self.attention_weights(query).reshape( - bs, Len_q, self.num_heads, self.num_levels * self.num_points) + bs, Len_q, self.num_heads, self.num_levels * self.num_points + ) attention_weights = F.softmax(attention_weights, dim=-1).reshape( - bs, Len_q, self.num_heads, self.num_levels, self.num_points) + bs, Len_q, self.num_heads, self.num_levels, self.num_points + ) if reference_points.shape[-1] == 2: offset_normalizer = torch.tensor(value_spatial_shapes) offset_normalizer = offset_normalizer.flip([1]).reshape( - 1, 1, 1, self.num_levels, 1, 2) - sampling_locations = reference_points.reshape( - bs, Len_q, 1, self.num_levels, 1, 2 - ) + sampling_offsets / offset_normalizer + 1, 1, 1, self.num_levels, 1, 2 + ) + sampling_locations = ( + reference_points.reshape(bs, Len_q, 1, self.num_levels, 1, 2) + + sampling_offsets / offset_normalizer + ) elif reference_points.shape[-1] == 4: sampling_locations = ( - reference_points[:, :, None, :, None, :2] + sampling_offsets / - self.num_points * reference_points[:, :, None, :, None, 2:] * 0.5) + reference_points[:, :, None, :, None, :2] + + sampling_offsets + / self.num_points + * reference_points[:, :, None, :, None, 2:] + * 0.5 + ) else: raise ValueError( - "Last dim of reference_points must be 2 or 4, but get {} instead.". 
- format(reference_points.shape[-1])) + "Last dim of reference_points must be 2 or 4, but get {} instead.".format( + reference_points.shape[-1] + ) + ) - output = self.ms_deformable_attn_core(value, value_spatial_shapes, sampling_locations, attention_weights) + output = self.ms_deformable_attn_core( + value, value_spatial_shapes, sampling_locations, attention_weights + ) output = self.output_proj(output) @@ -143,18 +169,22 @@ def forward(self, class TransformerDecoderLayer(nn.Module): - def __init__(self, - d_model=256, - n_head=8, - dim_feedforward=1024, - dropout=0., - activation="relu", - n_levels=4, - n_points=4,): + def __init__( + self, + d_model=256, + n_head=8, + dim_feedforward=1024, + dropout=0.0, + activation="relu", + n_levels=4, + n_points=4, + ): super(TransformerDecoderLayer, self).__init__() # self attention - self.self_attn = nn.MultiheadAttention(d_model, n_head, dropout=dropout, batch_first=True) + self.self_attn = nn.MultiheadAttention( + d_model, n_head, dropout=dropout, batch_first=True + ) self.dropout1 = nn.Dropout(dropout) self.norm1 = nn.LayerNorm(d_model) @@ -185,15 +215,17 @@ def with_pos_embed(self, tensor, pos): def forward_ffn(self, tgt): return self.linear2(self.dropout3(self.activation(self.linear1(tgt)))) - def forward(self, - tgt, - reference_points, - memory, - memory_spatial_shapes, - memory_level_start_index, - attn_mask=None, - memory_mask=None, - query_pos_embed=None): + def forward( + self, + tgt, + reference_points, + memory, + memory_spatial_shapes, + memory_level_start_index, + attn_mask=None, + memory_mask=None, + query_pos_embed=None, + ): # self attention q = k = self.with_pos_embed(tgt, query_pos_embed) @@ -208,12 +240,13 @@ def forward(self, tgt = self.norm1(tgt) # cross attention - tgt2 = self.cross_attn(\ - self.with_pos_embed(tgt, query_pos_embed), - reference_points, - memory, - memory_spatial_shapes, - memory_mask) + tgt2 = self.cross_attn( + self.with_pos_embed(tgt, query_pos_embed), + reference_points, + memory, + memory_spatial_shapes, + memory_mask, + ) tgt = tgt + self.dropout2(tgt2) tgt = self.norm2(tgt) @@ -228,22 +261,26 @@ def forward(self, class TransformerDecoder(nn.Module): def __init__(self, hidden_dim, decoder_layer, num_layers, eval_idx=-1): super(TransformerDecoder, self).__init__() - self.layers = nn.ModuleList([copy.deepcopy(decoder_layer) for _ in range(num_layers)]) + self.layers = nn.ModuleList( + [copy.deepcopy(decoder_layer) for _ in range(num_layers)] + ) self.hidden_dim = hidden_dim self.num_layers = num_layers self.eval_idx = eval_idx if eval_idx >= 0 else num_layers + eval_idx - def forward(self, - tgt, - ref_points_unact, - memory, - memory_spatial_shapes, - memory_level_start_index, - bbox_head, - score_head, - query_pos_head, - attn_mask=None, - memory_mask=None): + def forward( + self, + tgt, + ref_points_unact, + memory, + memory_spatial_shapes, + memory_level_start_index, + bbox_head, + score_head, + query_pos_head, + attn_mask=None, + memory_mask=None, + ): output = tgt dec_out_bboxes = [] dec_out_logits = [] @@ -253,18 +290,29 @@ def forward(self, ref_points_input = ref_points_detach.unsqueeze(2) query_pos_embed = query_pos_head(ref_points_detach) - output = layer(output, ref_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) + output = layer( + output, + ref_points_input, + memory, + memory_spatial_shapes, + memory_level_start_index, + attn_mask, + memory_mask, + query_pos_embed, + ) - inter_ref_bbox = F.sigmoid(bbox_head[i](output) + 
inverse_sigmoid(ref_points_detach)) + inter_ref_bbox = F.sigmoid( + bbox_head[i](output) + inverse_sigmoid(ref_points_detach) + ) if self.training: dec_out_logits.append(score_head[i](output)) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: - dec_out_bboxes.append(F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))) + dec_out_bboxes.append( + F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points)) + ) elif i == self.eval_idx: dec_out_logits.append(score_head[i](output)) @@ -272,41 +320,46 @@ def forward(self, break ref_points = inter_ref_bbox - ref_points_detach = inter_ref_bbox.detach( - ) if self.training else inter_ref_bbox + ref_points_detach = ( + inter_ref_bbox.detach() if self.training else inter_ref_bbox + ) return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits) @register class RTDETRTransformer(nn.Module): - __share__ = ['num_classes'] - def __init__(self, - num_classes=80, - hidden_dim=256, - num_queries=300, - position_embed_type='sine', - feat_channels=[512, 1024, 2048], - feat_strides=[8, 16, 32], - num_levels=3, - num_decoder_points=4, - nhead=8, - num_decoder_layers=6, - dim_feedforward=1024, - dropout=0., - activation="relu", - num_denoising=100, - label_noise_ratio=0.5, - box_noise_scale=1.0, - learnt_init_query=False, - eval_spatial_size=None, - eval_idx=-1, - eps=1e-2, - aux_loss=True): - + __share__ = ["num_classes"] + + def __init__( + self, + num_classes=80, + hidden_dim=256, + num_queries=300, + position_embed_type="sine", + feat_channels=[512, 1024, 2048], + feat_strides=[8, 16, 32], + num_levels=3, + num_decoder_points=4, + nhead=8, + num_decoder_layers=6, + dim_feedforward=1024, + dropout=0.0, + activation="relu", + num_denoising=100, + label_noise_ratio=0.5, + box_noise_scale=1.0, + learnt_init_query=False, + eval_spatial_size=None, + eval_idx=-1, + eps=1e-2, + aux_loss=True, + ): super(RTDETRTransformer, self).__init__() - assert position_embed_type in ['sine', 'learned'], \ - f'ValueError: position_embed_type not supported {position_embed_type}!' + assert position_embed_type in [ + "sine", + "learned", + ], f"ValueError: position_embed_type not supported {position_embed_type}!" 
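        # Fewer backbone feature maps than num_levels is permitted; the loop below
        # synthesizes the missing strides by doubling the last provided stride.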
assert len(feat_channels) <= num_levels assert len(feat_strides) == len(feat_channels) for _ in range(num_levels - len(feat_strides)): @@ -327,16 +380,28 @@ def __init__(self, self._build_input_proj_layer(feat_channels) # Transformer module - decoder_layer = TransformerDecoderLayer(hidden_dim, nhead, dim_feedforward, dropout, activation, num_levels, num_decoder_points) - self.decoder = TransformerDecoder(hidden_dim, decoder_layer, num_decoder_layers, eval_idx) + decoder_layer = TransformerDecoderLayer( + hidden_dim, + nhead, + dim_feedforward, + dropout, + activation, + num_levels, + num_decoder_points, + ) + self.decoder = TransformerDecoder( + hidden_dim, decoder_layer, num_decoder_layers, eval_idx + ) self.num_denoising = num_denoising self.label_noise_ratio = label_noise_ratio self.box_noise_scale = box_noise_scale # denoising part - if num_denoising > 0: + if num_denoising > 0: # self.denoising_class_embed = nn.Embedding(num_classes, hidden_dim, padding_idx=num_classes-1) # TODO for load paddle weights - self.denoising_class_embed = nn.Embedding(num_classes+1, hidden_dim, padding_idx=num_classes) + self.denoising_class_embed = nn.Embedding( + num_classes + 1, hidden_dim, padding_idx=num_classes + ) # decoder embedding self.learnt_init_query = learnt_init_query @@ -347,20 +412,23 @@ def __init__(self, # encoder head self.enc_output = nn.Sequential( nn.Linear(hidden_dim, hidden_dim), - nn.LayerNorm(hidden_dim,) + nn.LayerNorm( + hidden_dim, + ), ) self.enc_score_head = nn.Linear(hidden_dim, num_classes) self.enc_bbox_head = MLP(hidden_dim, hidden_dim, 4, num_layers=3) # decoder head - self.dec_score_head = nn.ModuleList([ - nn.Linear(hidden_dim, num_classes) - for _ in range(num_decoder_layers) - ]) - self.dec_bbox_head = nn.ModuleList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) - for _ in range(num_decoder_layers) - ]) + self.dec_score_head = nn.ModuleList( + [nn.Linear(hidden_dim, num_classes) for _ in range(num_decoder_layers)] + ) + self.dec_bbox_head = nn.ModuleList( + [ + MLP(hidden_dim, hidden_dim, 4, num_layers=3) + for _ in range(num_decoder_layers) + ] + ) # init encoder output anchors and valid_mask if self.eval_spatial_size: @@ -379,7 +447,7 @@ def _reset_parameters(self): init.constant_(cls_.bias, bias) init.constant_(reg_.layers[-1].weight, 0) init.constant_(reg_.layers[-1].bias, 0) - + # linear_init_(self.enc_output[0]) init.xavier_uniform_(self.enc_output[0].weight) if self.learnt_init_query: @@ -387,14 +455,25 @@ def _reset_parameters(self): init.xavier_uniform_(self.query_pos_head.layers[0].weight) init.xavier_uniform_(self.query_pos_head.layers[1].weight) - def _build_input_proj_layer(self, feat_channels): self.input_proj = nn.ModuleList() for in_channels in feat_channels: self.input_proj.append( - nn.Sequential(OrderedDict([ - ('conv', nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False)), - ('norm', nn.BatchNorm2d(self.hidden_dim,))]) + nn.Sequential( + OrderedDict( + [ + ( + "conv", + nn.Conv2d(in_channels, self.hidden_dim, 1, bias=False), + ), + ( + "norm", + nn.BatchNorm2d( + self.hidden_dim, + ), + ), + ] + ) ) ) @@ -402,9 +481,23 @@ def _build_input_proj_layer(self, feat_channels): for _ in range(self.num_levels - len(feat_channels)): self.input_proj.append( - nn.Sequential(OrderedDict([ - ('conv', nn.Conv2d(in_channels, self.hidden_dim, 3, 2, padding=1, bias=False)), - ('norm', nn.BatchNorm2d(self.hidden_dim))]) + nn.Sequential( + OrderedDict( + [ + ( + "conv", + nn.Conv2d( + in_channels, + self.hidden_dim, + 3, + 2, + padding=1, + bias=False, + ), + ), 
+ ("norm", nn.BatchNorm2d(self.hidden_dim)), + ] + ) ) ) in_channels = self.hidden_dim @@ -423,7 +516,9 @@ def _get_encoder_input(self, feats): # get encoder inputs feat_flatten = [] spatial_shapes = [] - level_start_index = [0, ] + level_start_index = [ + 0, + ] for i, feat in enumerate(proj_feats): _, _, h, w = feat.shape # [b, c, h, w] -> [b, h*w, c] @@ -438,28 +533,31 @@ def _get_encoder_input(self, feats): level_start_index.pop() return (feat_flatten, spatial_shapes, level_start_index) - def _generate_anchors(self, - spatial_shapes=None, - grid_size=0.05, - dtype=torch.float32, - device='cpu'): + def _generate_anchors( + self, spatial_shapes=None, grid_size=0.05, dtype=torch.float32, device="cpu" + ): if spatial_shapes is None: - spatial_shapes = [[int(self.eval_spatial_size[0] / s), int(self.eval_spatial_size[1] / s)] + spatial_shapes = [ + [int(self.eval_spatial_size[0] / s), int(self.eval_spatial_size[1] / s)] for s in self.feat_strides ] anchors = [] for lvl, (h, w) in enumerate(spatial_shapes): - grid_y, grid_x = torch.meshgrid(\ - torch.arange(end=h, dtype=dtype), \ - torch.arange(end=w, dtype=dtype), indexing='ij') + grid_y, grid_x = torch.meshgrid( + torch.arange(end=h, dtype=dtype), + torch.arange(end=w, dtype=dtype), + indexing="ij", + ) grid_xy = torch.stack([grid_x, grid_y], -1) valid_WH = torch.tensor([w, h]).to(dtype) grid_xy = (grid_xy.unsqueeze(0) + 0.5) / valid_WH - wh = torch.ones_like(grid_xy) * grid_size * (2.0 ** lvl) + wh = torch.ones_like(grid_xy) * grid_size * (2.0**lvl) anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, h * w, 4)) anchors = torch.concat(anchors, 1).to(device) - valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all(-1, keepdim=True) + valid_mask = ((anchors > self.eps) * (anchors < 1 - self.eps)).all( + -1, keepdim=True + ) anchors = torch.log(anchors / (1 - anchors)) # anchors = torch.where(valid_mask, anchors, float('inf')) # anchors[valid_mask] = torch.inf # valid_mask [1, 8400, 1] @@ -467,46 +565,60 @@ def _generate_anchors(self, return anchors, valid_mask - - def _get_decoder_input(self, - memory, - spatial_shapes, - denoising_class=None, - denoising_bbox_unact=None): + def _get_decoder_input( + self, memory, spatial_shapes, denoising_class=None, denoising_bbox_unact=None + ): bs, _, _ = memory.shape # prepare input for decoder if self.training or self.eval_spatial_size is None: - anchors, valid_mask = self._generate_anchors(spatial_shapes, device=memory.device) + anchors, valid_mask = self._generate_anchors( + spatial_shapes, device=memory.device + ) else: - anchors, valid_mask = self.anchors.to(memory.device), self.valid_mask.to(memory.device) + anchors, valid_mask = self.anchors.to(memory.device), self.valid_mask.to( + memory.device + ) # memory = torch.where(valid_mask, memory, 0) - memory = valid_mask.to(memory.dtype) * memory # TODO fix type error for onnx export + memory = ( + valid_mask.to(memory.dtype) * memory + ) # TODO fix type error for onnx export output_memory = self.enc_output(memory) enc_outputs_class = self.enc_score_head(output_memory) enc_outputs_coord_unact = self.enc_bbox_head(output_memory) + anchors - _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.num_queries, dim=1) - - reference_points_unact = enc_outputs_coord_unact.gather(dim=1, \ - index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_unact.shape[-1])) + _, topk_ind = torch.topk( + enc_outputs_class.max(-1).values, self.num_queries, dim=1 + ) + + reference_points_unact = enc_outputs_coord_unact.gather( + dim=1, + 
index=topk_ind.unsqueeze(-1).repeat( + 1, 1, enc_outputs_coord_unact.shape[-1] + ), + ) enc_topk_bboxes = F.sigmoid(reference_points_unact) if denoising_bbox_unact is not None: reference_points_unact = torch.concat( - [denoising_bbox_unact, reference_points_unact], 1) - - enc_topk_logits = enc_outputs_class.gather(dim=1, \ - index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1])) + [denoising_bbox_unact, reference_points_unact], 1 + ) + + enc_topk_logits = enc_outputs_class.gather( + dim=1, + index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1]), + ) # extract region features if self.learnt_init_query: target = self.tgt_embed.weight.unsqueeze(0).tile([bs, 1, 1]) else: - target = output_memory.gather(dim=1, \ - index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1])) + target = output_memory.gather( + dim=1, + index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1]), + ) target = target.detach() if denoising_class is not None: @@ -514,27 +626,42 @@ def _get_decoder_input(self, return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits - def forward(self, feats, targets=None): - # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) - + # prepare denoising training if self.training and self.num_denoising > 0: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ - get_contrastive_denoising_training_group(targets, \ - self.num_classes, - self.num_queries, - self.denoising_class_embed, - num_denoising=self.num_denoising, - label_noise_ratio=self.label_noise_ratio, - box_noise_scale=self.box_noise_scale, ) + ( + denoising_class, + denoising_bbox_unact, + attn_mask, + dn_meta, + ) = get_contrastive_denoising_training_group( + targets, + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=self.box_noise_scale, + ) else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = ( + None, + None, + None, + None, + ) - target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ - self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact) + ( + target, + init_ref_points_unact, + enc_topk_bboxes, + enc_topk_logits, + ) = self._get_decoder_input( + memory, spatial_shapes, denoising_class, denoising_bbox_unact + ) # decoder out_bboxes, out_logits = self.decoder( @@ -546,29 +673,37 @@ def forward(self, feats, targets=None): self.dec_bbox_head, self.dec_score_head, self.query_pos_head, - attn_mask=attn_mask) + attn_mask=attn_mask, + ) if self.training and dn_meta is not None: - dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2) - dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2) + dn_out_bboxes, out_bboxes = torch.split( + out_bboxes, dn_meta["dn_num_split"], dim=2 + ) + dn_out_logits, out_logits = torch.split( + out_logits, dn_meta["dn_num_split"], dim=2 + ) - out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} + out = {"pred_logits": out_logits[-1], "pred_boxes": out_bboxes[-1]} if self.training and self.aux_loss: - out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) - out['aux_outputs'].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes])) - + out["aux_outputs"] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) 
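            # the encoder's top-k proposals (enc_topk_logits / enc_topk_bboxes) are
            # appended as one more auxiliary prediction for supervision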
+ out["aux_outputs"].extend( + self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes]) + ) + if self.training and dn_meta is not None: - out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) - out['dn_meta'] = dn_meta + out["dn_aux_outputs"] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) + out["dn_meta"] = dn_meta return out - @torch.jit.unused def _set_aux_loss(self, outputs_class, outputs_coord): # this is a workaround to make torchscript happy, as torchscript # doesn't support dictionary with non-homogeneous values, such # as a dict having both a Tensor and a list. - return [{'pred_logits': a, 'pred_boxes': b} - for a, b in zip(outputs_class, outputs_coord)] + return [ + {"pred_logits": a, "pred_boxes": b} + for a, b in zip(outputs_class, outputs_coord) + ] From c693d0d0d6e66e40541155205397f22b0a2e9416 Mon Sep 17 00:00:00 2001 From: Aaron Date: Sun, 21 Jan 2024 17:20:47 -0500 Subject: [PATCH 05/35] output embeddings --- rtdetr_pytorch/src/export/__init__.py | 0 rtdetr_pytorch/src/export/pkl.py | 0 rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py | 12 +++++++++--- 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 rtdetr_pytorch/src/export/__init__.py create mode 100644 rtdetr_pytorch/src/export/pkl.py diff --git a/rtdetr_pytorch/src/export/__init__.py b/rtdetr_pytorch/src/export/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/rtdetr_pytorch/src/export/pkl.py b/rtdetr_pytorch/src/export/pkl.py new file mode 100644 index 00000000..e69de29b diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index ab33a5ec..9aba9e7d 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -324,7 +324,9 @@ def forward( inter_ref_bbox.detach() if self.training else inter_ref_bbox ) - return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits) + # bbox predictions, classification logits, features + + return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits), output @register @@ -664,7 +666,7 @@ def forward(self, feats, targets=None): ) # decoder - out_bboxes, out_logits = self.decoder( + out_bboxes, out_logits, out_features = self.decoder( target, init_ref_points_unact, memory, @@ -684,7 +686,11 @@ def forward(self, feats, targets=None): out_logits, dn_meta["dn_num_split"], dim=2 ) - out = {"pred_logits": out_logits[-1], "pred_boxes": out_bboxes[-1]} + out = { + "pred_logits": out_logits[-1], + "pred_boxes": out_bboxes[-1], + "features": out_features[-1], + } if self.training and self.aux_loss: out["aux_outputs"] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) From 00bd3c921cd00a712d46de255ba4d61d6fb1c1c9 Mon Sep 17 00:00:00 2001 From: Aaron Date: Sun, 21 Jan 2024 17:46:24 -0500 Subject: [PATCH 06/35] black formatting --- .../src/zoo/rtdetr/rtdetr_postprocessor.py | 80 ++++++++++++------- 1 file changed, 50 insertions(+), 30 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py index 344d69ac..e405cbeb 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py @@ -1,40 +1,47 @@ """by lyuwenyu """ -import torch -import torch.nn as nn -import torch.nn.functional as F - +import torch +import torch.nn as nn +import torch.nn.functional as F import torchvision - from src.core import register - -__all__ = ['RTDETRPostProcessor'] +__all__ = ["RTDETRPostProcessor"] @register 
class RTDETRPostProcessor(nn.Module): - __share__ = ['num_classes', 'use_focal_loss', 'num_top_queries', 'remap_mscoco_category'] - - def __init__(self, num_classes=80, use_focal_loss=True, num_top_queries=300, remap_mscoco_category=False) -> None: + __share__ = [ + "num_classes", + "use_focal_loss", + "num_top_queries", + "remap_mscoco_category", + ] + + def __init__( + self, + num_classes=80, + use_focal_loss=True, + num_top_queries=300, + remap_mscoco_category=False, + ) -> None: super().__init__() self.use_focal_loss = use_focal_loss self.num_top_queries = num_top_queries self.num_classes = num_classes - self.remap_mscoco_category = remap_mscoco_category - self.deploy_mode = False + self.remap_mscoco_category = remap_mscoco_category + self.deploy_mode = False def extra_repr(self) -> str: - return f'use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}' - + return f"use_focal_loss={self.use_focal_loss}, num_classes={self.num_classes}, num_top_queries={self.num_top_queries}" + # def forward(self, outputs, orig_target_sizes): def forward(self, outputs, orig_target_sizes): + logits, boxes = outputs["pred_logits"], outputs["pred_boxes"] + # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) - logits, boxes = outputs['pred_logits'], outputs['pred_boxes'] - # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) - - bbox_pred = torchvision.ops.box_convert(boxes, in_fmt='cxcywh', out_fmt='xyxy') + bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy") bbox_pred *= orig_target_sizes.repeat(1, 2).unsqueeze(1) if self.use_focal_loss: @@ -42,16 +49,20 @@ def forward(self, outputs, orig_target_sizes): scores, index = torch.topk(scores.flatten(1), self.num_top_queries, axis=-1) labels = index % self.num_classes index = index // self.num_classes - boxes = bbox_pred.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1])) - + boxes = bbox_pred.gather( + dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]) + ) + else: scores = F.softmax(logits)[:, :, :-1] scores, labels = scores.max(dim=-1) if scores.shape[1] > self.num_top_queries: scores, index = torch.topk(scores, self.num_top_queries, dim=-1) labels = torch.gather(labels, dim=1, index=index) - boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])) - + boxes = torch.gather( + boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]) + ) + # TODO for onnx export if self.deploy_mode: return labels, boxes, scores @@ -59,22 +70,31 @@ def forward(self, outputs, orig_target_sizes): # TODO if self.remap_mscoco_category: from ...data.coco import mscoco_label2category - labels = torch.tensor([mscoco_label2category[int(x.item())] for x in labels.flatten()])\ - .to(boxes.device).reshape(labels.shape) + + labels = ( + torch.tensor( + [mscoco_label2category[int(x.item())] for x in labels.flatten()] + ) + .to(boxes.device) + .reshape(labels.shape) + ) results = [] for lab, box, sco in zip(labels, boxes, scores): result = dict(labels=lab, boxes=box, scores=sco) results.append(result) - + return results - - def deploy(self, ): + def deploy( + self, + ): self.eval() self.deploy_mode = True - return self + return self @property - def iou_types(self, ): - return ('bbox', ) + def iou_types( + self, + ): + return ("bbox",) From 45fe6d543bd546dca18f9af3c8c41b9ba74c4a4a Mon Sep 17 00:00:00 2001 From: Aaron Date: Sun, 21 Jan 2024 17:49:15 -0500 Subject: [PATCH 07/35] 
postprocessing outputs features --- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py | 2 +- .../src/zoo/rtdetr/rtdetr_postprocessor.py | 15 ++++++++++----- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index 9aba9e7d..c3ab4e89 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -689,7 +689,7 @@ def forward(self, feats, targets=None): out = { "pred_logits": out_logits[-1], "pred_boxes": out_bboxes[-1], - "features": out_features[-1], + "features": out_features, } if self.training and self.aux_loss: diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py index e405cbeb..d6bbdc10 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py @@ -38,7 +38,12 @@ def extra_repr(self) -> str: # def forward(self, outputs, orig_target_sizes): def forward(self, outputs, orig_target_sizes): - logits, boxes = outputs["pred_logits"], outputs["pred_boxes"] + logits, boxes, features = ( + outputs["pred_logits"], + outputs["pred_boxes"], + outputs["features"], + ) + # orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) bbox_pred = torchvision.ops.box_convert(boxes, in_fmt="cxcywh", out_fmt="xyxy") @@ -65,7 +70,7 @@ def forward(self, outputs, orig_target_sizes): # TODO for onnx export if self.deploy_mode: - return labels, boxes, scores + return labels, boxes, scores, features # TODO if self.remap_mscoco_category: @@ -80,9 +85,9 @@ def forward(self, outputs, orig_target_sizes): ) results = [] - for lab, box, sco in zip(labels, boxes, scores): - result = dict(labels=lab, boxes=box, scores=sco) - results.append(result) + # features untested when self.deploy_mode==False + for lab, box, sco, feat in zip(labels, boxes, scores): # , features): + result = dict(labels=lab, boxes=box, scores=sco) # , features=feat) return results From cee45ffc7652849ee19300158b4adf8bba1a0710 Mon Sep 17 00:00:00 2001 From: Aaron Date: Sun, 21 Jan 2024 17:57:24 -0500 Subject: [PATCH 08/35] update rtdetr model test --- rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py | 39 +++++++++++++++---------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py index 851d4f74..cb36f538 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr.py @@ -1,22 +1,26 @@ """by lyuwenyu """ -import torch -import torch.nn as nn -import torch.nn.functional as F - -import random -import numpy as np +import random +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F from src.core import register - -__all__ = ['RTDETR', ] +__all__ = [ + "RTDETR", +] @register class RTDETR(nn.Module): - __inject__ = ['backbone', 'encoder', 'decoder', ] + __inject__ = [ + "backbone", + "encoder", + "decoder", + ] def __init__(self, backbone: nn.Module, encoder, decoder, multi_scale=None): super().__init__() @@ -24,21 +28,24 @@ def __init__(self, backbone: nn.Module, encoder, decoder, multi_scale=None): self.decoder = decoder self.encoder = encoder self.multi_scale = multi_scale - + def forward(self, x, targets=None): if self.multi_scale and self.training: sz = np.random.choice(self.multi_scale) + sz = sz.item() x = F.interpolate(x, size=[sz, sz]) - + x = self.backbone(x) - x = self.encoder(x) + x = 
self.encoder(x) x = self.decoder(x, targets) return x - - def deploy(self, ): + + def deploy( + self, + ): self.eval() for m in self.modules(): - if hasattr(m, 'convert_to_deploy'): + if hasattr(m, "convert_to_deploy"): m.convert_to_deploy() - return self + return self From 470763bc8022a57353d504ce187ff1b78dc02d39 Mon Sep 17 00:00:00 2001 From: Aaron Date: Mon, 22 Jan 2024 09:39:55 -0500 Subject: [PATCH 09/35] black formatting --- rtdetr_pytorch/src/solver/det_solver.py | 127 +++++++++++++++--------- 1 file changed, 82 insertions(+), 45 deletions(-) diff --git a/rtdetr_pytorch/src/solver/det_solver.py b/rtdetr_pytorch/src/solver/det_solver.py index d0a0a840..e770022b 100644 --- a/rtdetr_pytorch/src/solver/det_solver.py +++ b/rtdetr_pytorch/src/solver/det_solver.py @@ -1,73 +1,97 @@ -''' +""" by lyuwenyu -''' -import time -import json +""" import datetime +import json +import time -import torch - -from src.misc import dist +import torch from src.data import get_coco_api_from_dataset +from src.misc import dist +from .det_engine import evaluate, train_one_epoch from .solver import BaseSolver -from .det_engine import train_one_epoch, evaluate class DetSolver(BaseSolver): - - def fit(self, ): + def fit( + self, + ): print("Start training") self.train() - args = self.cfg - - n_parameters = sum(p.numel() for p in self.model.parameters() if p.requires_grad) - print('number of params:', n_parameters) + args = self.cfg + + n_parameters = sum( + p.numel() for p in self.model.parameters() if p.requires_grad + ) + print("number of params:", n_parameters) base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset) # best_stat = {'coco_eval_bbox': 0, 'coco_eval_masks': 0, 'epoch': -1, } - best_stat = {'epoch': -1, } + best_stat = { + "epoch": -1, + } start_time = time.time() for epoch in range(self.last_epoch + 1, args.epoches): if dist.is_dist_available_and_initialized(): self.train_dataloader.sampler.set_epoch(epoch) - + train_stats = train_one_epoch( - self.model, self.criterion, self.train_dataloader, self.optimizer, self.device, epoch, - args.clip_max_norm, print_freq=args.log_step, ema=self.ema, scaler=self.scaler) + self.model, + self.criterion, + self.train_dataloader, + self.optimizer, + self.device, + epoch, + args.clip_max_norm, + print_freq=args.log_step, + ema=self.ema, + scaler=self.scaler, + ) self.lr_scheduler.step() - + if self.output_dir: - checkpoint_paths = [self.output_dir / 'checkpoint.pth'] + checkpoint_paths = [self.output_dir / "checkpoint.pth"] # extra checkpoint before LR drop and every 100 epochs if (epoch + 1) % args.checkpoint_step == 0: - checkpoint_paths.append(self.output_dir / f'checkpoint{epoch:04}.pth') + checkpoint_paths.append( + self.output_dir / f"checkpoint{epoch:04}.pth" + ) for checkpoint_path in checkpoint_paths: dist.save_on_master(self.state_dict(epoch), checkpoint_path) module = self.ema.module if self.ema else self.model test_stats, coco_evaluator = evaluate( - module, self.criterion, self.postprocessor, self.val_dataloader, base_ds, self.device, self.output_dir + module, + self.criterion, + self.postprocessor, + self.val_dataloader, + base_ds, + self.device, + self.output_dir, ) - # TODO + # TODO for k in test_stats.keys(): if k in best_stat: - best_stat['epoch'] = epoch if test_stats[k][0] > best_stat[k] else best_stat['epoch'] + best_stat["epoch"] = ( + epoch if test_stats[k][0] > best_stat[k] else best_stat["epoch"] + ) best_stat[k] = max(best_stat[k], test_stats[k][0]) else: - best_stat['epoch'] = epoch + best_stat["epoch"] = epoch best_stat[k] = 
test_stats[k][0] - print('best_stat: ', best_stat) - + print("best_stat: ", best_stat) - log_stats = {**{f'train_{k}': v for k, v in train_stats.items()}, - **{f'test_{k}': v for k, v in test_stats.items()}, - 'epoch': epoch, - 'n_parameters': n_parameters} + log_stats = { + **{f"train_{k}": v for k, v in train_stats.items()}, + **{f"test_{k}": v for k, v in test_stats.items()}, + "epoch": epoch, + "n_parameters": n_parameters, + } if self.output_dir and dist.is_main_process(): with (self.output_dir / "log.txt").open("a") as f: @@ -75,30 +99,43 @@ def fit(self, ): # for evaluation logs if coco_evaluator is not None: - (self.output_dir / 'eval').mkdir(exist_ok=True) + (self.output_dir / "eval").mkdir(exist_ok=True) if "bbox" in coco_evaluator.coco_eval: - filenames = ['latest.pth'] + filenames = ["latest.pth"] if epoch % 50 == 0: - filenames.append(f'{epoch:03}.pth') + filenames.append(f"{epoch:03}.pth") for name in filenames: - torch.save(coco_evaluator.coco_eval["bbox"].eval, - self.output_dir / "eval" / name) + torch.save( + coco_evaluator.coco_eval["bbox"].eval, + self.output_dir / "eval" / name, + ) total_time = time.time() - start_time total_time_str = str(datetime.timedelta(seconds=int(total_time))) - print('Training time {}'.format(total_time_str)) + print("Training time {}".format(total_time_str)) + return best_stat - - def val(self, ): + def val( + self, + ): self.eval() base_ds = get_coco_api_from_dataset(self.val_dataloader.dataset) - + module = self.ema.module if self.ema else self.model - test_stats, coco_evaluator = evaluate(module, self.criterion, self.postprocessor, - self.val_dataloader, base_ds, self.device, self.output_dir) - + test_stats, coco_evaluator = evaluate( + module, + self.criterion, + self.postprocessor, + self.val_dataloader, + base_ds, + self.device, + self.output_dir, + ) + if self.output_dir: - dist.save_on_master(coco_evaluator.coco_eval["bbox"].eval, self.output_dir / "eval.pth") - + dist.save_on_master( + coco_evaluator.coco_eval["bbox"].eval, self.output_dir / "eval.pth" + ) + return From e20577abe8817d825d9539d47b56ce38575bf1a4 Mon Sep 17 00:00:00 2001 From: Aaron Date: Mon, 22 Jan 2024 09:40:23 -0500 Subject: [PATCH 10/35] fix failing to return results --- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py index d6bbdc10..22e3f2bd 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py @@ -86,8 +86,9 @@ def forward(self, outputs, orig_target_sizes): results = [] # features untested when self.deploy_mode==False - for lab, box, sco, feat in zip(labels, boxes, scores): # , features): - result = dict(labels=lab, boxes=box, scores=sco) # , features=feat) + for lab, box, sco, feat in zip(labels, boxes, scores, features): # , features): + result = dict(labels=lab, boxes=box, scores=sco, features=feat) + results.append(result) return results From 8606744c2ab208c939642d17e2a55eae780f06df Mon Sep 17 00:00:00 2001 From: Aaron Date: Mon, 22 Jan 2024 09:40:43 -0500 Subject: [PATCH 11/35] add torch pkl export --- rtdetr_pytorch/src/export/pkl.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/rtdetr_pytorch/src/export/pkl.py b/rtdetr_pytorch/src/export/pkl.py index e69de29b..f0710699 100644 --- a/rtdetr_pytorch/src/export/pkl.py +++ b/rtdetr_pytorch/src/export/pkl.py @@ -0,0 
+1,32 @@ +from pathlib import Path + +import torch +import torch.nn as nn +from src.core import YAMLConfig + + +class Model(nn.Module): + def __init__(self, cfg) -> None: + super().__init__() + self.model = cfg.model.deploy() + self.postprocessor = cfg.postprocessor.deploy() + print(self.postprocessor.deploy_mode) + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + print("POSTPROCESSING OUTPUT") + return self.postprocessor(outputs, orig_target_sizes) + + +def export_torch_pkl(cfg: YAMLConfig, checkpoint_path: Path, output_path: Path): + checkpoint = torch.load(checkpoint_path, map_location="cpu") + if "ema" in checkpoint: + state = checkpoint["ema"]["module"] + else: + state = checkpoint["model"] + + # NOTE load train mode state -> convert to deploy mode + cfg.model.load_state_dict(state) + + model = Model(cfg) + torch.save(model, output_path) From b90e2199174c9bc4e07dc4b8e59ed21a36f4822d Mon Sep 17 00:00:00 2001 From: Aaron Date: Mon, 22 Jan 2024 09:41:05 -0500 Subject: [PATCH 12/35] black formatting --- rtdetr_pytorch/src/data/coco/coco_dataset.py | 227 ++++++++++--------- rtdetr_pytorch/src/inference_model.py | 13 ++ 2 files changed, 132 insertions(+), 108 deletions(-) create mode 100644 rtdetr_pytorch/src/inference_model.py diff --git a/rtdetr_pytorch/src/data/coco/coco_dataset.py b/rtdetr_pytorch/src/data/coco/coco_dataset.py index 0ef78498..837b6bd2 100644 --- a/rtdetr_pytorch/src/data/coco/coco_dataset.py +++ b/rtdetr_pytorch/src/data/coco/coco_dataset.py @@ -4,28 +4,36 @@ COCO dataset which returns image_id for evaluation. Mostly copy-paste from https://github.com/pytorch/vision/blob/13b35ff/references/detection/coco_utils.py """ +from pathlib import Path import torch import torch.utils.data - import torchvision -torchvision.disable_beta_transforms_warning() -from torchvision import datapoints +torchvision.disable_beta_transforms_warning() from pycocotools import mask as coco_mask - from src.core import register +from torchvision import datapoints -__all__ = ['CocoDetection'] +__all__ = ["CocoDetection"] @register class CocoDetection(torchvision.datasets.CocoDetection): - __inject__ = ['transforms'] - __share__ = ['remap_mscoco_category'] - - def __init__(self, img_folder, ann_file, transforms, return_masks, remap_mscoco_category=False): + __inject__ = ["transforms"] + __share__ = ["remap_mscoco_category"] + + def __init__( + self, + img_folder, + ann_file, + transforms, + return_masks, + remap_mscoco_category=False, + ): + img_folder = Path(img_folder).resolve() + ann_file = Path(ann_file).resolve() super(CocoDetection, self).__init__(img_folder, ann_file) self._transforms = transforms self.prepare = ConvertCocoPolysToMask(return_masks, remap_mscoco_category) @@ -37,31 +45,32 @@ def __init__(self, img_folder, ann_file, transforms, return_masks, remap_mscoco_ def __getitem__(self, idx): img, target = super(CocoDetection, self).__getitem__(idx) image_id = self.ids[idx] - target = {'image_id': image_id, 'annotations': target} + target = {"image_id": image_id, "annotations": target} img, target = self.prepare(img, target) # ['boxes', 'masks', 'labels']: - if 'boxes' in target: - target['boxes'] = datapoints.BoundingBox( - target['boxes'], - format=datapoints.BoundingBoxFormat.XYXY, - spatial_size=img.size[::-1]) # h w + if "boxes" in target: + target["boxes"] = datapoints.BoundingBox( + target["boxes"], + format=datapoints.BoundingBoxFormat.XYXY, + spatial_size=img.size[::-1], + ) # h w - if 'masks' in target: - target['masks'] = 
datapoints.Mask(target['masks']) + if "masks" in target: + target["masks"] = datapoints.Mask(target["masks"]) if self._transforms is not None: img, target = self._transforms(img, target) - + return img, target def extra_repr(self) -> str: - s = f' img_folder: {self.img_folder}\n ann_file: {self.ann_file}\n' - s += f' return_masks: {self.return_masks}\n' - if hasattr(self, '_transforms') and self._transforms is not None: - s += f' transforms:\n {repr(self._transforms)}' + s = f" img_folder: {self.img_folder}\n ann_file: {self.ann_file}\n" + s += f" return_masks: {self.return_masks}\n" + if hasattr(self, "_transforms") and self._transforms is not None: + s += f" transforms:\n {repr(self._transforms)}" - return s + return s def convert_coco_poly_to_mask(segmentations, height, width): @@ -94,7 +103,7 @@ def __call__(self, image, target): anno = target["annotations"] - anno = [obj for obj in anno if 'iscrowd' not in obj or obj['iscrowd'] == 0] + anno = [obj for obj in anno if "iscrowd" not in obj or obj["iscrowd"] == 0] boxes = [obj["bbox"] for obj in anno] # guard against no boxes via resizing @@ -107,7 +116,7 @@ def __call__(self, image, target): classes = [mscoco_category2label[obj["category_id"]] for obj in anno] else: classes = [obj["category_id"] for obj in anno] - + classes = torch.tensor(classes, dtype=torch.int64) if self.return_masks: @@ -141,98 +150,100 @@ def __call__(self, image, target): # for conversion to coco api area = torch.tensor([obj["area"] for obj in anno]) - iscrowd = torch.tensor([obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno]) + iscrowd = torch.tensor( + [obj["iscrowd"] if "iscrowd" in obj else 0 for obj in anno] + ) target["area"] = area[keep] target["iscrowd"] = iscrowd[keep] target["orig_size"] = torch.as_tensor([int(w), int(h)]) target["size"] = torch.as_tensor([int(w), int(h)]) - + return image, target mscoco_category2name = { - 1: 'person', - 2: 'bicycle', - 3: 'car', - 4: 'motorcycle', - 5: 'airplane', - 6: 'bus', - 7: 'train', - 8: 'truck', - 9: 'boat', - 10: 'traffic light', - 11: 'fire hydrant', - 13: 'stop sign', - 14: 'parking meter', - 15: 'bench', - 16: 'bird', - 17: 'cat', - 18: 'dog', - 19: 'horse', - 20: 'sheep', - 21: 'cow', - 22: 'elephant', - 23: 'bear', - 24: 'zebra', - 25: 'giraffe', - 27: 'backpack', - 28: 'umbrella', - 31: 'handbag', - 32: 'tie', - 33: 'suitcase', - 34: 'frisbee', - 35: 'skis', - 36: 'snowboard', - 37: 'sports ball', - 38: 'kite', - 39: 'baseball bat', - 40: 'baseball glove', - 41: 'skateboard', - 42: 'surfboard', - 43: 'tennis racket', - 44: 'bottle', - 46: 'wine glass', - 47: 'cup', - 48: 'fork', - 49: 'knife', - 50: 'spoon', - 51: 'bowl', - 52: 'banana', - 53: 'apple', - 54: 'sandwich', - 55: 'orange', - 56: 'broccoli', - 57: 'carrot', - 58: 'hot dog', - 59: 'pizza', - 60: 'donut', - 61: 'cake', - 62: 'chair', - 63: 'couch', - 64: 'potted plant', - 65: 'bed', - 67: 'dining table', - 70: 'toilet', - 72: 'tv', - 73: 'laptop', - 74: 'mouse', - 75: 'remote', - 76: 'keyboard', - 77: 'cell phone', - 78: 'microwave', - 79: 'oven', - 80: 'toaster', - 81: 'sink', - 82: 'refrigerator', - 84: 'book', - 85: 'clock', - 86: 'vase', - 87: 'scissors', - 88: 'teddy bear', - 89: 'hair drier', - 90: 'toothbrush' + 1: "person", + 2: "bicycle", + 3: "car", + 4: "motorcycle", + 5: "airplane", + 6: "bus", + 7: "train", + 8: "truck", + 9: "boat", + 10: "traffic light", + 11: "fire hydrant", + 13: "stop sign", + 14: "parking meter", + 15: "bench", + 16: "bird", + 17: "cat", + 18: "dog", + 19: "horse", + 20: "sheep", + 21: "cow", + 22: 
"elephant", + 23: "bear", + 24: "zebra", + 25: "giraffe", + 27: "backpack", + 28: "umbrella", + 31: "handbag", + 32: "tie", + 33: "suitcase", + 34: "frisbee", + 35: "skis", + 36: "snowboard", + 37: "sports ball", + 38: "kite", + 39: "baseball bat", + 40: "baseball glove", + 41: "skateboard", + 42: "surfboard", + 43: "tennis racket", + 44: "bottle", + 46: "wine glass", + 47: "cup", + 48: "fork", + 49: "knife", + 50: "spoon", + 51: "bowl", + 52: "banana", + 53: "apple", + 54: "sandwich", + 55: "orange", + 56: "broccoli", + 57: "carrot", + 58: "hot dog", + 59: "pizza", + 60: "donut", + 61: "cake", + 62: "chair", + 63: "couch", + 64: "potted plant", + 65: "bed", + 67: "dining table", + 70: "toilet", + 72: "tv", + 73: "laptop", + 74: "mouse", + 75: "remote", + 76: "keyboard", + 77: "cell phone", + 78: "microwave", + 79: "oven", + 80: "toaster", + 81: "sink", + 82: "refrigerator", + 84: "book", + 85: "clock", + 86: "vase", + 87: "scissors", + 88: "teddy bear", + 89: "hair drier", + 90: "toothbrush", } mscoco_category2label = {k: i for i, k in enumerate(mscoco_category2name.keys())} -mscoco_label2category = {v: k for k, v in mscoco_category2label.items()} \ No newline at end of file +mscoco_label2category = {v: k for k, v in mscoco_category2label.items()} diff --git a/rtdetr_pytorch/src/inference_model.py b/rtdetr_pytorch/src/inference_model.py new file mode 100644 index 00000000..1043b917 --- /dev/null +++ b/rtdetr_pytorch/src/inference_model.py @@ -0,0 +1,13 @@ +from torch import nn + + +class InferenceModel(nn.Module): + def __init__(self, cfg) -> None: + super().__init__() + self.model = cfg.model.deploy() + self.postprocessor = cfg.postprocessor.deploy() + print(self.postprocessor.deploy_mode) + + def forward(self, images, orig_target_sizes): + outputs = self.model(images) + return self.postprocessor(outputs, orig_target_sizes) From 5c9d93ae9d8b536ab767f5519d3f3d1311b79ab6 Mon Sep 17 00:00:00 2001 From: Aaron Date: Tue, 23 Jan 2024 08:45:56 -0500 Subject: [PATCH 13/35] add logging hooks --- .../ppdet/modeling/losses/detr_loss.py | 565 ++++++++++-------- rtdetr_pytorch/src/core/config.py | 242 ++++---- rtdetr_pytorch/src/solver/det_engine.py | 90 +-- rtdetr_pytorch/src/solver/det_solver.py | 3 + rtdetr_pytorch/src/solver/solver.py | 169 +++--- 5 files changed, 594 insertions(+), 475 deletions(-) diff --git a/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py b/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py index 24f14c3d..d7b6c797 100644 --- a/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py +++ b/rtdetr_paddle/ppdet/modeling/losses/detr_loss.py @@ -12,42 +12,50 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-from __future__ import absolute_import -from __future__ import division -from __future__ import print_function +from __future__ import absolute_import, division, print_function + +from typing import Callable, List, Optional import paddle import paddle.nn as nn import paddle.nn.functional as F from ppdet.core.workspace import register -from .iou_loss import GIoULoss -from ..transformers import bbox_cxcywh_to_xyxy, sigmoid_focal_loss, varifocal_loss_with_logits + from ..bbox_utils import bbox_iou +from ..transformers import ( + bbox_cxcywh_to_xyxy, + sigmoid_focal_loss, + varifocal_loss_with_logits, +) +from .iou_loss import GIoULoss -__all__ = ['DETRLoss', 'DINOLoss'] +__all__ = ["DETRLoss", "DINOLoss"] @register class DETRLoss(nn.Layer): - __shared__ = ['num_classes', 'use_focal_loss'] - __inject__ = ['matcher'] - - def __init__(self, - num_classes=80, - matcher='HungarianMatcher', - loss_coeff={ - 'class': 1, - 'bbox': 5, - 'giou': 2, - 'no_object': 0.1, - 'mask': 1, - 'dice': 1 - }, - aux_loss=True, - use_focal_loss=False, - use_vfl=False, - use_uni_match=False, - uni_match_ind=0): + __shared__ = ["num_classes", "use_focal_loss"] + __inject__ = ["matcher"] + + def __init__( + self, + num_classes=80, + matcher="HungarianMatcher", + loss_coeff={ + "class": 1, + "bbox": 5, + "giou": 2, + "no_object": 0.1, + "mask": 1, + "dice": 1, + }, + aux_loss=True, + use_focal_loss=False, + use_vfl=False, + use_uni_match=False, + uni_match_ind=0, + loggers: List[Callable[[float, Optional[str]], None]] = None, + ): r""" Args: num_classes (int): The number of classes. @@ -67,101 +75,112 @@ def __init__(self, self.use_vfl = use_vfl self.use_uni_match = use_uni_match self.uni_match_ind = uni_match_ind + self.loggers = loggers or [] if not self.use_focal_loss: - self.loss_coeff['class'] = paddle.full([num_classes + 1], - loss_coeff['class']) - self.loss_coeff['class'][-1] = loss_coeff['no_object'] + self.loss_coeff["class"] = paddle.full( + [num_classes + 1], loss_coeff["class"] + ) + self.loss_coeff["class"][-1] = loss_coeff["no_object"] self.giou_loss = GIoULoss() - def _get_loss_class(self, - logits, - gt_class, - match_indices, - bg_index, - num_gts, - postfix="", - iou_score=None): + def _get_loss_class( + self, + logits, + gt_class, + match_indices, + bg_index, + num_gts, + postfix="", + iou_score=None, + ): # logits: [b, query, num_classes], gt_class: list[[n, 1]] name_class = "loss_class" + postfix - target_label = paddle.full(logits.shape[:2], bg_index, dtype='int64') + target_label = paddle.full(logits.shape[:2], bg_index, dtype="int64") bs, num_query_objects = target_label.shape num_gt = sum(len(a) for a in gt_class) if num_gt > 0: - index, updates = self._get_index_updates(num_query_objects, - gt_class, match_indices) + index, updates = self._get_index_updates( + num_query_objects, gt_class, match_indices + ) target_label = paddle.scatter( - target_label.reshape([-1, 1]), index, updates.astype('int64')) + target_label.reshape([-1, 1]), index, updates.astype("int64") + ) target_label = target_label.reshape([bs, num_query_objects]) if self.use_focal_loss: - target_label = F.one_hot(target_label, - self.num_classes + 1)[..., :-1] + target_label = F.one_hot(target_label, self.num_classes + 1)[..., :-1] if iou_score is not None and self.use_vfl: target_score = paddle.zeros([bs, num_query_objects]) if num_gt > 0: target_score = paddle.scatter( - target_score.reshape([-1, 1]), index, iou_score) - target_score = target_score.reshape( - [bs, num_query_objects, 1]) * target_label - loss_ = 
self.loss_coeff['class'] * varifocal_loss_with_logits( - logits, target_score, target_label, - num_gts / num_query_objects) + target_score.reshape([-1, 1]), index, iou_score + ) + target_score = ( + target_score.reshape([bs, num_query_objects, 1]) * target_label + ) + loss_ = self.loss_coeff["class"] * varifocal_loss_with_logits( + logits, target_score, target_label, num_gts / num_query_objects + ) else: - loss_ = self.loss_coeff['class'] * sigmoid_focal_loss( - logits, target_label, num_gts / num_query_objects) + loss_ = self.loss_coeff["class"] * sigmoid_focal_loss( + logits, target_label, num_gts / num_query_objects + ) else: loss_ = F.cross_entropy( - logits, target_label, weight=self.loss_coeff['class']) + logits, target_label, weight=self.loss_coeff["class"] + ) return {name_class: loss_} - def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts, - postfix=""): + def _get_loss_bbox(self, boxes, gt_bbox, match_indices, num_gts, postfix=""): # boxes: [b, query, 4], gt_bbox: list[[n, 4]] name_bbox = "loss_bbox" + postfix name_giou = "loss_giou" + postfix loss = dict() if sum(len(a) for a in gt_bbox) == 0: - loss[name_bbox] = paddle.to_tensor([0.]) - loss[name_giou] = paddle.to_tensor([0.]) + loss[name_bbox] = paddle.to_tensor([0.0]) + loss[name_giou] = paddle.to_tensor([0.0]) return loss - src_bbox, target_bbox = self._get_src_target_assign(boxes, gt_bbox, - match_indices) - loss[name_bbox] = self.loss_coeff['bbox'] * F.l1_loss( - src_bbox, target_bbox, reduction='sum') / num_gts + src_bbox, target_bbox = self._get_src_target_assign( + boxes, gt_bbox, match_indices + ) + loss[name_bbox] = ( + self.loss_coeff["bbox"] + * F.l1_loss(src_bbox, target_bbox, reduction="sum") + / num_gts + ) loss[name_giou] = self.giou_loss( - bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox)) + bbox_cxcywh_to_xyxy(src_bbox), bbox_cxcywh_to_xyxy(target_bbox) + ) loss[name_giou] = loss[name_giou].sum() / num_gts - loss[name_giou] = self.loss_coeff['giou'] * loss[name_giou] + loss[name_giou] = self.loss_coeff["giou"] * loss[name_giou] return loss - def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, - postfix=""): + def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, postfix=""): # masks: [b, query, h, w], gt_mask: list[[n, H, W]] name_mask = "loss_mask" + postfix name_dice = "loss_dice" + postfix loss = dict() if sum(len(a) for a in gt_mask) == 0: - loss[name_mask] = paddle.to_tensor([0.]) - loss[name_dice] = paddle.to_tensor([0.]) + loss[name_mask] = paddle.to_tensor([0.0]) + loss[name_dice] = paddle.to_tensor([0.0]) return loss - src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, - match_indices) + src_masks, target_masks = self._get_src_target_assign( + masks, gt_mask, match_indices + ) src_masks = F.interpolate( - src_masks.unsqueeze(0), - size=target_masks.shape[-2:], - mode="bilinear")[0] - loss[name_mask] = self.loss_coeff['mask'] * F.sigmoid_focal_loss( - src_masks, - target_masks, - paddle.to_tensor( - [num_gts], dtype='float32')) - loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( - src_masks, target_masks, num_gts) + src_masks.unsqueeze(0), size=target_masks.shape[-2:], mode="bilinear" + )[0] + loss[name_mask] = self.loss_coeff["mask"] * F.sigmoid_focal_loss( + src_masks, target_masks, paddle.to_tensor([num_gts], dtype="float32") + ) + loss[name_dice] = self.loss_coeff["dice"] * self._dice_loss( + src_masks, target_masks, num_gts + ) return loss def _dice_loss(self, inputs, targets, num_gts): @@ -173,17 +192,19 @@ def 
_dice_loss(self, inputs, targets, num_gts): loss = 1 - (numerator + 1) / (denominator + 1) return loss.sum() / num_gts - def _get_loss_aux(self, - boxes, - logits, - gt_bbox, - gt_class, - bg_index, - num_gts, - dn_match_indices=None, - postfix="", - masks=None, - gt_mask=None): + def _get_loss_aux( + self, + boxes, + logits, + gt_bbox, + gt_class, + bg_index, + num_gts, + dn_match_indices=None, + postfix="", + masks=None, + gt_mask=None, + ): loss_class = [] loss_bbox, loss_giou = [], [] loss_mask, loss_dice = [], [] @@ -196,7 +217,8 @@ def _get_loss_aux(self, gt_bbox, gt_class, masks=masks[self.uni_match_ind] if masks is not None else None, - gt_mask=gt_mask) + gt_mask=gt_mask, + ) for i, (aux_boxes, aux_logits) in enumerate(zip(boxes, logits)): aux_masks = masks[i] if masks is not None else None if not self.use_uni_match and dn_match_indices is None: @@ -206,35 +228,47 @@ def _get_loss_aux(self, gt_bbox, gt_class, masks=aux_masks, - gt_mask=gt_mask) + gt_mask=gt_mask, + ) if self.use_vfl: if sum(len(a) for a in gt_bbox) > 0: src_bbox, target_bbox = self._get_src_target_assign( - aux_boxes.detach(), gt_bbox, match_indices) + aux_boxes.detach(), gt_bbox, match_indices + ) iou_score = bbox_iou( bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), - bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) + bbox_cxcywh_to_xyxy(target_bbox).split(4, -1), + ) else: iou_score = None else: iou_score = None loss_class.append( - self._get_loss_class(aux_logits, gt_class, match_indices, - bg_index, num_gts, postfix, iou_score)[ - 'loss_class' + postfix]) - loss_ = self._get_loss_bbox(aux_boxes, gt_bbox, match_indices, - num_gts, postfix) - loss_bbox.append(loss_['loss_bbox' + postfix]) - loss_giou.append(loss_['loss_giou' + postfix]) + self._get_loss_class( + aux_logits, + gt_class, + match_indices, + bg_index, + num_gts, + postfix, + iou_score, + )["loss_class" + postfix] + ) + loss_ = self._get_loss_bbox( + aux_boxes, gt_bbox, match_indices, num_gts, postfix + ) + loss_bbox.append(loss_["loss_bbox" + postfix]) + loss_giou.append(loss_["loss_giou" + postfix]) if masks is not None and gt_mask is not None: - loss_ = self._get_loss_mask(aux_masks, gt_mask, match_indices, - num_gts, postfix) - loss_mask.append(loss_['loss_mask' + postfix]) - loss_dice.append(loss_['loss_dice' + postfix]) + loss_ = self._get_loss_mask( + aux_masks, gt_mask, match_indices, num_gts, postfix + ) + loss_mask.append(loss_["loss_mask" + postfix]) + loss_dice.append(loss_["loss_dice" + postfix]) loss = { "loss_class_aux" + postfix: paddle.add_n(loss_class), "loss_bbox_aux" + postfix: paddle.add_n(loss_bbox), - "loss_giou_aux" + postfix: paddle.add_n(loss_giou) + "loss_giou_aux" + postfix: paddle.add_n(loss_giou), } if masks is not None and gt_mask is not None: loss["loss_mask_aux" + postfix] = paddle.add_n(loss_mask) @@ -242,28 +276,36 @@ def _get_loss_aux(self, return loss def _get_index_updates(self, num_query_objects, target, match_indices): - batch_idx = paddle.concat([ - paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices) - ]) + batch_idx = paddle.concat( + [paddle.full_like(src, i) for i, (src, _) in enumerate(match_indices)] + ) src_idx = paddle.concat([src for (src, _) in match_indices]) - src_idx += (batch_idx * num_query_objects) - target_assign = paddle.concat([ - paddle.gather( - t, dst, axis=0) for t, (_, dst) in zip(target, match_indices) - ]) + src_idx += batch_idx * num_query_objects + target_assign = paddle.concat( + [ + paddle.gather(t, dst, axis=0) + for t, (_, dst) in zip(target, match_indices) + ] + ) 
return src_idx, target_assign def _get_src_target_assign(self, src, target, match_indices): - src_assign = paddle.concat([ - paddle.gather( - t, I, axis=0) if len(I) > 0 else paddle.zeros([0, t.shape[-1]]) - for t, (I, _) in zip(src, match_indices) - ]) - target_assign = paddle.concat([ - paddle.gather( - t, J, axis=0) if len(J) > 0 else paddle.zeros([0, t.shape[-1]]) - for t, (_, J) in zip(target, match_indices) - ]) + src_assign = paddle.concat( + [ + paddle.gather(t, I, axis=0) + if len(I) > 0 + else paddle.zeros([0, t.shape[-1]]) + for t, (I, _) in zip(src, match_indices) + ] + ) + target_assign = paddle.concat( + [ + paddle.gather(t, J, axis=0) + if len(J) > 0 + else paddle.zeros([0, t.shape[-1]]) + for t, (_, J) in zip(target, match_indices) + ] + ) return src_assign, target_assign def _get_num_gts(self, targets, dtype="float32"): @@ -272,32 +314,37 @@ def _get_num_gts(self, targets, dtype="float32"): if paddle.distributed.get_world_size() > 1: paddle.distributed.all_reduce(num_gts) num_gts /= paddle.distributed.get_world_size() - num_gts = paddle.clip(num_gts, min=1.) + num_gts = paddle.clip(num_gts, min=1.0) return num_gts - def _get_prediction_loss(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - dn_match_indices=None, - num_gts=1): + def _get_prediction_loss( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_match_indices=None, + num_gts=1, + ): if dn_match_indices is None: match_indices = self.matcher( - boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask) + boxes, logits, gt_bbox, gt_class, masks=masks, gt_mask=gt_mask + ) else: match_indices = dn_match_indices if self.use_vfl: if sum(len(a) for a in gt_bbox) > 0: src_bbox, target_bbox = self._get_src_target_assign( - boxes.detach(), gt_bbox, match_indices) + boxes.detach(), gt_bbox, match_indices + ) iou_score = bbox_iou( bbox_cxcywh_to_xyxy(src_bbox).split(4, -1), - bbox_cxcywh_to_xyxy(target_bbox).split(4, -1)) + bbox_cxcywh_to_xyxy(target_bbox).split(4, -1), + ) else: iou_score = None else: @@ -305,26 +352,36 @@ def _get_prediction_loss(self, loss = dict() loss.update( - self._get_loss_class(logits, gt_class, match_indices, - self.num_classes, num_gts, postfix, iou_score)) + self._get_loss_class( + logits, + gt_class, + match_indices, + self.num_classes, + num_gts, + postfix, + iou_score, + ) + ) loss.update( - self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts, - postfix)) + self._get_loss_bbox(boxes, gt_bbox, match_indices, num_gts, postfix) + ) if masks is not None and gt_mask is not None: loss.update( - self._get_loss_mask(masks, gt_mask, match_indices, num_gts, - postfix)) + self._get_loss_mask(masks, gt_mask, match_indices, num_gts, postfix) + ) return loss - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - **kwargs): + def forward( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + **kwargs + ): r""" Args: boxes (Tensor): [l, b, query, 4] @@ -350,7 +407,8 @@ def forward(self, gt_mask=gt_mask, postfix=postfix, dn_match_indices=dn_match_indices, - num_gts=num_gts) + num_gts=num_gts, + ) if self.aux_loss: total_loss.update( @@ -364,37 +422,45 @@ def forward(self, dn_match_indices, postfix, masks=masks[:-1] if masks is not None else None, - gt_mask=gt_mask)) + gt_mask=gt_mask, + ) + ) return total_loss @register class DINOLoss(DETRLoss): - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - 
masks=None, - gt_mask=None, - postfix="", - dn_out_bboxes=None, - dn_out_logits=None, - dn_meta=None, - **kwargs): + def forward( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + postfix="", + dn_out_bboxes=None, + dn_out_logits=None, + dn_meta=None, + **kwargs + ): num_gts = self._get_num_gts(gt_class) total_loss = super(DINOLoss, self).forward( - boxes, logits, gt_bbox, gt_class, num_gts=num_gts) + boxes, logits, gt_bbox, gt_class, num_gts=num_gts + ) if dn_meta is not None: - dn_positive_idx, dn_num_group = \ - dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + dn_positive_idx, dn_num_group = ( + dn_meta["dn_positive_idx"], + dn_meta["dn_num_group"], + ) assert len(gt_class) == len(dn_positive_idx) # denoising match indices dn_match_indices = self.get_dn_match_indices( - gt_class, dn_positive_idx, dn_num_group) + gt_class, dn_positive_idx, dn_num_group + ) # compute denoising training loss num_gts *= dn_num_group @@ -405,12 +471,13 @@ def forward(self, gt_class, postfix="_dn", dn_match_indices=dn_match_indices, - num_gts=num_gts) + num_gts=num_gts, + ) total_loss.update(dn_loss) else: total_loss.update( - {k + '_dn': paddle.to_tensor([0.]) - for k in total_loss.keys()}) + {k + "_dn": paddle.to_tensor([0.0]) for k in total_loss.keys()} + ) return total_loss @@ -425,34 +492,31 @@ def get_dn_match_indices(labels, dn_positive_idx, dn_num_group): assert len(dn_positive_idx[i]) == len(gt_idx) dn_match_indices.append((dn_positive_idx[i], gt_idx)) else: - dn_match_indices.append((paddle.zeros( - [0], dtype="int64"), paddle.zeros( - [0], dtype="int64"))) + dn_match_indices.append( + (paddle.zeros([0], dtype="int64"), paddle.zeros([0], dtype="int64")) + ) return dn_match_indices @register class MaskDINOLoss(DETRLoss): - __shared__ = ['num_classes', 'use_focal_loss', 'num_sample_points'] - __inject__ = ['matcher'] - - def __init__(self, - num_classes=80, - matcher='HungarianMatcher', - loss_coeff={ - 'class': 4, - 'bbox': 5, - 'giou': 2, - 'mask': 5, - 'dice': 5 - }, - aux_loss=True, - use_focal_loss=False, - num_sample_points=12544, - oversample_ratio=3.0, - important_sample_ratio=0.75): - super(MaskDINOLoss, self).__init__(num_classes, matcher, loss_coeff, - aux_loss, use_focal_loss) + __shared__ = ["num_classes", "use_focal_loss", "num_sample_points"] + __inject__ = ["matcher"] + + def __init__( + self, + num_classes=80, + matcher="HungarianMatcher", + loss_coeff={"class": 4, "bbox": 5, "giou": 2, "mask": 5, "dice": 5}, + aux_loss=True, + use_focal_loss=False, + num_sample_points=12544, + oversample_ratio=3.0, + important_sample_ratio=0.75, + ): + super(MaskDINOLoss, self).__init__( + num_classes, matcher, loss_coeff, aux_loss, use_focal_loss + ) assert oversample_ratio >= 1 assert important_sample_ratio <= 1 and important_sample_ratio >= 0 @@ -460,23 +524,24 @@ def __init__(self, self.oversample_ratio = oversample_ratio self.important_sample_ratio = important_sample_ratio self.num_oversample_points = int(num_sample_points * oversample_ratio) - self.num_important_points = int(num_sample_points * - important_sample_ratio) + self.num_important_points = int(num_sample_points * important_sample_ratio) self.num_random_points = num_sample_points - self.num_important_points - def forward(self, - boxes, - logits, - gt_bbox, - gt_class, - masks=None, - gt_mask=None, - postfix="", - dn_out_bboxes=None, - dn_out_logits=None, - dn_out_masks=None, - dn_meta=None, - **kwargs): + def forward( + self, + boxes, + logits, + gt_bbox, + gt_class, + masks=None, + gt_mask=None, + 
postfix="", + dn_out_bboxes=None, + dn_out_logits=None, + dn_out_masks=None, + dn_meta=None, + **kwargs + ): num_gts = self._get_num_gts(gt_class) total_loss = super(MaskDINOLoss, self).forward( boxes, @@ -485,16 +550,20 @@ def forward(self, gt_class, masks=masks, gt_mask=gt_mask, - num_gts=num_gts) + num_gts=num_gts, + ) if dn_meta is not None: - dn_positive_idx, dn_num_group = \ - dn_meta["dn_positive_idx"], dn_meta["dn_num_group"] + dn_positive_idx, dn_num_group = ( + dn_meta["dn_positive_idx"], + dn_meta["dn_num_group"], + ) assert len(gt_class) == len(dn_positive_idx) # denoising match indices dn_match_indices = DINOLoss.get_dn_match_indices( - gt_class, dn_positive_idx, dn_num_group) + gt_class, dn_positive_idx, dn_num_group + ) # compute denoising training loss num_gts *= dn_num_group @@ -507,59 +576,67 @@ def forward(self, gt_mask=gt_mask, postfix="_dn", dn_match_indices=dn_match_indices, - num_gts=num_gts) + num_gts=num_gts, + ) total_loss.update(dn_loss) else: total_loss.update( - {k + '_dn': paddle.to_tensor([0.]) - for k in total_loss.keys()}) + {k + "_dn": paddle.to_tensor([0.0]) for k in total_loss.keys()} + ) return total_loss - def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, - postfix=""): + def _get_loss_mask(self, masks, gt_mask, match_indices, num_gts, postfix=""): # masks: [b, query, h, w], gt_mask: list[[n, H, W]] name_mask = "loss_mask" + postfix name_dice = "loss_dice" + postfix loss = dict() if sum(len(a) for a in gt_mask) == 0: - loss[name_mask] = paddle.to_tensor([0.]) - loss[name_dice] = paddle.to_tensor([0.]) + loss[name_mask] = paddle.to_tensor([0.0]) + loss[name_dice] = paddle.to_tensor([0.0]) return loss - src_masks, target_masks = self._get_src_target_assign(masks, gt_mask, - match_indices) + src_masks, target_masks = self._get_src_target_assign( + masks, gt_mask, match_indices + ) # sample points sample_points = self._get_point_coords_by_uncertainty(src_masks) sample_points = 2.0 * sample_points.unsqueeze(1) - 1.0 src_masks = F.grid_sample( - src_masks.unsqueeze(1), sample_points, - align_corners=False).squeeze([1, 2]) - - target_masks = F.grid_sample( - target_masks.unsqueeze(1), sample_points, - align_corners=False).squeeze([1, 2]).detach() - - loss[name_mask] = self.loss_coeff[ - 'mask'] * F.binary_cross_entropy_with_logits( - src_masks, target_masks, - reduction='none').mean(1).sum() / num_gts - loss[name_dice] = self.loss_coeff['dice'] * self._dice_loss( - src_masks, target_masks, num_gts) + src_masks.unsqueeze(1), sample_points, align_corners=False + ).squeeze([1, 2]) + + target_masks = ( + F.grid_sample(target_masks.unsqueeze(1), sample_points, align_corners=False) + .squeeze([1, 2]) + .detach() + ) + + loss[name_mask] = ( + self.loss_coeff["mask"] + * F.binary_cross_entropy_with_logits( + src_masks, target_masks, reduction="none" + ) + .mean(1) + .sum() + / num_gts + ) + loss[name_dice] = self.loss_coeff["dice"] * self._dice_loss( + src_masks, target_masks, num_gts + ) return loss def _get_point_coords_by_uncertainty(self, masks): # Sample points based on their uncertainty. 
masks = masks.detach() num_masks = masks.shape[0] - sample_points = paddle.rand( - [num_masks, 1, self.num_oversample_points, 2]) + sample_points = paddle.rand([num_masks, 1, self.num_oversample_points, 2]) out_mask = F.grid_sample( - masks.unsqueeze(1), 2.0 * sample_points - 1.0, - align_corners=False).squeeze([1, 2]) + masks.unsqueeze(1), 2.0 * sample_points - 1.0, align_corners=False + ).squeeze([1, 2]) out_mask = -paddle.abs(out_mask) _, topk_ind = paddle.topk(out_mask, self.num_important_points, axis=1) @@ -570,9 +647,7 @@ def _get_point_coords_by_uncertainty(self, masks): sample_points = paddle.gather_nd(sample_points.squeeze(1), topk_ind) if self.num_random_points > 0: sample_points = paddle.concat( - [ - sample_points, - paddle.rand([num_masks, self.num_random_points, 2]) - ], - axis=1) + [sample_points, paddle.rand([num_masks, self.num_random_points, 2])], + axis=1, + ) return sample_points diff --git a/rtdetr_pytorch/src/core/config.py b/rtdetr_pytorch/src/core/config.py index cf803ef5..3e053d7a 100644 --- a/rtdetr_pytorch/src/core/config.py +++ b/rtdetr_pytorch/src/core/config.py @@ -1,134 +1,155 @@ """by lyuwenyu """ -from pprint import pprint -import torch -import torch.nn as nn -from torch.utils.data import Dataset, DataLoader +from typing import Callable, List, Optional + +import torch +import torch.nn as nn +from torch.cuda.amp.grad_scaler import GradScaler from torch.optim import Optimizer from torch.optim.lr_scheduler import LRScheduler -from torch.cuda.amp.grad_scaler import GradScaler - -from typing import Callable, List, Dict - - -__all__ = ['BaseConfig', ] +from torch.utils.data import DataLoader, Dataset +__all__ = [ + "BaseConfig", +] class BaseConfig(object): # TODO property - def __init__(self) -> None: super().__init__() - self.task :str = None - - self._model :nn.Module = None - self._postprocessor :nn.Module = None - self._criterion :nn.Module = None - self._optimizer :Optimizer = None - self._lr_scheduler :LRScheduler = None - self._train_dataloader :DataLoader = None - self._val_dataloader :DataLoader = None - self._ema :nn.Module = None - self._scaler :GradScaler = None - - self.train_dataset :Dataset = None - self.val_dataset :Dataset = None - self.num_workers :int = 0 - self.collate_fn :Callable = None - - self.batch_size :int = None - self._train_batch_size :int = None - self._val_batch_size :int = None - self._train_shuffle: bool = None - self._val_shuffle: bool = None - - self.evaluator :Callable[[nn.Module, DataLoader, str], ] = None + self.task: str = None + + self._model: nn.Module = None + self._postprocessor: nn.Module = None + self._criterion: nn.Module = None + self._optimizer: Optimizer = None + self._lr_scheduler: LRScheduler = None + self._train_dataloader: DataLoader = None + self._val_dataloader: DataLoader = None + self._ema: nn.Module = None + self._scaler: GradScaler = None + + self.train_dataset: Dataset = None + self.val_dataset: Dataset = None + self.num_workers: int = 0 + self.collate_fn: Callable = None + + self.batch_size: int = None + self._train_batch_size: int = None + self._val_batch_size: int = None + self._train_shuffle: bool = None + self._val_shuffle: bool = None + + self.evaluator: Callable[[nn.Module, DataLoader, str],] = None # runtime - self.resume :str = None - self.tuning :str = None - - self.epoches :int = None - self.last_epoch :int = -1 - self.end_epoch :int = None - - self.use_amp :bool = False - self.use_ema :bool = False - self.sync_bn :bool = False - self.clip_max_norm : float = None - 
self.find_unused_parameters :bool = None + self.resume: str = None + self.tuning: str = None + + self.epoches: int = None + self.last_epoch: int = -1 + self.end_epoch: int = None + + self.use_amp: bool = False + self.use_ema: bool = False + self.sync_bn: bool = False + self.clip_max_norm: float = None + self.find_unused_parameters: bool = None # self.ema_decay: float = 0.9999 # self.grad_clip_: Callable = None - self.log_dir :str = './logs/' - self.log_step :int = 10 - self._output_dir :str = None - self._print_freq :int = None - self.checkpoint_step :int = 1 + self.log_dir: str = "./logs/" + self.log_step: int = 10 + self._output_dir: str = None + self._print_freq: int = None + self.checkpoint_step: int = 1 + self.loss_loggers: Optional[List] = None + self.metrics_loggers: Optional[List] = None + self.resource_usage_loggers: Optional[List] = None # self.device :str = torch.device('cpu') - device = 'cuda' if torch.cuda.is_available() else 'cpu' + device = "cuda" if torch.cuda.is_available() else "cpu" self.device = torch.device(device) - @property - def model(self, ) -> nn.Module: - return self._model - + def model( + self, + ) -> nn.Module: + return self._model + @model.setter def model(self, m): - assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' - self._model = m + assert isinstance( + m, nn.Module + ), f"{type(m)} != nn.Module, please check your model class" + self._model = m @property - def postprocessor(self, ) -> nn.Module: + def postprocessor( + self, + ) -> nn.Module: return self._postprocessor - + @postprocessor.setter def postprocessor(self, m): - assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' - self._postprocessor = m + assert isinstance( + m, nn.Module + ), f"{type(m)} != nn.Module, please check your model class" + self._postprocessor = m @property - def criterion(self, ) -> nn.Module: + def criterion( + self, + ) -> nn.Module: return self._criterion - + @criterion.setter def criterion(self, m): - assert isinstance(m, nn.Module), f'{type(m)} != nn.Module, please check your model class' - self._criterion = m + assert isinstance( + m, nn.Module + ), f"{type(m)} != nn.Module, please check your model class" + self._criterion = m @property - def optimizer(self, ) -> Optimizer: + def optimizer( + self, + ) -> Optimizer: return self._optimizer - + @optimizer.setter def optimizer(self, m): - assert isinstance(m, Optimizer), f'{type(m)} != optim.Optimizer, please check your model class' - self._optimizer = m + assert isinstance( + m, Optimizer + ), f"{type(m)} != optim.Optimizer, please check your model class" + self._optimizer = m @property - def lr_scheduler(self, ) -> LRScheduler: + def lr_scheduler( + self, + ) -> LRScheduler: return self._lr_scheduler - + @lr_scheduler.setter def lr_scheduler(self, m): - assert isinstance(m, LRScheduler), f'{type(m)} != LRScheduler, please check your model class' - self._lr_scheduler = m - + assert isinstance( + m, LRScheduler + ), f"{type(m)} != LRScheduler, please check your model class" + self._lr_scheduler = m @property def train_dataloader(self): if self._train_dataloader is None and self.train_dataset is not None: - loader = DataLoader(self.train_dataset, - batch_size=self.train_batch_size, - num_workers=self.num_workers, - collate_fn=self.collate_fn, - shuffle=self.train_shuffle, ) + loader = DataLoader( + self.train_dataset, + batch_size=self.train_batch_size, + num_workers=self.num_workers, + collate_fn=self.collate_fn, + shuffle=self.train_shuffle, + ) 
loader.shuffle = self.train_shuffle self._train_dataloader = loader @@ -136,26 +157,27 @@ def train_dataloader(self): @train_dataloader.setter def train_dataloader(self, loader): - self._train_dataloader = loader + self._train_dataloader = loader @property def val_dataloader(self): if self._val_dataloader is None and self.val_dataset is not None: - loader = DataLoader(self.val_dataset, - batch_size=self.val_batch_size, - num_workers=self.num_workers, - drop_last=False, - collate_fn=self.collate_fn, - shuffle=self.val_shuffle) + loader = DataLoader( + self.val_dataset, + batch_size=self.val_batch_size, + num_workers=self.num_workers, + drop_last=False, + collate_fn=self.collate_fn, + shuffle=self.val_shuffle, + ) loader.shuffle = self.val_shuffle self._val_dataloader = loader return self._val_dataloader - + @val_dataloader.setter def val_dataloader(self, loader): - self._val_dataloader = loader - + self._val_dataloader = loader # TODO method # @property @@ -165,75 +187,73 @@ def val_dataloader(self, loader): # return self._ema @property - def ema(self, ) -> nn.Module: - return self._ema + def ema( + self, + ) -> nn.Module: + return self._ema @ema.setter def ema(self, obj): self._ema = obj - @property - def scaler(self) -> GradScaler: + def scaler(self) -> GradScaler: if self._scaler is None and self.use_amp and torch.cuda.is_available(): self._scaler = GradScaler() return self._scaler - + @scaler.setter def scaler(self, obj: GradScaler): self._scaler = obj - @property def val_shuffle(self): if self._val_shuffle is None: - print('warning: set default val_shuffle=False') + print("warning: set default val_shuffle=False") return False return self._val_shuffle @val_shuffle.setter def val_shuffle(self, shuffle): - assert isinstance(shuffle, bool), 'shuffle must be bool' + assert isinstance(shuffle, bool), "shuffle must be bool" self._val_shuffle = shuffle @property def train_shuffle(self): if self._train_shuffle is None: - print('warning: set default train_shuffle=True') + print("warning: set default train_shuffle=True") return True return self._train_shuffle @train_shuffle.setter def train_shuffle(self, shuffle): - assert isinstance(shuffle, bool), 'shuffle must be bool' + assert isinstance(shuffle, bool), "shuffle must be bool" self._train_shuffle = shuffle - @property def train_batch_size(self): if self._train_batch_size is None and isinstance(self.batch_size, int): - print(f'warning: set train_batch_size=batch_size={self.batch_size}') + print(f"warning: set train_batch_size=batch_size={self.batch_size}") return self.batch_size return self._train_batch_size @train_batch_size.setter def train_batch_size(self, batch_size): - assert isinstance(batch_size, int), 'batch_size must be int' + assert isinstance(batch_size, int), "batch_size must be int" self._train_batch_size = batch_size @property def val_batch_size(self): if self._val_batch_size is None: - print(f'warning: set val_batch_size=batch_size={self.batch_size}') + print(f"warning: set val_batch_size=batch_size={self.batch_size}") return self.batch_size return self._val_batch_size @val_batch_size.setter def val_batch_size(self, batch_size): - assert isinstance(batch_size, int), 'batch_size must be int' + assert isinstance(batch_size, int), "batch_size must be int" self._val_batch_size = batch_size - @property def output_dir(self): if self._output_dir is None: @@ -253,12 +273,8 @@ def print_freq(self): @print_freq.setter def print_freq(self, n): - assert isinstance(n, int), 'print_freq must be int' + assert isinstance(n, int), "print_freq 
must be int" self._print_freq = n - # def __repr__(self) -> str: - # pass - - - + # pass diff --git a/rtdetr_pytorch/src/solver/det_engine.py b/rtdetr_pytorch/src/solver/det_engine.py index fbca083f..eab08105 100644 --- a/rtdetr_pytorch/src/solver/det_engine.py +++ b/rtdetr_pytorch/src/solver/det_engine.py @@ -7,30 +7,38 @@ import math import os -import sys import pathlib -from typing import Iterable +import sys +from typing import Callable, Dict, Iterable import torch -import torch.amp - +import torch.amp from src.data import CocoEvaluator -from src.misc import (MetricLogger, SmoothedValue, reduce_dict) - - -def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, - data_loader: Iterable, optimizer: torch.optim.Optimizer, - device: torch.device, epoch: int, max_norm: float = 0, **kwargs): +from src.misc import MetricLogger, SmoothedValue, reduce_dict + + +def train_one_epoch( + model: torch.nn.Module, + criterion: torch.nn.Module, + data_loader: Iterable, + optimizer: torch.optim.Optimizer, + device: torch.device, + epoch: int, + max_norm: float = 0, + loss_loggers: Iterable[Callable[[Dict, int], None]] = None, + **kwargs +): model.train() criterion.train() metric_logger = MetricLogger(delimiter=" ") - metric_logger.add_meter('lr', SmoothedValue(window_size=1, fmt='{value:.6f}')) + loss_loggers = loss_loggers or [] + metric_logger.add_meter("lr", SmoothedValue(window_size=1, fmt="{value:.6f}")) # metric_logger.add_meter('class_error', SmoothedValue(window_size=1, fmt='{value:.2f}')) - header = 'Epoch: [{}]'.format(epoch) - print_freq = kwargs.get('print_freq', 10) - - ema = kwargs.get('ema', None) - scaler = kwargs.get('scaler', None) + header = "Epoch: [{}]".format(epoch) + print_freq = kwargs.get("print_freq", 10) + + ema = kwargs.get("ema", None) + scaler = kwargs.get("scaler", None) for samples, targets in metric_logger.log_every(data_loader, print_freq, header): samples = samples.to(device) @@ -39,13 +47,13 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, if scaler is not None: with torch.autocast(device_type=str(device), cache_enabled=True): outputs = model(samples, targets) - + with torch.autocast(device_type=str(device), enabled=False): loss_dict = criterion(outputs, targets) loss = sum(loss_dict.values()) scaler.scale(loss).backward() - + if max_norm > 0: scaler.unscale_(optimizer) torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) @@ -57,17 +65,20 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, else: outputs = model(samples, targets) loss_dict = criterion(outputs, targets) - + loss = sum(loss_dict.values()) + for loss_logger in loss_loggers: + loss_logger(loss_dict, epoch) + optimizer.zero_grad() loss.backward() - + if max_norm > 0: torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm) optimizer.step() - - # ema + + # ema if ema is not None: ema.update(model) @@ -88,15 +99,22 @@ def train_one_epoch(model: torch.nn.Module, criterion: torch.nn.Module, return {k: meter.global_avg for k, meter in metric_logger.meters.items()} - @torch.no_grad() -def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors, data_loader, base_ds, device, output_dir): +def evaluate( + model: torch.nn.Module, + criterion: torch.nn.Module, + postprocessors, + data_loader, + base_ds, + device, + output_dir, +): model.eval() criterion.eval() metric_logger = MetricLogger(delimiter=" ") # metric_logger.add_meter('class_error', SmoothedValue(window_size=1, fmt='{value:.2f}')) - header = 'Test:' + header = 
"Test:" # iou_types = tuple(k for k in ('segm', 'bbox') if k in postprocessors.keys()) iou_types = postprocessors.iou_types @@ -133,7 +151,7 @@ def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors, # **loss_dict_reduced_unscaled) # metric_logger.update(class_error=loss_dict_reduced['class_error']) - orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) + orig_target_sizes = torch.stack([t["orig_size"] for t in targets], dim=0) results = postprocessors(outputs, orig_target_sizes) # results = postprocessors(outputs, targets) @@ -141,7 +159,10 @@ def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors, # target_sizes = torch.stack([t["size"] for t in targets], dim=0) # results = postprocessors['segm'](results, outputs, orig_target_sizes, target_sizes) - res = {target['image_id'].item(): output for target, output in zip(targets, results)} + res = { + target["image_id"].item(): output + for target, output in zip(targets, results) + } if coco_evaluator is not None: coco_evaluator.update(res) @@ -170,21 +191,18 @@ def evaluate(model: torch.nn.Module, criterion: torch.nn.Module, postprocessors, # panoptic_res = None # if panoptic_evaluator is not None: # panoptic_res = panoptic_evaluator.summarize() - + stats = {} # stats = {k: meter.global_avg for k, meter in metric_logger.meters.items()} if coco_evaluator is not None: - if 'bbox' in iou_types: - stats['coco_eval_bbox'] = coco_evaluator.coco_eval['bbox'].stats.tolist() - if 'segm' in iou_types: - stats['coco_eval_masks'] = coco_evaluator.coco_eval['segm'].stats.tolist() - + if "bbox" in iou_types: + stats["coco_eval_bbox"] = coco_evaluator.coco_eval["bbox"].stats.tolist() + if "segm" in iou_types: + stats["coco_eval_masks"] = coco_evaluator.coco_eval["segm"].stats.tolist() + # if panoptic_res is not None: # stats['PQ_all'] = panoptic_res["All"] # stats['PQ_th'] = panoptic_res["Things"] # stats['PQ_st'] = panoptic_res["Stuff"] return stats, coco_evaluator - - - diff --git a/rtdetr_pytorch/src/solver/det_solver.py b/rtdetr_pytorch/src/solver/det_solver.py index e770022b..fe84ace4 100644 --- a/rtdetr_pytorch/src/solver/det_solver.py +++ b/rtdetr_pytorch/src/solver/det_solver.py @@ -49,7 +49,10 @@ def fit( print_freq=args.log_step, ema=self.ema, scaler=self.scaler, + loss_loggers=self.loss_loggers, ) + for metric_logger in self.metric_loggers: + metric_logger(train_stats, epoch) self.lr_scheduler.step() diff --git a/rtdetr_pytorch/src/solver/solver.py b/rtdetr_pytorch/src/solver/solver.py index 55452f28..8c796af6 100644 --- a/rtdetr_pytorch/src/solver/solver.py +++ b/rtdetr_pytorch/src/solver/solver.py @@ -1,162 +1,166 @@ """by lyuwenyu """ -import torch -import torch.nn as nn - from datetime import datetime -from pathlib import Path +from pathlib import Path from typing import Dict -from src.misc import dist +import torch +import torch.nn as nn from src.core import BaseConfig +from src.misc import dist class BaseSolver(object): def __init__(self, cfg: BaseConfig) -> None: - - self.cfg = cfg + self.cfg = cfg - def setup(self, ): - '''Avoid instantiating unnecessary classes - ''' + def setup( + self, + ): + """Avoid instantiating unnecessary classes""" cfg = self.cfg device = cfg.device self.device = device self.last_epoch = cfg.last_epoch - self.model = dist.warp_model(cfg.model.to(device), cfg.find_unused_parameters, cfg.sync_bn) + self.model = dist.warp_model( + cfg.model.to(device), cfg.find_unused_parameters, cfg.sync_bn + ) self.criterion = cfg.criterion.to(device) 
self.postprocessor = cfg.postprocessor + self.loss_loggers = cfg.loss_loggers + self.metric_loggers = cfg.metrics_loggers # NOTE (lvwenyu): should load_tuning_state before ema instance building if self.cfg.tuning: - print(f'Tuning checkpoint from {self.cfg.tuning}') + print(f"Tuning checkpoint from {self.cfg.tuning}") self.load_tuning_state(self.cfg.tuning) self.scaler = cfg.scaler - self.ema = cfg.ema.to(device) if cfg.ema is not None else None + self.ema = cfg.ema.to(device) if cfg.ema is not None else None self.output_dir = Path(cfg.output_dir) self.output_dir.mkdir(parents=True, exist_ok=True) - - def train(self, ): + def train( + self, + ): self.setup() self.optimizer = self.cfg.optimizer self.lr_scheduler = self.cfg.lr_scheduler # NOTE instantiating order if self.cfg.resume: - print(f'Resume checkpoint from {self.cfg.resume}') + print(f"Resume checkpoint from {self.cfg.resume}") self.resume(self.cfg.resume) - self.train_dataloader = dist.warp_loader(self.cfg.train_dataloader, \ - shuffle=self.cfg.train_dataloader.shuffle) - self.val_dataloader = dist.warp_loader(self.cfg.val_dataloader, \ - shuffle=self.cfg.val_dataloader.shuffle) - + self.train_dataloader = dist.warp_loader( + self.cfg.train_dataloader, shuffle=self.cfg.train_dataloader.shuffle + ) + self.val_dataloader = dist.warp_loader( + self.cfg.val_dataloader, shuffle=self.cfg.val_dataloader.shuffle + ) - def eval(self, ): + def eval( + self, + ): self.setup() - self.val_dataloader = dist.warp_loader(self.cfg.val_dataloader, \ - shuffle=self.cfg.val_dataloader.shuffle) + self.val_dataloader = dist.warp_loader( + self.cfg.val_dataloader, shuffle=self.cfg.val_dataloader.shuffle + ) if self.cfg.resume: - print(f'resume from {self.cfg.resume}') + print(f"resume from {self.cfg.resume}") self.resume(self.cfg.resume) - def state_dict(self, last_epoch): - '''state dict - ''' + """state dict""" state = {} - state['model'] = dist.de_parallel(self.model).state_dict() - state['date'] = datetime.now().isoformat() + state["model"] = dist.de_parallel(self.model).state_dict() + state["date"] = datetime.now().isoformat() # TODO - state['last_epoch'] = last_epoch + state["last_epoch"] = last_epoch if self.optimizer is not None: - state['optimizer'] = self.optimizer.state_dict() + state["optimizer"] = self.optimizer.state_dict() if self.lr_scheduler is not None: - state['lr_scheduler'] = self.lr_scheduler.state_dict() + state["lr_scheduler"] = self.lr_scheduler.state_dict() # state['last_epoch'] = self.lr_scheduler.last_epoch if self.ema is not None: - state['ema'] = self.ema.state_dict() + state["ema"] = self.ema.state_dict() if self.scaler is not None: - state['scaler'] = self.scaler.state_dict() + state["scaler"] = self.scaler.state_dict() return state - def load_state_dict(self, state): - '''load state dict - ''' + """load state dict""" # TODO - if getattr(self, 'last_epoch', None) and 'last_epoch' in state: - self.last_epoch = state['last_epoch'] - print('Loading last_epoch') + if getattr(self, "last_epoch", None) and "last_epoch" in state: + self.last_epoch = state["last_epoch"] + print("Loading last_epoch") - if getattr(self, 'model', None) and 'model' in state: + if getattr(self, "model", None) and "model" in state: if dist.is_parallel(self.model): - self.model.module.load_state_dict(state['model']) + self.model.module.load_state_dict(state["model"]) else: - self.model.load_state_dict(state['model']) - print('Loading model.state_dict') - - if getattr(self, 'ema', None) and 'ema' in state: - self.ema.load_state_dict(state['ema']) - 
print('Loading ema.state_dict') + self.model.load_state_dict(state["model"]) + print("Loading model.state_dict") - if getattr(self, 'optimizer', None) and 'optimizer' in state: - self.optimizer.load_state_dict(state['optimizer']) - print('Loading optimizer.state_dict') + if getattr(self, "ema", None) and "ema" in state: + self.ema.load_state_dict(state["ema"]) + print("Loading ema.state_dict") - if getattr(self, 'lr_scheduler', None) and 'lr_scheduler' in state: - self.lr_scheduler.load_state_dict(state['lr_scheduler']) - print('Loading lr_scheduler.state_dict') + if getattr(self, "optimizer", None) and "optimizer" in state: + self.optimizer.load_state_dict(state["optimizer"]) + print("Loading optimizer.state_dict") - if getattr(self, 'scaler', None) and 'scaler' in state: - self.scaler.load_state_dict(state['scaler']) - print('Loading scaler.state_dict') + if getattr(self, "lr_scheduler", None) and "lr_scheduler" in state: + self.lr_scheduler.load_state_dict(state["lr_scheduler"]) + print("Loading lr_scheduler.state_dict") + if getattr(self, "scaler", None) and "scaler" in state: + self.scaler.load_state_dict(state["scaler"]) + print("Loading scaler.state_dict") def save(self, path): - '''save state - ''' + """save state""" state = self.state_dict() dist.save_on_master(state, path) - def resume(self, path): - '''load resume - ''' + """load resume""" # for cuda:0 memory - state = torch.load(path, map_location='cpu') + state = torch.load(path, map_location="cpu") self.load_state_dict(state) - def load_tuning_state(self, path,): - """only load model for tuning and skip missed/dismatched keys - """ - if 'http' in path: - state = torch.hub.load_state_dict_from_url(path, map_location='cpu') + def load_tuning_state( + self, + path, + ): + """only load model for tuning and skip missed/dismatched keys""" + if "http" in path: + state = torch.hub.load_state_dict_from_url(path, map_location="cpu") else: - state = torch.load(path, map_location='cpu') + state = torch.load(path, map_location="cpu") module = dist.de_parallel(self.model) - + # TODO hard code - if 'ema' in state: - stat, infos = self._matched_state(module.state_dict(), state['ema']['module']) + if "ema" in state: + stat, infos = self._matched_state( + module.state_dict(), state["ema"]["module"] + ) else: - stat, infos = self._matched_state(module.state_dict(), state['model']) + stat, infos = self._matched_state(module.state_dict(), state["model"]) module.load_state_dict(stat, strict=False) - print(f'Load model.state_dict, {infos}') + print(f"Load model.state_dict, {infos}") @staticmethod def _matched_state(state: Dict[str, torch.Tensor], params: Dict[str, torch.Tensor]): @@ -172,11 +176,14 @@ def _matched_state(state: Dict[str, torch.Tensor], params: Dict[str, torch.Tenso else: missed_list.append(k) - return matched_state, {'missed': missed_list, 'unmatched': unmatched_list} - + return matched_state, {"missed": missed_list, "unmatched": unmatched_list} - def fit(self, ): - raise NotImplementedError('') + def fit( + self, + ): + raise NotImplementedError("") - def val(self, ): - raise NotImplementedError('') + def val( + self, + ): + raise NotImplementedError("") From 0826693345147572e5bf75221c743f6b67d00fb9 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 25 Jan 2024 09:47:57 -0500 Subject: [PATCH 14/35] Remove debugging print statement --- rtdetr_pytorch/src/export/pkl.py | 1 - 1 file changed, 1 deletion(-) diff --git a/rtdetr_pytorch/src/export/pkl.py b/rtdetr_pytorch/src/export/pkl.py index f0710699..805e2cde 100644 --- 
a/rtdetr_pytorch/src/export/pkl.py +++ b/rtdetr_pytorch/src/export/pkl.py @@ -14,7 +14,6 @@ def __init__(self, cfg) -> None: def forward(self, images, orig_target_sizes): outputs = self.model(images) - print("POSTPROCESSING OUTPUT") return self.postprocessor(outputs, orig_target_sizes) From 9cbdd32ab1296bbffaa3c0033c9419436541bf86 Mon Sep 17 00:00:00 2001 From: Aaron Date: Mon, 5 Feb 2024 16:07:49 -0500 Subject: [PATCH 15/35] remove inference dependency on torchvision 0.15 --- rtdetr_pytorch/src/__init__.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/rtdetr_pytorch/src/__init__.py b/rtdetr_pytorch/src/__init__.py index 6cb1033d..39b9f438 100644 --- a/rtdetr_pytorch/src/__init__.py +++ b/rtdetr_pytorch/src/__init__.py @@ -1,5 +1 @@ - -from . import data -from . import nn -from . import optim -from . import zoo +from . import data, nn From c48fcfeb4e2ed2050d4b57ebd9a7f0af29b19e18 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 23 May 2024 09:53:48 -0400 Subject: [PATCH 16/35] Add angle output to architecture and forwards --- .../src/zoo/rtdetr/rtdetr_decoder.py | 116 +++++++++++++----- 1 file changed, 85 insertions(+), 31 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index f5011617..424861ac 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -245,6 +245,7 @@ def forward(self, attn_mask=None, memory_mask=None): output = tgt + dec_out_angles = [] dec_out_bboxes = [] dec_out_logits = [] ref_points_detach = F.sigmoid(ref_points_unact) @@ -253,18 +254,33 @@ def forward(self, ref_points_input = ref_points_detach.unsqueeze(2) query_pos_embed = query_pos_head(ref_points_detach) - output = layer(output, ref_points_input, memory, - memory_spatial_shapes, memory_level_start_index, - attn_mask, memory_mask, query_pos_embed) + output = layer( + output, + ref_points_input, + memory, + memory_spatial_shapes, + memory_level_start_index, + attn_mask, + memory_mask, + query_pos_embed, + ) - inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach)) + bbox_output = bbox_head[i](output) + angle_output = bbox_output[:, :, 4] + bbox_output = bbox_output[:, :, :4] + inter_ref_bbox = F.sigmoid(bbox_output + inverse_sigmoid(ref_points_detach)) + dec_out_angles.append(angle_output) if self.training: dec_out_logits.append(score_head[i](output)) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: - dec_out_bboxes.append(F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))) + bbox_output = bbox_head[i](output) + bbox_output = bbox_output[:, :, :4] + dec_out_bboxes.append( + F.sigmoid(bbox_output + inverse_sigmoid(ref_points)) + ) elif i == self.eval_idx: dec_out_logits.append(score_head[i](output)) @@ -272,10 +288,16 @@ def forward(self, break ref_points = inter_ref_bbox - ref_points_detach = inter_ref_bbox.detach( - ) if self.training else inter_ref_bbox + ref_points_detach = ( + inter_ref_bbox.detach() if self.training else inter_ref_bbox + ) - return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits) + return ( + torch.stack(dec_out_bboxes), + torch.stack(dec_out_logits), + torch.stack(dec_out_angles), + output, + ) @register @@ -358,7 +380,7 @@ def __init__(self, for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.ModuleList([ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) + MLP(hidden_dim, hidden_dim, 5, num_layers=3) for _ in range(num_decoder_layers) ]) @@ -519,25 +541,42 @@ def 
forward(self, feats, targets=None): # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) - + # prepare denoising training if self.training and self.num_denoising > 0: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ - get_contrastive_denoising_training_group(targets, \ - self.num_classes, - self.num_queries, - self.denoising_class_embed, - num_denoising=self.num_denoising, - label_noise_ratio=self.label_noise_ratio, - box_noise_scale=self.box_noise_scale, ) + ( + denoising_class, + denoising_bbox_unact, + attn_mask, + dn_meta, + ) = get_contrastive_denoising_training_group( + targets, + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=self.box_noise_scale, + ) else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = ( + None, + None, + None, + None, + ) - target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ - self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact) + ( + target, + init_ref_points_unact, + enc_topk_bboxes, + enc_topk_logits, + ) = self._get_decoder_input( + memory, spatial_shapes, denoising_class, denoising_bbox_unact + ) # decoder - out_bboxes, out_logits = self.decoder( + out_bboxes, out_logits, out_angles, out_features = self.decoder( target, init_ref_points_unact, memory, @@ -546,21 +585,36 @@ def forward(self, feats, targets=None): self.dec_bbox_head, self.dec_score_head, self.query_pos_head, - attn_mask=attn_mask) + attn_mask=attn_mask, + ) if self.training and dn_meta is not None: - dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2) - dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2) + dn_out_bboxes, out_bboxes = torch.split( + out_bboxes, dn_meta["dn_num_split"], dim=2 + ) + dn_out_logits, out_logits = torch.split( + out_logits, dn_meta["dn_num_split"], dim=2 + ) + dn_out_angles, out_angles = torch.split( + out_angles, dn_meta["dn_num_split"], dim=2 + ) - out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} + out = { + "pred_logits": out_logits[-1], + "pred_boxes": out_bboxes[-1], + "pred_angles": out_angles[-1], + "features": out_features, + } if self.training and self.aux_loss: - out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) - out['aux_outputs'].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes])) - + out["aux_outputs"] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) + out["aux_outputs"].extend( + self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes]) + ) + if self.training and dn_meta is not None: - out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) - out['dn_meta'] = dn_meta + out["dn_aux_outputs"] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) + out["dn_meta"] = dn_meta return out From f41064ddf8fb6901e64b039d38d8e9d170f364b1 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 23 May 2024 10:22:57 -0400 Subject: [PATCH 17/35] Revert "Add angle output to architecture and forwards" This reverts commit c48fcfeb4e2ed2050d4b57ebd9a7f0af29b19e18. 
--- .../src/zoo/rtdetr/rtdetr_decoder.py | 116 +++++------------- 1 file changed, 31 insertions(+), 85 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index 424861ac..f5011617 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -245,7 +245,6 @@ def forward(self, attn_mask=None, memory_mask=None): output = tgt - dec_out_angles = [] dec_out_bboxes = [] dec_out_logits = [] ref_points_detach = F.sigmoid(ref_points_unact) @@ -254,33 +253,18 @@ def forward(self, ref_points_input = ref_points_detach.unsqueeze(2) query_pos_embed = query_pos_head(ref_points_detach) - output = layer( - output, - ref_points_input, - memory, - memory_spatial_shapes, - memory_level_start_index, - attn_mask, - memory_mask, - query_pos_embed, - ) + output = layer(output, ref_points_input, memory, + memory_spatial_shapes, memory_level_start_index, + attn_mask, memory_mask, query_pos_embed) - bbox_output = bbox_head[i](output) - angle_output = bbox_output[:, :, 4] - bbox_output = bbox_output[:, :, :4] - inter_ref_bbox = F.sigmoid(bbox_output + inverse_sigmoid(ref_points_detach)) + inter_ref_bbox = F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points_detach)) - dec_out_angles.append(angle_output) if self.training: dec_out_logits.append(score_head[i](output)) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: - bbox_output = bbox_head[i](output) - bbox_output = bbox_output[:, :, :4] - dec_out_bboxes.append( - F.sigmoid(bbox_output + inverse_sigmoid(ref_points)) - ) + dec_out_bboxes.append(F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points))) elif i == self.eval_idx: dec_out_logits.append(score_head[i](output)) @@ -288,16 +272,10 @@ def forward(self, break ref_points = inter_ref_bbox - ref_points_detach = ( - inter_ref_bbox.detach() if self.training else inter_ref_bbox - ) + ref_points_detach = inter_ref_bbox.detach( + ) if self.training else inter_ref_bbox - return ( - torch.stack(dec_out_bboxes), - torch.stack(dec_out_logits), - torch.stack(dec_out_angles), - output, - ) + return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits) @register @@ -380,7 +358,7 @@ def __init__(self, for _ in range(num_decoder_layers) ]) self.dec_bbox_head = nn.ModuleList([ - MLP(hidden_dim, hidden_dim, 5, num_layers=3) + MLP(hidden_dim, hidden_dim, 4, num_layers=3) for _ in range(num_decoder_layers) ]) @@ -541,42 +519,25 @@ def forward(self, feats, targets=None): # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) - + # prepare denoising training if self.training and self.num_denoising > 0: - ( - denoising_class, - denoising_bbox_unact, - attn_mask, - dn_meta, - ) = get_contrastive_denoising_training_group( - targets, - self.num_classes, - self.num_queries, - self.denoising_class_embed, - num_denoising=self.num_denoising, - label_noise_ratio=self.label_noise_ratio, - box_noise_scale=self.box_noise_scale, - ) + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = \ + get_contrastive_denoising_training_group(targets, \ + self.num_classes, + self.num_queries, + self.denoising_class_embed, + num_denoising=self.num_denoising, + label_noise_ratio=self.label_noise_ratio, + box_noise_scale=self.box_noise_scale, ) else: - denoising_class, denoising_bbox_unact, attn_mask, dn_meta = ( - None, - None, - None, - None, - ) + denoising_class, denoising_bbox_unact, attn_mask, dn_meta = None, None, None, None - ( - target, - 
init_ref_points_unact, - enc_topk_bboxes, - enc_topk_logits, - ) = self._get_decoder_input( - memory, spatial_shapes, denoising_class, denoising_bbox_unact - ) + target, init_ref_points_unact, enc_topk_bboxes, enc_topk_logits = \ + self._get_decoder_input(memory, spatial_shapes, denoising_class, denoising_bbox_unact) # decoder - out_bboxes, out_logits, out_angles, out_features = self.decoder( + out_bboxes, out_logits = self.decoder( target, init_ref_points_unact, memory, @@ -585,36 +546,21 @@ def forward(self, feats, targets=None): self.dec_bbox_head, self.dec_score_head, self.query_pos_head, - attn_mask=attn_mask, - ) + attn_mask=attn_mask) if self.training and dn_meta is not None: - dn_out_bboxes, out_bboxes = torch.split( - out_bboxes, dn_meta["dn_num_split"], dim=2 - ) - dn_out_logits, out_logits = torch.split( - out_logits, dn_meta["dn_num_split"], dim=2 - ) - dn_out_angles, out_angles = torch.split( - out_angles, dn_meta["dn_num_split"], dim=2 - ) + dn_out_bboxes, out_bboxes = torch.split(out_bboxes, dn_meta['dn_num_split'], dim=2) + dn_out_logits, out_logits = torch.split(out_logits, dn_meta['dn_num_split'], dim=2) - out = { - "pred_logits": out_logits[-1], - "pred_boxes": out_bboxes[-1], - "pred_angles": out_angles[-1], - "features": out_features, - } + out = {'pred_logits': out_logits[-1], 'pred_boxes': out_bboxes[-1]} if self.training and self.aux_loss: - out["aux_outputs"] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) - out["aux_outputs"].extend( - self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes]) - ) - + out['aux_outputs'] = self._set_aux_loss(out_logits[:-1], out_bboxes[:-1]) + out['aux_outputs'].extend(self._set_aux_loss([enc_topk_logits], [enc_topk_bboxes])) + if self.training and dn_meta is not None: - out["dn_aux_outputs"] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) - out["dn_meta"] = dn_meta + out['dn_aux_outputs'] = self._set_aux_loss(dn_out_logits, dn_out_bboxes) + out['dn_meta'] = dn_meta return out From eadc81a123bc06e20cfbae998c69298221da765b Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 23 May 2024 10:25:04 -0400 Subject: [PATCH 18/35] Revert "Merge pull request #1 from lyuwenyu/main" This reverts commit c6ecb943a8f3e01d31d3e7d427a39778b5dab4e6, reversing changes made to 2b88d5d53bcbfbb70329bc9c007fdf7e76cf90dc. 
--- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py index 7d70113a..344d69ac 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py @@ -47,12 +47,11 @@ def forward(self, outputs, orig_target_sizes): else: scores = F.softmax(logits)[:, :, :-1] scores, labels = scores.max(dim=-1) - boxes = bbox_pred if scores.shape[1] > self.num_top_queries: scores, index = torch.topk(scores, self.num_top_queries, dim=-1) labels = torch.gather(labels, dim=1, index=index) boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1])) - + # TODO for onnx export if self.deploy_mode: return labels, boxes, scores From 844893ffa4ccd67d7869d860cb1e2fe7b6b4604a Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 23 May 2024 09:53:48 -0400 Subject: [PATCH 19/35] Add angle output to architecture and forwards --- .../src/zoo/rtdetr/rtdetr_decoder.py | 29 ++++++++++++++----- 1 file changed, 21 insertions(+), 8 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index c3ab4e89..7cf3d040 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -282,6 +282,7 @@ def forward( memory_mask=None, ): output = tgt + dec_out_angles = [] dec_out_bboxes = [] dec_out_logits = [] ref_points_detach = F.sigmoid(ref_points_unact) @@ -301,17 +302,21 @@ def forward( query_pos_embed, ) - inter_ref_bbox = F.sigmoid( - bbox_head[i](output) + inverse_sigmoid(ref_points_detach) - ) + bbox_output = bbox_head[i](output) + angle_output = bbox_output[:, :, 4] + bbox_output = bbox_output[:, :, :4] + inter_ref_bbox = F.sigmoid(bbox_output + inverse_sigmoid(ref_points_detach)) + dec_out_angles.append(angle_output) if self.training: dec_out_logits.append(score_head[i](output)) if i == 0: dec_out_bboxes.append(inter_ref_bbox) else: + bbox_output = bbox_head[i](output) + bbox_output = bbox_output[:, :, :4] dec_out_bboxes.append( - F.sigmoid(bbox_head[i](output) + inverse_sigmoid(ref_points)) + F.sigmoid(bbox_output + inverse_sigmoid(ref_points)) ) elif i == self.eval_idx: @@ -324,9 +329,12 @@ def forward( inter_ref_bbox.detach() if self.training else inter_ref_bbox ) - # bbox predictions, classification logits, features - - return torch.stack(dec_out_bboxes), torch.stack(dec_out_logits), output + return ( + torch.stack(dec_out_bboxes), + torch.stack(dec_out_logits), + torch.stack(dec_out_angles), + output, + ) @register @@ -629,6 +637,7 @@ def _get_decoder_input( return target, reference_points_unact.detach(), enc_topk_bboxes, enc_topk_logits def forward(self, feats, targets=None): + # input projection and embedding (memory, spatial_shapes, level_start_index) = self._get_encoder_input(feats) @@ -666,7 +675,7 @@ def forward(self, feats, targets=None): ) # decoder - out_bboxes, out_logits, out_features = self.decoder( + out_bboxes, out_logits, out_angles, out_features = self.decoder( target, init_ref_points_unact, memory, @@ -685,10 +694,14 @@ def forward(self, feats, targets=None): dn_out_logits, out_logits = torch.split( out_logits, dn_meta["dn_num_split"], dim=2 ) + dn_out_angles, out_angles = torch.split( + out_angles, dn_meta["dn_num_split"], dim=2 + ) out = { "pred_logits": out_logits[-1], "pred_boxes": out_bboxes[-1], + "pred_angles": 
out_angles[-1], "features": out_features, } From 3c909283e0ff0165406f4b8ae54b4e3b1e1e2f4a Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 23 May 2024 10:38:50 -0400 Subject: [PATCH 20/35] Cherry-pick c6ecb943a8f3e01d31d3e7d427a39778b5dab4e6 --- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py | 1 + 1 file changed, 1 insertion(+) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py index 22e3f2bd..3de3b614 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py @@ -61,6 +61,7 @@ def forward(self, outputs, orig_target_sizes): else: scores = F.softmax(logits)[:, :, :-1] scores, labels = scores.max(dim=-1) + boxes = bbox_pred if scores.shape[1] > self.num_top_queries: scores, index = torch.topk(scores, self.num_top_queries, dim=-1) labels = torch.gather(labels, dim=1, index=index) From e4ce1bc51fb09a3307de039f897c235394881a7e Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 23 May 2024 12:30:06 -0400 Subject: [PATCH 21/35] Add angle output to bbox decoder head --- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index 7cf3d040..b106d2c1 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -435,7 +435,7 @@ def __init__( ) self.dec_bbox_head = nn.ModuleList( [ - MLP(hidden_dim, hidden_dim, 4, num_layers=3) + MLP(hidden_dim, hidden_dim, 5, num_layers=3) for _ in range(num_decoder_layers) ] ) From 4cc933dffeae29530936c55070cd06d42a886bcb Mon Sep 17 00:00:00 2001 From: Aaron Date: Fri, 24 May 2024 10:40:10 -0400 Subject: [PATCH 22/35] postprocessor returns angles --- .../src/zoo/rtdetr/rtdetr_postprocessor.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py index 3de3b614..8b1d43df 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py @@ -38,9 +38,10 @@ def extra_repr(self) -> str: # def forward(self, outputs, orig_target_sizes): def forward(self, outputs, orig_target_sizes): - logits, boxes, features = ( + logits, boxes, angles, features = ( outputs["pred_logits"], outputs["pred_boxes"], + outputs["pred_angles"], outputs["features"], ) @@ -57,6 +58,8 @@ def forward(self, outputs, orig_target_sizes): boxes = bbox_pred.gather( dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]) ) + angles = angles.gather(dim=1, index=index) + else: scores = F.softmax(logits)[:, :, :-1] @@ -68,10 +71,11 @@ def forward(self, outputs, orig_target_sizes): boxes = torch.gather( boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]) ) + angles = torch.gather(angles, dim=1, index=index) # TODO for onnx export if self.deploy_mode: - return labels, boxes, scores, features + return labels, boxes, angles, scores, features # TODO if self.remap_mscoco_category: @@ -87,8 +91,8 @@ def forward(self, outputs, orig_target_sizes): results = [] # features untested when self.deploy_mode==False - for lab, box, sco, feat in zip(labels, boxes, scores, features): # , features): - result = dict(labels=lab, boxes=box, scores=sco, features=feat) + for lab, box, angle, sco, feat in zip(labels, boxes, angles, scores, features): # , features): 
+ result = dict(labels=lab, boxes=box, angles=angle, scores=sco, features=feat) results.append(result) return results From f32d4fa2fa6d7c26226c55327b38e7c6e052991c Mon Sep 17 00:00:00 2001 From: Aaron Date: Tue, 28 May 2024 08:22:27 -0400 Subject: [PATCH 23/35] add empty setup file --- rtdetr_pytorch/setup.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) create mode 100644 rtdetr_pytorch/setup.py diff --git a/rtdetr_pytorch/setup.py b/rtdetr_pytorch/setup.py new file mode 100644 index 00000000..b7c9b34e --- /dev/null +++ b/rtdetr_pytorch/setup.py @@ -0,0 +1,19 @@ +from setuptools import setup, find_packages + +setup( + name='RT-ODETR', + version='0.1.0', + author='Your Name', + author_email='your.email@example.com', + description='A short description of the package', + long_description=open('README.md').read(), + long_description_content_type='text/markdown', + url='https://example.com/your-package', + packages=find_packages(), + classifiers=[ + 'Programming Language :: Python :: 3', + 'License :: OSI Approved :: MIT License', + 'Operating System :: OS Independent', + ], + python_requires='>=3.6', +) From 0914d3ea5402d3488cd097c52e18d2e53e8338f8 Mon Sep 17 00:00:00 2001 From: Aaron Date: Tue, 28 May 2024 10:45:46 -0400 Subject: [PATCH 24/35] rename package to rtrdetr --- rtdetr_pytorch/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtdetr_pytorch/setup.py b/rtdetr_pytorch/setup.py index b7c9b34e..1d80e4f5 100644 --- a/rtdetr_pytorch/setup.py +++ b/rtdetr_pytorch/setup.py @@ -1,7 +1,7 @@ from setuptools import setup, find_packages setup( - name='RT-ODETR', + name='rtrdetr', version='0.1.0', author='Your Name', author_email='your.email@example.com', From d426fe9f7186582e645a8d383cfabba14373d91f Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 30 May 2024 18:47:54 -0400 Subject: [PATCH 25/35] add angle losses to criterion --- .../configs/rtdetr/include/rtdetr_r50vd.yml | 4 +-- .../src/zoo/rtdetr/rtdetr_criterion.py | 29 ++++++++++++++++++- 2 files changed, 30 insertions(+), 3 deletions(-) diff --git a/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml index 7f2e1f3e..21295789 100644 --- a/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml +++ b/rtdetr_pytorch/configs/rtdetr/include/rtdetr_r50vd.yml @@ -65,8 +65,8 @@ RTDETRPostProcessor: SetCriterion: - weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2,} - losses: ['vfl', 'boxes', ] + weight_dict: {loss_vfl: 1, loss_bbox: 5, loss_giou: 2, loss_angles: 3} + losses: ['vfl', 'boxes', 'angles'] alpha: 0.75 gamma: 2.0 diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py index 3ce77c0f..9858bb12 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_criterion.py @@ -135,6 +135,33 @@ def loss_labels_vfl(self, outputs, targets, indices, num_boxes, log=True): loss = loss.mean(1).sum() * src_logits.shape[1] / num_boxes return {'loss_vfl': loss} + def loss_angles(self, outputs, targets, indices, num_boxes, **kwargs): + if "pred_angles" not in outputs or num_boxes == 0: + return {"loss_angles": 0} + src_idx = self._get_src_permutation_idx(indices) + src_angles = outputs["pred_angles"][src_idx] + target_angles = torch.cat( + [t["angles"][i] for t, (_, i) in zip(targets, indices)], dim=0 + ) + + # Create a mask where the target angle is not zero + mask = (target_angles != 0).float() + + # Calculate the loss + loss = nn.functional.mse_loss( 
+ src_angles * mask, target_angles * mask, reduction="sum" + ) + + # Normalize the loss by the number of non-zero elements to avoid skewing the loss value + num_nonzero = mask.sum() + print("num_nonzero", num_nonzero) + + if num_nonzero > 0: + loss = loss / num_nonzero + + return {"loss_angles": loss} + + @torch.no_grad() def loss_cardinality(self, outputs, targets, indices, num_boxes): """ Compute the cardinality error, ie the absolute error in the number of predicted non-empty boxes @@ -217,7 +244,7 @@ def get_loss(self, loss, outputs, targets, indices, num_boxes, **kwargs): 'cardinality': self.loss_cardinality, 'boxes': self.loss_boxes, 'masks': self.loss_masks, - + 'angles': self.loss_angles, 'bce': self.loss_labels_bce, 'focal': self.loss_labels_focal, 'vfl': self.loss_labels_vfl, From a397f22579fa34b457d9efda21173ddaa8bce2b5 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 30 May 2024 18:48:16 -0400 Subject: [PATCH 26/35] fix error when loggers are None --- rtdetr_pytorch/src/solver/solver.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtdetr_pytorch/src/solver/solver.py b/rtdetr_pytorch/src/solver/solver.py index 8c796af6..ed0da97d 100644 --- a/rtdetr_pytorch/src/solver/solver.py +++ b/rtdetr_pytorch/src/solver/solver.py @@ -30,7 +30,7 @@ def setup( self.criterion = cfg.criterion.to(device) self.postprocessor = cfg.postprocessor self.loss_loggers = cfg.loss_loggers - self.metric_loggers = cfg.metrics_loggers + self.metric_loggers = cfg.metrics_loggers or list() # NOTE (lvwenyu): should load_tuning_state before ema instance building if self.cfg.tuning: From 8ca4164fc83e651f4e006cf50001f3db8c205939 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 30 May 2024 18:49:47 -0400 Subject: [PATCH 27/35] add HRSC2016 as a Coco rotated dataset --- rtdetr_pytorch/src/data/HRSC2016/__init__.py | 104 +++++++++++++ rtdetr_pytorch/src/data/__init__.py | 1 + .../src/data/coco/coco_rotated_dataset.py | 146 ++++++++++++++++++ 3 files changed, 251 insertions(+) create mode 100644 rtdetr_pytorch/src/data/HRSC2016/__init__.py create mode 100644 rtdetr_pytorch/src/data/coco/coco_rotated_dataset.py diff --git a/rtdetr_pytorch/src/data/HRSC2016/__init__.py b/rtdetr_pytorch/src/data/HRSC2016/__init__.py new file mode 100644 index 00000000..df932b53 --- /dev/null +++ b/rtdetr_pytorch/src/data/HRSC2016/__init__.py @@ -0,0 +1,104 @@ +import math +import xml.etree.ElementTree as ET +from pathlib import Path +from typing import List + +from src.core import register +from src.data.coco.coco_rotated_dataset import RotatedCocoDataset, BoundingBox + +print("Registering HRSC2016Dataset") +@register +class HRSC2016Dataset(RotatedCocoDataset): + __inject__ = ["transforms"] + __share__ = ["remap_mscoco_category"] + + def __init__( + self, + img_folder, + ann_folder, + transforms, + return_masks, + remap_mscoco_category=False, + ): + img_dir = Path(img_folder).resolve() + ann_dir = Path(ann_folder).resolve() + assert img_dir.is_dir(), f"No directory found at {img_dir}" + assert ann_dir.is_dir(), f"No directory found at {ann_dir}" + image_files: List[Path] = list(Path(img_folder).iterdir()) + labels: List[List[BoundingBox]] = [] + for file in Path(ann_folder).iterdir(): + labels.append(self.parse_bboxes(file)) + + super(HRSC2016Dataset, self).__init__( + image_files=image_files, + labels=labels, + ) + + def parse_bboxes(self, file: Path): + hrsc_dict = self.parse_hrsc2016_xml(file.read_text()) + bboxes = [] + for object in hrsc_dict['objects']: + bboxes.append( + BoundingBox( + category="object", + 
x1=object['robndbox']['cx'] - object['robndbox']['w'] / 2, + x2=object['robndbox']['cx'] + object['robndbox']['w'] / 2, + y1=object['robndbox']['cy'] - object['robndbox']['h'] / 2, + y2=object['robndbox']['cy'] + object['robndbox']['h'] / 2, + rotation=object['robndbox']['angle'] * 180 / math.pi + ) + ) + return bboxes + + + + def parse_hrsc2016_xml(self, xml_string): + root = ET.fromstring(xml_string) + + annotation_dict = {} + annotation_dict['verified'] = root.attrib.get('verified', 'no') + annotation_dict['folder'] = root.find('folder').text + annotation_dict['filename'] = root.find('filename').text + annotation_dict['path'] = root.find('path').text + + source = root.find('source') + annotation_dict['source'] = { + 'database': source.find('database').text + } + + size = root.find('size') + annotation_dict['size'] = { + 'width': int(size.find('width').text), + 'height': int(size.find('height').text), + 'depth': int(size.find('depth').text) + } + + annotation_dict['segmented'] = int(root.find('segmented').text) + + objects = [] + for obj in root.findall('object'): + object_dict = { + 'type': obj.find('type').text, + 'name': obj.find('name').text, + 'pose': obj.find('pose').text, + 'truncated': int(obj.find('truncated').text), + 'difficult': int(obj.find('difficult').text), + 'bndbox': { + 'xmin': int(obj.find('bndbox/xmin').text), + 'ymin': int(obj.find('bndbox/ymin').text), + 'xmax': int(obj.find('bndbox/xmax').text), + 'ymax': int(obj.find('bndbox/ymax').text) + }, + 'robndbox': { + 'cx': float(obj.find('robndbox/cx').text), + 'cy': float(obj.find('robndbox/cy').text), + 'w': float(obj.find('robndbox/w').text), + 'h': float(obj.find('robndbox/h').text), + 'angle': float(obj.find('robndbox/angle').text) + } + } + objects.append(object_dict) + + annotation_dict['objects'] = objects + + return annotation_dict \ No newline at end of file diff --git a/rtdetr_pytorch/src/data/__init__.py b/rtdetr_pytorch/src/data/__init__.py index 95715f8a..0ae9b84d 100644 --- a/rtdetr_pytorch/src/data/__init__.py +++ b/rtdetr_pytorch/src/data/__init__.py @@ -1,6 +1,7 @@ from .coco import * from .cifar10 import CIFAR10 +from .HRSC2016 import HRSC2016Dataset from .dataloader import * from .transforms import * diff --git a/rtdetr_pytorch/src/data/coco/coco_rotated_dataset.py b/rtdetr_pytorch/src/data/coco/coco_rotated_dataset.py new file mode 100644 index 00000000..c53e6a3b --- /dev/null +++ b/rtdetr_pytorch/src/data/coco/coco_rotated_dataset.py @@ -0,0 +1,146 @@ +from dataclasses import dataclass +from pathlib import Path +from typing import Dict, List + +import numpy as np +import src +import torch.utils.data +import torchvision +from PIL import Image + +@dataclass +class BoundingBox: + category: str + x1: float + x2: float + y1: float + y2: float + rotation: float = 0 + + @property + def width(self): + return self.x2 - self.x1 + + @property + def height(self): + return self.y2 - self.y1 + + @property + def x_center(self): + return self.x1 + self.width / 2 + + @property + def y_center(self): + return self.y1 + self.height / 2 + + @property + def area(self): + return self.width * self.height + +class RotatedCocoDataset(torch.utils.data.Dataset): + def __init__(self, image_files: List[Path], labels: List[List[BoundingBox]]): + self.image_files = image_files + self.labels = labels + + if len(self.image_files) != len(self.labels): + raise ValueError( + "Mismatch in the number of images and labels. Please check the dataset directories to ensure each image has a corresponding label file." 
+ ) + self.filter_unlabeled_images() + self.create_classname_index() + self.input_size = [640, 640] # Required by rtdetr + self.transforms = torchvision.transforms.Compose( + [ + torchvision.transforms.Resize(self.input_size), + src.data.transforms.ToImageTensor(), + src.data.transforms.ConvertDtype(), + ] + ) + + @property + def num_classes(self) -> int: + return len(self.label_indices) + + def __len__(self): + return len(self.image_files) + + def __getitem__(self, idx): + labels = self.labels[idx] + image = Image.open(self.image_files[idx]).convert("RGB") + image_tensor = self.transforms(image) + + targets_dict: Dict = { + "boxes": [], + "labels": [], + "image_id": [idx], + "area": [], + "iscrowd": [], + "orig_size": image.size, + "size": self.input_size, + "angles": [], + } + + for label in labels: + targets_dict["boxes"].append( + [label.x_center, label.y_center, label.width, label.height] + ) + targets_dict["labels"].append(self.label_indices[label.category]) + targets_dict["area"].append(label.area) + targets_dict["iscrowd"].append(0) + targets_dict["angles"].append(normalize_angle(label.rotation)) + + targets_dict = {k: torch.tensor(np.array(v)) for k, v in targets_dict.items()} + targets_dict["boxes"] = targets_dict["boxes"].float() + targets_dict["area"] = targets_dict["area"].float() + targets_dict["angles"] = targets_dict["angles"].float() + targets_dict["labels"] = targets_dict["labels"].long() + + # ensure every tensor of boxes have the correct shape [n,4], even empty ones [0,4] + if targets_dict["boxes"].shape.__len__() != 2: + targets_dict["boxes"] = targets_dict["boxes"].reshape([-1, 4]) + + # box coordinates are fractions of the image width and height + targets_dict["boxes"][:, 0] /= image.size[0] + targets_dict["boxes"][:, 2] /= image.size[0] + targets_dict["boxes"][:, 1] /= image.size[1] + targets_dict["boxes"][:, 3] /= image.size[1] + targets_dict["area"] /= image.size[0] * image.size[1] + + return image_tensor, targets_dict + + def filter_unlabeled_images(self): + labeled_image_files = [] + nonempty_label_files = [] + + for image, image_annotations in zip(self.image_files, self.labels): + if image_annotations: + labeled_image_files.append(image) + nonempty_label_files.append(image_annotations) + + return labeled_image_files, nonempty_label_files + + def create_classname_index(self): + self.label_indices: Dict[str, int] = {} + for label in self.labels: + for object in label: + if object.category not in self.label_indices: + self.label_indices[object.category] = len(self.label_indices) + + +def normalize_angle(angle): + """ + Normalize an angle to be within -1 (-90 degrees) and 1 (90 degrees). + e.g. [0, 350, 20, 80, 180, 160] -> [ 0 -10 20 80 0 -20] + + Parameters: + angle (float): The angle in degrees [0 to 360). + + Returns: + float: The normalized angle between -1 and 1 degrees. 
+ """ + # Convert angle to the range -90 to 90 degrees + angle = ((angle + 90) % 180) - 90 + + angle /= 90 + + return angle From 63294aa77814fccea901c9c54e8e08466c53d9fb Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 30 May 2024 18:50:16 -0400 Subject: [PATCH 28/35] Fix error when something is registered twice --- rtdetr_pytorch/src/core/yaml_utils.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/rtdetr_pytorch/src/core/yaml_utils.py b/rtdetr_pytorch/src/core/yaml_utils.py index c9ed2590..aab87971 100644 --- a/rtdetr_pytorch/src/core/yaml_utils.py +++ b/rtdetr_pytorch/src/core/yaml_utils.py @@ -19,6 +19,7 @@ def register(cls: type): cls (type): Module class to be registered. ''' if cls.__name__ in GLOBAL_CONFIG: + return raise ValueError('{} already registered'.format(cls.__name__)) if inspect.isfunction(cls): @@ -91,7 +92,8 @@ def create(type_or_name, **kwargs): return create(name) - + if '_pymodule' not in cfg: + raise LookupError(f'The module {name} is not registered. {cfg}') cls = getattr(cfg['_pymodule'], name) argspec = inspect.getfullargspec(cls.__init__) arg_names = [arg for arg in argspec.args if arg != 'self'] From 2c14729c12ea2a975807e4704731e773098d3bd8 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 30 May 2024 18:50:42 -0400 Subject: [PATCH 29/35] Set dataset to HRSC2016 in config --- rtdetr_pytorch/configs/dataset/coco_detection.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/rtdetr_pytorch/configs/dataset/coco_detection.yml b/rtdetr_pytorch/configs/dataset/coco_detection.yml index f71a4ef0..ea4da7c9 100644 --- a/rtdetr_pytorch/configs/dataset/coco_detection.yml +++ b/rtdetr_pytorch/configs/dataset/coco_detection.yml @@ -6,9 +6,9 @@ remap_mscoco_category: True train_dataloader: type: DataLoader dataset: - type: CocoDetection - img_folder: ./dataset/coco/train2017/ - ann_file: ./dataset/coco/annotations/instances_train2017.json + type: HRSC2016Dataset + img_folder: ../data/HRSC2016-MS/AllImages + ann_folder: ../data/HRSC2016-MS/Annotations transforms: type: Compose ops: ~ @@ -21,9 +21,9 @@ train_dataloader: val_dataloader: type: DataLoader dataset: - type: CocoDetection - img_folder: ./dataset/coco/val2017/ - ann_file: ./dataset/coco/annotations/instances_val2017.json + type: HRSC2016Dataset + img_folder: ../data/HRSC2016-MS/AllImages + ann_folder: ../data/HRSC2016-MS/Annotations transforms: type: Compose ops: ~ From 7f89bb3bf84eb63cb5a063042660405a5b537ed2 Mon Sep 17 00:00:00 2001 From: Aaron Date: Thu, 30 May 2024 18:51:24 -0400 Subject: [PATCH 30/35] Revert dropping imports --- rtdetr_pytorch/src/__init__.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/rtdetr_pytorch/src/__init__.py b/rtdetr_pytorch/src/__init__.py index 39b9f438..f4ce60bb 100644 --- a/rtdetr_pytorch/src/__init__.py +++ b/rtdetr_pytorch/src/__init__.py @@ -1 +1,4 @@ -from . import data, nn +from . import data +from . import nn +from . import optim +from . 
import zoo \ No newline at end of file From 79a05631cd35e1f17cfd43f0b6656b0acc377938 Mon Sep 17 00:00:00 2001 From: manuel cuevas Date: Tue, 23 Jul 2024 06:23:17 -0700 Subject: [PATCH 31/35] updated to use torchvision =< 0.16.0 --- rtdetr_pytorch/README.md | 7 ++ .../configs/rtdetr/include/dataloader.yml | 8 +- rtdetr_pytorch/requirements.txt | 2 +- rtdetr_pytorch/src/data/coco/coco_dataset.py | 20 ++--- .../src/data/coco/coco_rotated_dataset.py | 6 +- rtdetr_pytorch/src/data/transforms.py | 76 +++++++++---------- .../test_rtdetr_r18vd_6x_coco_config.py | 8 +- 7 files changed, 68 insertions(+), 59 deletions(-) diff --git a/rtdetr_pytorch/README.md b/rtdetr_pytorch/README.md index f8a2e7ad..77b379bc 100644 --- a/rtdetr_pytorch/README.md +++ b/rtdetr_pytorch/README.md @@ -107,3 +107,10 @@ python tools/export_onnx.py -c configs/rtdetr/rtdetr_r18vd_6x_coco.yml -r path/t 2. add `-t path/to/checkpoint` (optinal) to tuning rtdetr based on pretrained checkpoint. see [training script details](./tools/README.md). + + +
+list of wheel release + +rtrdetr-0.1.1-py3-none-any.whl - updated to use torchvision =< 0.16.0 + diff --git a/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml b/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml index e3e6bc1f..f57acc25 100644 --- a/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml +++ b/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml @@ -14,8 +14,8 @@ train_dataloader: - {type: Resize, size: [640, 640], } # - {type: Resize, size: 639, max_size: 640} # - {type: PadToSize, spatial_size: 640} - - {type: ToImageTensor} - - {type: ConvertDtype} + - {type: ToImage} + - {type: ToDtype} - {type: SanitizeBoundingBox, min_size: 1} - {type: ConvertBox, out_fmt: 'cxcywh', normalize: True} shuffle: True @@ -31,8 +31,8 @@ val_dataloader: # - {type: Resize, size: 639, max_size: 640} # - {type: PadToSize, spatial_size: 640} - {type: Resize, size: [640, 640]} - - {type: ToImageTensor} - - {type: ConvertDtype} + - {type: ToImage} + - {type: ToDtype} shuffle: False batch_size: 8 num_workers: 4 diff --git a/rtdetr_pytorch/requirements.txt b/rtdetr_pytorch/requirements.txt index af18ccdc..548243c6 100644 --- a/rtdetr_pytorch/requirements.txt +++ b/rtdetr_pytorch/requirements.txt @@ -1,5 +1,5 @@ torch==2.0.1 -torchvision==0.15.2 +torchvision==0.16.0 onnx==1.14.0 onnxruntime==1.15.1 pycocotools diff --git a/rtdetr_pytorch/src/data/coco/coco_dataset.py b/rtdetr_pytorch/src/data/coco/coco_dataset.py index 837b6bd2..5def9478 100644 --- a/rtdetr_pytorch/src/data/coco/coco_dataset.py +++ b/rtdetr_pytorch/src/data/coco/coco_dataset.py @@ -14,7 +14,7 @@ from pycocotools import mask as coco_mask from src.core import register -from torchvision import datapoints +from torchvision import tv_tensors __all__ = ["CocoDetection"] @@ -25,12 +25,12 @@ class CocoDetection(torchvision.datasets.CocoDetection): __share__ = ["remap_mscoco_category"] def __init__( - self, - img_folder, - ann_file, - transforms, - return_masks, - remap_mscoco_category=False, + self, + img_folder, + ann_file, + transforms, + return_masks, + remap_mscoco_category=False, ): img_folder = Path(img_folder).resolve() ann_file = Path(ann_file).resolve() @@ -50,14 +50,14 @@ def __getitem__(self, idx): # ['boxes', 'masks', 'labels']: if "boxes" in target: - target["boxes"] = datapoints.BoundingBox( + target["boxes"] = tv_tensors.BoundingBoxes( target["boxes"], - format=datapoints.BoundingBoxFormat.XYXY, + format=tv_tensors.BoundingBoxFormat.XYXY, spatial_size=img.size[::-1], ) # h w if "masks" in target: - target["masks"] = datapoints.Mask(target["masks"]) + target["masks"] = tv_tensors.Mask(target["masks"]) if self._transforms is not None: img, target = self._transforms(img, target) diff --git a/rtdetr_pytorch/src/data/coco/coco_rotated_dataset.py b/rtdetr_pytorch/src/data/coco/coco_rotated_dataset.py index c53e6a3b..cf84fd42 100644 --- a/rtdetr_pytorch/src/data/coco/coco_rotated_dataset.py +++ b/rtdetr_pytorch/src/data/coco/coco_rotated_dataset.py @@ -8,6 +8,7 @@ import torchvision from PIL import Image + @dataclass class BoundingBox: category: str @@ -37,6 +38,7 @@ def y_center(self): def area(self): return self.width * self.height + class RotatedCocoDataset(torch.utils.data.Dataset): def __init__(self, image_files: List[Path], labels: List[List[BoundingBox]]): self.image_files = image_files @@ -52,8 +54,8 @@ def __init__(self, image_files: List[Path], labels: List[List[BoundingBox]]): self.transforms = torchvision.transforms.Compose( [ torchvision.transforms.Resize(self.input_size), - src.data.transforms.ToImageTensor(), - 
src.data.transforms.ConvertDtype(), + src.data.transforms.ToImage(), + src.data.transforms.ToDtype(), ] ) diff --git a/rtdetr_pytorch/src/data/transforms.py b/rtdetr_pytorch/src/data/transforms.py index 3fd3945c..fbe9349c 100644 --- a/rtdetr_pytorch/src/data/transforms.py +++ b/rtdetr_pytorch/src/data/transforms.py @@ -1,39 +1,37 @@ """"by lyuwenyu """ - -import torch -import torch.nn as nn +import torch +import torch.nn as nn import torchvision + torchvision.disable_beta_transforms_warning() -from torchvision import datapoints +# from torchvision import datapoints +from torchvision import tv_tensors import torchvision.transforms.v2 as T import torchvision.transforms.v2.functional as F -from PIL import Image +from PIL import Image from typing import Any, Dict, List, Optional from src.core import register, GLOBAL_CONFIG - __all__ = ['Compose', ] - RandomPhotometricDistort = register(T.RandomPhotometricDistort) RandomZoomOut = register(T.RandomZoomOut) # RandomIoUCrop = register(T.RandomIoUCrop) RandomHorizontalFlip = register(T.RandomHorizontalFlip) Resize = register(T.Resize) -ToImageTensor = register(T.ToImageTensor) -ConvertDtype = register(T.ConvertDtype) -SanitizeBoundingBox = register(T.SanitizeBoundingBox) +ToImage = register(T.ToImage) +ToDtype = register(T.ToDtype) +SanitizeBoundingBox = register(T.SanitizeBoundingBoxes) RandomCrop = register(T.RandomCrop) Normalize = register(T.Normalize) - @register class Compose(T.Compose): def __init__(self, ops) -> None: @@ -51,8 +49,8 @@ def __init__(self, ops) -> None: else: raise ValueError('') else: - transforms =[EmptyTransform(), ] - + transforms = [EmptyTransform(), ] + super().__init__(transforms=transforms) @@ -70,11 +68,12 @@ def forward(self, *inputs): class PadToSize(T.Pad): _transformed_types = ( Image.Image, - datapoints.Image, - datapoints.Video, - datapoints.Mask, - datapoints.BoundingBox, + tv_tensors.Image, + tv_tensors.Video, + tv_tensors.Mask, + tv_tensors.BoundingBoxes, ) + def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: sz = F.get_spatial_size(flat_inputs[0]) h, w = self.spatial_size[0] - sz[0], self.spatial_size[1] - sz[1] @@ -84,11 +83,11 @@ def _get_params(self, flat_inputs: List[Any]) -> Dict[str, Any]: def __init__(self, spatial_size, fill=0, padding_mode='constant') -> None: if isinstance(spatial_size, int): spatial_size = (spatial_size, spatial_size) - + self.spatial_size = spatial_size super().__init__(0, fill, padding_mode) - def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: fill = self._fill[type(inpt)] padding = params['padding'] return F.pad(inpt, padding=padding, fill=fill, padding_mode=self.padding_mode) # type: ignore[arg-type] @@ -102,9 +101,11 @@ def __call__(self, *inputs: Any) -> Any: @register class RandomIoUCrop(T.RandomIoUCrop): - def __init__(self, min_scale: float = 0.3, max_scale: float = 1, min_aspect_ratio: float = 0.5, max_aspect_ratio: float = 2, sampler_options: Optional[List[float]] = None, trials: int = 40, p: float = 1.0): + def __init__(self, min_scale: float = 0.3, max_scale: float = 1, min_aspect_ratio: float = 0.5, + max_aspect_ratio: float = 2, sampler_options: Optional[List[float]] = None, trials: int = 40, + p: float = 1.0): super().__init__(min_scale, max_scale, min_aspect_ratio, max_aspect_ratio, sampler_options, trials) - self.p = p + self.p = p def __call__(self, *inputs: Any) -> Any: if torch.rand(1) >= self.p: @@ -112,31 +113,30 @@ def __call__(self, *inputs: Any) -> Any: return 
super().forward(*inputs) + @register + class ConvertBox(T.Transform): + _transformed_types = ( + tv_tensors.BoundingBoxes, + ) -@register -class ConvertBox(T.Transform): - _transformed_types = ( - datapoints.BoundingBox, - ) - def __init__(self, out_fmt='', normalize=False) -> None: - super().__init__() - self.out_fmt = out_fmt - self.normalize = normalize + def __init__(self, out_fmt='', normalize=False) -> None: + super().__init__() + self.out_fmt = out_fmt + self.normalize = normalize - self.data_fmt = { - 'xyxy': datapoints.BoundingBoxFormat.XYXY, - 'cxcywh': datapoints.BoundingBoxFormat.CXCYWH - } + self.data_fmt = { + 'xyxy': tv_tensors.BoundingBoxFormat.XYXY, + 'cxcywh': tv_tensors.BoundingBoxFormat.CXCYWH + } - def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: + def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: if self.out_fmt: spatial_size = inpt.spatial_size in_fmt = inpt.format.value.lower() inpt = torchvision.ops.box_convert(inpt, in_fmt=in_fmt, out_fmt=self.out_fmt) - inpt = datapoints.BoundingBox(inpt, format=self.data_fmt[self.out_fmt], spatial_size=spatial_size) - + inpt = tv_tensors.BoundingBox(inpt, format=self.data_fmt[self.out_fmt], spatial_size=spatial_size) + if self.normalize: inpt = inpt / torch.tensor(inpt.spatial_size[::-1]).tile(2)[None] return inpt - diff --git a/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py b/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py index f540e751..8b6ad924 100644 --- a/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py +++ b/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py @@ -150,8 +150,8 @@ def test_rtdetr_r18vd_6x_coco_yml(): {"min_size": 1, "type": "SanitizeBoundingBox"}, {"type": "RandomHorizontalFlip"}, {"size": [640, 640], "type": "Resize"}, - {"type": "ToImageTensor"}, - {"type": "ConvertDtype"}, + {"type": "ToImage"}, + {"type": "ToDtype"}, {"min_size": 1, "type": "SanitizeBoundingBox"}, {"normalize": True, "out_fmt": "cxcywh", "type": "ConvertBox"}, ], @@ -176,8 +176,8 @@ def test_rtdetr_r18vd_6x_coco_yml(): "transforms": { "ops": [ {"size": [640, 640], "type": "Resize"}, - {"type": "ToImageTensor"}, - {"type": "ConvertDtype"}, + {"type": "ToImage"}, + {"type": "ToDtype"}, ], "type": "Compose", }, From 73485054b385fc76ce5486617043fd656beef469 Mon Sep 17 00:00:00 2001 From: manuel cuevas Date: Tue, 23 Jul 2024 11:01:20 -0700 Subject: [PATCH 32/35] updated to use /rtrdetr-0.2-py3-none-any.whl s3 --- rtdetr_pytorch/setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtdetr_pytorch/setup.py b/rtdetr_pytorch/setup.py index 1d80e4f5..8b58f58b 100644 --- a/rtdetr_pytorch/setup.py +++ b/rtdetr_pytorch/setup.py @@ -2,7 +2,7 @@ setup( name='rtrdetr', - version='0.1.0', + version='0.2', author='Your Name', author_email='your.email@example.com', description='A short description of the package', From 752dd368811525d6081b51476682f7abf56c7633 Mon Sep 17 00:00:00 2001 From: manuel cuevas Date: Wed, 25 Sep 2024 08:46:47 -0700 Subject: [PATCH 33/35] backwards compatibility for models without angle/rotation --- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index b106d2c1..28230815 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -303,7 +303,11 @@ def forward( ) bbox_output = 
bbox_head[i](output) - angle_output = bbox_output[:, :, 4] + # backwards compatibility for models without angle/rotation + if bbox_output.shape[2] > 5: + angle_output = bbox_output[:, :, 4] + else: + angle_output = torch.zeros_like(bbox_output[0]) bbox_output = bbox_output[:, :, :4] inter_ref_bbox = F.sigmoid(bbox_output + inverse_sigmoid(ref_points_detach)) From d1ebecd761bf4e4fbf8f2dda92bbf2874bef9ee6 Mon Sep 17 00:00:00 2001 From: Matthew Kilpatrick Date: Thu, 26 Sep 2024 09:58:14 -0600 Subject: [PATCH 34/35] Fix wrong angle return size when not used --- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index 28230815..93967a6b 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -307,7 +307,7 @@ def forward( if bbox_output.shape[2] > 5: angle_output = bbox_output[:, :, 4] else: - angle_output = torch.zeros_like(bbox_output[0]) + angle_output = torch.zeros_like(bbox_output[:,:,0]) bbox_output = bbox_output[:, :, :4] inter_ref_bbox = F.sigmoid(bbox_output + inverse_sigmoid(ref_points_detach)) From c8972c14ec97c0f49dc9ae2abe0cabe8db62a698 Mon Sep 17 00:00:00 2001 From: manuel cuevas Date: Fri, 27 Sep 2024 11:02:00 -0700 Subject: [PATCH 35/35] rtdtr updates to train --- rtdetr_pytorch/configs/rtdetr/include/dataloader.yml | 6 +++--- rtdetr_pytorch/setup.py | 2 +- rtdetr_pytorch/src/data/coco/coco_dataset.py | 2 +- rtdetr_pytorch/src/data/transforms.py | 8 +++++--- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py | 6 +++--- rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py | 2 +- .../unit/configs/test_rtdetr_r18vd_6x_coco_config.py | 2 +- 7 files changed, 15 insertions(+), 13 deletions(-) diff --git a/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml b/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml index f57acc25..11a530c1 100644 --- a/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml +++ b/rtdetr_pytorch/configs/rtdetr/include/dataloader.yml @@ -8,15 +8,15 @@ train_dataloader: ops: - {type: RandomPhotometricDistort, p: 0.5} - {type: RandomZoomOut, fill: 0} - - {type: RandomIoUCrop, p: 0.8} - - {type: SanitizeBoundingBox, min_size: 1} + - {type: RandomIoUCrop} + - {type: SanitizeBoundingBoxes, min_size: 1} - {type: RandomHorizontalFlip} - {type: Resize, size: [640, 640], } # - {type: Resize, size: 639, max_size: 640} # - {type: PadToSize, spatial_size: 640} - {type: ToImage} - {type: ToDtype} - - {type: SanitizeBoundingBox, min_size: 1} + - {type: SanitizeBoundingBoxes, min_size: 1} - {type: ConvertBox, out_fmt: 'cxcywh', normalize: True} shuffle: True batch_size: 4 diff --git a/rtdetr_pytorch/setup.py b/rtdetr_pytorch/setup.py index 8b58f58b..bff0b566 100644 --- a/rtdetr_pytorch/setup.py +++ b/rtdetr_pytorch/setup.py @@ -2,7 +2,7 @@ setup( name='rtrdetr', - version='0.2', + version='0.2a5', author='Your Name', author_email='your.email@example.com', description='A short description of the package', diff --git a/rtdetr_pytorch/src/data/coco/coco_dataset.py b/rtdetr_pytorch/src/data/coco/coco_dataset.py index 5def9478..639f3924 100644 --- a/rtdetr_pytorch/src/data/coco/coco_dataset.py +++ b/rtdetr_pytorch/src/data/coco/coco_dataset.py @@ -53,7 +53,7 @@ def __getitem__(self, idx): target["boxes"] = tv_tensors.BoundingBoxes( target["boxes"], format=tv_tensors.BoundingBoxFormat.XYXY, - spatial_size=img.size[::-1], + canvas_size=img.size[::-1], ) # h w if 
"masks" in target: diff --git a/rtdetr_pytorch/src/data/transforms.py b/rtdetr_pytorch/src/data/transforms.py index fbe9349c..b5f16ad0 100644 --- a/rtdetr_pytorch/src/data/transforms.py +++ b/rtdetr_pytorch/src/data/transforms.py @@ -22,7 +22,7 @@ RandomPhotometricDistort = register(T.RandomPhotometricDistort) RandomZoomOut = register(T.RandomZoomOut) -# RandomIoUCrop = register(T.RandomIoUCrop) +RandomIoUCrop = register(T.RandomIoUCrop) RandomHorizontalFlip = register(T.RandomHorizontalFlip) Resize = register(T.Resize) ToImage = register(T.ToImage) @@ -30,7 +30,7 @@ SanitizeBoundingBox = register(T.SanitizeBoundingBoxes) RandomCrop = register(T.RandomCrop) Normalize = register(T.Normalize) - +ConvertBox = register(T.ConvertBoundingBoxFormat) @register class Compose(T.Compose): @@ -40,6 +40,8 @@ def __init__(self, ops) -> None: for op in ops: if isinstance(op, dict): name = op.pop('type') + if name == 'ToDtype': + op['dtype'] = torch.float32 transfom = getattr(GLOBAL_CONFIG[name]['_pymodule'], name)(**op) transforms.append(transfom) # op['type'] = name @@ -134,7 +136,7 @@ def _transform(self, inpt: Any, params: Dict[str, Any]) -> Any: spatial_size = inpt.spatial_size in_fmt = inpt.format.value.lower() inpt = torchvision.ops.box_convert(inpt, in_fmt=in_fmt, out_fmt=self.out_fmt) - inpt = tv_tensors.BoundingBox(inpt, format=self.data_fmt[self.out_fmt], spatial_size=spatial_size) + inpt = tv_tensors.BoundingBox(inpt, format=self.data_fmt[self.out_fmt], canvas_size=spatial_size) if self.normalize: inpt = inpt / torch.tensor(inpt.spatial_size[::-1]).tile(2)[None] diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py index 93967a6b..f351d4b7 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py @@ -698,9 +698,9 @@ def forward(self, feats, targets=None): dn_out_logits, out_logits = torch.split( out_logits, dn_meta["dn_num_split"], dim=2 ) - dn_out_angles, out_angles = torch.split( - out_angles, dn_meta["dn_num_split"], dim=2 - ) + # dn_out_angles, out_angles = torch.split( + # out_angles, dn_meta["dn_num_split"], dim=2 + # ) out = { "pred_logits": out_logits[-1], diff --git a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py index 8b1d43df..a9429499 100644 --- a/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py +++ b/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_postprocessor.py @@ -58,7 +58,7 @@ def forward(self, outputs, orig_target_sizes): boxes = bbox_pred.gather( dim=1, index=index.unsqueeze(-1).repeat(1, 1, bbox_pred.shape[-1]) ) - angles = angles.gather(dim=1, index=index) + # angles = angles.gather(dim=1, index=index) else: diff --git a/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py b/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py index 8b6ad924..4ad1ebfc 100644 --- a/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py +++ b/rtdetr_pytorch/tests/unit/configs/test_rtdetr_r18vd_6x_coco_config.py @@ -146,7 +146,7 @@ def test_rtdetr_r18vd_6x_coco_yml(): "ops": [ {"p": 0.8, "type": "RandomPhotometricDistort"}, {"fill": 0, "type": "RandomZoomOut"}, - {"p": 0.8, "type": "RandomIoUCrop"}, + {"type": "RandomIoUCrop"}, {"min_size": 1, "type": "SanitizeBoundingBox"}, {"type": "RandomHorizontalFlip"}, {"size": [640, 640], "type": "Resize"},