diff --git a/ppcls/arch/backbone/legendary_models/swin_transformer.py b/ppcls/arch/backbone/legendary_models/swin_transformer.py
index 0551bc6c0d..d9d844c2f0 100644
--- a/ppcls/arch/backbone/legendary_models/swin_transformer.py
+++ b/ppcls/arch/backbone/legendary_models/swin_transformer.py
@@ -27,21 +27,22 @@
 MODEL_URLS = {
     "SwinTransformer_tiny_patch4_window7_224":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_tiny_patch4_window7_224_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_tiny_patch4_window7_224_pretrained.pdparams",
     "SwinTransformer_small_patch4_window7_224":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_small_patch4_window7_224_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_small_patch4_window7_224_pretrained.pdparams",
     "SwinTransformer_base_patch4_window7_224":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_base_patch4_window7_224_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_base_patch4_window7_224_pretrained.pdparams",
     "SwinTransformer_base_patch4_window12_384":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_base_patch4_window12_384_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_base_patch4_window12_384_pretrained.pdparams",
     "SwinTransformer_large_patch4_window7_224":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_large_patch4_window7_224_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_large_patch4_window7_224_pretrained.pdparams",
     "SwinTransformer_large_patch4_window12_384":
-    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_large_patch4_window12_384_pretrained.pdparams",
+    "https://paddle-imagenet-models-name.bj.bcebos.com/dygraph/legendary_models/SwinTransformer_large_patch4_window12_384_pretrained.pdparams",
 }
 
 __all__ = list(MODEL_URLS.keys())
 
+
 # The following re-implementation of roll is inspired by
 # https://gitee.com/ascend/pytorch/blob/master/torch_npu/contrib/function/roll.py
@@ -195,7 +196,7 @@ def __init__(self,
         self.window_size = window_size  # Wh, Ww
         self.num_heads = num_heads
         head_dim = dim // num_heads
-        self.scale = qk_scale or head_dim**-0.5
+        self.scale = qk_scale or head_dim ** -0.5
 
         # define a parameter table of relative position bias
         # 2*Wh-1 * 2*Ww-1, nH
@@ -220,7 +221,7 @@ def __init__(self,
         relative_coords = relative_coords.transpose(
             [1, 2, 0])  # Wh*Ww, Wh*Ww, 2
         relative_coords[:, :, 0] += self.window_size[
-            0] - 1  # shift to start from 0
+            0] - 1  # shift to start from 0
         relative_coords[:, :, 1] += self.window_size[1] - 1
         relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1
         relative_position_index = relative_coords.sum(-1)  # Wh*Ww, Wh*Ww
@@ -262,7 +263,7 @@ def forward(self, x, mask=None):
         B_, N, C = x.shape
         qkv = self.qkv(x).reshape(
             [B_, N, 3, self.num_heads, C // self.num_heads]).transpose(
-                [2, 0, 3, 1, 4])
+                [2, 0, 3, 1, 4])
         q, k, v = qkv[0], qkv[1], qkv[2]
 
         q = q * self.scale
@@ -407,12 +408,14 @@ def __init__(self,
             attn_mask = None
 
         self.register_buffer("attn_mask", attn_mask)
 
+    def check_condition(self):
         if min(self.input_resolution) <= self.window_size:
             # if window size is larger than input resolution, we don't partition windows
             self.shift_size = 0
             self.window_size = min(self.input_resolution)
         assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
     def forward(self, x):
         H, W = self.input_resolution
         B, L, C = x.shape
@@ -676,7 +679,7 @@ def forward(self, x):
     def flops(self):
         Ho, Wo = self.patches_resolution
         flops = Ho * Wo * self.embed_dim * self.in_chans * (
-            self.patch_size[0] * self.patch_size[1])
+            self.patch_size[0] * self.patch_size[1])
         if self.norm is not None:
             flops += Ho * Wo * self.embed_dim
         return flops
@@ -735,7 +738,7 @@ def __init__(self,
         self.embed_dim = embed_dim
         self.ape = ape
         self.patch_norm = patch_norm
-        self.num_features = int(embed_dim * 2**(self.num_layers - 1))
+        self.num_features = int(embed_dim * 2 ** (self.num_layers - 1))
         self.mlp_ratio = mlp_ratio
 
         # split image into non-overlapping patches
@@ -766,9 +769,9 @@ def __init__(self,
         self.layers = nn.LayerList()
         for i_layer in range(self.num_layers):
             layer = BasicLayer(
-                dim=int(embed_dim * 2**i_layer),
-                input_resolution=(patches_resolution[0] // (2**i_layer),
-                                  patches_resolution[1] // (2**i_layer)),
+                dim=int(embed_dim * 2 ** i_layer),
+                input_resolution=(patches_resolution[0] // (2 ** i_layer),
+                                  patches_resolution[1] // (2 ** i_layer)),
                 depth=depths[i_layer],
                 num_heads=num_heads[i_layer],
                 window_size=window_size,
@@ -826,7 +829,7 @@ def flops(self):
         for _, layer in enumerate(self.layers):
             flops += layer.flops()
         flops += self.num_features * self.patches_resolution[
-            0] * self.patches_resolution[1] // (2**self.num_layers)
+            0] * self.patches_resolution[1] // (2 ** self.num_layers)
         flops += self.num_features * self.num_classes
         return flops
 
@@ -836,7 +839,8 @@ def _load_pretrained(pretrained,
                      model_url,
                      use_ssld=False,
                      use_imagenet22k_pretrained=False,
-                     use_imagenet22kto1k_pretrained=False):
+                     use_imagenet22kto1k_pretrained=False,
+                     **kwargs):
     if pretrained is False:
         pass
     elif pretrained is True:
diff --git a/ppcls/arch/backbone/variant_models/swin_transformer_variant.py b/ppcls/arch/backbone/variant_models/swin_transformer_variant.py
index 0151d902ea..111905af60 100644
--- a/ppcls/arch/backbone/variant_models/swin_transformer_variant.py
+++ b/ppcls/arch/backbone/variant_models/swin_transformer_variant.py
@@ -2,11 +2,18 @@
 import paddle
 import paddle.nn as nn
 from ..legendary_models.swin_transformer import SwinTransformer, _load_pretrained, \
-    MODEL_URLS, PatchEmbed, BasicLayer,SwinTransformerBlock
+    PatchEmbed, BasicLayer, SwinTransformerBlock
 
-__all__ = ["SwinTransformer_tiny_patch4_window7_224_SOLIDER",
-           "SwinTransformer_small_patch4_window7_224_SOLIDER",
-           "SwinTransformer_base_patch4_window7_224_SOLIDER"]
+MODEL_URLS_SOLIDER = {
+    "SwinTransformer_tiny_patch4_window7_224_SOLIDER":
+    'https://paddleclas.bj.bcebos.com/models/SOILDER/SwinTransformer_tiny_patch4_window7_224_pretrained.pdparams',
+    "SwinTransformer_small_patch4_window7_224_SOLIDER":
+    'https://paddleclas.bj.bcebos.com/models/SOILDER/SwinTransformer_small_patch4_window7_224_pretrained.pdparams',
+    "SwinTransformer_base_patch4_window7_224_SOLIDER":
+    'https://paddleclas.bj.bcebos.com/models/SOILDER/SwinTransformer_base_patch4_window7_224_pretrained.pdparams'
+}
+
+__all__ = list(MODEL_URLS_SOLIDER.keys())
 
 
 class PatchEmbed_SOLIDER(PatchEmbed):
@@ -18,6 +25,7 @@ def forward(self, x):
         x = self.norm(x)
         return x, out_size
 
+
 class SwinTransformerBlock_SOLIDER(SwinTransformerBlock):
     r""" Swin Transformer Block.
@@ -51,7 +59,7 @@ def __init__(self,
                  drop_path=0.,
                  act_layer=nn.GELU,
                  norm_layer=nn.LayerNorm):
-        super(SwinTransformerBlock_SOLIDER,self).__init__(
+        super(SwinTransformerBlock_SOLIDER, self).__init__(
             dim=dim,
             input_resolution=input_resolution,
             num_heads=num_heads,
@@ -73,12 +81,15 @@ def __init__(self,
         self.shift_size = shift_size
         self.mlp_ratio = mlp_ratio
         self.check_condition()
+    def check_condition(self):
         if min(self.input_resolution) < self.window_size:
             # if window size is larger than input resolution, we don't partition windows
             self.shift_size = 0
             self.window_size = min(self.input_resolution)
         assert 0 <= self.shift_size < self.window_size, "shift_size must in 0-window_size"
+
+
 class BasicLayer_SOLIDER(BasicLayer):
     def __init__(self,
                  dim,
@@ -96,7 +107,7 @@ def __init__(self,
                  downsample=None,
                  use_checkpoint=False):
-        super(BasicLayer_SOLIDER,self).__init__(
+        super(BasicLayer_SOLIDER, self).__init__(
             dim=dim,
             input_resolution=input_resolution,
             depth=depth,
@@ -291,9 +302,6 @@ def forward_features(self, x, semantic_weight=None):
 
 def SwinTransformer_tiny_patch4_window7_224_SOLIDER(
         pretrained=False,
-        use_ssld=False,
-        use_imagenet22k_pretrained=False,
-        use_imagenet22kto1k_pretrained=False,
         **kwargs):
     model = SwinTransformer_SOLIDER(
         embed_dim=96,
@@ -304,19 +312,14 @@ def SwinTransformer_tiny_patch4_window7_224_SOLIDER(
         **kwargs)
     _load_pretrained(
         pretrained,
-        model,
-        MODEL_URLS["SwinTransformer_tiny_patch4_window7_224"],
-        use_ssld=use_ssld,
-        use_imagenet22k_pretrained=use_imagenet22k_pretrained,
-        use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained)
+        model=model,
+        model_url=MODEL_URLS_SOLIDER["SwinTransformer_tiny_patch4_window7_224_SOLIDER"],
+        **kwargs)
     return model
 
 
 def SwinTransformer_small_patch4_window7_224_SOLIDER(
         pretrained=False,
-        use_ssld=False,
-        use_imagenet22k_pretrained=False,
-        use_imagenet22kto1k_pretrained=False,
         **kwargs):
     model = SwinTransformer_SOLIDER(
         embed_dim=96,
@@ -327,19 +330,14 @@ def SwinTransformer_small_patch4_window7_224_SOLIDER(
         **kwargs)
     _load_pretrained(
         pretrained,
-        model,
-        MODEL_URLS["SwinTransformer_small_patch4_window7_224"],
-        use_ssld=use_ssld,
-        use_imagenet22k_pretrained=use_imagenet22k_pretrained,
-        use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained)
+        model=model,
+        model_url=MODEL_URLS_SOLIDER["SwinTransformer_small_patch4_window7_224_SOLIDER"],
+        **kwargs)
     return model
 
 
 def SwinTransformer_base_patch4_window7_224_SOLIDER(
         pretrained=False,
-        use_ssld=False,
-        use_imagenet22k_pretrained=False,
-        use_imagenet22kto1k_pretrained=False,
         **kwargs):
     model = SwinTransformer_SOLIDER(
         embed_dim=128,
@@ -350,9 +348,7 @@ def SwinTransformer_base_patch4_window7_224_SOLIDER(
         **kwargs)
     _load_pretrained(
         pretrained,
-        model,
-        MODEL_URLS["SwinTransformer_base_patch4_window7_224"],
-        use_ssld=use_ssld,
-        use_imagenet22k_pretrained=use_imagenet22k_pretrained,
-        use_imagenet22kto1k_pretrained=use_imagenet22kto1k_pretrained)
+        model=model,
+        model_url=MODEL_URLS_SOLIDER["SwinTransformer_base_patch4_window7_224_SOLIDER"],
+        **kwargs)
     return model
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml
index 3503a423ca..2451affed8 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window12_384.yaml
@@ -32,6 +32,7 @@ AMP:
 Arch:
   name: SwinTransformer_base_patch4_window12_384
   class_num: 1000
+  pretrained: True
 
 # loss function config for traing/eval process
 Loss:
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml
index 3dd54e496e..5bd262789d 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_base_patch4_window7_224.yaml
@@ -32,7 +32,6 @@ AMP:
 Arch:
   name: SwinTransformer_base_patch4_window7_224
   class_num: 1000
-  pretrained: True
 
 # loss function config for traing/eval process
 Loss:
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml
index 7e5c4f4027..e204686e84 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window12_384.yaml
@@ -32,7 +32,7 @@ AMP:
 Arch:
   name: SwinTransformer_large_patch4_window12_384
   class_num: 1000
-  
+
 # loss function config for traing/eval process
 Loss:
   Train:
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml
index 0a523a825f..3daca0914a 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_large_patch4_window7_224.yaml
@@ -32,7 +32,7 @@ AMP:
 Arch:
   name: SwinTransformer_large_patch4_window7_224
   class_num: 1000
-  
+
 # loss function config for traing/eval process
 Loss:
   Train:
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml
index 446c5e50e1..bab5c9b880 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_small_patch4_window7_224.yaml
@@ -32,7 +32,7 @@ AMP:
 Arch:
   name: SwinTransformer_small_patch4_window7_224
   class_num: 1000
-  
+
 # loss function config for traing/eval process
 Loss:
   Train:
diff --git a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
index 2a3656e1b6..9a868eb87a 100644
--- a/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
+++ b/ppcls/configs/ImageNet/SwinTransformer/SwinTransformer_tiny_patch4_window7_224.yaml
@@ -32,7 +32,7 @@ AMP:
 Arch:
   name: SwinTransformer_tiny_patch4_window7_224
   class_num: 1000
-  
+
 # loss function config for traing/eval process
 Loss:
   Train:
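
# Usage sketch (not part of the patch, assumes it is applied): the new SOLIDER
# builders take only `pretrained` plus **kwargs, which are forwarded to
# _load_pretrained together with the matching MODEL_URLS_SOLIDER entry.
#
#     from ppcls.arch.backbone.variant_models.swin_transformer_variant import \
#         SwinTransformer_tiny_patch4_window7_224_SOLIDER
#
#     # pretrained=True fetches the weights registered in MODEL_URLS_SOLIDER;
#     # extra keyword arguments reach both the model and _load_pretrained.
#     model = SwinTransformer_tiny_patch4_window7_224_SOLIDER(pretrained=True)
#     model.eval()  # switch to inference mode before evaluation or export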