fix resolution problem for swin transformer and clip vit (#3021)

* Update foundation_vit.py Update .gitignore fix time cost problem Update swin_transformer.py fix the speed and memory problem reduce the unnecessary calculation when patch matches resolution fix conflict remove check resolution function Revert "fix conflict" This reverts commit d7a7dad. fix conflict remove the conflict checkpoint function 【Hackathon 5th No.69】分类大模型--人体视觉任务SOLIDER (#2995) * add_solider * add_solider * add_solider * add_solider * add_solider * add_solider * add_solider * add_solider * add_solider * add_solider * add_solider update doc about PPHGNetV2 (#3002) fix clip patch embedding resolution problem support non-224 resolution integrate the padding function to one adjust function name fix the resolution problem for clip-vision transformer part and Swin Transformer fix the resolution problem for clip-vision transformer part and Swin Transformer * fix cache problem using the huggingface plan and drop the cache * Revert "fix cache problem" This reverts commit 8f7ab55. * fix resolution problem * update big model backbone * Revert "update big model backbone" This reverts commit 04a39f7.
PaddlePaddle · Oct 31, 2023 · 61f748d · 61f748d
1 parent 60478c3
commit 61f748d
Show file tree

Hide file tree

Showing 2 changed files with 11 additions and 3 deletions.
diff --git a/ppcls/arch/backbone/legendary_models/swin_transformer.py b/ppcls/arch/backbone/legendary_models/swin_transformer.py
@@ -147,6 +147,8 @@ def pading_for_not_divisible(pixel_values,
                              function="split"):
     if isinstance(patch_size, int):
         patch_size = (patch_size, patch_size)
+    if height % patch_size[0] == 0 and width % patch_size[1] == 0:
+        return pixel_values, (0, 0, 0, 0, 0, 0, 0, 0)
     if function == "split":
         pading_width = patch_size[1] - width % patch_size[1]
         pading_height = patch_size[0] - height % patch_size[0]
@@ -407,7 +409,7 @@ def __init__(self,
                        act_layer=act_layer,
                        drop=drop)
         H, W = self.input_resolution
-        attn_mask = paddle.zeros([1, H, W, 1])
+        attn_mask = None
 
         self.register_buffer("attn_mask", attn_mask)
 
@@ -450,6 +452,9 @@ def forward(self, x, input_dimensions):
         x, pad_values = pading_for_not_divisible(x, H, W, self.window_size,
                                                  "BHWC")
         _, height_pad, width_pad, _ = x.shape
+
+        padding_state = pad_values[3] > 0 or pad_values[
+            5] > 0  # change variable name
         # cyclic shift
         if self.shift_size > 0:
             shifted_x = RollWrapper.roll(
@@ -465,7 +470,9 @@ def forward(self, x, input_dimensions):
              C])  # nW*B, window_size*window_size, C
 
         # W-MSA/SW-MSA
+        #check did it need to calculate again
         attn_mask = self.get_attn_mask(height_pad, width_pad, x.dtype)
+
         attn_windows = self.attn(
             x_windows, mask=attn_mask)  # nW*B, window_size*window_size, C
 
@@ -484,8 +491,7 @@ def forward(self, x, input_dimensions):
         else:
             x = shifted_x
 
-        was_padded = pad_values[3] > 0 or pad_values[5] > 0
-        if was_padded:
+        if padding_state:
             x = x[:, :H, :W, :]
         x = x.reshape([B, H * W, C])
 

diff --git a/ppcls/arch/backbone/model_zoo/foundation_vit.py b/ppcls/arch/backbone/model_zoo/foundation_vit.py
@@ -114,6 +114,8 @@ def pading_for_not_divisible(pixel_values,
                              function="split"):
     if isinstance(patch_size, int):
         patch_size = (patch_size, patch_size)
+    if height % patch_size[0] == 0 and width % patch_size[1] == 0:
+        return pixel_values, None
     if function == "split":
         pading_width = patch_size[1] - width % patch_size[1]
         pading_height = patch_size[0] - height % patch_size[0]